Before executing the program, create a database "pachong" in MySQL.
import re

import pymysql
import requests

# Regular expressions applied to each page's HTML: the movie title is taken
# from an image's alt attribute and the poster address from its src attribute.
_NAME_PATTERN = re.compile(r'alt="(.*?)" src=')
_IMG_PATTERN = re.compile(r'src="(.*?)" class=')

# NOTE(review): Douban is reported to reject the default python-requests
# User-Agent, so a browser-like one is sent -- confirm against the live site.
_HEADERS = {'User-Agent': 'Mozilla/5.0'}


# Get resources and download
def resp(listURL):
    """Crawl every page in ``listURL`` and store the matches in MySQL.

    Connects to the local ``pachong`` database, creates the table
    ``t_movieTOP250`` if it does not exist yet, and inserts one row per
    (movie title, poster address) pair matched on each page. The ``id``
    column is auto-generated by the database.

    All inserts are committed together after the last page. If anything
    fails, the transaction is rolled back, the error is printed, and the
    function returns normally. The connection is always closed.

    Args:
        listURL: iterable of page URLs to download.
    """
    # Connect to the database
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='******',  # Replace with your actual database password
        database='pachong',
        charset='utf8',
    )
    try:
        with conn.cursor() as cursor:
            # "if not exists" lets the script be run more than once.
            cursor.execute(
                'create table if not exists t_movieTOP250('
                'id INT PRIMARY KEY auto_increment NOT NULL ,'
                'movieName VARCHAR(20) NOT NULL ,'
                'pictrue_address VARCHAR(100))'
            )
            # Crawl the data
            for urlPath in listURL:
                # Get the source code of the web page
                response = requests.get(urlPath, headers=_HEADERS, timeout=10)
                response.raise_for_status()
                html = response.text
                names = _NAME_PATTERN.findall(html)
                images = _IMG_PATTERN.findall(html)
                # Let the driver quote the values: titles may contain quote
                # characters, which would break a string-formatted statement.
                cursor.executemany(
                    'insert into t_movieTOP250(movieName,pictrue_address) '
                    'VALUES(%s,%s)',
                    list(zip(names, images)),
                )
        # Submit the results
        conn.commit()
        print("Results submitted")
    except Exception as e:
        # Data rollback
        conn.rollback()
        print("Data has been rolled back.")
        print("Error:", e)
    finally:
        # Close the database connection
        conn.close()


# All page URLs of the Top 250 list
def page(url):
    """Return the 10 page URLs of the list (25 entries per page).

    Args:
        url: base URL of the list, without a query string.

    Returns:
        A list of 10 strings ``url + '?start=<offset>&filter='`` with
        offsets 0, 25, ..., 225.
    """
    return [url + '?start=' + str(25 * i) + '&filter=' for i in range(10)]


if __name__ == '__main__':
    url = "https://movie.douban.com/top250"
    listURL = page(url)
    resp(listURL)
The results are shown below:
That is everything I wanted to share. If you notice any shortcomings, please point them out; feedback and discussion are welcome. Thank you!
That concludes this walkthrough of crawling the Douban Movie Top 250 data with Python. For more on crawling Douban movies with Python, please see my other related articles!