Beginner Python: grabbing Sogou WeChat official-account articles and depositing them into MySQL
MySQL tables (hd_gzh holds the official-account names to search for; gzh_article receives the scraped articles):
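A minimal sketch of matching table definitions, expressed as the pymysql calls a setup script might run. The table and column names (hd_gzh, gzh_article, title, picture, author, content) are taken from the code below; every type, the id columns, and the name column in hd_gzh are assumptions:

import pymysql

conn = pymysql.connect(host='Your database address', port=3306, user='Username',
                       passwd='Password', db='Database name', charset='utf8')
cursor = conn.cursor()

# hd_gzh: the main script reads row[1] as the account name to search for,
# so the table needs at least two columns (name is an assumed column name)
cursor.execute("""
create table if not exists hd_gzh (
    id int auto_increment primary key,
    name varchar(255) not null
)""")

# gzh_article: columns match the insert statement in the main script
cursor.execute("""
create table if not exists gzh_article (
    id int auto_increment primary key,
    title varchar(255),
    picture varchar(512),
    author varchar(255),
    content longtext
)""")

conn.commit()
cursor.close()
conn.close()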
Code:
import requests
import json
import re
import pymysql
import socket
import time
from bs4 import BeautifulSoup

# Create a connection
conn = pymysql.connect(host='Your database address', port=ports, user='Username',
                       passwd='Password', db='Database name', charset='utf8')
# Create a cursor
cursor = conn.cursor()
cursor.execute("select * from hd_gzh")
effect_row = cursor.fetchall()

socket.setdefaulttimeout(60)
count = 1
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}

# Abuyun IP proxy, not used for the time being
# proxyHost = ""
# proxyPort = "9030"
# # Proxy tunnel authentication information
# proxyUser = "H56761606429T7UC"
# proxyPass = "9168EB00C4167176"
# proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
#     "host": proxyHost,
#     "port": proxyPort,
#     "user": proxyUser,
#     "pass": proxyPass,
# }
# proxies = {
#     "http": proxyMeta,
#     "https": proxyMeta,
# }

# Check whether the article already exists
def checkData(name):
    sql = "select * from gzh_article where title = '%s'"
    data = (name,)
    count = cursor.execute(sql % data)
    conn.commit()
    if count != 0:
        return False
    else:
        return True

# Insert one article
def insertData(title, picture, author, content):
    sql = "insert into gzh_article (title,picture,author,content) values ('%s', '%s', '%s', '%s')"
    data = (title, picture, author, content)
    cursor.execute(sql % data)
    conn.commit()
    print("Inserted one row")

for row in effect_row:
    # Search Sogou WeChat for the account name stored in hd_gzh
    newsurl = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + row[1] + '&ie=utf8&_sug_=n&_sug_type_='
    res = requests.get(newsurl, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Follow the first search result to the account's profile page
    url = 'https://weixin.sogou.com' + soup.select('.tit a')[0]['href']
    res2 = requests.get(url, headers=headers)
    res2.encoding = 'utf-8'
    soup2 = BeautifulSoup(res2.text, 'html.parser')
    # The real profile URL is assembled piecewise in an inline script
    pattern = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
    script = soup2.find("script")
    url2 = pattern.search(script.text).group(1)
    res3 = requests.get(url2, headers=headers)
    res3.encoding = 'utf-8'
    soup3 = BeautifulSoup(res3.text, 'html.parser')
    print(url2)
    # The article list lives in a "var msgList = {...};" script variable
    pattern2 = re.compile(r"var msgList = (.*?);$", re.MULTILINE | re.DOTALL)
    script2 = soup3.find("script", text=pattern2)
    s2 = json.loads(pattern2.search(script2.text).group(1))
    # Wait 10s
    time.sleep(10)
    for news in s2["list"]:
        articleurl = "https://mp.weixin.qq.com" + news["app_msg_ext_info"]["content_url"]
        articleurl = articleurl.replace('&amp;', '&')
        res4 = requests.get(articleurl, headers=headers)
        res4.encoding = 'utf-8'
        soup4 = BeautifulSoup(res4.text, 'html.parser')
        if checkData(news["app_msg_ext_info"]["title"]):
            insertData(news["app_msg_ext_info"]["title"], news["app_msg_ext_info"]["cover"],
                       news["app_msg_ext_info"]["author"], pymysql.escape_string(str(soup4)))
        count += 1
        # Wait 10s
        time.sleep(10)
        # Each main post may carry several secondary articles
        for news2 in news["app_msg_ext_info"]["multi_app_msg_item_list"]:
            articleurl2 = "https://mp.weixin.qq.com" + news2["content_url"]
            articleurl2 = articleurl2.replace('&amp;', '&')
            res5 = requests.get(articleurl2, headers=headers)
            res5.encoding = 'utf-8'
            soup5 = BeautifulSoup(res5.text, 'html.parser')
            if checkData(news2["title"]):
                insertData(news2["title"], news2["cover"], news2["author"],
                           pymysql.escape_string(str(soup5)))
            count += 1
            # Wait 10s
            time.sleep(10)

cursor.close()
conn.close()
print("Operation complete.")
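For orientation, this is the shape of the msgList JSON the two loops walk, reduced strictly to the keys the code reads; everything else in the real payload is omitted and the "..." values are placeholders:

# Shape of the msgList variable, keys as accessed by the script above
msgList = {
    "list": [
        {
            "app_msg_ext_info": {
                "title": "...",        # article title
                "cover": "...",        # cover image URL
                "author": "...",
                "content_url": "...",  # relative link; the script prefixes the host
                "multi_app_msg_item_list": [
                    # secondary articles published alongside the main one
                    {"title": "...", "cover": "...", "author": "...", "content_url": "..."},
                ],
            },
        },
    ],
}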
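One caveat worth flagging: checkData and insertData build SQL with % string formatting, so a title containing a quote will break the statement (escape_string only protects the article body). A safer variant passes the values as pymysql parameters instead, for example:

def checkData(name):
    # pymysql quotes and escapes the parameter itself, so titles
    # containing quotes no longer break the statement
    found = cursor.execute("select * from gzh_article where title = %s", (name,))
    conn.commit()
    return found == 0

def insertData(title, picture, author, content):
    sql = "insert into gzh_article (title, picture, author, content) values (%s, %s, %s, %s)"
    cursor.execute(sql, (title, picture, author, content))
    conn.commit()
    print("Inserted one row")

With this variant, the pymysql.escape_string(str(soup4)) calls in the main loop can become plain str(soup4), since the driver does the escaping.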
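And if the commented-out Abuyun proxy is ever switched back on, requests accepts the proxies mapping directly on each call, so every fetch would gain a keyword argument along these lines:

res = requests.get(newsurl, headers=headers, proxies=proxies)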
That's all for this article.