
Crawling Sogou WeChat official account articles with Python

A Python beginner project: crawl official account articles from Sogou's WeChat search and store them in MySQL.

MySQL tables: hd_gzh stores the official accounts to crawl (the account name is read from the second column), and gzh_article stores the crawled articles (title, picture, author, content).
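
The original post showed the table structures as screenshots, which are not reproduced here. Below is a minimal sketch of definitions consistent with the columns the script uses; only the column names come from the code, while the types, lengths, and the gzh_name column name are assumptions you should adapt:

import pymysql

# Hypothetical schema matching the columns the crawler reads and writes.
conn = pymysql.connect(host='Your database address', port=3306, user='Username',
            passwd='Password', db='Database name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
  CREATE TABLE IF NOT EXISTS hd_gzh (
    id INT AUTO_INCREMENT PRIMARY KEY,
    gzh_name VARCHAR(255)   -- official account name; the crawler reads it as row[1]
  )
""")
cursor.execute("""
  CREATE TABLE IF NOT EXISTS gzh_article (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    picture VARCHAR(500),
    author VARCHAR(255),
    content MEDIUMTEXT      -- full article HTML
  )
""")
conn.commit()
cursor.close()
conn.close()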

Code:

import requests
import json
import re
import time
import pymysql
from bs4 import BeautifulSoup

# Create a connection (fill in your own connection details)
conn = pymysql.connect(host='Your database address', port=3306, user='Username', passwd='Password', db='Database name', charset='utf8')
# Create a cursor
cursor = conn.cursor()

# Read the list of official accounts to crawl
cursor.execute("select * from hd_gzh")
effect_row = cursor.fetchall()

import socket
socket.setdefaulttimeout(60)  # 60-second default timeout for network operations
count = 1
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}
# Abuyun IP proxy, not used for now
# proxyHost = ""
# proxyPort = "9030"
# # Proxy tunnel authentication information
# proxyUser = "H56761606429T7UC"
# proxyPass = "9168EB00C4167176"

# proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
#  "host" : proxyHost,
#  "port" : proxyPort,
#  "user" : proxyUser,
#  "pass" : proxyPass,
# }

# proxies = {
#   "http" : proxyMeta,
#   "https" : proxyMeta,
# }

# Check whether an article with this title already exists; return True if it should be inserted
def checkData(name):
  sql = "select * from gzh_article where title = '%s'"
  data = (name,)
  count = cursor.execute(sql % data)
  conn.commit()
  if count != 0:
    return False
  else:
    return True
# Insert one article row
def insertData(title, picture, author, content):
  sql = "insert into gzh_article (title,picture,author,content) values ('%s', '%s', '%s', '%s')"
  data = (title, picture, author, content)
  cursor.execute(sql % data)
  conn.commit()
  print("Inserted one row")
  return
  
for row in effect_row:
  # Search Sogou WeChat search for the official account name (row[1])
  newsurl = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + row[1] + '&ie=utf8&_sug_=n&_sug_type_='
  res = requests.get(newsurl, headers=headers)
  res.encoding = 'utf-8'
  soup = BeautifulSoup(res.text, 'html.parser')
  # Follow the first search result to the account's profile page
  url = 'https://weixin.sogou.com' + soup.select('.tit a')[0]['href']
  res2 = requests.get(url, headers=headers)
  res2.encoding = 'utf-8'
  soup2 = BeautifulSoup(res2.text, 'html.parser')
  # The intermediate page builds the real profile URL in JavaScript ("url += '...';")
  pattern = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
  script = soup2.find("script")
  url2 = pattern.search(script.text).group(1)
  res3 = requests.get(url2, headers=headers)
  res3.encoding = 'utf-8'
  soup3 = BeautifulSoup(res3.text, 'html.parser')
  print(url2)  # progress output
  # The article list is embedded as JSON in "var msgList = {...};"
  pattern2 = re.compile(r"var msgList = (.*?);$", re.MULTILINE | re.DOTALL)
  script2 = soup3.find("script", text=pattern2)
  s2 = json.loads(pattern2.search(script2.text).group(1))
  # Wait 10s
  time.sleep(10)

  for news in s2["list"]:
    articleurl = "https://mp.weixin.qq.com" + news["app_msg_ext_info"]["content_url"]
    articleurl = articleurl.replace('&amp;', '&')
    res4 = requests.get(articleurl, headers=headers)
    res4.encoding = 'utf-8'
    soup4 = BeautifulSoup(res4.text, 'html.parser')
    if checkData(news["app_msg_ext_info"]["title"]):
      insertData(news["app_msg_ext_info"]["title"], news["app_msg_ext_info"]["cover"], news["app_msg_ext_info"]["author"], pymysql.escape_string(str(soup4)))
    count += 1
    # Wait 10s
    time.sleep(10)
    # Other articles pushed in the same batch
    for news2 in news["app_msg_ext_info"]["multi_app_msg_item_list"]:
      articleurl2 = "https://mp.weixin.qq.com" + news2["content_url"]
      articleurl2 = articleurl2.replace('&amp;', '&')
      res5 = requests.get(articleurl2, headers=headers)
      res5.encoding = 'utf-8'
      soup5 = BeautifulSoup(res5.text, 'html.parser')
      if checkData(news2["title"]):
        insertData(news2["title"], news2["cover"], news2["author"], pymysql.escape_string(str(soup5)))
      count += 1
      # Wait 10s
      time.sleep(10)
cursor.close()
conn.close()
print("Operation complete.")
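
The loops above rely on the msgList JSON that the account's profile page embeds in a script tag. For reference, here is a minimal sketch of the shape the code expects; the field names are exactly those the script reads, while all values are made-up placeholders and real responses contain many more fields:

s2 = {
  "list": [
    {
      "app_msg_ext_info": {
        "title": "Example headline article",
        "cover": "https://example.com/cover.jpg",     # placeholder cover image URL
        "author": "Example author",
        "content_url": "/s?__biz=...&mid=...",        # relative URL, prefixed with mp.weixin.qq.com
        "multi_app_msg_item_list": [
          {
            "title": "Example secondary article",
            "cover": "https://example.com/cover2.jpg",
            "author": "Example author",
            "content_url": "/s?__biz=...&mid=..."
          }
        ]
      }
    }
  ]
}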

This is the whole content of this article.