News Crawler
Information Display: tkinter
Crawl and request: requests, BeautifulSoup
Setting up the news list API
Open the Tencent News web page → right-click and choose Inspect (or press F12) → switch to the Network tab → refresh the page.
Then right-click the list request in the Network tab and copy its link address — that address is the news list API URL.
Program running effect
File Write Contents
Refer to the coding section - two files
Note the setting of the local file path!!!!
Data crawling file Myspider_news.py
import requests from bs4 import BeautifulSoup class MySpider: def __init__(self): = [] def getResponse(self, url): headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54'} r = (url, timeout=30, headers=headers) r.raise_for_status() = r.apparent_encoding return r def getJSONText(self, r): ulist = [] data = () news = data['data']['list'] for n in news: title = n['title'] publish_time = n['publish_time'] href = n['url'] ([title, publish_time, href]) = ulist return ulist def writeFile(self, file=''): print("ulist", ) with open(file, "w", encoding='utf-8') as f: for i, item in enumerate(): (f"{i}::{item[0]}::{item[1]}::{item[2]}\n") def getNewsContent(self, r): data = '' soup = BeautifulSoup(, 'lxml') datas = ('div#ArticleContent>-p') title = ("h1")[0].get_text() for d in datas: data += d.get_text() + "\n" return title, data
Window display file MySpiderGui_news.py
from tkinter import * from tkinter import messagebox from Myspider_news import * class MySpiderGUI_news: def __init__(self): = Tk() ("News Crawl") Label(, text="Tencent News.", font=("bold.", 26, 'bold')).pack() f1 = Frame() (fill="both") Label(f1, text="Please enter the URL:", font=('Bold', 12)).pack(side="left") = StringVar() # ("") # Entry(f1, textvariable=).pack(side="left", fill="x", expand=1) ("/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/" "list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext=" "{%22pool%22:[%22top%22,%22hot%22],%22is_filter%22:7,%22check_type%22:true}") Entry(f1, text="Number of displays: ", font=('Bold', 12)).pack(side="left") = IntVar() Entry(f1, textvariable=).pack(side="left") Button(f1, text="OK.", command=, padx=10).pack(side="left") Button(f1, text="Empty.", command=, padx=10).pack(side="left") f2 = Frame() (fill="both", expand=1) scrollbarx = Scrollbar(f2, orient="horizontal") scrollbary = Scrollbar(f2, orient="vertical") (side="bottom", fill=X) (side="right", fill=Y) = Text(f2, wrap='none', width=60, xscrollcommand=, yscrollcommand=) (command=) (command=) (fill="both", expand=1) Label(f2, text="News id :", font=('Bold', 12)).pack(side="left") self.news_id = IntVar() Entry(f2, textvariable=self.news_id).pack(side="left") Button(f2, text="Show News", command=, padx=10).pack(side="left") = "" () def btOK(self): (1.0, END) # tplt = "{0:^5} {1:{3}^18} {2:< 10}" tplt = "{0:^5} {1:{3}^18} {2:<10}" ('end', ("Serial number", "News", "Time.", chr(12288))) ('end', '\n') if () > 20: ("Error.", "Too many news items entered.") return ulist = [] ("Tip.", "Start crawling.") url = () spider = MySpider() try: r = (url) ulist = (r) = r"G:\(Your local file path)test-file\" () except Exception as ex: print("Program error:", ex) for i in range(()): # print(()) u = ulist[i] if len(u[0]) > 15: u[0] = u[0][:15] u[0] = self.strB2Q(u[0]) u[0] = u[0] + "..." else: u[0] = self.strB2Q(u[0]) u[0] = u[0] + "..." 
+ chr(12288) * (15 - len(u[0])) if len(u[1]) > 10: u[1] = u[1][:10] # print(u[1]) tplt = "{0:^5} {1:^18} {2:<10}" ('end', (str(i), u[0], u[1])) ('end', "\n") ('end', "\n") ('end', "Shared records" + str(()) + "Article.") ('end', '\n') def btCancel(self): (0) (1.0, END) tplt = "{0:^2} {1:{3}^18} {2:<10}" ("end", ("Serial number", "News.", "Time.", chr(1288))) ('end', '\n') def btNews(self): root = Tk() ("Show News") = Label(root, text=" ", font=('Bold', 22, 'bold')) () f1 = Frame(root) (fill="both", expand=1) scrollbarx = Scrollbar(f1, orient="horizontal") scrollbary = Scrollbar(f1, orient="vertical") (side="bottom", fill=X) (side="right", fill=Y) self.news_text = Text(f1, wrap="none", width=60, height=10, xscrollcommand=, yscrollcommand=) (command=) (command=) self.news_text.pack(fill="both", expand=1) Button(f1, text="Close the window.", command=, padx=10).pack() () () def displayNews(self): f = open(, "r", encoding='utf-8') datas = ()[self.news_id.get()] # Read a specific line data = ("::", 4) news_url = data[3] title = "" content = "" newsSpider = MySpider() try: r = (news_url) title, content = (r) except Exception as ex: print("Program error:", ex) ["text"] = title self.news_text.insert('end', "Title: " + title) self.news_text.insert('end', "\n") self.news_text.insert('end', "Contents:") self.news_text.insert('end', content) self.news_text.insert('end', "n") def strB2Q(self, ustring): rstring = "" for uchar in ustring: inside_code = ord(uchar) if inside_code == 32: # Space conversion inside_code = 12288 elif 32 <= inside_code <= 126: # Half angle range inside_code += 65248 rstring += chr(inside_code) return rstring MySpiderGUI_news()
Note the setting of the local file path!!!!
The above is a detailed walkthrough of implementing a news crawling system in Python. For more on crawling news with Python, please see my other related articles!