SoFunction
Updated on 2024-12-10

Python based news crawler system implementation

News Crawler

Information Display: tkinter

Crawl and request: requests, BeautifulSoup

Setting up the news list API

Open the Tencent News web page -> right-click and choose "Inspect" (or press F12) -> open the Network tab -> refresh the page.

Then right-click the news-list request and copy its link address; that address is the news list API URL.

Program running effect

File Write Contents

Reference code - two files

Note: set the local file path to a location that exists on your machine!

Data crawling file Myspider_news.py

import requests
from bs4 import BeautifulSoup
class MySpider:
    """Fetch the news list from a JSON API and scrape individual articles.

    NOTE(review): the published listing had its ``object.attribute`` tokens
    stripped by the page extraction. The receivers restored here
    (``self.ulist``, ``requests.get``, ``r.json()``, ``soup.select`` ...) are
    reconstructed from the surrounding code - confirm against the original.
    """

    def __init__(self):
        # Rows of [title, publish_time, url] from the most recent getJSONText().
        self.ulist = []

    def getResponse(self, url):
        """GET *url* with a browser-like user agent and return the response.

        Raises whatever ``requests`` raises on connection problems, and an
        HTTP error (via ``raise_for_status``) on a 4xx/5xx status.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Decode the body with the encoding detected from its content.
        r.encoding = r.apparent_encoding
        return r

    def getJSONText(self, r):
        """Extract ``[title, publish_time, url]`` rows from a list response.

        Reads ``r.json()['data']['list']``; a payload without those keys
        raises ``KeyError``/``TypeError``. The rows are stored on
        ``self.ulist`` and also returned.
        """
        news = r.json()['data']['list']
        ulist = [[n['title'], n['publish_time'], n['url']] for n in news]
        self.ulist = ulist
        return ulist

    def writeFile(self, file=''):
        """Write ``self.ulist`` to *file*, one ``index::title::time::url`` line per row.

        The file is overwritten. The default empty path cannot be opened, so
        callers are expected to pass a real path.
        """
        print("ulist", self.ulist)
        with open(file, "w", encoding='utf-8') as f:
            f.writelines(
                f"{i}::{item[0]}::{item[1]}::{item[2]}\n"
                for i, item in enumerate(self.ulist)
            )

    def getNewsContent(self, r):
        """Return ``(title, body_text)`` parsed from an article page response.

        The body is the text of every matched paragraph, each followed by a
        newline. The title is the text of the first ``<h1>``, or ``""`` when
        the page has none.
        """
        soup = BeautifulSoup(r.text, 'lxml')
        # NOTE(review): the selector was garbled to 'div#ArticleContent>-p' in
        # the published listing; 'p.one-p' is the reconstruction - confirm it
        # against the live page markup.
        paragraphs = soup.select('div#ArticleContent>p.one-p')
        headline = soup.select_one("h1")
        title = headline.get_text() if headline is not None else ""
        data = "".join(p.get_text() + "\n" for p in paragraphs)
        return title, data

Window display file MySpiderGui_news.py

from tkinter import *
from tkinter import messagebox
from Myspider_news import *
class MySpiderGUI_news:
    """Tkinter window that crawls a news list and shows individual articles.

    NOTE(review): the published listing had its ``object.attribute`` tokens
    stripped by the page extraction. ``news_id`` and ``news_text`` survive in
    the listing; the other attribute names (``window``, ``url``, ``num``,
    ``text``, ``file``, ``news_title``) are reconstructions - confirm against
    the original source.
    """

    # NOTE(review): the host part of this URL was missing from the listing;
    # "https://i.news.qq.com" is the reconstructed value - confirm it.
    DEFAULT_URL = ("https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/"
                   "list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext="
                   "{%22pool%22:[%22top%22,%22hot%22],%22is_filter%22:7,%22check_type%22:true}")
    # File the crawled list is written to and later read back from. The
    # listing used a placeholder Windows path; change this to suit your machine.
    NEWS_FILE = "news.txt"
    # Largest number of rows the user may ask for (the API URL requests 20).
    MAX_NEWS = 20
    # Titles longer than this many characters are truncated in the listing.
    TITLE_WIDTH = 15
    # Half-width -> full-width mapping: space becomes U+3000, the other
    # printable ASCII characters are shifted by 65248.
    _B2Q_TABLE = {32: 12288, **{code: code + 65248 for code in range(33, 127)}}

    def __init__(self):
        """Build the main window and enter the Tk main loop (blocks until closed)."""
        self.window = Tk()
        self.window.title("News Crawl")
        Label(self.window, text="Tencent News.", font=("bold.", 26, 'bold')).pack()

        f1 = Frame(self.window)
        f1.pack(fill="both")
        Label(f1, text="Please enter the URL:", font=('Bold', 12)).pack(side="left")
        # The URL is preset; the listing has the URL entry box disabled.
        self.url = StringVar()
        self.url.set(self.DEFAULT_URL)
        # A Label, not an Entry: this is the caption for the count box below.
        Label(f1, text="Number of displays: ", font=('Bold', 12)).pack(side="left")
        self.num = IntVar()
        Entry(f1, textvariable=self.num).pack(side="left")
        Button(f1, text="OK.", command=self.btOK, padx=10).pack(side="left")
        Button(f1, text="Empty.", command=self.btCancel, padx=10).pack(side="left")

        f2 = Frame(self.window)
        f2.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f2, orient="horizontal")
        scrollbary = Scrollbar(f2, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)
        self.text = Text(f2, wrap='none', width=60,
                         xscrollcommand=scrollbarx.set,
                         yscrollcommand=scrollbary.set)
        scrollbarx.config(command=self.text.xview)
        scrollbary.config(command=self.text.yview)
        self.text.pack(fill="both", expand=1)
        Label(f2, text="News id :", font=('Bold', 12)).pack(side="left")
        self.news_id = IntVar()
        Entry(f2, textvariable=self.news_id).pack(side="left")
        Button(f2, text="Show News", command=self.btNews, padx=10).pack(side="left")

        # Path of the last written news list; empty until a crawl succeeds.
        self.file = ""
        self.window.mainloop()

    @staticmethod
    def _header_line():
        """Return the column-header line of the listing (without newline)."""
        tplt = "{0:^5} {1:{3}^18} {2:<10}"
        return tplt.format("Serial number", "News", "Time.", chr(12288))

    def _reset_listing(self):
        """Clear the listing widget and write the header line."""
        self.text.delete("1.0", END)
        self.text.insert('end', self._header_line())
        self.text.insert('end', '\n')

    def _format_row(self, index, title, publish_time):
        """Return one listing row: index, full-width title cell, 10-char time.

        The title cell is always 18 characters: at most ``TITLE_WIDTH``
        full-width characters, then "...", then full-width-space padding.
        """
        if len(title) > self.TITLE_WIDTH:
            cell = self.strB2Q(title[:self.TITLE_WIDTH]) + "..."
        else:
            padding = chr(12288) * (self.TITLE_WIDTH - len(title))
            cell = self.strB2Q(title) + "..." + padding
        tplt = "{0:^5} {1:^18} {2:<10}"
        return tplt.format(str(index), cell, publish_time[:10])

    def btOK(self):
        """Crawl the news list, save it to ``NEWS_FILE`` and display the rows."""
        self._reset_listing()
        try:
            count = self.num.get()
        except TclError:
            # IntVar.get() fails when the entry does not hold an integer.
            messagebox.showerror("Error.", "Please enter a whole number of news items.")
            return
        if count > self.MAX_NEWS:
            messagebox.showerror("Error.", "Too many news items entered.")
            return

        messagebox.showinfo("Tip.", "Start crawling.")
        spider = MySpider()
        ulist = []
        try:
            r = spider.getResponse(self.url.get())
            ulist = spider.getJSONText(r)
            self.file = self.NEWS_FILE
            spider.writeFile(self.file)
        except Exception as ex:
            # Best effort, as in the listing: report and show what we have.
            print("Program error:", ex)

        # Slice rather than index so a short or empty result cannot raise.
        shown = ulist[:max(count, 0)]
        for i, row in enumerate(shown):
            self.text.insert('end', self._format_row(i, row[0], row[1]))
            self.text.insert('end', "\n")
            self.text.insert('end', "\n")
        self.text.insert('end', "Shared records" + str(len(shown)) + "Article.")
        self.text.insert('end', '\n')

    def btCancel(self):
        """Reset the requested count to 0 and clear the listing."""
        self.num.set(0)
        self._reset_listing()

    def btNews(self):
        """Open a child window and show the article selected by ``news_id``."""
        # Toplevel, not a second Tk(): one root and one main loop per program.
        root = Toplevel(self.window)
        root.title("Show News")
        self.news_title = Label(root, text=" ", font=('Bold', 22, 'bold'))
        self.news_title.pack()
        f1 = Frame(root)
        f1.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f1, orient="horizontal")
        scrollbary = Scrollbar(f1, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)
        self.news_text = Text(f1, wrap="none", width=60, height=10,
                              xscrollcommand=scrollbarx.set,
                              yscrollcommand=scrollbary.set)
        scrollbarx.config(command=self.news_text.xview)
        scrollbary.config(command=self.news_text.yview)
        self.news_text.pack(fill="both", expand=1)
        Button(f1, text="Close the window.", command=root.destroy, padx=10).pack()
        self.displayNews()

    def displayNews(self):
        """Fetch the article whose id is in ``news_id`` and show it in the child window.

        The id is the 0-based line number in the file written by ``btOK``.
        Expects ``btNews`` to have created ``news_title`` and ``news_text``.
        """
        try:
            index = self.news_id.get()
            with open(self.file, "r", encoding='utf-8') as f:
                lines = f.readlines()
        except (OSError, TclError) as ex:
            # No list has been crawled yet, or the id box is not an integer.
            print("Program error:", ex)
            messagebox.showerror("Error.", "Crawl the news list and enter a valid news id first.")
            return
        if not 0 <= index < len(lines):
            messagebox.showerror("Error.", "No news item with that id.")
            return

        # Lines are "index::title::time::url\n"; the URL is the last field.
        # Strip the newline so it is not sent as part of the request URL.
        news_url = lines[index].rstrip("\n").rsplit("::", 1)[-1]
        title = ""
        content = ""
        newsSpider = MySpider()
        try:
            r = newsSpider.getResponse(news_url)
            title, content = newsSpider.getNewsContent(r)
        except Exception as ex:
            print("Program error:", ex)
        self.news_title["text"] = title
        self.news_text.insert('end', "Title: " + title)
        self.news_text.insert('end', "\n")
        self.news_text.insert('end', "Contents:")
        self.news_text.insert('end', content)
        self.news_text.insert('end', "\n")

    def strB2Q(self, ustring):
        """Convert half-width ASCII in *ustring* to full-width characters.

        A space becomes U+3000, code points 33-126 are shifted by 65248, and
        every other character is returned unchanged.
        """
        return ustring.translate(self._B2Q_TABLE)
if __name__ == "__main__":
    # Launch the window only when run as a script, not when imported.
    MySpiderGUI_news()

Note: set the local file path to a location that exists on your machine!

That concludes this detailed walkthrough of implementing a news crawling system in Python. For more on crawling news with Python, see my other related articles.