SoFunction
Updated on 2024-11-21

Example of python crawling news portals

Project Address:

/Python3Spiders/AllNewsSpider

How to use

The code under each folder is the news crawler for the corresponding platform

  1. A .py file can be run directly.
  2. If only a .pyd file is provided — assume it is named pengpai_news_spider.pyd — it must be imported.

Download the pyd file locally, create a new project, and put the pyd file in it.

Create a new Python file in the project root directory, write the following code into it, and run it to start crawling:

import pengpai_news_spider
pengpai_news_spider.main()

Sample code

Baidu news

# -*- coding: utf-8 -*-
# File Remarks Information If you encounter a situation where you can not open, you can first open the browser to Baidu search engine

import requests

from datetime import datetime, timedelta

from lxml import etree

import csv

import os

from time import sleep
from random import randint


def parseTime(unformatedTime):
    """Normalize a relative Baidu timestamp to 'YYYY-mm-dd HH:MM'.

    :param unformatedTime: a string such as '5Minutes ...' or '2Hours ...'
        (relative), or an already-absolute date string.
    :return: an absolute 'YYYY-mm-dd HH:MM' string; relative times are
        subtracted from the current local time, anything else is returned
        unchanged.
    """
    # The scraped original lost the method calls here; restored as
    # str.find(...) to slice off the leading number, and datetime.now()
    # as the reference point for the relative offset.
    if 'Minutes' in unformatedTime:
        minute = unformatedTime[:unformatedTime.find('Minutes')]
        minute = timedelta(minutes=int(minute))
        return (datetime.now() -
                minute).strftime('%Y-%m-%d %H:%M')
    elif 'Hours' in unformatedTime:
        hour = unformatedTime[:unformatedTime.find('Hours')]
        hour = timedelta(hours=int(hour))
        return (datetime.now() -
                hour).strftime('%Y-%m-%d %H:%M')
    else:
        # Already an absolute date string; pass through untouched.
        return unformatedTime


def dealHtml(html):
    """Extract news results from one parsed Baidu-news page and append them
    to the CSV file named by the module-level global ``fileName``.

    :param html: an lxml element tree (as returned by ``etree.HTML``).
    :return: None; rows are printed and appended to ``fileName``.
    """
    # The scraped original lost every .xpath()/.append()/csv.writer() call;
    # restored below. Selectors match Baidu news result markup — TODO
    # confirm the class names against the live page, they change over time.
    results = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')

    saveData = []

    for result in results:
        title = result.xpath('.//h3/a')[0]
        title = title.xpath('string(.)').strip()

        summary = result.xpath('.//span[@class="c-font-normal c-color-text"]')[0]
        summary = summary.xpath('string(.)').strip()

        # ./ selects direct children only; .// selects any descendant
        infos = result.xpath('.//div[@class="news-source"]')[0]
        source, dateTime = infos.xpath(".//span[last()-1]/text()")[0], \
                           infos.xpath(".//span[last()]/text()")[0]

        dateTime = parseTime(dateTime)

        print('Title', title)
        print('Source', source)
        print('Time', dateTime)
        print('Summary', summary)
        print('\n')

        saveData.append({
            'title': title,
            'source': source,
            'time': dateTime,
            'summary': summary
        })
    # utf-8-sig so Excel opens the CSV with correct encoding; append mode
    # because doSpider() calls this once per result page.
    with open(fileName, 'a+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        for row in saveData:
            writer.writerow([row['title'], row['source'], row['time'], row['summary']])


# Request configuration for Baidu News search. The scraped original lost
# the scheme+host on both URLs; restored to the Baidu search endpoint —
# TODO confirm the Referer query string against a live request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=%B0%D9%B6%C8%D0%C2%CE%C5&fr=zhidao'
}

url = 'https://www.baidu.com/s'

params = {
    'ie': 'utf-8',
    'medium': 0,
    # rtt=4 sorts by time; rtt=1 sorts by focus (relevance)
    'rtt': 1,
    'bsst': 1,
    'rsv_dl': 'news_t_sk',
    'cl': 2,
    'tn': 'news',
    'rsv_bp': 1,
    'oq': '',
    'rsv_btype': 't',
    'f': 8,
}


def doSpider(keyword, sortBy='focus'):
    '''
    Crawl Baidu News search results for a keyword and save them to
    '<keyword>.csv' (header row written on first run, then appended).

    :param keyword: search keyword
    :param sortBy: sorting rule, optional: focus, time, default focus
    :return: None
    '''
    # dealHtml() reads this global to know which CSV to append to.
    global fileName
    fileName = '{}.csv'.format(keyword)

    if not os.path.exists(fileName):
        with open(fileName, 'w+', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'source', 'time', 'summary'])

    params['wd'] = keyword
    if sortBy == 'time':
        params['rtt'] = 4

    response = requests.get(url=url, params=params, headers=headers)

    html = etree.HTML(response.text)

    dealHtml(html)

    # Total-results banner, e.g. a localized "found about 1,230 results"
    # string; the slice [7:-1] strips the surrounding text and keeps the
    # digits — TODO confirm selector and slice against the live page, the
    # original attribute in this xpath was lost in scraping.
    total = html.xpath('//div[@id="header_top_bar"]/span/text()')[0]

    total = total.replace(',', '')

    total = int(total[7:-1])

    pageNum = total // 10

    for page in range(1, pageNum):
        print('Page {}\n\n'.format(page))
        # Use the previous page as Referer so the pagination looks organic.
        headers['Referer'] = response.url
        params['pn'] = page * 10

        response = requests.get(url=url, headers=headers, params=params)

        html = etree.HTML(response.text)

        dealHtml(html)

        # Random delay between pages to avoid being rate-limited.
        sleep(randint(2, 4))


if __name__ == "__main__":
    # Entry point: crawl Baidu News for the sample keyword, relevance-sorted.
    doSpider(keyword='Ma Baoguo', sortBy='focus')

The above covers the details of this Python news-portal crawling example. For more information about crawling news portals with Python, please see my other related articles!