Project Address:
/Python3Spiders/AllNewsSpider
How to use
The code in each folder is the news crawler for the corresponding platform.
- A .py file can be run directly.
- A .pyd file has to be imported from Python; suppose it is pengpai_news_spider.pyd.
Download the pyd file locally, create a new project, and put the pyd file in it.
Then create a new py file in the project root, write the following code into it, and run it to start crawling:
import pengpai_news_spider

pengpai_news_spider.main()
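Note that a .pyd file is a compiled CPython extension module for Windows, so it only imports on Windows under a Python version matching the one it was built for. The plain .py spiders need no wrapper project; assuming the Baidu example below is saved as baidu_news_spider.py (a hypothetical file name), it can be launched directly from the command line:

python baidu_news_spider.py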
Sample code
Baidu News
# -*- coding: utf-8 -*-
# Note: if the requests below cannot open the page, first open the Baidu
# search engine in a browser, then retry.

import csv
import os
from datetime import datetime, timedelta
from random import randint
from time import sleep

import requests
from lxml import etree


def parseTime(unformatedTime):
    # Baidu returns relative times such as "5分钟前" ("5 minutes ago")
    # and "3小时前" ("3 hours ago"); convert them to absolute timestamps.
    if '分钟' in unformatedTime:
        minute = unformatedTime[:unformatedTime.find('分钟')]
        minute = timedelta(minutes=int(minute))
        return (datetime.now() - minute).strftime('%Y-%m-%d %H:%M')
    elif '小时' in unformatedTime:
        hour = unformatedTime[:unformatedTime.find('小时')]
        hour = timedelta(hours=int(hour))
        return (datetime.now() - hour).strftime('%Y-%m-%d %H:%M')
    else:
        return unformatedTime


def dealHtml(html):
    results = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')

    saveData = []

    for result in results:
        title = result.xpath('.//h3/a')[0]
        title = title.xpath('string(.)').strip()

        summary = result.xpath('.//span[@class="c-font-normal c-color-text"]')[0]
        summary = summary.xpath('string(.)').strip()

        # ./ matches direct children only; .// matches direct and indirect descendants
        infos = result.xpath('.//div[@class="news-source"]')[0]
        source, dateTime = infos.xpath('.//span[last()-1]/text()')[0], \
                           infos.xpath('.//span[last()]/text()')[0]

        dateTime = parseTime(dateTime)

        print('title', title)
        print('source', source)
        print('time', dateTime)
        print('summary', summary)
        print('\n')

        saveData.append({
            'title': title,
            'source': source,
            'time': dateTime,
            'summary': summary
        })

    with open(fileName, 'a+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        for row in saveData:
            writer.writerow([row['title'], row['source'], row['time'], row['summary']])


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=%B0%D9%B6%C8%D0%C2%CE%C5&fr=zhidao'
}

url = 'https://www.baidu.com/s'

params = {
    'ie': 'utf-8',
    'medium': 0,
    # rtt=4: sort by time; rtt=1: sort by focus (relevance)
    'rtt': 1,
    'bsst': 1,
    'rsv_dl': 'news_t_sk',
    'cl': 2,
    'tn': 'news',
    'rsv_bp': 1,
    'oq': '',
    'rsv_btype': 't',
    'f': 8,
}


def doSpider(keyword, sortBy='focus'):
    '''
    :param keyword: search keyword
    :param sortBy: sorting rule, one of 'focus' or 'time'; default 'focus'
    :return: None
    '''
    global fileName
    fileName = '{}.csv'.format(keyword)

    # Write the CSV header once, on first run.
    if not os.path.exists(fileName):
        with open(fileName, 'w+', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'source', 'time', 'summary'])

    params['wd'] = keyword
    if sortBy == 'time':
        params['rtt'] = 4

    response = requests.get(url=url, params=params, headers=headers)

    html = etree.HTML(response.text)

    dealHtml(html)

    # Read the total result count from the bar above the results; the text
    # reads like "找到相关资讯约1,230篇", so the slice keeps only the digits.
    # Adjust the id if Baidu changes the page layout.
    total = html.xpath('//div[@id="header_top_bar"]/span/text()')[0]
    total = total.replace(',', '')
    total = int(total[7:-1])

    pageNum = total // 10

    for page in range(1, pageNum):
        print('Page {}\n\n'.format(page))

        headers['Referer'] = response.url
        params['pn'] = page * 10

        response = requests.get(url=url, headers=headers, params=params)

        html = etree.HTML(response.text)
        dealHtml(html)

        sleep(randint(2, 4))


if __name__ == "__main__":
    doSpider(keyword='Ma Baoguo', sortBy='focus')
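To verify the output, the CSV can be read back with the standard csv module. A minimal sketch, assuming doSpider has already run with keyword 'Ma Baoguo' and the resulting 'Ma Baoguo.csv' sits next to the script:

import csv

# Sanity check: print the first few scraped records as dictionaries.
with open('Ma Baoguo.csv', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)   # ['title', 'source', 'time', 'summary']
    for i, row in enumerate(reader):
        if i >= 5:          # show only the first five records
            break
        print(dict(zip(header, row)))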
This concludes the detailed example of crawling a news portal with Python. For more on crawling news sites with Python, please see my other related articles!