This article will use two popular libraries, requests and BeautifulSoup.
1. Preparation
First install the necessary libraries:
pip install requests beautifulsoup4
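
Before building the full crawler, it helps to see the bare fetch-and-parse pattern that requests and BeautifulSoup provide. The sketch below is illustrative only: it targets https://example.com as a placeholder page rather than the CSDN endpoints used later.

import requests
from bs4 import BeautifulSoup

# Fetch a page and parse its HTML (example.com is just a placeholder URL)
response = requests.get("https://example.com", timeout=10)
response.raise_for_status()                  # raise an error for 4xx/5xx responses
soup = BeautifulSoup(response.text, 'html.parser')

# Extract data with tag/class selectors, the same pattern the crawler below relies on
print(soup.title.get_text())                 # the page <title>
for link in soup.find_all('a'):
    print(link.get('href'))                  # every hyperlink on the page

The crawler in the next section applies exactly this pattern, just with search parameters, CSS class selectors and delays added.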
2. Basic crawler implementation
import requests
from bs4 import BeautifulSoup
import time
import random


def get_csdn_articles(keyword, pages=1):
    """
    Crawl articles matching a keyword on CSDN
    :param keyword: search keyword
    :param pages: number of pages to crawl
    :return: list of articles with title, link, description and other info
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    base_url = "https://so.csdn.net/so/search"  # CSDN search endpoint
    articles = []

    for page in range(1, pages + 1):
        params = {
            'q': keyword,
            't': 'blog',
            'p': page
        }
        try:
            response = requests.get(base_url, headers=headers, params=params)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            items = soup.find_all('div', class_='search-item')

            for item in items:
                title_tag = item.find('a', class_='title')
                if not title_tag:
                    continue
                title = title_tag.get_text().strip()
                link = title_tag['href']

                # Get the description
                desc_tag = item.find('p', class_='content')
                description = desc_tag.get_text().strip() if desc_tag else 'No description'

                # Get the view count and publish time
                info_tags = item.find_all('span', class_='date')
                read_count = info_tags[0].get_text().strip() if len(info_tags) > 0 else 'unknown'
                publish_time = info_tags[1].get_text().strip() if len(info_tags) > 1 else 'unknown'

                articles.append({
                    'title': title,
                    'link': link,
                    'description': description,
                    'read_count': read_count,
                    'publish_time': publish_time
                })

            print(f"Crawled page {page}, {len(items)} articles in total")

            # Random delay to avoid being blocked
            time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"Error while crawling page {page}: {e}")
            continue

    return articles


if __name__ == '__main__':
    # Example: crawl the first 3 pages of articles about "Python Crawler"
    keyword = "Python Crawler"
    results = get_csdn_articles(keyword, pages=3)
    for article in results:
        print(article['title'], article['link'])
3. Advanced feature extensions
3.1 Crawl article details
def get_article_detail(url):
    """Crawl the detail page of an article"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content of the article
        content = soup.find('article')
        if content:
            # Remove unnecessary tags
            for tag in content.find_all(['script', 'style', 'iframe', 'nav', 'footer']):
                tag.decompose()
            return content.get_text().strip()
        return "Unable to obtain article content"
    except Exception as e:
        print(f"Error while crawling article details: {e}")
        return None
3.2 Save data to file
import json
import csv


def save_to_json(data, filename):
    """Save data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def save_to_csv(data, filename):
    """Save data to a CSV file"""
    if not data:
        return
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(data)
4. Complete example
if __name__ == '__main__':
    # Crawl the article list
    keyword = "Python Crawler"
    articles = get_csdn_articles(keyword, pages=2)

    # Crawl the details of the first 3 articles
    for article in articles[:3]:
        article['content'] = get_article_detail(article['link'])
        time.sleep(random.uniform(1, 2))  # delay between detail requests

    # Save the data
    save_to_json(articles, 'csdn_articles.json')
    save_to_csv(articles, 'csdn_articles.csv')
    print("Data crawling completed and saved!")
5. Dealing with anti-crawler measures
1. Set request headers: simulate browser access
2. Use random delays: avoid sending requests too frequently
3. Use proxy IPs: prevent your IP from being blocked (see the sketch after this list)
4. Handle CAPTCHAs: manual intervention may be required
5. Comply with robots.txt: respect the website's crawling rules
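
As a rough illustration of points 1-3, the sketch below wraps requests.get with a rotating User-Agent, an optional proxy and a randomized retry delay. The proxy address and the user-agent list are placeholders rather than working values; this is one possible pattern, not part of the crawler above.

import random
import time
import requests

# Illustrative values only: replace with real proxies / user-agent strings
PROXY_POOL = [
    {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'},  # placeholder proxy
]
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]


def polite_get(url, retries=3, **kwargs):
    """GET with a random User-Agent, a proxy from the pool and a random delay between attempts."""
    for attempt in range(retries):
        headers = {'User-Agent': random.choice(USER_AGENTS)}   # point 1: browser-like headers
        proxies = random.choice(PROXY_POOL)                    # point 3: rotate proxy IPs
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        time.sleep(random.uniform(1, 3))                       # point 2: random delay
    return None

If blocking becomes an issue, the requests.get(...) calls in the earlier functions could be routed through a helper like this one.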
This concludes the article on how to quickly and accurately scrape web page data with Python. For more related content on scraping web data with Python, please search my previous articles or continue browsing the related articles below. I hope you will continue to support me!