Analysis
This article uses CrawlSpider in combination with LinkExtractor and Rule to crawl job information from the web.
LinkExtractor defines the link extraction rules; in most cases only the allow parameter is needed.
LinkExtractor(
    allow=(),                 # regular expressions the extracted URLs must match
    deny=(),                  # regular expressions to exclude URLs
    allow_domains=(),         # domains to restrict extraction to
    deny_domains=(),          # domains to exclude
    restrict_xpaths=(),       # XPath expressions selecting the regions to extract links from
    tags=('a', 'area'),       # tags to scan for links
    attrs=('href',),          # attributes to read URLs from
    canonicalize=False,
    unique=True,
    process_value=None,
    deny_extensions=None,
    restrict_css=(),          # CSS selectors selecting the regions to extract links from
    strip=True,
)
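A LinkExtractor can also be exercised on its own, which is handy for debugging extraction rules before wiring them into a spider. Below is a minimal sketch; the helper name debug_pagination_links is hypothetical, and the page=\d+ pattern mirrors the pagination rule used later in this article:

from scrapy.linkextractors import LinkExtractor

def debug_pagination_links(response):
    """Print every link whose URL carries a page=<number> query parameter."""
    extractor = LinkExtractor(allow=(r'page=\d+',))
    for link in extractor.extract_links(response):
        # extract_links() returns scrapy.link.Link objects with .url and .text.
        print(link.url, link.text)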
Rule defines a crawling rule for CrawlSpider. The spider applies each rule automatically: it submits requests for the matched links, receives the responses, and hands each response to the method named by callback for processing.
If callback is specified, the parameter follow defaults to False; if callback is None, follow defaults to True.
Rule(
    link_extractor,           # LinkExtractor object; the only mandatory parameter
    callback=None,            # callback method; optional
    cb_kwargs=None,           # extra keyword arguments passed to the callback
    follow=None,              # whether to keep following links from matched pages (True/False)
    process_links=None,       # callable for filtering or fixing the extracted links
                              # (useful against anti-crawling tactics that serve fake URLs)
    process_request=identity, # callable applied to every request generated by this rule
)
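As an illustration of process_links, a rule can route all extracted links through a filter before any requests are issued. The sketch below is hypothetical (the names fix_links, ExampleSpider, and the 'honeypot' check are made up for illustration); it also shows the follow default in action:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

def fix_links(links):
    # Hypothetical filter: drop trap links a site might serve only to crawlers.
    return [link for link in links if 'honeypot' not in link.url]

class ExampleSpider(CrawlSpider):
    name = 'example'
    start_urls = ['https://example.com/?page=1']
    rules = [
        # callback is given, so follow defaults to False: matched pages are
        # parsed by parse_page, but links on them are not followed further.
        Rule(LinkExtractor(allow=(r'page=\d+',)),
             callback='parse_page',
             process_links=fix_links),
    ]

    def parse_page(self, response):
        pass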
Source code
items.py

# -*- coding:utf-8 -*-
import scrapy


class BosszhipinItem(scrapy.Item):
    """Boss Zhipin Python job crawler Item."""
    position = scrapy.Field()   # position title
    company = scrapy.Field()    # company name
    salary = scrapy.Field()     # salary
    location = scrapy.Field()   # duty station
    education = scrapy.Field()  # academic requirements
    year = scrapy.Field()       # years of working experience
spiders/bosszhipin_spider.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import BosszhipinItem  # import path assumed; adjust to your project layout


class BosszhipinSpider(CrawlSpider):
    """
    Boss Zhipin Python job crawler Spider.
    Implemented on top of the CrawlSpider base class.
    """
    name = 'bosszhipin'
    allowed_domains = ['www.zhipin.com']
    start_urls = ['https://www.zhipin.com/c100010000/h_100010000/?query=Python&page=1']

    # Link extractor object (specifies the link extraction rules).
    link_extractor = LinkExtractor(allow=(r'page=\d+',))

    # List of crawling rule objects.
    # The method named by callback is called automatically for every URL matched by
    # the link extraction rules of link_extractor.
    # Under the hood, link_extractor.extract_links(response) returns the matched links.
    rules = [
        Rule(link_extractor, callback='parse_page', follow=True),
    ]

    def parse_page(self, response):
        """Callback method that parses each response object."""
        job_list = response.xpath('//div[@class="job-list"]//li')
        for job in job_list:
            position = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]/a/text()')[0].extract()
            salary = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]//span/text()')[0].extract()
            company = job.xpath('.//div[@class="company-text"]//a/text()')[0].extract()
            location = job.xpath('.//div[@class="info-primary"]/p/text()[1]')[0].extract()
            year = job.xpath('.//div[@class="info-primary"]/p/text()[2]')[0].extract()
            education = job.xpath('.//div[@class="info-primary"]/p/text()[3]')[0].extract()

            item = BosszhipinItem()
            item['position'] = position
            item['salary'] = salary
            item['company'] = company
            item['location'] = location
            item['year'] = year
            item['education'] = education
            yield item
pipelines.py

# -*- coding:utf-8 -*-
import json


class BosszhipinPipeline(object):
    """Boss Zhipin Python job crawler item pipeline."""

    def __init__(self):
        self.file = open('data/bosszhipin.json', mode='wb')  # output filename is illustrative
        self.file.write(b'[')
        self.first_item = True

    def process_item(self, item, spider):
        if not self.first_item:
            # Write the separator before each item after the first, so the
            # array never ends with a trailing comma (which breaks the JSON).
            self.file.write(b',')
        self.first_item = False
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        self.file.write(data.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.file.write(b']')
        self.file.close()
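As a design note: the hand-rolled bracket and comma bookkeeping above is only needed because the items are streamed into a single JSON array. Scrapy's built-in feed export (scrapy crawl bosszhipin -o jobs.json) produces the same kind of output without a custom pipeline, so writing your own pipeline mainly pays off when you need custom formatting or post-processing.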
settings.py

ITEM_PIPELINES = {
    # Use your own project's package name; 'bosszhipin' here is illustrative.
    'bosszhipin.pipelines.BosszhipinPipeline': 1,
}
Running result
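To reproduce the run, start the spider from the project root with Scrapy's standard command (note that the pipeline assumes the data/ output directory already exists):

scrapy crawl bosszhipin

The scraped positions are then written as a JSON array to the file opened by the pipeline.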
Summary
That is the entire content of this article. I hope it offers some reference value for your study or work; thank you for your support.