Build a crawler example step by step to scrape the jokes on Qiushibaike
First, parsing without the BeautifulSoup package
First step: access the URL and grab the page source
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # Visit the URL and grab the source code
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:  # HTTPError is a subclass of URLError, so catch it first
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    print content.decode('utf-8')
Second step: extract the information with regular expressions
First, inspect the page source to find the content you need and how it can be identified.
Then write a regular expression to match and extract it.
Note that . in a regular expression cannot match \n by default, so you need to set the re.S flag when compiling the pattern.
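A minimal standalone illustration of the difference (my own example, not from the original post):

import re

text = '<span>line one\nline two</span>'
# Without re.S, '.' stops at the newline, so nothing matches
print re.findall('<span>(.*?)</span>', text)        # prints []
# With re.S (DOTALL), '.' matches '\n' too
print re.findall('<span>(.*?)</span>', text, re.S)  # prints ['line one\nline two']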
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # Visit the URL and grab the source code
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()

    # Extract the data; re.S lets '.' match newlines as well
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)

    for item in items:
        print item
Third step: clean the data and save it to files
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 21:41:32

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # Visit the URL and grab the source code
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()

    # Extract the data; re.S lets '.' match newlines as well
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)

    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)

    count = 1
    for item in items:
        # Clean the data: strip \n, then turn <br/> into real newlines
        item = item.replace('\n', '').replace('<br/>', '\n')
        filepath = path + '/' + str(count) + '.txt'
        f = open(filepath, 'w')
        f.write(item)
        f.close()
        count += 1
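As a side note, the same save loop can be written with the with statement, which guarantees the file is closed even if write() raises. A minimal self-contained sketch, using dummy data in place of the regex results:

import os

path = './qiubai'
if not os.path.exists(path):
    os.makedirs(path)

items = ['first joke<br/>second line', 'another joke']  # dummy stand-ins for the scraped posts
count = 1
for item in items:
    item = item.replace('\n', '').replace('<br/>', '\n')
    filepath = path + '/' + str(count) + '.txt'
    # 'with' closes the file automatically, even if write() fails
    with open(filepath, 'w') as f:
        f.write(item)
    count += 1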
Fourth step: crawl the content across multiple pages
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # Visit the URLs and grab the source code
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

    count = 1
    for cnt in range(1, 35):
        print 'Round ' + str(cnt)
        url = 'http://www.qiushibaike.com/textnew/page/' + str(cnt) + '/?s=4941357'
        try:
            request = urllib2.Request(url=url, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except urllib2.HTTPError as e:
            print e
            exit()
        except urllib2.URLError as e:
            print e
            exit()
        # print content

        # Extract the data; re.S lets '.' match newlines as well
        items = re.findall(regex, content)

        # Save the results
        for item in items:
            # print item
            # Clean the data: strip \n, then turn <br/> into real newlines
            item = item.replace('\n', '').replace('<br/>', '\n')
            filepath = path + '/' + str(count) + '.txt'
            f = open(filepath, 'w')
            f.write(item)
            f.close()
            count += 1
    print 'Done'
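One refinement worth considering that the code above does not include: pausing between page requests so the crawler does not hammer the server. A minimal sketch, assuming a one-second delay is acceptable:

import time
import urllib2

headers = {'User-Agent': 'Mozilla/5.0'}
for cnt in range(1, 4):  # just a few pages for illustration
    url = 'http://www.qiushibaike.com/textnew/page/' + str(cnt) + '/?s=4941357'
    try:
        content = urllib2.urlopen(urllib2.Request(url=url, headers=headers)).read()
        print 'Fetched page %d (%d bytes)' % (cnt, len(content))
    except urllib2.URLError as e:
        print e
        break
    time.sleep(1)  # be polite: wait one second between requests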
Parsing the source code with BeautifulSoup
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 21:34:02

import urllib
import urllib2
import re
import os
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    # print response.read()

    # BeautifulSoup accepts the file-like response object directly
    soup_packetpage = BeautifulSoup(response, 'lxml')
    items = soup_packetpage.find_all("div", class_="content")

    for item in items:
        try:
            # The post text sits in the <span> inside each div.content
            content = item.span.string
        except AttributeError as e:
            print e
            exit()
        if content:
            print content + "\n"
Here is code that uses BeautifulSoup to grab book titles and their prices from Packt Publishing's catalog page.
It also makes for a comparison of how bs4 reads tags versus tag contents; see the sketch below.
(Since I haven't learned this part properly myself yet, I can only write it from scratch for now.)
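Before the Packt code, a small self-contained sketch of that tag-versus-contents distinction (my own illustration, using a made-up HTML snippet):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="content"><span>hello<br/>world</span></div>', 'lxml')
tag = soup.find('div', class_='content')
print tag.name             # 'div'        -- the tag itself
print tag['class']         # ['content']  -- attribute access on the tag
print tag.span.string      # None         -- .string is None when a tag has several children
print tag.span.get_text()  # 'helloworld' -- get_text() joins all text nodes inside the tag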
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 20:37:38
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 21:27:30

import urllib2
import urllib
import re
from bs4 import BeautifulSoup

url = "https://www.packtpub.com/all"
try:
    html = urllib2.urlopen(url)
except urllib2.URLError as e:
    print e
    exit()

soup_packtpage = BeautifulSoup(html, 'lxml')
all_book_title = soup_packtpage.find_all("div", class_="book-block-title")
price_regexp = re.compile(u"\s+\$\s\d+\.\d+")  # matches prices like " $ 10.00"

for book_title in all_book_title:
    try:
        print "Book's name is " + book_title.string.strip()
    except AttributeError as e:
        print e
        exit()
    # find_next() searches forward in document order for the first matching text node
    book_price = book_title.find_next(text=price_regexp)
    try:
        print "Book's price is " + book_price.strip()
    except AttributeError as e:
        print e
        exit()
    print ""
That is all the content of this post.