
Python crawler package BeautifulSoup example (III)

Build a crawler step by step to scrape the jokes from Qiushibaike

First, parsing without the BeautifulSoup package

Step 1: Access the URL and grab the source code

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 16:16:08
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
  # Visit the URL and grab the source code
  url = '/textnew/page/1/?s=4941357'
  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
  headers = {'User-Agent':user_agent}
  try:
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read()
  except urllib2.HTTPError as e:
    print e
    exit()
  except urllib2.URLError as e:
    print e
    exit()
  print content.decode('utf-8')
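
These scripts are written for Python 2 (urllib2 and print statements). For reference only, here is a minimal sketch of the same fetch under Python 3, where urllib2's functionality moved into urllib.request and urllib.error; the relative URL is kept as above, so you would need to prepend the site's domain before running it:

# Python 3 sketch of the fetch above (my port, not from the original post)
import urllib.request
import urllib.error

url = '/textnew/page/1/?s=4941357'  # relative path as in the original; prepend the domain to run
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
headers = {'User-Agent': user_agent}
try:
  request = urllib.request.Request(url=url, headers=headers)
  response = urllib.request.urlopen(request)
  content = response.read()
except urllib.error.URLError as e:  # HTTPError is a subclass of URLError
  print(e)
  raise SystemExit
print(content.decode('utf-8'))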

Step 2: Extract the information with regular expressions

First, inspect the page source to find the content you need and the markup that identifies it.
Then use a regular expression to extract it.
Note that . in a regular expression cannot match \n, so you need to set the re.S (DOTALL) flag.
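
As a quick standalone illustration of the re.S flag (my own minimal example, not part of the crawler):

import re

html = '<div class="content">\n<span>joke text</span>\n</div>'
# Without re.S, . stops at \n, so the pattern cannot cross line breaks
print re.findall('<span>(.*?)</span>.*?</div>', html)        # []
# With re.S, . also matches \n and the capture succeeds
print re.findall('<span>(.*?)</span>.*?</div>', html, re.S)  # ['joke text']

The full step-2 script below applies the same flag to the page source.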

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 16:16:08
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
  # Visit the URL and grab the source code
  url = '/textnew/page/1/?s=4941357'
  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
  headers = {'User-Agent':user_agent}
  try:
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read()
  except urllib2.HTTPError as e:
    print e
    exit()
  except urllib2.URLError as e:
    print e
    exit()

  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
  items = re.findall(regex, content)

  # Extract data
  # Note the line breaks: re.S lets . match newlines
  for item in items:
    print item

Step 3: Clean the data and save it to files

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 16:16:08
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 21:41:32

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
  # Visit the URL and grab the source code
  url = '/textnew/page/1/?s=4941357'
  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
  headers = {'User-Agent':user_agent}
  try:
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read()
  except urllib2.HTTPError as e:
    print e
    exit()
  except urllib2.URLError as e:
    print e
    exit()

  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
  items = re.findall(regex, content)

  # Extract data
  # Note the line breaks: re.S lets . match newlines
  path = './qiubai'
  if not os.path.exists(path):
    os.makedirs(path)
  count = 1
  for item in items:
    # Clean the data: remove \n, replace <br/> with \n
    item = item.replace('\n', '').replace('<br/>', '\n')
    filepath = path + '/' + str(count) + '.txt'
    f = open(filepath, 'w')
    f.write(item)
    f.close()
    count += 1
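
One caveat worth adding (my note, not from the original): in Python 2, open(filepath, 'w') writes raw bytes, so writing unicode text can raise UnicodeEncodeError. If you decode the page first, the standard codecs module writes UTF-8 explicitly; a sketch reusing filepath and item from above:

import codecs

# Write the text as UTF-8 explicitly (assumes item is a unicode string)
f = codecs.open(filepath, 'w', encoding='utf-8')
f.write(item)
f.close()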

Step 4: Crawl the content across multiple pages

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 16:16:08
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 20:17:13

import urllib
import urllib2
import re
import os

if __name__ == '__main__':
  # Visit the URL and grab the source code
  path = './qiubai'
  if not os.path.exists(path):
    os.makedirs(path)
  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
  headers = {'User-Agent':user_agent}
  regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
  count = 1
  for cnt in range(1, 35):
    print 'Round ' + str(cnt)
    url = '/textnew/page/' + str(cnt) + '/?s=4941357'
    try:
      request = urllib2.Request(url=url, headers=headers)
      response = urllib2.urlopen(request)
      content = response.read()
    except urllib2.HTTPError as e:
      print e
      exit()
    except urllib2.URLError as e:
      print e
      exit()
    # print content

    # Extract data
    # Note the line breaks: re.S lets . match newlines
    items = re.findall(regex, content)

    # Save the data
    for item in items:
      # print item
      # Clean the data: remove \n, replace <br/> with \n
      item = item.replace('\n', '').replace('<br/>', '\n')
      filepath = path + '/' + str(count) + '.txt'
      f = open(filepath, 'w')
      f.write(item)
      f.close()
      count += 1

  print 'Done'

Parsing the source code with BeautifulSoup

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 16:16:08
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 21:34:02

import urllib
import urllib2
import re
import os
from bs4 import BeautifulSoup

if __name__ == '__main__':
  url = '/textnew/page/1/?s=4941357'
  user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
  headers = {'User-Agent':user_agent}
  request = urllib2.Request(url=url, headers=headers)
  response = urllib2.urlopen(request)
  # print response.read()
  soup_packetpage = BeautifulSoup(response, 'lxml')
  items = soup_packetpage.find_all("div", class_="content")

  for item in items:
    try:
      content = item.span.string  # the joke text sits inside the <span> under div.content
    except AttributeError as e:
      print e
      exit()

    if content:
      print content + "\n"
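
As an aside (not in the original post), bs4 also offers CSS selectors through select(), which can collapse the find_all call and the per-item span lookup into one expression, assuming the same div.content > span structure:

# Same extraction with a CSS selector instead of find_all plus .span
for span in soup_packetpage.select('div.content > span'):
  text = span.get_text()
  if text:
    print text + "\n"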

Here is code that uses BeautifulSoup to grab book titles and their prices.
It makes a useful comparison of how bs4 reads tags versus tag contents.
(Since I haven't learned this part myself yet, I can only follow along for now.)

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:  2016-12-22 20:37:38
# @Last Modified by:  HaonanWu
# @Last Modified time: 2016-12-22 21:27:30
import urllib2
import urllib
import re 

from bs4 import BeautifulSoup 


url = "/all"
try:
  html = urllib2.urlopen(url)
except urllib2.URLError as e:
  print e
  exit()

soup_packtpage = BeautifulSoup(html, 'lxml') 
all_book_title = soup_packtpage.find_all("div", class_="book-block-title") 

price_regexp = re.compile(u"\s+\$\s\d+\.\d+")

for book_title in all_book_title: 
  try:
    print "Book's name is " + book_title.()
  except AttributeError as e:
    print e
    exit()
  book_price = book_title.find_next(text=price_regexp) 
  try:
    print "Book's price is "+ book_price.strip()
  except AttributeError as e:
    print e
    exit()
  print ""

That's all for this post.