Example of crawling Amazon data with Python and exporting it to an Excel file

This article describes a Python implementation that crawls Amazon data and writes it out to an Excel file. It is shared for your reference, as follows:

Python experts, please go easy on me: the code is written very roughly and was mainly meant to get the job done and to give others something to learn from. After all, I write Java rather than Python and only taught myself a bit of it, so I hope you will understand.
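
Before the full listing, here is a minimal sketch of the overall idea rather than the author's code: fetch a results page with requests, pull fields out with a regular expression, and write the rows into an .xls file with xlwt. The URL and the pattern below are placeholders for illustration only.

# Minimal sketch of the approach (placeholder URL and pattern, not the author's code)
import re
import requests
import xlwt

headers = {"User-Agent": "Mozilla/5.0"}
html = requests.get("https://www.example.com/product-list", headers=headers).text  # placeholder URL
titles = re.findall(r'title="(.*?)"', html)  # placeholder pattern

wb = xlwt.Workbook(encoding='utf-8')
sheet = wb.add_sheet('sheet 1')
sheet.write(0, 0, 'Title')  # header row
for row, title in enumerate(titles, start=1):  # one data row per match
    sheet.write(row, 0, title)
wb.save('demo.xls')

The full script follows.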

#!/usr/bin/env python3
# encoding=UTF-8
import sys
import re
import json
import time
import zlib
from html import unescape
import threading
import os
import xlwt
import math
import requests
# Raise the recursion limit; here it is set to a very large value
sys.setrecursionlimit(1000000000)
## Get all categories
def getProUrl():
  urlList = []
  headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
  session = requests.session()
  furl = "/?tag=baidu250-23&hvadid={creative}&ref=pz_ic_22fvxh4dwf_e&page="
  for i in range(0, 1):
    html = session.get(furl + str(i), headers=headers)
    html.encoding = 'utf-8'
    s = html.text.encode('gb2312', 'ignore').decode('gb2312')
    url = r'</li><li  data-asin="(.+?)" class="s-result-item celwidget">'
    reg = re.compile(url, re.S)
    name = '"category" : "' + '(.*?)' + '"'
    reg1 = re.compile(name, re.S)
    # The category names are what this function returns
    urlList = reg1.findall(s)
  return urlList
## Build the search-results link for a category
def getUrlData(ci):
  url = "/s/ref=nb_sb_noss_2?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords=" + ci + "&page=1&sort=review-rank"
  return url
## Timed task: wait 3 seconds before proceeding ##
def fun_timer():
  time.sleep(3)
## Fetch the page content for each category, based on its link
def getProData(allUrlList):
  webContentHtmlList = []
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
  for ci in allUrlList:
    session = requests.session()
    fun_timer()
    html = session.get(getUrlData(ci), headers=headers)
    # Set the encoding
    html.encoding = 'utf-8'
    content = html.text.encode('gb2312', 'ignore').decode('gb2312')
    gxg = r'</li><li  data-asin="(.+?)" class="s-result-item celwidget">'
    reg = re.compile(gxg, re.S)
    items = reg.findall(content)
    print(items)
    webContentHtmlList.append(content)
  return webContentHtmlList
## Filter the desired attributes and values out of the page content
def getProValue():
  list1 = []
  list2 = []
  list3 = []
  list4 = []
  list5 = []
  list6 = []
  list7 = []
  list8 = []
  urlList = getProUrl()
  # Drop the pseudo-categories that are not real product categories
  urlList.remove('All Categories')
  urlList.remove('Prime Membership Priority Purchase')
  # Split the categories into groups of five
  index = 0
  for head in urlList:
    if index < 5:
      list1.append(head)
    elif index < 10:
      list2.append(head)
    elif index < 15:
      list3.append(head)
    elif index < 20:
      list4.append(head)
    elif index < 25:
      list5.append(head)
    elif index < 30:
      list6.append(head)
    elif index < 35:
      list7.append(head)
    elif index < 40:
      list8.append(head)
    index = index + 1
  webContentHtmlList1 = getProData(list1)
  webContentHtmlList2 = getProData(list2)
  webContentHtmlList3 = getProData(list3)
  webContentHtmlList4 = getProData(list4)
  webContentHtmlList5 = getProData(list5)
  webContentHtmlList6 = getProData(list6)
  webContentHtmlList7 = getProData(list7)
  webContentHtmlList8 = getProData(list8)
  ## Collection that stores all of the data
  dataTwoAllList1 = []
  print("Starting to retrieve data, retrieving data ..........")
  ## Walk every page of every category group (the original repeated this block once per group)
  for webContentHtmlList in (webContentHtmlList1, webContentHtmlList2, webContentHtmlList3, webContentHtmlList4,
                             webContentHtmlList5, webContentHtmlList6, webContentHtmlList7, webContentHtmlList8):
    for html in webContentHtmlList:
      for i in range(15):
        dataList = []
        dataList.append(unescape(getProCategory(html, i)))
        dataList.append(unescape(getProTitle(html, i)))
        dataList.append(getProPrice(html, i))
        dataList.append(getSellerCount(html, i))
        dataList.append(getProStar(html, i))
        dataList.append(getProCommentCount(html, i))
        print(dataList)
        dataTwoAllList1.append(dataList)
  print("Retrieval of data completed !!!!")
  print("Starting to save and print the Excel document data !!!!")
  ## Save the document
  createTable(time.strftime("%Y%m%d") + 'Amazon sales statistics.xls', dataTwoAllList1)
## Extract the category
def getProCategory(html, i):
  i = 0  # always take the first category match on the page
  name = '<span class="a-color-state a-text-bold">' + '(.*?)' + '</span>'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return ""
  else:
    if i < len(items):
      return items[i]
    else:
      return ""
## Extract the title
def getProTitle(html, i):
  html = getHtmlById(html, i)
  name = '<a class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal" target="_blank" title="' + '(.*?)' + '"'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return ""
  else:
    return items[0]
## Extract the price
def getProPrice(html, i):
  html = getHtmlById(html, i)
  name = '<span class="a-size-base a-color-price s-price a-text-bold">' + '(.*?)' + '</span>'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return "¥0"
  else:
    return items[0]
## Extract the seller count
def getSellerCount(html, i):
  html = getHtmlById(html, i)
  name = '<span class="a-color-secondary">' + '(.*?)' + '</span>'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return "(0 sellers)"
  else:
    return checkSellerCount(items, 0)
## Check the seller count
def checkSellerCount(items, i):
  # Walk the matches starting at position i and return the first one that mentions
  # 'Seller'; anything longer than 9 characters is treated as noise.
  # (The original did this with two or three levels of nested ifs.)
  for item in items[i:]:
    if item.find('Seller') >= 0:
      if len(item) <= 9:
        return item
      return '(0 sellers)'
  return '(0 sellers)'
## Extract the star rating <span class="a-icon-alt">
def getProStar(html, i):
  html = getHtmlById(html, i)
  name = '<span class="a-icon-alt">' + '(.*?)' + '</span>'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return "Average 0 stars"
  else:
    return checkProStar(items, 0)
## Check the star rating
def checkProStar(items, i):
  # Return the first match that mentions 'Star' (the original only checked the first two matches)
  for item in items[i:]:
    if item.find('Star') >= 0:
      return item
  return 'Average 0 stars'
## Extract the number of product reviews (sales)
##<a class="a-size-small a-link-normal a-text-normal" target="_blank" href="/dp/B073LBRNV2/ref=sr_1_1?ie=UTF8&qid=1521782688&sr=8-1&keywords=%E5%9B%BE%E4%B9%A6#customerReviews" rel="external nofollow" >56</a>
def getProCommentCount(html, i):
  name = '<a class="a-size-small a-link-normal a-text-normal" target="_blank" href=".*?#customerReviews" rel="external nofollow" ' + '(.*?)' + '</a>'
  reg = re.compile(name, re.S)
  items = reg.findall(html)
  if len(items) == 0:
    return "0"
  else:
    if i < len(items):
      return items[i].strip(">")
    else:
      return "0"
## Take the tag with the given id out of the html
def get_id_tag(content, id_name):
  id_name = id_name.strip()
  patt_id_tag = """<[^>]*id=['"]?""" + id_name + """['" ][^>]*>"""
  id_tag = re.findall(patt_id_tag, content, re.S | re.I)
  if id_tag:
    id_tag = id_tag[0]
  else:
    id_tag = ""
  return id_tag
## Narrow the range: locate the block for result i
def getHtmlById(html, i):
  start = get_id_tag(html, "result_" + str(i))
  i = i + 1
  end = get_id_tag(html, "result_" + str(i))
  name = start + '.*?' + end
  reg = re.compile(name, re.S)
  html = html.strip()
  items = reg.findall(html)
  if len(items) == 0:
    return ""
  else:
    return items[0]
## Generate the Excel document
def createTable(tableName, dataTwoAllList):
  flag = 1
  results = []
  results.append("Category, Title, Price, Seller Stats, Star Rating, Number of Reviews")
  columnName = results[0].split(',')
  # Create an excel workbook, encoded as utf-8 so the sheet supports Chinese
  wb = xlwt.Workbook(encoding='utf-8')
  # Create a sheet
  sheet = wb.add_sheet('sheet 1')
  # Number of data rows
  rows = math.ceil(len(dataTwoAllList))
  # Number of columns
  columns = len(columnName)
  # Create a format style
  style = xlwt.XFStyle()
  # Create the font and set it
  font = xlwt.Font()
  # Font name
  font.name = 'Times New Roman'
  # Apply the font to the style
  style.font = font
  # Create the alignment, centered
  alignment = xlwt.Alignment()
  # Horizontally centered
  alignment.horz = xlwt.Alignment.HORZ_CENTER
  # Apply it to the style
  style.alignment = alignment
  # A second, bold style for the header row
  style1 = xlwt.XFStyle()
  font1 = xlwt.Font()
  font1.name = 'Times New Roman'
  # Font color (green)
  # font1.colour_index = 3
  # Bold font
  font1.bold = True
  style1.font = font1
  style1.alignment = alignment
  for i in range(columns):
    # Set the column width
    sheet.col(i).width = 5000
  # Insert the column names
  for i in range(columns):
    sheet.write(0, i, columnName[i], style1)
  # Write the data rows, then save once at the end
  for i in range(1, rows + 1):
    for j in range(0, columns):
      sheet.write(i, j, dataTwoAllList[i - 1][j], style)
  wb.save(tableName)
## The entry point starts here ##
input("Press enter to start exporting ..........")
fun_timer()
print("Grabbing data will start in three seconds ....... Please wait!")
getProValue()
print("Data exported successfully! Please check the output!")
print("The data file 'Amazon sales statistics.xls' has been saved under C:\\Windows\\SysWOW64 on the C drive !!!!")
input()

The resulting data:

Packaged into an exe file, it can be run directly with a double click. I will not walk through the whole packaging process; it is just a few command-line operations:

Install pyinstaller, then type the command to build the exe: --icon specifies the icon, and its path is relative to the current project path.

I ran into quite a few problems along the way and solved them one by one: garbled encodings, IP restrictions, modules that could not be found after packaging, the maximum recursion depth, and some filtering issues.

The packaging command is: pyinstaller -F -c --icon=
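
For reference, the complete command usually takes the following shape; the icon file and script name here are placeholders of mine, not values from the original (-F builds a single exe, -c keeps a console window):

pyinstaller -F -c --icon=favicon.ico crawl_amazon.py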

Screenshot of the result:

More Python-related content is available in this site's topics: "Python Socket Programming Tips Summary", "Python Regular Expression Usage Summary", "Python Data Structures and Algorithms Tutorial", "Summary of Python Function Usage Tips", "Summary of Python String Manipulation Techniques", "Python Introductory and Advanced Classic Tutorials", and "Summary of Python File and Directory Manipulation Techniques".

I hope this article is helpful to you in your Python programming.