
Simple Python Crawler for Grabbing Taobao Images


# -*- coding: cp936 -*-
import urllib2
import urllib
mmurl = "http://mm.taobao.com/json/request_top_list.htm?type=0&page="
i = 0  # On the second page one model has no picture, which raises an IO error.
while i < 15:
    url = mmurl + str(i)
    # print url  # print the URL of the list page
    up = urllib2.urlopen(url)  # open the list page and keep the handle
    cont = up.read()           # read the whole page into a string
    # print len(cont)  # length of the page
    ahref = '<a href="http'  # keyword that marks the head of an in-page web link
    target = "target"
    pa = cont.find(ahref)       # find the head position of a web link
    pt = cont.find(target, pa)  # find the tail position of that web link
    for a in range(0, 20):  # How can I avoid hardcoding 20 here? How do I find the end of the page?
        urlx = cont[pa + len(ahref) - 4:pt - 2]  # slice the web link out of the page, head to tail
        if len(urlx) < 60:  # keep only links of a plausible length [note: test len(urlx), not urlx]
            urla = urlx     # this is the model's personal URL we want
            print urla
            ######### from here on we work on the model's personal URL #########
            mup = urllib2.urlopen(urla)  # open the model's personal page and keep the handle
            mcont = mup.read()           # read the model page into the mcont string
            imgh = "<img style="  # keyword that marks the head of an in-page [image] link
            imgt = ".jpg"
            iph = mcont.find(imgh)       # find the head position of the [image] link
            ipt = mcont.find(imgt, iph)  # find the tail position of the [image] link
            for b in range(0, 10):  # hardcoded again...
                mpic = mcont[iph:ipt + len(imgt)]  # raw image link; still contains a lot of noise
                iph1 = mpic.find("http")  # filter the link once more
                ipt1 = mpic.find(imgt)    # ditto
                picx = mpic[iph1:ipt1 + len(imgt)]
                if len(picx) < 150:  # some URLs still look like "http: ><dfsdf>.jpg" (a limit of 100 cuts off valid ones)
                    pica = picx  # the condition must test len(picx), not picx, or nothing is printed
                    print pica
                    ############################
                    ########### start downloading the image pica into the pic\ directory
                    urllib.urlretrieve(pica, "pic\\tb" + str(i) + "x" + str(a) + "x" + str(b) + ".jpg")
                    ########### pica downloaded (the loop counters keep the file names unique)
                    ############################
                iph = mcont.find(imgh, iph + len(imgh))  # move on to the next image link
                ipt = mcont.find(imgt, iph)
            ########## all [image] links on this model's page have been extracted ##########
        pa = cont.find(ahref, pa + len(ahref))  # use the previous head as the start and look for the next head
        pt = cont.find(target, pa)              # keep looking for the next tail
    i += 1
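
On the question asked in the inline comments (how to avoid hardcoding the loop counts 20 and 10): one option is to keep calling find() and stop as soon as it returns -1, i.e. when there is no further match. The helper below is a minimal sketch of that idea applied to the first extraction loop; the function name find_model_urls is illustrative and not part of the original script.

def find_model_urls(cont):
    # Same extraction as the "for a in range(0, 20)" loop above, but the loop
    # ends when find() returns -1 instead of running a fixed number of times.
    ahref = '<a href="http'
    target = "target"
    urls = []
    pa = cont.find(ahref)
    while pa != -1:
        pt = cont.find(target, pa)
        if pt == -1:
            break
        urlx = cont[pa + len(ahref) - 4:pt - 2]
        if len(urlx) < 60:  # same length filter as in the script
            urls.append(urlx)
        pa = cont.find(ahref, pa + len(ahref))
    return urls

The same pattern removes the hardcoded 10 in the image loop. As for the IO error mentioned in the first comment, wrapping the urllib2.urlopen() calls in try/except IOError lets the crawler skip a broken page instead of stopping.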