#-*- encoding: utf-8 -*-
'''
Created on 2014-4-24
@author: Leon Wong
'''
import urllib2
import urllib
import re
import time
import os
import uuid
# Get secondary page url
def findUrl2(html):
    """Return the distinct secondary-page links found in *html*.

    Every substring of *html* matching the link pattern is collected;
    duplicates are dropped and the remaining links are returned in the
    order in which they first appear.

    :param html: decoded HTML text to scan.
    :return: list of matched link strings; empty when nothing matches.
    """
    # NOTE(review): the "." before "/\d+/" is unescaped and therefore matches
    # any single character -- confirm that is the intended pattern.
    re1 = r'/\d+/\d+/|http://\w+(?<!photos)./\d+/'
    seen = set()
    url2lstfltr = []
    for url in re.findall(re1, html):
        # Keep only the first occurrence so page order is preserved.
        if url not in seen:
            seen.add(url)
            url2lstfltr.append(url)
    return url2lstfltr
# Get html text
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    :param url: absolute URL to request.
    :return: the page content as a unicode string.
    :raises urllib2.URLError: if the request cannot be completed.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read().decode('utf-8')  # decode to utf-8
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
#Download images locally
def download(html_page, pageNo):
    """Download every image referenced in *html_page* to local disk.

    Images are stored under ``D:\\TuChong\\<year>-<month>-<day>\\<pageNo>``
    (the directory is created on demand) and each file gets a fresh UUID
    as its name.

    :param html_page: decoded HTML text of a detail page.
    :param pageNo: listing page number; used as the innermost folder name.
    :return: the result of the last ``urllib.urlretrieve`` call, or ``None``
        when the page contains no matching image link.
    """
    # NOTE(review): the pattern has no scheme/host, so matches may not be
    # absolute URLs that urlretrieve can fetch -- confirm against real pages.
    re2 = r'/.+/f/.+\.jpg'
    imglist = re.findall(re2, html_page)
    print(imglist)
    download_img = None
    if not imglist:
        # Nothing to fetch: do not create an empty folder.
        return download_img

    # Define the name of the folder from today's date (no zero padding).
    today = time.localtime()
    foldername = "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
    picpath = 'D:\\TuChong\\%s\\%s' % (foldername, str(pageNo))
    if not os.path.exists(picpath):
        os.makedirs(picpath)

    for imgurl in imglist:
        filename = str(uuid.uuid1())
        target = picpath + "\\%s.jpg" % filename
        print("The photos location is:" + target)
        # Download the image to the specified path.
        download_img = urllib.urlretrieve(imgurl, target)
        # Brief pause so requests are not fired back-to-back.
        time.sleep(1)
        print(imgurl)
    return download_img
# def callback(blocknum, blocksize, totalsize):
# '''Callback function
# @blocknum: blocks already downloaded
# @blocksize: size of the data block
# @totalsize: size of the remote file
# '''
# print str(blocknum),str(blocksize),str(totalsize)
# if blocknum * blocksize >= totalsize:
# print 'download complete'
def quitit():
    """Print a farewell message and terminate the program with status 0.

    :raises SystemExit: always, with exit code 0.
    """
    print("Bye!")
    # Raise SystemExit directly instead of calling the site-provided exit()
    # helper, which is only guaranteed to exist in interactive sessions.
    raise SystemExit(0)
if __name__ == '__main__':
    # Script entry point: ask for a listing page number, fetch that page,
    # then fetch each linked secondary page and download its images.
    print(''' *****************************************
** Welcome to Spider for TUCHONG **
** Created on 2014-4-24 **
** @author: Leon Wong **
*****************************************''')
    pageNo = raw_input("Input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")
    # Re-prompt until a whole number in the advertised 1-100 range is given;
    # isdigit() is checked first so int() never sees non-numeric input.
    while not (pageNo.isdigit() and 1 <= int(pageNo) <= 100):
        if pageNo == 'quit':
            quitit()
        print("Param is invalid , please try again.")
        pageNo = raw_input("Input the page number you want to scratch >")
    # Crawl the TuChong "portrait" tag listing (the tag is percent-encoded).
    # NOTE(review): this URL and the relative links returned by findUrl2 have
    # no scheme/host, which urlopen cannot fetch -- confirm the site base URL.
    html = getHtml("/tags/%E4%BA%BA%E5%83%8F/?page="+str(pageNo))
    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2, pageNo)
    print("Finished.")