There are three ways to do this: the first is to use the win32com extension library (part of the pywin32 package) to drive IE through Microsoft's COM interface; the second is to use selenium's webdriver; the third is to parse the page with Python's built-in HTMLParser. win32com gives you an object similar to the document object available in JavaScript, but it seems to be read-only (I could not find any documentation on it). selenium supports Chrome, IE, Firefox and other browsers; each browser driver has execute_script and find_element_by_xx methods, which make it easy to run JavaScript (including modifying elements) and to read elements from the HTML. The disadvantage is that, at the time of writing, selenium only supports Python 2.6 and 2.7. HTMLParser requires you to write your own class that inherits from the base class and overrides the methods that handle the parsed elements. Personally, I find selenium the most convenient to use, because it makes manipulating the elements in the HTML easy.
The code is as follows:
win32com:
# Scroll the page down to the end (at most 20,000 pixels).
# Simulate pressing the Right arrow key to step through multiple images.
import os
import sys
import time
import urllib.request

import win32api
import win32com.client
def _file_name_from_url(url):
    """Turn an image URL into a file name that is valid on Windows.

    '/' and ':' are replaced by '_', and everything from the first '!' or
    '?' onwards (thumbnail suffixes, query strings) is dropped so that no
    '?' can end up in the file name.
    """
    name = url.replace('/', '_').replace(':', '_')
    for marker in ('!', '?'):
        name = name.partition(marker)[0]
    return name


def main():
    """Drive Internet Explorer through COM and save the gallery images.

    The start URL is taken from the first command-line argument.  For each
    page the script waits for IE to finish loading, scrolls down in 500 px
    steps (at most 40 steps, i.e. 20,000 px), downloads every image whose
    id is 'b_img', then sends a Right-arrow key press to move to the next
    page.  The loop stops when the key press no longer changes the URL.

    Images are written below E:\\img\\<page title>.
    """
    # NOTE(review): the listing this was recovered from had lost its dotted
    # names; the COM member names used below (Navigate, ReadyState,
    # Document.parentWindow, pageYOffset, LocationURL, ...) are
    # reconstructed -- confirm them against a live IE session.
    url = sys.argv[1]

    # Operate IE
    ie = win32com.client.Dispatch("InternetExplorer.Application")
    ie.Navigate(url)
    ie.Visible = True

    last_url = ''
    dir_name = ''
    while last_url != url:
        print('\nThe URL is:', url, '\n')

        # 4 is READYSTATE_COMPLETE for the browser object; the document has
        # its own, string-valued readyState that must be checked as well.
        while ie.ReadyState != 4:
            time.sleep(1)
        while ie.Document.readyState != "complete":
            time.sleep(1)

        # Slide the scrollbar; stop early once the offset stops changing.
        win = ie.Document.parentWindow
        last_y = -1
        for i in range(40):
            win.scrollTo(0, 500 * i)
            now_y = win.pageYOffset
            if now_y == last_y:
                break
            last_y = now_y
            time.sleep(0.4)
        print('Document load state:', ie.Document.readyState)
        doc = ie.Document

        # The target directory is created once, from the first page's title.
        if dir_name == '':
            root_dir = 'E:\\img'
            dir_name = root_dir + '\\' + doc.title
            dir_name = dir_name.replace('|', '-')
            os.makedirs(dir_name, exist_ok=True)

        all_image = doc.images
        print('total', all_image.length, 'images')
        count = 0
        for img in all_image:
            if img.id != 'b_img':
                continue
            count += 1
            src = img.src
            print(count, src)
            time.sleep(1)
            with urllib.request.urlopen(src) as img_file:
                data = img_file.read()
            print(count, 'donwload complete!', '-' * 10, 'size:',
                  '{:.3}'.format(len(data) / 1024), 'KB')
            # Anything of 7000 bytes or less is treated as an icon/placeholder.
            if len(data) > 7000:
                file_name = _file_name_from_url(src)
                with open(dir_name + '\\' + file_name, 'wb') as write_file:
                    write_file.write(data)
                print(count, file_name, 'complete!')

        # Next: 39 is the virtual-key code of the Right arrow key.
        last_url = url
        win32api.keybd_event(39, 0)
        time.sleep(1)
        url = ie.LocationURL
        print(last_url, url)


if __name__ == '__main__':
    main()
selenium:
# -*- coding: cp936 -*-
import sys
import urllib
import time
import os
from selenium import webdriver
def _file_name_from_url(url):
    """Turn an image URL into a file name that is valid on Windows.

    '/' and ':' are replaced by '_', and everything from the first '!' or
    '?' onwards (thumbnail suffixes, query strings) is dropped so that no
    '?' can end up in the file name.
    """
    name = url.replace('/', '_').replace(':', '_')
    for marker in ('!', '?'):
        name = name.partition(marker)[0]
    return name


def main():
    """Open a page in IE through selenium and save all of its images.

    The URL is taken from the first command-line argument.  The page is
    scrolled to the bottom once, then every <img> with a src is downloaded;
    images larger than 7000 bytes are written below E:\\img\\<page title>.

    The body runs unchanged on Python 2 and Python 3.
    """
    # urlopen lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # Get parameters
    url = sys.argv[1]

    # Operate IE
    driver = webdriver.Ie()
    try:
        driver.get(url)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")

        # Creating a catalog.  driver.title is used because WebElement.text
        # only returns *displayed* text, which is empty for <title>.
        dir_name = driver.title
        print(dir_name)
        root_dir = 'E:\\img'
        dir_name = root_dir + '\\' + dir_name
        dir_name = dir_name.replace('|', '-')
        if not os.path.exists(root_dir):
            os.mkdir(root_dir)
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)

        # 'tag name' is the locator string behind By.TAG_NAME; this generic
        # form is accepted by old and new selenium releases alike.
        images = driver.find_elements('tag name', 'img')
        count = 0
        for image in images:
            count = count + 1
            image_url = image.get_attribute('src')
            if not image_url:
                # <img> without a src: nothing to download.
                continue
            img_file = urlopen(image_url)
            try:
                data = img_file.read()
            finally:
                img_file.close()
            print('%d donwload complete! %s size: %d KB'
                  % (count, '-' * 10, len(data) // 1024))
            # Anything of 7000 bytes or less is treated as an icon/placeholder.
            if len(data) > 7000:
                file_name = _file_name_from_url(image_url)
                write_file = open(dir_name + '\\' + file_name, 'wb')
                try:
                    write_file.write(data)
                finally:
                    write_file.close()
                print('%d %s complete!' % (count, file_name))
    finally:
        # Always end the browser session, even when a download fails.
        driver.quit()


if __name__ == '__main__':
    main()
HTMLParser:
# import modules used here -- sys is a very standard one
import sys
import urllib.request
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """HTML parser that downloads the image behind every <img src=...> tag.

    Images larger than 1000 bytes are written to E:\\img\\ under a file name
    derived from their URL.
    """

    @staticmethod
    def _file_name_from_url(url):
        """Turn an image URL into a file name that is valid on Windows.

        '/' and ':' are replaced by '_', and everything from the first '!'
        or '?' onwards (thumbnail suffixes, query strings) is dropped so
        that no '?' can end up in the file name.
        """
        name = url.replace('/', '_').replace(':', '_')
        for marker in ('!', '?'):
            name = name.partition(marker)[0]
        return name

    def handle_starttag(self, tag, attrs):
        """Download the image referenced by an <img> start tag.

        Args:
            tag: lower-cased tag name supplied by HTMLParser.
            attrs: list of (name, value) pairs; value is None for
                attributes written without a value.
        """
        if tag != 'img':
            return
        for name, value in attrs:
            if name != 'src' or not value:
                continue
            try:
                with urllib.request.urlopen(value) as img_file:
                    data = img_file.read()
            except (ValueError, OSError) as err:
                # ValueError: not an absolute URL (e.g. a relative path);
                # OSError: covers URLError and other network failures.
                # One bad image should not abort parsing of the whole page.
                print('skipped', value, '-', err)
                continue
            # Only files larger than 1000 bytes are saved; smaller ones are
            # assumed to be icons or spacers.
            if len(data) > 1000:
                file_name = self._file_name_from_url(value)
                with open('E:\\img\\' + file_name, 'wb') as write_file:
                    write_file.write(data)
def main():
    """Fetch a page, save its HTML, and download the images it references.

    The URL is taken from the first command-line argument.  The raw page is
    stored under E:\\img\\html\\ and then fed to MyHTMLParser, which
    downloads the images.
    """
    # Imported locally so that this function does not depend on the state
    # of the module-level import lines.
    import os
    import sys
    import urllib.request

    # Get parameters
    url = sys.argv[1]
    print('\nThe URL is:', url, '\n')

    # Read the resource pointed to by the url
    with urllib.request.urlopen(url) as html_file:
        byte_content = html_file.read()

    # Save the html page.
    # NOTE(review): the file name was missing from the recovered listing
    # (the path ended at the directory); 'page.html' is a stand-in.
    html_dir = 'E:\\img\\html'
    os.makedirs(html_dir, exist_ok=True)
    with open(html_dir + '\\page.html', 'wb') as url_file:
        url_file.write(byte_content)

    # Convert from bytes to a string; undecodable bytes are replaced rather
    # than aborting on pages that are not valid UTF-8.
    s = str(byte_content, encoding="utf-8", errors="replace")

    # HTMLParser no longer accepts the 'strict' argument (removed in
    # Python 3.5), so the parser is created without it.
    parser = MyHTMLParser()
    parser.feed(s)
    parser.close()


# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()