SoFunction
Updated on 2024-11-15

Three ways to batch download images in python

There are three ways to do this: the first is to use win32com, the Windows COM extension library for Python (pywin32), to drive Internet Explorer; the second is to use selenium's webdriver; the third is to parse the page with Python's built-in HTMLParser. win32com gives access to a document object similar to the one available in JavaScript, but it appears to be read-only (I could not find documentation on this). selenium supports Chrome, IE, Firefox and other browsers; each browser driver has execute_script and find_element_by_xx methods, which make it easy to run JavaScript (including modifying elements) and to read elements from the HTML. Its disadvantage is that, at the time of writing, selenium only supports Python 2.6 and 2.7. HTMLParser requires you to write your own class that inherits from the base class and overrides the methods that handle elements. Personally, I find selenium the most convenient to use, because it makes manipulating the elements in the HTML easy.
The code is as follows:

win32com:

Copy Code The code is as follows.

# Scroll the page to the bottom (at most about 20,000 pixels).
# Simulate pressing the Right Arrow key to step through multiple images.
import os
import sys
import time
import urllib.request

import win32api
import win32com.client

def _safe_dir_name(title):
    """Return *title* with characters Windows forbids in file names replaced by '-'."""
    for forbidden in '\\/:*?"<>|':
        title = title.replace(forbidden, '-')
    return title.strip() or 'untitled'


def _file_name_from_url(url):
    """Turn an image URL into a flat file name.

    '/' and ':' become '_', and everything from the first '!' or '?' onwards
    (thumbnail suffixes, query strings) is dropped.
    """
    name = url.replace('/', '_').replace(':', '_')
    cut_points = [pos for pos in (name.find('!'), name.find('?')) if pos != -1]
    return name[:min(cut_points)] if cut_points else name


def main():
    """Drive Internet Explorer through a picture gallery and save each picture.

    The start URL is taken from the first command-line argument.  For every
    page the script waits for the document to load, scrolls down so lazily
    loaded content appears, downloads the matching images into
    ``E:\\img\\<page title>``, then presses the Right Arrow key to move to the
    next picture.  It stops when the key press no longer changes the URL.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: python <script> <start-url>')
    url = sys.argv[1]

    # Operate IE through its COM automation interface.
    ie = win32com.client.Dispatch("InternetExplorer.Application")
    ie.Navigate(url)
    ie.Visible = True

    last_url = ''
    dir_name = ''
    while last_url != url:
        print('\nThe URL is:', url, '\n')
        # READYSTATE_COMPLETE is 4; then wait for the document itself.
        while ie.ReadyState != 4:
            time.sleep(1)
        while ie.Document.readyState != "complete":
            time.sleep(1)

        # Scroll down in 500 px steps (40 steps at most) and stop early once
        # the offset no longer changes, i.e. the bottom has been reached.
        win = ie.Document.parentWindow
        last_y = -1
        for step in range(40):
            win.scrollTo(0, 500 * step)
            now_y = win.pageYOffset
            if now_y == last_y:
                break
            last_y = now_y
            time.sleep(0.4)
        print('Document load state:', ie.Document.readyState)
        doc = ie.Document

        # The target directory is created once, named after the first page.
        if dir_name == '':
            root_dir = 'E:\\img'
            dir_name = root_dir + '\\' + _safe_dir_name(doc.title)
            os.makedirs(dir_name, exist_ok=True)

        all_image = doc.images
        print('total', all_image.length, 'images')
        count = 0
        for img in all_image:
            # NOTE(review): 'b_img' is presumably the id of the main picture
            # on the target site -- confirm/adjust for the site being scraped.
            if img.id != 'b_img':
                continue
            count += 1
            src = img.src
            print(count, src)
            time.sleep(1)
            try:
                with urllib.request.urlopen(src) as img_file:
                    data = img_file.read()
            except OSError as err:  # URLError/HTTPError are OSError subclasses
                print(count, 'download failed:', err)
                continue
            print(count, 'donwload complete!', '-' * 10, 'size:',
                  '{:.1f}'.format(len(data) / 1024), 'KB')
            # Small payloads are skipped (presumably icons or placeholders).
            if len(data) > 7000:
                file_name = _file_name_from_url(src)
                with open(dir_name + '\\' + file_name, 'wb') as write_file:
                    write_file.write(data)
                print(count, file_name, 'complete!')

        # Next picture: press and release the Right Arrow key (VK_RIGHT = 39).
        last_url = url
        win32api.keybd_event(39, 0)
        win32api.keybd_event(39, 0, 2, 0)  # 2 == KEYEVENTF_KEYUP
        time.sleep(1)
        url = ie.LocationURL
        print(last_url, url)


if __name__ == '__main__':
    main()

selenium:

Copy Code The code is as follows.

# -*- coding: cp936 -*-
import sys
import urllib
import time
import os
from selenium import webdriver

def _safe_dir_name(title):
    """Return *title* with characters Windows forbids in file names replaced by '-'."""
    for forbidden in '\\/:*?"<>|':
        title = title.replace(forbidden, '-')
    return title.strip() or 'untitled'


def _file_name_from_url(url):
    """Turn an image URL into a flat file name.

    '/' and ':' become '_', and everything from the first '!' or '?' onwards
    (thumbnail suffixes, query strings) is dropped.
    """
    name = url.replace('/', '_').replace(':', '_')
    cut_points = [pos for pos in (name.find('!'), name.find('?')) if pos != -1]
    return name[:min(cut_points)] if cut_points else name


def main():
    """Open a page with Selenium and download every ``<img>`` on it.

    The URL is taken from the first command-line argument.  Images larger
    than 7000 bytes are written to ``E:\\img\\<page title>``.  The function is
    written to run unchanged on Python 2 and Python 3.
    """
    # Local imports keep this listing self-contained on both Python lines.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    from selenium.webdriver.common.by import By

    if len(sys.argv) < 2:
        sys.exit('usage: python <script> <url>')
    url = sys.argv[1]

    # Operate IE
    driver = webdriver.Ie()
    try:
        driver.get(url)
        # Jump to the bottom so lazily loaded images are requested.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Create the target directory, named after the page title.
        # driver.title is used because WebElement.text only returns visible
        # text, and <title> is never rendered.
        page_title = driver.title
        print(page_title)
        root_dir = 'E:\\img'
        dir_name = root_dir + '\\' + _safe_dir_name(page_title)
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)

        # find_elements(by, value) exists in old and new Selenium releases,
        # unlike the find_elements_by_* shortcuts removed in Selenium 4.3.
        images = driver.find_elements(By.TAG_NAME, 'img')
        for count, image in enumerate(images, 1):
            image_url = image.get_attribute('src')
            if not image_url:
                continue  # <img> without a usable src
            with closing(urlopen(image_url)) as img_file:
                data = img_file.read()
            print('%d donwload complete! %s size: %d KB'
                  % (count, '-' * 10, len(data) // 1024))
            # Small payloads are skipped (presumably icons or placeholders).
            if len(data) > 7000:
                file_name = _file_name_from_url(image_url)
                with open(dir_name + '\\' + file_name, 'wb') as write_file:
                    write_file.write(data)
                print('%d %s complete!' % (count, file_name))
    finally:
        # Always release the browser and the driver process.
        driver.quit()


if __name__ == '__main__':
    main()

HTMLParser:

Copy Code The code is as follows.

# import modules used here -- sys is a very standard one
import os
import sys
import urllib.request
# Gather our code in a main() function

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """HTML parser that downloads the image behind every ``<img>`` tag.

    For each ``img`` start tag the ``src`` URL is fetched.  Payloads larger
    than ``min_size`` bytes are written to ``save_dir`` under a file name
    derived from the URL; smaller ones are ignored.  A URL that cannot be
    fetched is reported on stderr and skipped, so one bad image does not
    abort the whole parse.
    """

    # Directory the images are written to; override on the class or instance.
    save_dir = 'E:\\img'
    # Payloads of this many bytes or fewer are not saved.
    min_size = 1000

    def __init__(self, *args, **kwargs):
        # ``strict`` was removed from HTMLParser in Python 3.5; drop it so
        # call sites written for older versions keep working.
        kwargs.pop('strict', None)
        super().__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        """Download the ``src`` of every ``img`` tag fed to the parser."""
        if tag != 'img':
            return
        for name, value in attrs:
            # value is None for attributes written without a value.
            if name == 'src' and value:
                self._save_image(value)

    def _save_image(self, src):
        """Fetch *src* and store it in ``save_dir`` if it is large enough."""
        try:
            with urllib.request.urlopen(src) as response:
                data = response.read()
        except (ValueError, OSError) as err:
            # ValueError: not an absolute URL; OSError: URLError/HTTPError etc.
            print('skipping', src, '-', err, file=sys.stderr)
            return
        if len(data) <= self.min_size:
            return
        os.makedirs(self.save_dir, exist_ok=True)
        target = os.path.join(self.save_dir, self._file_name_from_url(src))
        with open(target, 'wb') as write_file:
            write_file.write(data)

    @staticmethod
    def _file_name_from_url(url):
        """Flatten *url* into a file name.

        '/' and ':' become '_', and everything from the first '!' or '?'
        onwards (thumbnail suffixes, query strings) is dropped.
        """
        name = url.replace('/', '_').replace(':', '_')
        cut_points = [pos for pos in (name.find('!'), name.find('?')) if pos != -1]
        return name[:min(cut_points)] if cut_points else name

def main():
    """Fetch a page, save its HTML, and download the images it references.

    The URL is taken from the first command-line argument.  The raw page is
    stored under ``E:\\img\\html`` and the decoded text is fed to
    ``MyHTMLParser``, which saves the images it finds.
    """
    # Local imports keep this function self-contained.
    import os
    from urllib.request import urlopen

    # Get parameters
    if len(sys.argv) < 2:
        sys.exit('usage: python <script> <url>')
    url = sys.argv[1]
    print('\nThe URL is:', url, '\n')

    # Read the resource pointed to by the url
    with urlopen(url) as html_file:
        byte_content = html_file.read()
        # Use the charset declared by the server, falling back to UTF-8.
        charset = html_file.headers.get_content_charset() or 'utf-8'

    # Save the html page
    # NOTE(review): the listing gave only the directory here; 'page.html' is
    # an assumed file name -- adjust as needed.
    html_dir = 'E:\\img\\html'
    os.makedirs(html_dir, exist_ok=True)
    with open(html_dir + '\\page.html', 'wb') as url_file:
        url_file.write(byte_content)

    # Convert from bytes to a string; undecodable bytes must not abort the run.
    try:
        text = byte_content.decode(charset, errors='replace')
    except LookupError:  # server announced an encoding Python does not know
        text = byte_content.decode('utf-8', errors='replace')

    # HTMLParser no longer accepts ``strict`` (removed in Python 3.5).
    parser = MyHTMLParser()
    parser.feed(text)
    parser.close()


# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()