1. Grabbing street photos
Street Photo URL
2. Analyzing the structure of street photography images
keyword: street photography pd: atlas dvpf: pc aid: 4916 page_num: 1 search_json: {"from_search_id":"20220104115420010212192151532E8188","origin_keyword":"Street Photography.","image_keyword":"Street Photography."} rawJSON: 1 search_id: 202201041159040101501341671A4749C4
It's possible to spot a pattern: page_num starts from 1 and increments with each page, while the other parameters remain unchanged.
3. Developing different ways of organizing code by function
3.1 Get web page data in json format
def get_page(page_num):
    """Fetch one page of image-search results and return the decoded JSON.

    Args:
        page_num: page index inserted into the query string (starts at 1).

    Returns:
        The response JSON as a dict on HTTP 200, otherwise None (also on
        connection errors).
    """
    global headers  # kept global so other functions could reuse the same headers
    headers = {
        # NOTE(review): the host value was stripped in the scraped source;
        # 'so.toutiao.com' matches the search endpoint used below — confirm.
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623'
    }
    params = {
        'keyword': 'Street Photography',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # NOTE(review): this value is already percent-encoded, so urlencode()
        # below will encode it a second time — verify against a captured request.
        'search_json': '%7B%22from_search_id%22%3A%22202112272022060101510440283EE83D67%22%2C%22origin_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%2C%22image_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%7D',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D'
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # Parameters are already baked into the URL; do not pass params= again,
        # or requests would append them a second time.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        return None
3.2 Extract street photo images from json format data
def get_images(json):
    """Yield image URLs extracted from a search-result JSON dict.

    Args:
        json: dict returned by get_page(); may be None when the request failed.

    Yields:
        str: each image's 'img_url' value.
    """
    # get_page() returns None on failure — yield nothing instead of crashing.
    if not json:
        return
    images = json.get('rawData').get('data')
    for image in images:
        yield image.get('img_url')
3.3 Name the street photography image after its md5 code and save the image
Implement a method, save_image(), to save an image.
where item is a dictionary returned by the previous get_images() method. In that method, the first thing you do is use the item's
title to create the folder, then request this image link to get the binary data of the image and write it to the file as a binary. The name of the image can use the MD5 value of its content so that duplicates can be removed. Related
The code is as follows:
def save_image(link):
    """Download one image and store it under ./image, named by its MD5 hash.

    Args:
        link: direct URL of the image to download.
    """
    import os
    # The article's text says the folder must exist; create it on demand.
    os.makedirs('./image', exist_ok=True)
    data = requests.get(link).content
    # Using the MD5 of the content as the filename de-duplicates identical images.
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)
3.4 main() calls to other functions
def main(page_num):
    """Fetch page `page_num` and save every image it links to."""
    # NOTE: the local name shadows the stdlib `json` module; kept from the
    # original article since nothing in this function needs that module.
    json = get_page(page_num)
    for link in get_images(json):
        save_image(link)
4 Grab 20pages of today's headline street photo data
The starting and ending page numbers for paging are defined here as GROUP_START and GROUP_END.
It also makes use of a multiprocessing pool and calls its map() method to run the downloads in parallel.
if __name__ == '__main__':
    GROUP_START = 1   # first page to fetch
    GROUP_END = 20    # last page to fetch (inclusive)
    pool = Pool()
    groups = [x for x in range(GROUP_START, GROUP_END + 1)]
    # Fan the page numbers out over the worker pool, then shut it down cleanly.
    pool.map(main, groups)
    pool.close()
    pool.join()
import requests
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing.pool import Pool
import os


def get_page(page_num):
    """Fetch one page of image-search results and return the decoded JSON.

    Args:
        page_num: page index inserted into the query string (starts at 1).

    Returns:
        The response JSON as a dict on HTTP 200, otherwise None (also on
        connection errors).
    """
    global headers  # kept global so other functions could reuse the same headers
    headers = {
        # NOTE(review): the host value was stripped in the scraped source;
        # 'so.toutiao.com' matches the search endpoint used below — confirm.
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623'
    }
    params = {
        'keyword': 'Street Photography',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # NOTE(review): this value is already percent-encoded, so urlencode()
        # below will encode it a second time — verify against a captured request.
        'search_json': '%7B%22from_search_id%22%3A%22202112272022060101510440283EE83D67%22%2C%22origin_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%2C%22image_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%7D',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D'
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # Parameters are already baked into the URL; do not pass params= again.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        return None


def get_images(json):
    """Yield image URLs extracted from a search-result JSON dict.

    Args:
        json: dict returned by get_page(); may be None when the request failed.

    Yields:
        str: each image's 'img_url' value.
    """
    # get_page() returns None on failure — yield nothing instead of crashing.
    if not json:
        return
    images = json.get('rawData').get('data')
    for image in images:
        yield image.get('img_url')


def save_image(link):
    """Download one image and store it under ./image, named by its MD5 hash.

    Args:
        link: direct URL of the image to download.
    """
    # The article's text says the folder must exist; create it on demand.
    os.makedirs('./image', exist_ok=True)
    data = requests.get(link).content
    # Using the MD5 of the content as the filename de-duplicates identical images.
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)


def main(page_num):
    """Fetch page `page_num` and save every image it links to."""
    json = get_page(page_num)
    for link in get_images(json):
        save_image(link)


if __name__ == '__main__':
    GROUP_START = 1   # first page to fetch
    GROUP_END = 20    # last page to fetch (inclusive)
    pool = Pool()
    groups = [x for x in range(GROUP_START, GROUP_END + 1)]
    # Fan the page numbers out over the worker pool, then shut it down cleanly.
    pool.map(main, groups)
    pool.close()
    pool.join()
This concludes this article on the Python Ajax crawler case. For more related Python Ajax crawler content, please search my previous posts or continue browsing the related articles below. I hope you will continue to support me!