SoFunction
Updated on 2024-11-21

Python crawler in action: batch downloading video data from the Kuaishou platform

Knowledge points:

  • requests
  • json
  • re
  • pprint

Development Environment:

  • Version: Anaconda 5.2.0 (Python 3.6.5)
  • Editor: PyCharm

Implementation steps:

I. Analyze the data source

(Only once you have found the source of the data can you implement the crawl in code.)

1. Determine the requirement (what content is to be crawled?)

  • Crawl the videos matching a keyword and save them as MP4 files

2. Capture packets with the browser developer tools to analyze where the data comes from (find the real data source):

  • Statically loaded pages: the data is already in the HTML source
  • Dynamically loaded pages: capture the underlying request in the developer tools' Network panel
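The distinction can be checked programmatically: if the content you are after already appears in the raw HTML, the page is static; if not, it is loaded later by JavaScript and you have to find the underlying request in the Network panel. A minimal sketch (the HTML snippets are made up for illustration):

```python
def is_statically_loaded(page_html, target_text):
    """Return True if target_text already appears in the raw HTML source."""
    return target_text in page_html

# A page that ships the video URL in its HTML (static loading):
static_html = "<html><body><video src='demo.mp4'></video></body></html>"
# A page that renders an empty shell and fetches data via JavaScript (dynamic loading):
dynamic_html = "<html><body><div id='app'></div></body></html>"

print(is_statically_loaded(static_html, 'demo.mp4'))   # True
print(is_statically_loaded(dynamic_html, 'demo.mp4'))  # False
```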


II. Code implementation process

  • Find the target site
  • Send a request (GET / POST)
  • Parse the data (extract the video URL and video title)
  • Send a request to each video URL
  • Save the video


III. Downloading a single video

Import the required modules

import json
import requests
import re

Send Request

data = {
    'operationName': "visionSearchPhoto",
    'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
    'variables': {
        'keyword': 'Zhang San',
        'pcursor': ' ',
        'page': "search",
        'searchSessionId': "MTRfMjcwOTMyMTQ2XzE2Mjk5ODcyODQ2NTJf5oWi5pGHXzQzMQ"
    }
}

response = requests.post('/graphql', data=data)

Add request header

headers = {
    # Content-Type has four common formats (matching the request body):
    # application/x-www-form-urlencoded: the default form encoding
    # application/json: transferring JSON data
    # text/xml: transferring XML as a document
    # multipart/form-data: used for file uploads
    'content-type': 'application/json',
    # User identification
    'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_721a784b472981d650bcb8bbc5e9c9c2',
    # Browser information (sending requests disguised as a browser)
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

JSON serialization

# JSON is a data-interchange format; before JSON appeared, XML was the usual way to pass data.
# Since every language supports JSON and JSON supports a variety of data types,
# it is widely used in everyday HTTP interactions, data storage, and so on.
# Encode the Python object into a JSON string
data = json.dumps(data)
json_data = requests.post('/graphql', headers=headers, data=data).json()
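The serialization step can be seen in isolation: `json.dumps` turns the Python dict into the JSON string the server expects as the request body, and `json.loads` reverses it. A small round-trip sketch:

```python
import json

payload = {'operationName': 'visionSearchPhoto',
           'variables': {'keyword': 'Zhang San', 'page': 'search'}}

encoded = json.dumps(payload)   # a str, suitable as the POST body
decoded = json.loads(encoded)   # back to a Python dict

print(type(encoded).__name__)   # str
print(decoded == payload)       # True
```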

Extracting values from the dictionary

feeds = json_data['data']['visionSearchPhoto']['feeds']
for feed in feeds:
    caption = feed['photo']['caption']
    photoUrl = feed['photo']['photoUrl']
    new_title = re.sub(r'[/\\:*?<>|\n]', '-', caption)
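The `re.sub` call replaces the characters Windows forbids in filenames, so the caption can safely be used as a file name. For example (the caption is made up):

```python
import re

caption = 'funny: cat/dog*video?'
# Replace characters Windows forbids in filenames with '-'
new_title = re.sub(r'[/\\:*?<>|\n]', '-', caption)
print(new_title)  # funny- cat-dog-video-
```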

Send the request again

resp = requests.get(photoUrl).content

Save data

with open('video\\' + new_title + '.mp4', mode='wb') as f:
    f.write(resp)
print(new_title, 'Crawled successfully!!!')

IV. Page Crawl

Import modules

import concurrent.futures
import time

Send Request

def get_json(url, data):
    response = requests.post(url, headers=headers, data=data).json()
    return response

Modify the title

def change_title(title):
    # Windows filenames can't contain special characters such as \ / : * ? " < > |
    # and a Windows file path can't exceed 256 characters
    new_title = re.sub(r'[/\\|:?<>"*\n]', '_', title)
    if len(new_title) > 50:
        new_title = new_title[:10]
    return new_title

data extraction

def parse(json_data):
    data_list = json_data['data']['visionSearchPhoto']['feeds']
    info_list = []
    for data in data_list:
        # Extract title
        title = data['photo']['caption']
        new_title = change_title(title)
        url_1 = data['photo']['photoUrl']
        info_list.append([new_title, url_1])
    return info_list
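Because `parse` only walks a dictionary, it can be exercised offline with a mocked response shaped like the GraphQL payload (the URL below is a placeholder, not a real video address):

```python
import re

def change_title(title):
    # Replace characters Windows forbids in filenames, truncate overlong names
    new_title = re.sub(r'[/\\|:?<>"*\n]', '_', title)
    if len(new_title) > 50:
        new_title = new_title[:10]
    return new_title

def parse(json_data):
    data_list = json_data['data']['visionSearchPhoto']['feeds']
    info_list = []
    for data in data_list:
        new_title = change_title(data['photo']['caption'])
        info_list.append([new_title, data['photo']['photoUrl']])
    return info_list

# A mocked response with the same nesting as the real API reply:
mock = {'data': {'visionSearchPhoto': {'feeds': [
    {'photo': {'caption': 'cat: video', 'photoUrl': 'http://example.com/a.mp4'}},
]}}}
print(parse(mock))  # [['cat_ video', 'http://example.com/a.mp4']]
```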

Save data

def save(title, url_1):
    resp = requests.get(url_1).content
    with open('video\\' + title + '.mp4', mode='wb') as f:
        f.write(resp)
    print(title, 'Crawled successfully!!!')

Main function: calls all the other functions

def run(url, data):
    """Main function: calls all the other functions."""
    json_data = get_json(url, data)
    info_list = parse(json_data)
    for title, url_1 in info_list:
        save(title, url_1)

if __name__ == '__main__':
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for page in range(1, 5):
            url = '/graphql'
            data = {
                'operationName': "visionSearchPhoto",
                'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
                'variables': {
                    'keyword': 'Cao Fen',
                    # 'keyword': keyword,
                    'pcursor': str(page),
                    'page': "search",
                    'searchSessionId': "MTRfMjcwOTMyMTQ2XzE2Mjk5ODcyODQ2NTJf5oWi5pGHXzQzMQ"
                }
            }
            data = json.dumps(data)
            executor.submit(run, url, data)
    print('In total, it cost:', time.time() - start_time)

Takes 57.7 seconds
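The speedup comes from `ThreadPoolExecutor`: downloads are I/O-bound, so ten of them can wait on the network at once. The same `submit()` pattern with a dummy task (the sleep stands in for network I/O):

```python
import concurrent.futures
import time

def fake_download(page):
    time.sleep(0.1)  # stand-in for waiting on the network
    return 'page ' + str(page) + ' done'

start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fake_download, page) for page in range(1, 5)]
    results = [f.result() for f in futures]

print(results)
# With 10 workers the four 0.1 s tasks overlap, so the total is roughly 0.1 s, not 0.4 s
print('elapsed:', round(time.time() - start, 2))
```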

This concludes this article on batch downloading Kuaishou platform video data with a Python crawler. For more on batch downloading Kuaishou videos with Python, please search my previous posts or continue browsing the related articles below, and I hope you will continue to support me!