SoFunction
Updated on 2024-11-12

A Python implementation of a crawler that collects high "face value" (beauty score) images from Zhihu

Importing related packages

import time
import pydash
import base64
import requests
from lxml import etree
from aip import AipFace
from pathlib import Path

Baidu Cloud Face Detection Application Information

# Baidu Cloud face-detection credentials: these three values are the only ones you must fill in.
APP_ID = "xxxxxxxx"
API_KEY = "xxxxxxxxxxxxxxxx"
SECRET_KEY = "xxxxxxxxxxxxxxxx"
# Minimum beauty score an image must reach to be kept; lower it if you have plenty of storage space.
BEAUTY_THRESHOLD = 55
# Value sent as the "authorization" request header.
AUTHORIZATION = "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
# If requests are rejected for lack of permission, open Zhihu in a browser without logging in
# and copy a fresh value from the developer tools.
# Changing it is advisable: Zhihu's anti-crawler strategy is unknown, and too many people
# sharing the same value may affect how the program runs.

None of the settings below need to be changed

# Number of discussion items requested per page; setting it too high is not recommended.
LIMIT = 5
# ID of the topic "Beauty", which is the parent topic of "Face Value" (20013528).
SOURCE = "19552207"

Crawler pretends to be a normal browser request

# Browser-like User-Agent so the crawler's requests look like ordinary browser traffic.
USER_AGENT = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3"
# NOTE(review): the scheme/host of the two URLs below and the dotted fields of the
# "include" expression were missing from this copy of the script (every hostname-like
# token had been stripped). They are reconstructed from the surviving fragments --
# confirm against a live request captured in the browser's developer tools.
REFERER = "https://www.zhihu.com/topic/%s/newest" % SOURCE
# Discussion list request URL for a topic; "%s" is filled with the topic ID.
BASE_URL = "https://www.zhihu.com/api/v4/topics/%s/feeds/timeline_activity"
# Request parameters attached to the initial request URL. The percent-encoded
# "include" expression is split at its "%3B" (";") separators for readability.
URL_QUERY = (
  "?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
  "%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
  "%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
  "%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
  "%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
  "%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
  "%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
  "%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.comment_count"
  "&limit=" + str(LIMIT)
)

# Headers sent with every request: browser identity, referring page and the
# authorization value.
HEADERS = {
  "User-Agent": USER_AGENT,
  "Referer": REFERER,
  "authorization": AUTHORIZATION,
}

Fetch the raw content (the image bytes) at a given URL.

def fetch_image(url):
  """Download the resource at ``url`` and return its raw bytes.

  The request is sent with the module-level ``HEADERS``. Any exception
  raised by ``requests.get`` propagates unchanged to the caller.
  """
  response = requests.get(url, headers=HEADERS)
  return response.content

Fetch the JSON response (the topic's discussion list) for a given URL.

def fetch_activities(url):
  """Request ``url`` and return the response body decoded from JSON.

  The request is sent with the module-level ``HEADERS``. Any exception
  raised by the request or by JSON decoding propagates to the caller.
  """
  response = requests.get(url, headers=HEADERS)
  return response.json()

Processing the list of returned topics

def parser_activities(datums, face_detective):
  """Save the qualifying images found in one page of topic activities.

  For every entry in ``datums["data"]`` whose target has content, a question
  and an author, each ``<img src>`` in the content HTML is downloaded and
  passed to ``face_detective``. Images for which it returns a truthy score
  are written to the ``image`` directory next to this file, named
  ``<score>--<author>--<title>--<seq>.jpg``.

  Args:
    datums: Decoded JSON page with ``"data"`` and ``"paging"`` keys.
    face_detective: Callable taking image bytes and returning a score, or a
      falsy value when the image should be skipped.

  Returns:
    The URL of the next page (``datums["paging"]["next"]``), or ``None``
    when ``datums["paging"]["is_end"]`` is true.
  """
  image_dir = Path(__file__).parent.joinpath("image")
  for data in datums["data"]:
    target = data["target"]
    # Only entries carrying content, a question and an author are processed.
    if "content" not in target or "question" not in target or "author" not in target:
      continue
    html = etree.HTML(target["content"])
    title = target["question"]["title"]
    author = target["author"]["name"]
    # Counts saved images within this entry so file names stay distinct.
    seq = 0
    for image in html.xpath("//img/@src"):
      # Skip sources that are not absolute http(s) URLs.
      if not image.startswith("http"):
        continue
      image_data = fetch_image(image)
      score = face_detective(image_data)
      if not score:
        continue
      name = "{}--{}--{}--{}.jpg".format(score, author, title, seq)
      seq += 1
      path = image_dir.joinpath(name)
      try:
        # Create the output directory on demand so the first save cannot fail
        # just because it is missing.
        image_dir.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as f:
          f.write(image_data)
      except (OSError, ValueError) as e:
        # Best effort: a name the filesystem rejects (e.g. a title containing
        # a path separator) skips this image instead of stopping the crawl.
        print("failed to save {}: {}".format(path, e))
        continue
      print(path)
      # Pause between saved images to keep the request rate low.
      time.sleep(2)
  paging = datums["paging"]
  return None if paging["is_end"] else paging["next"]

Initialize Face Detection Tool

def init_detective(app_id, api_key, secret_key):
  """Build a face-scoring function backed by a Baidu ``AipFace`` client.

  Args:
    app_id: Baidu application ID.
    api_key: Baidu API key.
    secret_key: Baidu secret key.

  Returns:
    A function ``detective(image)`` that takes raw image bytes and returns
    the beauty score of the first detected face, or ``None`` when the image
    is rejected.
  """
  client = AipFace(app_id, api_key, secret_key)
  options = {"face_field": "age,gender,beauty,qualities"}

  def detective(image):
    """Return the first face's beauty score, or ``None`` if it is filtered out.

    An image is rejected when the response has no result or no faces, when
    the first face's probability is below 0.6, when its beauty score is
    below ``BEAUTY_THRESHOLD``, or when its gender type is not "female".
    """
    encoded = str(base64.b64encode(image), "utf-8")
    response = client.detect(encoded, "BASE64", options)
    result = response.get("result")
    if not result or result["face_num"] == 0:
      return None
    face_list = result["face_list"]
    # Default to 0 so a missing field rejects the image instead of raising
    # TypeError on a None < number comparison.
    if pydash.get(face_list, "0.face_probability", 0) < 0.6:
      return None
    score = pydash.get(face_list, "0.beauty", 0)
    if score < BEAUTY_THRESHOLD:
      return None
    if pydash.get(face_list, "0.gender.type") != "female":
      return None
    return score

  return detective

Program entry point

def main():
  """Crawl the topic's activity feed page by page until it is exhausted.

  Builds the face detector from the configured credentials, then fetches and
  processes one page at a time, following the next-page URL returned by
  ``parser_activities`` until it returns ``None``.
  """
  face_detective = init_detective(APP_ID, API_KEY, SECRET_KEY)
  url = BASE_URL % SOURCE + URL_QUERY
  while url is not None:
    datums = fetch_activities(url)
    url = parser_activities(datums, face_detective)
    # Pause between page requests to keep the request rate low.
    time.sleep(5)


if __name__ == '__main__':
  main()

This is the whole content of this article, I hope it will help you to learn more.