
PHP and Python thread pool multi-threaded crawler examples

This article demonstrates how to implement a multi-threaded crawler with a thread pool in both PHP and Python, shared for your reference as follows:

A multi-threaded crawler fetches pages concurrently, which can significantly improve crawling performance. Below we look at thread pool multi-threaded crawler examples in PHP and Python; the code is as follows:

PHP example

<?php
class Connect extends Worker // Worker mode: each pool thread keeps one persistent cURL handle
{
    public function __construct()
    {
    }
    public function getConnection()
    {
        if (!self::$ch)
        {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }
    public function closeConnection()
    {
        curl_close(self::$ch);
    }
    /**
     * Note that the handle is stored statically, which for pthreads means thread-local.
     */
    protected static $ch;
}
class Query extends Threaded
{
    public function __construct($url)
    {
        $this->url = $url;
    }
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }
    function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);
        $id = $parts[1];
        if ($info['http_code'] != 200)
        {
            $this->show_msg($id, $error);
        } else
        {
            $this->show_msg($id, "OK");
        }
    }
    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }
    public function getResult()
    {
        return $this->result;
    }
    protected $url;
    protected $result;
}
function check_urls_multi_pthreads()
{
    global $check_urls; // Define the URLs to crawl
    $check_urls = array('' => "xx.com",);
    $pool = new Pool(10, "Connect", array()); // Create a pool of 10 threads
    foreach ($check_urls as $url => $name)
    {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}
check_urls_multi_pthreads();
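A note on running the PHP example: the Pool, Worker, and Threaded classes come from the pthreads extension, which requires a thread-safe (ZTS) build of PHP and, in its later versions, runs only under the CLI; pthreads has since been superseded by the parallel extension.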
Python multi-threaded example:

from threading import Thread

def handle(sid):
  # This method performs the crawler's data processing.
  pass

class MyThread(Thread):
  """Worker thread that processes one crawl task."""
  def __init__(self, sid):
    Thread.__init__(self)
    self.sid = sid
  def run(self):
    handle(self.sid)

threads = []
for i in range(1, 11):
  t = MyThread(i)
  threads.append(t)
  t.start()
for t in threads:
  t.join()
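The loop above starts one thread per task. For comparison (this sketch is not from the original article), the same fan-out can be written with the standard library's concurrent.futures thread pool, which manages the worker threads for you:

from concurrent.futures import ThreadPoolExecutor

def handle(sid):
  # Placeholder for the crawler's data processing, as above.
  return sid

# Four workers process the ten task ids concurrently;
# executor.map blocks until every task has finished.
with ThreadPoolExecutor(max_workers=4) as executor:
  results = list(executor.map(handle, range(1, 11)))
print(results)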

Python thread pool crawler:

from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

seen_urls = set(['/'])
lock = Lock()

class Fetcher(Thread):
  def __init__(self, tasks):
    Thread.__init__(self)
    self.tasks = tasks
    self.daemon = True
    self.start()

  def run(self):
    while True:
      url = self.tasks.get()
      print(url)
      # Fetch the page over a raw socket from the local test server.
      sock = socket.socket()
      sock.connect(('localhost', 3000))
      get = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
      sock.send(get.encode('ascii'))
      response = b''
      chunk = sock.recv(4096)
      while chunk:
        response += chunk
        chunk = sock.recv(4096)
      links = self.parse_links(url, response)
      # Enqueue newly discovered links under the lock.
      lock.acquire()
      for link in links.difference(seen_urls):
        self.tasks.put(link)
      seen_urls.update(links)
      lock.release()
      self.tasks.task_done()

  def parse_links(self, fetched_url, response):
    if not response:
      print('error: {}'.format(fetched_url))
      return set()
    if not self._is_html(response):
      return set()
    urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
               self.body(response)))
    links = set()
    for url in urls:
      normalized = urllib.parse.urljoin(fetched_url, url)
      parts = urllib.parse.urlparse(normalized)
      if parts.scheme not in ('', 'http', 'https'):
        continue
      host, port = parts.hostname, parts.port
      if host and host.lower() not in ('localhost',):
        continue
      defragmented, frag = urllib.parse.urldefrag(normalized)
      links.add(defragmented)
    return links

  def body(self, response):
    body = response.split(b'\r\n\r\n', 1)[1]
    return body.decode('utf-8')

  def _is_html(self, response):
    head, body = response.split(b'\r\n\r\n', 1)
    headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
    return headers.get('Content-Type', '').startswith('text/html')

class ThreadPool:
  def __init__(self, num_threads):
    self.tasks = Queue()
    for _ in range(num_threads):
      Fetcher(self.tasks)

  def add_task(self, url):
    self.tasks.put(url)

  def wait_completion(self):
    self.tasks.join()

if __name__ == '__main__':
  start = time.time()
  pool = ThreadPool(4)
  pool.add_task("/")
  pool.wait_completion()
  print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls), time.time() - start))
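Note that this crawler assumes an HTTP server listening on localhost:3000 and only follows links back to that host. One simple way to provide such a server for testing (an assumption, not part of the original article) is to run Python's built-in server in a directory of HTML files: python -m http.server 3000. Because the Fetcher threads are daemon threads, the program exits as soon as wait_completion() returns.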


I hope this article helps you with your PHP and Python programming.