
Python urllib, urllib2, httplib web crawling code examples

urllib2 is really powerful!
I tried logging in through a proxy, grabbing cookies, and following redirects to fetch images......
Documentation: /library/

Straight to the demo code.
It covers: direct fetching, using Request (POST/GET), using proxies, cookie handling, and redirect handling.

#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: wklken
# 2012-03-17 wklken@


import urllib,urllib2,cookielib,socket

url = "....." #change yourself
# The easiest way
def use_urllib2():
 try:
  f = urllib2.urlopen(url, timeout=5).read()
  print len(f)
 except urllib2.URLError, e:
  print e.reason

#Use Request
def get_request():
 # A global default socket timeout can be set
 socket.setdefaulttimeout(5)
 # Parameters can be attached [with no data the request is a GET; posting data, as below, makes it a POST]
 params = {"wd":"a","b":"2"}
 # Can include request header information to identify
 i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
       "Accept": "text/plain"}
 # use POST: params are posted to the server; if the server doesn't accept them, an exception is thrown
 #req = urllib2.Request(url, data=urllib.urlencode(params), headers=i_headers)
 req = urllib2.Request(url, headers=i_headers)

 # Headers can also be added after the Request is created; if a key is duplicated, the later value wins
 #req.add_header('Accept', 'application/json')
 # The HTTP method can be forced explicitly
 #req.get_method = lambda: 'PUT'
 try:
  page = urllib2.urlopen(req)
  print len(page.read())
  # GET equivalent: encode the params into the query string
  #url_params = urllib.urlencode({"a":"1", "b":"2"})
  #final_url = url + "?" + url_params
  #print final_url
  #data = urllib2.urlopen(final_url).read()
  #print "Method:get ", len(data)
 except urllib2.HTTPError, e:
  print "Error Code:", e.code
 except urllib2.URLError, e:
  print "Error Reason:", e.reason

def use_proxy():
 enable_proxy = False
 proxy_handler = urllib2.ProxyHandler({"http": "http://proxy.example.com:8080"}) #change to your proxy
 null_proxy_handler = urllib2.ProxyHandler({})
 if enable_proxy:
  opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
 else:
  opener = urllib2.build_opener(null_proxy_handler, urllib2.HTTPHandler)
 # This sets the global opener for urllib2
 urllib2.install_opener(opener)
 content = urllib2.urlopen(url).read()
 print "proxy len:",len(content)

# A cookie processor that swallows 403/400/500 instead of raising HTTPError,
# so the response body can still be read
class NoExceptionCookieProcesser(urllib2.HTTPCookieProcessor):
 def http_error_403(self, req, fp, code, msg, hdrs):
  return fp
 def http_error_400(self, req, fp, code, msg, hdrs):
  return fp
 def http_error_500(self, req, fp, code, msg, hdrs):
  return fp

def hand_cookie():
 cookie = cookielib.CookieJar()
 #cookie_handler = urllib2.HTTPCookieProcessor(cookie)
 # the same, but with the error-swallowing handler above
 cookie_handler = NoExceptionCookieProcesser(cookie)
 opener = urllib2.build_opener(cookie_handler, urllib2.HTTPHandler)
 url_login = "/?login" #change yourself
 params = {"username":"user","password":"111111"}
 opener.open(url_login, urllib.urlencode(params))
 for item in cookie:
  print item.name, item.value
 #urllib2.install_opener(opener)
 #content = urllib2.urlopen(url).read()
 #print len(content)
# Get the last page URL after N redirects
def get_request_direct():
 import httplib
 # turn on httplib-level debug output to see the raw HTTP exchange
 httplib.HTTPConnection.debuglevel = 1
 request = urllib2.Request(url) #change yourself; use a URL that redirects
 request.add_header("Accept", "text/html,*/*")
 request.add_header("Connection", "Keep-Alive")
 opener = urllib2.build_opener()
 f = opener.open(request)
 print f.geturl() # the final URL after all redirects
 print f.info() # the response headers
 print len(f.read())
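
# The title also mentions httplib; a minimal sketch of using it directly
# (an addition, not part of the original demo; the host is a placeholder):
def use_httplib():
 import httplib
 conn = httplib.HTTPConnection("www.example.com") #change yourself
 conn.request("GET", "/")
 resp = conn.getresponse()
 print resp.status, resp.reason
 print len(resp.read())
 conn.close()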

if __name__ == "__main__":
 use_urllib2()
 get_request()
 get_request_direct()
 use_proxy()
 hand_cookie()
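
These demos are Python 2. In Python 3, urllib and urllib2 were merged into urllib.request / urllib.parse / urllib.error, and cookielib became http.cookiejar. A rough Python 3 equivalent of get_request() above, as a sketch only (not part of the original):

#!/usr/bin/python3
# -*- coding:utf-8 -*-
import urllib.request, urllib.parse, urllib.error

url = "....." #change yourself
try:
 # encoded bytes as data makes this a POST, just like the Python 2 version
 data = urllib.parse.urlencode({"wd": "a", "b": "2"}).encode("utf-8")
 req = urllib.request.Request(url, data=data, headers={"Accept": "text/plain"})
 page = urllib.request.urlopen(req, timeout=5)
 print(len(page.read()))
except urllib.error.HTTPError as e:
 print("Error Code:", e.code)
except urllib.error.URLError as e:
 print("Error Reason:", e.reason)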