Focus of this article
- Systematic analysis of the web page's structure
- Parsing structured data
- Saving data to CSV
Environment
- Python 3.8
- PyCharm Professional Edition
Modules used
- requests >>> pip install requests
- parsel >>> pip install parsel
- csv (built into Python)
Crawler code implementation steps: send request >>> get data >>> parse data >>> save data
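These four steps map directly onto the code below. As an orientation only, here is a minimal end-to-end sketch; the URL, selector and file name are placeholders, not the ones used in this article:

```python
import csv

import requests
import parsel

# 1. send request (placeholder URL)
response = requests.get('https://example.com/', headers={'User-Agent': 'Mozilla/5.0'})
# 2. get data
html = response.text
# 3. parse data (placeholder selector)
selector = parsel.Selector(html)
links = selector.css('a::attr(href)').getall()
# 4. save data
with open('links.csv', mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([[link] for link in links])
```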
Import modules

```python
import requests  # data-request module (third-party: pip install requests)
import parsel    # data-parsing module (third-party: pip install parsel)
import re        # regular expressions (standard library)
import csv       # CSV saving (standard library)
```
Send a request to the listing page

```python
url = '/ershoufang/pg1/'  # listing-page path (the site domain is omitted here)
# Carry a request header to disguise the Python code as a browser when sending the request.
# User-Agent: basic information about the browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
```
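Before moving on, it can help to sanity-check the response object from the snippet above; this check is not part of the original article:

```python
# not in the original article: quick checks on the response defined above
print(response.status_code)                     # 200 means the listing page was returned
response.encoding = response.apparent_encoding  # guard against garbled Chinese text
```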
Get data

```python
print(response.text)  # the raw HTML returned by the server
```
Parse data

```python
selector_1 = parsel.Selector(response.text)  # convert the retrieved data into a Selector object
href = selector_1.css('li a::attr(href)').getall()  # detail-page links on the listing page
for link in href:
    html_data = requests.get(url=link, headers=headers).text
    selector = parsel.Selector(html_data)
    # css selector syntax
    title = selector.css('.title h1::text').get()                        # title
    area = selector.css('.areaName .info a:nth-child(1)::text').get()    # district
    community_name = selector.css('.communityName .info::text').get()    # neighborhood
    room = selector.css('.room .mainInfo::text').get()                   # layout (household type)
    room_type = selector.css('.type .mainInfo::text').get()              # orientation
    height = selector.css('.room .subInfo::text').get().split('/')[-1]   # floor
    # e.g. '中楼层/共5层' ('middle floor / 5 floors in total'):
    # split('/') -> ['中楼层', '共5层'], and [-1] takes the last list element -> '共5层'
    # re.findall(r'共(\d+)层', '共5层') -> ['5'], then [0] -> '5'
    height = re.findall(r'共(\d+)层', height)[0]                          # total number of floors
    sub_info = selector.css('.type .subInfo::text').get().split('/')[-1]  # renovation status
    Elevator = selector.css('.content li:nth-child(12)::text').get()      # elevator
    # if the elevator field is missing (a 'no data' placeholder or None),
    # it could be normalised to 'no elevator' here
    house_area = selector.css('.content li:nth-child(3)::text').get().replace('㎡', '')  # area
    price = selector.css('.price .total::text').get()                     # price (10k yuan)
    date = selector.css('.area .subInfo::text').get().replace('年建', '')  # year built
    dit = {
        'Title': title,
        'District': area,
        'Neighborhood': community_name,
        'Layout': room,
        'Orientation': room_type,
        'Floor': height,
        'Renovation': sub_info,
        'Elevator': Elevator,
        'Area (㎡)': house_area,
        'Price (10k yuan)': price,
        'Year built': date,
    }
    csv_writer.writerow(dit)
    print(title, area, community_name, room, room_type, height, sub_info, Elevator,
          house_area, price, date, sep='|')
```
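To make the floor-field handling concrete, here is a standalone example; the sample string is an assumption about what the `.room .subInfo` selector returns:

```python
import re

sub_info = '中楼层/共5层'                                  # assumed sample: 'middle floor / 5 floors in total'
floor_text = sub_info.split('/')[-1]                      # -> '共5层' ('5 floors in total')
total_floors = re.findall(r'共(\d+)层', floor_text)[0]     # -> '5'
print(total_floors)
```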
Save data

```python
f = open('Second-hand housing data.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    'Title',
    'District',
    'Neighborhood',
    'Layout',
    'Orientation',
    'Floor',
    'Renovation',
    'Elevator',
    'Area (㎡)',
    'Price (10k yuan)',
    'Year built',
])
csv_writer.writeheader()
```

Note that the file and the DictWriter must be created before the parsing loop above is run, because the loop writes each row with csv_writer.writerow(dit).
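The listing URL ends in pg1, which suggests the page number is part of the path. The article only requests page 1; a multi-page crawl could look like the following sketch, where the page range, the placeholder domain, and the reuse of the parsing loop are all assumptions:

```python
import requests

BASE = 'https://example.com'             # placeholder: the real site domain is omitted in the article
headers = {'User-Agent': 'Mozilla/5.0'}  # in practice, reuse the full User-Agent from above

for page in range(1, 11):                # assumption: crawl listing pages 1-10
    url = f'{BASE}/ershoufang/pg{page}/'
    response = requests.get(url=url, headers=headers)
    # feed response.text into the parsel parsing loop shown earlier,
    # writing each row dict to the CSV via csv_writer.writerow(dit)
```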
Data visualization
Import the required modules
```python
import pandas as pd
from pyecharts.charts import Map
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Grid
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
from pyecharts import options as opts
```
Retrieve data

```python
df = pd.read_csv('Chain.csv', encoding='utf-8')
df.head()  # preview the first few rows
```
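The chart cells below refer to aggregates (region, count, df_price, top_price, df_direction, df_fitment, df_floor, df_area) that the article does not show being built. A plausible derivation, assuming the CSV column names from the crawler section, is:

```python
# assumption: the original article does not show how these aggregates are built
df['Price (10k yuan)'] = pd.to_numeric(df['Price (10k yuan)'], errors='coerce')
df['Area (㎡)'] = pd.to_numeric(df['Area (㎡)'], errors='coerce')

df_region = df['District'].value_counts()                     # listings per district
region = df_region.index.tolist()
count = df_region.values.tolist()

df_price = df.groupby('District')['Price (10k yuan)'].mean().reindex(region)  # average price, same order as region
top_price = df.sort_values('Price (10k yuan)', ascending=False).head(10)      # most expensive listings

df_direction = df['Orientation'].value_counts()               # orientation distribution
df_fitment = df['Renovation'].value_counts()                  # renovation-status distribution
df_floor = df['Floor'].value_counts()                         # floor distribution
df_area = df['Area (㎡)'].value_counts()                       # the article may bin areas (e.g. pd.cut) first
```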
Number of second-hand houses per district: Beijing map
```python
new = [x + '区' for x in region]  # append '区' ('District') so the names match the Beijing map regions
m = (
    Map()
    .add('', [list(z) for z in zip(new, count)], '北京')  # maptype: Beijing
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Distribution of second-hand housing in Beijing by district'),
        visualmap_opts=opts.VisualMapOpts(max_=3000),
    )
)
m.render_notebook()
```
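render_notebook() targets Jupyter. If you run the code outside a notebook, each pyecharts chart can be written to a standalone HTML file instead; the filename here is just an example:

```python
m.render('beijing_map.html')  # writes a standalone HTML file; open it in a browser
```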
Number of second-hand homes and average price by district (bar and line chart)
```python
df_price.head()  # preview the average price per district
price = [round(x, 2) for x in df_price.values.tolist()]
bar = (
    Bar()
    .add_xaxis(region)
    .add_yaxis('Number', count, label_opts=opts.LabelOpts(is_show=True))
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="Average price (10k yuan)",
            type_="value",
            min_=200,
            max_=900,
            interval=100,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Number of second-hand homes and average price by district'),
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name='Number',
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
    )
)
line2 = (
    Line()
    .add_xaxis(xaxis_data=region)
    .add_yaxis(
        series_name="Average price",
        yaxis_index=1,
        y_axis=price,
        label_opts=opts.LabelOpts(is_show=True),
        z=10,
    )
)
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()
```
Bar chart of the most expensive neighborhoods

```python
area0 = top_price['Neighborhood'].values.tolist()
count = top_price['Price (10k yuan)'].values.tolist()
bar = (
    Bar()
    .add_xaxis(area0)
    .add_yaxis('Price (10k yuan)', count, category_gap='50%')
    .set_global_opts(
        yaxis_opts=opts.AxisOpts(name='Price (10k yuan)'),
        xaxis_opts=opts.AxisOpts(name='Neighborhood'),
    )
)
bar.render_notebook()
```
Scatter plot of area vs. price
```python
s = (
    Scatter()
    .add_xaxis(df['Area (㎡)'].values.tolist())
    .add_yaxis('', df['Price (10k yuan)'].values.tolist())
    .set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
)
s.render_notebook()
```
Percentage of house orientation
```python
directions = df_direction.index.tolist()
count = df_direction.values.tolist()
c1 = (
    Pie(init_opts=opts.InitOpts(
        width='800px',
        height='600px',
        )
    )
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        # rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Percentage of house orientation', pos_left='33%', pos_top='5%'),
        legend_opts=opts.LegendOpts(type_='scroll', pos_left='80%', pos_top='25%', orient='vertical'),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)', position='outside'))
)
c1.render_notebook()
```
Renovation status / elevator rose chart (combined bar and pie)
```python
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='Number'),
        title_opts=opts.TitleOpts(title='Renovation status / elevator rose chart (combined chart)',
                                  pos_left='33%', pos_top='5%'),
        legend_opts=opts.LegendOpts(type_='scroll', pos_left='90%', pos_top='58%', orient='vertical'),
    )
)
c2 = (
    Pie(init_opts=opts.InitOpts(
        width='800px',
        height='600px',
        )
    )
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype='radius',
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='With/without elevator', pos_left='33%', pos_top='5%'),
        legend_opts=opts.LegendOpts(type_='scroll', pos_left='90%', pos_top='15%', orient='vertical'),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)', position='outside'))
)
bar.overlap(c2)
bar.render_notebook()
```
Floor distribution of second-hand houses (bar chart with data-zoom slider)
```python
floor = df_floor.index.tolist()
count = df_floor.values.tolist()
bar = (
    Bar()
    .add_xaxis(floor)
    .add_yaxis('Number', count)
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Floor distribution of second-hand houses'),
        yaxis_opts=opts.AxisOpts(name='Number'),
        xaxis_opts=opts.AxisOpts(name='Floor'),
        datazoom_opts=opts.DataZoomOpts(type_='slider'),
    )
)
bar.render_notebook()
```
Horizontal bar chart of housing area distribution
```python
area = df_area.index.tolist()
count = df_area.values.tolist()
bar = (
    Bar()
    .add_xaxis(area)
    .add_yaxis('Number', count)
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Horizontal bar chart of housing area distribution'),
        yaxis_opts=opts.AxisOpts(name='Area (㎡)'),
        xaxis_opts=opts.AxisOpts(name='Number'),
    )
)
bar.render_notebook()
```
This concludes this introductory Python crawler case study on scraping second-hand housing data. For more on scraping housing data with Python, please search my earlier posts or browse the related articles below, and I hope you will continue to support me.