SoFunction
Updated on 2024-11-19

An introductory Python web-crawler example: scraping second-hand housing data

Focus of this article

  • Systematically analysing the structure of a web page
  • Structured data parsing
  • csv data saving

Environment

  • python 3.8
  • pycharm Professional Edition >>> Activation Code

# Module usage

  • requests >>> pip install requests
  • parsel >>> pip install parsel
  • csv

[Paid VIP Full Version] Just watch and learn the tutorials, 80 episodes of Python basic introductory video teaching

Click here to watch online for free

Crawler code implementation steps: send request >>> get data >>> parse data >>> save data

import module

import requests # Data request module Third-party module pip install requests
import parsel # Data parsing module
import re
import csv

Send a request to the listing page

# NOTE(review): the scheme and host of this URL appear to have been lost from the
# listing; requests needs an absolute URL (e.g. 'https://<host>/ershoufang/pg1/').
url = '/ershoufang/pg1/'
# Carry a request header so the Python script is disguised as a browser.
# User-Agent holds the basic identification of the browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# Fetch the listing page.
response = requests.get(url=url, headers=headers)

Getting data

# Show the HTML text returned by the server for the listing page.
print(response.text)

parsing data

# Convert the listing page's HTML text into a selector object.
selector_1 = parsel.Selector(response.text)
# Collect the detail-page link of every listing on the page.
href = selector_1.css(' li  a::attr(href)').getall()
# NOTE: csv_writer (created in the "Save data" step) must already exist when this
# loop runs, because every parsed listing is written to it.
for link in href:
    # Request each detail page and parse it with CSS selectors.
    html_data = requests.get(url=link, headers=headers).text
    selector = parsel.Selector(html_data)
    title = selector.css('.title h1::text').get()  # Title
    area = selector.css('.areaName .info a:nth-child(1)::text').get()  # Region
    community_name = selector.css('.communityName .info::text').get()  # Neighborhood
    room = selector.css('.room .mainInfo::text').get()  # Household type
    room_type = selector.css('.type .mainInfo::text').get()  # Orientation
    # The floor text has the form '<position> / <total floors>'; split('/')[-1]
    # keeps the last piece, i.e. the total-floors part.
    height = selector.css('.room .subInfo::text').get().split('/')[-1]  # Floor
    # Pull the number of floors out of that piece with a regular expression.
    # NOTE(review): this pattern looks machine-translated; it must match the
    # text actually served by the site (a single captured group of digits) or
    # the [0] lookup below will not yield the floor count.
    height = re.findall(r'common(\d+)floor (of a building)', height)[0]
    sub_info = selector.css('.type .subInfo::text').get().split('/')[-1]  # Renovation
    elevator = selector.css('.content li:nth-child(12)::text').get()  # Elevator
    house_area = selector.css('.content li:nth-child(3)::text').get().replace('㎡', '')  # Area
    price = selector.css('.price .total::text').get()  # Price
    date = selector.css('.area .subInfo::text').get().replace('Year built', '')  # Year
    # One CSV row; the keys must match the DictWriter fieldnames exactly.
    dit = {
        'Title': title,
        'Downtown': area,
        'Neighborhoods': community_name,
        'Household type': room,
        'Orientation': room_type,
        'Floor': height,
        'Renovation status': sub_info,
        'Elevator': elevator,
        'area (of a floor, piece of land etc)(㎡)': house_area,
        'prices(ten thousand dollars)': price,
        'particular year': date,
    }
    csv_writer.writerow(dit)
    print(title, area, community_name, room, room_type, height, sub_info, elevator, house_area, price, date,
          sep='|')

Save data

# Open the output file in append mode. newline='' is what the csv module expects
# so that no extra blank lines are written between rows.
# The handle stays open because rows are written by the crawl loop; call
# f.close() once crawling has finished.
f = open('Second-hand housing data.csv', mode='a', encoding='utf-8', newline='')
# Dictionary-based writer: each row is a dict keyed by these column names.
csv_writer = csv.DictWriter(f, fieldnames=[
    'Title',
    'Downtown',
    'Neighborhoods',
    'Household type',
    'Orientation',
    'Floor',
    'Renovation status',
    'Elevator',
    'area (of a floor, piece of land etc)(㎡)',
    'prices(ten thousand dollars)',
    'particular year',
])
# Write the header row (with mode='a' this happens again on every run).
csv_writer.writeheader()

data visualization

Import the required modules

import pandas as pd
# Chart classes live in the pyecharts.charts module.
from pyecharts.charts import Map
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Grid
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
from pyecharts import options as opts

retrieve data

# Load the scraped data into a DataFrame.
df = pd.read_csv('Chain.csv', encoding = 'utf-8')
# Preview the first rows (displayed automatically in a notebook cell).
df.head()

Number of second-hand houses in each urban district — Beijing map

# NOTE(review): `region` (district names) and `count` (listings per district)
# are not defined in the visible text; presumably they come from grouping df
# by district -- confirm before running.
# Append the 'District' suffix to every region name for the map labels.
new = [x + 'District' for x in region]
m = (
    Map()
    .add('', [list(z) for z in zip(new, count)], 'Beijing')
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Distribution of second-hand housing in Beijing by district'),
        visualmap_opts=opts.VisualMapOpts(max_=3000),
    )
)
m.render_notebook()

Histogram of Number of Used Homes-Average Price by Urban Area

# NOTE(review): `df_price`, `region` and `count` are not defined in the visible
# text; df_price is presumably the average price per district -- confirm.
df_price.values.tolist()
# Average prices rounded to two decimals for display.
price = [round(x, 2) for x in df_price.values.tolist()]
# Bar series: number of listings per district, plus a second y-axis for price.
bar = (
    Bar()
    .add_xaxis(region)
    .add_yaxis('Number', count,
               label_opts=opts.LabelOpts(is_show=True))
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="Price (in millions of dollars)",
            type_="value",
            min_=200,
            max_=900,
            interval=100,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Histogram of the number of second-hand homes - average price by urban area'),
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name='Number',
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
    )
)

# Line series: average price, drawn against the second y-axis (index 1).
line2 = (
    Line()
    .add_xaxis(xaxis_data=region)
    .add_yaxis(
        series_name="Price.",
        yaxis_index=1,
        y_axis=price,
        label_opts=opts.LabelOpts(is_show=True),
        z=10,
    )
)

# Draw the line on top of the bar chart, then place the result in a grid.
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()

# NOTE(review): `top_price` is not defined in the visible text; presumably it is
# a DataFrame of the highest-priced listings -- confirm.
area0 = top_price['Neighborhoods'].values.tolist()
count = top_price['prices(ten thousand dollars)'].values.tolist()

bar = (
    Bar()
    .add_xaxis(area0)
    .add_yaxis('Number', count, category_gap='50%')
    .set_global_opts(
        yaxis_opts=opts.AxisOpts(name='prices(ten thousand dollars)'),
        xaxis_opts=opts.AxisOpts(name='Number'),
    )
)
bar.render_notebook()

scatterplot

# Scatter plot of floor area (x) against price (y); the x-axis is numeric.
s = (
    Scatter()
    .add_xaxis(df['area (of a floor, piece of land etc)(㎡)'].values.tolist())
    .add_yaxis('', df['prices(ten thousand dollars)'].values.tolist())
    .set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
)
s.render_notebook()

Percentage of house orientation

# NOTE(review): `df_direction` is not defined in the visible text; presumably a
# Series of listing counts indexed by orientation -- confirm.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()

c1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Percentage of house orientation', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"),
    )
    # Label format: name, count, and share of the total.
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)'), position="outside")
)
c1.render_notebook()

Furnished condition/with or without elevator rose diagram (combination diagram)

# NOTE(review): `df_fitment` and `df_direction` are not defined in the visible
# text; presumably Series of listing counts per category -- confirm. The pie
# below is titled 'With/without elevator' yet reads df_direction; verify that
# this is the intended data.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()

directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()

# Horizontal bar chart of renovation status.
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='Number'),
        title_opts=opts.TitleOpts(title='Condition of renovation/with or without elevator rose diagram (combination diagram)', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"),
    )
)

# Rose-type pie chart placed in the lower-right part of the canvas.
c2 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='With/without elevator', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)'), position="outside")
)

# Combine both charts into one figure.
bar.overlap(c2)
bar.render_notebook()

Columnar scaling diagram of floor distribution of second-hand houses

# NOTE(review): `df_floor` is not defined in the visible text; presumably a
# Series of listing counts indexed by floor -- confirm.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()
bar = (
    Bar()
    .add_xaxis(floor)
    .add_yaxis('Number', count)
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Columnar scaling diagram of floor distribution of second-hand houses'),
        yaxis_opts=opts.AxisOpts(name='Number'),
        xaxis_opts=opts.AxisOpts(name='Floor'),
        # Slider control for zooming along the x-axis.
        datazoom_opts=opts.DataZoomOpts(type_='slider'),
    )
)
bar.render_notebook()

Vertical bar chart of housing area distribution

# NOTE(review): `df_area` is not defined in the visible text; presumably a
# Series of listing counts indexed by floor-area range -- confirm.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = (
    Bar()
    .add_xaxis(area)
    .add_yaxis('Number', count)
    # Swap the axes so the bars run horizontally.
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Vertical bar chart of house size distribution'),
        yaxis_opts=opts.AxisOpts(name='area (of a floor, piece of land etc)(㎡)'),
        xaxis_opts=opts.AxisOpts(name='Number'),
    )
)
bar.render_notebook()

This concludes the introductory Python crawler example for scraping second-hand housing data. For more material on scraping second-hand housing data with Python, please search my previous posts or browse the related articles below. Thank you for your continued support!