SoFunction
Updated on 2024-11-13

Python data analysis library pandas basic operation methods

What is pandas?

Is that it?

... Apparently pandas isn't as cute as this guy ...

Let's take a look at how the official pandas website defines it:

pandas is an open source library providing easy-to-use data structures and data analysis tools for the Python programming language.

Obviously, pandas is a very powerful data analysis library for python!

Let's learn about it!

Series (sequences)

import numpy as np 
import pandas as pd 
 
# pd.Series builds a one-dimensional labelled sequence, similar to what we
# usually call an array; np.nan marks the missing fifth value.
s_data = pd.Series([1, 3, 5, 7, np.nan, 9, 11])
print(s_data)

Data structure DataFrame

import numpy as np 
import pandas as pd 
 
# Six consecutive daily timestamps, starting from 2017-02-20
dates = pd.date_range('20170220', periods=6)
# Build a 6x4 DataFrame of random values: rows are indexed by the timestamps,
# columns are labelled A, B, C, D
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# Dimensions as (rows, columns)
print(data.shape)
print()
# The underlying values as a NumPy array
print(data.values)

Some operations(1)

import numpy as np
import pandas as pd
# Design a dictionary mixing a scalar, a timestamp, a range and a NumPy array
d_data = {
    'A': 1,
    'B': pd.Timestamp('20170220'),
    'C': range(4),
    'D': np.arange(4),
}
print(d_data)
# Generate a DataFrame from the dictionary; the scalar entries (A and B)
# are repeated on every row
df_data = pd.DataFrame(d_data)
print(df_data)
# Type of each column in the DataFrame
print(df_data.dtypes)
# Print column A
print(df_data.A)
# Print column B
print(df_data.B)
# Type of column B (a pandas Series)
print(type(df_data.B))

Some operations(2)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# Output the head of the DataFrame; the default is the first 5 rows
print(data.head())
# Output only the first row
print(data.head(1))
# Output the tail of the DataFrame; the default is the last 5 rows
print(data.tail())
# Output only the last row
print(data.tail(1))
# Output the row index
print(data.index)
# Output the column index
print(data.columns)
# Output the data values
print(data.values)
# Output summary statistics (count, mean, std, min, quartiles, max)
print(data.describe())

Some operations(3)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# Transpose
print(data.T)
# Output dimension information
print(data.shape)
# Dimension information after transposition
print(data.T.shape)
# Sort by the column index
print(data.sort_index(axis=1))
# Sort by the column index, descending order
print(data.sort_index(axis=1, ascending=False))
# Sort by the row index, descending order
print(data.sort_index(axis=0, ascending=False))
# Sort the rows by the values in column A, ascending order
print(data.sort_values(by='A'))

Some operations (4)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Output column A (attribute access)
print(data.A)
# Output column A (key access)
print(data['A'])
# Output rows 3 and 4 (positional slice)
print(data[2:4])
# Output rows 3 and 4 (date-label slice; both end points are included)
print(data['20170222':'20170223'])
# Output rows 3 and 4 (label-based .loc)
print(data.loc['20170222':'20170223'])
# Output rows 3 and 4 (position-based .iloc)
print(data.iloc[2:4])
# Output the two columns B and C
print(data.loc[:, ['B', 'C']])

Some operations (5)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Output the rows whose value in column A is greater than 0
print(data[data.A > 0])
# Output the values greater than 0; values less than or equal to 0 are
# replaced with NaN
print(data[data > 0])
# Copy the data
data2 = data.copy()
print(data2)
tag = ['a'] * 2 + ['b'] * 2 + ['c'] * 2
# Add a TAG column to data2 and fill it with tag
data2['TAG'] = tag
print(data2)
# Print the rows whose TAG is a or c
print(data2[data2.TAG.isin(['a', 'c'])])

Some operations (6)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Assign 100 to the element in the first row and first column
data.iat[0, 0] = 100
print(data)
# Assign range(6) to the elements of column A
data.A = range(6)
print(data)
# Assign 200 to every element of column B
data.B = 200
print(data)
# Assign 1000 to the elements of columns 3 and 4 (the slice end is clipped
# to the number of columns)
data.iloc[:, 2:5] = 1000
print(data)

Some operations (7)

import numpy as np 
import pandas as pd 
 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
# Redefine the index (first four dates only) and add a column E, which
# starts out filled with NaN
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print(dfl)
# Assign 2 to rows 2 and 3 of column E
dfl.loc[dates[1:3], 'E'] = 2
print(dfl)
# Remove the rows that contain NaN elements
print(dfl.dropna())
# Replace NaN elements with 5
print(dfl.fillna(5))
# Determine whether each element is NaN
print(pd.isnull(dfl))
# Average of each column (NaN values are skipped)
print(dfl.mean())
# Cumulative sum down each column
print(dfl.cumsum())

Some operations (8)

import numpy as np 
import pandas as pd 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
# First four rows plus a new, all-NaN column E
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print(dfl)
# Average of each row (NaN values are skipped)
print(dfl.mean(axis=1))
# Generate a Series and shift it two places forward; the first two slots
# become NaN and the last two values drop off
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
# Subtract s from every column of df, aligning on the row index
print(df.sub(s, axis='index'))
# Cumulative sum down each column
print(df.cumsum())
# Maximum value minus minimum value in each column
print(df.apply(lambda x: x.max() - x.min()))

Some operations (9)

import numpy as np 
import pandas as pd 
# Sample 6x4 DataFrame of random values indexed by six consecutive days
dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)


# Define a function
def _sum(x):
    """Print the type of *x* and return the sum of its values."""
    print(type(x))
    return x.sum()


# The apply function can take a function as an argument; it is called once
# per column
print(df.apply(_sum))
# 15 random integers in the half-open range [10, 20)
s = pd.Series(np.random.randint(10, 20, size=15))
print(s)
# Count the number of times each element in the Series occurs
print(s.value_counts())
# Return the element(s) with the highest number of occurrences
print(s.mode())

Some operations (10)

import numpy as np 
import pandas as pd 
 
# 10x4 DataFrame of random values with the default integer row index
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
# Merge function: split df into three pieces and concatenate them back
dfl = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])
print(dfl)
# Determine whether the elements in the two DataFrames are equal
print(df == dfl)

Some operations (11)

import numpy as np
import pandas as pd

# 10x4 DataFrame of random values with the default integer row index
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
# Merge data by key (every left row pairs with every matching right row)
print(pd.merge(left, right, on='key'))
# Four random integers in [1, 5), labelled with df's column names
s = pd.Series(np.random.randint(1, 5, size=4), index=list('ABCD'))
print(s)
# Add the Series as a new row. DataFrame.append no longer exists in
# pandas 2.x, so turn the Series into a one-row frame and concatenate.
print(pd.concat([df, s.to_frame().T], ignore_index=True))

Some operations (12)

import numpy as np 
import pandas as pd 
# Two string key columns (A, B) and two columns of random values (C, D)
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar',
          'foo', 'bar', 'foo', 'bar'],
    'B': ['one', 'one', 'two', 'three',
          'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
print()
# Group by column A and sum each group; numeric_only keeps the string
# column B out of the sum
print(df.groupby('A').sum(numeric_only=True))
print()
# Group by column A first, then by column B, and sum each group
print(df.groupby(['A', 'B']).sum())
print()
# Group by column B first, then by column A, and sum each group
print(df.groupby(['B', 'A']).sum())

Some operations (13)

import pandas as pd 
import numpy as np 
# The zip function packs the two label lists into (first, second) tuples
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
print(tuples)
# Generate a multi-level index
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print(index)
print()
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
print()
# Turn the column index into the innermost level of the row index
print(df.stack())

Some operations (14)

import pandas as pd 
import numpy as np 
 
# (first, second) label pairs for a two-level row index
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
print()
# Move the column index into the row index
stacked = df.stack()
print(stacked)
# Convert the innermost row index level back to a column index
print(stacked.unstack())
# Convert twice: the 'second' level becomes a column level as well
print(stacked.unstack().unstack())

Some operations (15)

import pandas as pd 
import numpy as np 
# Three string key columns (A, B, C) and two columns of random values (D, E)
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
print(df)
# Pivot the values of D with A and B as the row index and C as the columns
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
# Select the rows where column A equals 'one', then average the numeric
# columns for each value of column C (numeric_only skips the string columns)
print(df[df.A == 'one'].groupby('C').mean(numeric_only=True))

18. Time series (1)

import pandas as pd 
import numpy as np 
 
# Create a time index of 600 points, one second apart, starting at 2017-02-20
rng = pd.date_range('20170220', periods=600, freq='s')
print(rng)
# A Series of random integers in [0, 500) indexed by the time index
print(pd.Series(np.random.randint(0, 500, len(rng)), index=rng))

19. Time series (2)

import pandas as pd 
import numpy as np 
 
# 600 one-second samples of random integers in [0, 500)
rng = pd.date_range('20170220', periods=600, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# Resampling: sum the samples in 2-minute bins. The aggregation is a method
# call because resample() no longer accepts the how= argument.
print(ts.resample('2min').sum())
# Quarterly periods from 2011Q1 to 2017Q1
rng1 = pd.period_range('2011Q1', '2017Q1', freq='Q')
print(rng1)
# Converted to timestamp form (the start of each quarter)
print(rng1.to_timestamp())
# Time addition and subtraction
print(pd.Timestamp('20170220') - pd.Timestamp('20170112'))
print(pd.Timestamp('20170220') + pd.Timedelta(days=12))

20. Data categories

import pandas as pd 
import numpy as np 
 
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
print(df)
# Add category data, using the values of raw_grade as the categories
df["grade"] = df["raw_grade"].astype("category")
print(df)
# Print the categories
print(df["grade"].cat.categories)
# Change the categories (a, b, e are renamed in order). Assigning to
# .cat.categories is no longer supported, so use rename_categories.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df)
# Sort by the value of grade (category order, not alphabetical order)
print(df.sort_values(by='grade', ascending=True))
# Display the number of rows for each grade
print(df.groupby("grade", observed=False).size())

21. Data visualization

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

# 1000 daily random values starting at 2017-02-20
ts = pd.Series(np.random.randn(1000), index=pd.date_range('20170220', periods=1000))
# Replace the series with its running total
ts = ts.cumsum()
print(ts)
# Draw the series as a line plot and show the figure window
ts.plot()
plt.show()

22. Data reading and writing

import pandas as pd 
import numpy as np 
 
# 10x4 DataFrame of random values with the default integer row index
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
# Save the data to a relative path. NOTE(review): the original file name
# was lost from the article; 'data.csv' is a placeholder.
df.to_csv('data.csv')
# Read the data back, using the first CSV column as the row index
print(pd.read_csv('data.csv', index_col=0))

The data is saved to this file:

Open it up:

That is all I have to share about the basic operation methods of the Python data analysis library pandas. I hope it serves as a useful reference, and I appreciate your continued support.