What is pandas?
Is that it?
... Apparently pandas isn't as cute as this guy ...
Let's take a look at how the official pandas website comes to define itself:
pandas is an open source library providing easy-to-use data structures and data analysis tools for the Python programming language.
Clearly, pandas is a very powerful data analysis library for Python!
Let's learn about it!
Series
import numpy as np
import pandas as pd

# pd.Series builds a one-dimensional labelled array, similar to what we
# usually call an array. The fifth entry is an explicit missing value (NaN).
s_data = pd.Series([1, 3, 5, 7, np.nan, 9, 11])
print(s_data)
Data structure DataFrame
import numpy as np
import pandas as pd

# Six consecutive daily timestamps, using 2017-02-20 as the starting point.
dates = pd.date_range('20170220', periods=6)
# DataFrame of random normal values: rows are indexed by the timestamps,
# columns are labelled A, B, C, D.
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# (rows, columns) of the frame.
print(data.shape)
print()
# The underlying values as a NumPy array.
print(data.values)
Some operations(1)
import numpy as np
import pandas as pd

# Design a dictionary: scalars are broadcast to every row, sequences supply
# one value per row.
d_data = {
    'A': 1,
    'B': pd.Timestamp('20170220'),
    'C': list(range(4)),
    'D': np.arange(4),
}
print(d_data)

# Generate a DataFrame from the dictionary.
df_data = pd.DataFrame(d_data)
print(df_data)
# Type of each column in the DataFrame.
print(df_data.dtypes)
# Print column A.
print(df_data.A)
# Print column B.
print(df_data.B)
# Type of column B (a pandas Series).
print(type(df_data.B))
Some operations(2)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# Output the head of the DataFrame; the default is the first 5 rows.
print(data.head())
# Output the first row of the DataFrame.
print(data.head(1))
# Output the tail of the DataFrame; the default is the last 5 rows.
print(data.tail())
# Output the last row of the DataFrame.
print(data.tail(1))
# Output the row index.
print(data.index)
# Output the column index.
print(data.columns)
# Output the DataFrame values.
print(data.values)
# Output summary statistics of the DataFrame.
print(data.describe())
Some operations(3)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
print()
# Transpose.
print(data.T)
# Output dimension information.
print(data.shape)
# Dimension information after transposition.
print(data.T.shape)
# Sort by the column index.
print(data.sort_index(axis=1))
# Sort by the column index, descending.
print(data.sort_index(axis=1, ascending=False))
# Sort by the row index, descending.
print(data.sort_index(axis=0, ascending=False))
# Sort rows by the values in column A, ascending.
print(data.sort_values(by='A'))
Some operations (4)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Output column A (attribute access).
print(data.A)
# Output column A (key access).
print(data['A'])
# Output rows 3 and 4 by position.
print(data[2:4])
# Output rows 3 and 4 by date label (label slices include both ends).
print(data['20170222':'20170223'])
# Output rows 3 and 4 with the label-based indexer.
print(data.loc['20170222':'20170223'])
# Output rows 3 and 4 with the position-based indexer.
print(data.iloc[2:4])
# Output the two columns B and C.
print(data.loc[:, ['B', 'C']])
Some operations (5)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Output the rows whose value in column A is greater than 0.
print(data[data.A > 0])
# Output the values greater than 0; everything else is replaced by NaN.
print(data[data > 0])

# Copy the data so the original frame is left untouched.
data2 = data.copy()
print(data2)
tag = ['a'] * 2 + ['b'] * 2 + ['c'] * 2
# Add a TAG column to data2, filled from tag.
data2['TAG'] = tag
print(data2)
# Print the rows whose TAG is a or c.
print(data2[data2.TAG.isin(['a', 'c'])])
Some operations (6)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(data)
# Set the element in the first row and first column to 100.
data.iat[0, 0] = 100
print(data)
# Replace column A with 0..5 (this also overwrites the 100 set above).
data['A'] = list(range(6))
print(data)
# Set every element of column B to 200.
data['B'] = 200
print(data)
# Set every element of the 3rd and 4th columns to 1000; the slice end may
# run past the last column.
data.iloc[:, 2:5] = 1000
print(data)
Some operations (7)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
# Redefine the index (first four dates only) and add a column E, which is
# filled with NaN because df has no such column.
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print(dfl)
# Set rows 2 and 3 of column E to 2.
dfl.loc[dates[1:3], 'E'] = 2
print(dfl)
# Remove the rows that contain a NaN element.
print(dfl.dropna())
# Replace NaN elements with 5.
print(dfl.fillna(5))
# Determine whether each element is NaN.
print(pd.isnull(dfl))
# Average of each column.
print(dfl.mean())
# Cumulative sum of each column.
print(dfl.cumsum())
Some operations (8)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print(dfl)
# Average across each row.
print(dfl.mean(axis=1))
# Generate a Series and shift it two places; the vacated leading slots
# become NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
# Subtract s from every column of df, aligning on the row index.
print(df.sub(s, axis='index'))
# Cumulative sum of each column.
print(df.apply(np.cumsum))
# Maximum value minus minimum value in each column.
print(df.apply(lambda x: x.max() - x.min()))
Some operations (9)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)


# Define a function.
def _sum(x):
    """Print the type of *x* and return ``x.sum()``."""
    print(type(x))
    return x.sum()


# apply can take a function as its argument.
print(df.apply(_sum))

s = pd.Series(np.random.randint(10, 20, size=15))
print(s)
# Count the number of times each element in the Series occurs.
print(s.value_counts())
# Return the element(s) with the highest number of occurrences.
print(s.mode())
Some operations (10)
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
# Merge function: cut df into three row slices and concatenate them back.
dfl = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])
print(dfl)
# Determine element-wise whether the two DataFrames are equal.
print(df == dfl)
Some operations (11)
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
# Merge the data by key; every left row pairs with every right row that
# shares the key.
merged = pd.merge(left, right, on='key')
print(merged)

s = pd.Series(np.random.randint(1, 5, size=4), index=list('ABCD'))
print(s)
# Add the Series as one extra row. DataFrame.append was removed in
# pandas 2.0, so turn the Series into a one-row frame and concatenate.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
Some operations (12)
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
print()
# Sum grouped by column A. numeric_only=True keeps the string column B out
# of the sum.
print(df.groupby('A').sum(numeric_only=True))
print()
# Sum grouped by column A first, then column B.
print(df.groupby(['A', 'B']).sum())
print()
# Sum grouped by column B first, then column A.
print(df.groupby(['B', 'A']).sum())
Some operations (13)
import pandas as pd
import numpy as np

# zip pairs up the two lists element by element, giving a list of tuples.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
print(tuples)
# Generate a multi-level index from the tuples.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print(index)
print()
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
print()
# Turn the column index into an innermost row-index level.
print(df.stack())
Some operations (14)
import pandas as pd
import numpy as np

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
print()
# Move the column labels into the row index.
stacked = df.stack()
print(stacked)
# Convert the innermost row-index level back to a column index.
print(stacked.unstack())
# Convert twice: the next row-index level becomes a column level as well.
print(stacked.unstack().unstack())
Some operations (15)
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
print(df)
# Summarise the values of D with A and B as the row index and C as the
# column index.
pivot = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(pivot)
# Take the rows where column A equals 'one' and average them per value of
# column C. numeric_only=True is required because A and B hold strings,
# which cannot be averaged.
one_means = df[df.A == 'one'].groupby('C').mean(numeric_only=True)
print(one_means)
18. Time series (1)
import pandas as pd
import numpy as np

# Create a time index of 600 points, one second apart, starting at
# 2017-02-20.
rng = pd.date_range('20170220', periods=600, freq='s')
print(rng)
# A Series of random integers indexed by that time index.
print(pd.Series(np.random.randint(0, 500, len(rng)), index=rng))
19. Time series (2)
import pandas as pd
import numpy as np

rng = pd.date_range('20170220', periods=600, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# Resampling: sum the samples in 2-minute buckets. The how= keyword no
# longer exists, so the aggregation is called on the resampler instead.
two_minute_sums = ts.resample('2min').sum()
print(two_minute_sums)

# Quarterly periods from 2011 Q1 to 2017 Q1.
rng1 = pd.period_range('2011Q1', '2017Q1', freq='Q')
print(rng1)
# Converted to timestamp form.
print(rng1.to_timestamp())

# Time subtraction and addition.
elapsed = pd.Timestamp('20170220') - pd.Timestamp('20170112')
print(elapsed)
later = pd.Timestamp('20170220') + pd.Timedelta(days=12)
print(later)
20. Data categories
import pandas as pd
import numpy as np

df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
print(df)
# Add categorical data, using the values of raw_grade as the categories.
df["grade"] = df["raw_grade"].astype("category")
print(df)
# Print the categories.
print(df["grade"].cat.categories)
# Change the category names. Assigning to .cat.categories is no longer
# supported, so rename_categories is used; the new names replace the
# existing categories in order.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df)
# Sort by the value of grade; categoricals sort by category order, not
# alphabetically.
print(df.sort_values(by='grade', ascending=True))
# Display the number of rows per grade. observed=False keeps every
# category in the result.
print(df.groupby("grade", observed=False).size())
21. Data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1000 daily random values starting at 2017-02-20.
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('20170220', periods=1000))
# Replace the series by its running total.
ts = ts.cumsum()
print(ts)
# Draw the series and display the figure.
ts.plot()
plt.show()
22. Data reading and writing
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
# Save the data; the path is relative to the current working directory.
# NOTE(review): the original file name was lost, 'data.csv' is a stand-in.
df.to_csv('data.csv')
# Read the data back, using the first CSV column as the row index.
loaded = pd.read_csv('data.csv', index_col=0)
print(loaded)
The data is saved to this file:
Open it up:
That is everything I have to share about the basic operations of pandas, the Python data analysis library. I hope it serves as a useful reference, and I appreciate your continued support.