#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np

# In[2]:

import builtins

# Shadow the built-in print so every call ends with an extra blank line,
# which keeps the printed outputs visually separated.
def print(*args, **kwargs):
    builtins.print(*args, **kwargs, end='\n\n')

# ## Series
#
# A Series is a one-dimensional array-like object containing a sequence of values
# (of similar types to NumPy types) and an associated array of data labels, called
# its index.

# In[3]:

obj = pd.Series([4, 7, -5, 3])
print(obj)
print(obj.index)
print(obj.values)

# In[4]:

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])
print(obj2)
print(obj2.index)

# In[5]:

obj2['a']

# In[6]:

obj2[['c', 'a', 'd']]

# In[7]:

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
print(obj3)

# In[8]:

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print(obj4)

# In[9]:

print(obj4.isnull())
print(obj4.notnull())

# In[10]:

print(obj3)
print(obj4)
print(obj3 + obj4)

# In[11]:

obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)

# In[12]:

print(obj)
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

# ## DataFrame
#
# A DataFrame represents a rectangular table of data and contains an ordered
# collection of columns, each of which can be a different value type
# (numeric, string, boolean, etc.).

# In[13]:

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
print(frame)

# In[14]:

pd.DataFrame(data, columns=['year', 'state', 'pop'])

# In[15]:

# If you pass a column that isn't contained in the dict,
# it will appear with missing values.
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
print(frame2.columns)

# In[16]:

print(frame['state'])
print(frame.year)

# In[17]:

# Rows can also be retrieved by
# position or name with the special loc attribute.
frame2.loc['three']

# In[18]:

print(frame2)
frame2['debt'] = 16.5
print(frame2)
frame2['debt'] = np.arange(6.)
print(frame2)

# In[19]:

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

# In[20]:

# Assigning a column that doesn't exist will create a new column.
# The del keyword will delete columns as with a dict.
# As an example of del, I first add a new column of boolean
# values where the state column equals 'Ohio':
frame2['eastern'] = frame2['state'] == 'Ohio'
print(frame2)
del frame2['eastern']
frame2.columns

# In[21]:

# Another way is a nested dict of dicts.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
# If the nested dict is passed to the DataFrame,
# pandas will interpret the outer dict keys
# as the columns and the inner keys as the row indices.
frame3 = pd.DataFrame(pop)
print(frame3)

# In[22]:

# Transpose the DataFrame.
frame3.T

# In[23]:

frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)

# ## Using the axis parameter
#
# 1) axis = 0 => row
#    * Move across/down the row axis
#    * Direction of operation is down
#
# 2) axis = 1 => column
#    * Move along the column axis
#    * Direction of operation is sideways
#
# (A small self-contained example follows the drinks examples below.)

# In[24]:

drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

# In[25]:

# Dropping a column.
drinks.drop('continent', axis=1).head()

# In[26]:

# Dropping a row.
drinks.drop(2, axis=0).head()

# In[27]:

# Default axis=0: mean of each column.
# (Newer pandas versions may require drinks.mean(numeric_only=True)
# to skip the non-numeric columns.)
print(drinks.mean().shape)
drinks.mean()

# In[28]:

# Mean of each row.
print(drinks.mean(axis=1).shape)
drinks.mean(axis=1)
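
# If the remote drinks CSV is unreachable, the same axis behaviour can be checked
# on a tiny local frame. This is only an illustrative sketch; the `demo` DataFrame
# below is a made-up example, not part of the original dataset.

# In[ ]:

demo = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]},
                    index=['r1', 'r2', 'r3'])
print(demo.mean(axis=0))         # one value per column: operates down the rows
print(demo.mean(axis=1))         # one value per row: operates across the columns
print(demo.drop('r2', axis=0))   # drop a row label
print(demo.drop('y', axis=1))    # drop a column label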

# ## Essential Functionality

# ### Reindexing
#
# An important method on pandas objects is **reindex**, which means to create a new
# object with the data conformed to a new index.

# In[29]:

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'c', 'd'])
print(obj2)

# For ordered data like time series, it may be desirable to do some interpolation or
# filling of values when reindexing. The **method** option allows us to do this, using
# a method such as **ffill**, which forward-fills the values.

# In[30]:

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')

# In[31]:

frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame.index.name = 'letters'
frame

# In[32]:

frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

# In[33]:

# The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

# In[34]:

# Reindex rows and columns at the same time (use the index keyword explicitly,
# since newer pandas rejects a positional labels argument combined with columns).
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

# In[35]:

frame.reindex(index=['a', 'b', 'c', 'd'], columns=states, fill_value=0)

# ### Dropping Entries from an Axis
#
# The **drop** method will return a new object with the indicated value or values
# deleted from an axis.

# In[36]:

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

# In[37]:

new_obj = obj.drop('c')
new_obj

# In[38]:

obj.drop(['d', 'c'])

# With DataFrame, index values can be deleted from either axis.

# In[39]:

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

# In[40]:

# Calling drop with a sequence of labels will
# drop values from the row labels (axis 0):
data.drop(['Colorado', 'Ohio'])

# In[41]:

# Drop values from the columns: pass axis=1 or axis='columns'.
data.drop(['two', 'four'], axis=1)

# In[42]:

print(obj)
obj.drop('c', inplace=True)
print(obj)

# ### Indexing, Selection, and Filtering

# #### Indexing
#
# Series indexing (obj[...]) works analogously to NumPy array indexing, except you
# can use the Series's index values instead of only integers.

# In[43]:

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

# In[44]:

print(obj['b'])
print(obj[1])
print(obj[2:4])
print(obj[['b', 'a', 'd']])
print(obj[[1, 3]])

# In[45]:

print(obj[obj < 2])

# Slicing with labels behaves differently than normal Python slicing in that the
# **endpoint is inclusive**.

# In[46]:

obj['b':'c']

# Indexing into a DataFrame is for retrieving one or more columns, either with a
# single value or a sequence.

# In[47]:

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

# In[48]:

print(data['two'])
print(data[['three', 'four']])

# The row selection syntax **data[:2]** is provided as a convenience. Passing a
# single element or a list to the [] operator selects columns.

# In[49]:

data[:2]

# In[50]:

data[data['three'] > 5]

# In[51]:

data < 5

# #### Setting
#
# Setting using these methods modifies the corresponding section of the Series.

# In[52]:

obj['b':'c'] = 5
obj

# In[53]:

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

# In[54]:

data[data < 5] = 0
data

# #### Selection with loc and iloc
#
# For DataFrame label-indexing on the rows, I introduce the special indexing
# operators loc and iloc. They enable you to select a subset of the rows and columns
# from a DataFrame with NumPy-like notation using either axis labels (loc) or
# integers (iloc).

# In[55]:

# Select a single row and multiple columns by label: loc[row, cols].
data.loc['Colorado', ['two', 'three']]

# In[56]:

# The same with integers, using iloc.
data.iloc[1, [1, 2]]

# In[57]:

print(data.iloc[2])
print(data.iloc[[1, 2], [3, 0, 1]])

# In[58]:

print(data)
print(data.loc[:'Utah', 'two'])

# In[59]:

data.iloc[:, :3][data.three > 5]

# ### Arithmetic and Data Alignment
#
# An important pandas feature for some applications is the behavior of arithmetic
# between objects with different indexes. When you are adding together objects, if
# any index pairs are not the same, the respective index in the result will be the
# union of the index pairs. For users with database experience, this is similar to
# an automatic outer join on the index labels.

# In[60]:

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

# In[61]:

print(s1)
print(s2)

# In[62]:

s1 + s2

# In the case of DataFrame, alignment is performed on both the rows and the columns.

# In[63]:

df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

# In[64]:

print(df1)
print(df2)

# In[65]:

df1 + df2

# In[66]:

df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
print(df1)
print(df2)

# In[67]:

df1 - df2

# In arithmetic operations between differently indexed objects, you might want to
# fill with a special value, like 0, when an axis label is found in one object but
# not the other.

# In[68]:

df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

# In[69]:

df2.loc[1, 'b'] = np.nan

# In[70]:

print(df1)
print(df2)

# In[71]:

# Adding these together results in NA values in the locations that don't overlap.
df1 + df2

# In[72]:

# Using the add method on df1, pass df2 and an argument to fill_value.
df1.add(df2, fill_value=0)

# In[73]:

# The flexible arithmetic methods have counterparts starting with the letter r
# that have their arguments flipped; these two expressions are equivalent.
print(1 / df1)
print(df1.rdiv(1))

# In[74]:

# Operations between a DataFrame and a Series.
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                     index=['utah', 'ohio', 'texas', 'oregon'])
series = frame.iloc[0]
print(frame)
print(series)

# By default, arithmetic between DataFrame and Series matches the index of the
# Series on the DataFrame's columns, broadcasting down the rows.

# In[75]:

frame - series

# If an index value is not found in either the DataFrame's columns or the Series's
# index, the objects will be reindexed to form the union.

# In[76]:

series2 = pd.Series(range(3), index=list('bef'))

# In[77]:

frame + series2

# In[78]:

# Broadcasting over the columns instead.
series3 = frame['d']
series3

# In[79]:

frame.sub(series3, axis='index')

# The axis number that you pass is the axis to match on. In this case we mean to
# match on the DataFrame's row index (axis='index' or axis=0) and broadcast across.

# ### Function Application and Mapping

# In[80]:

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

# In[81]:

np.abs(frame)

# In[82]:

# Another frequent operation is applying a function on
# one-dimensional arrays to each column or row.
# DataFrame's apply method does exactly this.
f = lambda x: x.max() - x.min()
frame.apply(f)

# Here the function f, which computes the difference between the maximum and minimum
# of a Series, is **invoked once on each column** in frame. The result is a Series
# having the columns of frame as its index.

# In[83]:

# If you pass axis='columns' to apply,
# the function will be invoked once per row.
frame.apply(f, axis='columns')

# In[84]:

# Returning a Series with multiple values.
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

# Element-wise Python functions can be used, too. Suppose you wanted to compute a
# formatted string from each floating-point value in frame.
# You can do this with **applymap**.

# In[85]:

format = lambda x: f'{x:.2f}'

# In[86]:

print(frame)
frame.applymap(format)

# The reason for the name applymap is that Series has a **map** method for applying
# an element-wise function.

# In[87]:

frame['e'].map(format)

# ### Sorting and Ranking

# #### Sorting
#
# Sorting a dataset by some criterion is another important built-in operation. To
# sort lexicographically by row or column index, use the **sort_index** method,
# which returns a new, sorted object.

# In[88]:

obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])
obj

# In[89]:

obj.sort_index()

# In[90]:

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()

# In[91]:

frame.sort_index(axis=1)

# In[92]:

frame.sort_index(axis=1, ascending=False)

# To sort a Series by its values, use its **sort_values** method.
# Any missing values are sorted to the end of the Series by default.

# In[93]:

obj = pd.Series([-4, 7, 3, 2])
obj.sort_values()

# In[94]:

obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

# When sorting a DataFrame, you can use the data in one or more columns as the sort
# keys. To do so, pass one or more column names to the by option of sort_values.

# In[95]:

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

# In[96]:

frame.sort_values(by='b')

# In[97]:

frame.sort_values(by=['a', 'b'])

# #### Ranking
#
# *Ranking* assigns ranks from one through the number of valid data points in an
# array. The rank methods for Series and DataFrame are the place to look; by default
# rank breaks ties by assigning each group the mean rank.

# In[98]:

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

# In[99]:

# Assign ranks according to the order in which the values are first observed
# in the data.
obj.rank(method='first')

# In[100]:

obj.rank(ascending=False, method='max')

# In[101]:

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

# In[102]:

frame.rank(axis='columns')

# ### Axis Indexes with Duplicate Labels

# In[103]:

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

# In[104]:

obj.index.is_unique

# In[105]:

obj.a

# In[106]:

df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

# In[107]:

df.loc['a']

# ## Summarizing and Computing Descriptive Statistics

# In[108]:

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.43]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

# In[109]:

df.sum()

# In[110]:

df.sum(axis='columns')

# In[111]:

df.sum(axis='columns', skipna=False)

# Some methods, like idxmin and idxmax, return indirect statistics like the index
# value where the minimum or maximum values are attained.

# In[112]:

print(df.idxmax())
print(df.idxmin())

# In[113]:

# Accumulation.
df.cumsum()

# Another type of method is neither a reduction nor an accumulation. **describe** is
# one such example, producing multiple summary statistics in one shot.

# In[114]:

df.describe()

# In[115]:

# On non-numeric data, describe produces alternative summary statistics:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

# In[116]:

df

# ### Correlation and Covariance

# In[117]:

import pandas_datareader as web

# In[118]:

all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

# In[119]:

price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

# In[120]:

returns = price.pct_change()

# In[121]:

returns.tail()

# The **corr** method of Series computes the correlation of the overlapping, non-NA,
# aligned-by-index values in two Series. Relatedly, **cov** computes the covariance.

# In[122]:

print(returns['MSFT'].cov(returns['IBM']))
print(returns['MSFT'].corr(returns['IBM']))

# In[123]:

returns.MSFT.corr(returns.IBM)

# In[124]:

returns.corr()

# In[125]:

returns.cov()

# In[126]:

# The corrwith method computes pairwise correlations between a DataFrame's
# columns or rows and another Series or DataFrame.
returns.corrwith(returns.IBM)
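
# The Yahoo download above requires network access and may fail depending on the
# pandas_datareader version. As a fallback, the same corr/cov/corrwith calls can be
# exercised on synthetic returns; the random DataFrame below is an illustrative
# assumption, not real market data.

# In[ ]:

rng = np.random.default_rng(0)
fake_returns = pd.DataFrame(rng.normal(0, 0.01, size=(250, 4)),
                            columns=['AAPL', 'IBM', 'MSFT', 'GOOG'])
print(fake_returns['MSFT'].cov(fake_returns['IBM']))    # covariance of two columns
print(fake_returns['MSFT'].corr(fake_returns['IBM']))   # correlation of two columns
print(fake_returns.corr())                              # full correlation matrix
print(fake_returns.corrwith(fake_returns['IBM']))       # each column vs. IBM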

# ### Unique Values, Value Counts, and Membership

# In[127]:

# unique gives you an array of the unique values in a Series.
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()

# In[128]:

# value_counts computes a Series containing value frequencies.
pd.value_counts(obj.values, sort=True)

# In[129]:

# isin performs a vectorized set membership check and can be useful in filtering a
# dataset down to a subset of values in a Series or column in a DataFrame.
obj

# In[130]:

mask = obj.isin(['b', 'c'])
mask

# In[131]:

obj[mask]

# In[132]:

data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

# In[133]:

print(pd.value_counts(data.Qu1))
print(pd.value_counts(data.Qu2))
print(pd.value_counts(data.Qu3))
print(data.apply(pd.value_counts).fillna(0))

# In the last result, the index holds the distinct values found in the DataFrame,
# and each column's values give how many times that distinct value occurs in that
# column.
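
# Recent pandas releases deprecate the top-level pd.value_counts function; the
# Series method form gives the same results. A minimal sketch, reusing obj and
# data from the cells above:

# In[ ]:

print(obj.value_counts(sort=True))                    # per-value frequencies of obj
print(data.apply(pd.Series.value_counts).fillna(0))   # per-column frequencies of data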