#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
# In[2]:
import builtins
# shadow the built-in print so every call ends with a blank line (easier-to-read output)
def print(*args, **kwargs):
    builtins.print(*args, **kwargs, end='\n\n')
# ## Series
#
# A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.
# In[3]:
obj = pd.Series([4, 7, -5, 3])
print(obj)
print(obj.index)
print(obj.values)
# In[4]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])
print(obj2)
print(obj2.index)
# In[5]:
obj2['a']
# In[6]:
obj2[['c', 'a', 'd']]
# In[7]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
print(obj3)
# In[8]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print(obj4)
# In[9]:
print(obj4.isnull())
print(obj4.notnull())
# In[10]:
print(obj3)
print(obj4)
print(obj3 + obj4)
# In[11]:
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)
# In[12]:
print(obj)
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)
# ## DataFrame
#
# A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).
# In[13]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
print(frame)
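# In[ ]:
# (added illustration) Each column can hold a different dtype; dtypes shows the
# per-column types of the frame built above.
print(frame.dtypes)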
# In[14]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])
# In[15]:
# If you pass a column that isn't contained in the dict,
# it will appear with missing values
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
print(frame2.columns)
# In[16]:
print(frame['state'])
print(frame.year)
# In[17]:
# Rows can also be retrieved by name (label) with the
# special loc attribute, or by integer position with iloc
frame2.loc['three']
# In[18]:
print(frame2)
frame2['debt'] = 16.5
print(frame2)
frame2['debt'] = np.arange(6.)
print(frame2)
# In[19]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
# In[20]:
# Assigning a column that doesn’t exist will create a new column.
# The del keyword will delete columns as with a dict.
# As an example of del, I first add a new column of boolean
# values where the state column equals 'Ohio':
frame2['eastern'] = frame2['state'] == 'Ohio'
print(frame2)
del frame2['eastern']
frame2.columns
# In[21]:
# another way is a nested dict of dicts
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
# If the nested dict is passed to the DataFrame,
# pandas will interpret the outer dict keys
# as the columns and the inner keys as the row indices
frame3 = pd.DataFrame(pop)
print(frame3)
# In[22]:
# transpose dataframe
frame3.T
# In[23]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
print(frame3)
# ## Using the axis parameter
#
# 1) axis=0 (or axis='index') => rows
# * Move down the row axis
# * The direction of operation is downward: a reduction such as mean() collapses the rows, giving one value per column
#
# 2) axis=1 (or axis='columns') => columns
# * Move across the column axis
# * The direction of operation is sideways: mean(axis=1) collapses the columns, giving one value per row
#
# A small offline sketch follows this list; the drinks dataset examples come after it.
#
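# In[ ]:
# (added sketch) A small self-contained example of the axis rules, using a hypothetical
# 2x3 DataFrame so it can be run without downloading the drinks dataset used below.
tiny = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0], 'z': [5.0, 6.0]},
                    index=['r1', 'r2'])
print(tiny)
print(tiny.mean())              # axis=0 (default): one value per column
print(tiny.mean(axis=1))        # axis=1: one value per row
print(tiny.drop('y', axis=1))   # drop a column
print(tiny.drop('r1', axis=0))  # drop a row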
# In[24]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
# In[25]:
# dropping a column
drinks.drop('continent', axis=1).head()
# In[26]:
# dropping a row
drinks.drop(2, axis=0).head()
# In[27]:
# default axis=0
# mean of each column
# (numeric_only=True added so the string columns are skipped on newer pandas versions)
print(drinks.mean(numeric_only=True).shape)
drinks.mean(numeric_only=True)
# In[28]:
# mean of each row
# (numeric_only=True added here as well, for the same reason)
print(drinks.mean(axis=1, numeric_only=True).shape)
drinks.mean(axis=1, numeric_only=True)
# ## Essential Functionality
# ### Reindexing
#
# An important method on pandas objects is **reindex**, which means to create a new object with the data conformed to a new index
# In[29]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'c', 'd'])
print(obj2)
# For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The **method** option allows us to do this, using a
# method such as **ffill**, which forward-fills the values
# In[30]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')
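# In[ ]:
# (added illustration) bfill is the complementary option, back-filling from the
# next valid observation instead of carrying the previous one forward
obj3.reindex(range(6), method='bfill')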
# In[31]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame.index.name = 'letters'
frame
# In[32]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
# In[33]:
# The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
# In[34]:
frame.reindex(['a', 'b', 'c', 'd'], columns=states)
# In[35]:
frame.reindex(['a', 'b', 'c', 'd'], columns=states, fill_value=0)
# ### Dropping Entries from an axis
#
# **drop** method will return a new object with the indicated value or values deleted from an axis
# In[36]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
# In[37]:
new_obj = obj.drop('c')
new_obj
# In[38]:
obj.drop(['d', 'c'])
# With DataFrame, index values can be deleted from either axis
# In[39]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
# In[40]:
# Calling drop with a sequence of labels will
# drop values from the row labels (axis 0):
data.drop(['Colorado', 'Ohio'])
# In[41]:
# drop values from columns: pass axis=1 or axis='columns'
data.drop(['two', 'four'], axis=1)
# In[42]:
print(obj)
obj.drop('c', inplace=True)
print(obj)
# ### Indexing, Selection, and Filtering
# #### indexing
#
# Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers.
# In[43]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
# In[44]:
print(obj['b'])
print(obj[1])
print(obj[2:4])
print(obj[['b', 'a', 'd']])
print(obj[[1, 3]])
# In[45]:
print(obj[obj < 2])
# Slicing with labels behaves differently than normal Python slicing in that the **end‐point is inclusive**
# In[46]:
obj['b': 'c']
# Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence
# In[47]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
# In[48]:
print(data['two'])
print(data[['three', 'four']])
# The row selection syntax **data[:2]** is provided as a convenience. Passing a single element or a list to the [ ] operator selects columns.
# In[49]:
data[:2]
# In[50]:
data[data['three'] > 5]
# In[51]:
data < 5
# #### setting
# Setting using these methods modifies the corresponding section of the Series
# In[52]:
obj['b': 'c'] = 5
obj
# In[53]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
# In[54]:
data[data < 5] = 0
data
# #### Selection with loc, iloc
# For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc).
# In[55]:
# select a single row and multiple columns by label
# loc[Row, col]
data.loc['Colorado', ['two', 'three']]
# In[56]:
# with integers using iloc
data.iloc[1, [1, 2]]
# In[57]:
print(data.iloc[2])
print(data.iloc[[1, 2], [3, 0, 1]])
# In[58]:
print(data)
print(data.loc[:'Utah', 'two'])
# In[59]:
data.iloc[:, :3][data.three > 5]
# ### Arithmetic and Data Alignment
# An important pandas feature for some applications is the behavior of arithmetic
# between objects with different indexes. When you are adding together objects, if any index pairs are not the same,
# the respective index in the result will be the union of the index pairs.
# For users with database experience, this is similar to an automatic outer join on the index labels.
# In[60]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
# In[61]:
print(s1)
print(s2)
# In[62]:
s1 + s2
# In the case of DataFrame, alignment is performed on both the rows and the columns
# In[63]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# In[64]:
print(df1)
print(df2)
# In[65]:
df1 + df2
# In[66]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
print(df1)
print(df2)
# In[67]:
df1 - df2
# In arithmetic operations between differently indexed objects, you might want to fill
# with a special value, like 0, when an axis label is found in one object but not the other
# In[68]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
# In[69]:
df2.loc[1, 'b'] = np.nan
# In[70]:
print(df1)
print(df2)
# In[71]:
# Adding these together results in NA values in the locations that don’t overlap
df1 + df2
# In[72]:
# Using the add method on df1, we pass df2 and an argument to fill_value
df1.add(df2, fill_value=0)
# In[73]:
# 1 / df1
# or, equivalently, using the "reversed" arithmetic method (rdiv flips the arguments):
df1.rdiv(1)
# In[74]:
# operations between DataFrame and Series
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                     index=("utah", "ohio", "texas", "oregon"))
series = frame.iloc[0]
print(frame)
print(series)
# By default, arithmetic between DataFrame and Series matches the index of the Series
# on the DataFrame’s columns, broadcasting down the rows
# In[75]:
frame - series
# If an index value is not found in either the DataFrame’s columns or the Series’s index,
# the objects will be reindexed to form the union
# In[76]:
series2 = pd.Series(range(3), index=list('bef'))
# In[77]:
frame + series2
# In[78]:
# broadcasting over columns
series3 = frame['d']
series3
# In[79]:
frame.sub(series3, axis='index')
# The axis number that you pass is the axis to match on. In this case we mean to match
# on the DataFrame’s row index (axis='index' or axis=0) and broadcast across.
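# In[ ]:
# (added sketch, not part of the original text) Without axis='index', the Series's
# index is matched against the DataFrame's columns; here nothing aligns, so the
# result is all-NaN over the union of the labels.
frame.sub(series3)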
# ### Function Application and Mapping
# In[80]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
# In[81]:
np.abs(frame)
# In[82]:
# Another frequent operation is applying a function on
# one-dimensional arrays to each column or row.
# DataFrame’s apply method does exactly this
f = lambda x: x.max() - x.min()
frame.apply(f)
# Here the function f, which computes the difference between the maximum and minimum of a Series, is **invoked once on each column** in frame. The result is a Series having the columns of frame as its index.
# In[83]:
# If you pass axis='columns' to apply,
# the function will be invoked once per row
frame.apply(f, axis='columns')
# In[84]:
# returning a Series with multiple values
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
# Element-wise Python functions can be used, too. Suppose you wanted to compute a
# formatted string from each floating-point value in frame. You can do this with
# **applymap**
# In[85]:
format = lambda x: f'{x:.2f}'
# In[86]:
print(frame)
frame.applymap(format)
# The reason for the name applymap is that Series has a **map** method for applying an
# element-wise function
# In[87]:
frame['e'].map(format)
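# Note (an addition): newer pandas releases rename DataFrame.applymap to DataFrame.map,
# so frame.map(format) is the forward-compatible spelling of the applymap call above.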
# ### Sorting and Ranking
# #### Sorting
#
# Sorting a dataset by some criterion is another important built-in operation. To sort
# lexicographically by row or column index, use the **sort_index** method, which returns
# a new, sorted object
# In[88]:
obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])
obj
# In[89]:
obj.sort_index()
# In[90]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()
# In[91]:
frame.sort_index(axis=1)
# In[92]:
frame.sort_index(axis=1, ascending=False)
# To sort a Series by its values, use its **sort_values** method.
# Any missing values are sorted to the end of the Series by default
# In[93]:
obj = pd.Series([-4, 7, 3, 2])
obj.sort_values()
# In[94]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
# When sorting a DataFrame, you can use the data in one or more columns as the sort
# keys. To do so, pass one or more column names to the by option of sort_values
# In[95]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
# In[96]:
frame.sort_values(by='b')
# In[97]:
frame.sort_values(by=['a', 'b'])
# #### Ranking
#
# *Ranking* assigns ranks from one through the number of valid data points in an array.
# The rank methods for Series and DataFrame are the place to look; by default rank
# breaks ties by assigning each group the mean rank
# In[98]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# In[99]:
# ranks assigned according to the order in which the values appear in the data
obj.rank(method='first')
# In[100]:
obj.rank(ascending=False, method='max')
# In[101]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame
# In[102]:
frame.rank(axis='columns')
# ### Axis Indexes with Duplicate Labels
# In[103]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
# In[104]:
obj.index.is_unique
# In[105]:
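# (added note) 'a' appears more than once in the index, so selecting it returns a
# Series; selecting a unique label such as 'c' would return a scalar instead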
obj.a
# In[106]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
# In[107]:
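# (added note) selecting a duplicated row label returns all matching rows as a DataFrame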
df.loc['a']
# ## Summarizing and Computing Descriptive Statistics
# In[108]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan],
                   [0.75, -1.43]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df
# In[109]:
df.sum()
# In[110]:
df.sum(axis='columns')
# In[111]:
df.sum(axis='columns', skipna=False)
# Some methods, like idxmin and idxmax, return indirect statistics like the index value
# where the minimum or maximum values are attained
# In[112]:
print(df.idxmax())
print(df.idxmin())
# In[113]:
# accumulation
df.cumsum()
# Another type of method is neither a reduction nor an accumulation. **describe** is one
# such example, producing multiple summary statistics in one shot
# In[114]:
df.describe()
# In[115]:
# On non-numeric data, describe produces alternative summary statistics:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
# In[116]:
df
# ### Correlation and Covariance
# In[117]:
import pandas_datareader as web
# In[118]:
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
# In[119]:
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})
# In[120]:
returns = price.pct_change()
# In[121]:
returns.tail()
# The **corr** method of Series computes the correlation of the overlapping, non-NA,
# aligned-by-index values in two Series. Relatedly, **cov** computes the covariance
# In[122]:
print(returns['MSFT'].cov(returns['IBM']))
print(returns['MSFT'].corr(returns['IBM']))
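# In[ ]:
# (added fallback sketch) The Yahoo download above needs network access and the endpoint
# has changed over the years; the hypothetical synthetic `fake_returns` frame below lets
# the same corr/cov/corrwith calls be tried offline. It is random data, not market data.
rng = np.random.default_rng(0)
fake_returns = pd.DataFrame(rng.normal(0, 0.01, size=(250, 4)),
                            columns=['AAPL', 'IBM', 'MSFT', 'GOOG'])
print(fake_returns['MSFT'].cov(fake_returns['IBM']))
print(fake_returns['MSFT'].corr(fake_returns['IBM']))
print(fake_returns.corr())
print(fake_returns.corrwith(fake_returns['IBM']))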
# In[123]:
returns.MSFT.corr(returns.IBM)
# In[124]:
returns.corr()
# In[125]:
returns.cov()
# In[126]:
# corrwith method - computes pairwise correlations between a DataFrame's
# columns or rows and another Series or DataFrame
returns.corrwith(returns.IBM)
# ### Unique Values, Value Counts, and Membership
#
# In[127]:
# unique - gives you an array of the unique values in a Series
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()
# In[128]:
# value_counts - computes a Series containing value frequencies
pd.value_counts(obj.values, sort=True)
# In[129]:
# isin - performs a vectorized set membership check and can be useful in filtering a
# dataset down to a subset of values in a Series or column in a DataFrame
obj
# In[130]:
mask = obj.isin(['b', 'c'])
mask
# In[131]:
obj[mask]
# In[132]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data
# In[133]:
print(pd.value_counts(data.Qu1))
print(pd.value_counts(data.Qu2))
print(pd.value_counts(data.Qu3))
print(data.apply(pd.value_counts).fillna(0))
# index - the distinct values found across the DataFrame
# column values - how many times each distinct value occurs in that column