#!/usr/bin/env python
# coding: utf-8

# Notebook export: pandas quick-start tutorial (setup, object creation,
# viewing data and label-based selection).

# In[1]:

# `get_ipython` is injected by IPython/Jupyter; it is not defined when this
# file is run with a plain Python interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

from IPython.core.display import HTML

# Read both stylesheets, closing each file as soon as it has been read.
css_chunks = []
for css_path in ('style-table.css', 'style-notebook.css'):
    with open(css_path) as css_file:
        css_chunks.append(css_file.read())
css = ''.join(css_chunks)
# The CSS has to be wrapped in a <style> element: formatting an empty
# template would throw the stylesheet away and inject nothing.
HTML('<style>{}</style>'.format(css))


# This is a short introduction aimed at pandas newcomers. For more advanced
# material see the
# [*Cookbook*](http://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook)

# Usually we start by importing the following libraries:

# In[3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# # Object creation

# Create a **Series** by passing a list; pandas creates a default integer
# index:

# In[4]:

s = pd.Series([1, 3, 5, np.nan, 6, 8])
s


# Create a **DataFrame** by passing a numpy array together with a datetime
# index and column labels:

# In[5]:

dates = pd.date_range('20130101', periods=6)
dates


# In[6]:

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


# Create a **DataFrame** by passing a dict of objects that can be converted
# to something series-like:

# In[7]:

df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2


# The resulting columns have these dtypes:

# In[8]:

df2.dtypes


# # Viewing data

# Look at the first and last rows of the frame:

# In[9]:

df.head()


# In[10]:

df.tail(3)


# Display the index, the columns and the underlying numpy data

# In[11]:

df.index


# In[12]:

df.columns


# In[13]:

df.values


# describe() gives a quick statistical summary of the data

# In[14]:

df.describe()


# Transpose the data:

# In[15]:

df.T


# Sort along an axis:

# In[16]:

df.sort_index(axis=1, ascending=False)


# Sort by values:

# In[17]:

df.sort_values(by='B')


# # Selection

# Note: standard Python/Numpy expressions can select and assign, but the
# optimized pandas accessors are recommended instead: .at, .iat, .loc and
# .iloc (the legacy .ix accessor is no longer available in current pandas).

# ## Getting

# Select a single column, which returns a **Series**; equivalent to **df.A**:

# In[18]:

df['A']


# Slice rows using **[ ]**:

# In[19]:

df[0:3]


# In[20]:

df['20130102':'20130104']


# ## Selection by label

# Get a cross section using a label:

# In[21]:

df.loc[dates[0]]


# Select on multiple axes by label

# In[22]:

df.loc[:, ['A', 'B']]


# Label slicing; both endpoints are included

# In[23]:

df.loc['20130102':'20130104', ['A', 'B']]


# Reduce the dimensions of the returned object

# In[24]:

df.loc['20130102', ['A', 'B']]


# Get a scalar value

# In[25]:

df.loc[dates[0], 'A']


# Fast access to a scalar (equivalent to the previous method)

# In[26]:

df.at[dates[0], 'A']


# ## Selection by position

# Select by passing integer positions

# In[27]:

df.iloc[3]


# Slice by integer position, the same way as in python/numpy

# In[28]:

df.iloc[3:5, 0:2]


# Slice rows only

# In[29]:

df.iloc[1:3, :]


# Slice columns only

# In[30]:

df.iloc[:, 1:3]


# Get a single value

# In[31]:

df.iloc[1, 1]


# Fast access to a single value (equivalent to the previous method)

# In[32]:

df.iat[1, 1]


# ## Boolean indexing

# Use the values of a single column to select data

# In[33]:

df[df.A > 0]


# Use a **where** operation to select data

# In[34]:

df[df > 0]


# Use the **isin()** method for filtering

# In[35]:

df2 = df.copy()


# In[36]:

df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2


# In[37]:

df2[df2['E'].isin(['two', 'four'])]


# ## Setting

# Assigning a new column automatically aligns the data by index

# In[38]:

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1


# In[39]:

df['F'] = s1
df


# Set a value by label

# In[40]:

df.at[dates[0], 'A'] = 0
df


# Set a value by position

# In[41]:

df.iat[0, 1] = 0
df


# Set values by assigning a numpy array

# In[42]:

df.loc[:, 'D'] = np.array([5] * len(df))
df


# Set values with a **where** operation

# In[43]:

df2 = df.copy()
df2[df2 > 0] = -df2
df2


# # Missing data

# pandas uses **np.nan** to represent missing data; by default such values
# are excluded from computations.
#
# reindex() lets you change, add or delete the index on a given axis and
# returns a copy of the data.

# In[44]:

df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1


# Drop every row that contains missing data

# In[45]:

df1.dropna(how='any')


# Fill missing data

# In[46]:

df1.fillna(value=5)


# Get a boolean mask marking where values are **nan**

# In[47]:

pd.isnull(df1)


# # Operations

# ## Stats

# Operations generally exclude missing data.

# Perform a descriptive statistic

# In[48]:

df.mean()


# The same operation on the other axis

# In[49]:

df.mean(axis=1)


# Operating on objects with different dimensionality requires alignment;
# pandas then broadcasts automatically along the specified dimension.

# In[50]:

s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s


# In[51]:

df.sub(s, axis='index')


# ## Apply

# Apply functions to the data with apply()

# In[52]:

df.apply(np.cumsum)


# In[53]:

df.apply(lambda x: x.max() - x.min())


# ## Histogramming

# In[54]:

s = pd.Series(np.random.randint(0, 7, size=10))
s


# In[55]:

s.value_counts()


# ## String methods

# A Series exposes a set of string processing methods through its str
# attribute, which make it easy to operate on each element of the array, as
# in the snippet below. Note that pattern matching in str generally uses
# regular expressions by default.

# In[56]:

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()


# # Merge

# ## Concat

# pandas provides many facilities for easily combining Series and DataFrame
# objects (and, in legacy pandas releases, Panel objects) with various kinds
# of set logic.

# Concatenate pandas objects with **concat()**

# In[57]:

df = pd.DataFrame(np.random.randn(10, 4))
df


# In[58]:

# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pieces


# In[59]:

pd.concat(pieces)


# ## Join

# SQL-style merges

# In[60]:

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
left


# In[61]:

right = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [4, 5]})
right


# In[62]:

pd.merge(left, right, on='key')


# ## Append

# Append rows to the end of a DataFrame

# In[63]:

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df


# In[64]:

s = df.iloc[3]
s


# In[65]:

# DataFrame.append is not available in current pandas; turning the row into
# a one-row frame and concatenating yields the same result.
pd.concat([df, s.to_frame().T], ignore_index=True)


# # Grouping

# By "group by" we mean a process involving one or more of these steps:
#
# * **Splitting** the data into groups based on some criteria
# * **Applying** a function to each group independently
# * **Combining** the results into a data structure

# In[66]:

df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df


# Group, then apply the sum function to each group

# In[67]:

df.groupby('A').sum()


# Grouping by multiple columns forms a hierarchical index, to which the
# function is then applied

# In[68]:

df.groupby(['A', 'B']).sum()


# # Reshaping

# ## Stack

# In[69]:

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))


# In[70]:

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])


# In[71]:

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])


# In[72]:

df2 = df[:4]
df2


# The **stack()** method "compresses" a level in the DataFrame's columns

# In[73]:

stacked = df2.stack()
stacked


# For a "stacked" DataFrame or Series (with a MultiIndex as the index), the
# inverse of **stack()** is **unstack()**, which by default unstacks the
# last level

# In[74]:

stacked.unstack()


# In[75]:

stacked.unstack(1)


# In[76]:

stacked.unstack(0)


# ## Pivot tables

# In[77]:

df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
df


# We can easily produce a pivot table from this data

# In[78]:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])


# # Time series

# pandas has simple, powerful and efficient functionality for resampling
# during frequency conversion (for example, converting secondly data into
# 5-minutely data). This is very common in, but not limited to, financial
# applications.
#
# NOTE(review): recent pandas releases deprecate the 'S' / 'H' / 'M'
# frequency aliases used below in favour of newer spellings - confirm
# against the pandas version in use.

# In[79]:

rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng


# In[80]:

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts


# In[81]:

# resample() returns a resampler object; the aggregation is a method call
# on it (the old `how=` keyword is not accepted by current pandas).
ts.resample('5Min').sum()


# Time zone representation

# In[82]:

rng = pd.date_range('3/6/2012', periods=5, freq='D')
rng


# In[83]:

ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts


# In[84]:

ts_utc = ts.tz_localize('UTC')
ts_utc


# Convert to another time zone

# In[85]:

ts_utc.tz_convert('US/Eastern')


# Converting between time span representations

# In[86]:

rng = pd.date_range('1/1/2012', periods=5, freq='M')
rng


# In[87]:

ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts


# In[88]:

ps = ts.to_period()
ps


# In[89]:

ps.to_timestamp()


# Converting between periods and timestamps makes some convenient arithmetic
# functions available. For example, we take quarterly data whose year ends
# in November and convert it to data anchored at the month in which each
# quarter ends.

# In[90]:

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng


# In[91]:

ts = pd.Series(np.random.randn(len(prng)), index=prng)
ts


# In[92]:

ts.index = prng.asfreq('M', 'end').asfreq('H', 'start') + 9
ts


# # Categoricals

# Since version 0.15, pandas can include categorical data in a
# **DataFrame**.

# In[93]:

df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'e', 'e']})
df


# Convert raw_grade to a categorical data type

# In[94]:

df["grade"] = df["raw_grade"].astype("category")
df["grade"]


# Rename the categories to more meaningful names

# In[95]:

# Assigning to `.cat.categories` is not supported by current pandas;
# rename_categories returns a new Series that is assigned back instead.
df["grade"] = df["grade"].cat.rename_categories(
    ["very good", "good", "very bad"])


# Reorder the categories and add the missing ones at the same time

# In[96]:

df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"])
df["grade"]


# Sorting follows the order of the categories, not lexical order

# In[97]:

df.sort_values(by="grade")


# Grouping by a categorical column also shows empty categories

# In[98]:

# observed=False is passed explicitly so unused categories stay in the result.
df.groupby("grade", observed=False).size()


# # Plotting

# In[99]:

ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()


# On a DataFrame, **plot()** is a convenient way to draw all of the columns
# together with their labels

# In[100]:

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')


# # Getting data in/out

# ## CSV

# Write to a csv file

# In[101]:

df.to_csv('foo.csv')


# Read from a csv file

# In[102]:

pd.read_csv('foo.csv')


# ## HDF5

# Reading and writing HDFStores

# Write to an HDF5 Store

# In[103]:

# The store key is passed by keyword; passing it positionally to to_hdf is
# deprecated in recent pandas.
df.to_hdf('foo.h5', key='df')


# Read from an HDF5 Store

# In[104]:

pd.read_hdf('foo.h5', key='df')


# ## Excel

# Reading and writing MS Excel files

# Write to an Excel file

# In[105]:

df.to_excel('foo.xlsx', sheet_name='Sheet1')


# Read from an Excel file

# In[106]:

pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])