#!/usr/bin/env python
# coding: utf-8

# ***
# # Introduction to the Basics of Statistics
# ***
# Wang Chengjun (王成军) — wangchengjun@nju.edu.cn
# Computational Communication: http://computational-communication.com
#
# Part 1: cleaning the Titanic data with pandas (pandas practice)
# Part 2: analysing the Tianya BBS reply data (statsmodels practice)

# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# ![](./img/matplotlib.svg)

# In[2]:
import pandas as pd

# ![](./img/pandas_logo.png)

# # Statsmodels
# http://statsmodels.sourceforge.net/
# ![](./img/statsmodels_hybi_banner.png)
#
# Statsmodels is a Python module that allows users to explore data,
# estimate statistical models, and perform statistical tests.
# An extensive list of descriptive statistics, statistical tests,
# plotting functions, and result statistics are available for different
# types of data and each estimator.  Researchers across fields may find
# that statsmodels fully meets their needs for statistical computing
# and data analysis in Python.

# # Cleaning the Titanic data with pandas
# Read the data from the local disk.

# In[2]:
import pandas as pd

train = pd.read_csv('../data/tatanic_train.csv', sep=",", header=0)
test = pd.read_csv('../data/tatanic_test.csv', sep=",", header=0)

# You can easily explore a DataFrame:
# - .describe() summarizes the columns/features of the DataFrame,
#   including the count of observations, mean, max and so on.
# - Another useful trick is to look at the dimensions of the DataFrame,
#   via the .shape attribute of your DataFrame object (ex.
# your_data.shape)

# In[5]:
train.head()

# In[6]:
train.describe()

# In[8]:
train.shape  # len(train), train.columns

# In[11]:
# Passengers that survived vs passengers that passed away.
train['Survived'][:3]

# In[10]:
train['Survived'].value_counts()

# In[9]:
# The same counts, as proportions.
train['Survived'].value_counts(normalize=True)

# In[10]:
train['Sex'].value_counts()

# Named masks keep the group comparisons below readable.
is_male = train['Sex'] == 'male'
is_female = train['Sex'] == 'female'

# In[12]:
train[is_female][:3]

# In[13]:
# Survival flag and fare for male passengers.
train.loc[is_male, ['Survived', 'Fare']][:3]

# In[15]:
# Males that survived vs males that passed away.
train['Survived'][is_male].value_counts()

# In[31]:
# Females that survived vs females that passed away.
train['Survived'][is_female].value_counts()

# In[32]:
# Normalized male survival.
train['Survived'][is_male].value_counts(normalize=True)

# In[33]:
# Normalized female survival.
train['Survived'][is_female].value_counts(normalize=True)

# In[97]:
# Create the column Child, and indicate whether child or not a child.
# Print the new column.
train["Child"] = float('NaN') train.Child[train.Age < 5] = 1 train.Child[train.Age >= 5] = 0 print(train.Child[:3]) # In[12]: # Normalized Survival Rates for under 18 train.Survived[train.Child == 1].value_counts(normalize = True) # In[24]: # Normalized Survival Rates for over 18 train.Survived[train.Child == 0].value_counts(normalize = True) # In[4]: age = pd.cut(train['Age'], [0, 18, 80]) train.pivot_table('Survived', ['Sex', age], 'Pclass') # In[5]: fare = pd.qcut(train['Fare'], 2) train.pivot_table('Survived', ['Sex', age], [fare, 'Pclass']) # In[21]: # Create a copy of test: test_one test_one = test # Initialize a Survived column to 0 test_one['Survived'] = 0 # Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one` test_one.Survived[test_one.Sex =='female'] = 1 print(test_one.Survived[:3]) # In[26]: #Convert the male and female groups to integer form train["Sex"][train["Sex"] == "male"] = 0 train["Sex"][train["Sex"] == "female"] = 1 #Impute the Embarked variable train["Embarked"] = train["Embarked"].fillna('S') #Convert the Embarked classes to integer form train["Embarked"][train["Embarked"] == "S"] = 0 train["Embarked"][train["Embarked"] == "C"] = 1 train["Embarked"][train["Embarked"] == "Q"] = 2 # # 分析天涯回帖数据 # In[98]: df = pd.read_csv('../data/tianya_bbs_threads_list.txt',\ sep = "\t", names = ['title','link', \ 'author','author_page',\ 'click','reply','time']) df[:2] # In[15]: # df=df.rename(columns = {0:'title', 1:'link', \ # 2:'author',3:'author_page',\ # 4:'click', 5:'reply', 6:'time'}) # df[:5] # In[99]: da = pd.read_csv('../data/tianya_bbs_threads_author_info.txt', sep = "\t", names = ['author_page','followed_num',\ 'fans_num','post_num', \ 'comment_num']) da[:2] # In[19]: # da=da.rename(columns = {0:'author_page', 1:'followed_num',\ # 2:'fans_num',3:'post_num', \ # 4:'comment_num'}) # # da[:5] # In[101]: data = pd.concat([df,da], axis=1) len(data) # In[102]: data[:3] # # Time # In[103]: type(data.time[0]) # In[104]: # 
# Extract the date part from the datetime string.

# In[104]:
date = [stamp[:10] for stamp in data.time]
data['date'] = pd.to_datetime(date)

# In[107]:
data.date[:3]

# In[108]:
# Convert the raw string column to datetime and derive calendar parts.
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])

# In[109]:
data.describe()
# data.head()

# # Statsmodels
# http://statsmodels.sourceforge.net/
# ![](./img/statsmodels_hybi_banner.png)
#
# Statsmodels is a Python module that allows users to explore data,
# estimate statistical models, and perform statistical tests.
# An extensive list of descriptive statistics, statistical tests,
# plotting functions, and result statistics are available for different
# types of data and each estimator.  Researchers across fields may find
# that statsmodels fully meets their needs for statistical computing
# and data analysis in Python.
#
# Features include:
# - Linear regression models
# - Generalized linear models
# - Discrete choice models
# - Robust linear models
# - Many models and functions for time series analysis
# - Nonparametric estimators
# - A collection of datasets for examples
# - A wide range of statistical tests
# - Input-output tools for producing tables in a number of formats and
#   for reading Stata files into NumPy and Pandas.
# - Plotting functions
# - Extensive unit tests to ensure correctness of results
# - Many more models and extensions in development

# In[35]:
import statsmodels.api as sm

# # Describe

# In[38]:
data.describe()

# In[39]:
import numpy as np
np.mean(data.click), np.std(data.click), np.sum(data.click)

# In[40]:
# Unweighted descriptive statistics (every observation has weight 1).
d1 = sm.stats.DescrStatsW(data.click,
                          weights=[1] * len(data.click))
d1.mean, d1.var, d1.std, d1.sum

# In[41]:
# Weighted descriptive statistics (clicks weighted by reply count).
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum

# In[163]:
np.median(data.click)
# np.percentile

# In[42]:
plt.hist(data.click)
plt.show()

# In[43]:
plt.hist(data.reply, color='green')
plt.show()

# In[112]:
# Click/reply counts are heavy-tailed; log-transform before plotting.
plt.hist(np.log(data.click + 1), color='green')
plt.hist(np.log(data.reply + 1), color='red')
plt.show()

# In[115]:
plt.boxplot([np.log(data.click + 1)])
plt.show()

# In[47]:
plt.boxplot([data.click, data.reply])
plt.show()

# In[116]:
def transformData(dat):
    """Convert a sequence of count strings to ints, mapping 'na' to 0.

    The author-info file stores missing counts as the literal string
    'na'; everything else is a numeric string.
    """
    return [0 if i == 'na' else int(i) for i in dat]

# In[117]:
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num)
data.post_num = transformData(data.post_num)
data.comment_num = transformData(data.comment_num)
data.describe()

# In[50]:
import numpy as np
plt.boxplot([np.log(data.click + 1), np.log(data.reply + 1),
             np.log(data.fans_num + 1), np.log(data.followed_num + 1)],
            labels=['$Click$', '$Reply$', '$Fans$', '$Followed$'])
plt.show()

# # pandas itself also provides a boxplot method

# In[118]:
fig = plt.figure(figsize=(12, 4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()

# In[52]:
# Bug fix: `from pandas.tools import plotting` — the pandas.tools
# namespace was deprecated in pandas 0.20 and later removed; the
# supported location is pandas.plotting.
pd.plotting.scatter_matrix(data[['click', 'reply',
                                 'post_num', 'comment_num']])
plt.show()

# ### For more plotting operations with pandas.plotting, see:
#
# http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html

# In[53]:
import seaborn  # conda install seaborn
seaborn.pairplot(data,
                 vars=['click', 'reply', 'post_num', 'comment_num'],
                 kind='reg')

# In[54]:
seaborn.pairplot(data,
                 vars=['click', 'reply', 'post_num'],
                 kind='reg',
                 hue='year')

# In[126]:
seaborn.lmplot(y='reply', x='click', data=data,
               # logx=True,
               size=5)
plt.show()

# # value_counts

# In[56]:
data.year.value_counts()

# In[127]:
# Threads per year, sorted chronologically.
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd

# In[128]:
dd.index

# In[129]:
# Turn each year into a "YYYY-01-01" date string.
dd_date_str = [str(year) + '-01-01' for year in dd.index]
dd_date_str

# In[130]:
dd_date = pd.to_datetime(list(dd_date_str))
dd_date

# In[131]:
plt.plot(dd_date, dd.year, 'r-o')
plt.show()

# In[132]:
ds = dd.cumsum()
ds

# In[133]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()

def getDate(dat):
    """Map a year-valued index to Timestamps at Jan 1 of each year."""
    return pd.to_datetime([str(year) + '-01-01' for year in dat.index])

ds.date = getDate(ds)
dd.date = getDate(dd)

plt.plot(ds.date, ds.year, 'g-s',
         label='$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o',
         label='$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2, numpoints=1, fontsize=13)
plt.show()

# # groupby

# In[137]:
dg = data.groupby('year').sum()
dg

# In[138]:
dgs = dg.cumsum()
dgs

# In[139]:
def getDate(dat):
    """Map a year-valued index to Timestamps at Jan 1 of each year."""
    return pd.to_datetime([str(year) + '-01-01' for year in dat.index])

dg.date = getDate(dg)

# In[140]:
fig = plt.figure(figsize=(12, 5))
plt.plot(dg.date, dg.click, 'r-o',
         label='$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s',
         label='$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->',
         label='$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
plt.legend(loc=4, numpoints=1, fontsize=13)
plt.show()

# In[141]:
data.groupby('year')['click'].sum()

# In[142]:
data.groupby('year')['click'].mean()

# # Common statistical analysis methods
# - t-test
# - chi-square test
# - correlation
# - regression

# # T-test
# http://statsmodels.sourceforge.net/devel/stats.html
#
# RQ: do reposted threads receive significantly fewer clicks than
# original threads?

# In[143]:
# Dummy variable: 1 if the title marks the thread as a repost
# (contains "转载"), 0 for an original post.
repost = [1 if u'转载' in i else 0 for i in df.title]

# In[145]:
df['repost'] = repost

# In[146]:
df.groupby('repost').median()

# In[147]:
df['click'][df['repost'] == 0][:5]

# In[148]:
df['click'][df['repost'] == 1][:5]

# In[152]:
# Bug fix: the original called
#   stats.ttest_ind(np.log(df.click+1), df.repost)
# which treats the 0/1 repost dummy itself as the second sample — that
# comparison is meaningless for the research question.  An independent
# two-sample t-test must compare the (log) click counts of the two
# groups defined by the dummy.
from scipy import stats
clicks_original = np.log(df.click[df.repost == 0] + 1)
clicks_repost = np.log(df.click[df.repost == 1] + 1)
stats.ttest_ind(clicks_original, clicks_repost)

# In[154]:
# statsmodels version: returns test statistic, p-value and degrees of
# freedom.
sm.stats.ttest_ind(clicks_original, clicks_repost)

# # A chi-squared test
# https://en.wikipedia.org/wiki/Chi-squared_test
#
# - also referred to as χ² test (or chi-square test), is any statistical
#   hypothesis test in which the sampling distribution of the test
#   statistic is a chi-square distribution when the null hypothesis is true.
# - A chi-squared test can then be used to `reject the null hypothesis
#   that the data are independent`.
# - Test statistics that follow a chi-squared distribution arise from an
#   `assumption of independent normally distributed data`, which is valid
#   in many cases due to `the central limit theorem`.
# - Chi-squared tests are often constructed from a `sum of squared
#   errors`, or through the `sample variance`.
#
# Worked example:
# - Suppose there is a city of 1 million residents with `four
#   neighborhoods`: A, B, C, and D.
# - A random sample of 650 residents of the city is taken and their
#   occupation is recorded as "blue collar", "white collar", or
#   "no collar".
# - `The null hypothesis` is that each person's neighborhood of residence
#   is independent of the person's occupational classification.
# The data are tabulated as:
#
# |              | A   | B   | C   | D   | Total|
# | -------------|:---:|:---:|:---:|-----:|-----:|
# | White collar | 90  | 60  | 104 | 95  | 349 |
# | Blue collar  | 30  | 50  | 51  | 20  | 151 |
# | No collar    | 30  | 40  | 45  | 35  | 150 |
# | Total        | 150 | 150 | 200 | 150 | 650 |
#
# - Let us take the sample living in neighborhood A, 150/650, to estimate
#   what proportion of the whole 1 million people live in neighborhood A.
# - Similarly we take 349/650 to estimate what proportion of the
#   1 million people are white-collar workers.
# - By the assumption of independence under the hypothesis we should
#   "expect" the number of `white-collar workers in neighborhood A` to be
#
# $
# \frac{150}{650} \times \frac{349}{650} \times 650 = 80.54
# $
#
# Then in that "cell" of the table, we have
#
# $\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
#
# The sum of these quantities over all of the cells is the `test statistic`.
#
# Under the null hypothesis, it has approximately a chi-square
# distribution whose number of `degrees of freedom` is
#
# $ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
#
# If the test statistic is improbably large according to that chi-square
# distribution, then one rejects the null hypothesis of independence.
#
# ![](./img/600px-Chi-square_distributionCDF-English.png)
#
# # scipy.stats.chisquare(f_obs, f_exp=None, ddof=0, axis=0)
# - Calculates a one-way chi-square test.
# - The chi-square test tests the null hypothesis that the categorical
#   data has the given frequencies.
#
# Parameters:
# - f_obs : array_like — observed frequencies in each category.
# - f_exp : array_like, optional — expected frequencies in each category.
#   By default the categories are assumed to be equally likely.
# - ddof : int, optional

# In[428]:
from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12],
          f_exp=[16, 16, 16, 16, 16, 8])

# ![](./img/scipy_org_logo.gif)

# In[155]:
from scipy.stats import chi2
# p_value = chi2.sf(chi_statistic, df); the two lines below are
# equivalent ways to get the upper-tail probability.
print(chi2.sf(3.5, 5))
print(1 - chi2.cdf(3.5, 5))

# # Correlation

# In[157]:
# Pearson correlation of the log-transformed counts.
# np.corrcoef(data.click, data.reply)
np.corrcoef(np.log(data.click + 1),
            np.log(data.reply + 1))

# In[383]:
data.corr()

# In[13]:
plt.plot(df.click, df.reply, 'r-o')
plt.show()

# In[16]:
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize=20)
plt.ylabel('$Replies$', fontsize=20)
plt.xscale('log')
plt.yscale('log')
# Typo fix: the displayed title previously read "Allowmetric".
plt.title('$Allometric\,Law$', fontsize=20)
plt.show()

# # Regression

# In[66]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# In[67]:
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)',
                  data=dat).fit()

# Some Windows users cannot run the code above.  Open a terminal in
# spyder and run:  pip install -U patsy
# https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q

# In[21]:
# Inspect the results.
# Bug fix: the original used the Python 2 print statement
# (`print results.summary()`), which is a SyntaxError under Python 3 —
# the rest of this file already uses the print() function.
print(results.summary())

# In[68]:
reg = smf.ols('reply ~ click + followed_num',
              data=data).fit()

# In[138]:
reg.summary()

# In[60]:
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) +np.log(followed_num+1)+month',
               data=data).fit()
print(reg1.summary())

# In[209]:
fig = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_partregress_grid(reg1, fig=fig)
plt.show()

# In[429]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# NOTE(review): the assignment below rebinds the module-level name
# `data`, clobbering the Tianya DataFrame used above — rename it if
# later cells need the Tianya data again.
moore = sm.datasets.get_rdataset("Moore", "car", cache=True)  # load data
data = moore.data
data = data.rename(columns={"partner.status": "partner_status"})  # make name pythonic

# In[434]:
data[:5]

# # References
# - scipy-lectures
# - http://www.scipy-lectures.org/index.html
# -
http://www.scipy-lectures.org/packages/statistics/index.html