#!/usr/bin/env python
# coding: utf-8

# ***
# ***
# # Introduction to the Basics of Statistics
# ***
# ***
# 
# 王成军 
# 
# wangchengjun@nju.edu.cn
# 
# 计算传播网 http://computational-communication.com

# # 一、使用Pandas清洗泰坦尼克数据
# - 练习使用Pandas
# 
# # 二、分析天涯回帖数据
# - 学习使用Statsmodels
# 
# 

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# ![](./img/matplotlib.svg)

# In[2]:


import pandas as pd


# ![](./img/pandas_logo.png)

# # Statsmodels
# 
# http://statsmodels.sourceforge.net/
# 
# ![](./img/statsmodels_hybi_banner.png)
# 
# Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests. 
# 
# An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator. 
# 
# Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python. 

# # 使用pandas清洗泰坦尼克数据

# # 从本机读取数据

# In[2]:


import pandas as pd

train = pd.read_csv('../data/tatanic_train.csv',\
                    sep = ",", header=0)
test = pd.read_csv('../data/tatanic_test.csv',\
                    sep = ",", header=0)


# You can easily explore a DataFrame  
#  - .describe() summarizes the columns/features of the DataFrame, including the count of observations, mean, max and so on. 
#  - Another useful trick is to look at the dimensions of the DataFrame. This is done by requesting the .shape attribute of your DataFrame object. (ex. your_data.shape)

# In[5]:


train.head()


# In[6]:


train.describe()


# In[8]:


train.shape#, len(train)
#train.columns


# In[11]:


# Passengers that survived vs passengers that passed away
train["Survived"][:3]


# In[10]:


# Passengers that survived vs passengers that passed away
train["Survived"].value_counts()


# In[9]:


# As proportions
train["Survived"].value_counts(normalize = True)


# In[10]:


train['Sex'].value_counts()


# In[12]:


train[train['Sex']=='female'][:3]#[train['Pclass'] == 3]


# In[13]:


# Males that survived vs males that passed away
train[["Survived", 'Fare']][train["Sex"] == 'male'][:3]


# In[15]:


# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male'].value_counts() 


# In[31]:


# Females that survived vs Females that passed away
train["Survived"][train["Sex"] == 'female'].value_counts() 


# In[32]:


# Normalized male survival
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True) 


# In[33]:


# Normalized female survival
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)


# In[97]:


# Create the column Child, and indicate whether child or not a child. Print the new column.
train["Child"] = float('NaN')
train.Child[train.Age < 5] = 1
train.Child[train.Age >= 5] = 0
print(train.Child[:3])


# In[12]:


# Normalized Survival Rates for under 18
train.Survived[train.Child == 1].value_counts(normalize = True)


# In[24]:


# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize = True)


# In[4]:


age = pd.cut(train['Age'], [0, 18, 80])
train.pivot_table('Survived', ['Sex', age], 'Pclass')


# In[5]:


fare = pd.qcut(train['Fare'], 2)
train.pivot_table('Survived', ['Sex', age], [fare, 'Pclass'])


# In[21]:


# Create a copy of test: test_one
test_one = test
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one.Survived[test_one.Sex =='female'] = 1

print(test_one.Survived[:3])


# In[26]:


#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')

#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2


# # 分析天涯回帖数据

# In[98]:


df = pd.read_csv('../data/tianya_bbs_threads_list.txt',\
                 sep = "\t", names = ['title','link', \
                        'author','author_page',\
                        'click','reply','time'])
df[:2]


# In[15]:


# df=df.rename(columns = {0:'title', 1:'link', \
#                         2:'author',3:'author_page',\
#                         4:'click', 5:'reply', 6:'time'})
# df[:5]


# In[99]:


da = pd.read_csv('../data/tianya_bbs_threads_author_info.txt', 
                 sep = "\t", names = ['author_page','followed_num',\
                        'fans_num','post_num', \
                        'comment_num'])
da[:2]


# In[19]:


# da=da.rename(columns = {0:'author_page', 1:'followed_num',\
#                         2:'fans_num',3:'post_num', \
#                         4:'comment_num'})
# # da[:5]


# In[101]:


data = pd.concat([df,da], axis=1)
len(data)


# In[102]:


data[:3]


# # Time

# In[103]:


type(data.time[0])


# In[104]:


# extract date from datetime
# date = map(lambda x: x[:10], data.time)
date = [i[:10] for i in data.time]
#date = [i[:10] for i in data.time]
data['date'] = pd.to_datetime(date)


# In[107]:


data.date[:3]


# In[108]:


# convert str to datetime format
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])


# In[109]:


data.describe()
#data.head()


# # Statsmodels
# 
# http://statsmodels.sourceforge.net/
# 
# ![](./img/statsmodels_hybi_banner.png)
# 
# Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests. 
# 
# An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator. 
# 
# Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python. 

# # Features include:
# 
# - Linear regression models
# - Generalized linear models
# - Discrete choice models
# - Robust linear models
# - Many models and functions for time series analysis
# - Nonparametric estimators
# - A collection of datasets for examples
# - A wide range of statistical tests
# - Input-output tools for producing tables in a number of formats and for reading Stata files into NumPy and Pandas.
# - Plotting functions
# - Extensive unit tests to ensure correctness of results
# - Many more models and extensions in development

# In[35]:


import statsmodels.api as sm


# # Describe

# In[38]:


data.describe()


# In[39]:


import numpy as np

np.mean(data.click), np.std(data.click), np.sum(data.click)


# In[40]:


# 不加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, \
                          weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum


# In[41]:


# 加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum


# In[163]:


np.median(data.click) # np.percentile 


# In[42]:


plt.hist(data.click)
plt.show()


# In[43]:


plt.hist(data.reply, color = 'green')
plt.show()


# In[112]:


plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()


# In[115]:


# Plot the height and weight to see
plt.boxplot([np.log(data.click+1)])
plt.show()


# In[47]:


# Plot the height and weight to see
plt.boxplot([data.click, data.reply])
plt.show()


# In[116]:


def transformData(dat):
    results = []
    for i in dat:
        if i != 'na':
            results.append( int(i))
        else:
            results.append(0)
    return results


# In[117]:


data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
data.describe()


# In[50]:


import numpy as np
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1), np.log(data.reply+1), 
             np.log(data.fans_num+1),\
             np.log(data.followed_num + 1)], 
            labels = ['$Click$', '$Reply$', '$Fans$',\
                      '$Followed$'])
plt.show()


# # Pandas自身已经包含了boxplot的功能

# In[118]:


fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()


# In[52]:


from pandas.tools import plotting

# fig = plt.figure(figsize=(10, 10))
plotting.scatter_matrix(data[['click', 'reply',\
                              'post_num','comment_num']]) 
plt.show()


# ### 更多使用pandas.plotting绘图的操作见：
# http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html

# In[53]:


import seaborn # conda install seaborn
seaborn.pairplot(data, vars=['click', 'reply', \
                             'post_num', 'comment_num'],
                  kind='reg')  


# In[54]:


seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
                 kind='reg', hue='year')  


# In[126]:


seaborn.lmplot(y='reply', x='click', data=data, #logx = True,
               size = 5) 
plt.show()


# # values_counts

# In[56]:


data.year.value_counts()


# In[127]:


d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd


# In[128]:


dd.index


# In[129]:


dd_date_str = list(map(lambda x: str(x) +'-01-01', dd.index))
dd_date_str


# In[130]:


dd_date = pd.to_datetime(list(dd_date_str))
dd_date


# In[131]:


plt.plot(dd_date, dd.year, 'r-o')
plt.show()


# In[132]:


ds = dd.cumsum()
ds


# In[133]:


d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()

def getDate(dat):
    dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date

ds.date = getDate(ds)
dd.date = getDate(dd)

plt.plot(ds.date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()


# # groupby

# In[137]:


dg = data.groupby('year').sum()
dg


# In[138]:


dgs = dg.cumsum()
dgs


# In[139]:


def getDate(dat):
    dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date

dg.date = getDate(dg)


# In[140]:


fig = plt.figure(figsize=(12,5))
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')

plt.yscale('log')

plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()


# In[141]:


data.groupby('year')['click'].sum()


# In[142]:


data.groupby('year')['click'].mean()


# # 常用的统计分析方法
# - t检验
# - 卡方检验
# - 相关
# - 回归

# # T-test
# 
# http://statsmodels.sourceforge.net/devel/stats.html

# # RQ: 转载的文章的点击量是否显著地小于原创的文章？

# In[143]:


repost = []
for i in df.title:
    if u'转载' in i:
        repost.append(1)
    else:
        repost.append(0)


# In[145]:


df['repost'] = repost


# In[146]:


df.groupby('repost').median()


# In[147]:


df['click'][df['repost']==0][:5]


# In[148]:


df['click'][df['repost']==1][:5]


# In[152]:


from scipy import stats
stats.ttest_ind(np.log(df.click+1), df.repost)


# In[154]:


sm.stats.ttest_ind(np.log(df.click+1), df.repost)
# test statistic, pvalue and degrees of freedom


# # A chi-squared test
# 
# https://en.wikipedia.org/wiki/Chi-squared_test
# 
# - also referred to as χ² test (or chi-square test), is any statistical hypothesis test in which the sampling distribution of the test statistic is a chi-square distribution when the null hypothesis is true. 
# - A chi-squared test can then be used to `reject the null hypothesis that the data are independent`.
# - Test statistics that follow a chi-squared distribution arise from an `assumption of independent normally distributed data`, which is valid in many cases due to `the central limit theorem`. 
# - Chi-squared tests are often constructed from a `sum of squared errors`, or through the `sample variance`.
# 
# 
# 

# - Suppose there is a city of 1 million residents with `four neighborhoods`: A, B, C, and D. 
# 
# - A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar". 
# 
# - `The null hypothesis` is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:

# 
# |               |  A           |    B          |C              | D  | Total|
# | -------------|:-------------:|:-------------:|:-------------:|-----:|-----:|
# | White collar| 90 | 60 | 104 |95 | 349|
# | Blue collar| 30 | 50 | 51 | 20| 151|
# | No coloar| 30 | 40 | 45 | 35|150|
# | Total | 150 | 150| 200| 150| 650|

# - Let us take the sample living in neighborhood A, 150/650, to estimate what proportion of the whole 1 million people live in neighborhood A. 
# - Similarly we take 349/650 to estimate what proportion of the 1 million people are white-collar workers. 
# - By the assumption of independence under the hypothesis we should "expect" the number of `white-collar workers in neighborhood A` to be
# 
# $
# \frac{150}{650} \frac{349}{650} 650 = 80.54
# $

# Then in that "cell" of the table, we have
# 
# $\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
# 
# The sum of these quantities over all of the cells is the `test statistic`. 
# 
# Under the null hypothesis, it has approximately a chi-square distribution whose number of `degrees of freedom` are
# 
# $ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $

# If the test statistic is improbably large according to that chi-square distribution, then one rejects the null hypothesis of independence.

# ![](./img/600px-Chi-square_distributionCDF-English.png)

# # scipy.stats.chisquare(f_obs, f_exp=None, ddof=0, axis=0)[source]
# - Calculates a one-way chi square test.
# - The chi square test tests the null hypothesis that the categorical data has the given frequencies.
# 
# Parameters:	
# - f_obs : array_like Observed frequencies in each category.
# - f_exp : array_like, optional Expected frequencies in each category. By default the categories are assumed to be equally likely.
# - ddof : int, optional

# In[428]:


from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12], \
          f_exp=[16, 16, 16, 16, 16, 8])


# ![](./img/scipy_org_logo.gif)

# In[155]:


from scipy.stats import chi2
# p_value = chi2.sf(chi_statistic, df)
print(chi2.sf(3.5, 5))
print(1 - chi2.cdf(3.5,5))


# # Correlation

# In[157]:


# np.corrcoef(data.click, data.reply)

np.corrcoef(np.log(data.click+1), \
                  np.log(data.reply+1))


# In[383]:


data.corr()


# In[13]:


plt.plot(df.click, df.reply, 'r-o')
plt.show()


# In[16]:


plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
plt.title('$Allowmetric\,Law$', fontsize = 20)
plt.show()


# # Regression

# In[66]:


import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


# In[67]:


# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', \
                  data=dat).fit()


# 有些使用windows的同学无法运行上述代码：
# 
# ## 在spyder中打开terminal
# # 输入: pip install -U patsy
# 
# https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q
# 

# In[21]:


# Inspect the results
print results.summary()


# In[68]:


reg = smf.ols('reply ~ click + followed_num', \
              data=data).fit()


# In[138]:


reg.summary()


# In[60]:


reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) \
+np.log(followed_num+1)+month', data=data).fit()
print reg1.summary()


# In[209]:


fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()


# In[429]:


import statsmodels.api as sm
from statsmodels.formula.api import ols

moore = sm.datasets.get_rdataset("Moore", "car",
                                 cache=True) # load data
data = moore.data
data = data.rename(columns={"partner.status" :
                             "partner_status"}) # make name pythonic


# In[434]:


data[:5]


# # 参考文献
# - scipy-lectures 
#     - http://www.scipy-lectures.org/index.html
#     - http://www.scipy-lectures.org/packages/statistics/index.html