%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# Import the Pandas library
import pandas as pd
# Load the train and test datasets to create two DataFrames
# (fetched from DataCamp's S3 bucket; requires network access)
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
#Print the `head` of the train and test dataframes
print(train.head())
print(test.head())
PassengerId Survived Pclass \ 0 1 0 3 1 2 1 1 2 3 1 3 3 4 1 1 4 5 0 3 Name Sex Age SibSp \ 0 Braund, Mr. Owen Harris male 22.0 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 2 Heikkinen, Miss. Laina female 26.0 0 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 4 Allen, Mr. William Henry male 35.0 0 Parch Ticket Fare Cabin Embarked 0 0 A/5 21171 7.2500 NaN S 1 0 PC 17599 71.2833 C85 C 2 0 STON/O2. 3101282 7.9250 NaN S 3 0 113803 53.1000 C123 S 4 0 373450 8.0500 NaN S PassengerId Pclass Name Sex \ 0 892 3 Kelly, Mr. James male 1 893 3 Wilkes, Mrs. James (Ellen Needs) female 2 894 2 Myles, Mr. Thomas Francis male 3 895 3 Wirz, Mr. Albert male 4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female Age SibSp Parch Ticket Fare Cabin Embarked 0 34.5 0 0 330911 7.8292 NaN Q 1 47.0 1 0 363272 7.0000 NaN S 2 62.0 0 0 240276 9.6875 NaN Q 3 27.0 0 0 315154 8.6625 NaN S 4 22.0 1 1 3101298 12.2875 NaN S
# Cache local copies. to_csv also writes the row index as an extra first
# column, which reappears as "Unnamed: 0" when these files are re-read below.
# NOTE(review): "tatanic" is a typo in the filenames, kept because later
# cells read these exact paths.
train.to_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv')
test.to_csv('/Users/chengjun/github/cjc2016/data/tatanic_test.csv')
You can easily explore a DataFrame
# Reload the cached copy from disk; header=0 takes column names from the
# first row, and sep="," is the (default) field separator spelled out.
import pandas as pd
train = pd.read_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv', sep = ",", header=0)
train.head()
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Summary statistics (count, mean, std, min/quartiles/max) of numeric columns
train.describe()
Unnamed: 0 | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 445.000000 | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 222.500000 | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 445.000000 | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 667.500000 | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 890.000000 | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# (rows, columns): 891 passengers, 13 columns (12 + the saved index column)
train.shape
(891, 13)
# Passengers that survived vs passengers that passed away
# (0 = died, 1 = survived)
train["Survived"].value_counts()
0 549 1 342 Name: Survived, dtype: int64
# As proportions
# normalize=True turns the counts into fractions that sum to 1
train["Survived"].value_counts(normalize = True)
0 0.616162 1 0.383838 Name: Survived, dtype: float64
# Passenger counts by sex
train['Sex'].value_counts()
male 577 female 314 Name: Sex, dtype: int64
# Males that survived vs males that passed away
# (boolean mask selects male rows before counting outcomes)
train["Survived"][train["Sex"] == 'male'].value_counts()
0 468 1 109 Name: Survived, dtype: int64
# Females that survived vs Females that passed away
# (same masking pattern as the male case above)
train["Survived"][train["Sex"] == 'female'].value_counts()
1 233 0 81 Name: Survived, dtype: int64
# Normalized male survival
# (share of males in each Survived category)
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
0 0.811092 1 0.188908 Name: Survived, dtype: float64
# Normalized female survival
# (share of females in each Survived category)
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
1 0.742038 0 0.257962 Name: Survived, dtype: float64
# Create the column Child: 1 if the passenger is under 18, 0 otherwise;
# rows with a missing Age keep the NaN default. Print the new column.
train["Child"] = float('NaN')
# .loc assigns in place and avoids the SettingWithCopyWarning that the
# original chained indexing (train.Child[mask] = ...) triggered.
train.loc[train.Age < 18, "Child"] = 1
train.loc[train.Age >= 18, "Child"] = 0
# print() works on both Python 2 and 3 (the original used a py2 print statement)
print(train.Child[:3])
0 0.0 1 0.0 2 0.0 Name: Child, dtype: float64
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Normalized Survival Rates for under 18
# (Child == 1; rows with NaN Child are excluded by the mask)
train.Survived[train.Child == 1].value_counts(normalize = True)
1 0.539823 0 0.460177 Name: Survived, dtype: float64
# Normalized Survival Rates for over 18 (Child == 0 means age >= 18)
train.Survived[train.Child == 0].value_counts(normalize = True)
0 0.618968 1 0.381032 Name: Survived, dtype: float64
# Create a copy of test: test_one.
# .copy() makes a real copy — the original `test_one = test` only bound a
# second name to the same DataFrame, so the assignments below silently
# mutated `test` as well, contradicting the comment.
test_one = test.copy()
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female"; .loc avoids the
# SettingWithCopyWarning raised by chained-indexing assignment.
test_one.loc[test_one.Sex == 'female', 'Survived'] = 1
# print() works on both Python 2 and 3
print(test_one.Survived[:3])
0 0 1 1 2 0 Name: Survived, dtype: int64
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Convert the male and female groups to integer form.
# Series.replace keeps any unexpected values untouched (same net effect as
# the original masked assignment) and avoids the SettingWithCopyWarning
# raised by chained indexing like train["Sex"][mask] = 0.
train["Sex"] = train["Sex"].replace({"male": 0, "female": 1})
# Impute the Embarked variable: missing ports default to 'S' (Southampton)
train["Embarked"] = train["Embarked"].fillna('S')
# Convert the Embarked classes to integer form
train["Embarked"] = train["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Tianya BBS thread list: tab-separated, no header row (columns are 0..6)
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 |
# Replace the positional column labels 0..6 with descriptive names.
thread_columns = {0: 'title', 1: 'link', 2: 'author', 3: 'author_page',
                  4: 'click', 5: 'reply', 6: 'time'}
df = df.rename(columns=thread_columns)
df[:5]
title | link | author | author_page | click | reply | time | |
---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 |
2 | 宁波准备停止PX项目了,元芳,你怎么看? | /post-free-2848797-1.shtml | 牧阳光 | http://www.tianya.cn/36535656 | 82779 | 625 | 2012-10-28 19:11 |
3 | 【吴钩一言堂】漳州PX成"二踢响",说明老百姓科学素质不低 | /post-free-5043260-1.shtml | 挑灯看吴钩 | http://www.tianya.cn/36959960 | 45304 | 219 | 2015-04-07 21:30 |
4 | PX是否有毒?宁波镇海事件谁在推波助澜(转载) | /post-free-2848995-1.shtml | zzjzzpgg12 | http://www.tianya.cn/53134970 | 38132 | 835 | 2012-10-28 21:08 |
# Author-level metrics for the same threads: tab-separated, no header row
da = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_author_info.txt', sep = "\t", header=None)
da[:2]
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 |
1 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 |
# Name the author columns; author_page is the key shared with df
da=da.rename(columns = {0:'author_page', 1:'followed_num', 2:'fans_num',3:'post_num', 4:'comment_num'})
da[:5]
author_page | followed_num | fans_num | post_num | comment_num | |
---|---|---|---|---|---|
0 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 |
1 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 |
2 | http://www.tianya.cn/36535656 | 19 | 28 | 816 | 1268 |
3 | http://www.tianya.cn/36959960 | 25 | 307 | 513 | 1237 |
4 | http://www.tianya.cn/53134970 | 17 | 22 | 79 | 3256 |
# Side-by-side concatenation by row position (axis=1); assumes df and da
# are row-aligned, so author_page appears twice — TODO confirm alignment.
data = pd.concat([df,da], axis=1)
len(data)
467
# Peek at the merged frame
data[:3]
title | link | author | author_page | click | reply | time | author_page | followed_num | fans_num | post_num | comment_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 |
2 | 宁波准备停止PX项目了,元芳,你怎么看? | /post-free-2848797-1.shtml | 牧阳光 | http://www.tianya.cn/36535656 | 82779 | 625 | 2012-10-28 19:11 | http://www.tianya.cn/36535656 | 19 | 28 | 816 | 1268 |
# time is still a plain string at this point
type(data.time[0])
str
# extract date from datetime: first 10 chars of "YYYY-MM-DD HH:MM".
# A list comprehension replaces map() so this also works on Python 3,
# where map() returns a lazy iterator rather than a list.
date = [x[:10] for x in data.time]
data['date'] = pd.to_datetime(date)
# convert str to datetime format
data.time = pd.to_datetime(data.time)
# derive calendar components for grouping later
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
# after pd.to_datetime, entries are pandas Timestamps
type(data.time[0])
pandas.tslib.Timestamp
# merged frame now also carries date/month/year/day columns
data[:3]
title | link | author | author_page | click | reply | time | author_page | followed_num | fans_num | post_num | comment_num | date | month | year | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59:00 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 | 2012-10-29 | 10 | 2012 | 29 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41:00 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 | 2012-10-24 | 10 | 2012 | 24 |
2 | 宁波准备停止PX项目了,元芳,你怎么看? | /post-free-2848797-1.shtml | 牧阳光 | http://www.tianya.cn/36535656 | 82779 | 625 | 2012-10-28 19:11:00 | http://www.tianya.cn/36535656 | 19 | 28 | 816 | 1268 | 2012-10-28 | 10 | 2012 | 28 |
http://statsmodels.sourceforge.net/
Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests.
An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator.
Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python.
import statsmodels.api as sm
# List the public API exposed by statsmodels.api
' '.join(dir(sm))
'GEE GLM GLS GLSAR Logit MNLogit MixedLM NegativeBinomial NominalGEE OLS OrdinalGEE PHReg Poisson ProbPlot Probit QuantReg RLM WLS __builtins__ __doc__ __file__ __name__ __package__ add_constant categorical cov_struct datasets distributions emplike families formula genmod graphics iolib load nonparametric qqline qqplot qqplot_2samples regression robust show_versions stats test tools tsa version webdoc'
# Statistical tests and descriptive-stats helpers live under sm.stats
' '.join(dir(sm.stats))
'CompareCox CompareJ CompareMeans DescrStatsW Describe FTestAnovaPower FTestPower GofChisquarePower HetGoldfeldQuandt NormalIndPower Runs TTestIndPower TTestPower __builtins__ __doc__ __file__ __name__ __package__ acorr_breush_godfrey acorr_ljungbox anova_lm binom_test binom_test_reject_interval binom_tost binom_tost_reject_interval breaks_cusumolsresid breaks_hansen chisquare_effectsize cochrans_q compare_cox compare_j corr_clipped corr_nearest cov_cluster cov_cluster_2groups cov_hac cov_hc0 cov_hc1 cov_hc2 cov_hc3 cov_nearest cov_nw_panel cov_white_simple diagnostic durbin_watson fdrcorrection fdrcorrection_twostage gof gof_chisquare_discrete het_arch het_breushpagan het_goldfeldquandt het_white jarque_bera lillifors linear_harvey_collier linear_lm linear_rainbow mcnemar moment_helpers multicomp multipletests normal_ad omni_normtest power_binom_tost power_ztost_prop powerdiscrepancy proportion_confint proportion_effectsize proportions_chisquare proportions_chisquare_allpairs proportions_chisquare_pairscontrol proportions_ztest proportions_ztost recursive_olsresiduals runstest_1samp runstest_2samp sandwich_covariance se_cov stattools symmetry_bowker tt_ind_solve_power tt_solve_power ttest_ind ttost_ind ttost_paired tukeyhsd unitroot_adf zconfint zt_ind_solve_power ztest ztost'
# Summary statistics of the merged tianya data (numeric columns only)
data.describe()
click | reply | month | year | day | |
---|---|---|---|---|---|
count | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 |
mean | 1534.957173 | 18.907923 | 7.432548 | 2012.620985 | 17.961456 |
std | 11099.249834 | 144.869921 | 3.084860 | 1.795269 | 9.491730 |
min | 11.000000 | 0.000000 | 1.000000 | 2006.000000 | 1.000000 |
25% | 42.500000 | 0.000000 | 5.000000 | 2013.000000 | 8.000000 |
50% | 84.000000 | 0.000000 | 6.000000 | 2013.000000 | 23.000000 |
75% | 322.000000 | 4.000000 | 11.000000 | 2013.000000 | 25.000000 |
max | 194675.000000 | 2703.000000 | 12.000000 | 2015.000000 | 31.000000 |
import numpy as np
# Mean, (population) standard deviation, and total of the click counts
np.mean(data.click), np.std(data.click), np.sum(data.click)
(1534.9571734475376, 11087.35990002894, 716825)
# Unweighted description of clicks: give every observation a weight of 1,
# which reproduces the plain mean/var/std/sum.
unit_weights = np.ones(len(data.click))
d1 = sm.stats.DescrStatsW(data.click, weights=unit_weights)
d1.mean, d1.var, d1.std, d1.sum
(1534.9571734475376, 122929549.55276975, 11087.35990002894, 716825.0)
# Weighted variable description: each thread weighted by its reply count
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
(83335.963986409959, 6297145701.6868114, 79354.556905617035, 735856562.0)
# Median click count — far below the mean, showing the heavy right tail
np.median(data.click)
84.0
# Histogram of raw click counts (heavily right-skewed)
plt.hist(data.click)
plt.show()
# Histogram of reply counts
plt.hist(data.reply, color = 'green')
plt.show()
# Overlay the log-transformed distributions; +1 guards against log(0)
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()
# Box plot of log-transformed click counts
# (the original comment "height and weight" was a leftover from a template)
plt.boxplot([np.log(data.click+1)])
plt.show()
# Box plots of raw clicks and replies (outliers dominate on a linear scale)
plt.boxplot([data.click, data.reply])
plt.show()
def transformData(dat):
    """Coerce a sequence of count values to ints, mapping the string 'na' to 0.

    The raw author-info file stores missing counts as the literal string
    'na'; everything else is parseable by int().
    """
    return [0 if value == 'na' else int(value) for value in dat]
# Clean the author metrics: the raw file stores missing counts as 'na'
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
# Compare the four log-scaled engagement distributions side by side
plt.boxplot([np.log(data.click+1), np.log(data.reply+1),
np.log(data.fans_num+1), np.log(data.followed_num + 1)],
labels = ['$Click$', '$Reply$', '$Fans$', '$Followed$'])
plt.show()
# Box plot every numeric column of data on a logarithmic y-axis
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
' '.join(dir(data))
'T _AXIS_ALIASES _AXIS_IALIASES _AXIS_LEN _AXIS_NAMES _AXIS_NUMBERS _AXIS_ORDERS _AXIS_REVERSED _AXIS_SLICEMAP __abs__ __add__ __and__ __array__ __array_wrap__ __bool__ __bytes__ __class__ __contains__ __delattr__ __delitem__ __dict__ __dir__ __div__ __doc__ __eq__ __finalize__ __floordiv__ __format__ __ge__ __getattr__ __getattribute__ __getitem__ __getstate__ __gt__ __hash__ __iadd__ __idiv__ __imul__ __init__ __invert__ __ipow__ __isub__ __iter__ __itruediv__ __le__ __len__ __lt__ __mod__ __module__ __mul__ __ne__ __neg__ __new__ __nonzero__ __or__ __pow__ __radd__ __rand__ __rdiv__ __reduce__ __reduce_ex__ __repr__ __rfloordiv__ __rmod__ __rmul__ __ror__ __round__ __rpow__ __rsub__ __rtruediv__ __rxor__ __setattr__ __setitem__ __setstate__ __sizeof__ __str__ __sub__ __subclasshook__ __truediv__ __unicode__ __weakref__ __xor__ _accessors _add_numeric_operations _add_series_only_operations _add_series_or_dataframe_operations _agg_by_level _align_frame _align_series _apply_broadcast _apply_empty_result _apply_raw _apply_standard _at _box_col_values _box_item_values _check_inplace_setting _check_is_chained_assignment_possible _check_percentile _check_setitem_copy _clear_item_cache _combine_const _combine_frame _combine_match_columns _combine_match_index _combine_series _combine_series_infer _compare_frame _compare_frame_evaluate _consolidate_inplace _construct_axes_dict _construct_axes_dict_for_slice _construct_axes_dict_from _construct_axes_from_arguments _constructor _constructor_expanddim _constructor_sliced _convert _count_level _create_indexer _dir_additions _dir_deletions _ensure_valid_index _expand_axes _flex_compare_frame _from_arrays _from_axes _get_agg_axis _get_axis _get_axis_name _get_axis_number _get_axis_resolvers _get_block_manager_axis _get_bool_data _get_cacher _get_index_resolvers _get_item_cache _get_numeric_data _get_values _getitem_array _getitem_column _getitem_frame _getitem_multilevel _getitem_slice _iat _iget_item_cache _iloc _indexed_same 
_info_axis _info_axis_name _info_axis_number _info_repr _init_dict _init_mgr _init_ndarray _internal_names _internal_names_set _is_cached _is_datelike_mixed_type _is_mixed_type _is_numeric_mixed_type _is_view _ix _ixs _join_compat _loc _maybe_cache_changed _maybe_update_cacher _metadata _needs_reindex_multi _nsorted _protect_consolidate _reduce _reindex_axes _reindex_axis _reindex_columns _reindex_index _reindex_multi _reindex_with_indexers _repr_fits_horizontal_ _repr_fits_vertical_ _repr_html_ _repr_latex_ _reset_cache _reset_cacher _sanitize_column _series _set_as_cached _set_axis _set_axis_name _set_is_copy _set_item _setitem_array _setitem_frame _setitem_slice _setup_axes _slice _stat_axis _stat_axis_name _stat_axis_number _typ _unpickle_frame_compat _unpickle_matrix_compat _update_inplace _validate_dtype _values _xs abs add add_prefix add_suffix align all any append apply applymap as_blocks as_matrix asfreq assign astype at at_time author author_page axes between_time bfill blocks bool boxplot click clip clip_lower clip_upper columns combine combineAdd combineMult combine_first comment_num compound consolidate convert_objects copy corr corrwith count cov cummax cummin cumprod cumsum date day describe diff div divide dot drop drop_duplicates dropna dtypes duplicated empty eq equals eval ewm expanding fans_num ffill fillna filter first first_valid_index floordiv followed_num from_csv from_dict from_items from_records ftypes ge get get_dtype_counts get_ftype_counts get_value get_values groupby gt head hist iat icol idxmax idxmin iget_value iloc index info insert interpolate irow is_copy isin isnull iteritems iterkv iterrows itertuples ix join keys kurt kurtosis last last_valid_index le link loc lookup lt mad mask max mean median memory_usage merge min mod mode month mul multiply ndim ne nlargest notnull nsmallest pct_change pipe pivot pivot_table plot pop post_num pow prod product quantile query radd rank rdiv reindex reindex_axis reindex_like rename rename_axis 
reorder_levels replace reply resample reset_index rfloordiv rmod rmul rolling round rpow rsub rtruediv sample select select_dtypes sem set_axis set_index set_value shape shift size skew slice_shift sort sort_index sort_values sortlevel squeeze stack std style sub subtract sum swapaxes swaplevel tail take time title to_clipboard to_csv to_dense to_dict to_excel to_gbq to_hdf to_html to_json to_latex to_msgpack to_panel to_period to_pickle to_records to_sparse to_sql to_stata to_string to_timestamp to_wide to_xarray transpose truediv truncate tshift tz_convert tz_localize unstack update values var where xs year'
# NOTE(review): pandas.tools.plotting was removed in later pandas versions;
# modern code imports from pandas.plotting instead.
from pandas.tools import plotting
#fig = plt.figure(figsize=(10, 10))
# Pairwise scatter plots of the four engagement metrics
plotting.scatter_matrix(data[['click', 'reply', 'post_num','comment_num']])
plt.show()
# Everything exported by the pandas plotting module
' '.join(dir(plotting))
'AbstractMethodError Appender AreaPlot BarPlot BarhPlot BasePlotMethods BoxPlot FramePlotMethods HexBinPlot HistPlot Index KdePlot LinePlot LooseVersion MPLPlot MultiIndex PandasObject PeriodIndex PiePlot PlanePlot ScatterPlot Series SeriesPlotMethods _Options __builtins__ __doc__ __file__ __name__ __package__ _all_kinds _common_kinds _dataframe_kinds _flatten _gca _gcf _get_all_lines _get_layout _get_marker_compat _get_standard_colors _get_standard_kind _get_xlim _grouped_plot _grouped_plot_by_column _handle_shared_axes _klasses _mpl_ge_1_3_1 _mpl_ge_1_4_0 _mpl_ge_1_5_0 _mpl_le_1_2_1 _plot _plot_klass _remove_labels_from_axis _series_kinds _set_ticks_props _shared_doc_df_kwargs _shared_doc_kwargs _shared_doc_series_kwargs _shared_docs _subplots andrews_curves autocorrelation_plot bootstrap_plot boxplot boxplot_frame_groupby cache_readonly ceil colors com compat contextmanager conv cycler deprecate_kwarg df_ax df_coord df_kind df_note df_unique format_date_labels grouped_hist hist_frame hist_series klass lag_plot lmap lrange map mpl_stylesheet namedtuple np parallel_coordinates plot_frame plot_params plot_series radviz range re remove_na scatter_matrix scatter_plot series_ax series_coord series_kind series_note series_unique string_types table warnings zip'
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
import seaborn # conda install seaborn
# Pairwise scatter plots with a linear-regression fit on each panel
seaborn.pairplot(data, vars=['click', 'reply', 'post_num', 'comment_num'],
kind='reg')
<seaborn.axisgrid.PairGrid at 0x126704590>
# Same pair plot, colored by posting year
seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
kind='reg', hue='year')
<seaborn.axisgrid.PairGrid at 0x124e85610>
# Regression of replies on clicks with fitted line and confidence band
seaborn.lmplot(y='reply', x='click', data=data)
<seaborn.axisgrid.FacetGrid at 0x12324c9d0>
# Number of threads posted per year
data.year.value_counts()
2013 304 2014 63 2007 34 2012 33 2015 20 2011 6 2009 6 2006 1 Name: year, dtype: int64
# Yearly thread counts as a DataFrame, ordered chronologically by year.
yearly_counts = data.year.value_counts()
dd = pd.DataFrame(yearly_counts).sort_index()
dd
year | |
---|---|
2006 | 1 |
2007 | 34 |
2009 | 6 |
2011 | 6 |
2012 | 33 |
2013 | 304 |
2014 | 63 |
2015 | 20 |
# The sorted year index (note the gaps: no threads in 2008 or 2010)
dd.index
Int64Index([2006, 2007, 2009, 2011, 2012, 2013, 2014, 2015], dtype='int64')
# Build "YYYY-01-01" strings from the year index. A list comprehension
# (not map) keeps the result a real list on Python 3, where map() returns
# a lazy iterator that neither displays nor feeds to_datetime cleanly.
dd_date_str = [str(x) + '-01-01' for x in dd.index]
dd_date_str
['2006-01-01', '2007-01-01', '2009-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01']
# Parse the year strings into a DatetimeIndex for plotting
dd_date = pd.to_datetime(dd_date_str)
dd_date
DatetimeIndex(['2006-01-01', '2007-01-01', '2009-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01'], dtype='datetime64[ns]', freq=None)
# Threads per year over time (red line, circle markers)
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
# Running total of threads across years
ds = dd.cumsum()
ds
year | |
---|---|
2006 | 1 |
2007 | 35 |
2009 | 41 |
2011 | 47 |
2012 | 80 |
2013 | 384 |
2014 | 447 |
2015 | 467 |
# Rebuild the yearly and cumulative thread counts, then plot both series.
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()

def getDate(dat):
    """Map a frame's integer-year index to Jan-1 Timestamps."""
    # list comprehension (not map) so this works on Python 3 as well
    dat_date_str = [str(x) + '-01-01' for x in dat.index]
    return pd.to_datetime(dat_date_str)

# Bind plain variables: the original assigned `ds.date = ...` (which only
# sets an instance attribute, not a column) and then plotted `ds_date`,
# a name that was never defined — a NameError.
ds_date = getDate(ds)
dd_date = getDate(dd)

plt.plot(ds_date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd_date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
# Column totals per posting year
dg = data.groupby('year').sum()
dg
click | reply | followed_num | fans_num | post_num | comment_num | month | day | |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2006 | 1214 | 24 | 0 | 2 | 278 | 291 | 8 | 24 |
2007 | 28290 | 514 | 22 | 137 | 8041 | 10344 | 281 | 512 |
2009 | 18644 | 186 | 17 | 12 | 531 | 571 | 39 | 78 |
2011 | 2889 | 28 | 84 | 28 | 332 | 661 | 50 | 72 |
2012 | 463720 | 5933 | 2779 | 59511 | 12315 | 32498 | 322 | 819 |
2013 | 63140 | 937 | 571 | 43265 | 24359 | 40362 | 2458 | 6111 |
2014 | 57764 | 772 | 2216 | 16664 | 11266 | 98025 | 233 | 579 |
2015 | 81164 | 436 | 1649 | 272391 | 11217 | 20186 | 80 | 193 |
# Cumulative totals across years
dgs = dg.cumsum()
dgs
click | reply | followed_num | fans_num | post_num | comment_num | month | day | |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2006 | 1214 | 24 | 0 | 2 | 278 | 291 | 8 | 24 |
2007 | 29504 | 538 | 22 | 139 | 8319 | 10635 | 289 | 536 |
2009 | 48148 | 724 | 39 | 151 | 8850 | 11206 | 328 | 614 |
2011 | 51037 | 752 | 123 | 179 | 9182 | 11867 | 378 | 686 |
2012 | 514757 | 6685 | 2902 | 59690 | 21497 | 44365 | 700 | 1505 |
2013 | 577897 | 7622 | 3473 | 102955 | 45856 | 84727 | 3158 | 7616 |
2014 | 635661 | 8394 | 5689 | 119619 | 57122 | 182752 | 3391 | 8195 |
2015 | 716825 | 8830 | 7338 | 392010 | 68339 | 202938 | 3471 | 8388 |
def getDate(dat):
    """Map a frame's integer-year index to Jan-1 Timestamps."""
    # list comprehension (not map) so this works on Python 3 as well
    dat_date_str = [str(x) + '-01-01' for x in dat.index]
    return pd.to_datetime(dat_date_str)

# Keep the dates in a plain variable: `dg.date = ...` only sets an instance
# attribute on the DataFrame (pandas warns it is not a column) and is easy
# to lose on any copy of dg.
dg_date = getDate(dg)

fig = plt.figure(figsize=(12,5))
plt.plot(dg_date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg_date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg_date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
# Yearly click totals via groupby on the single column
data.groupby('year')['click'].sum()
year 2006 1214 2007 28290 2009 18644 2011 2889 2012 463720 2013 63140 2014 57764 2015 81164 Name: click, dtype: int64
# Average clicks per thread, by year
data.groupby('year')['click'].mean()
year 2006 1214.000000 2007 832.058824 2009 3107.333333 2011 481.500000 2012 14052.121212 2013 207.697368 2014 916.888889 2015 4058.200000 Name: click, dtype: float64
# Flag reposts: titles containing u'转载' ("repost") get 1, others 0.
def _is_repost(title):
    # Decode byte strings (as produced when the file is read under
    # Python 2) so the unicode membership test works on both Python 2
    # and 3; on Python 3 str has no .decode, which broke the original.
    if isinstance(title, bytes):
        title = title.decode('utf8')
    return 1 if u'转载' in title else 0

df['repost'] = [_is_repost(t) for t in df.title]
# Total clicks and replies for original vs. reposted threads
df.groupby('repost').sum()
click | reply | |
---|---|---|
repost | ||
0 | 536495 | 6208 |
1 | 180330 | 2622 |
# Top rows' click counts among original (non-repost) threads
df['click'][df['repost']==0][:5]
0 194675 2 82779 3 45304 5 27026 6 24026 Name: click, dtype: int64
# Top rows' click counts among reposted threads
df['click'][df['repost']==1][:5]
1 88244 4 38132 13 4990 16 3720 18 3421 Name: click, dtype: int64
from scipy import stats
# Independent two-sample t-test of clicks vs. replies. The original called
# ttest_ind(data.click, data.repost), but `repost` was added to df *after*
# `data` was built by pd.concat, so data has no repost column; the printed
# result matches click-vs-reply (identical to sm.stats.ttest_ind below).
stats.ttest_ind(data.click, data.reply)
Ttest_indResult(statistic=2.9514887561591618, pvalue=0.0032417014839700789)
# statsmodels version of the same test
sm.stats.ttest_ind(data.click, data.reply)
# test statistic, pvalue and degrees of freedom
(2.9514887561591618, 0.0032417014839700789, 932.0)
https://en.wikipedia.org/wiki/Chi-squared_test
Suppose there is a city of 1 million residents with four neighborhoods: A, B, C, and D.
A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar".
The null hypothesis is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:
A | B | C | D | Total | |
---|---|---|---|---|---|
White collar | 90 | 60 | 104 | 95 | 349 |
Blue collar | 30 | 50 | 51 | 20 | 151 |
No collar | 30 | 40 | 45 | 35 | 150 |
Total | 150 | 150 | 200 | 150 | 650 |
$ \frac{150}{650} \times \frac{349}{650} \times 650 \approx 80.54 $
Then in that "cell" of the table, we have
$\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
The sum of these quantities over all of the cells is the test statistic.
Under the null hypothesis, it has approximately a chi-square distribution whose number of degrees of freedom are
$ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
If the test statistic is improbably large according to that chi-square distribution, then one rejects the null hypothesis of independence.
Parameters:
from scipy.stats import chisquare
# Goodness-of-fit test: observed category counts against expected frequencies.
observed = [16, 18, 16, 14, 12, 12]
expected = [16, 16, 16, 16, 16, 8]
chisquare(observed, f_exp=expected)
Power_divergenceResult(statistic=3.5, pvalue=0.62338762774958223)
from scipy.stats import chi2
# chisqprob was deprecated and removed from modern SciPy; the survival
# function chi2.sf(x, df) is the drop-in replacement for P(X >= x).
# p_value = chi2.sf(chi_statistic, df)
print(chi2.sf(3.94, 1), 1 - chi2.cdf(3.94, 1))
0.0471507774946 0.0471507774946
# Pearson correlation matrix between clicks and replies
# (print() works on both Python 2 and 3; the py2 print statement does not)
print(np.corrcoef(data.click, data.reply))
[[ 1. 0.96396571] [ 0.96396571 1. ]]
# Full pairwise correlation matrix of the numeric columns
data.corr()
click | reply | followed_num | fans_num | post_num | comment_num | month | year | day | |
---|---|---|---|---|---|---|---|---|---|
click | 1.000000 | 0.963966 | 0.143595 | 0.158116 | 0.097502 | 0.085615 | 0.038788 | -0.024827 | 0.048361 |
reply | 0.963966 | 1.000000 | 0.199270 | 0.159387 | 0.090342 | 0.123341 | 0.040165 | -0.041208 | 0.058738 |
followed_num | 0.143595 | 0.199270 | 1.000000 | 0.407656 | 0.211677 | 0.499612 | -0.036037 | 0.051187 | -0.020604 |
fans_num | 0.158116 | 0.159387 | 0.407656 | 1.000000 | 0.341724 | 0.145387 | -0.084243 | 0.102301 | -0.045883 |
post_num | 0.097502 | 0.090342 | 0.211677 | 0.341724 | 1.000000 | 0.514695 | -0.070024 | -0.011786 | -0.033254 |
comment_num | 0.085615 | 0.123341 | 0.499612 | 0.145387 | 0.514695 | 1.000000 | -0.118703 | 0.069160 | -0.119840 |
month | 0.038788 | 0.040165 | -0.036037 | -0.084243 | -0.070024 | -0.118703 | 1.000000 | -0.236920 | 0.535354 |
year | -0.024827 | -0.041208 | 0.051187 | 0.102301 | -0.011786 | 0.069160 | -0.236920 | 1.000000 | -0.046699 |
day | 0.048361 | 0.058738 | -0.020604 | -0.045883 | -0.033254 | -0.119840 | 0.535354 | -0.046699 | 1.000000 |
# Replies vs clicks with connecting lines (noisy; cleaner log-log view next)
plt.plot(df.click, df.reply, 'r-o')
plt.show()
# Replies vs clicks on log-log axes: a straight line indicates a power law.
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
# fixed displayed-title typo: the scaling law is "allometric", not "allowmetric"
plt.title('$Allometric\,Law$', fontsize = 20)
plt.show()
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load data
# Guerry: 1830s French social data from R's HistData package;
# get_rdataset downloads it, so this cell needs network access.
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
有些使用windows的同学无法运行上述代码:
https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q
# Inspect the results (print() works on both Python 2 and 3)
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: Lottery R-squared: 0.348 Model: OLS Adj. R-squared: 0.333 Method: Least Squares F-statistic: 22.20 Date: Sat, 16 Apr 2016 Prob (F-statistic): 1.90e-08 Time: 23:39:42 Log-Likelihood: -379.82 No. Observations: 86 AIC: 765.6 Df Residuals: 83 BIC: 773.0 Df Model: 2 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ----------------------------------------------------------------------------------- Intercept 246.4341 35.233 6.995 0.000 176.358 316.510 Literacy -0.4889 0.128 -3.832 0.000 -0.743 -0.235 np.log(Pop1831) -31.3114 5.977 -5.239 0.000 -43.199 -19.424 ============================================================================== Omnibus: 3.713 Durbin-Watson: 2.019 Prob(Omnibus): 0.156 Jarque-Bera (JB): 3.394 Skew: -0.487 Prob(JB): 0.183 Kurtosis: 3.003 Cond. No. 702. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Replies regressed on clicks and author followers (in levels, OLS)
reg = smf.ols('reply ~ click + followed_num', data=data).fit()
# print() works on both Python 2 and 3
print(reg.summary())
OLS Regression Results ============================================================================== Dep. Variable: reply R-squared: 0.933 Model: OLS Adj. R-squared: 0.933 Method: Least Squares F-statistic: 3231. Date: Sun, 17 Apr 2016 Prob (F-statistic): 4.30e-273 Time: 02:04:27 Log-Likelihood: -2354.7 No. Observations: 467 AIC: 4715. Df Residuals: 464 BIC: 4728. Df Model: 2 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------- Intercept -1.4024 1.766 -0.794 0.428 -4.873 2.068 click 0.0125 0.000 78.660 0.000 0.012 0.013 followed_num 0.0749 0.015 5.117 0.000 0.046 0.104 ============================================================================== Omnibus: 374.515 Durbin-Watson: 1.938 Prob(Omnibus): 0.000 Jarque-Bera (JB): 97373.297 Skew: -2.416 Prob(JB): 0.00 Kurtosis: 73.575 Cond. No. 1.14e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.14e+04. This might indicate that there are strong multicollinearity or other numerical problems.
# Log-log specification tames the heavy tails in reply/click counts
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) +np.log(followed_num+1)+month', data=data).fit()
# print() works on both Python 2 and 3
print(reg1.summary())
OLS Regression Results ============================================================================== Dep. Variable: np.log(reply + 1) R-squared: 0.606 Model: OLS Adj. R-squared: 0.603 Method: Least Squares F-statistic: 236.9 Date: Sun, 17 Apr 2016 Prob (F-statistic): 4.03e-93 Time: 14:22:55 Log-Likelihood: -596.73 No. Observations: 467 AIC: 1201. Df Residuals: 463 BIC: 1218. Df Model: 3 Covariance Type: nonrobust ============================================================================================ coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------------------- Intercept -2.6009 0.189 -13.778 0.000 -2.972 -2.230 np.log(click + 1) 0.6872 0.029 24.083 0.000 0.631 0.743 np.log(followed_num + 1) 0.0118 0.034 0.347 0.729 -0.055 0.079 month 0.0172 0.013 1.275 0.203 -0.009 0.044 ============================================================================== Omnibus: 26.408 Durbin-Watson: 1.904 Prob(Omnibus): 0.000 Jarque-Bera (JB): 44.572 Skew: -0.389 Prob(JB): 2.10e-10 Kurtosis: 4.299 Cond. No. 44.1 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Partial-regression plots: each predictor's marginal contribution
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Moore's conformity experiment, from R's car package (downloaded once,
# then cached locally)
moore = sm.datasets.get_rdataset("Moore", "car",
cache=True) # load data
# NOTE(review): this rebinds `data`, shadowing the tianya DataFrame above
data = moore.data
data = data.rename(columns={"partner.status" :
"partner_status"}) # make name pythonic
data[:5]
partner_status | conformity | fcategory | fscore | |
---|---|---|---|---|
0 | low | 8 | low | 37 |
1 | low | 4 | high | 57 |
2 | low | 8 | high | 65 |
3 | low | 7 | low | 20 |
4 | low | 10 | low | 36 |
# Two-way model with interaction, using sum-to-zero (Sum) contrasts
moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
data=data).fit()
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(moore_lm, fig = fig)
plt.show()
# Type 2 ANOVA table for the interaction model
table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
# print() works on both Python 2 and 3
print(table)
sum_sq df F \ C(fcategory, Sum) 11.614700 2.0 0.276958 C(partner_status, Sum) 212.213778 1.0 10.120692 C(fcategory, Sum):C(partner_status, Sum) 175.488928 2.0 4.184623 Residual 817.763961 39.0 NaN PR(>F) C(fcategory, Sum) 0.759564 C(partner_status, Sum) 0.002874 C(fcategory, Sum):C(partner_status, Sum) 0.022572 Residual NaN