"""Descriptive statistics for the post-level data stored in ``fb.xls``.

The script loads sheet ``Sheet1`` of ``fb.xls`` into a DataFrame, narrows it
to the columns of interest, and writes ``describe()`` summary tables to
``summary stats.csv`` and ``summary stats (partial).csv``.

It started life as an interactive notebook: the bare expressions below
(``df.head(2)``, ``df.describe()``, ...) display their value in
IPython/Jupyter and are evaluated but not shown when run as a plain script.
"""

import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import statsmodels  # FOR NEXT STEP -- RUNNING REGRESSIONS
import statsmodels.api as sm
import statsmodels.formula.api as smf  # FOR USING 'R'-STYLE FORMULAS FOR REGRESSIONS

# Set PANDAS to show all columns in DataFrame and to display floats with two
# decimals. These are display options only; they do not change what to_csv
# writes.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(pd.__version__)
print(statsmodels.__version__)

# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------
df = pd.read_excel('fb.xls', 'Sheet1', header=0)
print(len(df))
df.head(2)
df.columns
df.columns.tolist()

# Keep only the variables used in the analysis.
df = df[['hashtag_count', 'like_count_14days', 'comment_count_14days',
         'share_count_14days', 'source', 'source_External', 'video_dummy',
         'picture_dummy', 'Total_Revenue', 'Log_of_Total_Revenue',
         'followers_count', 'Log_of_Followers']]
print(len(df))
print(len(df.columns))
df.head(2)

# ---------------------------------------------------------------------------
# Explore the summary statistics
# ---------------------------------------------------------------------------
df.describe()

# NOTE: the notebook used the IPython-only help syntax "DataFrame.describe?"
# here. It is a SyntaxError in plain Python; the portable equivalent is
# help(DataFrame.describe), left uncalled so the script output stays short.
dir(df.describe())

# Statistics kept in the exported tables (a subset of the describe() rows,
# which become columns after transposing).
STAT_COLUMNS = ['count', 'mean', 'std', 'min', 'max']

np.round(df.describe(), 2)
np.round(df.describe(), 2).T
np.round(df.describe(), 2).T[STAT_COLUMNS]

# ALTERNATIVE WAY OF WRITING (.transpose() is the long form of .T)
np.round(df.describe(), 2).transpose()

# ---------------------------------------------------------------------------
# Export the summary statistics
# NOTE: the next three calls all write to 'summary stats.csv'; each one
# overwrites the previous, so the file ends up holding the last (rounded,
# column-restricted) version.
# ---------------------------------------------------------------------------

# WITHOUT ROUNDING (to_csv writes the unrounded values unless float_format
# is passed)
df.describe().transpose().to_csv('summary stats.csv', sep=',')
df.describe().transpose()[STAT_COLUMNS].to_csv('summary stats.csv', sep=',')

# WITH TWO DECIMAL PLACES
np.round(df.describe(), 2).T[STAT_COLUMNS].to_csv('summary stats.csv', sep=',')

# Same table restricted to a subset of the variables.
cols = ['hashtag_count', 'like_count_14days']
np.round(df[cols].describe(), 2).T[STAT_COLUMNS].to_csv(
    'summary stats (partial).csv', sep=',')