In [30]:

import pandas as pd

In [31]:

import numpy as np
import statsmodels.api as sm #used for statistical modeling
import matplotlib.pyplot as plt
#to have plots show up automatically in notebook
%matplotlib inline 

Pandas Series Objects : glorified numpy array¶

In [32]:

a = pd.Series(np.random.randn(5), name = 'random numbers')
a

Out[32]:

0   -0.542996
1   -1.411180
2   -1.775740
3    1.705594
4    0.638039
Name: random numbers, dtype: float64

In [33]:

#Series can be sliced
a[-3:] #gives last three items of the series...the index has stayed with the numbers

Out[33]:

2   -1.775740
3    1.705594
4    0.638039
Name: random numbers, dtype: float64

In [34]:

a[3]

Out[34]:

1.705594493602101

In [35]:

a[[1,3,2]] 

Out[35]:

1   -1.411180
3    1.705594
2   -1.775740
Name: random numbers, dtype: float64

In [36]:

#Boolean indexing...indexes dont have to be a number
a[[True,False,False,True,True]]

Out[36]:

0   -0.542996
3    1.705594
4    0.638039
Name: random numbers, dtype: float64

In [37]:

a.index = list('abcde') #new defining of list- now can get index with labels...
a

Out[37]:

a   -0.542996
b   -1.411180
c   -1.775740
d    1.705594
e    0.638039
Name: random numbers, dtype: float64

In [38]:

#can now also slice with labels
a['c']

Out[38]:

-1.7757400675142934

In [39]:

#concatenating the values of lists
#Mixing data sets on an index
a[-3:] + a[:3] #need to have a common index

Out[39]:

a        NaN
b        NaN
c   -3.55148
d        NaN
e        NaN
Name: random numbers, dtype: float64

In [40]:

#A series makes a good look up table
#Making grades look up table 4->A etc
gp_lut = pd.Series(data=[4,3,2,1,0],index=list('ABCDF'))
gp_lut

Out[40]:

A    4
B    3
C    2
D    1
F    0
dtype: int64

In [41]:

my_grades = ['A','B','B','A','A']
my_points = gp_lut[my_grades]
my_points

my_gpa = my_points.mean()
my_gpa

Out[41]:

3.6000000000000001

In [42]:

students = pd.DataFrame({'name' : ['Peter', 'Paul', 'Mary',
                                    'Peter', 'Paul', 'Mary'],
                         'subject' : ['English', 'English', 'English',
                                      'Math', 'Math', 'Math'],
                         'grade' : [85.0, 76.0, 92.0, 77.0, 68.0, 87.0]})
students

Out[42]:

	grade	name	subject
0	85	Peter	English
1	76	Paul	English
2	92	Mary	English
3	77	Peter	Math
4	68	Paul	Math
5	87	Mary	Math

In [43]:

# We can get the column names
students.columns

Out[43]:

Index(['grade', 'name', 'subject'], dtype='object')

In [44]:

# ...or as an attribute
students.name

Out[44]:

0    Peter
1     Paul
2     Mary
3    Peter
4     Paul
5     Mary
Name: name, dtype: object

In [46]:

students[['subject','grade']]

Out[46]:

	subject	grade
0	English	85
1	English	76
2	English	92
3	Math	77
4	Math	68
5	Math	87

In [47]:

#create a boolean array which is only true when her name is Mary
#Now we will take only Mary's grades
students[students.name == 'Mary']

Out[47]:

	grade	name	subject
2	92	Mary	English
5	87	Mary	Math

Now Pandas By Example With Star Test Results¶

In [48]:

data = pd.read_csv('star_2013_clean_wide.csv')
data

Out[48]:

	County.Code	County.Name	Test.Id	Test.Name	Grade	Students.Tested	Mean.Scale.Score	Count.Test.Grade	Pct.Test.Grade	Population	per.capita.income	median.household.income	median.family.income	Spend	ADAttend	Spend.Per.ADA
0	1	Alameda	7	CST English-Language Arts	2	16814	372.1	464896	0.036167	1494876	34937	70821	87012	1814932885	193906	9360
1	1	Alameda	8	CST Mathematics	2	16802	398.4	464515	0.036171	1494876	34937	70821	87012	1814932885	193906	9360
2	1	Alameda	8	CST Mathematics	3	16198	411.1	443961	0.036485	1494876	34937	70821	87012	1814932885	193906	9360
3	1	Alameda	7	CST English-Language Arts	3	16126	357.2	441572	0.036520	1494876	34937	70821	87012	1814932885	193906	9360
4	1	Alameda	7	CST English-Language Arts	4	15390	387.0	428906	0.035882	1494876	34937	70821	87012	1814932885	193906	9360
5	1	Alameda	8	CST Mathematics	4	15532	405.8	433012	0.035870	1494876	34937	70821	87012	1814932885	193906	9360
6	1	Alameda	8	CST Mathematics	5	15146	411.6	432775	0.034997	1494876	34937	70821	87012	1814932885	193906	9360
7	1	Alameda	7	CST English-Language Arts	5	15077	378.4	429498	0.035104	1494876	34937	70821	87012	1814932885	193906	9360
8	1	Alameda	32	CST Science - Grade 5, Grade 8, and Grade 10 L...	5	15107	380.6	431142	0.035040	1494876	34937	70821	87012	1814932885	193906	9360
9	1	Alameda	7	CST English-Language Arts	6	15277	374.9	434374	0.035170	1494876	34937	70821	87012	1814932885	193906	9360
10	1	Alameda	8	CST Mathematics	6	15332	380.3	436563	0.035120	1494876	34937	70821	87012	1814932885	193906	9360
11	1	Alameda	7	CST English-Language Arts	7	14551	378.4	431187	0.033746	1494876	34937	70821	87012	1814932885	193906	9360
12	1	Alameda	9	CST Algebra I	7	2239	431.6	37803	0.059228	1494876	34937	70821	87012	1814932885	193906	9360
13	1	Alameda	8	CST Mathematics	7	12326	370.2	393811	0.031299	1494876	34937	70821	87012	1814932885	193906	9360
14	1	Alameda	9	CST Algebra I	8	11006	354.2	276039	0.039871	1494876	34937	70821	87012	1814932885	193906	9360
15	1	Alameda	13	CST Algebra II	8	31	458.9	680	0.045588	1494876	34937	70821	87012	1814932885	193906	9360
16	1	Alameda	7	CST English-Language Arts	8	14549	373.9	435491	0.033408	1494876	34937	70821	87012	1814932885	193906	9360
17	1	Alameda	29	CST History - Social Science Grade 8	8	15421	363.3	459125	0.033588	1494876	34937	70821	87012	1814932885	193906	9360
18	1	Alameda	28	CST General Mathematics	8	2143	308.4	145549	0.014724	1494876	34937	70821	87012	1814932885	193906	9360
19	1	Alameda	11	CST Geometry	8	1732	428.0	29035	0.059652	1494876	34937	70821	87012	1814932885	193906	9360
20	1	Alameda	32	CST Science - Grade 5, Grade 8, and Grade 10 L...	8	14552	404.5	436071	0.033371	1494876	34937	70821	87012	1814932885	193906	9360
21	1	Alameda	10	CST Integrated Math 1	9	121	298.8	980	0.123469	1494876	34937	70821	87012	1814932885	193906	9360
22	1	Alameda	20	CST Biology	9	9473	377.0	228962	0.041374	1494876	34937	70821	87012	1814932885	193906	9360
23	1	Alameda	21	CST Chemistry	9	186	364.3	5365	0.034669	1494876	34937	70821	87012	1814932885	193906	9360
24	1	Alameda	24	CST Integrated/Coordinated Science 1	9	905	321.4	33944	0.026662	1494876	34937	70821	87012	1814932885	193906	9360
25	1	Alameda	22	CST Earth Science	9	1651	328.8	133961	0.012324	1494876	34937	70821	87012	1814932885	193906	9360
26	1	Alameda	23	CST Physics	9	754	312.8	14112	0.053430	1494876	34937	70821	87012	1814932885	193906	9360
27	1	Alameda	25	CST Integrated/Coordinated Science 2	9	124	323.0	1351	0.091784	1494876	34937	70821	87012	1814932885	193906	9360
28	1	Alameda	18	CST World History	9	597	320.1	35087	0.017015	1494876	34937	70821	87012	1814932885	193906	9360
29	1	Alameda	11	CST Geometry	9	6613	353.4	144063	0.045904	1494876	34937	70821	87012	1814932885	193906	9360
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3700	58	Yuba	22	CST Earth Science	10	290	327.0	30649	0.009462	71817	20046	46617	52775	99237214	11710	8474
3701	58	Yuba	15	CST Summative High School Mathematics	10	19	353.3	24757	0.000767	71817	20046	46617	52775	99237214	11710	8474
3702	58	Yuba	21	CST Chemistry	10	166	358.8	136209	0.001219	71817	20046	46617	52775	99237214	11710	8474
3703	58	Yuba	13	CST Algebra II	10	201	335.9	131448	0.001529	71817	20046	46617	52775	99237214	11710	8474
3704	58	Yuba	9	CST Algebra I	10	259	285.9	104567	0.002477	71817	20046	46617	52775	99237214	11710	8474
3705	58	Yuba	7	CST English-Language Arts	10	931	332.0	455362	0.002045	71817	20046	46617	52775	99237214	11710	8474
3706	58	Yuba	20	CST Biology	10	238	343.3	228138	0.001043	71817	20046	46617	52775	99237214	11710	8474
3707	58	Yuba	11	CST Geometry	10	274	296.1	156167	0.001755	71817	20046	46617	52775	99237214	11710	8474
3708	58	Yuba	11	CST Geometry	11	151	287.1	78308	0.001928	71817	20046	46617	52775	99237214	11710	8474
3709	58	Yuba	19	CST U.S. History	11	895	325.7	447386	0.002001	71817	20046	46617	52775	99237214	11710	8474
3710	58	Yuba	20	CST Biology	11	186	330.3	99776	0.001864	71817	20046	46617	52775	99237214	11710	8474
3711	58	Yuba	7	CST English-Language Arts	11	899	324.9	440115	0.002043	71817	20046	46617	52775	99237214	11710	8474
3712	58	Yuba	9	CST Algebra I	11	187	281.3	49868	0.003750	71817	20046	46617	52775	99237214	11710	8474
3713	58	Yuba	13	CST Algebra II	11	199	302.4	122079	0.001630	71817	20046	46617	52775	99237214	11710	8474
3714	58	Yuba	21	CST Chemistry	11	121	342.4	133804	0.000904	71817	20046	46617	52775	99237214	11710	8474
3715	58	Yuba	15	CST Summative High School Mathematics	11	153	322.6	124304	0.001231	71817	20046	46617	52775	99237214	11710	8474
3716	58	Yuba	22	CST Earth Science	11	187	326.8	42331	0.004418	71817	20046	46617	52775	99237214	11710	8474
3717	58	Yuba	23	CST Physics	11	91	363.8	56726	0.001604	71817	20046	46617	52775	99237214	11710	8474
3718	58	Yuba	18	CST World History	11	40	300.3	16163	0.002475	71817	20046	46617	52775	99237214	11710	8474
3719	58	Yuba	18	CST World History	13	874	329.1	474255	0.001843	71817	20046	46617	52775	99237214	11710	8474
3720	58	Yuba	25	CST Integrated/Coordinated Science 2	13	14	302.3	3742	0.003741	71817	20046	46617	52775	99237214	11710	8474
3721	58	Yuba	20	CST Biology	13	876	340.6	556893	0.001573	71817	20046	46617	52775	99237214	11710	8474
3722	58	Yuba	22	CST Earth Science	13	875	328.9	206991	0.004227	71817	20046	46617	52775	99237214	11710	8474
3723	58	Yuba	23	CST Physics	13	92	363.6	80833	0.001138	71817	20046	46617	52775	99237214	11710	8474
3724	58	Yuba	28	CST General Mathematics	13	585	299.5	190615	0.003069	71817	20046	46617	52775	99237214	11710	8474
3725	58	Yuba	13	CST Algebra II	13	431	320.6	286737	0.001503	71817	20046	46617	52775	99237214	11710	8474
3726	58	Yuba	21	CST Chemistry	13	287	351.9	275452	0.001042	71817	20046	46617	52775	99237214	11710	8474
3727	58	Yuba	15	CST Summative High School Mathematics	13	172	326.0	149987	0.001147	71817	20046	46617	52775	99237214	11710	8474
3728	58	Yuba	11	CST Geometry	13	695	307.8	407658	0.001705	71817	20046	46617	52775	99237214	11710	8474
3729	58	Yuba	9	CST Algebra I	13	1495	319.4	711705	0.002101	71817	20046	46617	52775	99237214	11710	8474

3730 rows × 16 columns

In [49]:

data.columns #A look at the available columns 

Out[49]:

Index(['County.Code', 'County.Name', 'Test.Id', 'Test.Name', 'Grade', 'Students.Tested', 'Mean.Scale.Score', 'Count.Test.Grade', 'Pct.Test.Grade', 'Population', 'per.capita.income', 'median.household.income', 'median.family.income', 'Spend', 'ADAttend', 'Spend.Per.ADA'], dtype='object')

In [50]:

#What are the unique test names?
data['Test.Name'].unique()

Out[50]:

array(['CST English-Language Arts', 'CST Mathematics',
       'CST Science - Grade 5, Grade 8, and Grade 10 Life Science',
       'CST Algebra I', 'CST Algebra II',
       'CST History - Social Science Grade 8', 'CST General Mathematics',
       'CST Geometry', 'CST Integrated Math 1', 'CST Biology',
       'CST Chemistry', 'CST Integrated/Coordinated Science 1',
       'CST Earth Science', 'CST Physics',
       'CST Integrated/Coordinated Science 2', 'CST World History',
       'CST Summative High School Mathematics', 'CST Integrated Math 2',
       'CST U.S. History', 'CST Integrated/Coordinated Science 4',
       'CST Integrated Math 3', 'CST Integrated/Coordinated Science 3'], dtype=object)

How many students took each test by grade? This introduces us to pivot tables...¶

In [52]:

data.pivot_table(values='Students.Tested',index='Test.Name',columns='Grade',aggfunc='sum')
#What is 13? CST Tests?

Out[52]:

Grade	2	3	4	5	6	7	8	9	10	11	13
Test.Name
CST Algebra I	NaN	NaN	NaN	NaN	NaN	37803	276039	243349	104555	49846	711671
CST Algebra II	NaN	NaN	NaN	NaN	NaN	NaN	680	32400	131448	122079	286737
CST Biology	NaN	NaN	NaN	NaN	NaN	NaN	NaN	228962	228138	99753	556867
CST Chemistry	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5365	136209	133804	275452
CST Earth Science	NaN	NaN	NaN	NaN	NaN	NaN	NaN	133961	30649	42316	206974
CST English-Language Arts	464896	441572	428906	429498	434374	431187	435491	463195	455237	439972	NaN
CST General Mathematics	NaN	NaN	NaN	NaN	NaN	NaN	145549	45052	NaN	NaN	190615
CST Geometry	NaN	NaN	NaN	NaN	NaN	NaN	29035	144063	156167	78308	407658
CST History - Social Science Grade 8	NaN	NaN	NaN	NaN	NaN	NaN	459125	NaN	NaN	NaN	NaN
CST Integrated Math 1	NaN	NaN	NaN	NaN	NaN	NaN	125	980	5965	6318	13486
CST Integrated Math 2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	236	1432	3364	5102
CST Integrated Math 3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	61	153	530	769
CST Integrated/Coordinated Science 1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	33944	4401	7963	46386
CST Integrated/Coordinated Science 2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1351	1413	955	3742
CST Integrated/Coordinated Science 3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	11	46	874	952
CST Integrated/Coordinated Science 4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	18	NaN	NaN	43
CST Mathematics	464515	443961	433012	432775	436563	393811	NaN	NaN	NaN	NaN	NaN
CST Physics	NaN	NaN	NaN	NaN	NaN	NaN	NaN	14112	9902	56726	80833
CST Science - Grade 5, Grade 8, and Grade 10 Life Science	NaN	NaN	NaN	431142	NaN	NaN	436071	NaN	451253	NaN	NaN
CST Summative High School Mathematics	NaN	NaN	NaN	NaN	NaN	NaN	NaN	809	24757	124304	149987
CST U.S. History	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	447253	NaN
CST World History	NaN	NaN	NaN	NaN	NaN	NaN	NaN	35087	422893	16163	474237

Lets see how english grades vary by grade¶

In [54]:

#First keep english only
english = data[data['Test.Name']=='CST English-Language Arts']

In [55]:

#computes the mean of the bodies
english_grade = english.groupby('Grade')['Mean.Scale.Score'].aggregate(np.mean) 
english_grade

Out[55]:

Grade
2        355.771930
3        343.361404
4        370.684211
5        363.074138
6        361.592982
7        366.333333
8        362.998276
9        360.000000
10       344.621053
11       340.910526
Name: Mean.Scale.Score, dtype: float64

In [56]:

english_grade.plot()

Out[56]:

<matplotlib.axes._subplots.AxesSubplot at 0x10c4c5dd8>

Now we want average across counties, not students because it's not fair to average counties¶

In [59]:

# We add a new calculated column to our dataframe
english['Weighted.Score'] = english['Mean.Scale.Score'] * english['Pct.Test.Grade']
# Then group by grade and plot the aggregated sum of this column
english.groupby('Grade')['Weighted.Score'].aggregate(np.sum).plot()

Out[59]:

<matplotlib.axes._subplots.AxesSubplot at 0x10cd16898>

So This data tells us that juniors are the dumbest, and if your a good parent you should take your student out after 4th grade¶

Effect of Income for 7th grade test scores?¶

In [60]:

# Keep the relevant rows only
math_7th = data[(data['Test.Name'] == 'CST Mathematics') &
                (data['Grade'] == 7)]

In [61]:

math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')
#We use scatter because if we did line plot, there is no garuntee that the results are in gradual 
#order, so then we would just have connected lines everywhere
#But cant we sort the data first? Would that solve the problem?

Out[61]:

<matplotlib.axes._subplots.AxesSubplot at 0x10cdef588>

In [62]:

# Let's peek at the offending row
math_7th[math_7th['Mean.Scale.Score'] < 300]

Out[62]:

	County.Code	County.Name	Test.Id	Test.Name	Grade	Students.Tested	Mean.Scale.Score	Count.Test.Grade	Pct.Test.Grade	Population	per.capita.income	median.household.income	median.family.income	Spend	ADAttend	Spend.Per.ADA
2918	46	Sierra	8	CST Mathematics	7	27	259.5	393811	0.000069	3277	26137	50308	56469	4739373	354	13391

In [66]:

math_7th.plot(x='Students.Tested', y='Mean.Scale.Score',
              kind='scatter', logx=True)
#Try running the same without logx--the data gets squished.

Out[66]:

<matplotlib.axes._subplots.AxesSubplot at 0x10e0840b8>

Clearly Sierra is an outlier...so we should take it out¶

In [67]:

math_7th = math_7th[math_7th['County.Name'] != 'Sierra']
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')

Out[67]:

<matplotlib.axes._subplots.AxesSubplot at 0x10e4d27b8>

Cryptic Magic Starts Here (So does all the math):¶

In [68]:

# Let's fit a linear model to our data
y = math_7th['Mean.Scale.Score']  # response
X = math_7th['median.family.income']  # predictor
X = sm.add_constant(X)  # Add a constant term to the predictor
# The actual fitting happens here
est = sm.OLS(y, X) #fit least squares model
est = est.fit()
est.summary()

Out[68]:

OLS Regression Results
Dep. Variable:	Mean.Scale.Score	R-squared:	0.298
Model:	OLS	Adj. R-squared:	0.285
Method:	Least Squares	F-statistic:	22.93
Date:	Sat, 28 Feb 2015	Prob (F-statistic):	1.35e-05
Time:	10:51:10	Log-Likelihood:	-212.31
No. Observations:	56	AIC:	428.6
Df Residuals:	54	BIC:	432.7
Df Model:	1

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	332.0775	6.059	54.808	0.000	319.930 344.225
median.family.income	0.0004	8.87e-05	4.788	0.000	0.000 0.001

Omnibus:	0.749	Durbin-Watson:	1.990
Prob(Omnibus):	0.688	Jarque-Bera (JB):	0.280
Skew:	0.142	Prob(JB):	0.869
Kurtosis:	3.197	Cond. No.	2.84e+05

In [69]:

# Let's plot the regression line on top of the data
x_ = np.array([X.min(), X.max()])
y_ = est.predict(x_)
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')
plt.plot(x_[:, 1], y_, 'r-')

Out[69]:

[<matplotlib.lines.Line2D at 0x10d63df28>]

Whats the effect of spending on 7th grade math scores?¶

In [70]:

# Let's fit a linear model to our data
y = math_7th['Mean.Scale.Score']  # response
X = math_7th[['median.family.income', 'Spend.Per.ADA']]  # predictor
X = sm.add_constant(X)  # Add a constant term to the predictor
# The actual fitting happens here
est = sm.OLS(y, X)
est = est.fit()
est.summary()

Out[70]:

OLS Regression Results
Dep. Variable:	Mean.Scale.Score	R-squared:	0.359
Model:	OLS	Adj. R-squared:	0.335
Method:	Least Squares	F-statistic:	14.82
Date:	Sat, 28 Feb 2015	Prob (F-statistic):	7.71e-06
Time:	10:55:14	Log-Likelihood:	-209.78
No. Observations:	56	AIC:	425.6
Df Residuals:	53	BIC:	431.6
Df Model:	2

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	308.8693	11.900	25.955	0.000	285.000 332.738
median.family.income	0.0004	8.56e-05	4.935	0.000	0.000 0.001
Spend.Per.ADA	0.0025	0.001	2.239	0.029	0.000 0.005

Omnibus:	0.769	Durbin-Watson:	1.849
Prob(Omnibus):	0.681	Jarque-Bera (JB):	0.865
Skew:	-0.189	Prob(JB):	0.649
Kurtosis:	2.523	Cond. No.	5.82e+05

In [ ]: