import pandas as pd
import numpy as np
import statsmodels.api as sm #used for statistical modeling
import matplotlib.pyplot as plt
#to have plots show up automatically in notebook
%matplotlib inline
# Draw five samples from the standard normal distribution and wrap them
# in a named Series; the default integer index 0..4 is assigned automatically.
random_values = np.random.randn(5)
a = pd.Series(data=random_values, name='random numbers')
a
0 -0.542996 1 -1.411180 2 -1.775740 3 1.705594 4 0.638039 Name: random numbers, dtype: float64
# A Series can be sliced like a list.
# This returns the last three items; each value keeps its original index label.
a[-3:]
2 -1.775740 3 1.705594 4 0.638039 Name: random numbers, dtype: float64
# A single integer key is looked up in the index, which is still the default 0..4 here.
a[3]
1.705594493602101
# A list of keys selects several items, returned in the order requested.
a[[1,3,2]]
1 -1.411180 3 1.705594 2 -1.775740 Name: random numbers, dtype: float64
# Boolean indexing: the key is a mask with one True/False entry per item
# rather than a list of index values; only the items marked True are kept.
a[[True,False,False,True,True]]
0 -0.542996 3 1.705594 4 0.638039 Name: random numbers, dtype: float64
# Replace the default integer index with the labels 'a'..'e';
# items can now be looked up by label.
a.index = list('abcde')
a
a -0.542996 b -1.411180 c -1.775740 d 1.705594 e 0.638039 Name: random numbers, dtype: float64
# Items can now also be selected by label.
a['c']
-1.7757400675142934
# Adding two Series is element-wise arithmetic aligned on the index,
# not concatenation of their values.
# Only labels present in both operands get a result (here just 'c');
# every other label becomes NaN.
a[-3:] + a[:3]
a NaN b NaN c -3.55148 d NaN e NaN Name: random numbers, dtype: float64
# A Series works well as a lookup table: the index holds the keys.
# Here each letter grade maps to its grade points (A -> 4, ..., F -> 0).
letter_grades = ['A', 'B', 'C', 'D', 'F']
grade_points = [4, 3, 2, 1, 0]
gp_lut = pd.Series(grade_points, index=letter_grades)
gp_lut
A 4 B 3 C 2 D 1 F 0 dtype: int64
# Translate each letter grade into grade points through the lookup table,
# then average the points to get the GPA.
my_grades = list('ABBAA')
my_points = gp_lut.loc[my_grades]
my_points
my_gpa = my_points.mean()
my_gpa
3.6000000000000001
# One row per (student, subject) pair: three students, two subjects each.
names = ['Peter', 'Paul', 'Mary', 'Peter', 'Paul', 'Mary']
subjects = ['English', 'English', 'English', 'Math', 'Math', 'Math']
grades = [85.0, 76.0, 92.0, 77.0, 68.0, 87.0]
students = pd.DataFrame({'name': names, 'subject': subjects, 'grade': grades})
students
grade | name | subject | |
---|---|---|---|
0 | 85 | Peter | English |
1 | 76 | Paul | English |
2 | 92 | Mary | English |
3 | 77 | Peter | Math |
4 | 68 | Paul | Math |
5 | 87 | Mary | Math |
# The column labels of a DataFrame are available through .columns.
students.columns
Index(['grade', 'name', 'subject'], dtype='object')
# A single column can also be accessed as an attribute
# (equivalent to students['name']).
students.name
0 Peter 1 Paul 2 Mary 3 Peter 4 Paul 5 Mary Name: name, dtype: object
# Passing a list of column names selects those columns, in the order given.
students[['subject','grade']]
subject | grade | |
---|---|---|
0 | English | 85 |
1 | English | 76 |
2 | English | 92 |
3 | Math | 77 |
4 | Math | 68 |
5 | Math | 87 |
# Build a boolean mask that is True only on the rows where the name is Mary,
# then use the mask to keep just Mary's grades.
is_mary = students['name'] == 'Mary'
students[is_mary]
grade | name | subject | |
---|---|---|---|
2 | 92 | Mary | English |
5 | 87 | Mary | Math |
# Load the data set from a CSV file; the relative path is resolved
# against the current working directory.
data = pd.read_csv('star_2013_clean_wide.csv')
data
County.Code | County.Name | Test.Id | Test.Name | Grade | Students.Tested | Mean.Scale.Score | Count.Test.Grade | Pct.Test.Grade | Population | per.capita.income | median.household.income | median.family.income | Spend | ADAttend | Spend.Per.ADA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Alameda | 7 | CST English-Language Arts | 2 | 16814 | 372.1 | 464896 | 0.036167 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
1 | 1 | Alameda | 8 | CST Mathematics | 2 | 16802 | 398.4 | 464515 | 0.036171 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
2 | 1 | Alameda | 8 | CST Mathematics | 3 | 16198 | 411.1 | 443961 | 0.036485 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
3 | 1 | Alameda | 7 | CST English-Language Arts | 3 | 16126 | 357.2 | 441572 | 0.036520 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
4 | 1 | Alameda | 7 | CST English-Language Arts | 4 | 15390 | 387.0 | 428906 | 0.035882 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
5 | 1 | Alameda | 8 | CST Mathematics | 4 | 15532 | 405.8 | 433012 | 0.035870 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
6 | 1 | Alameda | 8 | CST Mathematics | 5 | 15146 | 411.6 | 432775 | 0.034997 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
7 | 1 | Alameda | 7 | CST English-Language Arts | 5 | 15077 | 378.4 | 429498 | 0.035104 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
8 | 1 | Alameda | 32 | CST Science - Grade 5, Grade 8, and Grade 10 L... | 5 | 15107 | 380.6 | 431142 | 0.035040 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
9 | 1 | Alameda | 7 | CST English-Language Arts | 6 | 15277 | 374.9 | 434374 | 0.035170 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
10 | 1 | Alameda | 8 | CST Mathematics | 6 | 15332 | 380.3 | 436563 | 0.035120 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
11 | 1 | Alameda | 7 | CST English-Language Arts | 7 | 14551 | 378.4 | 431187 | 0.033746 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
12 | 1 | Alameda | 9 | CST Algebra I | 7 | 2239 | 431.6 | 37803 | 0.059228 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
13 | 1 | Alameda | 8 | CST Mathematics | 7 | 12326 | 370.2 | 393811 | 0.031299 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
14 | 1 | Alameda | 9 | CST Algebra I | 8 | 11006 | 354.2 | 276039 | 0.039871 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
15 | 1 | Alameda | 13 | CST Algebra II | 8 | 31 | 458.9 | 680 | 0.045588 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
16 | 1 | Alameda | 7 | CST English-Language Arts | 8 | 14549 | 373.9 | 435491 | 0.033408 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
17 | 1 | Alameda | 29 | CST History - Social Science Grade 8 | 8 | 15421 | 363.3 | 459125 | 0.033588 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
18 | 1 | Alameda | 28 | CST General Mathematics | 8 | 2143 | 308.4 | 145549 | 0.014724 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
19 | 1 | Alameda | 11 | CST Geometry | 8 | 1732 | 428.0 | 29035 | 0.059652 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
20 | 1 | Alameda | 32 | CST Science - Grade 5, Grade 8, and Grade 10 L... | 8 | 14552 | 404.5 | 436071 | 0.033371 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
21 | 1 | Alameda | 10 | CST Integrated Math 1 | 9 | 121 | 298.8 | 980 | 0.123469 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
22 | 1 | Alameda | 20 | CST Biology | 9 | 9473 | 377.0 | 228962 | 0.041374 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
23 | 1 | Alameda | 21 | CST Chemistry | 9 | 186 | 364.3 | 5365 | 0.034669 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
24 | 1 | Alameda | 24 | CST Integrated/Coordinated Science 1 | 9 | 905 | 321.4 | 33944 | 0.026662 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
25 | 1 | Alameda | 22 | CST Earth Science | 9 | 1651 | 328.8 | 133961 | 0.012324 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
26 | 1 | Alameda | 23 | CST Physics | 9 | 754 | 312.8 | 14112 | 0.053430 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
27 | 1 | Alameda | 25 | CST Integrated/Coordinated Science 2 | 9 | 124 | 323.0 | 1351 | 0.091784 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
28 | 1 | Alameda | 18 | CST World History | 9 | 597 | 320.1 | 35087 | 0.017015 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
29 | 1 | Alameda | 11 | CST Geometry | 9 | 6613 | 353.4 | 144063 | 0.045904 | 1494876 | 34937 | 70821 | 87012 | 1814932885 | 193906 | 9360 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3700 | 58 | Yuba | 22 | CST Earth Science | 10 | 290 | 327.0 | 30649 | 0.009462 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3701 | 58 | Yuba | 15 | CST Summative High School Mathematics | 10 | 19 | 353.3 | 24757 | 0.000767 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3702 | 58 | Yuba | 21 | CST Chemistry | 10 | 166 | 358.8 | 136209 | 0.001219 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3703 | 58 | Yuba | 13 | CST Algebra II | 10 | 201 | 335.9 | 131448 | 0.001529 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3704 | 58 | Yuba | 9 | CST Algebra I | 10 | 259 | 285.9 | 104567 | 0.002477 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3705 | 58 | Yuba | 7 | CST English-Language Arts | 10 | 931 | 332.0 | 455362 | 0.002045 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3706 | 58 | Yuba | 20 | CST Biology | 10 | 238 | 343.3 | 228138 | 0.001043 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3707 | 58 | Yuba | 11 | CST Geometry | 10 | 274 | 296.1 | 156167 | 0.001755 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3708 | 58 | Yuba | 11 | CST Geometry | 11 | 151 | 287.1 | 78308 | 0.001928 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3709 | 58 | Yuba | 19 | CST U.S. History | 11 | 895 | 325.7 | 447386 | 0.002001 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3710 | 58 | Yuba | 20 | CST Biology | 11 | 186 | 330.3 | 99776 | 0.001864 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3711 | 58 | Yuba | 7 | CST English-Language Arts | 11 | 899 | 324.9 | 440115 | 0.002043 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3712 | 58 | Yuba | 9 | CST Algebra I | 11 | 187 | 281.3 | 49868 | 0.003750 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3713 | 58 | Yuba | 13 | CST Algebra II | 11 | 199 | 302.4 | 122079 | 0.001630 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3714 | 58 | Yuba | 21 | CST Chemistry | 11 | 121 | 342.4 | 133804 | 0.000904 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3715 | 58 | Yuba | 15 | CST Summative High School Mathematics | 11 | 153 | 322.6 | 124304 | 0.001231 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3716 | 58 | Yuba | 22 | CST Earth Science | 11 | 187 | 326.8 | 42331 | 0.004418 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3717 | 58 | Yuba | 23 | CST Physics | 11 | 91 | 363.8 | 56726 | 0.001604 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3718 | 58 | Yuba | 18 | CST World History | 11 | 40 | 300.3 | 16163 | 0.002475 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3719 | 58 | Yuba | 18 | CST World History | 13 | 874 | 329.1 | 474255 | 0.001843 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3720 | 58 | Yuba | 25 | CST Integrated/Coordinated Science 2 | 13 | 14 | 302.3 | 3742 | 0.003741 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3721 | 58 | Yuba | 20 | CST Biology | 13 | 876 | 340.6 | 556893 | 0.001573 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3722 | 58 | Yuba | 22 | CST Earth Science | 13 | 875 | 328.9 | 206991 | 0.004227 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3723 | 58 | Yuba | 23 | CST Physics | 13 | 92 | 363.6 | 80833 | 0.001138 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3724 | 58 | Yuba | 28 | CST General Mathematics | 13 | 585 | 299.5 | 190615 | 0.003069 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3725 | 58 | Yuba | 13 | CST Algebra II | 13 | 431 | 320.6 | 286737 | 0.001503 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3726 | 58 | Yuba | 21 | CST Chemistry | 13 | 287 | 351.9 | 275452 | 0.001042 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3727 | 58 | Yuba | 15 | CST Summative High School Mathematics | 13 | 172 | 326.0 | 149987 | 0.001147 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3728 | 58 | Yuba | 11 | CST Geometry | 13 | 695 | 307.8 | 407658 | 0.001705 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3729 | 58 | Yuba | 9 | CST Algebra I | 13 | 1495 | 319.4 | 711705 | 0.002101 | 71817 | 20046 | 46617 | 52775 | 99237214 | 11710 | 8474 |
3730 rows × 16 columns
# A look at the available columns.
data.columns
Index(['County.Code', 'County.Name', 'Test.Id', 'Test.Name', 'Grade', 'Students.Tested', 'Mean.Scale.Score', 'Count.Test.Grade', 'Pct.Test.Grade', 'Population', 'per.capita.income', 'median.household.income', 'median.family.income', 'Spend', 'ADAttend', 'Spend.Per.ADA'], dtype='object')
# What are the unique test names? unique() returns each distinct value once.
data['Test.Name'].unique()
array(['CST English-Language Arts', 'CST Mathematics', 'CST Science - Grade 5, Grade 8, and Grade 10 Life Science', 'CST Algebra I', 'CST Algebra II', 'CST History - Social Science Grade 8', 'CST General Mathematics', 'CST Geometry', 'CST Integrated Math 1', 'CST Biology', 'CST Chemistry', 'CST Integrated/Coordinated Science 1', 'CST Earth Science', 'CST Physics', 'CST Integrated/Coordinated Science 2', 'CST World History', 'CST Summative High School Mathematics', 'CST Integrated Math 2', 'CST U.S. History', 'CST Integrated/Coordinated Science 4', 'CST Integrated Math 3', 'CST Integrated/Coordinated Science 3'], dtype=object)
# Total number of students tested, broken down by test (rows) and grade (columns).
# Combinations with no rows in the data show up as NaN.
data.pivot_table(values='Students.Tested',index='Test.Name',columns='Grade',aggfunc='sum')
# Open question: what is grade 13?
# NOTE(review): it is only populated for tests that are taken in more than one
# grade, and its counts are close to the sum of that test's other grade columns,
# so it is presumably an "all grades" total -- TODO confirm against the
# data dictionary for this data set.
Grade | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 |
---|---|---|---|---|---|---|---|---|---|---|---|
Test.Name | |||||||||||
CST Algebra I | NaN | NaN | NaN | NaN | NaN | 37803 | 276039 | 243349 | 104555 | 49846 | 711671 |
CST Algebra II | NaN | NaN | NaN | NaN | NaN | NaN | 680 | 32400 | 131448 | 122079 | 286737 |
CST Biology | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 228962 | 228138 | 99753 | 556867 |
CST Chemistry | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5365 | 136209 | 133804 | 275452 |
CST Earth Science | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 133961 | 30649 | 42316 | 206974 |
CST English-Language Arts | 464896 | 441572 | 428906 | 429498 | 434374 | 431187 | 435491 | 463195 | 455237 | 439972 | NaN |
CST General Mathematics | NaN | NaN | NaN | NaN | NaN | NaN | 145549 | 45052 | NaN | NaN | 190615 |
CST Geometry | NaN | NaN | NaN | NaN | NaN | NaN | 29035 | 144063 | 156167 | 78308 | 407658 |
CST History - Social Science Grade 8 | NaN | NaN | NaN | NaN | NaN | NaN | 459125 | NaN | NaN | NaN | NaN |
CST Integrated Math 1 | NaN | NaN | NaN | NaN | NaN | NaN | 125 | 980 | 5965 | 6318 | 13486 |
CST Integrated Math 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 236 | 1432 | 3364 | 5102 |
CST Integrated Math 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 61 | 153 | 530 | 769 |
CST Integrated/Coordinated Science 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 33944 | 4401 | 7963 | 46386 |
CST Integrated/Coordinated Science 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1351 | 1413 | 955 | 3742 |
CST Integrated/Coordinated Science 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11 | 46 | 874 | 952 |
CST Integrated/Coordinated Science 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 18 | NaN | NaN | 43 |
CST Mathematics | 464515 | 443961 | 433012 | 432775 | 436563 | 393811 | NaN | NaN | NaN | NaN | NaN |
CST Physics | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14112 | 9902 | 56726 | 80833 |
CST Science - Grade 5, Grade 8, and Grade 10 Life Science | NaN | NaN | NaN | 431142 | NaN | NaN | 436071 | NaN | 451253 | NaN | NaN |
CST Summative High School Mathematics | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 809 | 24757 | 124304 | 149987 |
CST U.S. History | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 447253 | NaN |
CST World History | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 35087 | 422893 | 16163 | 474237 |
# First keep the English-Language Arts rows only.
# The explicit .copy() makes `english` an independent DataFrame, so adding
# columns to it later does not raise SettingWithCopyWarning and does not
# depend on whether the boolean filter returned a view of `data`.
english = data[data['Test.Name'] == 'CST English-Language Arts'].copy()
# Average the county-level mean scale scores within each grade.
# This is an unweighted mean: every county counts equally, regardless of how
# many students it tested.
# GroupBy.mean() is called directly because passing np.mean to aggregate()
# emits a FutureWarning in recent pandas releases.
english_grade = english.groupby('Grade')['Mean.Scale.Score'].mean()
english_grade
Grade 2 355.771930 3 343.361404 4 370.684211 5 363.074138 6 361.592982 7 366.333333 8 362.998276 9 360.000000 10 344.621053 11 340.910526 Name: Mean.Scale.Score, dtype: float64
# Plot the per-grade average score (Series.plot draws a line chart by default).
english_grade.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10c4c5dd8>
# Add a calculated column: each county's mean score multiplied by that
# county's share of the students tested for the test and grade (Pct.Test.Grade).
# assign() returns a new DataFrame instead of writing into `english` in place,
# which avoids SettingWithCopyWarning when `english` is a filtered slice of
# `data`. The ** form is needed because the column name contains a dot.
english = english.assign(**{
    'Weighted.Score': english['Mean.Scale.Score'] * english['Pct.Test.Grade'],
})
# Group by grade and plot the sum of the weighted scores. Assuming the shares
# within a grade sum to 1, this sum is the student-weighted mean score.
# GroupBy.sum() is called directly because passing np.sum to aggregate()
# emits a FutureWarning in recent pandas releases.
english.groupby('Grade')['Weighted.Score'].sum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10cd16898>
# Keep the relevant rows only: the CST Mathematics test taken in grade 7.
is_cst_math = data['Test.Name'] == 'CST Mathematics'
is_grade_7 = data['Grade'] == 7
math_7th = data[is_cst_math & is_grade_7]
# A scatter plot is used because the rows are in no particular income order:
# a line plot would connect the points in row order and zig-zag across the
# chart. Sorting by income first would untangle the line, but the points are
# separate counties rather than a sequence, so joining them would still
# suggest a trend between neighbours that the data does not contain.
math_7th.plot(kind='scatter', x='median.family.income', y='Mean.Scale.Score')
<matplotlib.axes._subplots.AxesSubplot at 0x10cdef588>
# Let's peek at the offending row: the one with a mean scale score below 300.
math_7th[math_7th['Mean.Scale.Score'] < 300]
County.Code | County.Name | Test.Id | Test.Name | Grade | Students.Tested | Mean.Scale.Score | Count.Test.Grade | Pct.Test.Grade | Population | per.capita.income | median.household.income | median.family.income | Spend | ADAttend | Spend.Per.ADA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2918 | 46 | Sierra | 8 | CST Mathematics | 7 | 27 | 259.5 | 393811 | 0.000069 | 3277 | 26137 | 50308 | 56469 | 4739373 | 354 | 13391 |
# Plot the score against the number of students tested, using a logarithmic
# x axis (logx=True).
math_7th.plot(x='Students.Tested', y='Mean.Scale.Score',
kind='scatter', logx=True)
# Try running the same without logx -- the data gets squished.
<matplotlib.axes._subplots.AxesSubplot at 0x10e0840b8>
# Drop Sierra county, the low-score outlier found above (only 27 students
# tested), and redraw the income scatter plot without it.
math_7th = math_7th[math_7th['County.Name'] != 'Sierra']
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x10e4d27b8>
# Fit an ordinary least squares model of the mean score on family income.
y = math_7th['Mean.Scale.Score']  # response
# Predictor, with an explicit constant column added for the intercept term.
X = sm.add_constant(math_7th['median.family.income'])
# Build the least squares model and fit it; `est` holds the fitted results.
est = sm.OLS(y, X).fit()
est.summary()
Dep. Variable: | Mean.Scale.Score | R-squared: | 0.298 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.285 |
Method: | Least Squares | F-statistic: | 22.93 |
Date: | Sat, 28 Feb 2015 | Prob (F-statistic): | 1.35e-05 |
Time: | 10:51:10 | Log-Likelihood: | -212.31 |
No. Observations: | 56 | AIC: | 428.6 |
Df Residuals: | 54 | BIC: | 432.7 |
Df Model: | 1 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
const | 332.0775 | 6.059 | 54.808 | 0.000 | 319.930 344.225 |
median.family.income | 0.0004 | 8.87e-05 | 4.788 | 0.000 | 0.000 0.001 |
Omnibus: | 0.749 | Durbin-Watson: | 1.990 |
---|---|---|---|
Prob(Omnibus): | 0.688 | Jarque-Bera (JB): | 0.280 |
Skew: | 0.142 | Prob(JB): | 0.869 |
Kurtosis: | 3.197 | Cond. No. | 2.84e+05 |
# Overlay the fitted regression line on the scatter plot.
# Two points are enough to draw a straight line: the rows of x_ are the
# column-wise minimum and maximum of X, i.e. [const, income] at each end.
lowest, highest = X.min(), X.max()
x_ = np.array([lowest, highest])
y_ = est.predict(x_)
math_7th.plot(kind='scatter', x='median.family.income', y='Mean.Scale.Score')
# Column 1 of x_ is the income value (column 0 is the constant).
plt.plot(x_[:, 1], y_, 'r-')
[<matplotlib.lines.Line2D at 0x10d63df28>]
# Fit a second linear model that uses two predictors instead of one.
y = math_7th['Mean.Scale.Score']  # response
predictor_columns = ['median.family.income', 'Spend.Per.ADA']
# Predictors, with an explicit constant column added for the intercept term.
X = sm.add_constant(math_7th[predictor_columns])
# Build the least squares model and fit it; `est` holds the fitted results.
est = sm.OLS(y, X).fit()
est.summary()
Dep. Variable: | Mean.Scale.Score | R-squared: | 0.359 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.335 |
Method: | Least Squares | F-statistic: | 14.82 |
Date: | Sat, 28 Feb 2015 | Prob (F-statistic): | 7.71e-06 |
Time: | 10:55:14 | Log-Likelihood: | -209.78 |
No. Observations: | 56 | AIC: | 425.6 |
Df Residuals: | 53 | BIC: | 431.6 |
Df Model: | 2 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
const | 308.8693 | 11.900 | 25.955 | 0.000 | 285.000 332.738 |
median.family.income | 0.0004 | 8.56e-05 | 4.935 | 0.000 | 0.000 0.001 |
Spend.Per.ADA | 0.0025 | 0.001 | 2.239 | 0.029 | 0.000 0.005 |
Omnibus: | 0.769 | Durbin-Watson: | 1.849 |
---|---|---|---|
Prob(Omnibus): | 0.681 | Jarque-Bera (JB): | 0.865 |
Skew: | -0.189 | Prob(JB): | 0.649 |
Kurtosis: | 2.523 | Cond. No. | 5.82e+05 |