import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
#Airquality Data comes from https://ofmext.epa.gov/AQDMRS/aqdmrs.html
#This query includes the ozone data for the year 2011 in the state of california
a = open('AQDM_507410362.txt')
df = pd.read_csv(a)
df.head()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5 entries, 0 to 4 Data columns: Latitude 5 non-null values Longitude 5 non-null values Datum 5 non-null values Horizontal Accuracy 5 non-null values State Code 5 non-null values County Code 5 non-null values Site Num 5 non-null values Parameter Code 5 non-null values POC 5 non-null values AQS Parameter Desc 5 non-null values Date Local 5 non-null values 24 Hour Local 5 non-null values Date GMT 5 non-null values 24 Hour GMT 5 non-null values Year GMT 5 non-null values Day In Year GMT 5 non-null values Sample Measurement 5 non-null values Units of Measure 5 non-null values Sample Duration 5 non-null values Sample Frequency 0 non-null values Detection Limit 5 non-null values Measurement Uncertainty 0 non-null values Qualifier Description 0 non-null values Method Type 5 non-null values Method Description 5 non-null values dtypes: float64(13), object(12)
len(df)
2995068
Group all measurements by measurement site (identified by countycode and site number) and create a new DataFrame with data averaged over time per measurement site
county_df = df.groupby(['County Code','Site Num', 'Latitude', 'Longitude'])
county_code = list()
sitenum = list()
lat = list()
lon = list()
mean_meas = list()
max_meas = list()
min_meas = list()
median_meas = list()
for grpname,grp in county_df:
county_code.append(grpname[0])
sitenum.append(grpname[1])
lat.append(float(grpname[2]))
lon.append(float(grpname[3]))
mean_meas.append(grp['Sample Measurement'].mean())
max_meas.append(grp['Sample Measurement'].max())
min_meas.append(grp['Sample Measurement'].min())
median_meas.append(grp['Sample Measurement'].median())
sites_dict = {'County Code': county_code, 'Site Num': sitenum, 'Mean Ozone': mean_meas, 'Max Ozone': max_meas, 'Min Ozone': min_meas, 'Median Ozone': median_meas, 'Latitude': lat, 'Longitude': lon}
sites_df = DataFrame(sites_dict)
sites_df.head()
County Code | Latitude | Longitude | Max Ozone | Mean Ozone | Median Ozone | Min Ozone | Site Num | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 37.687526 | -121.784217 | 0.115 | 0.023289 | 0.023 | 0.000 | 7 |
1 | 1 | 37.687526 | -121.784217 | 0.063 | 0.033426 | 0.035 | 0.003 | 7 |
2 | 1 | 37.743065 | -122.169935 | 0.091 | 0.018510 | 0.018 | 0.000 | 9 |
3 | 1 | 37.743065 | -122.169935 | 0.051 | 0.027224 | 0.028 | 0.002 | 9 |
4 | 1 | 37.814781 | -122.282347 | 0.057 | 0.018001 | 0.018 | 0.000 | 11 |
Look at some overall stats for our new DataFrame
print 'Std ', sites_df['Mean Ozone'].std()
print 'Mean ',sites_df['Mean Ozone'].mean()
print 'Median',sites_df['Mean Ozone'].median()
print 'Lat Max',sites_df['Latitude'].max()
print 'Lat Min',sites_df['Latitude'].min()
print 'Lon Max',sites_df['Longitude'].max()
print 'Lon Min',sites_df['Longitude'].min()
Std 0.00994831782491 Mean 0.035125235605 Median 0.0332733333333 Lat Max 41.726892 Lat Min 32.552164 Lon Max -114.602886 Lon Min -124.20139
Plot all measureing stations to see if they are evenly distributed
plt.scatter(sites_df['Longitude'],sites_df['Latitude'])
<matplotlib.collections.PathCollection at 0x11126e6c>
Convert the 'County Code' into a FIPS, ie add on the '06' California prefix
#sites_df['FIPS'] = sites_df['County Code'].apply(lambda x: '0' + (str(6000 + int(x) )))
sites_df['FIPS'] = sites_df['County Code'].apply(lambda x: 6000 + int(x) )
sites_df['FIPS'][100:110]
100 6037 101 6037 102 6037 103 6037 104 6037 105 6037 106 6037 107 6037 108 6039 109 6039 Name: FIPS
county_averages = sites_df.groupby('FIPS')['Mean Ozone'].mean()
county_averages.index
Int64Index([6001, 6005, 6007, 6009, 6011, 6013, 6017, 6019, 6021, 6023, 6025, 6027, 6029, 6031, 6033, 6037, 6039, 6041, 6043, 6045, 6047, 6053, 6055, 6057, 6059, 6061, 6065, 6067, 6069, 6071, 6073, 6075, 6077, 6079, 6081, 6083, 6085, 6087, 6089, 6093, 6095, 6097, 6099, 6101, 6103, 6107, 6109, 6111, 6113], dtype=int64)
county_averages.hist(bins = 20)
<matplotlib.axes.AxesSubplot at 0xae410cc>
county_averages.mean(), county_averages.std()
(0.033272136153468475, 0.006327595808016743)
med = pd.read_csv(open('FIPSAsthmaCounty.csv'), sep=',')
med = med.set_index('FIPS')
med.columns = ['county', 'prevalence']
med.index
Int64Index([6005, 6011, 6027, 6001, 6003, 6007, 6009, 6013, 6015, 6017, 6019, 6021, 6023, 6025, 6029, 6031, 6033, 6035, 6037, 6039, 6041, 6043, 6045, 6047, 6049, 6051, 6053, 6055, 6057, 6059, 6061, 6063, 6065, 6067, 6071, 6073, 6075, 6077, 6079, 6081, 6083, 6085, 6087, 6089, 6091, 6093, 6095, 6097, 6099, 6101, 6103, 6105, 6107, 6109, 6111, 6113, 6115], dtype=int64)
ozone = DataFrame(county_averages)
both = pd.concat([med, ozone], axis = 1)
both[:10]
county | prevalence | Mean Ozone | |
---|---|---|---|
FIPS | |||
6001 | Alameda | 9.8 | 0.025886 |
6003 | Alpine | 8.7 | NaN |
6005 | Amador | 8.7 | 0.031071 |
6007 | Butte | 9.7 | 0.039171 |
6009 | Calaveras | 8.7 | 0.036388 |
6011 | Colusa | 13.7 | 0.030787 |
6013 | Contra Costa | 9.6 | 0.027290 |
6015 | Del Norte | 9.2 | NaN |
6017 | El Dorado | 8.3 | 0.040176 |
6019 | Fresno | 11.3 | 0.040742 |
both = both.dropna()
both[:10]
county | prevalence | Mean Ozone | |
---|---|---|---|
FIPS | |||
6001 | Alameda | 9.8 | 0.025886 |
6005 | Amador | 8.7 | 0.031071 |
6007 | Butte | 9.7 | 0.039171 |
6009 | Calaveras | 8.7 | 0.036388 |
6011 | Colusa | 13.7 | 0.030787 |
6013 | Contra Costa | 9.6 | 0.027290 |
6017 | El Dorado | 8.3 | 0.040176 |
6019 | Fresno | 11.3 | 0.040742 |
6021 | Glenn | 13.7 | 0.032210 |
6023 | Humboldt | 7.0 | 0.023115 |
plt.scatter(both['Mean Ozone'], both['prevalence'])
<matplotlib.collections.PathCollection at 0x20c592ac>
z = np.polyfit(both['Mean Ozone'], both['prevalence'], 1)
p = np.poly1d(z)
plt.scatter(both['Mean Ozone'], both['prevalence'])
plt.plot(both['Mean Ozone'],p(both['Mean Ozone']),"r--")
#plt.ylim(both['prevalence'].min(), both['prevalence'].max())
#plt.xlim(both['Mean Ozone'].min(), both['Mean Ozone'].max())
[<matplotlib.lines.Line2D at 0x23eb33ac>]
print 'r= '+str(both['Mean Ozone'].corr(both['prevalence']))
r= 0.218297247024
As we are dealing with atmosphere and not a physical law, this is pretty good. r should be higher once we use the data for 2007