In [3]:

import pandas as pd
from pandas import DataFrame
#This is the 'Active Asthma Prevalence, All Ages, 2007' from
#http://www.californiabreathing.org/asthma-data/county-comparisons/active-asthma-prevalence
asth = open('asthma_prev_2007.csv')

In [4]:

asth_county = pd.read_csv(asth)
asth_county

Out[4]:

	'COUNTY (OR GROUP)'	'Percent with Active Asthma'	'95% Confidence Interval'
0	'Merced'	'14.4'	'(9.3 - 19.6)'
1	'Tehama/Glenn/Colusa'	'13.7'	'(9.5 - 17.9)'
2	'Yuba'	'12.7'	'(8.9 - 16.6)'
3	'Kern'	'11.9'	'(8.6 - 15.2)'
4	'Fresno'	'11.3'	'(8.4 - 14.3)'
5	'Sacramento'	'11.2'	'(8.9 - 13.5)'
6	'Solano'	'10.9'	'(6.7 - 15.1)'
7	'San Joaquin'	'10.8'	'(7.5 - 14.2)'
8	'Santa Cruz'	'10.6'	'(6.6 - 14.5)'
9	'Kings'	'10.1'	'(7.4 - 12.9)'
10	'Sutter'	'10.0'	'(7.2 - 12.8)'
11	'Alameda'	'9.8'	'(7.5 - 12.2)'
12	'Lake'	'9.8'	'(6.9 - 12.7)'
13	'Napa'	'9.8'	'(6.8 - 12.7)'
14	'Butte'	'9.7'	'(7.2 - 12.2)'
15	'Shasta'	'9.7'	'(5.7 - 13.7)'
16	'Contra Costa'	'9.6'	'(7.2 - 11.9)'
17	'Imperial'	'9.5'	'(6.9 - 12.2)'
18	'Madera'	'9.5'	'(6.9 - 12.1)'
19	'Nevada'	'9.4'	'(6.6 - 12.1)'
20	'Del Norte/Siskiyou/Lassen/Trinity/Modoc/Pluma...	'9.2'	'(5.8 - 12.6)'
21	'San Mateo'	'9.2'	'(5.9 - 12.6)'
22	'Placer'	'9.0'	'(6.2 - 11.7)'
23	'San Bernardino'	'9.0'	'(7.5 - 10.5)'
24	'Stanislaus'	'9.0'	'(5.6 - 12.4)'
25	'Tulare'	'8.8'	'(6.2 - 11.3)'
26	'Yolo'	'8.8'	'(5.0 - 12.7)'
27	'Tuolumne/Calaveras/Amador/Inyo/Mariposa/Mono/...	'8.7'	'(5.8 - 11.6)'
28	'El Dorado'	'8.3'	'(5.9 - 10.6)'
29	'CALIFORNIA OVERALL'	'8.1'	'(7.7 - 8.4)'
30	'Ventura'	'8.1'	'(5.4 - 10.8)'
31	'Santa Barbara'	'8.0'	'(5.1 - 10.9)'
32	'Sonoma'	'7.8'	'(5.2 - 10.3)'
33	'San Luis Obispo'	'7.5'	'(4.4 - 10.6)'
34	'San Diego'	'7.3'	'(6.3 - 8.3)'
35	'Los Angeles'	'7.2'	'(6.5 - 7.8)'
36	'Humboldt'	'7.0'	'(4.9 - 9.2)'
37	'Mendocino'	'6.9'	'(4.6 - 9.3)'
38	'Monterey'	'6.8'	'(4.3 - 9.4)'
39	'Orange'	'6.8'	'(5.7 - 8.0)'
40	'Santa Clara'	'6.8'	'(5.3 - 8.4)'
41	'Marin'	'6.5'	'(3.4 - 9.6)'
42	'San Francisco'	'6.0'	'(3.4 - 8.6)'
43	'Riverside'	'5.2'	'(4.2 - 6.2)'

In [5]:

#Clean up the columnames
asth_county
asth_county['percent_active'] = asth_county['\'Percent with Active Asthma\''].apply(lambda x: float(x[1:-1]))
asth_county['county_name'] = asth_county[asth_county.columns[0]].apply(lambda x: str(x[1:-1]))
asth_county['95_confidence'] = asth_county['\'95% Confidence Interval\''].apply(lambda x: str(x[1:-1]))

In [6]:

#Delete old columns with messy names
del asth_county[asth_county.columns[0]]
del asth_county[asth_county.columns[0]]
del asth_county[asth_county.columns[0]]

Some rows represent multiple counties, we got to undo this grouping

In [7]:

new_rows = []
def unpack_group(row):
    group = row['county_name']
    if '/' in group:
        counties = group.split('/')
        for county in counties:
            new_rows.append({'county_name': county, 'percent_active': row['percent_active'], '95_confidence': row['95_confidence']})
    

In [8]:

#Apply said function
asth_county.apply(unpack_group, axis = 1)

Out[8]:

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None

In [9]:

new_df = DataFrame(new_rows)

In [10]:

#Join old dataframe and new rows
new_asth = pd.concat([new_df, asth_county], ignore_index=True)

In [11]:

#Remove rows that are groups
isnt_group = lambda x: '/' not in x

In [12]:

new_asth[new_asth['county_name'].apply(isnt_group)]

Out[12]:

	95_confidence	county_name	percent_active
0	(9.5 - 17.9)	Tehama	13.7
1	(9.5 - 17.9)	Glenn	13.7
2	(9.5 - 17.9)	Colusa	13.7
3	(5.8 - 12.6)	Del Norte	9.2
4	(5.8 - 12.6)	Siskiyou	9.2
5	(5.8 - 12.6)	Lassen	9.2
6	(5.8 - 12.6)	Trinity	9.2
7	(5.8 - 12.6)	Modoc	9.2
8	(5.8 - 12.6)	Plumas	9.2
9	(5.8 - 12.6)	Sierra	9.2
10	(5.8 - 11.6)	Tuolumne	8.7
11	(5.8 - 11.6)	Calaveras	8.7
12	(5.8 - 11.6)	Amador	8.7
13	(5.8 - 11.6)	Inyo	8.7
14	(5.8 - 11.6)	Mariposa	8.7
15	(5.8 - 11.6)	Mono	8.7
16	(5.8 - 11.6)	Alpine	8.7
17	(9.3 - 19.6)	Merced	14.4
19	(8.9 - 16.6)	Yuba	12.7
20	(8.6 - 15.2)	Kern	11.9
21	(8.4 - 14.3)	Fresno	11.3
22	(8.9 - 13.5)	Sacramento	11.2
23	(6.7 - 15.1)	Solano	10.9
24	(7.5 - 14.2)	San Joaquin	10.8
25	(6.6 - 14.5)	Santa Cruz	10.6
26	(7.4 - 12.9)	Kings	10.1
27	(7.2 - 12.8)	Sutter	10.0
28	(7.5 - 12.2)	Alameda	9.8
29	(6.9 - 12.7)	Lake	9.8
30	(6.8 - 12.7)	Napa	9.8
31	(7.2 - 12.2)	Butte	9.7
32	(5.7 - 13.7)	Shasta	9.7
33	(7.2 - 11.9)	Contra Costa	9.6
34	(6.9 - 12.2)	Imperial	9.5
35	(6.9 - 12.1)	Madera	9.5
36	(6.6 - 12.1)	Nevada	9.4
38	(5.9 - 12.6)	San Mateo	9.2
39	(6.2 - 11.7)	Placer	9.0
40	(7.5 - 10.5)	San Bernardino	9.0
41	(5.6 - 12.4)	Stanislaus	9.0
42	(6.2 - 11.3)	Tulare	8.8
43	(5.0 - 12.7)	Yolo	8.8
45	(5.9 - 10.6)	El Dorado	8.3
46	(7.7 - 8.4)	CALIFORNIA OVERALL	8.1
47	(5.4 - 10.8)	Ventura	8.1
48	(5.1 - 10.9)	Santa Barbara	8.0
49	(5.2 - 10.3)	Sonoma	7.8
50	(4.4 - 10.6)	San Luis Obispo	7.5
51	(6.3 - 8.3)	San Diego	7.3
52	(6.5 - 7.8)	Los Angeles	7.2
53	(4.9 - 9.2)	Humboldt	7.0
54	(4.6 - 9.3)	Mendocino	6.9
55	(4.3 - 9.4)	Monterey	6.8
56	(5.7 - 8.0)	Orange	6.8
57	(5.3 - 8.4)	Santa Clara	6.8
58	(3.4 - 9.6)	Marin	6.5
59	(3.4 - 8.6)	San Francisco	6.0
60	(4.2 - 6.2)	Riverside	5.2