import pandas as pd
from pandas import DataFrame
#This is the 'Active Asthma Prevalence, All Ages, 2007' from
#http://www.californiabreathing.org/asthma-data/county-comparisons/active-asthma-prevalence
asth = open('asthma_prev_2007.csv')
asth_county = pd.read_csv(asth)
asth_county
'COUNTY (OR GROUP)' | 'Percent with Active Asthma' | '95% Confidence Interval' | |
---|---|---|---|
0 | 'Merced' | '14.4' | '(9.3 - 19.6)' |
1 | 'Tehama/Glenn/Colusa' | '13.7' | '(9.5 - 17.9)' |
2 | 'Yuba' | '12.7' | '(8.9 - 16.6)' |
3 | 'Kern' | '11.9' | '(8.6 - 15.2)' |
4 | 'Fresno' | '11.3' | '(8.4 - 14.3)' |
5 | 'Sacramento' | '11.2' | '(8.9 - 13.5)' |
6 | 'Solano' | '10.9' | '(6.7 - 15.1)' |
7 | 'San Joaquin' | '10.8' | '(7.5 - 14.2)' |
8 | 'Santa Cruz' | '10.6' | '(6.6 - 14.5)' |
9 | 'Kings' | '10.1' | '(7.4 - 12.9)' |
10 | 'Sutter' | '10.0' | '(7.2 - 12.8)' |
11 | 'Alameda' | '9.8' | '(7.5 - 12.2)' |
12 | 'Lake' | '9.8' | '(6.9 - 12.7)' |
13 | 'Napa' | '9.8' | '(6.8 - 12.7)' |
14 | 'Butte' | '9.7' | '(7.2 - 12.2)' |
15 | 'Shasta' | '9.7' | '(5.7 - 13.7)' |
16 | 'Contra Costa' | '9.6' | '(7.2 - 11.9)' |
17 | 'Imperial' | '9.5' | '(6.9 - 12.2)' |
18 | 'Madera' | '9.5' | '(6.9 - 12.1)' |
19 | 'Nevada' | '9.4' | '(6.6 - 12.1)' |
20 | 'Del Norte/Siskiyou/Lassen/Trinity/Modoc/Pluma... | '9.2' | '(5.8 - 12.6)' |
21 | 'San Mateo' | '9.2' | '(5.9 - 12.6)' |
22 | 'Placer' | '9.0' | '(6.2 - 11.7)' |
23 | 'San Bernardino' | '9.0' | '(7.5 - 10.5)' |
24 | 'Stanislaus' | '9.0' | '(5.6 - 12.4)' |
25 | 'Tulare' | '8.8' | '(6.2 - 11.3)' |
26 | 'Yolo' | '8.8' | '(5.0 - 12.7)' |
27 | 'Tuolumne/Calaveras/Amador/Inyo/Mariposa/Mono/... | '8.7' | '(5.8 - 11.6)' |
28 | 'El Dorado' | '8.3' | '(5.9 - 10.6)' |
29 | 'CALIFORNIA OVERALL' | '8.1' | '(7.7 - 8.4)' |
30 | 'Ventura' | '8.1' | '(5.4 - 10.8)' |
31 | 'Santa Barbara' | '8.0' | '(5.1 - 10.9)' |
32 | 'Sonoma' | '7.8' | '(5.2 - 10.3)' |
33 | 'San Luis Obispo' | '7.5' | '(4.4 - 10.6)' |
34 | 'San Diego' | '7.3' | '(6.3 - 8.3)' |
35 | 'Los Angeles' | '7.2' | '(6.5 - 7.8)' |
36 | 'Humboldt' | '7.0' | '(4.9 - 9.2)' |
37 | 'Mendocino' | '6.9' | '(4.6 - 9.3)' |
38 | 'Monterey' | '6.8' | '(4.3 - 9.4)' |
39 | 'Orange' | '6.8' | '(5.7 - 8.0)' |
40 | 'Santa Clara' | '6.8' | '(5.3 - 8.4)' |
41 | 'Marin' | '6.5' | '(3.4 - 9.6)' |
42 | 'San Francisco' | '6.0' | '(3.4 - 8.6)' |
43 | 'Riverside' | '5.2' | '(4.2 - 6.2)' |
#Clean up the columnames
asth_county
asth_county['percent_active'] = asth_county['\'Percent with Active Asthma\''].apply(lambda x: float(x[1:-1]))
asth_county['county_name'] = asth_county[asth_county.columns[0]].apply(lambda x: str(x[1:-1]))
asth_county['95_confidence'] = asth_county['\'95% Confidence Interval\''].apply(lambda x: str(x[1:-1]))
#Delete old columns with messy names
del asth_county[asth_county.columns[0]]
del asth_county[asth_county.columns[0]]
del asth_county[asth_county.columns[0]]
Some rows represent multiple counties, we got to undo this grouping
new_rows = []
def unpack_group(row):
group = row['county_name']
if '/' in group:
counties = group.split('/')
for county in counties:
new_rows.append({'county_name': county, 'percent_active': row['percent_active'], '95_confidence': row['95_confidence']})
#Apply said function
asth_county.apply(unpack_group, axis = 1)
0 None 1 None 2 None 3 None 4 None 5 None 6 None 7 None 8 None 9 None 10 None 11 None 12 None 13 None 14 None 15 None 16 None 17 None 18 None 19 None 20 None 21 None 22 None 23 None 24 None 25 None 26 None 27 None 28 None 29 None 30 None 31 None 32 None 33 None 34 None 35 None 36 None 37 None 38 None 39 None 40 None 41 None 42 None 43 None
new_df = DataFrame(new_rows)
#Join old dataframe and new rows
new_asth = pd.concat([new_df, asth_county], ignore_index=True)
#Remove rows that are groups
isnt_group = lambda x: '/' not in x
new_asth[new_asth['county_name'].apply(isnt_group)]
95_confidence | county_name | percent_active | |
---|---|---|---|
0 | (9.5 - 17.9) | Tehama | 13.7 |
1 | (9.5 - 17.9) | Glenn | 13.7 |
2 | (9.5 - 17.9) | Colusa | 13.7 |
3 | (5.8 - 12.6) | Del Norte | 9.2 |
4 | (5.8 - 12.6) | Siskiyou | 9.2 |
5 | (5.8 - 12.6) | Lassen | 9.2 |
6 | (5.8 - 12.6) | Trinity | 9.2 |
7 | (5.8 - 12.6) | Modoc | 9.2 |
8 | (5.8 - 12.6) | Plumas | 9.2 |
9 | (5.8 - 12.6) | Sierra | 9.2 |
10 | (5.8 - 11.6) | Tuolumne | 8.7 |
11 | (5.8 - 11.6) | Calaveras | 8.7 |
12 | (5.8 - 11.6) | Amador | 8.7 |
13 | (5.8 - 11.6) | Inyo | 8.7 |
14 | (5.8 - 11.6) | Mariposa | 8.7 |
15 | (5.8 - 11.6) | Mono | 8.7 |
16 | (5.8 - 11.6) | Alpine | 8.7 |
17 | (9.3 - 19.6) | Merced | 14.4 |
19 | (8.9 - 16.6) | Yuba | 12.7 |
20 | (8.6 - 15.2) | Kern | 11.9 |
21 | (8.4 - 14.3) | Fresno | 11.3 |
22 | (8.9 - 13.5) | Sacramento | 11.2 |
23 | (6.7 - 15.1) | Solano | 10.9 |
24 | (7.5 - 14.2) | San Joaquin | 10.8 |
25 | (6.6 - 14.5) | Santa Cruz | 10.6 |
26 | (7.4 - 12.9) | Kings | 10.1 |
27 | (7.2 - 12.8) | Sutter | 10.0 |
28 | (7.5 - 12.2) | Alameda | 9.8 |
29 | (6.9 - 12.7) | Lake | 9.8 |
30 | (6.8 - 12.7) | Napa | 9.8 |
31 | (7.2 - 12.2) | Butte | 9.7 |
32 | (5.7 - 13.7) | Shasta | 9.7 |
33 | (7.2 - 11.9) | Contra Costa | 9.6 |
34 | (6.9 - 12.2) | Imperial | 9.5 |
35 | (6.9 - 12.1) | Madera | 9.5 |
36 | (6.6 - 12.1) | Nevada | 9.4 |
38 | (5.9 - 12.6) | San Mateo | 9.2 |
39 | (6.2 - 11.7) | Placer | 9.0 |
40 | (7.5 - 10.5) | San Bernardino | 9.0 |
41 | (5.6 - 12.4) | Stanislaus | 9.0 |
42 | (6.2 - 11.3) | Tulare | 8.8 |
43 | (5.0 - 12.7) | Yolo | 8.8 |
45 | (5.9 - 10.6) | El Dorado | 8.3 |
46 | (7.7 - 8.4) | CALIFORNIA OVERALL | 8.1 |
47 | (5.4 - 10.8) | Ventura | 8.1 |
48 | (5.1 - 10.9) | Santa Barbara | 8.0 |
49 | (5.2 - 10.3) | Sonoma | 7.8 |
50 | (4.4 - 10.6) | San Luis Obispo | 7.5 |
51 | (6.3 - 8.3) | San Diego | 7.3 |
52 | (6.5 - 7.8) | Los Angeles | 7.2 |
53 | (4.9 - 9.2) | Humboldt | 7.0 |
54 | (4.6 - 9.3) | Mendocino | 6.9 |
55 | (4.3 - 9.4) | Monterey | 6.8 |
56 | (5.7 - 8.0) | Orange | 6.8 |
57 | (5.3 - 8.4) | Santa Clara | 6.8 |
58 | (3.4 - 9.6) | Marin | 6.5 |
59 | (3.4 - 8.6) | San Francisco | 6.0 |
60 | (4.2 - 6.2) | Riverside | 5.2 |