Here we download the raw usage data for years 2000, 2005, and 2010 from the USGS water-use site and synthesize it into a single tidy table listing, for each year and county FIPS code, total annual withdrawals (MGal/day) tagged by use sector, source class (surface or groundwater), and water type (fresh or saline). This table is formatted as follows:
STATE | FIPS | YEAR | UseClass | SrcClass | SrcType | Amount | |
---|---|---|---|---|---|---|---|
AL | 01001 | 2000 | PublicSupply | Surface | Fresh | 0.00 | |
AL | 01001 | 2000 | Industrial | Ground | Saline | 0.00 |
This table can then easily be summarized and joined, by YEAR and FIPS code, to other accounting data tables and summarized by state.
#Import modules required for analysis
import os
import pandas as pd
import numpy as np
#Prefix remapping dictionary: translate the 2000/2005 column-name prefixes
#to the naming scheme used by the 2010 dataset so all years line up
remapDict = {
    'IT-': 'IR-',     # Irrigation (total)
    'LA-': 'AQ-',     # Aquaculture
    'LS-': 'LI-',     # Livestock
    'PE-': 'PC-',     # Closed-loop thermoelectric
    'Wtotl': 'WTotl', # Capitalization mismatch
}
#Create a function for importing a table and standardizing all field names
def importTable(year, remapDict):
    """Download the USGS county-level water-use table for `year` and
    standardize its column names to the 2010 naming scheme.

    Parameters:
        year      -- survey year (2000, 2005, or 2010); used to build the URL
        remapDict -- dict mapping old column-name substrings to 2010-style ones

    Returns a pandas DataFrame with a 5-character string FIPS column, a YEAR
    column, and without the STATEFIPS/COUNTYFIPS or trailing 'Unnamed' columns.
    """
    #Set the download URL using the year (the year appears twice in the path)
    theURL = 'http://water.usgs.gov/watuse/data/{0}/usco{0}.txt'.format(year)
    #Download the tab-delimited dataset, keeping FIPS as a string so
    #leading zeros are not lost (read_csv replaces deprecated read_table)
    df = pd.read_csv(theURL, sep='\t', dtype={'FIPS': str})
    #Ensure the FIPS column maintains 5 characters (restore leading zeros);
    #vectorized str.zfill also leaves any missing values as NaN instead of '00nan'
    df['FIPS'] = df['FIPS'].str.zfill(5)
    #Remove the STATEFIPS and COUNTYFIPS columns (redundant with FIPS)
    df.drop(['STATEFIPS', 'COUNTYFIPS'], axis=1, inplace=True)
    #Use the remap dictionary to rename columns.
    #NOTE: this is a substring replacement, not prefix-only; it is safe for
    #the current USGS column names but would misfire on a name containing a
    #key mid-string — confirm if new survey years are added.
    colNames = df.columns.values.tolist()
    for inFld, outFld in remapDict.items():
        colNames = [name.replace(inFld, outFld) for name in colNames]
    #Update the column names in the data frame
    df.columns = colNames
    #Add year field, if not present (2000 data lacks it)
    if 'YEAR' not in df.columns.values:
        df.insert(1, 'YEAR', year)
    #Remove the trailing unnamed column, which oddly appears in the
    #2005 and 2010 datasets (an artifact of a trailing tab in the source file)
    if 'Unnamed' in df.columns.values[-1]:
        df.drop(df.columns.values[-1], axis=1, inplace=True)
    #Status (print() with a single argument works in both Python 2 and 3,
    #unlike the original Python-2-only print statement)
    print('{} records and {} attributes returned for year {}'.format(
        df.shape[0], df.shape[1], year))
    #Return the data frame
    return df
#Get the tables for each survey year (prints a status line per year)
df2000, df2005, df2010 = [importTable(yr, remapDict) for yr in (2000, 2005, 2010)]
3222 records and 69 attributes returned for year 2000 3224 records and 106 attributes returned for year 2005 3224 records and 115 attributes returned for year 2010
#CHECK: example output with a column for each supply/source/type grouping.
df2000.head()
STATE | YEAR | FIPS | TP-TotPop | PS-TOPop | PS-WGWFr | PS-WSWFr | PS-WFrTo | DO-SSPop | DO-WGWFr | ... | PC-WTotl | TO-WGWFr | TO-WGWSa | TO-WGWTo | TO-WSWFr | TO-WSWSa | TO-WSWTo | TO-WFrTo | TO-WSaTo | TO-WTotl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AL | 2000 | 01001 | 43.67 | 39.30 | 5.70 | 0.00 | 5.70 | 4.37 | 2.95 | ... | 0.0 | 10.65 | 0.0 | 10.65 | 26.60 | 0.0 | 26.60 | 37.25 | 0.0 | 37.25 |
1 | AL | 2000 | 01003 | 140.42 | 107.61 | 19.70 | 0.00 | 19.70 | 32.81 | 2.46 | ... | 0.0 | 27.04 | 0.0 | 27.04 | 7.59 | 0.0 | 7.59 | 34.63 | 0.0 | 34.63 |
2 | AL | 2000 | 01005 | 29.04 | 22.27 | 4.85 | 0.00 | 4.85 | 6.77 | 0.51 | ... | 0.0 | 6.36 | 0.0 | 6.36 | 1.06 | 0.0 | 1.06 | 7.42 | 0.0 | 7.42 |
3 | AL | 2000 | 01007 | 20.83 | 17.93 | 3.78 | 0.00 | 3.78 | 2.90 | 0.22 | ... | 0.0 | 4.00 | 0.0 | 4.00 | 0.00 | 0.0 | 0.00 | 4.00 | 0.0 | 4.00 |
4 | AL | 2000 | 01009 | 51.02 | 45.90 | 2.55 | 29.26 | 31.81 | 5.12 | 0.38 | ... | 0.0 | 2.93 | 0.0 | 2.93 | 29.33 | 0.0 | 29.33 | 32.26 | 0.0 | 32.26 |
5 rows × 69 columns
#"Melt" data so each use/class/type becomes a new row
#All three years share the same id columns and melt settings
meltArgs = dict(id_vars=['STATE', 'FIPS', 'YEAR'], var_name='Class', value_name='Amount')
df2000m = pd.melt(df2000, **meltArgs)
df2005m = pd.melt(df2005, **meltArgs)
df2010m = pd.melt(df2010, **meltArgs)
#CHECK: Observe how the data has been 'melted'
df2000m.head()
STATE | FIPS | YEAR | Class | Amount | |
---|---|---|---|---|---|
0 | AL | 01001 | 2000 | TP-TotPop | 43.67 |
1 | AL | 01003 | 2000 | TP-TotPop | 140.42 |
2 | AL | 01005 | 2000 | TP-TotPop | 29.04 |
3 | AL | 01007 | 2000 | TP-TotPop | 20.83 |
4 | AL | 01009 | 2000 | TP-TotPop | 51.02 |
#Stack the three melted year tables into one long table
yearFrames = [df2000m, df2005m, df2010m]
dfUse = pd.concat(yearFrames, ignore_index=True)
#CHECK: dfUse should have as many rows as the 2000, 2005, and 2010 data combined
dfUse.shape[0]
905812
#Create remap dictionaries. These enable extraction of specific sub-attributes
# contained in the Class field. For example, PS-WGWFr refers to Fresh groundwater
# withdrawn for Public Supply.
#Use-class codes appear as the two-letter prefix of each Class value
useClassMap = {
    'TP': 'Population',
    'PS': 'PublicSupply',
    'DO': 'Domestic',
    'IN': 'Industrial',
    'IR': 'Irrigation',
    'IC': 'Irrigation_Crop',
    'IG': 'Irrigation_Golf',
    'LI': 'Livestock',
    'AQ': 'Aquaculture',
    'MI': 'Mining',
    'PT': 'ThermoElec',
    'PO': 'ThermoElec_OnceThru',
    'PC': 'ThermoElec_Recirc',
    'TO': 'Total',
}
#Source-class codes appear mid-string (groundwater vs. surface water)
srcClassMap = {
    '-WGW': 'Groundwater',
    '-WSW': 'Surface',
}
#Source-type codes appear as the suffix (fresh vs. saline)
srcTypeMap = {
    'Fr': 'Fresh',
    'Sa': 'Saline',
}
#Decode the Class field into three new columns using the remap dictionaries.
#Use classes (PS, DO, LI, etc.) are matched as prefixes
for prefix, label in useClassMap.items():
    dfUse.loc[dfUse.Class.str.startswith(prefix), 'UseClass'] = label
#Source classes (surface vs. groundwater) are matched mid-string
for token, label in srcClassMap.items():
    dfUse.loc[dfUse.Class.str.contains(token), 'SrcClass'] = label
#Source types (fresh vs. saline) are matched as suffixes
for suffix, label in srcTypeMap.items():
    dfUse.loc[dfUse.Class.str.endswith(suffix), 'SrcType'] = label
#CHECK: List classes with null UseClass remaps
dfUse.loc[dfUse['UseClass'].isnull(), 'Class'].unique()
array(['COUNTY'], dtype=object)
#CHECK: List the Class codes left with a null SrcClass after remapping
dfUse.loc[dfUse['SrcClass'].isnull(), 'Class'].unique()
array(['TP-TotPop', 'PS-TOPop', 'PS-WFrTo', 'DO-SSPop', 'DO-WFrTo', 'IN-WFrTo', 'IN-WSaTo', 'IN-WTotl', 'IR-IrSpr', 'IR-IrMic', 'IR-IrSur', 'IR-IrTot', 'IR-WFrTo', 'AQ-WFrTo', 'LI-WFrTo', 'MI-WFrTo', 'MI-WSaTo', 'MI-WTotl', 'PT-WFrTo', 'PT-WSaTo', 'PT-WTotl', 'PC-WFrTo', 'PC-WSaTo', 'PC-WTotl', 'TO-WFrTo', 'TO-WSaTo', 'TO-WTotl', 'PS-GWPop', 'PS-SWPop', 'PS-WSaTo', 'PS-WTotl', 'DO-PSDel', 'DO-TOTAL ', 'IC-WFrTo', 'IC-IrSpr', 'IC-IrMic', 'IC-IrSur', 'IC-IrTot', 'IG-WFrTo', 'IG-IrSpr', 'IG-IrMic', 'IG-IrSur', 'IG-IrTot', 'PT-Power', 'PO-WFrTo', 'PO-WSaTo', 'PO-WTotl', 'PO-Power', 'PC-Power', 'COUNTY', 'DO-SSPCp', 'DO-PSPCp', 'AQ-WSaTo', 'AQ-WTotl'], dtype=object)
#CHECK: List classes with null SrcType remaps
dfUse.loc[dfUse['SrcType'].isnull(), 'Class'].unique()
array(['TP-TotPop', 'PS-TOPop', 'PS-WFrTo', 'DO-SSPop', 'DO-WFrTo', 'IN-WGWTo', 'IN-WSWTo', 'IN-WFrTo', 'IN-WSaTo', 'IN-WTotl', 'IR-IrSpr', 'IR-IrMic', 'IR-IrSur', 'IR-IrTot', 'IR-WFrTo', 'AQ-WFrTo', 'LI-WFrTo', 'MI-WGWTo', 'MI-WSWTo', 'MI-WFrTo', 'MI-WSaTo', 'MI-WTotl', 'PT-WSWTo', 'PT-WFrTo', 'PT-WSaTo', 'PT-WTotl', 'PO-WSWTo', 'PC-WSWTo', 'PC-WFrTo', 'PC-WSaTo', 'PC-WTotl', 'TO-WGWTo', 'TO-WSWTo', 'TO-WFrTo', 'TO-WSaTo', 'TO-WTotl', 'PS-GWPop', 'PS-SWPop', 'PS-WGWTo', 'PS-WSWTo', 'PS-WSaTo', 'PS-WTotl', 'DO-PSDel', 'DO-TOTAL ', 'IC-WFrTo', 'IC-IrSpr', 'IC-IrMic', 'IC-IrSur', 'IC-IrTot', 'IG-WFrTo', 'IG-IrSpr', 'IG-IrMic', 'IG-IrSur', 'IG-IrTot', 'PT-WGWTo', 'PT-Power', 'PO-WGWTo', 'PO-WFrTo', 'PO-WSaTo', 'PO-WTotl', 'PO-Power', 'PC-WGWTo', 'PC-Power', 'COUNTY', 'DO-SSPCp', 'DO-PSPCp', 'AQ-WGWTo', 'AQ-WSWTo', 'AQ-WSaTo', 'AQ-WTotl'], dtype=object)
#Keep only fully-decoded rows: a null in any column means the Class code was
#a population, total, or other field that is not a sector/source withdrawal
dfOutput = dfUse.dropna(axis='index', how='any')
#CHECK: see that new columns were added to decode the Class
dfOutput.head()
STATE | FIPS | YEAR | Class | Amount | UseClass | SrcClass | SrcType | |
---|---|---|---|---|---|---|---|---|
6444 | AL | 01001 | 2000 | PS-WGWFr | 5.7 | PublicSupply | Groundwater | Fresh |
6445 | AL | 01003 | 2000 | PS-WGWFr | 19.7 | PublicSupply | Groundwater | Fresh |
6446 | AL | 01005 | 2000 | PS-WGWFr | 4.85 | PublicSupply | Groundwater | Fresh |
6447 | AL | 01007 | 2000 | PS-WGWFr | 3.78 | PublicSupply | Groundwater | Fresh |
6448 | AL | 01009 | 2000 | PS-WGWFr | 2.55 | PublicSupply | Groundwater | Fresh |
#Set the output location and filename
dataDir = '../../Data'
outFN = 'UsageDataTidy.csv'
#Write the tidy table; os.path.join builds the path portably instead of
#manual string concatenation with os.sep. NOTE(review): assumes dataDir
#already exists — to_csv will raise IOError otherwise.
dfOutput.to_csv(os.path.join(dataDir, outFN), index=False)