Compile State Usage Data

Loops through a list of all states an pulls USGS water use data for each state into a single table.

Required packages:

* pandas

Created September 2017
[email protected]

Set year and the output filename

In [ ]:
#Set the year
year = 2010
In [ ]:
#Create the output csv file
outFilename = "../Data/AllStatesUsage{}.csv".format(year)

The rest is automated...

First, we need to import required packages, installing 3rd party packages if required

In [ ]:
#Import built-in modules
import os, urllib
In [ ]:
#Import pandas, install if needed
    import pandas as pd
    import pip
    import pandas as pd
In [ ]:
#Read in the file of state abbreviations
stateAbbr = open('stateabbr.csv','r').readlines()

Define the function that will pull the data, reshape it, and create a 'tidy' data frame

In [ ]:
def getData(state_abbr,year):
    '''Downloads USGS data and creates a tidy dataframe of all the usage data for the state'''
    #Set the data URL path and parameters and construct the url
    path = '{}/nwis/water_use?'.format(state_abbr)
    values = {'format':'rdb',
             'wu_year': year,
    url = path + urllib.urlencode(values)
    #Pull data in using the URL and remove the 2nd row of headers
    dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')
    dfRaw.columns = dfRaw.columns.droplevel(level=1)

    #Tidy the data: transform so data in each usage column become row values with a new column listing the usage type
    rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
    dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')

    #Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)
    dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)

    #Change the type of the MGal column to float 
    dfTidy['MGal'] = dfTidy.MGal.astype('float')
    #Summarize county data for the whole state
    stateSummary = dfTidy.groupby(['Group'])['MGal'].sum()
    #Convert to a dataframe
    dfState = pd.DataFrame(stateSummary)
    #Rename the MGal column to the state abbreviation
    dfState.columns = [state_abbr]
    #Return the dataframe
    return dfState
Import data for the first state into a tidy data frame
In [ ]:
#Get the data for the first state
abbr = stateAbbr[0].strip("\n")
print "Processing {}, ".format(abbr),
dfAll = getData(abbr,year)
Loop through the remaining states and merge them to the data frame created above
In [ ]:
#Loop through the remaining states
for state in stateAbbr[1:]:
    abbr = state.strip("\n")
    print abbr,
    dfState = getData(abbr,year)
    dfAll = pd.merge(dfAll,dfState,how='inner',left_index='Group',right_index='Group')
Write the output to a csv file
In [ ]: