In [1]:
import os
import sys
import urllib.parse

import numpy as np
import pandas as pd
In [2]:
#Configure the NWIS water-use request: base URL plus query parameters
state = 'la' #Louisiana
path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state)
values = dict(
    format='rdb',
    rdb_compression='value',
    wu_area='County',
    wu_year='2010',  # use '2005,2010' to request multiple years
    wu_county='ALL',
    wu_county_nms='--ALL+Counties--',
    wu_category_nms='--ALL+Categories--',
)
In [3]:
#Build the full request URL by URL-encoding the query parameters
# (urllib.urlencode is Python 2 only; Python 3 uses urllib.parse.urlencode)
url = path + urllib.parse.urlencode(values)
In [4]:
#Pull data in using the URL: the RDB format is tab-separated, '#' marks
# comment lines, and '-' marks missing values. Read both header rows,
# then drop the 2nd (it holds field-width codes, not column names).
# pd.read_table is deprecated; read_csv with an explicit tab separator
# is the supported equivalent.
dfRaw = pd.read_csv(url, sep='\t', comment='#', header=[0,1], na_values='-')
dfRaw.columns = dfRaw.columns.droplevel(level=1)
In [5]:
#Read locally, for debugging, and drop the 2nd row of headers
#dfRaw = pd.read_table('../../Data/Proprietary/LA.txt',comment='#',header=[0,1],na_values='-')
#dfRaw.columns = dfRaw.columns.droplevel(level=1)
In [6]:
#Tidy the data: keep the identifier columns fixed and melt every other
# column into long format (Group = original column name, MGal = value)
rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
dfTidy = dfRaw.melt(id_vars=rowHeadings, var_name='Group', value_name='MGal')
dfTidy.shape
Out[6]:
(17664, 7)
In [7]:
#Keep only rows whose Group label mentions 'Mgal', i.e. actual volume
# measurements; drop count/population style rows
hasVolume = dfTidy['Group'].str.contains('Mgal')
dfTidy = dfTidy[hasVolume]
dfTidy.shape
Out[7]:
(15040, 7)
In [8]:
#Change the type of the MGal column to float so it can be summed
# (np.float was removed in NumPy 1.24; the builtin float is the
# documented replacement and maps to float64 in pandas)
dfTidy['MGal'] = dfTidy['MGal'].astype(float)
dfTidy['MGal'].sum()
Out[8]:
50620.130000000005
In [9]:
#Tag Public Supply rows with a Destination code of 'PS'; all other
# rows are left as NaN in the new Destination column
isPublicSupply = dfTidy['Group'].str.startswith('Public Supply')
dfTidy.loc[isPublicSupply, 'Destination'] = 'PS'
In [10]:
#Summarize: total MGal per Group, written out as <state>.csv
dfState = dfTidy.groupby('Group')['MGal'].sum()
dfState.to_csv('{}.csv'.format(state))