Downloads State Water Use data for 2000/2005/2010 and creates a formatted Physical Water Use Supply table.

Sample URL (Louisiana) https://waterdata.usgs.gov/la/nwis/water_use?format=rdb&rdb_compression=value&wu_area=County&wu_year=2000%2C2005%2C2010&wu_county=ALL&wu_category=ALL&wu_county_nms=--ALL%2BCounties--&wu_category_nms=--ALL%2BCategories--

Workflow

  • Construct the url and download the data into a pandas data frame
  • Melt/gather the usage columns into row values under the column name 'Group'
  • Remove rows with no usage data (identified by not having "Mgal" in the 'Group' name) *
In [4]:
#Import modules
import sys, os, urllib
import pandas as pd
import numpy as np
In [5]:
#Specify the state and year to process
state = 'la' #Louisiana
year = 2010
In [6]:
#Set the output file location to the Data/State data folder
outFolder = '../../Data/Statedata/'
outFN = outFolder + os.sep + '{0}_{1}.csv'.format(state,year)
In [7]:
#Set the data URL path and parameters and construct the url
path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state)
values = {'format':'rdb',
         'rdb_compression':'value',
         'wu_area':'County',
         'wu_year': year,
         'wu_county':'ALL',
         'wu_county_nms':'--ALL+Counties--',
         'wu_category_nms':'--ALL+Categories--'
        }
url = path + urllib.urlencode(values)
In [8]:
#Pull data in using the URL and remove the 2nd row of headers
dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')
dfRaw.columns = dfRaw.columns.droplevel(level=1)
In [9]:
#Read locally, for debugging, and drop the 2nd row of headers
#dfRaw = pd.read_table('../../Data/Proprietary/LA.txt',comment='#',header=[0,1],na_values='-')
#dfRaw.columns = dfRaw.columns.droplevel(level=1)
dfRaw.head()
Out[9]:
state_cd state_name county_cd county_nm year Total Population total population of area, in thousands Public Supply population served by groundwater, in thousands Public Supply population served by surface water, in thousands Public Supply total population served, in thousands Public Supply self-supplied groundwater withdrawals, fresh, in Mgal/d ... Hydroelectric Power total offstream surface-water withdrawals in Mgal/d Hydroelectric Power power generated by instream use, in gigawatt-hours Hydroelectric Power power generated by offstream use, in gigawatt-hours Hydroelectric Power total power generated, in gigawatt-hours Hydroelectric Power number of instream facilities Hydroelectric Power number of offstream facilities Hydroelectric Power total number of facilities Wastewater Treatment returns by public wastewater facilities, in Mgal/d Wastewater Treatment number of public wastewater facilities Wastewater Treatment reclaimed wastewater released by wastewater facilities, in Mgal/d
0 22 Louisiana 1 Acadia Parish 2010 61.773 44.666 0.000 44.666 5.82 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 22 Louisiana 3 Allen Parish 2010 25.764 22.573 0.000 22.573 4.27 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 22 Louisiana 5 Ascension Parish 2010 107.215 29.510 27.094 56.604 3.02 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 22 Louisiana 7 Assumption Parish 2010 23.421 0.000 23.041 23.041 0.00 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 22 Louisiana 9 Avoyelles Parish 2010 42.073 39.756 0.000 39.756 3.85 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 281 columns

In [10]:
#Tidy the data
rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')
dfTidy.shape
Out[10]:
(17664, 7)
In [11]:
#Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)
dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')]
dfTidy.shape
Out[11]:
(15040, 7)
In [12]:
#Change the type of the MGal column to float
dfTidy['MGal'] = dfTidy.MGal.astype(np.float)
dfTidy['MGal'].sum()
Out[12]:
50620.130000000005
In [13]:
dfTidy.head()
Out[13]:
county_cd county_nm state_cd state_name year Group MGal
256 1 Acadia Parish 22 Louisiana 2010 Public Supply self-supplied groundwater withdr... 5.82
257 3 Allen Parish 22 Louisiana 2010 Public Supply self-supplied groundwater withdr... 4.27
258 5 Ascension Parish 22 Louisiana 2010 Public Supply self-supplied groundwater withdr... 3.02
259 7 Assumption Parish 22 Louisiana 2010 Public Supply self-supplied groundwater withdr... 0.00
260 9 Avoyelles Parish 22 Louisiana 2010 Public Supply self-supplied groundwater withdr... 3.85
In [64]:
#Create lists

##Note: need to keep Livestock above its subcategories, otherwise it will overwrite
#       values in the output UseGroup field
useClasses = ["Aquaculture",
              "Commercial",
              "Domestic",
              "Fossil-fuel Thermoelectric Power",
              "Geothermal Thermoelectric Power",
              "Hydroelectric Power",
              "Industrial",
              "Irrigation, Crop",
              "Irrigation, Golf Courses",
              "Irrigation, Total",
              "Livestock",
              "Livestock \(Animal Specialties\)",
              "Livestock \(Stock\)",
              "Mining",
              "Nuclear Thermoelectric Power",
              "Public Supply",
              "Thermoelectric Power \(Closed-loop cooling\)",
              "Thermoelectric Power \(Once-through cooling\)",
              "Total Thermoelectric Power",
              "Wastewater Treatment"
             ]
srcClasses = ["consumptive use",
              "self-supplied",
              "deliveries from public supply",
              "reclaimed wastewater",
              "conveyance loss",
              "instream water use",
              "offstream surface-water withdrawals",
              "surface water self-supplied offstream withdrawals,"
              "deliveries to commercial,",
              "deliveries to domestic",
              "deliveries to thermoelectric",
              "public use and losses",
              "reclaimed wastewater"             
             ]
srcTypes = ['fresh',
            'saline',
            'surface water withdrawals',
            'groundwater withdrawals',
           ]
In [75]:
#Create and populate UseClass field
dfTidy['UseClass'] = '-'
#Loop thru use classes and assign as new column
for useClass in useClasses:
    #Remote the backslash needed in selecting records
    fixClass = useClass.replace("\\","")
    #Update rows where the Group field contains the use class
    dfTidy.ix[dfTidy.Group.str.contains(useClass),'UseClass'] = fixClass
#dfTidy.UseClass.unique().tolist()
In [112]:
#Create a field of everythihng but the UseClass
dfTidy['Foo'] = [e.replace(k,'') for e,k in zip(dfTidy.Group,dfTidy.UseClass)]
vals = pd.DataFrame(dfTidy['Foo'].unique())
vals.to_csv("etc.csv")
In [83]:
#Subtract useclass from group
dfTidy['Foo'] = '-'
def substrX(grpVal,useVal):
    return "foo"
    #return grpVal.replace(useVal,"")
dfTidy['Foo'] = dfTidy.apply(lambda row: substrX("AB","B"))#.str.replace(x['UseClass'],""))
#for index,rows in dfTidy.iterrows():
dfTidy['Foo'].head()
Out[83]:
256    NaN
257    NaN
258    NaN
259    NaN
260    NaN
Name: Foo, dtype: object
In [73]:
#Create and populate SrcClass field
dfTidy['SrcClass'] = '-'
#Loop thru use classes and assign as new column
for srcClass in srcClasses:
    #Update rows where the Group field contains the use class
    dfTidy.ix[dfTidy.Group.str.contains(srcClass),'SrcClass'] = srcClass
dfTidy.SrcClass.unique().tolist()
Out[73]:
['self-supplied',
 'deliveries to domestic',
 'deliveries to commercial,',
 '-',
 'deliveries to thermoelectric',
 'public use and losses',
 'reclaimed wastewater',
 'deliveries from public supply',
 'consumptive use',
 'conveyance loss']
In [74]:
#CHECK
dfCheck = dfTidy[dfTidy['SrcClass'] == '-']
#dfCheck2 = dfTidy[dfTidy.Group.str.contains(str('Thermoelectric Power \(Once-through cooling\)'))]
dfCheck.Group.unique()
Out[74]:
array(['Public Supply deliveries to industrial, in Mgal/d',
       'Public Supply total deliveries, in Mgal/d',
       'Hydroelectric Power instream water use, in Mgal/d',
       'Hydroelectric Power offstream surface-water withdrawals, fresh, in Mgal/d',
       'Hydroelectric Power total offstream surface-water withdrawals in Mgal/d',
       'Wastewater Treatment returns by public wastewater facilities, in Mgal/d'], dtype=object)
In [13]:
#Summarize 
dfState = dfTidy.groupby(['Group'])['MGal'].sum()
dfState.to_csv(outFN)