Downloads State Water Use data for 2000/2005/2010 and creates a formatted Physical Water Use Supply table.
Workflow
#Import modules
import sys, os, urllib
import pandas as pd
import numpy as np
#Specify the state and year to process
state = 'la' #Louisiana
year = 2010
#Set the output file location to the Data/State data folder
outFolder = '../../Data/Statedata/'
outFN = outFolder + os.sep + '{0}_{1}.csv'.format(state,year)
#Set the data URL path and parameters and construct the url
path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state)
values = {'format':'rdb',
'rdb_compression':'value',
'wu_area':'County',
'wu_year': year,
'wu_county':'ALL',
'wu_county_nms':'--ALL+Counties--',
'wu_category_nms':'--ALL+Categories--'
}
url = path + urllib.urlencode(values)
#Pull data in using the URL and remove the 2nd row of headers
dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')
dfRaw.columns = dfRaw.columns.droplevel(level=1)
#Read locally, for debugging, and drop the 2nd row of headers
#dfRaw = pd.read_table('../../Data/Proprietary/LA.txt',comment='#',header=[0,1],na_values='-')
#dfRaw.columns = dfRaw.columns.droplevel(level=1)
dfRaw.head()
state_cd | state_name | county_cd | county_nm | year | Total Population total population of area, in thousands | Public Supply population served by groundwater, in thousands | Public Supply population served by surface water, in thousands | Public Supply total population served, in thousands | Public Supply self-supplied groundwater withdrawals, fresh, in Mgal/d | ... | Hydroelectric Power total offstream surface-water withdrawals in Mgal/d | Hydroelectric Power power generated by instream use, in gigawatt-hours | Hydroelectric Power power generated by offstream use, in gigawatt-hours | Hydroelectric Power total power generated, in gigawatt-hours | Hydroelectric Power number of instream facilities | Hydroelectric Power number of offstream facilities | Hydroelectric Power total number of facilities | Wastewater Treatment returns by public wastewater facilities, in Mgal/d | Wastewater Treatment number of public wastewater facilities | Wastewater Treatment reclaimed wastewater released by wastewater facilities, in Mgal/d | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | Louisiana | 1 | Acadia Parish | 2010 | 61.773 | 44.666 | 0.000 | 44.666 | 5.82 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 22 | Louisiana | 3 | Allen Parish | 2010 | 25.764 | 22.573 | 0.000 | 22.573 | 4.27 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 22 | Louisiana | 5 | Ascension Parish | 2010 | 107.215 | 29.510 | 27.094 | 56.604 | 3.02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 22 | Louisiana | 7 | Assumption Parish | 2010 | 23.421 | 0.000 | 23.041 | 23.041 | 0.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 22 | Louisiana | 9 | Avoyelles Parish | 2010 | 42.073 | 39.756 | 0.000 | 39.756 | 3.85 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 281 columns
#Tidy the data
rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')
dfTidy.shape
(17664, 7)
#Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)
dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')]
dfTidy.shape
(15040, 7)
#Change the type of the MGal column to float
dfTidy['MGal'] = dfTidy.MGal.astype(np.float)
dfTidy['MGal'].sum()
50620.130000000005
dfTidy.head()
county_cd | county_nm | state_cd | state_name | year | Group | MGal | |
---|---|---|---|---|---|---|---|
256 | 1 | Acadia Parish | 22 | Louisiana | 2010 | Public Supply self-supplied groundwater withdr... | 5.82 |
257 | 3 | Allen Parish | 22 | Louisiana | 2010 | Public Supply self-supplied groundwater withdr... | 4.27 |
258 | 5 | Ascension Parish | 22 | Louisiana | 2010 | Public Supply self-supplied groundwater withdr... | 3.02 |
259 | 7 | Assumption Parish | 22 | Louisiana | 2010 | Public Supply self-supplied groundwater withdr... | 0.00 |
260 | 9 | Avoyelles Parish | 22 | Louisiana | 2010 | Public Supply self-supplied groundwater withdr... | 3.85 |
#Create lists
##Note: need to keep Livestock above its subcategories, otherwise it will overwrite
# values in the output UseGroup field
useClasses = ["Aquaculture",
"Commercial",
"Domestic",
"Fossil-fuel Thermoelectric Power",
"Geothermal Thermoelectric Power",
"Hydroelectric Power",
"Industrial",
"Irrigation, Crop",
"Irrigation, Golf Courses",
"Irrigation, Total",
"Livestock",
"Livestock \(Animal Specialties\)",
"Livestock \(Stock\)",
"Mining",
"Nuclear Thermoelectric Power",
"Public Supply",
"Thermoelectric Power \(Closed-loop cooling\)",
"Thermoelectric Power \(Once-through cooling\)",
"Total Thermoelectric Power",
"Wastewater Treatment"
]
srcClasses = ["consumptive use",
"self-supplied",
"deliveries from public supply",
"reclaimed wastewater",
"conveyance loss",
"instream water use",
"offstream surface-water withdrawals",
"surface water self-supplied offstream withdrawals,"
"deliveries to commercial,",
"deliveries to domestic",
"deliveries to thermoelectric",
"public use and losses",
"reclaimed wastewater"
]
srcTypes = ['fresh',
'saline',
'surface water withdrawals',
'groundwater withdrawals',
]
#Create and populate UseClass field
dfTidy['UseClass'] = '-'
#Loop thru use classes and assign as new column
for useClass in useClasses:
#Remote the backslash needed in selecting records
fixClass = useClass.replace("\\","")
#Update rows where the Group field contains the use class
dfTidy.ix[dfTidy.Group.str.contains(useClass),'UseClass'] = fixClass
#dfTidy.UseClass.unique().tolist()
#Create a field of everythihng but the UseClass
dfTidy['Foo'] = [e.replace(k,'') for e,k in zip(dfTidy.Group,dfTidy.UseClass)]
vals = pd.DataFrame(dfTidy['Foo'].unique())
vals.to_csv("etc.csv")
#Subtract useclass from group
dfTidy['Foo'] = '-'
def substrX(grpVal,useVal):
return "foo"
#return grpVal.replace(useVal,"")
dfTidy['Foo'] = dfTidy.apply(lambda row: substrX("AB","B"))#.str.replace(x['UseClass'],""))
#for index,rows in dfTidy.iterrows():
dfTidy['Foo'].head()
256 NaN 257 NaN 258 NaN 259 NaN 260 NaN Name: Foo, dtype: object
#Create and populate SrcClass field
dfTidy['SrcClass'] = '-'
#Loop thru use classes and assign as new column
for srcClass in srcClasses:
#Update rows where the Group field contains the use class
dfTidy.ix[dfTidy.Group.str.contains(srcClass),'SrcClass'] = srcClass
dfTidy.SrcClass.unique().tolist()
['self-supplied', 'deliveries to domestic', 'deliveries to commercial,', '-', 'deliveries to thermoelectric', 'public use and losses', 'reclaimed wastewater', 'deliveries from public supply', 'consumptive use', 'conveyance loss']
#CHECK
dfCheck = dfTidy[dfTidy['SrcClass'] == '-']
#dfCheck2 = dfTidy[dfTidy.Group.str.contains(str('Thermoelectric Power \(Once-through cooling\)'))]
dfCheck.Group.unique()
array(['Public Supply deliveries to industrial, in Mgal/d', 'Public Supply total deliveries, in Mgal/d', 'Hydroelectric Power instream water use, in Mgal/d', 'Hydroelectric Power offstream surface-water withdrawals, fresh, in Mgal/d', 'Hydroelectric Power total offstream surface-water withdrawals in Mgal/d', 'Wastewater Treatment returns by public wastewater facilities, in Mgal/d'], dtype=object)
#Summarize
dfState = dfTidy.groupby(['Group'])['MGal'].sum()
dfState.to_csv(outFN)