This notebook consolidates water withdrawal, discharge, and transfer data from NCDEQ's Water Withdrawal & Transfer Registry (http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report). The sequence of analysis is as follows:
#import libraries
import os,requests
import pandas as pd
from bs4 import BeautifulSoup
#Extract the contents of the base web page into a 'soup' object for scraping.
#This is the registry's report-listing page; 'soup' is reused by the cells below.
baseURL = 'http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report'
#Network I/O: download the page (no timeout/retry handling here)
r = requests.get(baseURL)
#Parse the HTML using the lxml parser
soup = BeautifulSoup(r.text,'lxml')
The following works with the current format of the web page; if the page layout changes, this code may need to be revised. Here we select the elements of the page that contain the data we want to extract.
#Locate the element with id="main"; find() returns the first match,
#equivalent to find_all(...)[0] on the current page layout
mainElement = soup.find(id="main")
#The registry records live in the second table nested inside it
dataTable = mainElement.find_all('table')[1]
#Each <tr> row of that table is one registry record
rows = dataTable.find_all('tr')
#Empty dataframe that will accumulate one record per registry row
colNames = ['Owner', 'Name', 'Status', 'Code']
dfSites = pd.DataFrame(columns=colNames)
dfSites.head()
#Loop through each row (skipping the first, which contains headers),
#collecting one dict per row. DataFrame.append was removed in pandas 2.0,
#so we gather the records in a list and build the frame in one call.
records = []
for row in rows[1:]:
    #Create a collection of columns (cells) for the current row
    columns = row.find_all('td')
    #Construct a dictionary of the items we want; the site code is the
    #second-to-last path segment of the link in the fourth cell
    records.append({'Owner': columns[0].string,
                    'Name': columns[1].string,
                    'Status': columns[2].string,
                    'Code': columns[3].find("a")['href'].split("/")[-2]})
#Build the dataframe from the collected records
dfSites = pd.DataFrame(records, columns=colNames)
dfSites.head()
#Create a folder to hold all the downloads; makedirs(exist_ok=True) is
#atomic with respect to the existence check (no exists()/mkdir() race)
outFolder = "NCDEQ"
os.makedirs(outFolder, exist_ok=True)
#Save contents to a file, building the path from outFolder instead of
#repeating the hard-coded folder name
dfSites.to_csv(os.path.join(outFolder, "WithdrawalMaster.csv"), index=False)
def unstackTable(dfStacked):
    '''
    Unstacks monthly tables presented in 2-column formats into a
    single column format. For example:
    | Jan | Jul |
    | Feb | Aug |
    | Mar | Sep |
    | Apr | Oct |
    | May | Nov |
    | Jun | Dec |
    is converted to a single column with associated data attached.

    Parameters:
        dfStacked: raw dataframe whose first row holds column labels and
            whose 8 columns are two side-by-side (Month, days, avg, max)
            groups with identical labels in each half.
    Returns:
        A dataframe indexed by Month; the first data column is coerced to
        int (NaN -> 0) and the next two to float.
    '''
    #Copy the table so the caller's frame is untouched
    df2 = dfStacked.copy(deep=True)
    #Convert the first row to columns, then drop the row
    df2.columns = df2.iloc[0]
    df2.drop(0, inplace=True)
    #Stack the right half (cols 4-7) beneath the left half (cols 0-3).
    #DataFrame.append was removed in pandas 2.0; pd.concat aligns the two
    #halves on their (identical) column labels just as append did.
    df2 = pd.concat([df2.iloc[:, :4], df2.iloc[:, 4:]])
    #Set month to be the index
    df2.set_index("Month", inplace=True)
    #Convert data types for columns 2, 3, and 4 (days, avg, max)
    df2.iloc[:, 0] = df2.iloc[:, 0].fillna(0).astype(int)
    df2.iloc[:, 1] = df2.iloc[:, 1].astype(float)
    df2.iloc[:, 2] = df2.iloc[:, 2].astype(float)
    #Return the table
    return df2
def ScrapeSite(siteID, year, first=False):
    '''
    Scrape the withdrawal/discharge/transfer report for one site and year
    and append the results to four CSV files in the NCDEQ folder.

    Parameters:
        siteID: registry code for the facility (the 'Code' column value)
        year:   report year to retrieve
        first:  when True, (re)create the output CSVs with header rows;
                when False, append rows without headers
    Returns:
        Tuple of the four output CSV file paths.
    '''
    #--DATA EXTRACTION--
    #Construct the URL
    siteURL = 'http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report/view/{0}/{1}'.format(siteID,year)
    #Extract all tables from the URL into a collection of dataframes
    dfs = pd.read_html(siteURL,na_values='NaN')
    #Separate tables into labeled variables, unstacking the monthly tables.
    #(dfs[4], dfs[6], dfs[7] are handled in their own sections below.)
    dfFacility = dfs[2]                    # Information on the facility
    dfWithdrawal = unstackTable(dfs[3])    # Monthly withdrawal data
    dfDischarge = unstackTable(dfs[5])     # Monthly discharge data
    dfTransfer = unstackTable(dfs[8])      # Monthly transfer data
    #Extract facility information into variables
    registrant = dfFacility.iloc[0,1]
    #NOTE(review): facility_name reads the same cell as registrant; the
    #facility name may actually live at iloc[1,1] -- verify against the page
    #layout before changing, since downstream CSVs depend on this column.
    facility_name = dfFacility.iloc[0,1]
    county = dfFacility.iloc[2,1]
    subbasin = dfFacility.iloc[2,3]
    facility_type = dfFacility.iloc[1,3]
    #--MONTHLY VOLUME DATA----------------------
    #Combine monthly withdrawal, discharge, and transfer tables side by side
    dfSiteData = pd.concat([dfWithdrawal,dfDischarge,dfTransfer], axis=1).reset_index()
    #Add site information as columns
    dfSiteData['SiteID'] = siteID
    dfSiteData['Year'] = year
    dfSiteData['Registrant'] = registrant
    dfSiteData['Facility'] = facility_name
    dfSiteData['Type'] = facility_type
    dfSiteData['County'] = county
    dfSiteData['Subbasin'] = subbasin
    #Rearrange columns: move the site-info columns ahead of the data columns
    cols = dfSiteData.columns.tolist()
    dfSiteData = dfSiteData[cols[10:] + cols[:10]]
    #--WITHDRAWAL INFO--------------------------
    #Information on where water was drawn; first row held the headers
    dfSource = dfs[4].copy(deep=True)
    dfSource.columns = ('Name','Type','AvgDaily','DaysUsed','Capacity_MGD')
    dfSource.drop(0,inplace=True)
    dfSource.insert(0,'SiteID',siteID)
    dfSource.insert(1,'Year',year)
    dfSource.insert(2,'FacilityType',facility_type)
    dfSource.insert(3,'County',county)
    dfSource.insert(4,'Subbasin',subbasin)
    #--DISCHARGE INFO-------------------------
    #Information on type and amounts of discharge
    dfDischargeMethod = dfs[6].copy(deep=True)
    dfDischargeMethod.columns = ('Permit','Type','AvgDaily','DaysUsed','Capacity_MGD')
    dfDischargeMethod.drop(0,inplace=True)
    dfDischargeMethod.insert(0,'SiteID',siteID)
    dfDischargeMethod.insert(1,'Year',year)
    dfDischargeMethod.insert(2,'FacilityType',facility_type)
    dfDischargeMethod.insert(3,'County',county)
    dfDischargeMethod.insert(4,'Subbasin',subbasin)
    #--TRANSFER INFO------------------------------
    #Information on source and destination of transfers
    dfTransferDescription = dfs[7].copy(deep=True)
    dfTransferDescription.columns = ('Description','SourceBasin','ReceivingBasin','Capacity')
    dfTransferDescription.drop(0,inplace=True)
    dfTransferDescription.insert(0,'SiteID',siteID)
    dfTransferDescription.insert(1,'Year',year)
    dfTransferDescription.insert(2,'FacilityType',facility_type)
    dfTransferDescription.insert(3,'County',county)
    dfTransferDescription.insert(4,'Subbasin',subbasin)
    #-WRITE DATA TO OUTPUT FILES------------------
    outCSV1 = "NCDEQ/MonthlyVolumeData.csv"
    outCSV2 = "NCDEQ/WithdrawalSourceData.csv"
    outCSV3 = "NCDEQ/DischargeMethods.csv"
    outCSV4 = "NCDEQ/TransferInfo.csv"
    #First call creates fresh files with header rows; later calls append
    #without headers. to_csv handles the file mode directly, so no explicit
    #open() is needed.
    mode = 'w' if first else 'a'
    dfSiteData.to_csv(outCSV1, mode=mode, header=first, index=False)
    dfSource.to_csv(outCSV2, mode=mode, header=first, index=False)
    dfDischargeMethod.to_csv(outCSV3, mode=mode, header=first, index=False)
    dfTransferDescription.to_csv(outCSV4, mode=mode, header=first, index=False)
    return (outCSV1, outCSV2, outCSV3, outCSV4)
#Tracks whether the next ScrapeSite call should create fresh output files
firstFile = True
#Scrape every site's data across the report years
for index, row in dfSites.iterrows():
    #Skip draft entries
    if row['Status'] == 'Draft':
        continue
    #Pull the site code, then scrape each year from 2010 through 2017
    siteID = row['Code']
    print(index, siteID, end=': ')
    for year in range(2010, 2018):
        print(year, end='...')
        outFiles = ScrapeSite(siteID, year, first=firstFile)
        #Everything after the very first write appends to the same files
        firstFile = False
    print()