This notebook consolidates water withdrawal, discharge, and transfer data from NCDEQ's Water Withdrawal & Transfer Registry (http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report). The sequence of analysis is as follows:
#import libraries
import os,requests
import pandas as pd
from bs4 import BeautifulSoup
#Extract the contents of the base web page into a 'soup' object for scraping.
#This is the registry's report-listing page; 'soup' is reused by the cells below.
baseURL = 'http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report'
#Network I/O: download the page (no timeout/retry handling here)
r = requests.get(baseURL)
#Parse the HTML using the lxml parser
soup = BeautifulSoup(r.text,'lxml')
The following works with the current format of the web page; if the page layout changes, this code may need to be revised. Here we select the elements of the page that contain the data we want to extract.
#Locate the element with id="main"; find() returns the first match,
#equivalent to find_all(...)[0] on the current page layout
mainElement = soup.find(id="main")
#The registry records live in the second table nested inside it
dataTable = mainElement.find_all('table')[1]
#Each <tr> row of that table is one registry record
rows = dataTable.find_all('tr')
#Empty dataframe that will accumulate one record per registry row
colNames = ['Owner', 'Name', 'Status', 'Code']
dfSites = pd.DataFrame(columns=colNames)
dfSites.head()
#Loop through each row (skipping the first, which contains headers),
#collecting one dict per row. DataFrame.append was removed in pandas 2.0,
#so we gather the records in a list and build the frame in one call.
records = []
for row in rows[1:]:
    #Create a collection of columns (cells) for the current row
    columns = row.find_all('td')
    #Construct a dictionary of the items we want; the site code is the
    #second-to-last path segment of the link in the fourth cell
    records.append({'Owner': columns[0].string,
                    'Name': columns[1].string,
                    'Status': columns[2].string,
                    'Code': columns[3].find("a")['href'].split("/")[-2]})
#Build the dataframe from the collected records
dfSites = pd.DataFrame(records, columns=colNames)
dfSites.head()
#Create a folder to hold all the downloads; makedirs(exist_ok=True) is
#atomic with respect to the existence check (no exists()/mkdir() race)
outFolder = "NCDEQ"
os.makedirs(outFolder, exist_ok=True)
#Save contents to a file, building the path from outFolder instead of
#repeating the hard-coded folder name
dfSites.to_csv(os.path.join(outFolder, "WithdrawalMaster.csv"), index=False)
def unstackTable(dfStacked):
    '''
    Unstacks monthly tables presented in 2-column formats into a
    single column format. For example:
    | Jan | Jul |
    | Feb | Aug |
    | Mar | Sep |
    | Apr | Oct |
    | May | Nov |
    | Jun | Dec |
    is converted to a single column with associated data attached.

    Parameters:
        dfStacked: raw dataframe whose first row holds column labels and
            whose 8 columns are two side-by-side (Month, days, avg, max)
            groups with identical labels in each half.
    Returns:
        A dataframe indexed by Month; the first data column is coerced to
        int (NaN -> 0) and the next two to float.
    '''
    #Copy the table so the caller's frame is untouched
    df2 = dfStacked.copy(deep=True)
    #Convert the first row to columns, then drop the row
    df2.columns = df2.iloc[0]
    df2.drop(0, inplace=True)
    #Stack the right half (cols 4-7) beneath the left half (cols 0-3).
    #DataFrame.append was removed in pandas 2.0; pd.concat aligns the two
    #halves on their (identical) column labels just as append did.
    df2 = pd.concat([df2.iloc[:, :4], df2.iloc[:, 4:]])
    #Set month to be the index
    df2.set_index("Month", inplace=True)
    #Convert data types for columns 2, 3, and 4 (days, avg, max)
    df2.iloc[:, 0] = df2.iloc[:, 0].fillna(0).astype(int)
    df2.iloc[:, 1] = df2.iloc[:, 1].astype(float)
    df2.iloc[:, 2] = df2.iloc[:, 2].astype(float)
    #Return the table
    return df2
def ScrapeSite(siteID, year, first=False):
    '''
    Scrape the withdrawal/discharge/transfer report for one site and year
    and append the results to four CSV files in the NCDEQ folder.

    Parameters:
        siteID: registry code for the facility (the 'Code' column value)
        year:   report year to retrieve
        first:  when True, (re)create the output CSVs with header rows;
                when False, append rows without headers
    Returns:
        Tuple of the four output CSV file paths.
    '''
    #--DATA EXTRACTION--
    #Construct the URL
    siteURL = 'http://www.ncwater.org/Permits_and_Registration/Water_Withdrawal_and_Transfer_Registration/report/view/{0}/{1}'.format(siteID,year)
    #Extract all tables from the URL into a collection of dataframes
    dfs = pd.read_html(siteURL,na_values='NaN')
    #Separate tables into labeled variables, unstacking the monthly tables.
    #(dfs[4], dfs[6], dfs[7] are handled in their own sections below.)
    dfFacility = dfs[2]                    # Information on the facility
    dfWithdrawal = unstackTable(dfs[3])    # Monthly withdrawal data
    dfDischarge = unstackTable(dfs[5])     # Monthly discharge data
    dfTransfer = unstackTable(dfs[8])      # Monthly transfer data
    #Extract facility information into variables
    registrant = dfFacility.iloc[0,1]
    #NOTE(review): facility_name reads the same cell as registrant; the
    #facility name may actually live at iloc[1,1] -- verify against the page
    #layout before changing, since downstream CSVs depend on this column.
    facility_name = dfFacility.iloc[0,1]
    county = dfFacility.iloc[2,1]
    subbasin = dfFacility.iloc[2,3]
    facility_type = dfFacility.iloc[1,3]
    #--MONTHLY VOLUME DATA----------------------
    #Combine monthly withdrawal, discharge, and transfer tables side by side
    dfSiteData = pd.concat([dfWithdrawal,dfDischarge,dfTransfer], axis=1).reset_index()
    #Add site information as columns
    dfSiteData['SiteID'] = siteID
    dfSiteData['Year'] = year
    dfSiteData['Registrant'] = registrant
    dfSiteData['Facility'] = facility_name
    dfSiteData['Type'] = facility_type
    dfSiteData['County'] = county
    dfSiteData['Subbasin'] = subbasin
    #Rearrange columns: move the site-info columns ahead of the data columns
    cols = dfSiteData.columns.tolist()
    dfSiteData = dfSiteData[cols[10:] + cols[:10]]
    #--WITHDRAWAL INFO--------------------------
    #Information on where water was drawn; first row held the headers
    dfSource = dfs[4].copy(deep=True)
    dfSource.columns = ('Name','Type','AvgDaily','DaysUsed','Capacity_MGD')
    dfSource.drop(0,inplace=True)
    dfSource.insert(0,'SiteID',siteID)
    dfSource.insert(1,'Year',year)
    dfSource.insert(2,'FacilityType',facility_type)
    dfSource.insert(3,'County',county)
    dfSource.insert(4,'Subbasin',subbasin)
    #--DISCHARGE INFO-------------------------
    #Information on type and amounts of discharge
    dfDischargeMethod = dfs[6].copy(deep=True)
    dfDischargeMethod.columns = ('Permit','Type','AvgDaily','DaysUsed','Capacity_MGD')
    dfDischargeMethod.drop(0,inplace=True)
    dfDischargeMethod.insert(0,'SiteID',siteID)
    dfDischargeMethod.insert(1,'Year',year)
    dfDischargeMethod.insert(2,'FacilityType',facility_type)
    dfDischargeMethod.insert(3,'County',county)
    dfDischargeMethod.insert(4,'Subbasin',subbasin)
    #--TRANSFER INFO------------------------------
    #Information on source and destination of transfers
    dfTransferDescription = dfs[7].copy(deep=True)
    dfTransferDescription.columns = ('Description','SourceBasin','ReceivingBasin','Capacity')
    dfTransferDescription.drop(0,inplace=True)
    dfTransferDescription.insert(0,'SiteID',siteID)
    dfTransferDescription.insert(1,'Year',year)
    dfTransferDescription.insert(2,'FacilityType',facility_type)
    dfTransferDescription.insert(3,'County',county)
    dfTransferDescription.insert(4,'Subbasin',subbasin)
    #-WRITE DATA TO OUTPUT FILES------------------
    outCSV1 = "NCDEQ/MonthlyVolumeData.csv"
    outCSV2 = "NCDEQ/WithdrawalSourceData.csv"
    outCSV3 = "NCDEQ/DischargeMethods.csv"
    outCSV4 = "NCDEQ/TransferInfo.csv"
    #First call creates fresh files with header rows; later calls append
    #without headers. to_csv handles the file mode directly, so no explicit
    #open() is needed.
    mode = 'w' if first else 'a'
    dfSiteData.to_csv(outCSV1, mode=mode, header=first, index=False)
    dfSource.to_csv(outCSV2, mode=mode, header=first, index=False)
    dfDischargeMethod.to_csv(outCSV3, mode=mode, header=first, index=False)
    dfTransferDescription.to_csv(outCSV4, mode=mode, header=first, index=False)
    return (outCSV1, outCSV2, outCSV3, outCSV4)
#Tracks whether the next ScrapeSite call should create fresh output files
firstFile = True
#Scrape every site's data across the report years
for index, row in dfSites.iterrows():
    #Skip draft entries
    if row['Status'] == 'Draft':
        continue
    #Pull the site code, then scrape each year from 2010 through 2017
    siteID = row['Code']
    print(index, siteID, end=': ')
    for year in range(2010, 2018):
        print(year, end='...')
        outFiles = ScrapeSite(siteID, year, first=firstFile)
        #Everything after the very first write appends to the same files
        firstFile = False
    print()