Read SWUDS data workbook

In [1]:
#import modules
import sys, os, xlrd
import pandas as pd
import numpy as np
In [2]:
#Filenames
dataDir = '../../Data'
scratchDir = '../../Scratch'
xlFN = dataDir + os.sep + '/LouisianaWaterUse_Distribute.xlsx'
crosswalkFN = dataDir + os.sep + 'naics02tosic87.xls'
crosswalkFN = dataDir + os.sep + '2017_NAICS_to_ISIC_4.xlsx'
##https://www.census.gov/eos/www/naics/concordances/concordances.html
outFN = dataDir + os.sep + "LASsummary.xlsx"
outFNAll = dataDir + os.sep + "LAData.xlsx"
In [3]:
#Read the xlsx file in as a dataframe
dfRaw = pd.read_excel(xlFN,sheetname='data1')
dfCrosswalk = pd.read_excel(crosswalkFN)
In [4]:
dfCrosswalk.columns
Out[4]:
Index([u'Part of NAICS US', u'2017\nNAICS\nUS  ', u'2017 NAICS US TITLE',
       u'Part of ISIC', u'ISIC 4.0', u'ISIC Revision 4.0 Title',
       u'Notes:  link content based on NAICS definition, entire NAICS industry if blank'],
      dtype='object')
In [5]:
#Remove facility columns
dfRaw.drop(dfRaw.columns[3:5],axis=1,inplace=True)
In [6]:
#Join the NAICS codes to the data frame
dfAll = pd.merge(dfRaw,dfCrosswalk,how='left',left_on='SIC1',right_on='ISIC 4.0')
dfAll.columns
Out[6]:
Index([u'County Code', u'SIC1', u'Description', u'Water Use Type', u'MP ID',
       u'MP Type', u'MP Name', u'Source Water Type', u'Aquifer Code',
       u'Aquifer Name2', u'Aquifer Group ID', u'Aquifer Group Name', u'HUC-8',
       u'HUC-12', u'Latitude', u'Longitude', u'strMPLatitudeDD',
       u'strMPLongitudeDD', u'Year', u'Annual Amt. (MGD)', u'Part of NAICS US',
       u'2017\nNAICS\nUS  ', u'2017 NAICS US TITLE', u'Part of ISIC',
       u'ISIC 4.0', u'ISIC Revision 4.0 Title',
       u'Notes:  link content based on NAICS definition, entire NAICS industry if blank'],
      dtype='object')
In [7]:
#Select only 2000, 2005, and 2010 records and list the number of data for each column
dfSelect = dfAll[(dfAll.Year == 2000) | (dfAll.Year == 2005) | (dfAll.Year == 2010)]
dfSelect.to_excel(outFNAll,index=False)
In [8]:
#Pivot on year, listing usage (Annual Amt. MGD) by Source Water  Type and SCI1 Description
#pvt = dfSelect.pivot_table(columns='Year',index=('NAICSTEXT','Description'),values='Annual Amt. (MGD)',aggfunc='sum')
pvt = dfSelect.pivot_table(columns='Year',index=('2017\nNAICS\nUS  ','2017 NAICS US TITLE','ISIC 4.0','Description'),values='Annual Amt. (MGD)',aggfunc='sum')
pvt.to_excel(outFN)