import re
import os
import glob
basepath = '/ocean/shared/SalishSeaCastData/DFO/CTD/'
# note: if further file requests are added, see createDBfromDFO_OPDB.py for
# how to manage multiple directories

# matches any name ending in "jpg"; such files are left out of the file list
_JPG_NAME = re.compile(r'.*jpg$')


def _visible_subdirs(parent):
    """Return full paths of the non-hidden subdirectories directly under *parent*.

    Entries whose name starts with '.' are skipped. Paths are built with
    os.path.join, so the result does not depend on *parent* ending in a
    path separator.
    """
    return [os.path.join(parent, x) for x in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, x)) and not x.startswith('.')]


# two directory levels are walked: the folders directly under basepath
# (dirs0), then the folders inside each of those (dirs1)
dirs0 = _visible_subdirs(basepath)
dirs1 = []
for ireq in dirs0:
    dirs1.extend(_visible_subdirs(ireq))
dirs1.sort()

# create full list of filenames
filenames1 = []
bnamesAll = []   # every base name seen so far, in listing order
_seen = set()    # same names as bnamesAll, kept as a set for fast lookup
for cdirpath in dirs1:
    entries = os.listdir(cdirpath)  # list each directory once
    # keep a file only if its base name was not already seen in an earlier
    # directory and it is not a jpg
    filenames1.extend(os.path.join(cdirpath, f) for f in entries
                      if f not in _seen and not _JPG_NAME.match(f))
    bnamesAll.extend(entries)
    _seen.update(entries)
    # the base-name check is left over from the nutrients version, where
    # multiple requests led to overlap; retained for future use
filenames1.sort()
filenames = filenames1  # contains full paths
print('directories to be processed:\n' + repr(dirs1) + '\n\n')
directories to be processed: ['/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2014 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2015 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data a)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data b)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2017 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2018 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD1', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 A', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 B', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 C']
# scratch check: `in` applied to dict.values() tests membership among the values
test = dict(enumerate(('car', 'elephant')))
('car' in test.values(), 'horse' in test.values())
(True, False)
# Scan the header of every file in `filenames` and collect the channel
# (variable) names it declares.
#   varlist  - set of cleaned channel names over all files
#   varNames - {channel number: cleaned name}, rebuilt for each file
#   varUnits - {channel number: units string}, rebuilt for each file
# Units are not pooled into a single dictionary: there were multiple units
# for some variables, so it is better to record the units with the data.
varlist = set()


def _clean_channel_name(cvar):
    """Return *cvar* rewritten so it can be used as a column name.

    The substitutions are applied in this order: a '.' directly followed by
    a digit becomes 'point', '-' is removed, ':' becomes '_', '>' becomes
    'gt', '<' becomes 'lt', and any remaining "'" or '.' is removed.
    """
    cvar = re.sub(r'\.(?=[0-9])', 'point', cvar)  # decimal points -> point
    cvar = cvar.replace('-', '')    # remove - from column names
    cvar = cvar.replace(':', '_')   # replace : with _
    cvar = cvar.replace('>', 'gt')  # replace > with gt
    cvar = cvar.replace('<', 'lt')  # replace < with lt
    cvar = re.sub(r"['.]", '', cvar)  # remove special characters (' and .)
    return cvar


# loop through the files to get all variable names and units:
for ifile in filenames:
    varNames = {}
    varUnits = {}
    with open(ifile, 'rt', encoding="ISO-8859-1") as f:
        # flags recording which header section the current line belongs to;
        # a flag is switched on by the marker line at the bottom of the loop
        # body, so it first takes effect on the line after the marker
        infile = False
        invars = False
        indetail = False
        inadmin = False
        inloc = False
        indata = False
        detformat = False
        for line in f:
            if infile:
                # lines read from a file keep their newline, so a blank line
                # has to be detected on the stripped text
                if re.match(r'\s*\$', line) or not line.strip():
                    infile = False
            if invars:
                if re.search(r'\$END', line):
                    invars = False
                else:
                    # replace spaces inside quoted items with underscores so
                    # each quoted item stays one field when the line is split
                    # ('.*?' is the non-greedy, shortest match)
                    for expr in re.findall("'.*?'", line):
                        line = line.replace(expr, expr.replace(' ', '_'))
                    splitline = re.split(r'\s* \s*', line.strip())
                    # only rows whose first field starts with a digit are
                    # channel definitions: number, name, units
                    if re.match('[0-9]', splitline[0]):
                        varnum = int(splitline[0])
                        cvar = _clean_channel_name(splitline[1])
                        cunits = splitline[2].strip()
                        # some files have multiple variables of same name
                        # (eg Oxygen:Dissolved:SBE); append _1, _2, ... until
                        # the name is unique within this file
                        cvarbase = cvar
                        xx = 1
                        while cvar in varNames.values():
                            cvar = cvarbase + '_' + str(xx)
                            xx = xx + 1
                        varNames[varnum] = cvar
                        varUnits[varnum] = cunits
                        varlist.add(cvar)
            elif indetail:
                detcount += 1
                if re.search(r'\$END', line):
                    indetail = False
                elif (detcount == 1
                      and re.match(r'\s*\!\s*No\s*Pad\s*Start\s*Width', line)):
                    detformat = True
            elif inadmin:
                if not line.strip():
                    inadmin = False
            elif inloc:
                if not line.strip():
                    inloc = False
            # a line made only of '!' followed by dashes and spaces also sets
            # detformat
            # NOTE(review): indentation was lost in this copy of the code, so
            # it is unclear whether this check belonged inside the location
            # branch; it is placed at loop level here - confirm. detformat is
            # not read anywhere in this loop, so the collected names and
            # units do not depend on the choice.
            if re.match(r'![- ]*$', line):
                detformat = True
            # section markers: update the flags for the following lines
            if re.search(r'\*FILE', line):
                infile = True
            if re.search(r'\$TABLE\: CHANNELS', line):
                invars = True
            if re.search(r'\$TABLE\: CHANNEL DETAIL', line):
                indetail = True
                detcount = 0
            if re.search(r'\*ADMINISTRATION', line):
                inadmin = True
            if re.search(r'\*LOCATION', line):
                inloc = True
                inadmin = False
            if re.search(r'\*END OF HEADER', line):
                indata = True
                inloc = False
            if re.search(r'\$END', line):
                inloc = False
# varNames is re-created for every file in the loop above, so this shows the
# channel numbers and names of the last file processed only
print(varNames)
{1: 'Pressure', 2: 'Temperature_Primary', 3: 'Salinity_T0_C0', 4: 'Number_of_bin_records'}
# varUnits is re-created for every file as well: units of the last file only,
# keyed by the same channel numbers as varNames
print(varUnits)
{1: 'decibar', 2: "'deg_C_(ITS90)'", 3: 'PSS-78', 4: 'n/a'}
# varlist is created once before the file loop and added to for every channel
# row, so it holds the cleaned names found across all files
print(varlist)
{'pH_SBE_Nominal', 'Salinity_T1_C1', 'PAR', 'Oxygen_Dissolved_SBE', 'Temperature', 'Oxygen_Dissolved_Satuation_RBR', 'Number_of_bin_records_1', 'Speed_Sound', 'Turbidity_Wetlabs', 'Number_of_bin_records', 'Conductivity', 'Date', 'Temperature_Primary', 'Transmissivity', 'Salinity_T0_C0', 'Conductance_Specific', 'Record_Number', 'Oxygen_Dissolved_Saturation', 'Oxygen_Dissolved_Saturation_RBR', 'Pressure', 'Conductivity_Primary', 'Time', 'Fluorescence_URU', 'Fluorescence_URU_Seapoint', 'PAR1', 'Depth', 'Transmissivity2', 'Transmissivity_Green', 'PAR_1', 'PAR_Reference', 'Conductivity_Secondary', 'Fluorescence_URU_Wetlabs', 'Salinity', 'Density', 'Oxygen_Dissolved_SBE_1', 'Temperature_Secondary'}
# hand-picked subset of channel names, listed alphabetically
choosevars = {
    'Conductance_Specific',
    'Conductivity',
    'Conductivity_Primary',
    'Conductivity_Secondary',
    'Density',
    'Depth',
    'Fluorescence_URU_Seapoint',
    'Fluorescence_URU_Wetlabs',
    'Number_of_bin_records',
    'Oxygen_Dissolved_SBE',
    'PAR',
    'PAR1',
    'PAR_1',
    'PAR_Reference',
    'pH_SBE_Nominal',
    'Pressure',
    'Salinity',
    'Salinity_T0_C0',
    'Salinity_T1_C1',
    'Speed_Sound',
    'Temperature',
    'Temperature_Primary',
    'Temperature_Secondary',
    'Transmissivity',
}
# any chosen name missing from the names found in the files is printed here;
# an empty set means every chosen name was found
print(choosevars.difference(varlist))
set()
# ifile is the loop variable of the file loop above; evaluating it displays
# the last path that loop assigned
ifile
'/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2/lbb_20190123_20190810_0336m_L2.ctd'
# the pattern here is basepath alone, with no capture group, so group(1)
# raises IndexError (traceback recorded below); the following cell appends
# '(.*)' to the pattern to capture the part of the path after basepath
test=re.search(basepath,ifile).group(1)
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-13-e98ae1fb2f39> in <module>() ----> 1 test=re.search(basepath,ifile).group(1) IndexError: no such group
# capture everything in ifile that follows basepath (basepath is used as a
# regular expression here, not as a literal prefix)
re.compile(basepath + '(.*)').search(ifile).group(1)
'req20181116/EO UBC November 16, 2018 (2018 data)/2018_map.jpg'
# print 'yes' when the path in ifile matches the "ends in jpg" pattern
if re.match('.*jpg$', ifile) is not None:
    print('yes')
yes