#!/usr/bin/env python
# coding: utf-8
"""Survey the channel (variable) names declared in the DFO CTD cast files.

The script walks the two directory levels below ``basepath``, opens every
entry found there whose name does not end in ``jpg``, and collects the names
listed in each file's ``$TABLE: CHANNELS`` section.  The collected set is then
compared with ``choosevars`` so that names in ``choosevars`` that never occur
in the files can be spotted.

The scan runs only when the file is executed as a script; importing it merely
defines the helpers below.
"""

import re
import os
import glob

basepath = '/ocean/shared/SalishSeaCastData/DFO/CTD/'
# note: if further files requests are added, see createDBfromDFO_OPDB.py
# for how to manage multiple directories

# Names that the collected channel names are checked against at the end.
choosevars = {'Fluorescence_URU_Wetlabs', 'Oxygen_Dissolved_SBE',
              'Speed_Sound', 'PAR1', 'Conductivity_Primary',
              'Temperature_Secondary', 'Depth', 'Salinity_T1_C1',
              'Conductivity_Secondary', 'Transmissivity', 'PAR_Reference',
              'Temperature_Primary', 'Salinity_T0_C0', 'Conductivity',
              'Salinity', 'Number_of_bin_records', 'pH_SBE_Nominal', 'PAR_1',
              'Pressure', 'Fluorescence_URU_Seapoint', 'Temperature',
              'Conductance_Specific', 'Density', 'PAR'}

# Patterns applied to every header line, compiled once.
_QUOTED = re.compile(r"'.*?'")  # .*? is non-greedy: shortest quoted span
_FIELD_SEP = re.compile(r'\s* \s*')  # whitespace run containing a space
_LEADING_DIGIT = re.compile(r'[0-9]')
# A '.' directly followed by a digit.  The original pattern also carried an
# optional (starred) lookbehind, which never constrained the match.
_DECIMAL_POINT = re.compile(r'\.(?=[0-9])')


def list_subdirs(parent):
    """Return the full paths of the non-hidden subdirectories of *parent*.

    Entries whose name starts with '.' are skipped.  The result is in
    ``os.listdir`` order (i.e. unsorted).
    """
    return [os.path.join(parent, name) for name in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, name))
            and not name.startswith('.')]


def collect_filenames(dirs):
    """Return the sorted full paths of the entries inside *dirs*.

    Names ending in ``jpg`` are skipped.  When the same base name occurs in
    more than one directory, only the one from the earliest directory in
    *dirs* is kept.
    """
    # left over from nutrients version where multiple requests led to
    # overlap; retain for future use
    seen = set()
    paths = []
    for cdirpath in dirs:
        entries = os.listdir(cdirpath)
        paths.extend(os.path.join(cdirpath, f) for f in entries
                     if f not in seen and not f.endswith('jpg'))
        # Every entry counts as seen, including the skipped ones.
        seen.update(entries)
    paths.sort()
    return paths


def clean_var_name(raw):
    """Turn a raw channel name into a name usable as a column name.

    Applied in this order: a '.' directly followed by a digit becomes
    'point'; '-' is dropped; ':' becomes '_'; '>' becomes 'gt'; '<' becomes
    'lt'; finally any remaining "'" and '.' are dropped.
    """
    cvar = _DECIMAL_POINT.sub('point', raw)  # decimal points -> point
    cvar = cvar.replace('-', '')    # remove - from column names
    cvar = cvar.replace(':', '_')   # replace : with _
    cvar = cvar.replace('>', 'gt')  # replace > with gt
    cvar = cvar.replace('<', 'lt')  # replace < with lt
    return cvar.replace("'", '').replace('.', '')


def iter_channels(lines):
    """Yield ``(number, name, units)`` for each row of the CHANNELS table.

    *lines* is any iterable of text lines, e.g. an open file.  Rows are the
    lines between a line containing ``$TABLE: CHANNELS`` and the next line
    containing ``$END`` whose first field starts with a digit.  The second
    field is the channel name (passed through ``clean_var_name``) and the
    third field is the units string.

    Raises ``IndexError`` if such a row has fewer than three fields and
    ``ValueError`` if its first field is not an integer.
    """
    names = {}  # channel number -> cleaned name, used to detect duplicates
    in_channels = False
    for line in lines:
        if in_channels:
            if '$END' in line:
                in_channels = False
            else:
                # remove spaces from items in quotes so that they survive
                # the split on spaces below
                line = _QUOTED.sub(
                    lambda m: m.group(0).replace(' ', '_'), line)
                fields = _FIELD_SEP.split(line.strip())
                if _LEADING_DIGIT.match(fields[0]):
                    varnum = int(fields[0])
                    base = clean_var_name(fields[1])
                    units = fields[2].strip()
                    # some files have multiple variables of same name
                    # (eg Oxygen:Dissolved:SBE); append _1, _2, ... to
                    # keep the names unique within the file
                    name = base
                    suffix = 1
                    while name in names.values():
                        name = base + '_' + str(suffix)
                        suffix += 1
                    names[varnum] = name
                    yield varnum, name, units
        if '$TABLE: CHANNELS' in line:
            in_channels = True
        if '*END OF HEADER' in line:
            # Channel tables are only expected in the header, so stop here
            # instead of scanning the data rows that follow.
            break


if __name__ == '__main__':
    dirs0 = list_subdirs(basepath)
    dirs1 = sorted(d for ireq in dirs0 for d in list_subdirs(ireq))

    # create full list of filenames
    filenames = collect_filenames(dirs1)  # contains full paths

    print('directories to be processed:\n' + repr(dirs1) + '\n\n')

    # create empty set to store variable names
    # There were multiple units for some variables, so better to record the
    # units with the data than in one global units dictionary.
    varlist = set()
    varNames = {}
    varUnits = {}

    # loop through files to get all variable names and units:
    for ifile in filenames:
        varNames = {}
        varUnits = {}
        with open(ifile, 'rt', encoding="ISO-8859-1") as f:
            for varnum, cvar, cunits in iter_channels(f):
                varNames[varnum] = cvar
                varUnits[varnum] = cunits
                varlist.add(cvar)

    # names and units of the last file processed
    print(varNames)
    print(varUnits)

    print(varlist)

    # entries of choosevars that were not found in any file
    print(choosevars - varlist)

    if filenames:
        # path of the last file relative to basepath; basepath is escaped
        # and followed by a group so that group(1) exists
        relpath = re.search(re.escape(basepath) + '(.*)', ifile).group(1)
        if re.match('.*jpg$', ifile):
            print('yes')