#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# SQLAlchemy interfaces between SQL databases and Python with a syntax that I
# found relatively easy to adapt to.
from sqlalchemy import (create_engine, case, Column, String, Boolean, Integer,
                        Float, Numeric, MetaData, Table, type_coerce, ForeignKey)
from sqlalchemy.orm import mapper, create_session, relationship, aliased
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.sql import and_, or_, not_, func
import sqlalchemy.types as types
import re
import os
import glob
import numpy as np
import pandas as pd


# In[ ]:

# parser state flags: which section of an input file is currently being read
fsw={k: False for k in ('infile','invars','indetail','inadmin','inloc','indata','detformat')}


# In[ ]:

fsw


# In[ ]:

basepath='/ocean/shared/SalishSeaCastData/DFO/BOT/'
basedir=basepath
dbname='DFO_OcProfDB'


# In[ ]:

# CTD version: list request directories under basepath, skipping hidden entries
dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath)
       if (os.path.isdir(basepath+x) and not re.match(r'^\.', x))]
dirs1=list()
for ireq in dirs0:
    dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq)
                 if (os.path.isdir(os.path.join(ireq,x)) and not re.match(r'^\.', x))]
dirs1.sort(reverse=True)
# create full list of filenames, skipping jpg images
filenames1=list()
bnamesAll=set()
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath)
                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]
    # deduplication on basename is left over from the nutrients version, where
    # multiple requests led to overlap; retain for future use
    bnamesAll=bnamesAll.union(set(os.listdir(cdirpath)))
filenames1.sort()
filenamesCTD=filenames1  # contains full paths


# In[ ]:

dirs1


# In[ ]:

basepath='/ocean/eolson/MEOPAR/obs/'
basedir=basepath + 'DFOOPDB/'
basedir2=basepath + 'DFOOPDB2/'
basedir3=basepath + 'DFOOPDB3/'
basedir4=basepath + 'DFOOPDB4/'
dbname='DFO_OcProfDB'
fout=open(basedir+'createDBfromDFO_OPDB_log.txt','w')
ferr=open(basedir+'createDBfromDFO_OPDB_errors.txt','w')
fout.write('Files processed:\n')
dirs1=[os.path.join(basedir,x) for x in os.listdir(basedir)
       if (os.path.isdir(basedir+x) and not re.match(r'^\.', x))]
dirs1.sort()
dirs2=[os.path.join(basedir2,x) for x in os.listdir(basedir2)
       if (os.path.isdir(basedir2+x) and not re.match(r'^\.', x))]
dirs2.sort(reverse=True)
dirs3=[os.path.join(basedir3,x) for x in os.listdir(basedir3)
       if (os.path.isdir(basedir3+x) and not re.match(r'^\.', x))]
dirs3.sort()
dirs4=[os.path.join(basedir4,x) for x in os.listdir(basedir4)
       if (os.path.isdir(basedir4+x) and not re.match(r'^\.', x))]
dirs4.sort()
# create full list of filenames
filenames1=list()
filenames2=list()
filenames3=list()
filenames4=list()
bnamesAll=list()
# most files of the same name present in both DFOOPDB2 and DFOOPDB are identical,
# but a few appear to have been updated; therefore use the newer version (DFOOPDB2).
# some files are in both 'eo UBC March 27, 2017' and 'eo UBC Salish March 27, 2017';
# those duplicates are identical.
for cdirpath in dirs4:
    filenames4=filenames4+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs3:
    filenames3=filenames3+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs2:
    filenames2=filenames2+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
filenames1.sort()
filenames2.sort()
filenames3.sort()
filenames4.sort()
filenames=filenames1+filenames2+filenames3+filenames4  # contains full paths
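
# In[ ]:

# The four loops above all apply the same rule: walk request directories in
# priority order and keep only the first file seen for each basename. A minimal
# sketch of that rule as a reusable helper; the name collect_unique_files and
# the optional skip pattern are illustrative assumptions, not part of the
# original code (the dirs4..dirs1 loops above apply no jpg filter):
import os
import re

def collect_unique_files(dir_groups, skip=None):
    """Return per-group lists of full paths, deduplicated on basename.

    dir_groups: sequence of directory-path lists, highest priority first.
    skip: optional regex; matching basenames are excluded from the output
          but still block later duplicates, as in the CTD-version loop above.
    """
    seen=set()
    out=list()
    for dirs in dir_groups:
        group=list()
        for cdirpath in dirs:
            for f in sorted(os.listdir(cdirpath)):
                if f not in seen and (skip is None or not re.match(skip,f)):
                    group.append(os.path.join(cdirpath,f))
                seen.add(f)
        group.sort()
        out.append(group)
    return out

# e.g., filenames4,filenames3,filenames2,filenames1 = \
#           collect_unique_files([dirs4,dirs3,dirs2,dirs1])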
# In[ ]:

dirs2


# In[ ]:

len(filenamesCTD)


# In[ ]:

len(filenames)


# In[ ]:

filenamesCTD[0],filenames[0]


# In[ ]:

re.split('/',filenames[0])[-1]


# In[ ]:

# check that the two file lists agree basename by basename
for i,j in zip(filenamesCTD,filenames):
    if not re.split('/',i)[-1]==re.split('/',j)[-1]:
        print(i,'\n',j)


# In[ ]:

engine = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB.sqlite', echo=False)
md=MetaData()
md.reflect(engine)


# In[ ]:

obsCols=[el.name for el in md.tables['ObsTBL'].columns]


# In[ ]:

Base = automap_base()
Base.prepare(engine, reflect=True)
StationTBL=Base.classes.StationTBL
ObsTBL=Base.classes.ObsTBL
session = create_session(bind=engine, autocommit=False, autoflush=True)


# In[ ]:

engine2 = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB2.sqlite', echo=False)
md2=MetaData()
md2.reflect(engine2)
Base2 = automap_base()
Base2.prepare(engine2, reflect=True)
StationTBL2=Base2.classes.StationTBL
ObsTBL2=Base2.classes.ObsTBL
session2 = create_session(bind=engine2, autocommit=False, autoflush=True)


# In[ ]:

obsCols2=[el.name for el in md2.tables['ObsTBL'].columns]


# In[ ]:

df=pd.DataFrame(session.execute('select * from ObsTBL'),columns=obsCols)


# In[ ]:

df


# In[ ]:

df2=pd.DataFrame(session2.execute('select * from ObsTBL'),columns=obsCols2)


# In[ ]:

df2


# In[ ]:

# print any entries that differ between the two databases
for icol in df.keys():
    for i,j in zip(df[icol],df2[icol]):
        if not str(i)==str(j):
            print(icol,i,j)


# In[ ]:

basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'
basedir=basepath + 'req20181116/'
dbname='DFO_CTD'
# if more paths are added later (additional data requests), see
# createDBfromDFO_OPDB.py for how to add them
fout=open(basepath+'createDBfromDFO_CTD_log.txt','w')
ferr=open(basepath+'createDBfromDFO_CTD_errors.txt','w')
fout.write('Files processed:\n')
dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath)
       if (os.path.isdir(basepath+x) and not re.match(r'^\.', x))]
dirs1=list()  # populated from dirs0 in a later cell
# create full list of filenames, skipping jpg images
filenames1=list()
bnamesAll=set()
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath)
                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]
    # deduplication on basename is left over from the nutrients version, where
    # multiple requests led to overlap; retain for future use
    bnamesAll=bnamesAll.union(set(os.listdir(cdirpath)))
filenames1.sort()
filenames=filenames1  # contains full paths


# In[ ]:

dirs0


# In[ ]:

for ireq in dirs0:
    dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq)
                 if (os.path.isdir(os.path.join(ireq,x)) and not re.match(r'^\.', x))]
dirs1.sort()


# In[ ]:

dirs1


# In[ ]:

# rebuild the file list now that dirs1 is populated
filenames1=list()
bnamesAll=list()
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath)
                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]
    # left over from the nutrients version, where multiple requests led to
    # overlap; retain for future use
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
filenames1.sort()
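
# In[ ]:

# The engine/metadata/automap/session boilerplate is repeated for each database
# in this notebook (DFO_OcProfDB, DFO_OcProfDB2 above, and DFO_CTD below). A
# minimal sketch of it as a helper, using only the SQLAlchemy 1.x calls already
# used here; the name reflect_db is an illustrative assumption:
from sqlalchemy import create_engine
from sqlalchemy.orm import create_session
from sqlalchemy.ext.automap import automap_base

def reflect_db(path):
    """Reflect an existing SQLite database; return (engine, session, Base).

    Mapped classes are then available as Base.classes.<TableName>; automap
    only maps tables that have primary keys.
    """
    engine=create_engine('sqlite:///'+path, echo=False)
    Base=automap_base()
    Base.prepare(engine, reflect=True)
    session=create_session(bind=engine, autocommit=False, autoflush=True)
    return engine, session, Base

# e.g.:
# engine2, session2, Base2 = reflect_db('/ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB2.sqlite')
# ObsTBL2=Base2.classes.ObsTBL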
# In[ ]:

len(filenames1)


# In[ ]:

filenames[-1]


# In[ ]:

# test parsing of the transmissometer calibration XML embedded in Sea-Bird
# header lines (header lines begin with '# ')
testline='#     <PathLength>0.250</PathLength>'


# In[ ]:

f=re.search('(?<=PathLength>).*(?=</PathLength)',testline)
f


# In[ ]:

XmissPatterns={'SerialNumber':re.compile('(?<=SerialNumber>).*(?=</SerialNumber)'),
               'CalibrationDate':re.compile('(?<=CalibrationDate>).*(?=</CalibrationDate)'),
               'M':re.compile('(?<=M>).*(?=</M)'),
               'B':re.compile('(?<=B>).*(?=</B)'),
               'PathLength':re.compile('(?<=PathLength>).*(?=</PathLength)')}
# sample header lines for a WET Labs C-Star transmissometer calibration block
lines=('#   <WetLabsCStar>',
       '#     <SerialNumber>953DR</SerialNumber>',
       '#     <CalibrationDate>Aug 9th 2017</CalibrationDate>',
       '#     <M>19.2982</M>',
       '#     <B>-1.1000</B>',
       '#     <PathLength>0.250</PathLength>',
       '#   </WetLabsCStar>')


# In[ ]:

for line in lines:
    for key, pattern in XmissPatterns.items():
        print(line)
        match=pattern.search(line)
        if match is None:
            continue
        print(key,match[0])
        break


# In[ ]:

# same scan, written with a positive test instead of continue
for line in lines:
    for key, pattern in XmissPatterns.items():
        print(line)
        match=pattern.search(line)
        if match is not None:
            print(key,match[0])
            break


# In[ ]:

import sqlalchemy
from sqlalchemy import (create_engine, Column, String, Boolean, Integer, Float,
                        MetaData, Table, type_coerce, ForeignKey, case, update)
from sqlalchemy.orm import mapper, create_session, relationship, aliased, Session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.sql import select, and_, or_, not_, func
import sqlalchemy.types as types
import csv
import numbers
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import re
from os.path import isfile
import gsw
import pandas as pd


# In[ ]:

# definitions
basedir='/ocean/shared/SalishSeaCastData/DFO/CTD/'
dbname='DFO_CTD'
# if the db does not exist, exit
# first, set Include=False for all CastAway data and duplicates
engine0 = create_engine('sqlite:///' + basedir + dbname + '.sqlite', echo=False)
md=MetaData()
md.reflect(engine0)
Base0 = automap_base(metadata=md)
# reflect the tables in DFO_CTD.sqlite; mapped classes are created for the
# existing tables:
Base0.prepare()
StationTBL0=Base0.classes.StationTBL
AncillaryTBL0=Base0.classes.AncillaryTBL
ObsTBL0=Base0.classes.ObsTBL
session0 = create_session(bind=engine0, autocommit=False, autoflush=True)


# In[ ]:

temp=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,
                                 ObsTBL0.Pressure,ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,
                                 ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,
                                 ObsTBL0.PAR,ObsTBL0.PAR1,ObsTBL0.sourceFile).select_from(StationTBL0).\
                  join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\
                  filter(StationTBL0.ID==120).all())


# In[ ]:

test=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include).\
                  filter(StationTBL0.ID==120).all())


# In[ ]:

test


# In[ ]:

temp


# In[ ]:

temp2=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,
                                  ObsTBL0.Pressure,ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,
                                  ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,
                                  ObsTBL0.PAR,ObsTBL0.PAR1,ObsTBL0.sourceFile).select_from(StationTBL0).\
                   join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\
                   filter(StationTBL0.ID==135).all())


# In[ ]:

temp2


# In[ ]:

for el in np.unique(temp['sourceFile']):
    print(el)


# In[ ]:

len(temp)


# In[ ]:
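
# The temp and temp2 queries above differ only in the station ID. A minimal
# sketch factoring that out; profile_frame is an illustrative name, and the
# column list is the same one queried above:
import pandas as pd

def profile_frame(session, StationTBL, ObsTBL, station_id):
    """Return one station's joined Station/Obs rows as a DataFrame."""
    q=session.query(StationTBL.ID,StationTBL.Lat,StationTBL.Lon,StationTBL.Include,
                    ObsTBL.Pressure,ObsTBL.Salinity,ObsTBL.Salinity_T0_C0,
                    ObsTBL.Salinity_T1_C1,ObsTBL.Temperature_Primary,
                    ObsTBL.Temperature_Secondary,ObsTBL.Include,ObsTBL.PAR,
                    ObsTBL.PAR1,ObsTBL.sourceFile).select_from(StationTBL).\
        join(ObsTBL,ObsTBL.StationTBLID==StationTBL.ID).\
        filter(StationTBL.ID==station_id)
    return pd.DataFrame(q.all())

# e.g.:
# temp=profile_frame(session0,StationTBL0,ObsTBL0,120)
# temp2=profile_frame(session0,StationTBL0,ObsTBL0,135)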