from sqlalchemy import (create_engine, case, Column, String, Boolean, Integer, Float,
                        Numeric, MetaData, Table, type_coerce, ForeignKey)
from sqlalchemy.orm import mapper, create_session, relationship, aliased
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.sql import and_, or_, not_, func
import sqlalchemy.types as types
# SQLAlchemy provides the interface between SQL databases and Python; I found its syntax relatively easy to adapt to
import re
import os
import glob
import numpy as np
import pandas as pd
fsw={k:False for k in ('infile','invars','indetail','inadmin','inloc','indata','detformat')}
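# parser-state switches, apparently tracking which section of a source file is being read
# (file header, variables, detail, administration, location, data); all start False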
fsw
basepath='/ocean/shared/SalishSeaCastData/DFO/BOT/'
basedir=basepath
dbname='DFO_OcProfDB'
#ctd version:
dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match(r'^\.', x))]
dirs1=list()
for ireq in dirs0:
    dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \
                 if (os.path.isdir(os.path.join(ireq,x)) and not re.match(r'^\.', x))]
dirs1.sort(reverse=True)
# create full list of filenames
filenames1=list()
bnamesAll=set()
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \
                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]
    bnamesAll=bnamesAll.union(set(os.listdir(cdirpath)))
# left over from nutrients version where multiple requests led to overlap; retain for future use
filenames1.sort()
filenamesCTD=filenames1 #contains full paths
dirs1
basepath='/ocean/eolson/MEOPAR/obs/'
basedir=basepath + 'DFOOPDB/'
basedir2=basepath + 'DFOOPDB2/'
basedir3=basepath + 'DFOOPDB3/'
basedir4=basepath + 'DFOOPDB4/'
dbname='DFO_OcProfDB'
fout=open(basedir+'createDBfromDFO_OPDB_log.txt','w')
ferr=open(basedir+'createDBfromDFO_OPDB_errors.txt','w')
fout.write('Files processed:\n')
dirs1=[os.path.join(basedir,x) for x in os.listdir(basedir) if (os.path.isdir(basedir+x) \
        and not re.match(r'^\.', x))]
dirs1.sort()
dirs2=[os.path.join(basedir2,x) for x in os.listdir(basedir2) if (os.path.isdir(basedir2+x) \
        and not re.match(r'^\.', x))]
dirs2.sort(reverse=True)
dirs3=[os.path.join(basedir3,x) for x in os.listdir(basedir3) if (os.path.isdir(basedir3+x) \
        and not re.match(r'^\.', x))]
dirs3.sort()
dirs4=[os.path.join(basedir4,x) for x in os.listdir(basedir4) if (os.path.isdir(basedir4+x) \
        and not re.match(r'^\.', x))]
dirs4.sort()
# create full list of filenames
filenames1=list()
filenames2=list()
filenames3=list()
filenames4=list()
bnamesAll=list() # most files of the same name in DFOOPDB2 and DFOOPDB are identical, but a few appear to have been updated,
                 # so use the newer version (DFOOPDB2)
for cdirpath in dirs4:
    filenames4=filenames4+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    # some files are in both 'eo UBC March 27, 2017' and 'eo UBC Salish March 27, 2017'; the duplicates are identical
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs3:
    filenames3=filenames3+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs2:
    filenames2=filenames2+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]
filenames1.sort()
filenames2.sort()
filenames3.sort()
filenames4.sort()
filenames=filenames1+filenames2+filenames3+filenames4 #contains full paths
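# precedence: dirs4 was scanned first, so a basename found there shadows same-named files in dirs3, dirs2, and dirs1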
dirs2
len(filenamesCTD)
len(filenames)
filenamesCTD[0],filenames[0]
re.split('/',filenames[0])[-1]
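# check that the CTD-request and bottle-request file lists line up by basename
# (re.split('/',p)[-1] is equivalent to os.path.basename(p) here)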
for i,j in zip(filenamesCTD,filenames):
    if not re.split('/',i)[-1]==re.split('/',j)[-1]:
        print(i,'\n',j)
engine = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB.sqlite', echo = False)
md=MetaData()
md.reflect(engine)
obsCols=[el.name for el in md.tables['ObsTBL'].columns]
# column names for the second database are read below, after md2 is reflected
Base = automap_base()
Base.prepare(engine, reflect=True)
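# note: reflect=True is the pre-1.4 automap API; on SQLAlchemy >= 1.4 this would be
# Base.prepare(autoload_with=engine) (the same applies to Base2.prepare below)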
StationTBL=Base.classes.StationTBL
ObsTBL=Base.classes.ObsTBL
session = create_session(bind = engine, autocommit = False, autoflush = True)
engine2 = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB2.sqlite', echo = False)
md2=MetaData()
md2.reflect(engine2)
obsCols2=[el.name for el in md2.tables['ObsTBL'].columns]
Base2 = automap_base()
Base2.prepare(engine2, reflect=True)
StationTBL2=Base2.classes.StationTBL
ObsTBL2=Base2.classes.ObsTBL
session2 = create_session(bind = engine2, autocommit = False, autoflush = True)
df=pd.DataFrame(session.execute('select * from ObsTBL'),columns=obsCols)
df
df2=pd.DataFrame(session2.execute('select * from ObsTBL'),columns=obsCols2)
df2
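# note: passing a raw SQL string to execute() works on older SQLAlchemy; on >= 1.4 the
# statement should be wrapped, e.g. session.execute(text('select * from ObsTBL'))
# with text imported from sqlalchemy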
for icol in df.keys():
    for i,j in zip(df[icol],df2[icol]):
        if not str(i)==str(j):
            print(icol,i,j)
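# any printed rows mark entries that differ between the two database versions; no output means the tables match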
basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'
basedir=basepath + 'req20181116/'
dbname='DFO_CTD'
# if more paths added later (additional data requests) see createDBfromDFO_OPDB.py for how to add
fout=open(basepath+'createDBfromDFO_CTD_log.txt','w')
ferr=open(basepath+'createDBfromDFO_CTD_errors.txt','w')
fout.write('Files processed:\n')
dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match(r'^\.', x))]
dirs0
dirs1=list()
for ireq in dirs0:
    dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \
                 if (os.path.isdir(os.path.join(ireq,x)) and not re.match(r'^\.', x))]
dirs1.sort()
dirs1
# create full list of filenames
filenames1=list()
bnamesAll=set()
for cdirpath in dirs1:
    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \
                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]
    bnamesAll=bnamesAll.union(set(os.listdir(cdirpath)))
# bnamesAll dedup is left over from nutrients version where multiple requests led to overlap; retain for future use
filenames1.sort()
filenames=filenames1 #contains full paths
len(filenames1)
filenames[-1]
testline=' # <PathLength>0.250</PathLength>'
f=re.search('(?<=PathLength>).*(?=</PathLength)',testline)
f
f[0]
if f:
    y=f[0]
y
testline=' # <!-- A/D voltage 2, Transmissometer, WET Labs C-Star -->'
f=re.search('Transmissometer,',testline)
f
XmissPatterns={'xmiss_Serial':re.compile('(?<=SerialNumber>).*(?=</SerialNumber)'),
'xmiss_CalDate':re.compile('(?<=CalibrationDate>).*(?=</CalibrationDate)'),
'xmiss_m':re.compile('(?<=M>).*(?=</M)'),
'xmiss_b':re.compile('(?<=B>).*(?=</B)'),
'xmiss_PathLen':re.compile('(?<=PathLength>).*(?=</PathLength)')}
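# each pattern pairs a lookbehind with a lookahead so that match[0] is just the tag contents, e.g.
# XmissPatterns['xmiss_PathLen'].search('<PathLength>0.250</PathLength>')[0] returns '0.250'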
lines=(' # <!-- A/D voltage 2, Transmissometer, WET Labs C-Star -->',
' # <WET_LabsCStar SensorID="71" >',
' # <SerialNumber>953DR</SerialNumber>',
' # <CalibrationDate>Aug 9th 2017</CalibrationDate>',
' # <M>19.2982</M>',
' # <B>-1.1000</B>',
' # <PathLength>0.250</PathLength>',
' # </WET_LabsCStar>')
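# sample transmissometer calibration block from a CTD file header, used to test the patterns above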
for line in lines:
    for key, pattern in XmissPatterns.items():
        print(line)
        match=pattern.search(line)
        if match is None:
            continue
        print(key,match[0])
        break
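# equivalent scan written with a positive test instead of continue: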
for line in lines:
    for key, pattern in XmissPatterns.items():
        print(line)
        match=pattern.search(line)
        if match is not None:
            print(key,match[0])
            break
import sqlalchemy
from sqlalchemy import (create_engine, case, update, Column, String, Boolean, Integer,
                        Float, MetaData, Table, type_coerce, ForeignKey)
from sqlalchemy.orm import mapper, create_session, relationship, aliased, Session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.sql import and_, or_, not_, func, select
import sqlalchemy.types as types
import csv
import numbers
import os
import glob
import re
from os.path import isfile
import gsw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# definitions
basedir='/ocean/shared/SalishSeaCastData/DFO/CTD/'
dbname='DFO_CTD'
# if db does not exist, exit
# First, set Include=False for all CastAway data and duplicates
engine0 = create_engine('sqlite:///' + basedir + dbname + '.sqlite', echo = False)
md=MetaData()
md.reflect(engine0)
Base0 = automap_base(metadata=md)
# reflect the tables in DFO_CTD.sqlite:
Base0.prepare()
# mapped classes have been created
# existing tables:
StationTBL0=Base0.classes.StationTBL
AncillaryTBL0=Base0.classes.AncillaryTBL
ObsTBL0=Base0.classes.ObsTBL
session0 = create_session(bind = engine0, autocommit = False, autoflush = True)
temp=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,
ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,
ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,
ObsTBL0.sourceFile).select_from(StationTBL0).\
join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\
filter(StationTBL0.ID==120).all())
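# temp pulls one cast (station ID 120) joined to its observations, to inspect the primary and secondary sensor channels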
test=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include).\
filter(StationTBL0.ID==120).all())
test
temp
temp2=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,
ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,
ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,
ObsTBL0.sourceFile).select_from(StationTBL0).\
join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\
filter(StationTBL0.ID==135).all())
temp2
for el in np.unique(temp['sourceFile']):
    print(el)
len(temp)