#!/usr/bin/env python
# coding: utf-8

# # Accessing DC2 data in PostgreSQL at NERSC part 2
#
# Owner: **Joanne Bogart [@jrbogart](https://github.com/LSSTDESC/DC2-analysis/issues/new?body=@jrbogart)**
# Last Verified to Run:
#
# This notebook introduces some additional features of the PostgreSQL database at NERSC.
#
# __Learning objectives__:
#
# After going through this notebook, you should be able to:
# 1. Discover which object catalogs are available
# 2. Query on native quantities in those catalogs
# 3. Make use of custom functions, in particular for area searches
#
# __Logistics__: This notebook is intended to be run through the JupyterHub NERSC interface available here: https://jupyter-dev.nersc.gov. To set up your NERSC environment, please follow the instructions available here: https://confluence.slac.stanford.edu/display/LSSTDESC/Using+Jupyter-dev+at+NERSC
#
# ### Prerequisites
# Please see the first notebook in this series for instructions on how to gain access to the database.

# In[ ]:

import psycopg2
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd

# Make the db connection

# In[ ]:

dbname = 'desc_dc2_drp'
dbuser = 'desc_dc2_drp_user'
dbhost = 'nerscdb03.nersc.gov'
dbconfig = {'dbname' : dbname, 'user' : dbuser, 'host' : dbhost}
dbconn = psycopg2.connect(**dbconfig)

# ## Finding Data
# Tables for the Run1.2i data, as well as a view that makes DPDD quantities more easily accessible, are in the `schema` (which acts like a namespace) `run12i`. To reference, say, a table called `position` for Run1.2i, use `run12i.position`.
#
# ### Finding Datasets
# To find out which datasets are available, and under what schema names, query the table `run_provenance`. It's in a special schema known as `public`, which normally does not need to be specified.

# In[ ]:

cols = ['schema_name', 'run_designation', 'simulation_program', 'db_ingest', 'remarks']
# Additional columns in run_provenance store software and input data versions
prov_query = 'SELECT ' + ','.join(cols) + ' from run_provenance'
with dbconn.cursor() as cursor:
    cursor.execute(prov_query)
    fmt = '{0!s:13} {1!s:16} {2!s:18} {3!s:12} {4!s}'
    print(fmt.format(cols[0], cols[1], cols[2], cols[3], cols[4]))
    for record in cursor:
        print(fmt.format(record[0], record[1], record[2], record[3], record[4]))

# Normally only datasets with `db_ingest` == 'complete' are of interest.

# In[ ]:

# Pick one of the supported datasets
schema = 'run12p_v4'

# ### Querying on Native Quantities
# Unlike DPDD quantities (all in a single view), native quantities are split across several tables. The first notebook in the PostgreSQL collection shows how to find out which tables belong to a schema and what columns a table has. Alternatively, if you know the name of a column you want, you can query for the name of the table containing it. The following looks for the table containing `ext_shapeHSM_HsmShapeRegauss_flag`.

# In[ ]:

column = 'ext_shapehsm_hsmshaperegauss_flag'
find_table_query = "select table_name from information_schema.columns where table_schema='{}' and column_name='{}'"
find_table_query = find_table_query.format(schema, column)
print(find_table_query)
with dbconn.cursor() as cursor:
    cursor.execute(find_table_query)
    for record in cursor:
        print(record[0])

# There is some necessary fussiness here:
# * `ext_shapeHSM_HsmShapeRegauss_flag` has been transformed to all lower-case in the query. This is required when querying information_schema, where the string is a __value__ in the database (not a column name).
# * Single quotes are used around literals like `run12p_v4`. Double quotes won't work.
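# As an aside, the same lookup can be done with psycopg2 parameter binding, which supplies the single quotes itself, so only the lower-casing has to be remembered. Below is a minimal sketch assuming only standard psycopg2 behavior; the helper function name is illustrative, not part of the database API.

# In[ ]:

# Hypothetical helper: find tables containing a column, using parameter binding.
# psycopg2 quotes the bound values itself. Note this works because table_schema
# and column_name here are values, not identifiers; identifiers cannot be bound.
def find_tables_for_column(conn, table_schema, column_name):
    q = ("select table_name from information_schema.columns "
         "where table_schema = %s and column_name = %s")
    with conn.cursor() as cursor:
        cursor.execute(q, (table_schema, column_name.lower()))
        return [r[0] for r in cursor]

print(find_tables_for_column(dbconn, schema, 'ext_shapeHSM_HsmShapeRegauss_flag'))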
# Now suppose we wanted to combine a cut on this quantity, which is in the table `dpdd_ref`, with cuts on DPDD quantities like `clean`. The query then has to be made on a join of these two tables, where we specify that the value of `dpdd_ref.object_id` equals `dpdd.objectid`. This causes the corresponding rows from each table (or view) to be treated as if they were assembled into one long row. Here is a simple query showing how this is done. A more realistic one would have more conditions in the `where` clause and might join more than two tables.

# In[ ]:

schema = 'run12i'
join_query = 'select count(object_id) from {schema}.dpdd_ref join {schema}.dpdd '
join_query += 'on {schema}.dpdd_ref.object_id = {schema}.dpdd.objectid'
join_query = join_query.format(**locals())
where = " where (ext_shapeHSM_HsmShapeRegauss_flag = 'f') and clean"
join_query += where
print(join_query)   # confirm the query looks reasonable
with dbconn.cursor() as cursor:
    cursor.execute(join_query)
    for record in cursor:
        print(record[0])

# ## User-defined Functions (UDFs)
# Many math functions from the C library have been wrapped and incorporated in an extension module installed in the database. They have their normal C library names with the prefix `c_`. Functions with a floating-point argument or return value usually come in two versions, such as `c_log` (double precision natural logarithm) and `c_logf` (single precision). They can be incorporated in queries as in this example using the command-line interface program psql:
# ```
# desc_dc2_drp=> select c_asin(1.0);
#      c_asin
# -----------------
#  1.5707963267949
# ```
# There are also functions specially crafted for HSC or LSST catalogs with suggestive names like `patch_contains`, `tract_from_object_id` (used in q3 in the first notebook of this series), `sky_to_pixel`, ...
# ```
# desc_dc2_drp=> select count(*) from run12i.dpdd where tractsearch(objectId, 5063);
#  count
# --------
#  233982
# (1 row)
# ```
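# The same UDFs can be called from Python over the connection opened earlier. A minimal sketch, reusing the two psql queries shown above:

# In[ ]:

# UDFs behave like any other SQL function when invoked through psycopg2.
with dbconn.cursor() as cursor:
    cursor.execute('select c_asin(1.0)')
    print(cursor.fetchone()[0])   # should print ~1.5707963267949
    cursor.execute('select count(*) from run12i.dpdd where tractsearch(objectId, %s)',
                   (5063,))
    print(cursor.fetchone()[0])   # object count for tract 5063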
# ### Area Searches
# The **dpdd** view has one extra column, `coord`, which is not formally a DPDD quantity. `coord` is an alternate way (other than `ra` and `dec`) to express location. A `coord` value is a triple of doubles representing a position on a sphere, in units of arcseconds. This column is indexed, which can make certain calculations faster. In particular, using the functions `coneSearch` and `boxSearch` (which take a `coord` as input) rather than starting from `ra` and `dec` makes queries much faster. There are also functions to translate between `coord` and `(ra, dec)`.
#
# #### Cone search
# Find all stars satisfying quality cuts within a fixed radius of a particular coordinate. The function `coneSearch` returns true if `coord` falls within the cone centered at `(ra, dec)` with the specified radius, measured in arcseconds.

# In[ ]:

schema = 'run12p_v4'
band = 'i'
mag_col = 'mag_' + band
min_SNR = 25
max_err = 1/min_SNR
pop = 'Stars'
ra = 54.5
decl = -31.4
radius = 240.0

where = ' where (magerr_{band} < {max_err}) and clean and (extendedness < 1.0) and coneSearch(coord, {ra}, {decl}, {radius})'
qcone = ('SELECT ra, dec, mag_{band} from {schema}.dpdd ' + where).format(**locals())
print(qcone)
with dbconn.cursor() as cursor:
    get_ipython().run_line_magic('time', 'cursor.execute(qcone)')
    records = cursor.fetchall()
    nObj = len(records)
print('{} objects found'.format(nObj))

# In[ ]:

cmags = pd.DataFrame(records, columns=['ra', 'dec', mag_col])
plt.figure(figsize=(8, 8))
plt.xlabel('ra')
plt.ylabel('dec')
plt.suptitle(pop + ' Cone search', size='xx-large', y=0.92)
p = plt.scatter(cmags['ra'], cmags['dec'], color='y')

# Notice how fast the query is. Compare the time, the number of objects found, and the scatter plot after increasing the radius, e.g. by a factor of 10 to 2400.0.

# #### Box search
# Find all stars, subject to quality cuts, within the specified `ra` and `dec` bounds.

# In[ ]:

ra1 = 54.4
ra2 = 54.8
decl1 = -31.6
decl2 = -31.3

where = ' where (magerr_{band} < {max_err}) and clean and (extendedness < 1.0) and boxSearch(coord, {ra1}, {ra2}, {decl1}, {decl2})'
qbox = ('SELECT ra, dec, mag_{band} from {schema}.dpdd ' + where).format(**locals())
print(qbox)
with dbconn.cursor() as cursor:
    get_ipython().run_line_magic('time', 'cursor.execute(qbox)')
    records = cursor.fetchall()
    nObj = len(records)
print('{} objects found'.format(nObj))

# In[ ]:

bmags = pd.DataFrame(records, columns=['ra', 'dec', mag_col])
plt.figure(figsize=(8, 8))
plt.xlabel('ra')
plt.ylabel('dec')
plt.suptitle(pop + ' Box search', size='xx-large', y=0.92)
p = plt.scatter(bmags['ra'], bmags['dec'], color='y')
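# The queries above are built with `str.format`, which is convenient for interactive work. For the *values* in a query (as opposed to identifiers such as the schema name), psycopg2 parameter binding is a safer habit, since the driver handles quoting. A minimal sketch of the same cone search in that style, using the variables already defined above:

# In[ ]:

# Identifiers (schema, column names) still need string interpolation;
# only the values go through %s placeholders.
qcone_param = ('SELECT ra, dec, mag_{band} from {schema}.dpdd '
               'where (magerr_{band} < %s) and clean and (extendedness < 1.0) '
               'and coneSearch(coord, %s, %s, %s)').format(band=band, schema=schema)
with dbconn.cursor() as cursor:
    cursor.execute(qcone_param, (max_err, ra, decl, radius))
    print('{} objects found'.format(len(cursor.fetchall())))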