#!/usr/bin/env python
# coding: utf-8

# # Accessing DC2 data in PostgreSQL at NERSC part 2
#
# Owner: **Joanne Bogart [@jrbogart](https://github.com/LSSTDESC/DC2-analysis/issues/new?body=@jrbogart)**
# Last Verified to Run:
#
# This notebook introduces some additional features of the PostgreSQL database at NERSC.
#
# __Learning objectives__:
#
# After going through this notebook, you should be able to:
# 1. Discover which object catalogs are available
# 2. Query on native quantities in those catalogs
# 3. Make use of custom functions, in particular for area searches
#
# __Logistics__: This notebook is intended to be run through the JupyterHub NERSC interface available here: https://jupyter-dev.nersc.gov. To set up your NERSC environment, please follow the instructions available here: https://confluence.slac.stanford.edu/display/LSSTDESC/Using+Jupyter-dev+at+NERSC
#
# ### Prerequisites
# Please see the first notebook in this series for instructions on how to gain access to the database.

# In[ ]:

import psycopg2
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd

# Make the db connection

# In[ ]:

dbname = 'desc_dc2_drp'
dbuser = 'desc_dc2_drp_user'
dbhost = 'nerscdb03.nersc.gov'
dbconfig = {'dbname' : dbname, 'user' : dbuser, 'host' : dbhost}
dbconn = psycopg2.connect(**dbconfig)

# ## Finding Data
# Tables for the Run1.2i data, as well as a view that makes DPDD quantities more easily accessible, are in the `schema` (which acts like a namespace) `run12i`. To reference, say, a table called `position` for Run1.2i, use `run12i.position`.
#
# ### Finding Datasets
# To find out which datasets are available, and under what schema names, query the table `run_provenance`. It's in a special schema known as `public`, which normally does not need to be specified.

# In[ ]:

cols = ['schema_name', 'run_designation', 'simulation_program', 'db_ingest', 'remarks']
# Additional columns in run_provenance store software and input data versions
prov_query = 'SELECT ' + ','.join(cols) + ' from run_provenance'
with dbconn.cursor() as cursor:
    cursor.execute(prov_query)
    fmt = '{0!s:13} {1!s:16} {2!s:18} {3!s:12} {4!s}'
    print(fmt.format(cols[0], cols[1], cols[2], cols[3], cols[4]))
    for record in cursor:
        print(fmt.format(record[0], record[1], record[2], record[3], record[4]))

# Normally only datasets with `db_ingest` == 'complete' are of interest.

# In[ ]:

# Pick one of the supported datasets
schema = 'run12p_v4'

# ### Querying on Native Quantities
# Unlike DPDD quantities (all in a single view), native quantities are split across several tables. The first notebook in the PostgreSQL collection shows how to find out which tables belong to a schema and what columns a table has. Alternatively, if you know the name of a column you want, you can query for the name of the table containing it. The following looks for the table containing `ext_shapeHSM_HsmShapeRegauss_flag`.

# In[ ]:

column = 'ext_shapehsm_hsmshaperegauss_flag'
find_table_query = "select table_name from information_schema.columns where table_schema='{}' and column_name='{}'"
find_table_query = find_table_query.format(schema, column)
print(find_table_query)
with dbconn.cursor() as cursor:
    cursor.execute(find_table_query)
    for record in cursor:
        print(record[0])

# There is some necessary fussiness here:
# * `ext_shapeHSM_HsmShapeRegauss_flag` has been transformed to all lower-case in the query. This is required when querying information_schema, where the string is a __value__ in the database (not a column name).
# * Single quotes are used around literals like `run12p_v4`. Double quotes won't work.
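# As an aside, the same lookup can be done with psycopg2 parameter binding, which supplies the single quotes itself, so only the lower-casing has to be remembered. Below is a minimal sketch assuming only standard psycopg2 behavior; the helper function name is illustrative, not part of the database API.

# In[ ]:

# Hypothetical helper: find tables containing a column, using parameter binding.
# psycopg2 quotes the bound values itself. Note this works because table_schema
# and column_name here are values, not identifiers; identifiers cannot be bound.
def find_tables_for_column(conn, table_schema, column_name):
    q = ("select table_name from information_schema.columns "
         "where table_schema = %s and column_name = %s")
    with conn.cursor() as cursor:
        cursor.execute(q, (table_schema, column_name.lower()))
        return [r[0] for r in cursor]

print(find_tables_for_column(dbconn, schema, 'ext_shapeHSM_HsmShapeRegauss_flag'))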
# Now suppose we wanted to combine a cut on this quantity, which is in the table `dpdd_ref`, with cuts on DPDD quantities like `clean`. The query then has to be made on a join of these two tables, where we specify that the value of `dpdd_ref.object_id` equals `dpdd.objectid`. This causes the corresponding rows from each table (or view) to be treated as if they were assembled into one long row. Here is a simple query showing how this is done. A more realistic one would have more conditions in the `where` clause and might join more than two tables.

# In[ ]:

schema = 'run12i'
join_query = 'select count(object_id) from {schema}.dpdd_ref join {schema}.dpdd '
join_query += 'on {schema}.dpdd_ref.object_id = {schema}.dpdd.objectid'
join_query = join_query.format(**locals())
where = " where (ext_shapeHSM_HsmShapeRegauss_flag = 'f') and clean"
join_query += where
print(join_query)   # confirm the query looks reasonable
with dbconn.cursor() as cursor:
    cursor.execute(join_query)
    for record in cursor:
        print(record[0])

# ## User-defined Functions (UDFs)
# Many math functions from the C library have been wrapped and incorporated in an extension module installed in the database. They have their normal C library names with the prefix `c_`. Functions with a floating-point argument or return value usually come in two versions, such as `c_log` (double precision natural logarithm) and `c_logf` (single precision). They can be incorporated in queries as in this example using the command-line interface program psql:
# ```
# desc_dc2_drp=> select c_asin(1.0);
#      c_asin
# -----------------
#  1.5707963267949
# ```
# There are also functions specially crafted for HSC or LSST catalogs with suggestive names like `patch_contains`, `tract_from_object_id` (used in q3 in the first notebook of this series), `sky_to_pixel`, ...
# ```
# desc_dc2_drp=> select count(*) from run12i.dpdd where tractsearch(objectId, 5063);
#  count
# --------
#  233982
# (1 row)
# ```
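# The same UDFs can be called from Python over the connection opened earlier. A minimal sketch, reusing the two psql queries shown above:

# In[ ]:

# UDFs behave like any other SQL function when invoked through psycopg2.
with dbconn.cursor() as cursor:
    cursor.execute('select c_asin(1.0)')
    print(cursor.fetchone()[0])   # should print ~1.5707963267949
    cursor.execute('select count(*) from run12i.dpdd where tractsearch(objectId, %s)',
                   (5063,))
    print(cursor.fetchone()[0])   # object count for tract 5063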
# ### Area Searches
# The **dpdd** view has one extra column, `coord`, which is not formally a DPDD quantity. `coord` is an alternate way (other than `ra` and `dec`) to express location. A `coord` value is a triple of doubles representing a position on a sphere, in units of arcseconds. This column is indexed, which can make certain calculations faster. In particular, using the functions `coneSearch` and `boxSearch` (which take a `coord` as input) rather than starting from `ra` and `dec` makes queries much faster. There are also functions to translate between `coord` and `(ra, dec)`.
#
# #### Cone search
# Find all stars satisfying quality cuts within a fixed radius of a particular coordinate. The function `coneSearch` returns true if `coord` falls within the cone centered at `(ra, dec)` with the specified radius, measured in arcseconds.

# In[ ]:

schema = 'run12p_v4'
band = 'i'
mag_col = 'mag_' + band
min_SNR = 25
max_err = 1/min_SNR
pop = 'Stars'
ra = 54.5
decl = -31.4
radius = 240.0

where = ' where (magerr_{band} < {max_err}) and clean and (extendedness < 1.0) and coneSearch(coord, {ra}, {decl}, {radius})'
qcone = ('SELECT ra, dec, mag_{band} from {schema}.dpdd ' + where).format(**locals())
print(qcone)
with dbconn.cursor() as cursor:
    get_ipython().run_line_magic('time', 'cursor.execute(qcone)')
    records = cursor.fetchall()
    nObj = len(records)
print('{} objects found'.format(nObj))

# In[ ]:

cmags = pd.DataFrame(records, columns=['ra', 'dec', mag_col])
plt.figure(figsize=(8, 8))
plt.xlabel('ra')
plt.ylabel('dec')
plt.suptitle(pop + ' Cone search', size='xx-large', y=0.92)
p = plt.scatter(cmags['ra'], cmags['dec'], color='y')

# Notice how fast the query is. Compare the time, the number of objects found, and the scatter plot after increasing the radius, e.g. by a factor of 10 to 2400.0.

# #### Box search
# Find all stars, subject to quality cuts, within the specified `ra` and `dec` bounds.

# In[ ]:

ra1 = 54.4
ra2 = 54.8
decl1 = -31.6
decl2 = -31.3

where = ' where (magerr_{band} < {max_err}) and clean and (extendedness < 1.0) and boxSearch(coord, {ra1}, {ra2}, {decl1}, {decl2})'
qbox = ('SELECT ra, dec, mag_{band} from {schema}.dpdd ' + where).format(**locals())
print(qbox)
with dbconn.cursor() as cursor:
    get_ipython().run_line_magic('time', 'cursor.execute(qbox)')
    records = cursor.fetchall()
    nObj = len(records)
print('{} objects found'.format(nObj))

# In[ ]:

bmags = pd.DataFrame(records, columns=['ra', 'dec', mag_col])
plt.figure(figsize=(8, 8))
plt.xlabel('ra')
plt.ylabel('dec')
plt.suptitle(pop + ' Box search', size='xx-large', y=0.92)
p = plt.scatter(bmags['ra'], bmags['dec'], color='y')
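# The queries above are built with `str.format`, which is convenient for interactive work. For the *values* in a query (as opposed to identifiers such as the schema name), psycopg2 parameter binding is a safer habit, since the driver handles quoting. A minimal sketch of the same cone search in that style, using the variables already defined above:

# In[ ]:

# Identifiers (schema, column names) still need string interpolation;
# only the values go through %s placeholders.
qcone_param = ('SELECT ra, dec, mag_{band} from {schema}.dpdd '
               'where (magerr_{band} < %s) and clean and (extendedness < 1.0) '
               'and coneSearch(coord, %s, %s, %s)').format(band=band, schema=schema)
with dbconn.cursor() as cursor:
    cursor.execute(qcone_param, (max_err, ra, decl, radius))
    print('{} objects found'.format(len(cursor.fetchall())))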