#!/usr/bin/env python
# coding: utf-8
# # Match truth and object catalogs for DC2 Run 2.2i
# Owner: Yao-Yuan Mao, Scott Daniel (with help from Anže Slosar, Bhairav Valera, HyeYun Park)
# Updated by: Javier Sanchez
# Last Verified to Run: 2020-11-29
#
# **Notes:**
# - Follow this [step-by-step guide](https://confluence.slac.stanford.edu/x/Xgg4Dg) if you don't know how to run this notebook.
# - If you need more information about the Generic Catalog Reader (GCR), see [this diagram](https://github.com/yymao/generic-catalog-reader/blob/master/README.md#concept) and [more examples](https://github.com/LSSTDESC/gcr-catalogs/blob/master/examples/GCRCatalogs%20Demo.ipynb).
#
# ## Learning objectives
# After completing and studying this Notebook, you should be able to:
# 1. Use GCR to load object catalog and truth catalog
# 2. Use `filters` and `native_filters` appropriately
# 3. Use `add_quantity_modifier` and `get_quantity_modifier`
# 4. Use `FoFCatalogMatching` to do Friends-of-friends catalog matching
# 5. Learn some cool Numpy tricks for binning, masking, and reshaping [Advanced]
# 6. Learn use pandas to match truth catalog object id back to the galaxy id in extragalactic catalog [advanced]
# In[1]:
import numpy as np
import matplotlib.pyplot as plt
import healpy as hp
get_ipython().run_line_magic('matplotlib', 'inline')
# In[2]:
from astropy.coordinates import SkyCoord
import FoFCatalogMatching
import GCRCatalogs
import fitsio
# In[3]:
# load object catalog (for a single tract)
object_cat = GCRCatalogs.load_catalog('dc2_object_run2.2i_dr6_wfd_with_metacal')
# In[4]:
# Let's first visually inspect the footprint of one tract of the object catalog.
# When `return_iterator` is turned on, the method `get_quantities` will return an
# iterator, and each element in the iterator will be the quantities we requested in
# different chunks of the dataset.
# For object catalogs, the different chunks happen to be different patches,
# resulting in a different color for each patch in the scatter plot below.
for object_data in object_cat.get_quantities(['ra', 'dec'], native_filters=['tract == 3447'], return_iterator=True):
plt.scatter(object_data['ra'], object_data['dec'], s=1, rasterized=True);
plt.xlabel('RA');
plt.ylabel('Dec');
# In[5]:
# Let's also define a magnitude cut
mag_filters = [
(np.isfinite, 'mag_i'),
'mag_i < 24.5',
]
# In[6]:
# let's add total ellipticity for later use (not needed for now)
object_cat.add_derived_quantity('shape_hsm_regauss_etot', np.hypot,
'ext_shapeHSM_HsmShapeRegauss_e1', 'ext_shapeHSM_HsmShapeRegauss_e2')
# In[53]:
# Load ra and dec from object, using both of the filters we just defined.
object_data = object_cat.get_quantities(['ra', 'dec', 'mag_i_cModel', 'shape_hsm_regauss_etot',
'extendedness', 'blendedness', 'id'],
filters=(mag_filters), native_filters=['tract == 3830'])
# In[24]:
# Let's now turn to the truth catalog. Here we just append galaxies and stars;
# however, truth catalogs are also available in GCRCatalogs and PostgreSQL.
truth_cat = GCRCatalogs.load_catalog('cosmoDC2_v1.1.4_image')
# In[25]:
# Load the stars
stars_cat = fitsio.read('/global/cfs/projectdirs/lsst/groups/SSim/DC2/all_stars_DC2_run2.1i.fits.gz')
# In[26]:
max_ra = np.nanmax(object_data['ra'])
min_ra = np.nanmin(object_data['ra'])
max_dec = np.nanmax(object_data['dec'])
min_dec = np.nanmin(object_data['dec'])
vertices = hp.ang2vec(np.array([min_ra, max_ra, max_ra, min_ra]),
np.array([min_dec, min_dec, max_dec, max_dec]), lonlat=True)
ipix = hp.query_polygon(32, vertices, inclusive=True)
native_filter = f'(healpix_pixel == {ipix[0]})'
for ipx in ipix:
native_filter=native_filter+f' | (healpix_pixel == {ipx})'
pos_filters=[f'ra >= {min_ra}',f'ra <={max_ra}', f'dec >= {min_dec}', f'dec <= {max_dec}']
# In[27]:
# get ra and dec from truth catalog
# note that we add i < 25 to the native filter to speed up load time
truth_mag_filters = ['mag_i < 24.7']
quantities = ['galaxy_id', 'ra', 'dec', 'mag_i', 'redshift']
truth_data = truth_cat.get_quantities(quantities, filters=truth_mag_filters+pos_filters,
native_filters=native_filter)
# In[28]:
mask_stars = (stars_cat['ra'] >= min_ra) & (stars_cat['ra'] <= max_ra) & \
(stars_cat['decl'] >= min_dec) & (stars_cat['decl'] <= max_dec)
stars_cat = stars_cat[mask_stars] # Filter out the stars that are out of the footprint
# In[29]:
total_galaxies = len(truth_data['ra'])
total_objs = len(stars_cat)+total_galaxies # Total length of the output catalog
truth_data_all = dict()
out_qty = ['id', 'ra', 'dec', 'mag_i', 'redshift']
star_qty = ['simobjid', 'ra', 'decl', 'imag', 'redshift']
out_dtype = [np.long, np.float, np.float, np.float, np.float]
# In[30]:
# Allocate the columns of the dictionary
for i, dtype in enumerate(out_dtype):
truth_data_all[out_qty[i]] = np.zeros(total_objs, dtype=dtype)
truth_data_all[out_qty[i]][:total_galaxies] = truth_data[quantities[i]]
if 'redshift' not in star_qty[i]:
truth_data_all[out_qty[i]][total_galaxies:] = stars_cat[star_qty[i]] # We add the info from stars and put all stars at redshift=0
truth_data_all['star'] = np.zeros(total_objs, dtype=np.bool)
truth_data_all['star'][total_galaxies:] = True
# In[31]:
plt.scatter(truth_data_all['ra'][::100], truth_data_all['dec'][::100], s=0.1)
plt.scatter(object_data['ra'][::100], object_data['dec'][::100], s=0.1)
# In[32]:
# now we can really do the matching!
# FoFCatalogMatching.match takes a dictionary of catalogs to match, a friends-of-friends linking length.
# Because our "catalog" is not an astropy table or pandas dataframe,
# `len(truth_coord)` won't give the actual length of the table
# so we need to specify `catalog_len_getter` so that the code knows how to get the length of the catalog.
results = FoFCatalogMatching.match(
catalog_dict={'truth': truth_data_all, 'object': object_data},
linking_lengths=1.0, # Linking length of 1 arcsecond, you can play around with the values!
catalog_len_getter=lambda x: len(x['ra']),
)
# In[33]:
# now we want to count the number of truth and object objects *for each group*
# but instead of looping over groups, we can do this in a smart (and very fast) way
# first we need to know which rows are from the truth catalog and which are from the object
truth_mask = results['catalog_key'] == 'truth'
object_mask = ~truth_mask
# then np.bincount will give up the number of id occurrences (like historgram but with integer input)
n_groups = results['group_id'].max() + 1
n_truth = np.bincount(results['group_id'][truth_mask], minlength=n_groups)
print(n_truth[n_truth>10])
n_object = np.bincount(results['group_id'][object_mask], minlength=n_groups)
# now n_truth and n_object are the number of truth/object objects in each group
# we want to make a 2d histrogram of (n_truth, n_object).
n_max = max(n_truth.max(), n_object.max()) + 1
hist_2d = np.bincount(n_object * n_max + n_truth, minlength=n_max*n_max).reshape(n_max, n_max)
plt.imshow(np.log10(hist_2d+1), extent=(-0.5, n_max-0.5, -0.5, n_max-0.5), origin='lower');
plt.xlabel('Number of truth objects');
plt.ylabel('Number of object objects');
plt.colorbar(label=r'$\log(N_{\rm groups} \, + \, 1)$');
# In[34]:
# Let's further inspect the objects in the groups that have 1-to-1 truth/object match.
# first, let's find our the IDs of the groups that have 1-to-1 truth/object match:
one_to_one_group_mask = np.in1d(results['group_id'], np.flatnonzero((n_truth == 1) & (n_object == 1)))
# and then we can find the row indices in the *original* truth/object catalogs for those 1-to-1 groups
truth_idx = results['row_index'][one_to_one_group_mask & truth_mask]
object_idx = results['row_index'][one_to_one_group_mask & object_mask]
# In[35]:
truth_sc = SkyCoord(truth_data_all['ra'][truth_idx], truth_data_all['dec'][truth_idx], unit="deg")
object_sc = SkyCoord(object_data['ra'][object_idx], object_data['dec'][object_idx], unit="deg")
delta_ra = (object_sc.ra.arcsec - truth_sc.ra.arcsec) * np.cos(np.deg2rad(0.5*(truth_sc.dec.deg + object_sc.dec.deg)))
delta_dec = object_sc.dec.arcsec - truth_sc.dec.arcsec
delta_arcsec = object_sc.separation(truth_sc).arcsec
# In[36]:
plt.figure(figsize=(7.3, 6)) # Pick a figuresize that will result in a square equal-axis plus colorbar
plt.hist2d(delta_ra, delta_dec, bins=40, range=((-0.5, +0.5), (-0.5, +0.5)));
plt.xlabel(r'$\Delta$ RA [arcsec]');
plt.ylabel(r'$\Delta$ Dec [arcsec]');
plt.colorbar();
plt.xlim(-0.5, +0.5)
plt.ylim(-0.5, +0.5)
plt.axis('equal');
# In[37]:
#Plotting Delta angle for the outputs
plt.hist(delta_arcsec, bins=80);
plt.xlim(0, 0.4);
plt.xlabel(r'$\Delta$ angle [arcsec]');
# In[22]:
import pandas as pd
# In[38]:
# convert truth_data and object_data to pandas dataframe and select the matched one
truth_matched = pd.DataFrame(truth_data_all).iloc[truth_idx].reset_index(drop=True)
object_matched = pd.DataFrame(object_data).iloc[object_idx].reset_index(drop=True)
matched = pd.merge(truth_matched, object_matched, left_index=True, right_index=True, suffixes=('_truth', '_object'))
# In[39]:
# Select only those truth objects that are galaxies which were not sprinkled
# (stars and sprinkled objects do not occur in the extragalactic catalog)
matched_gals = matched.query('~star')
# In[40]:
# load redshift and ellipticity from the extragalactic catalog, only for galaxies that are already in `matched_gals`
extragalactic_data = truth_cat.get_quantities(
['galaxy_id', 'mag_i_lsst', 'ellipticity_true'],
filters=[(lambda x: np.in1d(x, matched_gals['id'].values, True), 'galaxy_id')]+truth_mag_filters+pos_filters,
native_filters=native_filter
)
# In[41]:
# merge extragalactic_data to matched_gals
matched_gals = pd.merge(matched_gals, pd.DataFrame(extragalactic_data), 'left', left_on='id', right_on='galaxy_id')
# In[42]:
# compare the magnitude
plt.figure(figsize=(5,5));
plt.scatter(matched_gals['mag_i_lsst'], matched_gals['mag_i_cModel'], s=1);
lims = [14, 25]
plt.plot(lims, lims, c='k', lw=0.5);
plt.xlabel('extragalactic $i$-mag');
plt.ylabel('object $i$-mag (cModel)');
plt.xlim(lims);
plt.ylim(lims);
# In[43]:
# compare the ellipticity (naively -- see below for further discussion)
plt.figure(figsize=(5,5));
plt.scatter(matched_gals['ellipticity_true'], matched_gals['shape_hsm_regauss_etot'], s=1);
lims = [0, 1]
plt.plot(lims, lims, c='k', lw=0.5);
plt.xlabel('extragalactic ellipticity');
plt.ylabel('object ellipticity');
plt.xlim(lims);
plt.ylim(lims);
# The ellipticity comparison plot above is quite surprising.
# It seems that the ellipticities in the object catalog are generally higher (i.e., less round) than those in the extragalactic catalog.
#
# The quantity `shape_hsm_regauss_etot` that we used for the object catalog are the re-Gaussianization shapes, which are PSF corrected, and they could be either rounder (if the correction was an under-correction) or less round (if the correction was an over-correction). Hence, their value being systematically larger than the "truth" from extragalactic catalog seems problematic.
#
# Before we panic, we should, however, remind ourselves of the definition of ellipticities used in these catalogs.
# For the extragalactic catalog, ellipticity is defined as $(1-q)/(1+q)$, where $q$ is the minor-to-major axis ratio
# (see the [SCHEMA](https://github.com/LSSTDESC/gcr-catalogs/blob/master/GCRCatalogs/SCHEMA.md#schema-for-extragalatic-catalogs)).
# On the other hand, for the object catalog, the HSM re-Gaussianization ellipticity that we are using is defined as $(1-q^2)/(1+q^2)$
# (see e.g., Eq. 8 of [Mandelbaum et al. 2006](https://arxiv.org/abs/astro-ph/0511164)).
#
# Hence their definitions are in fact different, so we need to do a conversion before we compare them.
# With some math, we can find the conversion between the two definitions $e_{\rm HSM~def} = \frac{2e_{\rm EGC~def}}{1+e_{\rm EGC~def}^2}$.
# In[44]:
# compare the ellipticity (smartly)
ellipticity_conversion = lambda e: 2*e / (1.0+e*e)
plt.figure(figsize=(5,5));
plt.scatter(ellipticity_conversion(matched_gals['ellipticity_true']), matched_gals['shape_hsm_regauss_etot'], s=1);
lims = [0, 1]
plt.plot(lims, lims, c='k', lw=0.5);
plt.xlabel('extragalactic ellipticity (in object def.)');
plt.ylabel('object ellipticity');
plt.xlim(lims);
plt.ylim(lims);
# This looks much better now!
#
# When you were checking the [SCHEMA](https://github.com/LSSTDESC/gcr-catalogs/blob/master/GCRCatalogs/SCHEMA.md#schema-for-extragalatic-catalogs)) file,
# you probably have also noticed that `ellipticity_true` is the ellipticity before the shear is applied (i.e., unlensed).
# Hence this comparison is still not an apples-to-apples comparison, as the ellipticity in the object catalog is, of course, lensed.
#
# According to the [SCHEMA](https://github.com/LSSTDESC/gcr-catalogs/blob/master/GCRCatalogs/SCHEMA.md#schema-for-extragalatic-catalogs)), we should have been using `ellipticity` from the extragalactic catalog.
# But unfortunately, this quantity is not directly available from the extragalactic catalog!
# In[30]:
def calc_lensed_ellipticity(es1, es2, gamma1, gamma2, kappa):
gamma = gamma1 + gamma2*1j # shear (as a complex number)
es = es1 + es2*1j # intrinsic ellipticity (as a complex number)
g = gamma / (1.0 - kappa) # reduced shear
e = (es + g) / (1.0 + g.conjugate()*es) # lensed ellipticity
return np.absolute(e)
truth_cat.add_derived_quantity('ellipticity', calc_lensed_ellipticity,
'ellipticity_1_true', 'ellipticity_2_true', 'shear_1', 'shear_2', 'convergence')
# In[31]:
# Now let's get the newly defined ellipticity and add to our merged pandas data frame:
extragalactic_data_more = truth_cat.get_quantities(
['galaxy_id', 'ellipticity'],
filters=[(lambda x: np.in1d(x, matched_gals['id'].values, True), 'galaxy_id')], native_filters=native_filter
)
matched_gals = pd.merge(matched_gals, pd.DataFrame(extragalactic_data_more), 'left', left_on='id', right_on='galaxy_id')
# In[32]:
# Now we compare the ellipticity again (and don't forget the definition conversion!)
ellipticity_conversion = lambda e: 2*e / (1.0+e*e)
plt.figure(figsize=(5,5));
plt.scatter(ellipticity_conversion(matched_gals['ellipticity']), matched_gals['shape_hsm_regauss_etot'], s=1);
lims = [0, 1]
plt.plot(lims, lims, c='k', lw=0.5);
plt.xlabel('extragalactic ellipticity (lensed, in object def.)');
plt.ylabel('object ellipticity');
plt.xlim(lims);
plt.ylim(lims);
# # How to use the pre-matched catalogs?
# So far, we have shown how to match the catalogs but this has been done already. You can take advantage of this by loading the matched catalogs.
# You can use `GCRCatalogs`:
# In[17]:
object_matched = GCRCatalogs.load_catalog('dc2_object_run2.2i_dr6_wfd_matched') #
# Or read directly the files (they are fits files, one per tract)
# In[62]:
fname = '/global/cfs/projectdirs/lsst/shared/DC2-prod/Run2.2i/addons/matched/dr6/matched_ids_dc2_object_run2.2i_dr6_wfd_with_metacal_3830.fits.gz'
# In[63]:
import fitsio
data_truth = fitsio.read(fname)
# In[64]:
data_truth.dtype
# How well does `extendedness` perform as a star/galaxy classifier?
# In[22]:
import pandas as pd
# In[103]:
object_data = object_cat.get_quantities(['ra', 'dec', 'mag_i_cModel', 'shape_hsm_regauss_etot',
'extendedness', 'blendedness', 'id'],
native_filters=['tract == 3830'])
# In[104]:
object_data = pd.DataFrame(object_data)
data_truth = pd.DataFrame(data_truth)
data_truth['id'] = data_truth['objectId']
# In[105]:
# We have to change the byte order to be able to use pandas
for key in data_truth.keys():
if data_truth[key].values.dtype.byteorder == '>':
data_truth[key] = data_truth[key].values.byteswap().newbyteorder()
# In[106]:
data_all = object_data.set_index('id').join(data_truth.set_index('id'), lsuffix='_obj', rsuffix='_true', how='inner')
# Let's check the purity of the `extendedness` classifier for stars for objects below $r < 25$
# In[113]:
np.count_nonzero((data_all['extendedness']==0) & (data_all['is_star'])
& (data_all['mag_r_lsst'] < 25))/np.count_nonzero((data_all['extendedness']==0) & (data_all['mag_r_lsst'] < 25))
# What fraction of objects with `extendedness==1` are matched to stars?
# In[114]:
np.count_nonzero((data_all['extendedness']==1) & (data_all['is_star'])
& (data_all['mag_r_lsst'] < 25))/np.count_nonzero((data_all['extendedness']==1) & (data_all['mag_r_lsst'] < 25))
# In[117]:
bright_stars = stars_cat['rmag'] < 16
# In[122]:
plt.figure(figsize=(15,7))
plt.hist2d(data_all['ra_obj'], data_all['dec_obj'],256); plt.gca().set_aspect('equal');
plt.xlabel('RA [deg]');
plt.ylabel('dec [deg]');
plt.title('Full sample')
plt.colorbar(label='Number of objects')
plt.scatter(stars_cat['ra'][bright_stars], stars_cat['decl'][bright_stars], marker='x', c=stars_cat['rmag'][bright_stars], cmap='hot')
plt.colorbar(label='r-band Magnitude')
#plt.scatter(data_all['ra_obj'][::10], data_all['dec_obj'][::10], s=0.1)
# Some of the holes look close to stars but not all of them
# In[ ]: