#!/usr/bin/env python
# coding: utf-8

# (lc-multi)=
#
# # Light curves from multiple instruments

# In[ ]:


import exoplanet

exoplanet.utils.docs_setup()
print(f"exoplanet.__version__ = '{exoplanet.__version__}'")


# In the {ref}`rv-multi` case study, we discussed fitting the radial velocity curve for a planetary system observed using multiple instruments.
# You might also want to fit data from multiple instruments when fitting the light curve of a transiting planet, and that's what we work through in this example.
# This is a somewhat more complicated example than the radial velocity case because some of the physical properties of the system can vary as a function of the instrument.
# Specifically, the transit depth (or the effective radius of the planet) will be a function of the filter or effective wavelength of the observations.
# This is the idea behind transit spectroscopy, and the method used in this case study could (and should!) be extended to that use case.
# In this case, we'll combine the light curves from the Kepler and TESS missions for the planet host HAT-P-11.
#
# ## A brief aside on dataset "weighting"
#
# Before getting into the details of this case study, let's spend a minute talking about a topic that comes up a lot when discussing combining observations from different instruments or techniques.
# To many people, it seems intuitive that one should (and perhaps must) "weight" how much each dataset contributes to the likelihood based on how much they "trust" those data.
# For example, you might be worried that a dataset with more data points will have a larger effect on the results than you would like.
# While this might seem intuitive, it's wrong: **the only way to combine datasets is to multiply their likelihood functions**.
# Instead, it is useful to understand what you actually mean when you say that you don't "trust" a dataset as much as another.
# **What you're really saying is that you don't believe the observation model that you wrote down**.
# For example, you might think that the quoted error bars are underestimated or there might be correlated noise that an uncorrelated normal observation model can't capture.
# The benefit of thinking about it this way is that it suggests a solution to the problem: incorporate a more flexible observation model that can capture these issues.
# In this case study, the 4 years of (long-cadence) Kepler observations only include about twice as many data points as one month of TESS observations.
# But, as you can see in the figure below, these two datasets have different noise properties (both in terms of photon noise and correlated noise), so we will fit each dataset with its own flexible Gaussian process noise model that takes these differences into account.
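# To make this concrete before we load the real data, here is a toy sketch (not part of the HAT-P-11 fit below, and built on placeholder synthetic data) of what combining two datasets looks like in PyMC3: both observed variables live in the same model, so their log-likelihoods simply add, and any "mistrust" of a dataset is expressed through extra parameters in its observation model; here that is a per-dataset jitter term that inflates the quoted uncertainties.

# In[ ]:


import numpy as np
import pymc3 as pm

# Toy data: placeholders, not HAT-P-11 photometry
rng = np.random.default_rng(42)
y1, yerr1 = rng.normal(0.0, 1.0, 500), np.full(500, 1.0)  # "instrument 1"
y2, yerr2 = rng.normal(0.0, 0.3, 50), np.full(50, 0.3)  # "instrument 2"

with pm.Model() as toy_model:
    # A physical parameter that is shared by both instruments
    mean = pm.Normal("mean", mu=0.0, sigma=10.0)

    # Per-dataset jitter terms express "I don't fully trust the quoted error bars"
    log_jitter1 = pm.Normal("log_jitter1", mu=0.0, sigma=2.0)
    log_jitter2 = pm.Normal("log_jitter2", mu=0.0, sigma=2.0)

    # Two observed variables in one model: their log-likelihoods simply add,
    # which is the same as multiplying the two likelihood functions
    pm.Normal(
        "obs1",
        mu=mean,
        sigma=pm.math.sqrt(yerr1**2 + pm.math.exp(2 * log_jitter1)),
        observed=y1,
    )
    pm.Normal(
        "obs2",
        mu=mean,
        sigma=pm.math.sqrt(yerr2**2 + pm.math.exp(2 * log_jitter2)),
        observed=y2,
    )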
# In[ ]:


import numpy as np
import lightkurve as lk
from astropy.io import fits
from collections import OrderedDict
import matplotlib.pyplot as plt

# Period and reference transit time from the literature for initialization
lit_period = 4.887803076
lit_t0 = 124.8130808


def transit_mask(t):
    return (
        np.abs((t - lit_t0 + 0.5 * lit_period) % lit_period - 0.5 * lit_period)
        < 0.2
    )


kepler_lcfs = lk.search_lightcurve(
    "HAT-P-11", mission="Kepler", cadence="long"
).download_all(flux_column="pdcsap_flux")
kepler_lc = kepler_lcfs.stitch().remove_nans()

# For the purposes of this example, let's only keep the data near the transits
m = transit_mask(kepler_lc.time.value)
kepler_t = np.ascontiguousarray(kepler_lc.time.value[m], dtype=np.float64)
kepler_y = np.ascontiguousarray(
    1e3 * (kepler_lc.flux[m] - 1), dtype=np.float64
)
kepler_yerr = np.ascontiguousarray(
    1e3 * kepler_lc.flux_err[m], dtype=np.float64
)

with fits.open(kepler_lcfs[0].filename) as hdu:
    hdr = hdu[1].header
kepler_texp = hdr["FRAMETIM"] * hdr["NUM_FRM"]
kepler_texp /= 60.0 * 60.0 * 24.0

tess_lcfs = lk.search_lightcurve(
    "HAT-P-11", mission="TESS", author="SPOC", sector=(14, 15)
).download_all(flux_column="pdcsap_flux")
tess_lc = tess_lcfs.stitch().remove_nans()

# For the purposes of this example, let's only keep the data near the transits
t = tess_lc.time.value + 2457000 - 2454833
m = transit_mask(t)
tess_t = np.ascontiguousarray(t[m], dtype=np.float64)
tess_y = np.ascontiguousarray(1e3 * (tess_lc.flux[m] - 1), dtype=np.float64)
tess_yerr = np.ascontiguousarray(1e3 * tess_lc.flux_err[m], dtype=np.float64)

with fits.open(tess_lcfs[0].filename) as hdu:
    hdr = hdu[1].header
tess_texp = hdr["FRAMETIM"] * hdr["NUM_FRM"]
tess_texp /= 60.0 * 60.0 * 24.0

datasets = OrderedDict(
    [
        ("Kepler", [kepler_t, kepler_y, kepler_yerr, kepler_texp]),
        ("TESS", [tess_t, tess_y, tess_yerr, tess_texp]),
    ]
)

fig, axes = plt.subplots(1, len(datasets), sharey=True, figsize=(10, 5))

for i, (name, (t, y, _, _)) in enumerate(datasets.items()):
    ax = axes[i]
    ax.plot(t, y, "k", lw=0.75, label=name)
    ax.set_xlabel("time [KBJD]")
    ax.set_title(name, fontsize=14)

    x_mid = 0.5 * (t.min() + t.max())
    ax.set_xlim(x_mid - 10, x_mid + 10)
axes[0].set_ylim(-10, 10)
fig.subplots_adjust(wspace=0.05)
_ = axes[0].set_ylabel("relative flux [ppt]")


# ## The probabilistic model
#
# This model is mostly the same as the one used in [Quick fits for TESS light curves](./quick-tess.ipynb), but we're allowing for different noise variances (both the white noise component and the GP amplitude), effective planet radii, and limb-darkening coefficients for each dataset.
# For the purposes of demonstration, we're sharing the length scale of the GP between the two datasets, but this could just as well have been a different parameter for each dataset without changing the results.
# The final change is that we use the approximate transit depth `approx_depth` (the depth of the transit at minimum, assuming that the limb-darkening profile is constant under the disk of the planet) as a parameter instead of the radius ratio.
# This does not have a large effect on the performance or the results, but it can sometimes be a useful parameterization when dealing with high signal-to-noise transits because it reduces the covariance between the radius parameter and the limb-darkening coefficients.
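# To build some intuition for this parameterization, here is a quick numerical sketch (with made-up numbers, mirroring the `ror` deterministic defined in the model below) of how an approximate depth maps onto a radius ratio once the quadratic limb-darkening law is evaluated at the transit chord:

# In[ ]:


# Hypothetical values, for illustration only (not fitted results)
u1_guess, u2_guess = 0.6, 0.1  # quadratic limb-darkening coefficients
b_guess = 0.2  # impact parameter
depth_guess = 4e-3  # approximate transit depth in relative flux

# Quadratic limb-darkening law evaluated at the transit chord
ld_arg_guess = 1 - np.sqrt(1 - b_guess**2)
ld_guess = 1 - u1_guess * ld_arg_guess - u2_guess * ld_arg_guess**2

# Invert approx_depth ~= ror**2 * ld to recover the radius ratio
ror_guess = np.sqrt(depth_guess / ld_guess)
print(f"implied radius ratio: {ror_guess:.4f}")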
# As usual, we run a few iterations of sigma clipping and then find the maximum a posteriori parameters to check that everything is working:

# In[ ]:


import pymc3 as pm
import pymc3_ext as pmx
import exoplanet as xo
import aesara_theano_fallback.tensor as tt

from functools import partial
from celerite2.theano import terms, GaussianProcess

# Find a reference transit time near the middle of the observations to avoid
# strong covariances between period and t0
x_min = min(np.min(x) for x, _, _, _ in datasets.values())
x_max = max(np.max(x) for x, _, _, _ in datasets.values())
x_mid = 0.5 * (x_min + x_max)
t0_ref = lit_t0 + lit_period * np.round((x_mid - lit_t0) / lit_period)

# Do several rounds of sigma clipping
for i in range(10):
    with pm.Model() as model:

        # Shared orbital parameters
        log_period = pm.Normal("log_period", mu=np.log(lit_period), sigma=1.0)
        period = pm.Deterministic("period", tt.exp(log_period))
        t0 = pm.Normal("t0", mu=t0_ref, sigma=1.0)
        log_dur = pm.Normal("log_dur", mu=np.log(0.1), sigma=10.0)
        dur = pm.Deterministic("dur", tt.exp(log_dur))
        b = pmx.UnitUniform("b")
        ld_arg = 1 - tt.sqrt(1 - b**2)
        orbit = xo.orbits.KeplerianOrbit(
            period=period, duration=dur, t0=t0, b=b
        )

        # We'll also say that the timescale of the GP will be shared
        rho_gp = pm.InverseGamma(
            "rho_gp",
            testval=2.0,
            **pmx.estimate_inverse_gamma_parameters(1.0, 5.0),
        )

        # Loop over the instruments
        parameters = dict()
        lc_models = dict()
        gp_preds = dict()
        gp_preds_with_mean = dict()
        for n, (name, (x, y, yerr, texp)) in enumerate(datasets.items()):
            # We define the per-instrument parameters in a submodel so that we
            # don't have to prefix the names manually
            with pm.Model(name=name, model=model):
                # The flux zero point
                mean = pm.Normal("mean", mu=0.0, sigma=10.0)

                # The limb darkening
                u = xo.QuadLimbDark("u")
                star = xo.LimbDarkLightCurve(u)

                # The radius ratio
                log_approx_depth = pm.Normal(
                    "log_approx_depth", mu=np.log(4e-3), sigma=10
                )
                ld = 1 - u[0] * ld_arg - u[1] * ld_arg**2
                ror = pm.Deterministic(
                    "ror", tt.exp(0.5 * log_approx_depth) / tt.sqrt(ld)
                )

                # Noise parameters
                med_yerr = np.median(yerr)
                std = np.std(y)
                sigma = pm.InverseGamma(
                    "sigma",
                    testval=med_yerr,
                    **pmx.estimate_inverse_gamma_parameters(
                        med_yerr, 0.5 * std
                    ),
                )
                sigma_gp = pm.InverseGamma(
                    "sigma_gp",
                    testval=0.5 * std,
                    **pmx.estimate_inverse_gamma_parameters(
                        med_yerr, 0.5 * std
                    ),
                )

                # Keep track of the parameters for optimization
                parameters[name] = [mean, u, log_approx_depth]
                parameters[f"{name}_noise"] = [sigma, sigma_gp]

            # The light curve model
            def lc_model(mean, star, ror, texp, t):
                return mean + 1e3 * tt.sum(
                    star.get_light_curve(orbit=orbit, r=ror, t=t, texp=texp),
                    axis=-1,
                )

            lc_model = partial(lc_model, mean, star, ror, texp)
            lc_models[name] = lc_model

            # The Gaussian Process noise model
            kernel = terms.SHOTerm(sigma=sigma_gp, rho=rho_gp, Q=1.0 / 3)
            gp = GaussianProcess(
                kernel, t=x, diag=yerr**2 + sigma**2, mean=lc_model
            )
            gp.marginal(f"{name}_obs", observed=y)
            gp_preds[name] = gp.predict(y, include_mean=False)
            gp_preds_with_mean[name] = gp_preds[name] + gp.mean_value

        # Optimize the model
        map_soln = model.test_point
        for name in datasets:
            map_soln = pmx.optimize(map_soln, parameters[name])
        for name in datasets:
            map_soln = pmx.optimize(map_soln, parameters[f"{name}_noise"])
            map_soln = pmx.optimize(map_soln, parameters[name] + [log_dur, b])
        map_soln = pmx.optimize(map_soln)

        # Do some sigma clipping
        num = dict((name, len(datasets[name][0])) for name in datasets)
        clipped = dict()
        masks = dict()
        for name in datasets:
            mdl = pmx.eval_in_model(gp_preds_with_mean[name], map_soln)
            resid = datasets[name][1] - mdl
            sigma = np.sqrt(np.median((resid - np.median(resid)) ** 2))
            masks[name] = np.abs(resid - np.median(resid)) < 7 * sigma
            clipped[name] = num[name] - masks[name].sum()
            print(f"Sigma clipped {clipped[name]} {name} light curve points")

        if all(c < 10 for c in clipped.values()):
            break

        else:
            for name in datasets:
                datasets[name][0] = datasets[name][0][masks[name]]
                datasets[name][1] = datasets[name][1][masks[name]]
                datasets[name][2] = datasets[name][2][masks[name]]


# Here are the two phased light curves (with the Gaussian process model removed).
# We can see the effect of exposure time integration and the difference in photometric precision, but everything should be looking good!

# In[ ]:


dt = np.linspace(-0.2, 0.2, 500)
with model:
    trends = pmx.eval_in_model([gp_preds[k] for k in datasets], map_soln)
    phase_curves = pmx.eval_in_model(
        [lc_models[k](t0 + dt) for k in datasets], map_soln
    )

fig, axes = plt.subplots(2, sharex=True, sharey=True, figsize=(8, 6))

for n, name in enumerate(datasets):
    ax = axes[n]

    x, y = datasets[name][:2]
    period = map_soln["period"]
    folded = (x - map_soln["t0"] + 0.5 * period) % period - 0.5 * period
    m = np.abs(folded) < 0.2
    ax.plot(
        folded[m],
        (y - trends[n] - map_soln[f"{name}_mean"])[m],
        ".k",
        alpha=0.3,
        mec="none",
    )
    ax.plot(
        dt, phase_curves[n] - map_soln[f"{name}_mean"], f"C{n}", label=name
    )
    ax.annotate(
        name,
        xy=(1, 0),
        xycoords="axes fraction",
        va="bottom",
        ha="right",
        xytext=(-3, 3),
        textcoords="offset points",
        fontsize=14,
    )

axes[-1].set_xlim(-0.15, 0.15)
axes[-1].set_xlabel("time since transit [days]")
for ax in axes:
    ax.set_ylabel("relative flux [ppt]")


# Then we run the MCMC:

# In[ ]:


import platform

with model:
    trace = pmx.sample(
        tune=1500,
        draws=1500,
        start=map_soln,
        # Parallel sampling runs poorly or crashes on macos
        cores=1 if platform.system() == "Darwin" else 2,
        chains=2,
        target_accept=0.95,
        return_inferencedata=True,
        random_seed=[11, 9],
    )


# And check the convergence diagnostics:

# In[ ]:


import arviz as az

az.summary(trace)


# Since we fit for a radius ratio in each band, we can see if the transit depth is different in Kepler compared to TESS.
# The plot below demonstrates that there is no statistically significant difference between the radii measured in these two bands:

# In[ ]:


plt.hist(
    trace.posterior["Kepler_ror"].values.flatten(),
    20,
    density=True,
    histtype="step",
    label="Kepler",
)
plt.hist(
    trace.posterior["TESS_ror"].values.flatten(),
    20,
    density=True,
    histtype="step",
    label="TESS",
)
plt.yticks([])
plt.xlabel("effective radius ratio")
_ = plt.legend(fontsize=12)


# We can also compare the inferred limb-darkening coefficients:

# In[ ]:


import corner

fig = corner.corner(
    trace,
    var_names=["TESS_u"],
    bins=40,
    color="C1",
    range=((0.5, 0.9), (-0.5, 0.1)),
)
corner.corner(
    trace,
    var_names=["Kepler_u"],
    bins=40,
    color="C0",
    fig=fig,
    labels=["$u_1$", "$u_2$"],
    range=((0.5, 0.9), (-0.5, 0.1)),
)
fig.axes[0].axvline(-1.0, color="C0", label="Kepler")
fig.axes[0].axvline(-1.0, color="C1", label="TESS")
_ = fig.axes[0].legend(
    fontsize=12, loc="center left", bbox_to_anchor=(1.1, 0.5)
)
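# As a quick numerical check of the statement above that the radii measured in the two bands are consistent, we can also summarize the posterior for the difference between the two radius ratios. This is a short sketch added for illustration (the choice of summary statistic is a matter of taste); it only uses the posterior samples already stored in `trace`.

# In[ ]:


ror_kepler = trace.posterior["Kepler_ror"].values.flatten()
ror_tess = trace.posterior["TESS_ror"].values.flatten()
ror_diff = ror_kepler - ror_tess

# Median and 16th/84th percentiles of the posterior for the difference
q16, q50, q84 = np.percentile(ror_diff, [16, 50, 84])
print(
    f"Kepler ror - TESS ror = {q50:.5f} (+{q84 - q50:.5f}, -{q50 - q16:.5f})"
)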
# ## Citations
#
# As described in the [citation tutorial](https://docs.exoplanet.codes/en/stable/tutorials/citation/), we can use [citations.get_citations_for_model](https://docs.exoplanet.codes/en/stable/user/api/#exoplanet.citations.get_citations_for_model) to construct an acknowledgement and BibTeX listing that includes the relevant citations for this model.

# In[ ]:


with model:
    txt, bib = xo.citations.get_citations_for_model()
print(txt)


# In[ ]:


print(bib.split("\n\n")[0] + "\n\n...")


# In[ ]: