#!/usr/bin/env python
# coding: utf-8

# # Method of Simulated Moments (MSM) for Structural Estimation

# ### Steps of MSM estimation
# 1. Load empirical data
# 2. Define a function to calculate empirical moments from the data
# 3. Calculate the covariance matrix of the empirical moments (for the weighting matrix)
# 4. Define a `HARK` agent type with the model parameters to be estimated
# 5. Define a function to simulate the model and calculate the simulated moments
# 6. Estimate the model parameters by minimizing the distance between the empirical and simulated moments

# In[1]:


from copy import copy

import estimagic as em
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

from HARK.Calibration.Income.IncomeTools import (
    Cagetti_income,
    parse_income_spec,
    parse_time_params,
)
from HARK.Calibration.life_tables.us_ssa.SSATools import parse_ssa_life_table
from HARK.Calibration.SCF.WealthIncomeDist.SCFDistTools import (
    get_scf_distr_stats,
    income_wealth_dists_from_scf,
)
from HARK.ConsumptionSaving.ConsIndShockModel import (
    IndShockConsumerType,
    init_lifecycle,
)
from HARK.estimation import estimate_msm
from HARK.utilities import plot_funcs


# ## 1. Load empirical data

# In[2]:


scf_data = get_scf_distr_stats()
scf_data["Wealth"] = np.exp(scf_data["lnWealth.mean"])
scf_data = scf_data[scf_data["Educ"] == "College"]
scf_data = scf_data[scf_data["YEAR"] != "All"]
scf_data = scf_data[scf_data["Age_grp"] != "All"]
scf_data = scf_data[~scf_data["Age_grp"].isin(["(15,20]", "(20,25]"])]

scf_data


# ## 2. Calculate Moments

# In[3]:


indices = scf_data["Age_grp"].unique()


def calculate_weighted_median(data):
    stats = DescrStatsW(data["Wealth"], weights=data["w.obs"])
    return stats.quantile(0.5, return_pandas=False)[0]


def calculate_moments(data):
    medians = data.groupby(["Age_grp"]).apply(
        calculate_weighted_median,
        include_groups=False,
    )
    return medians.reindex(indices, fill_value=0.0)


# In[4]:


empirical_moments = calculate_moments(scf_data)


# In[5]:


empirical_moments.plot()


# ## 3. Calculate the covariance matrix of empirical moments

# In[6]:


moments_cov = em.get_moments_cov(
    scf_data,
    calculate_moments,
    bootstrap_kwargs={"n_draws": 5_000, "seed": 11323, "error_handling": "continue"},
)

moments_cov


# ## 4. Define an agent type to simulate data

# In[16]:


birth_age = 25
death_age = 100
adjust_infl_to = 1992
income_calib = Cagetti_income
education = "College"

# Income specification
income_params = parse_income_spec(
    age_min=birth_age,
    age_max=death_age,
    adjust_infl_to=adjust_infl_to,
    **income_calib[education],
    SabelhausSong=True,
)

# Initial distribution of wealth and permanent income
dist_params = income_wealth_dists_from_scf(
    base_year=adjust_infl_to,
    age=birth_age,
    education=education,
    wave=1995,
)

# We need survival probabilities only up to death_age-1, because survival
# probability at death_age is 1.
liv_prb = parse_ssa_life_table(
    female=True,
    cross_sec=True,
    year=2004,
    min_age=birth_age,
    max_age=death_age - 1,
)

# Parameters related to the number of periods implied by the calibration
time_params = parse_time_params(age_birth=birth_age, age_death=death_age)

# Update all the new parameters
params = copy(init_lifecycle)
params.update(time_params)
params.update(dist_params)
params.update(income_params)
params["LivPrb"] = liv_prb
params["AgentCount"] = 10_000
params["T_sim"] = 200
params["track_vars"] = ["aNrm", "bNrm", "cNrm", "pLvl", "t_age", "mNrm"]
params["PermGroFacAgg"] = 1.0
params["Rfree"] = [params["Rfree"][0]] * params["T_cycle"]


# In[17]:


LifeCycleAgent = IndShockConsumerType(**params)
LifeCycleAgent.solve()


# In[18]:


LifeCycleAgent.unpack("cFunc")
# Plot the consumption functions
print("Consumption functions")
plot_funcs(LifeCycleAgent.cFunc, 0, 5)


# In[19]:


LifeCycleAgent.update()

# Run the simulations
LifeCycleAgent.initialize_sim()
LifeCycleAgent.simulate()  # Note: simulate() does not return history; it updates LifeCycleAgent.history
history = LifeCycleAgent.history


# In[20]:


raw_data = {
    "Age": history["t_age"].flatten() + birth_age - 1,
    "pIncome": history["pLvl"].flatten(),
    "nrmM": history["mNrm"].flatten(),
    "nrmC": history["cNrm"].flatten(),
}

sim_data = pd.DataFrame(raw_data)
sim_data["Cons"] = sim_data.nrmC * sim_data.pIncome
sim_data["M"] = sim_data.nrmM * sim_data.pIncome

# Find the mean of each variable at every age
AgeMeans = sim_data.groupby(["Age"]).median().reset_index()

plt.figure()
plt.plot(AgeMeans.Age, AgeMeans.pIncome, label="Permanent Income")
plt.plot(AgeMeans.Age, AgeMeans.M, label="Market resources")
plt.plot(AgeMeans.Age, AgeMeans.Cons, label="Consumption")
plt.legend()
plt.xlabel("Age")
plt.ylabel(f"Thousands of {adjust_infl_to} USD")
plt.title("Variable Medians Conditional on Survival")
plt.grid()


# In[21]:


age_groups = [
    list(range(start, start + 5)) for start in range(birth_age + 1, 95 + 1, 5)
]

# generate labels as (25,30], (30,35], ...
age_labels = [f"({group[0] - 1},{group[-1]}]" for group in age_groups]

# Generate mappings between the real ages in the groups and the indices of simulated data
age_mapping = dict(zip(age_labels, map(np.array, age_groups)))


# In[22]:


age_mapping


# ## 5. Define a function to calculate simulated moments

# In[23]:


def simulate_moments(params, agent=None):
    agent.assign_parameters(**params)
    agent.update()

    agent.solve()

    agent.initialize_sim()
    agent.simulate()
    history = agent.history

    raw_data = {
        "age": history["t_age"].flatten() + birth_age - 1,
        "b_nrm": history["bNrm"].flatten(),
        "p_lvl": history["pLvl"].flatten(),
    }

    sim_data = pd.DataFrame(raw_data)
    sim_data["Wealth"] = sim_data.b_nrm * sim_data.p_lvl

    sim_data["Age_grp"] = pd.cut(
        sim_data.age,
        bins=range(birth_age + 1, 97, 5),
        labels=age_labels,
        right=False,
    )

    sim_data = sim_data.dropna()

    return sim_data.groupby("Age_grp", observed=False)["Wealth"].median()


# In[24]:


simulate_moments({}, agent=LifeCycleAgent).plot()
empirical_moments.plot()


# ## 6. Estimate the model parameters

# In[25]:


init_params = {"CRRA": 3.009252, "DiscFac": 1.006925}
lower_bounds = {"CRRA": 1.0, "DiscFac": 0.9}
upper_bounds = {"CRRA": 10.0, "DiscFac": 1.1}

res = estimate_msm(
    LifeCycleAgent,
    init_params,
    empirical_moments,
    moments_cov,
    simulate_moments,
    optimize_options={
        "algorithm": "scipy_lbfgsb",
        "error_handling": "continue",
    },
    estimagic_options={
        "lower_bounds": lower_bounds,
        "upper_bounds": upper_bounds,
    },
)


# In[26]:


pd.concat(res.summary())


# In[27]:


res.params


# In[28]:


pd.DataFrame(res.cov(method="robust"))


# In[29]:


res.se()


# In[30]:


LifeCycleAgent.assign_parameters(**res.params)
LifeCycleAgent.update()
LifeCycleAgent.solve()
LifeCycleAgent.initialize_sim()
LifeCycleAgent.simulate()
history = LifeCycleAgent.history

raw_data = {
    "Age": history["t_age"].flatten() + birth_age - 1,
    "pIncome": history["pLvl"].flatten(),
    "nrmM": history["mNrm"].flatten(),
    "nrmC": history["cNrm"].flatten(),
}

sim_data = pd.DataFrame(raw_data)
sim_data["Cons"] = sim_data.nrmC * sim_data.pIncome
sim_data["M"] = sim_data.nrmM * sim_data.pIncome

# Find the mean of each variable at every age
AgeMeans = sim_data.groupby(["Age"]).median().reset_index()

plt.figure()
plt.plot(AgeMeans.Age, AgeMeans.pIncome, label="Permanent Income")
plt.plot(AgeMeans.Age, AgeMeans.M, label="Market resources")
plt.plot(AgeMeans.Age, AgeMeans.Cons, label="Consumption")
plt.legend()
plt.xlabel("Age")
plt.ylabel(f"Thousands of {adjust_infl_to} USD")
plt.title("Variable Medians Conditional on Survival")
plt.grid()


# In[31]:


simulate_moments(res.params, agent=LifeCycleAgent).plot()
empirical_moments.plot()


# In[ ]: