#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Created by: Sergiu Iatco / 2021.10.01
# https://github.com/itsergiu/Predict-S-P-500-correction-with-Shiller-PE-Ratio
# How to predict S&P 500 correction

# Nadeem Walayat - The Market Oracle
# http://www.marketoracle.co.uk/Article69423.html

# Vitaliy Katsenelson 
# https://contrarianedge.com/sideways-market/

# https://www.marketwatch.com/story/market-analysts-cant-agree-on-where-stocks-are-going-next-so-double-check-the-data-before-you-buy-or-sell-11632447577

# Purpose: to build a machine learning model to predict S&P 500 correction within next 6 months


# In[2]:


# !pip install pandas
# !pip install dateparser
# !pip install xgboost
# !pip install sklearn
# !pip install numpy
# !pip install plotly_express


# In[3]:


import pandas as pd
# import dateparser
import xgboost as xgb
from sklearn import model_selection
from sklearn.metrics import r2_score
import numpy as np
from fredapi import Fred
import plotly.express as px
from IPython.display import display


# In[4]:


url_per='https://www.multpl.com/shiller-pe/table/by-month'


# In[5]:


ls_tables = pd.read_html(url_per)


# In[6]:


type(ls_tables)


# In[7]:


len(ls_tables)


# In[8]:


ls_tables[0]


# In[9]:


df_per=ls_tables[0]


# In[10]:


df_per.dtypes


# In[11]:


cols = df_per.columns.tolist()
cols


# In[12]:


df_per.rename(
    columns=({ cols[1]: 'PER'}), 
    inplace=True,)


# In[13]:


df_per.head()


# In[14]:


url_sp='https://www.multpl.com/s-p-500-historical-prices/table/by-month'


# In[15]:


ls_tables = pd.read_html(url_sp)


# In[16]:


len(ls_tables)


# In[17]:


df_sp=ls_tables[0]
df_sp.head()


# In[18]:


cols = df_sp.columns.tolist()
cols


# In[19]:


df_sp.rename(
    columns=({ cols[1]: 'Price'}), 
    inplace=True,)


# In[20]:


df_sp.head()


# In[21]:


df_per.index, df_sp.index


# In[22]:


df = df_per.merge(df_sp, left_on='Date', right_on='Date')
df


# In[23]:


df.dtypes


# In[24]:


df['Date']=pd.to_datetime(df['Date'])
df


# In[25]:


df.head()


# In[26]:


df.set_index(['Date'], inplace=True)
df['Date']=df.index
df


# In[27]:


# https://pandas.pydata.org/docs/reference/api/pandas.melt.html
df_per = pd.melt(df, id_vars=['Date'], value_vars=['PER'])
px.line(df_per, x='Date', y='value', color='variable')


# In[28]:


df.describe()


# In[29]:


df.dtypes


# In[30]:


ls_price_p = ['Price'+str(i)+'F' for i in range(7)]
ls_price_p


# In[31]:


ls_price_p[1:]


# In[32]:


ls_price_p[0]


# In[33]:


df.columns.tolist()


# In[34]:


df.head(10)


# In[35]:


df.tail(10)


# In[36]:


for i,e in enumerate(ls_price_p):
    df[e]=df.Price.shift(i)   


# In[37]:


df['Price1P'] = df['Price'].shift(-1)
df['Price_Var']=df['Price']/df['Price1P']-1


# In[38]:


df.head(10)


# In[39]:


df['PriceFmin']=df[ls_price_p].min(axis=1)


# In[40]:


df.head(10)


# In[41]:


col_y = 'Price_Corr_6M'
df[col_y]=1-df['PriceFmin']/df['Price0F']


# In[42]:


df.head(10)


# In[43]:


df[df.isna().any(axis=1)]


# In[44]:


df.loc[df.isna().any(axis=1),col_y]=np.nan # set to zero corrections with Nan rows
df[df.isna().any(axis=1)]


# In[45]:


df_initial=df.copy()


# In[46]:


df_initial.head()


# In[47]:


df.dropna(axis=0, how='any', inplace=True) # remove NaN rowss


# In[48]:


df[df[col_y]>0].describe()


# In[49]:


last_months=240
df_per = pd.melt(df.head(last_months), id_vars='Date', value_vars=['Price'])
px.line(df_per, x='Date', y='value', color='variable')


# In[50]:


# min_correction=0.15
min_correction=0.15 # take a look at big corrections
df[df[col_y]>min_correction].head(10)


# In[51]:


df[df[col_y]>min_correction][col_y].describe()


# In[52]:


# plot PER & Price_correction


# In[53]:


min_correction=0.05 # force learning above ?
# min_correction=0
if min_correction!=0:
    df[col_y] = df[col_y].apply(lambda x: 0 if x<min_correction else x)

# df[col_y] = -df[col_y] # reverse sign

df_per_pc=df[['Date','PER',col_y]].copy()
df_per_pc[[col_y]]=100*df_per_pc[[col_y]]
df_per_pc[col_y]=-df_per_pc[col_y] # reverse sign for plotting


# In[54]:


last_months=2400
df_plot = pd.melt(df_per_pc, id_vars='Date', value_vars=['PER',col_y])
px.line(df_plot, x='Date', y='value', color='variable')


# In[55]:


pd.set_option('display.max_columns', 500)
df.head(10)


# In[56]:


df.head(36)


# In[57]:


def gen_features(df,cols, ref_period, band):
    features = []
    
    features.append(df['Date'].iloc[ref_period])
    for col in cols:
        segment = df[col].iloc[ref_period:ref_period+band]
        features.append(segment.min()) 
        features.append(segment.max())

        for i in range(2,99,10):
            features.append(np.quantile(segment, i/100))

        features.append(segment.mean())
        features.append(segment.std())
        features.append(segment.mad())
        features.append(segment.skew())
        features.append(segment.kurtosis())

        features.append(np.sqrt(np.mean(segment**2)))

        features.append(np.abs(segment).mean())
        features.append(np.abs(segment).std())

        features.append(np.abs(np.diff(segment)).mean())
        features.append(np.abs(np.diff(segment)).std())
        
    return pd.Series(features)


# In[58]:


def generate_dataset(df_f, cols, band=60, start_date='2000-01-01', periods = 0):
#     global cols
    df = df_f.copy()
    X_train = pd.DataFrame()

    y_train = pd.Series(dtype='float64', name=col_y)
#     cols =['PER', 'Price', 'Price_Var']

    if periods>0:
        ref_periods = periods
    else:
        ref_periods = df[df.index>=start_date].shape[0]
        
    for ref_period in range(ref_periods):
        X = gen_features(df,cols, ref_period, band)
        X_train = X_train.append(X, ignore_index=True)
        y_train = y_train.append(pd.Series(df[col_y][ref_period], index=[df.index[ref_period]], name=col_y))

    X_train = X_train.rename({0:'Date'}, axis='columns')
    X_train = X_train.set_index('Date')
    return X_train, y_train


# In[59]:


start_date = '1950-01-01' # set your own date
band=60 # how many months from past to use to generate statistics


# In[60]:


cols_features = ['PER', 'Price', 'Price_Var']
X_train, y_train = generate_dataset(df, cols_features, band, start_date)


# In[61]:


y_train.shape, X_train.shape


# In[62]:


y_train.index


# In[63]:


X_train.index


# In[64]:


dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting
dataset.describe()


# In[65]:


Y=dataset.pop(col_y) # target to predict


# In[66]:


Y.describe()


# In[67]:


X=dataset # features for prediction
X.describe()


# In[68]:


# PREDICTION


# In[69]:


def generate_model(df, cols, start_date, verbosity = False):
    best_score = 0
    best_model = None
    best_band = 0
    test_size = 0.3
    bands = 60
    
    for band in range(12,bands+1,12): # [12,24,36,48,60] months
        X_train, y_train = generate_dataset(df, cols, band, start_date)
        dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting
        Y=dataset.pop(col_y)
        X=dataset
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, shuffle = True)
        model = xgb.XGBRegressor()
        model.fit(X_train,Y_train.values)
        score = model.score(X_test, Y_test)
        if best_score < score:
            best_score = score
            best_model = model
            best_band = band
            X_t = X_test
            Y_t = Y_test
            
#         if verbosity: # verbosit for each band
#             print(score, band)
    if verbosity: # verbosity for best score
        print('retest score:', best_model.score(X_t, Y_t).round(3), best_score.round(3), 'features:', cols, 'X_train.shape:', X_train.shape)# retest best model score
    
    return best_model, best_band, best_score


# In[70]:


start_date = '1970-01-01'
cols_features =['PER', 'Price', 'Price_Var']
model, band, score = generate_model(df, cols_features, start_date, True)


# In[71]:


# model.score(X_t, Y_t), best_score # retest best model score


# In[72]:


df_initial['horizon'] = 'past'
df_initial.loc[np.isnan(df_initial['Price6F']),['horizon']]='future'
df_initial.head(10)


# In[73]:


df_initial['Date']=df_initial.index # add required column
df_initial.head(24)


# In[74]:


df.head()


# In[75]:


periods=120 # predicted periods
df_pred = df_initial.copy()
cols_features =['PER', 'Price', 'Price_Var']
X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods)
X_pred.head() # generated features


# In[76]:


X_pred.shape


# In[77]:


# df_pred = df_pred.iloc[:periods]
# df_pred


# In[78]:


y_pred = model.predict(X_pred).round(2) # readable
# y_pred = model.predict(X_pred)
# y_pred


# In[79]:


df_pred = df_pred.iloc[:periods].copy()
df_pred[col_y+'Pred'] = np.clip(y_pred,0,0.5) # replace outliers


# In[80]:


cols_pred = ['PER', 'Price', 'Price_Corr_6M', 'horizon', 'Price_Corr_6MPred']
df_pred[cols_pred].head(36) # one shoot prediction


# In[81]:


ind_y = df_pred['horizon']=='past'
# ind_y


# In[82]:


model.score(X_pred.loc[ind_y], df_pred.loc[ind_y,col_y]) # score for past - could contain seen data (X_train)


# In[83]:


# ind_y[ind_y==True] # past months for score


# In[84]:


# RANDOM CROSS FOLD
n_times = 10 # build model n times and predict n times
periods = 6 # predicted last months
df_pred = df_initial.copy()
df_pred_iter = df_pred.iloc[:periods].copy()
ls_pc6m_cols=[]
ls_score = []
cols_features =['PER', 'Price', 'Price_Var']
for i in range(n_times):
    start_date = '1970-01-01'
    model, band, score = generate_model(df, cols_features, start_date, True)
    ls_score.append(score.round(3))
    print('score:',score.round(3), 'band:', band)
    
    X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods)
    y_pred = model.predict(X_pred).round(2) # readable
    pc6m_col = 'PC6M'+str(1+i)
    ls_pc6m_cols.append(pc6m_col)
    df_pred_iter[pc6m_col] = y_pred    


# In[85]:


cols_iter = ['PER', 'Price', 'Price_Var', 'horizon']
df_pred_iter=df_pred_iter[cols_iter+ls_pc6m_cols].copy()


# In[86]:


df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions


# In[87]:


# RANDOM CROSS FOLD PREDICTION with ['PER', 'Price', 'Price_Var']
df_pred_iter


# In[88]:


np.mean(ls_score), ls_score # model score mean and model score list


# In[89]:


# Buffet Indicator


# In[90]:


# Get your api_key from FRED
# https://fred.stlouisfed.org/docs/api/api_key.html

fred = Fred(api_key='abcdefghijklmnopqrstuvwxyz123456') # example # get your api_key from FRED 
fred = Fred(api_key='cba3e183577c1edbc4f2f731f82ae371')


# In[91]:


# wilshire = fred.get_series('WILL5000PRFC')


# In[92]:


gdp_data = fred.get_series_latest_release('GDP')
gdp_data


# In[93]:


# Convert from quarter to day with interpolate
idx = pd.date_range(gdp_data.index.min(), gdp_data.index.max()) 
gdp_data = gdp_data.reindex(idx, fill_value=np.nan).copy()
gdp_data


# In[94]:


gdp_data = pd.to_numeric(gdp_data, errors='coerce').astype('float64').copy()
gdp_data.interpolate(inplace=True)
gdp_data


# In[95]:


# %matplotlib inline
gdp_data.plot.line()


# In[96]:


wilshire = fred.get_series('WILL5000PRFC')


# In[97]:


idx = pd.date_range(wilshire.index.min(), wilshire.index.max())
wilshire = wilshire.reindex(idx, fill_value=np.nan).copy()
wilshire


# In[98]:


wilshire.interpolate(inplace=True)
wilshire


# In[99]:


wilshire.plot.line()


# In[100]:


gdp_data.name='GDP'
wilshire.name='Wilshire 5000'
bf_ind = pd.concat([gdp_data, wilshire], axis=1) # Buffet Indicator
bf_ind


# In[101]:


bf_ind.dropna(subset=['Wilshire 5000'], inplace=True) # drop missing GDP
bf_ind


# In[102]:


bf_ind.interpolate(method ='spline',order=2, inplace=True)


# In[103]:


col_bf_ind='Buffet Indicator'
bf_ind[col_bf_ind] =  bf_ind['Wilshire 5000'] / bf_ind['GDP'] 
bf_ind


# In[104]:


# https://www.longtermtrends.net/market-cap-to-gdp-the-buffett-indicator/
bf_ind[col_bf_ind].plot.line() # compare with chart from above link


# In[105]:


bf_ind.index.min(), bf_ind.index.max()


# In[106]:


df_initial


# In[107]:


df_pred = df_initial.copy()
df_pred = pd.concat([df_pred, bf_ind], axis=1)
df_pred.sort_index(ascending=False, inplace=True)


# In[108]:


cols_iter = ['Date','PER', 'Price', 'Price_Var', 'horizon', 'Buffet Indicator', 'Price_Corr_6M']
df_pred[col_bf_ind] = df_pred[col_bf_ind].interpolate(method='linear', limit_direction='backward')
df_pred = df_pred[cols_iter].copy()
df_pred.dropna(subset=['horizon', 'Buffet Indicator'], inplace=True)
df_pred


# In[109]:


cols_train=['Date','PER', 'Price', 'Price_Var', 'Buffet Indicator', 'Price_Corr_6M']
df= df_pred[cols_train].copy()
df.dropna(inplace=True)
df


# In[110]:


# RANDOM CROSS FOLD for Bootstrapping
ls_cols_features=['PER', 'Price', 'Price_Var','Buffet Indicator'] # test with one by one feature
ls_cols_features.append(['PER','Buffet Indicator'])  # test with set of features
ls_cols_features.append(['PER', 'Price','Buffet Indicator'])  # test with set of features
start_date = df.index[-periods]

for e in ls_cols_features:
    n_times = 10 # build model n times and predict n times
    periods = 6 # predicted last months
    # df_pred = df_initial.copy()
    df_pred_iter = df_pred.iloc[:periods].copy()
    ls_pc6m_cols=[]
    ls_score = []
    cols_features = [e]
    if isinstance(e, list):
        cols_features = e # set of features
    else:
        cols_features = [e] # one feature
    
    for i in range(n_times):
    #     start_date = '1974-01-01'
        model, band, score = generate_model(df, cols_features, start_date, True)
        score = score.round(3)
        ls_score.append(score)
        print('score:',score, 'band:', band)

        X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods)
        y_pred = model.predict(X_pred).round(2) # recols_features, adable
        y_pred = np.clip(y_pred,0,0.5) # replace outliers
        pc6m_col = 'PC6M'+str(1+i)
        ls_pc6m_cols.append(pc6m_col)
        df_pred_iter[pc6m_col] = y_pred  

    # RANDOM CROSS FOLD PREDICTION on PER and BUFFET INDICATOR
    df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions
    display(df_pred_iter.loc[:,df_pred_iter.columns!=col_y])
    print('scores:', np.mean(ls_score).round(3), ls_score) # model score mean and model score list)


# In[111]:


#