#!/usr/bin/env python # coding: utf-8 # In[1]: # Created by: Sergiu Iatco / 2021.10.01 # https://github.com/itsergiu/Predict-S-P-500-correction-with-Shiller-PE-Ratio # How to predict S&P 500 correction # Nadeem Walayat - The Market Oracle # http://www.marketoracle.co.uk/Article69423.html # Vitaliy Katsenelson # https://contrarianedge.com/sideways-market/ # https://www.marketwatch.com/story/market-analysts-cant-agree-on-where-stocks-are-going-next-so-double-check-the-data-before-you-buy-or-sell-11632447577 # Purpose: to build a machine learning model to predict S&P 500 correction within next 6 months # In[2]: # !pip install pandas # !pip install dateparser # !pip install xgboost # !pip install sklearn # !pip install numpy # !pip install plotly_express # In[3]: import pandas as pd # import dateparser import xgboost as xgb from sklearn import model_selection from sklearn.metrics import r2_score import numpy as np from fredapi import Fred import plotly.express as px from IPython.display import display # In[4]: url_per='https://www.multpl.com/shiller-pe/table/by-month' # In[5]: ls_tables = pd.read_html(url_per) # In[6]: type(ls_tables) # In[7]: len(ls_tables) # In[8]: ls_tables[0] # In[9]: df_per=ls_tables[0] # In[10]: df_per.dtypes # In[11]: cols = df_per.columns.tolist() cols # In[12]: df_per.rename( columns=({ cols[1]: 'PER'}), inplace=True,) # In[13]: df_per.head() # In[14]: url_sp='https://www.multpl.com/s-p-500-historical-prices/table/by-month' # In[15]: ls_tables = pd.read_html(url_sp) # In[16]: len(ls_tables) # In[17]: df_sp=ls_tables[0] df_sp.head() # In[18]: cols = df_sp.columns.tolist() cols # In[19]: df_sp.rename( columns=({ cols[1]: 'Price'}), inplace=True,) # In[20]: df_sp.head() # In[21]: df_per.index, df_sp.index # In[22]: df = df_per.merge(df_sp, left_on='Date', right_on='Date') df # In[23]: df.dtypes # In[24]: df['Date']=pd.to_datetime(df['Date']) df # In[25]: df.head() # In[26]: df.set_index(['Date'], inplace=True) df['Date']=df.index df # In[27]: # https://pandas.pydata.org/docs/reference/api/pandas.melt.html df_per = pd.melt(df, id_vars=['Date'], value_vars=['PER']) px.line(df_per, x='Date', y='value', color='variable') # In[28]: df.describe() # In[29]: df.dtypes # In[30]: ls_price_p = ['Price'+str(i)+'F' for i in range(7)] ls_price_p # In[31]: ls_price_p[1:] # In[32]: ls_price_p[0] # In[33]: df.columns.tolist() # In[34]: df.head(10) # In[35]: df.tail(10) # In[36]: for i,e in enumerate(ls_price_p): df[e]=df.Price.shift(i) # In[37]: df['Price1P'] = df['Price'].shift(-1) df['Price_Var']=df['Price']/df['Price1P']-1 # In[38]: df.head(10) # In[39]: df['PriceFmin']=df[ls_price_p].min(axis=1) # In[40]: df.head(10) # In[41]: col_y = 'Price_Corr_6M' df[col_y]=1-df['PriceFmin']/df['Price0F'] # In[42]: df.head(10) # In[43]: df[df.isna().any(axis=1)] # In[44]: df.loc[df.isna().any(axis=1),col_y]=np.nan # set to zero corrections with Nan rows df[df.isna().any(axis=1)] # In[45]: df_initial=df.copy() # In[46]: df_initial.head() # In[47]: df.dropna(axis=0, how='any', inplace=True) # remove NaN rowss # In[48]: df[df[col_y]>0].describe() # In[49]: last_months=240 df_per = pd.melt(df.head(last_months), id_vars='Date', value_vars=['Price']) px.line(df_per, x='Date', y='value', color='variable') # In[50]: # min_correction=0.15 min_correction=0.15 # take a look at big corrections df[df[col_y]>min_correction].head(10) # In[51]: df[df[col_y]>min_correction][col_y].describe() # In[52]: # plot PER & Price_correction # In[53]: min_correction=0.05 # force learning above ? # min_correction=0 if min_correction!=0: df[col_y] = df[col_y].apply(lambda x: 0 if x0: ref_periods = periods else: ref_periods = df[df.index>=start_date].shape[0] for ref_period in range(ref_periods): X = gen_features(df,cols, ref_period, band) X_train = X_train.append(X, ignore_index=True) y_train = y_train.append(pd.Series(df[col_y][ref_period], index=[df.index[ref_period]], name=col_y)) X_train = X_train.rename({0:'Date'}, axis='columns') X_train = X_train.set_index('Date') return X_train, y_train # In[59]: start_date = '1950-01-01' # set your own date band=60 # how many months from past to use to generate statistics # In[60]: cols_features = ['PER', 'Price', 'Price_Var'] X_train, y_train = generate_dataset(df, cols_features, band, start_date) # In[61]: y_train.shape, X_train.shape # In[62]: y_train.index # In[63]: X_train.index # In[64]: dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting dataset.describe() # In[65]: Y=dataset.pop(col_y) # target to predict # In[66]: Y.describe() # In[67]: X=dataset # features for prediction X.describe() # In[68]: # PREDICTION # In[69]: def generate_model(df, cols, start_date, verbosity = False): best_score = 0 best_model = None best_band = 0 test_size = 0.3 bands = 60 for band in range(12,bands+1,12): # [12,24,36,48,60] months X_train, y_train = generate_dataset(df, cols, band, start_date) dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting Y=dataset.pop(col_y) X=dataset X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, shuffle = True) model = xgb.XGBRegressor() model.fit(X_train,Y_train.values) score = model.score(X_test, Y_test) if best_score < score: best_score = score best_model = model best_band = band X_t = X_test Y_t = Y_test # if verbosity: # verbosit for each band # print(score, band) if verbosity: # verbosity for best score print('retest score:', best_model.score(X_t, Y_t).round(3), best_score.round(3), 'features:', cols, 'X_train.shape:', X_train.shape)# retest best model score return best_model, best_band, best_score # In[70]: start_date = '1970-01-01' cols_features =['PER', 'Price', 'Price_Var'] model, band, score = generate_model(df, cols_features, start_date, True) # In[71]: # model.score(X_t, Y_t), best_score # retest best model score # In[72]: df_initial['horizon'] = 'past' df_initial.loc[np.isnan(df_initial['Price6F']),['horizon']]='future' df_initial.head(10) # In[73]: df_initial['Date']=df_initial.index # add required column df_initial.head(24) # In[74]: df.head() # In[75]: periods=120 # predicted periods df_pred = df_initial.copy() cols_features =['PER', 'Price', 'Price_Var'] X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) X_pred.head() # generated features # In[76]: X_pred.shape # In[77]: # df_pred = df_pred.iloc[:periods] # df_pred # In[78]: y_pred = model.predict(X_pred).round(2) # readable # y_pred = model.predict(X_pred) # y_pred # In[79]: df_pred = df_pred.iloc[:periods].copy() df_pred[col_y+'Pred'] = np.clip(y_pred,0,0.5) # replace outliers # In[80]: cols_pred = ['PER', 'Price', 'Price_Corr_6M', 'horizon', 'Price_Corr_6MPred'] df_pred[cols_pred].head(36) # one shoot prediction # In[81]: ind_y = df_pred['horizon']=='past' # ind_y # In[82]: model.score(X_pred.loc[ind_y], df_pred.loc[ind_y,col_y]) # score for past - could contain seen data (X_train) # In[83]: # ind_y[ind_y==True] # past months for score # In[84]: # RANDOM CROSS FOLD n_times = 10 # build model n times and predict n times periods = 6 # predicted last months df_pred = df_initial.copy() df_pred_iter = df_pred.iloc[:periods].copy() ls_pc6m_cols=[] ls_score = [] cols_features =['PER', 'Price', 'Price_Var'] for i in range(n_times): start_date = '1970-01-01' model, band, score = generate_model(df, cols_features, start_date, True) ls_score.append(score.round(3)) print('score:',score.round(3), 'band:', band) X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) y_pred = model.predict(X_pred).round(2) # readable pc6m_col = 'PC6M'+str(1+i) ls_pc6m_cols.append(pc6m_col) df_pred_iter[pc6m_col] = y_pred # In[85]: cols_iter = ['PER', 'Price', 'Price_Var', 'horizon'] df_pred_iter=df_pred_iter[cols_iter+ls_pc6m_cols].copy() # In[86]: df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions # In[87]: # RANDOM CROSS FOLD PREDICTION with ['PER', 'Price', 'Price_Var'] df_pred_iter # In[88]: np.mean(ls_score), ls_score # model score mean and model score list # In[89]: # Buffet Indicator # In[90]: # Get your api_key from FRED # https://fred.stlouisfed.org/docs/api/api_key.html fred = Fred(api_key='abcdefghijklmnopqrstuvwxyz123456') # example # get your api_key from FRED fred = Fred(api_key='cba3e183577c1edbc4f2f731f82ae371') # In[91]: # wilshire = fred.get_series('WILL5000PRFC') # In[92]: gdp_data = fred.get_series_latest_release('GDP') gdp_data # In[93]: # Convert from quarter to day with interpolate idx = pd.date_range(gdp_data.index.min(), gdp_data.index.max()) gdp_data = gdp_data.reindex(idx, fill_value=np.nan).copy() gdp_data # In[94]: gdp_data = pd.to_numeric(gdp_data, errors='coerce').astype('float64').copy() gdp_data.interpolate(inplace=True) gdp_data # In[95]: # %matplotlib inline gdp_data.plot.line() # In[96]: wilshire = fred.get_series('WILL5000PRFC') # In[97]: idx = pd.date_range(wilshire.index.min(), wilshire.index.max()) wilshire = wilshire.reindex(idx, fill_value=np.nan).copy() wilshire # In[98]: wilshire.interpolate(inplace=True) wilshire # In[99]: wilshire.plot.line() # In[100]: gdp_data.name='GDP' wilshire.name='Wilshire 5000' bf_ind = pd.concat([gdp_data, wilshire], axis=1) # Buffet Indicator bf_ind # In[101]: bf_ind.dropna(subset=['Wilshire 5000'], inplace=True) # drop missing GDP bf_ind # In[102]: bf_ind.interpolate(method ='spline',order=2, inplace=True) # In[103]: col_bf_ind='Buffet Indicator' bf_ind[col_bf_ind] = bf_ind['Wilshire 5000'] / bf_ind['GDP'] bf_ind # In[104]: # https://www.longtermtrends.net/market-cap-to-gdp-the-buffett-indicator/ bf_ind[col_bf_ind].plot.line() # compare with chart from above link # In[105]: bf_ind.index.min(), bf_ind.index.max() # In[106]: df_initial # In[107]: df_pred = df_initial.copy() df_pred = pd.concat([df_pred, bf_ind], axis=1) df_pred.sort_index(ascending=False, inplace=True) # In[108]: cols_iter = ['Date','PER', 'Price', 'Price_Var', 'horizon', 'Buffet Indicator', 'Price_Corr_6M'] df_pred[col_bf_ind] = df_pred[col_bf_ind].interpolate(method='linear', limit_direction='backward') df_pred = df_pred[cols_iter].copy() df_pred.dropna(subset=['horizon', 'Buffet Indicator'], inplace=True) df_pred # In[109]: cols_train=['Date','PER', 'Price', 'Price_Var', 'Buffet Indicator', 'Price_Corr_6M'] df= df_pred[cols_train].copy() df.dropna(inplace=True) df # In[110]: # RANDOM CROSS FOLD for Bootstrapping ls_cols_features=['PER', 'Price', 'Price_Var','Buffet Indicator'] # test with one by one feature ls_cols_features.append(['PER','Buffet Indicator']) # test with set of features ls_cols_features.append(['PER', 'Price','Buffet Indicator']) # test with set of features start_date = df.index[-periods] for e in ls_cols_features: n_times = 10 # build model n times and predict n times periods = 6 # predicted last months # df_pred = df_initial.copy() df_pred_iter = df_pred.iloc[:periods].copy() ls_pc6m_cols=[] ls_score = [] cols_features = [e] if isinstance(e, list): cols_features = e # set of features else: cols_features = [e] # one feature for i in range(n_times): # start_date = '1974-01-01' model, band, score = generate_model(df, cols_features, start_date, True) score = score.round(3) ls_score.append(score) print('score:',score, 'band:', band) X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) y_pred = model.predict(X_pred).round(2) # recols_features, adable y_pred = np.clip(y_pred,0,0.5) # replace outliers pc6m_col = 'PC6M'+str(1+i) ls_pc6m_cols.append(pc6m_col) df_pred_iter[pc6m_col] = y_pred # RANDOM CROSS FOLD PREDICTION on PER and BUFFET INDICATOR df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions display(df_pred_iter.loc[:,df_pred_iter.columns!=col_y]) print('scores:', np.mean(ls_score).round(3), ls_score) # model score mean and model score list) # In[111]: #