#!/usr/bin/env python # coding: utf-8 # In[1]: # Created by: Sergiu Iatco / 2021.10.01 # https://github.com/itsergiu/Predict-S-P-500-correction-with-Shiller-PE-Ratio # How to predict S&P 500 correction # Nadeem Walayat - The Market Oracle # http://www.marketoracle.co.uk/Article69423.html # Vitaliy Katsenelson # https://contrarianedge.com/sideways-market/ # https://www.marketwatch.com/story/market-analysts-cant-agree-on-where-stocks-are-going-next-so-double-check-the-data-before-you-buy-or-sell-11632447577 # Purpose: to build a machine learning model to predict S&P 500 correction within next 6 months # Disclaimer: # # Author do not assume and hereby disclaim any liability to any party for any loss, damage, or disruption caused by errors or omissions, whether such errors or omissions result from negligence, accident, or any other cause. The software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. # # The below is a matter of opinion provided for general information purposes only and is not intended as investment advice. Information and analysis above are derived from sources and utilising methods believed to be reliable, but I cannot accept responsibility for any trading losses you may incur as a result of this analysis. Individuals should consult with their personal financial advisors before engaging in any trading activities. # In[2]: # !pip install pandas # !pip install dateparser # !pip install xgboost # !pip install sklearn # !pip install numpy # !pip install plotly_express # !pip install fredapi # In[3]: import pandas as pd # import dateparser import xgboost as xgb from sklearn import model_selection from sklearn.metrics import r2_score import numpy as np from fredapi import Fred import plotly.express as px from IPython.display import display # In[4]: url_per='https://www.multpl.com/shiller-pe/table/by-month' # In[5]: ls_tables = pd.read_html(url_per) # In[6]: type(ls_tables) # In[7]: len(ls_tables) # In[8]: ls_tables[0] # In[9]: df_per=ls_tables[0] # In[10]: df_per.dtypes # In[11]: cols = df_per.columns.tolist() cols # In[12]: df_per.rename( columns=({ cols[1]: 'PER'}), inplace=True,) # In[13]: df_per.head() # In[14]: url_sp='https://www.multpl.com/s-p-500-historical-prices/table/by-month' # In[15]: ls_tables = pd.read_html(url_sp) # In[16]: len(ls_tables) # In[17]: df_sp=ls_tables[0] df_sp.head() # In[18]: cols = df_sp.columns.tolist() cols # In[19]: df_sp.rename( columns=({ cols[1]: 'Price'}), inplace=True,) # In[20]: df_sp.head() # In[21]: df_per.index, df_sp.index # In[22]: df = df_per.merge(df_sp, left_on='Date', right_on='Date') df # In[23]: df.dtypes # In[24]: df['Date']=pd.to_datetime(df['Date']) df # In[25]: df.head() # In[26]: df.set_index(['Date'], inplace=True) df['Date']=df.index df # In[27]: # https://pandas.pydata.org/docs/reference/api/pandas.melt.html df_per = pd.melt(df, id_vars=['Date'], value_vars=['PER']) px.line(df_per, x='Date', y='value', color='variable') # In[28]: df.describe() # In[29]: df.dtypes # In[30]: ls_price_p = ['Price'+str(i)+'F' for i in range(7)] ls_price_p # In[31]: ls_price_p[1:] # In[32]: ls_price_p[0] # In[33]: df.columns.tolist() # In[34]: df.head(10) # In[35]: df.tail(10) # In[36]: for i,e in enumerate(ls_price_p): df[e]=df.Price.shift(i) # In[37]: df['Price1P'] = df['Price'].shift(-1) df['Price_Var']=df['Price']/df['Price1P']-1 # In[38]: df.head(10) # In[39]: df['PriceFmin']=df[ls_price_p].min(axis=1) # In[40]: df.head(10) # In[41]: col_y = 'Price_Corr_6M' df[col_y]=1-df['PriceFmin']/df['Price0F'] # In[42]: df.head(10) # In[43]: df[df.isna().any(axis=1)] # In[44]: df.loc[df.isna().any(axis=1),col_y]=np.nan # set to zero corrections with Nan rows df[df.isna().any(axis=1)] # In[45]: df_initial=df.copy() # In[46]: df_initial.head() # In[47]: df.dropna(axis=0, how='any', inplace=True) # remove NaN rowss # In[48]: df[df[col_y]>0].describe() # In[49]: last_months=240 df_per = pd.melt(df.head(last_months), id_vars='Date', value_vars=['Price']) px.line(df_per, x='Date', y='value', color='variable') # In[50]: # min_correction=0.15 min_correction=0.15 # take a look at big corrections df[df[col_y]>min_correction].head(10) # In[51]: df[df[col_y]>min_correction][col_y].describe() # In[52]: # plot PER & Price_correction # In[53]: min_correction=0.05 # force learning above ? # min_correction=0 if min_correction!=0: df[col_y] = df[col_y].apply(lambda x: 0 if x0: ref_periods = periods else: ref_periods = df[df.index>=start_date].shape[0] for ref_period in range(ref_periods): X = gen_features(df,cols, ref_period, band) X_train = X_train.append(X, ignore_index=True) y_train = y_train.append(pd.Series(df[col_y][ref_period], index=[df.index[ref_period]], name=col_y)) X_train = X_train.rename({0:'Date'}, axis='columns') X_train = X_train.set_index('Date') return X_train, y_train # In[59]: start_date = '1950-01-01' # set your own date band=60 # how many months from past to use to generate statistics # In[60]: cols_features = ['PER', 'Price', 'Price_Var'] X_train, y_train = generate_dataset(df, cols_features, band, start_date) # In[61]: y_train.shape, X_train.shape # In[62]: y_train.index # In[63]: X_train.index # In[64]: dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting dataset.describe() # In[65]: Y=dataset.pop(col_y) # target to predict # In[66]: Y.describe() # In[67]: X=dataset # features for prediction X.describe() # In[68]: # PREDICTION # In[69]: def generate_model(df, cols, start_date, verbosity = False): best_score = 0 best_model = None best_band = 0 test_size = 0.3 bands = 60 for band in range(12,bands+1,12): # [12,24,36,48,60] months X_train, y_train = generate_dataset(df, cols, band, start_date) dataset = pd.concat([y_train, X_train], axis=1, join="inner") # align to avoid random shifting Y=dataset.pop(col_y) X=dataset X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, shuffle = True) model = xgb.XGBRegressor() model.fit(X_train,Y_train.values) score = model.score(X_test, Y_test) if best_score < score: best_score = score best_model = model best_band = band X_t = X_test Y_t = Y_test # if verbosity: # verbosit for each band # print(score, band) if verbosity: # verbosity for best score print('retest score:', best_model.score(X_t, Y_t).round(3), best_score.round(3), 'features:', cols, 'X_train.shape:', X_train.shape)# retest best model score return best_model, best_band, best_score # In[70]: start_date = '1970-01-01' cols_features =['PER', 'Price', 'Price_Var'] model, band, score = generate_model(df, cols_features, start_date, True) # In[71]: # model.score(X_t, Y_t), best_score # retest best model score # In[72]: df_initial['horizon'] = 'past' df_initial.loc[np.isnan(df_initial['Price6F']),['horizon']]='future' df_initial.head(10) # In[73]: df_initial['Date']=df_initial.index # add required column df_initial.head(24) # In[74]: df.head() # In[75]: periods=120 # predicted periods df_pred = df_initial.copy() cols_features =['PER', 'Price', 'Price_Var'] X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) X_pred.head() # generated features # In[76]: X_pred.shape # In[77]: # df_pred = df_pred.iloc[:periods] # df_pred # In[78]: y_pred = model.predict(X_pred).round(2) # readable # y_pred = model.predict(X_pred) # y_pred # In[79]: df_pred = df_pred.iloc[:periods].copy() df_pred[col_y+'Pred'] = np.clip(y_pred,0,0.5) # replace outliers # In[80]: cols_pred = ['PER', 'Price', 'Price_Corr_6M', 'horizon', 'Price_Corr_6MPred'] df_pred[cols_pred].head(36) # one shoot prediction # In[81]: ind_y = df_pred['horizon']=='past' # ind_y # In[82]: model.score(X_pred.loc[ind_y], df_pred.loc[ind_y,col_y]) # score for past - could contain seen data (X_train) # In[83]: # ind_y[ind_y==True] # past months for score # In[84]: # RANDOM CROSS FOLD n_times = 10 # build model n times and predict n times periods = 6 # predicted last months df_pred = df_initial.copy() df_pred_iter = df_pred.iloc[:periods].copy() ls_pc6m_cols=[] ls_score = [] cols_features =['PER', 'Price', 'Price_Var'] for i in range(n_times): start_date = '1970-01-01' model, band, score = generate_model(df, cols_features, start_date, True) ls_score.append(score.round(3)) print('score:',score.round(3), 'band:', band) X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) y_pred = model.predict(X_pred).round(2) # readable pc6m_col = 'PC6M'+str(1+i) ls_pc6m_cols.append(pc6m_col) df_pred_iter[pc6m_col] = y_pred # In[85]: cols_iter = ['PER', 'Price', 'Price_Var', 'horizon'] df_pred_iter=df_pred_iter[cols_iter+ls_pc6m_cols].copy() # In[86]: df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions # In[87]: # RANDOM CROSS FOLD PREDICTION with ['PER', 'Price', 'Price_Var'] df_pred_iter # In[88]: np.mean(ls_score), ls_score # model score mean and model score list # In[89]: # Buffet Indicator # In[90]: # Get your api_key from FRED # https://fred.stlouisfed.org/docs/api/api_key.html # fred = Fred(api_key='abcdefghijklmnopqrstuvwxyz123456') # example # get your api_key from FRED fred = Fred(api_key='your_api_key') # In[91]: # wilshire = fred.get_series('WILL5000PRFC') # In[92]: gdp_data = fred.get_series_latest_release('GDP') gdp_data # In[93]: # Convert from quarter to day with interpolate idx = pd.date_range(gdp_data.index.min(), gdp_data.index.max()) gdp_data = gdp_data.reindex(idx, fill_value=np.nan).copy() gdp_data # In[94]: gdp_data = pd.to_numeric(gdp_data, errors='coerce').astype('float64').copy() gdp_data.interpolate(inplace=True) gdp_data # In[95]: # %matplotlib inline gdp_data.plot.line() # In[96]: wilshire = fred.get_series('WILL5000PRFC') # In[97]: idx = pd.date_range(wilshire.index.min(), wilshire.index.max()) wilshire = wilshire.reindex(idx, fill_value=np.nan).copy() wilshire # In[98]: wilshire.interpolate(inplace=True) wilshire # In[99]: wilshire.plot.line() # In[100]: gdp_data.name='GDP' wilshire.name='Wilshire 5000' bf_ind = pd.concat([gdp_data, wilshire], axis=1) # Buffet Indicator bf_ind # In[101]: bf_ind.dropna(subset=['Wilshire 5000'], inplace=True) # drop missing GDP bf_ind # In[102]: bf_ind.interpolate(method ='spline',order=2, inplace=True) # In[103]: col_bf_ind='Buffet Indicator' bf_ind[col_bf_ind] = bf_ind['Wilshire 5000'] / bf_ind['GDP'] bf_ind # In[104]: # https://www.longtermtrends.net/market-cap-to-gdp-the-buffett-indicator/ bf_ind[col_bf_ind].plot.line() # compare with chart from above link # In[105]: bf_ind.index.min(), bf_ind.index.max() # In[106]: df_initial # In[107]: df_pred = df_initial.copy() df_pred = pd.concat([df_pred, bf_ind], axis=1) df_pred.sort_index(ascending=False, inplace=True) # In[108]: cols_iter = ['Date','PER', 'Price', 'Price_Var', 'horizon', 'Buffet Indicator', 'Price_Corr_6M'] df_pred[col_bf_ind] = df_pred[col_bf_ind].interpolate(method='linear', limit_direction='backward') df_pred = df_pred[cols_iter].copy() df_pred.dropna(subset=['horizon', 'Buffet Indicator'], inplace=True) df_pred # In[109]: cols_train=['Date','PER', 'Price', 'Price_Var', 'Buffet Indicator', 'Price_Corr_6M'] df= df_pred[cols_train].copy() df.dropna(inplace=True) df # In[110]: # RANDOM CROSS FOLD for Bootstrapping ls_cols_features=['PER', 'Price', 'Price_Var','Buffet Indicator'] # test with one by one feature ls_cols_features.append(['PER','Buffet Indicator']) # test with set of features ls_cols_features.append(['PER', 'Price','Buffet Indicator']) # test with set of features start_date = df.index[-periods] for e in ls_cols_features: n_times = 10 # build model n times and predict n times periods = 6 # predicted last months # df_pred = df_initial.copy() df_pred_iter = df_pred.iloc[:periods].copy() ls_pc6m_cols=[] ls_score = [] cols_features = [e] if isinstance(e, list): cols_features = e # set of features else: cols_features = [e] # one feature for i in range(n_times): # start_date = '1974-01-01' model, band, score = generate_model(df, cols_features, start_date, True) score = score.round(3) ls_score.append(score) print('score:',score, 'band:', band) X_pred, y_dummy = generate_dataset(df_pred, cols_features, band, periods=periods) y_pred = model.predict(X_pred).round(2) # recols_features, adable y_pred = np.clip(y_pred,0,0.5) # replace outliers pc6m_col = 'PC6M'+str(1+i) ls_pc6m_cols.append(pc6m_col) df_pred_iter[pc6m_col] = y_pred # RANDOM CROSS FOLD PREDICTION on PER and BUFFET INDICATOR df_pred_iter['PC6M_AVG']=df_pred_iter[ls_pc6m_cols].mean(axis=1) # average of all predictions display(df_pred_iter.loc[:,df_pred_iter.columns!=col_y]) print('scores:', np.mean(ls_score).round(3), ls_score) # model score mean and model score list) # In[111]: #