!pip3 install ta
Collecting ta Downloading https://files.pythonhosted.org/packages/90/ec/e4f5aea8c7f0f55f92b52ffbafa389ea82f3a10d9cab2760e40af34c5b3f/ta-0.5.25.tar.gz Requirement already satisfied: numpy in /usr/local/lib/python3.7/site-packages (from ta) (1.18.0) Requirement already satisfied: pandas in /usr/local/lib/python3.7/site-packages (from ta) (1.0.3) Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/site-packages (from pandas->ta) (2019.3) Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.7/site-packages (from pandas->ta) (2.8.1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->ta) (1.13.0) Building wheels for collected packages: ta Building wheel for ta (setup.py) ... done Created wheel for ta: filename=ta-0.5.25-cp37-none-any.whl size=24879 sha256=7f53b78a9fc5a542b536b1bd6fd928280409e9e22a683b29a49a901add30898a Stored in directory: /Users/enzoampil/Library/Caches/pip/wheels/2e/93/b7/cf649194508e53cee4145ffb949e9f26877a5a8dd12db9ed5b Successfully built ta Installing collected packages: ta Successfully installed ta-0.5.25
from ta import add_all_ta_features
from ta.utils import dropna
%matplotlib inline
import matplotlib.pylab as plt
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression
from fastquant import get_crypto_data
import numpy as np
df = get_crypto_data("BTC/USDT", "2019-05-13", "2020-08-23").reset_index().reset_index()
df
index | dt | open | high | low | close | volume | |
---|---|---|---|---|---|---|---|
0 | 0 | 2019-05-13 | 6968.24 | 8100.00 | 6870.00 | 7790.71 | 85804.735333 |
1 | 1 | 2019-05-14 | 7795.62 | 8366.00 | 7599.56 | 7947.56 | 76583.722603 |
2 | 2 | 2019-05-15 | 7945.26 | 8249.00 | 7850.00 | 8169.87 | 37884.327211 |
3 | 3 | 2019-05-16 | 8169.08 | 8320.00 | 7705.00 | 7866.59 | 69630.513996 |
4 | 4 | 2019-05-17 | 7868.67 | 7925.00 | 6913.00 | 7355.26 | 88752.008159 |
... | ... | ... | ... | ... | ... | ... | ... |
464 | 464 | 2020-08-19 | 11945.10 | 12020.08 | 11561.00 | 11754.59 | 73940.169606 |
465 | 465 | 2020-08-20 | 11754.38 | 11888.00 | 11668.00 | 11853.55 | 46085.254351 |
466 | 466 | 2020-08-21 | 11853.54 | 11878.00 | 11485.81 | 11531.34 | 64448.306142 |
467 | 467 | 2020-08-22 | 11531.23 | 11686.00 | 11376.81 | 11662.96 | 43678.701646 |
468 | 468 | 2020-08-23 | 11663.51 | 11718.07 | 11514.13 | 11648.13 | 37900.004690 |
469 rows × 7 columns
X = add_all_ta_features(
df, open="open", high="high", low="low", close="close", volume="volume")
X.mean().sort_values()
volume_sma_em -2.291602e+06 volume_em -1.486544e+06 momentum_wr -4.675613e+01 trend_dpo -1.208690e+01 trend_psar_up_indicator 4.051173e-02 ... volatility_bbh 1.002699e+04 volume 5.965989e+04 volume_fi 5.500383e+05 volume_obv 5.638757e+05 volume_adi 1.153206e+06 Length: 78, dtype: float64
X.shape, df.shape
((469, 79), (469, 79))
X = X.iloc[: -1]
X["pct_change"] = df.close.pct_change()
X["pct_change_lag1"] = X["pct_change"].shift()
X = X.fillna(-1)
y = df.close.pct_change().shift(-1).iloc[:-1]
del X["dt"]
X.shape, y.shape
((468, 80), (468,))
y.isna().sum()
0
X.columns
Index(['index', 'open', 'high', 'low', 'close', 'volume', 'volume_adi', 'volume_obv', 'volume_cmf', 'volume_fi', 'momentum_mfi', 'volume_em', 'volume_sma_em', 'volume_vpt', 'volume_nvi', 'volume_vwap', 'volatility_atr', 'volatility_bbm', 'volatility_bbh', 'volatility_bbl', 'volatility_bbw', 'volatility_bbp', 'volatility_bbhi', 'volatility_bbli', 'volatility_kcc', 'volatility_kch', 'volatility_kcl', 'volatility_kcw', 'volatility_kcp', 'volatility_kchi', 'volatility_kcli', 'volatility_dcl', 'volatility_dch', 'trend_macd', 'trend_macd_signal', 'trend_macd_diff', 'trend_sma_fast', 'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow', 'trend_adx', 'trend_adx_pos', 'trend_adx_neg', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff', 'trend_trix', 'trend_mass_index', 'trend_cci', 'trend_dpo', 'trend_kst', 'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_conv', 'trend_ichimoku_base', 'trend_ichimoku_a', 'trend_ichimoku_b', 'trend_visual_ichimoku_a', 'trend_visual_ichimoku_b', 'trend_aroon_up', 'trend_aroon_down', 'trend_aroon_ind', 'trend_psar_up', 'trend_psar_down', 'trend_psar_up_indicator', 'trend_psar_down_indicator', 'momentum_rsi', 'momentum_tsi', 'momentum_uo', 'momentum_stoch', 'momentum_stoch_signal', 'momentum_wr', 'momentum_ao', 'momentum_kama', 'momentum_roc', 'others_dr', 'others_dlr', 'others_cr', 'pct_change', 'pct_change_lag1'], dtype='object')
Let's train a boosted decision tree on the filtered as well as the full set of extracted features.
X_full_train, X_full_test, y_train, y_test = X.iloc[:-50], X.iloc[-50:], y.iloc[:-50], y.iloc[-50:]
#X_filtered_train, X_filtered_test = X_full_train[X_filtered.columns], X_full_test[X_filtered.columns]
X_filtered_train, X_filtered_test = X_full_train, X_full_test
y_train, X_full_train
(0 0.020133 1 0.027972 2 -0.037122 3 -0.065000 4 -0.013298 ... 413 -0.005875 414 0.010226 415 -0.015756 416 -0.003112 417 0.008523 Name: close, Length: 418, dtype: float64, index open high low close volume volume_adi \ 0 0 6968.24 8100.00 6870.00 7790.71 85804.735333 4.265263e+04 1 1 7795.62 8366.00 7599.56 7947.56 76583.722603 3.561417e+04 2 2 7945.26 8249.00 7850.00 8169.87 37884.327211 5.847199e+04 3 3 8169.08 8320.00 7705.00 7866.59 69630.513996 2.543203e+04 4 4 7868.67 7925.00 6913.00 7355.26 88752.008159 1.425209e+04 .. ... ... ... ... ... ... ... 413 413 9116.16 9238.00 9024.67 9192.56 42120.293261 2.249072e+06 414 414 9192.93 9205.00 9064.89 9138.55 31463.162801 2.250691e+06 415 415 9138.08 9292.00 9080.10 9232.00 38488.528699 2.267384e+06 416 416 9231.99 9261.96 8940.00 9086.54 45725.168076 2.263282e+06 417 417 9086.54 9125.00 9037.47 9058.26 28943.420177 2.248088e+06 volume_obv volume_cmf volume_fi ... momentum_stoch_signal \ 0 8.580474e+04 -1.000000 -1.000000e+00 ... -1.000000 1 1.623885e+05 -1.000000 -1.000000e+00 ... -1.000000 2 2.002728e+05 -1.000000 -1.000000e+00 ... -1.000000 3 1.306423e+05 -1.000000 -1.000000e+00 ... -1.000000 4 4.189026e+04 -1.000000 -1.000000e+00 ... -1.000000 .. ... ... ... ... ... 413 1.271013e+06 0.138373 -1.316667e+06 ... 28.930306 414 1.239550e+06 0.129728 -1.371332e+06 ... 33.384724 415 1.278038e+06 0.221278 -6.616059e+05 ... 37.455473 416 1.232313e+06 0.193137 -1.517260e+06 ... 33.723689 417 1.203370e+06 0.153633 -1.417440e+06 ... 30.897571 momentum_wr momentum_ao momentum_kama momentum_roc others_dr \ 0 -1.000000 -1.000000 -1.000000 -1.000000 -13.584355 1 -1.000000 -1.000000 -1.000000 -1.000000 2.013295 2 -1.000000 -1.000000 -1.000000 -1.000000 2.797211 3 -1.000000 -1.000000 -1.000000 -1.000000 -3.712177 4 -1.000000 -1.000000 -1.000000 -1.000000 -6.500021 .. ... ... ... ... ... 413 -62.031679 -338.369441 9352.992996 -2.879831 0.835971 414 -67.734952 -349.778265 9347.375709 -2.639693 -0.587540 415 -57.866948 -341.569588 9346.407205 -0.840259 1.022591 416 -73.227033 -313.513765 9309.077553 -2.910690 -1.575607 417 -76.213305 -297.808647 9274.625812 -2.543710 -0.311230 others_dlr others_cr pct_change pct_change_lag1 0 -1.000000 0.000000 -1.000000 -1.000000 1 1.993297 2.013295 0.020133 -1.000000 2 2.758803 4.866822 0.027972 0.020133 3 -3.782832 0.973981 -0.037122 0.027972 4 -6.720897 -5.589349 -0.065000 -0.037122 .. ... ... ... ... 413 0.832496 17.993867 0.008360 0.011579 414 -0.589273 17.300605 -0.005875 0.008360 415 1.017398 18.500111 0.010226 -0.005875 416 -1.588151 16.633015 -0.015756 0.010226 417 -0.311715 16.270019 -0.003112 -0.015756 [418 rows x 80 columns])
y_train.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x12448fed0>
#regressor_full = RandomForestRegressor()
regressor_full = LinearRegression()
regressor_full.fit(X_full_train, y_train)
#print(classification_report(y_test, classifier_full.predict(X_full_test)))
LinearRegression()
# Out of sample
import pandas as pd
from matplotlib import pyplot as plt
pdf = pd.DataFrame(dict(pred=regressor_full.predict(X_full_test), actual=y_test))
pdf.plot.scatter(0, 1)
#plt.xlim(-0.025, 0.025)
#plt.ylim(-0.15, 0.15)
<matplotlib.axes._subplots.AxesSubplot at 0x121156dd0>
pdf.corr()
pred | actual | |
---|---|---|
pred | 1.000000 | 0.332945 |
actual | 0.332945 | 1.000000 |
# In sample
# The in sample predictions from linear regression are not too overfit, compared to random forest - looks promising
import pandas as pd
from matplotlib import pyplot as plt
pdf = pd.DataFrame(dict(pred=regressor_full.predict(X_full_train), actual=y_train))
pdf.plot.scatter(0, 1)
plt.xlim(-0.1, 0.1)
(-0.1, 0.1)
feat_importance = pd.DataFrame({"importance": regressor_full.feature_importances_, "feat": X_full_train.columns})
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-398-fe84e39d411f> in <module> ----> 1 feat_importance = pd.DataFrame({"importance": regressor_full.feature_importances_, "feat": X_full_train.columns}) AttributeError: 'LinearRegression' object has no attribute 'feature_importances_'
feat_importance
importance | feat | |
---|---|---|
0 | 0.010040 | index |
1 | 0.011414 | open |
2 | 0.022711 | high |
3 | 0.015858 | low |
4 | 0.022280 | close |
... | ... | ... |
75 | 0.008654 | others_dr |
76 | 0.017034 | others_dlr |
77 | 0.021043 | others_cr |
78 | 0.008210 | pct_change |
79 | 0.022445 | pct_change_lag1 |
80 rows × 2 columns
feat_importance.set_index("feat").sort_values(ascending=False, by="importance")
importance | |
---|---|
feat | |
trend_dpo | 0.039516 |
volume_adi | 0.034599 |
trend_cci | 0.030592 |
volume_obv | 0.029656 |
trend_visual_ichimoku_a | 0.028776 |
... | ... |
volatility_bbli | 0.000416 |
trend_psar_down_indicator | 0.000392 |
volatility_kchi | 0.000119 |
volatility_kcli | 0.000078 |
volatility_bbhi | 0.000066 |
80 rows × 1 columns
feat_importance.set_index("feat").sort_values(ascending=True, by="importance").plot.barh(figsize=(10, 15))
plt.title("TA Feature Importance for Predicting BTC returns", fontsize=20)
Text(0.5, 1.0, 'TA Feature Importance for Predicting BTC returns')
Improved corrrelation by 5 percentage points!
top_feats = feat_importance.set_index("feat").importance.sort_values(ascending=False).head(20).index.values
#regressor_top = RandomForestRegressor()
regressor_top = LinearRegression()
regressor_top.fit(X_full_train[top_feats], y_train)
LinearRegression()
# Out of sample
import pandas as pd
from matplotlib import pyplot as plt
pdf = pd.DataFrame(dict(pred=regressor_top.predict(X_full_test[top_feats]), actual=y_test))
pdf['pos_pred'] = pdf.pred.gt(0)
pdf['pos_actual'] = pdf.actual.gt(0)
pdf.plot.scatter(0, 1)
#plt.xlim(-0.1, 0.1)
#plt.ylim(-0.10, 0.10)
<matplotlib.axes._subplots.AxesSubplot at 0x129e1e890>
pdf[["pred", "actual"]].corr()
pred | actual | |
---|---|---|
pred | 1.000000 | 0.217987 |
actual | 0.217987 | 1.000000 |
np.sqrt(pdf.actual.subtract(pdf.pred).pow(2).mean())
0.06142693380160941
# Overall accuracy
(pdf['pos_pred'] == pdf['pos_actual']).mean()
0.55
pdf['pos_actual'].mean()
0.56
#import statsmodels.regression.linear_model as sm
import statsmodels.api as sm
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(0)
comb = pdf[["pred", "actual"]].copy()
comb = comb.dropna()
X = pdf[["pred"]]
Y = pdf[["actual"]]
model2 = sm.OLS(Y,sm.add_constant(X), data=comb)
model_fit = model2.fit()
print(model_fit.summary())
#Plot
pdf[["pred", "actual"]].plot(kind='scatter', x="pred", y="actual")
#plt.ylim(-0.2, 0.2)
#plt.xlim(-0.2, 0.2)
#Seaborn
sns.lmplot(x="pred", y="actual", data=pdf)
plt.ylim(-0.1, 0.2)
#plt.xlim(-0.2, 0.2)
OLS Regression Results ============================================================================== Dep. Variable: actual R-squared: 0.111 Model: OLS Adj. R-squared: 0.102 Method: Least Squares F-statistic: 12.22 Date: Mon, 24 Aug 2020 Prob (F-statistic): 0.000713 Time: 15:53:08 Log-Likelihood: 146.11 No. Observations: 100 AIC: -288.2 Df Residuals: 98 BIC: -283.0 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -0.0118 0.007 -1.680 0.096 -0.026 0.002 pred 0.1393 0.040 3.495 0.001 0.060 0.218 ============================================================================== Omnibus: 102.398 Durbin-Watson: 2.133 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1986.483 Skew: -3.128 Prob(JB): 0.00 Kurtosis: 23.919 Cond. No. 7.11 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
(-0.1, 0.2)
Looks like adding lag features made it worse
df = get_crypto_data("BTC/USDT", "2019-01-01", "2020-08-23").reset_index().reset_index()
X = add_all_ta_features(
df, open="open", high="high", low="low", close="close", volume="volume")
X = X.iloc[: -1]
X["pct_change"] = df.close.pct_change()
X["pct_change_lag1"] = X["pct_change"].shift()
X = X.fillna(-1)
y = df.close.pct_change().shift(-1).iloc[:-1]
if "dt" in X.columns:
del X["dt"]
X_1 = X.shift().fillna(-1)
X_1.columns = [c + "_1" for c in X.columns]
X_comb = pd.concat([X, X_1], axis=1)
X_full_train, X_full_test, y_train, y_test = X_comb.iloc[:-100], X_comb.iloc[-100:], y.iloc[:-100], y.iloc[-100:]
#X_filtered_train, X_filtered_test = X_full_train[X_filtered.columns], X_full_test[X_filtered.columns]
X_filtered_train, X_filtered_test = X_full_train, X_full_test
regressor_full = LinearRegression()
#regressor_full = RandomForestRegressor()
regressor_full.fit(X_full_train, y_train)
LinearRegression()
# Out of sample
import pandas as pd
from matplotlib import pyplot as plt
pdf = pd.DataFrame(dict(pred=regressor_full.predict(X_full_test), actual=y_test))
pdf.plot.scatter(0, 1)
plt.xlim(-0.1, 0.2)
plt.ylim(-0.15, 0.20)
(-0.15, 0.2)
pdf.corr()
pred | actual | |
---|---|---|
pred | 1.000000 | 0.332945 |
actual | 0.332945 | 1.000000 |
feat_importance = pd.DataFrame({"importance": regressor_full.feature_importances_, "feat": X_full_train.columns})
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-418-fe84e39d411f> in <module> ----> 1 feat_importance = pd.DataFrame({"importance": regressor_full.feature_importances_, "feat": X_full_train.columns}) AttributeError: 'LinearRegression' object has no attribute 'feature_importances_'
feat_importance.shape
(160, 2)
feat_importance.set_index("feat").sort_values(ascending=True, by="importance").plot.barh(figsize=(15, 30))
plt.title("TA Feature Importance for Predicting BTC returns", fontsize=20)
Text(0.5, 1.0, 'TA Feature Importance for Predicting BTC returns')
top_feats = feat_importance.set_index("feat").importance.sort_values(ascending=False).head(50).index.values
regressor_top = RandomForestRegressor()
regressor_top.fit(X_full_train[top_feats], y_train)
RandomForestRegressor()
# Out of sample
import pandas as pd
from matplotlib import pyplot as plt
pdf = pd.DataFrame(dict(pred=regressor_top.predict(X_full_test[top_feats]), actual=y_test))
pdf.plot.scatter(0, 1)
plt.xlim(-0.1, 0.1)
plt.ylim(-0.15, 0.15)
(-0.15, 0.15)
pdf.corr()
pred | actual | |
---|---|---|
pred | 1.00000 | 0.04023 |
actual | 0.04023 | 1.00000 |
y.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x12648c3d0>
# One approach would be to specify as a ternary classification problem
#
y.describe()
count 499.000000 mean 0.002803 std 0.041368 min -0.395048 25% -0.013306 50% 0.001246 75% 0.018538 max 0.171968 Name: close, dtype: float64