#!/usr/bin/env python
# coding: utf-8

# # Google stock analysis and predictions
# ## Affinito Alessandro
#
# ### Abstract
#
# We study some tech stock prices through data visualization and financial techniques, focusing on those intended to give a *reliable* forecast, so that brokers have a basis on which to decide when it is the best moment to sell or buy stocks. We first analyze a year of data about some of the biggest companies, such as Amazon, Google, Apple and Microsoft, and then focus on Google stocks.
#
# Next we leave the financial tools for _supervised learning_ analysis. These machine learning processes learn a function from an input type to an output type using data comprising examples. Furthermore we'll talk specifically about _regression_ supervised learning, meaning that we're interested in inferring a real-valued function whose values correspond to the mean of a dependent variable (stock prices).
#
# We first apply linear regression to the last 6 years of Google Trends data for the word 'google', searched specifically in the financial news domain, versus the last 6 years of Google stock prices. From there we switch to a multivariate input, i.e. we use other stock prices (AAPL, MSFT, TWTR, AMZN) to study the accuracy of other algorithms such as a _multivariate linear regression_, an _SVR_ and a _Random Forest_.
#
# ***Keywords:*** Finance, Stock Price Analysis, MACD, Machine Learning, Linear Regression, SVR, Random Forest, Data Visualization, Python, R
#
# ---
#
# ### Index
# 1. Data collection
# 2. Financial analysis
#     1. Moving average
#     2. MACD
#     3. Daily return analysis
#     4. Risk analysis
# 3. Correlation
# 4. Can we see the future?
# 5. Google Trends Simple Linear Regression
# 6. Multivariate LR
# 7. Support Vector Regression
# 8. Random Forest
# 9. Conclusions
# In[2]:

import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
get_ipython().run_line_magic('matplotlib', 'inline')

# For reading stock data from Yahoo or Google
from pandas.io.data import DataReader

# For time stamps
from datetime import datetime

# suppressing warnings
import warnings
warnings.filterwarnings('ignore')


# In[3]:

# interactive plots
import plotly.plotly as py
import cufflinks as cf
import plotly.tools as tls
tls.set_credentials_file(username='affinito', api_key='')

#from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()


# #### Data Retrieving

# In[4]:

# The tech stocks we'll use for this analysis
tech_list = ['AAPL','GOOG','MSFT','AMZN']

# Set up End and Start times for data grab
end = datetime.now()
start = datetime(end.year - 1, end.month, end.day)

# For loop for grabbing the Google Finance data and setting it up as a DataFrame
for stock in tech_list:
    # Set DataFrame as the Stock Ticker
    globals()[stock] = DataReader(stock, 'google', start, end)


# In[182]:

#AAPL.to_csv("files/apple.csv")
GOOG.ix[:10,:-8]


# In[5]:

cf.set_config_file(offline=True, world_readable=True, theme='ggplot')
GOOG['Close'].iplot(kind='scatter', title="Google stocks closing prices", yTitle='Price', theme='white', bestfit=True)


# In[6]:

GOOG['Volume'].iplot(title='Google stocks volume', theme='pearl', yTitle='Volume', bestfit=False)


# ---
# ### Moving Average
# Let's look at the [moving average](http://www.investopedia.com/terms/m/movingaverage.asp) of the Google stock price:
#
# A widely used indicator in technical analysis that helps smooth out price action by filtering out the "noise" from random price fluctuations. A moving average (MA) is a trend-following indicator because it is based on past prices. The two basic and commonly used MAs are the simple moving average (SMA), which is the simple average of a security over a defined number of time periods, and the exponential moving average (EMA), which gives bigger weight to more recent prices.

# In[7]:

mov_avg = [10, 20, 30]
for m in mov_avg:
    GOOG[str(m)+" MA"] = pd.rolling_mean(GOOG['Close'], m)


# In[79]:

GOOG.tail()


# Moving averages versus the real closing value, zoomed in on the last 100 market days.

# In[8]:

GOOG[['Close','10 MA','20 MA','30 MA']][-100:].iplot(theme='white')


# The [exponential moving average](https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average), on the other hand, gives more weight to the latest data, i.e. it reacts faster to recent price changes thanks to weights that decrease exponentially on older data.
#
# So let's have a look at the differences in a 26-day (we'll see later why) window:

# In[9]:

window = 26
GOOG[str(window)+' MA'] = pd.rolling_mean(GOOG['Close'], window)
ema_center = (window-1)/2
GOOG[str(window)+' EMA'] = pd.ewma(GOOG['Close'], com=ema_center)
GOOG[['Close', str(window)+' MA', str(window)+' EMA']].iplot(theme='white')


# ### MACD
# Moving average convergence divergence (MACD) is a trend-following momentum indicator that shows the relationship between two moving averages of prices. The MACD is calculated by subtracting the 26-day exponential moving average (EMA) from the 12-day EMA. A nine-day EMA of the MACD, called the "signal line", is then plotted on top of the MACD, functioning as a trigger for buy and sell signals.
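# As an aside, the nine-day signal line described above is not computed in the cell that follows, so here is a minimal sketch of the full MACD plus signal-line calculation. It uses the modern `Series.ewm(span=...)` API rather than the older `pd.ewma` used in this notebook, so treat it as an illustrative variant rather than part of the original analysis.

# In[ ]:

# Sketch: MACD and its 9-day signal line via exponentially weighted means.
ema_12 = GOOG['Close'].ewm(span=12, adjust=False).mean()
ema_26 = GOOG['Close'].ewm(span=26, adjust=False).mean()
macd_line = ema_12 - ema_26
signal_line = macd_line.ewm(span=9, adjust=False).mean()

# A naive trading signal: MACD crossing above the signal line suggests "buy".
buy_signal = (macd_line > signal_line) & (macd_line.shift(1) <= signal_line.shift(1))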
# In[10]:

GOOG['12 EMA'] = pd.ewma(GOOG['Close'], com=(12-1)/2)
GOOG['MACD'] = GOOG['12 EMA'] - GOOG['26 EMA']

#GOOG[['Close','MACD','26 MA']].iplot(theme='white')
GOOG[['Close','26 MA','MACD']].iplot(subplots=True, shape=(3,1), shared_xaxes=True,
                                     title='Comparison between the predictive power of the MA and the MACD', size=10)


# ### Daily return analysis
#
# Now it's time to study the risk of the stock which, similar to the [ROI](http://www.investopedia.com/terms/r/returnoninvestment.asp), is used to evaluate the efficiency of an investment or to compare the efficiency of a number of different investments. To calculate ROI, the benefit of an investment is divided by the cost of the investment, and the result is expressed as a percentage.

# In[11]:

GOOG['Daily return'] = GOOG['Close'].pct_change()
GOOG['Daily return'].iplot(theme='white', yTitle='daily benefit')


# What happened on the 17th of July?
# > [Google gains billions in value as YouTube drives ad growth](http://www.reuters.com/article/us-google-research-idUSKCN0PR1FG20150717) : Google Inc's (GOOGL.O) shares closed up 16.3 percent at \$699.62 on Friday, adding about \$65 billion to its market value, as strong growth in YouTube viewership eased investor concerns about Facebook Inc's (FB.O) push into video.
#
# > [It’s official: Google books biggest day in history, adding \$66.9B](http://www.marketwatch.com/story/google-having-best-day-in-history-of-wall-street-investors-gain-65-billion-2015-07-17) : Google Inc.’s stock closed at a record \$699.62 on Friday, delivering \$66.9 billion to investors in one day — a record for Wall Street. Shares of Google GOOGL, -3.60% GOOG, -3.45% skyrocketed 16.3% on Friday, the company’s biggest one-day percentage gain since April 2008. The increase raised Google’s market capitalization by \$66.9 billion to $478 billion, according to FactSet.
#
# Great, now let's get an overall look at the average daily return using a histogram, which shows the percentage of days on which a given daily return occurred.

# In[12]:

GOOG['Daily return'].iplot(theme='white', kind='hist', histfunc='count', histnorm='percent')


# Now let's compare the closing prices of the different stocks.

# In[13]:

closing_df = DataFrame(data=[GOOG['Close'], AAPL['Close'], MSFT['Close'], AMZN['Close']])
closing_df = closing_df.transpose()
closing_df.columns = ['Google','Apple','Microsoft','Amazon']

# making the avg daily return dataframe
daily_returns_df = closing_df.pct_change()
daily_returns_df.head()


# ### Risk analysis
#
# > Risk analysis refers to the uncertainty of forecasted future cash flow streams, variance of portfolio/stock returns, statistical analysis to determine the probability of a project's success or failure, and possible future economic states.
#
# One way to quantify risk with the information we've gathered on daily percentage returns is to compare the expected return with the standard deviation of the daily returns.

# In[14]:

risk_df = DataFrame([daily_returns_df.dropna().mean(), daily_returns_df.dropna().std()])
risk_df = risk_df.transpose()
risk_df.columns = ['mean','std']
#risk_df.index = ['mean','std']
risk_df


# In[15]:

daily_returns_df.iplot(theme='white', kind='box', yTitle='Risk')


# Now, referring to the Google daily return histogram above, let's get the amount of risk for the stock:

# In[16]:

daily_returns_df['Google'].quantile(0.05)


# The 0.05 empirical quantile of daily returns is at about -0.023. That means that with 95% confidence our worst daily loss will not exceed 2.3%.
#
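# As a quick extension of the same idea (a sketch added for illustration), the empirical 5% quantile can be computed for all four stocks at once, giving a value-at-risk style comparison.

# In[ ]:

# Empirical 5% quantile (daily value at risk) for each stock.
var_05 = daily_returns_df.dropna().quantile(0.05)
print(var_05)   # e.g. a value of -0.023 for Google means the worst daily loss
                # should not exceed 2.3% with 95% confidence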
# ### Correlation #
# Now let's check whether there's any linear correlation between these stocks. We can do this through a Seaborn pairplot, which lets us see it in terms of the Pearson product-moment correlation coefficient.
#
# Recall that the expectation is not multiplicative in general, E[XY] ≠ E[X]E[Y], unless the variables are uncorrelated; the discrepancy is measured by the covariance, which, once normalized, is precisely the Pearson coefficient.
#
# We'll discover two interestingly high correlations.

# In[23]:

sns.pairplot(daily_returns_df.dropna(), size=2.6)


# We haven't found any paper or piece of news justifying the high correlation between the Google and Amazon stocks and between the Apple and Microsoft stocks.
# Just coincidence?
# Or maybe Apple and Google could be seen as some of the major hub nodes of the highly connected ([scale-free](https://en.wikipedia.org/wiki/Scale-free_network))[1] stock market network, and so they're probably highly correlated with other stocks?
#
# 1. [Statistical analysis of financial networks](http://ise.tamu.edu/people/faculty/butenko/papers/csda_market.pdf)

# In[22]:

sns.jointplot('Google','Amazon', daily_returns_df, kind='scatter', color='seagreen')


# In[18]:

sns.jointplot('Apple','Microsoft', daily_returns_df, kind='scatter', color='seagreen')


# ### Correlation heatmap

# In[19]:

daily_returns_df = daily_returns_df.dropna()

# rowvar=1 -> each row (i.e. each trading day) is treated as a variable
daily_corr = np.corrcoef(daily_returns_df[['Apple','Microsoft']].values, rowvar=1)
corr_min = daily_corr.min()
corr_max = daily_corr.max()


# Next, in a heatmap trace object, we overwrite the default minimum and maximum color scale values (with 'zmin' and 'zmax' respectively) to round up the color bar's range.

# In[21]:

from plotly.graph_objs import *

corr_trace = Heatmap(
    z=daily_corr,                   # correlation as color contours
    x=daily_returns_df.index,       # dates on both
    y=daily_returns_df.index,       # axes
    zauto=False,                    # (!) overwrite Plotly's default color levels
    zmin=corr_min,                  # (!) set value of min color level
    zmax=corr_max,                  # (!) set value of max color level
    colorscale='YIOrRd',            # light yellow-orange-red colormap
    reversescale=True               # inverse colormap order
)

heatmap_title = "Apple-Microsoft yearly correlation heatmap from "+GOOG.index[0].strftime("%d/%m/%y")

# Make layout object
layout = Layout(
    title=heatmap_title,            # set plot title
    autosize=False,                 # turn off autosize
    height=500,                     # plot's height in pixels
    width=800,                      # plot's width in pixels
    margin=Margin(l=130),           # adjust left margin
)

corr_data = Data([corr_trace])
fig = Figure(data=corr_data, layout=layout)

py.iplot(fig, filename='apple-microsoft-correlation-heatmap')
# remember you can zoom on the psychedelic figure!


# ---
# # Supervised Learning
#
# ## Google Trends in Simple Linear Regression
#
# > Stock price forecasting is a popular and important topic in financial and academic studies. Time series analysis is the most common and fundamental method used to perform this task. This paper aims to combine the conventional time series analysis technique with information from the Google trend website and the Yahoo finance website to predict weekly changes in stock price. Important news/events related to a selected stock over a five year span are recorded and the weekly Google trend index values on this stock are used to provide a measure of the magnitude of these events. The result of this experiment shows significant correlation between the changes in weekly stock prices and the values of important news/events computed from the Google trend website. The algorithm proposed in this paper can potentially outperform the conventional time series analysis in stock price forecasting.
# [Stock Price Forecasting Using Information from Yahoo Finance and Google Trend - Selene Yue Xu]
#
# As stated in the quotation above, the relationship between weekly stock price changes and Google Trends news volume is shown to be statistically significant, i.e. not an effect of random causes. So let's study whether this correlation holds in our case.
#
# *google_trends* contains the number of times the word "google" was searched in Google Financial News per week.
#
# Before fitting the linear regression some criteria should be respected; in particular, the linear correlation between the two datasets can't be null.
# So we want to plot the percentage change of the Google Trends index against the percentage change of the Google stock prices.
# What we expect is very little correlation, since no studies found any significant correlation for the word _google_ itself.

# In[530]:

end = datetime.now()
start = datetime(end.year - 6, end.month, end.day)

GOOG = DataReader('GOOG','google', start, end)
GOOG['Daily return'] = GOOG['Close'].pct_change()

print("Number of values : "+str(GOOG.size))
GOOG.head()


# In[488]:

google_trends = pd.read_csv("files/trendGoogle_financeNews_2010-2016.csv")

# 319 values, but we need to divide them into 5-day weeks, so let's keep 315
google_trends = google_trends[5:-3]     # to have equal starts
google_trends.columns = ['Date','Value']
google_trends.index = pd.Series(range(google_trends.index.size))    # google_trends.index.size = 310
google_trends[['Value']] = google_trends[['Value']].astype(int)
google_trends.head()


# Grouping the values into 5-day weeks and averaging:

# In[465]:

num_of_groups = len(GOOG['Daily return'])/5
drw_mean = np.array([0.0]*int(num_of_groups))

i = 0
for v in np.split(GOOG['Daily return'].values, num_of_groups):
    drw_mean[i] = v.mean()
    i += 1

print(" Num. of weeks available: "+str(num_of_groups))


# In[468]:

print(drw_mean[:10])


# Now we can create the dataframe containing the two percentage series to plot on a correlation figure:

# In[40]:

# let's generate the new dataframe
#trend_price_df = DataFrame( [google_trends['Value'][:300].pct_change(), drw_mean] )
#trend_price_df = trend_price_df.transpose()
#trend_price_df.columns = ['trend pct','stock pct']

trend_price_df = pd.read_csv("files/trend_price.csv", index_col=0)  # to avoid getting different data in future executions
#trend_price_df.to_csv("files/trend_price.csv")

trend_price_df = trend_price_df.dropna()   # who needs null values ?
trend_price_df.head()


# In[41]:

# ..
sns.jointplot('trend pct','stock pct', trend_price_df, kind='scatter', color='seagreen', size=7)


# As we expected, the PPMCC is low (0.13), but it could still be interesting, although not significant, to attempt a prediction through a linear regression.
#
# ### Linear Regression
# To perform supervised learning we have to choose how to represent our hypothesis function. In this case we approximate y, the vector of output values, as a linear function of x (called the input or feature):
# #
$ y_i = \hat y_i + \epsilon_i, \qquad \hat y_i = h_\theta (x_i) = \theta_0 + \theta_1 x_i $
# where the θ's are the **parameters or weights** and ε is the error, called the **residual**. Since this is a univariate linear regression, each example has just one feature x_i.
#
# To run the prediction we want to minimize the distance between the hypothesis h(x) and y. We can do this through a cost function named the ***least-squares cost function*** J(θ):
$ J(\theta) = \frac{1}{2} \sum_{i=1}^{m}( h(x_i) - y_i)^2 $
# To minimize this cost function an algorithm called ***gradient descent*** is used: at each iteration the partial derivative of J with respect to each parameter is computed, and a step is taken in the direction of steepest decrease of J.
#
# Numpy has a built-in least-squares method in its linear algebra library. We'll use this first for our univariate regression and then move on to scikit-learn for our multivariate regression.
#
# So we'll look at our line as y = mx + b, but we need to put it in matrix form in order to use [numpy.linalg.lstsq](http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.linalg.lstsq.html): y = Ap, where A = [x 1] and p = [m b].

# In[42]:

import sklearn
from sklearn.linear_model import LinearRegression

#trend_price_df = trend_price_df.dropna()
X = np.array([[value, 1] for value in trend_price_df['trend pct']])
Y = trend_price_df['stock pct']

m, b = np.linalg.lstsq(X, Y)[0]


# Now that we have found our linear coefficients, we can plot the line:

# In[516]:

x = trend_price_df['trend pct']
plt.plot(x, Y, 'o')
plt.plot(x, m*x + b, 'r', label='Best Fit Line')


# *RMSE*: now it's time to quantify the error of the least-squares fit. For each element we take the difference between the fitted line and the true value, square it, average these squared residuals and take the square root:
#
$ RMSE = \sqrt{\frac{\sum_{i=1}^n (\hat y_i - y_i)^2}{n}} $
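# As an illustrative aside, the gradient descent procedure mentioned above iterates the update $ \theta_j := \theta_j - \alpha \frac{\partial J(\theta)}{\partial \theta_j} $ until convergence. A toy sketch for the univariate model follows; it is not what `numpy.linalg.lstsq` does internally (lstsq solves the least-squares problem directly), but it illustrates the idea.

# In[ ]:

# Toy batch gradient descent for h(x) = theta0 + theta1 * x.
# The 1/m factor only rescales the learning rate relative to J(theta) above.
def gradient_descent(x, y, lr=0.01, n_iter=5000):
    theta0, theta1 = 0.0, 0.0
    m = len(x)
    for _ in range(n_iter):
        err = theta0 + theta1 * x - y            # h(x_i) - y_i for every example
        theta0 -= lr * err.sum() / m             # step along -dJ/dtheta0
        theta1 -= lr * (err * x).sum() / m       # step along -dJ/dtheta1
    return theta0, theta1

# e.g. b_gd, m_gd = gradient_descent(trend_price_df['trend pct'].values,
#                                    trend_price_df['stock pct'].values)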
# To train the model we used 70% of the data and kept the remaining 30% for testing.

# In[67]:

from sklearn import linear_model

# Get the resulting array
result = np.linalg.lstsq(X, Y)

# Get the total error and the residuals vector
lin_sq_residual_sum = result[1]

train_size = int(np.ceil(0.7*len(X)))
X_train = X[:train_size]
X_test = X[train_size:]     # the held-out 30%
Y_train = Y[:train_size]
Y_test = Y[train_size:]

lin_reg = linear_model.LinearRegression().fit(X_train, Y_train)
uni_lin_residuals = lin_reg.predict(X_test) - Y_test

# Get the root mean square error
lin_rmse = np.sqrt(lin_sq_residual_sum/len(X))[0]

# Print
print("The root mean squared error was:\t {:1.5f}\
\n\t std( residuals ):\t\t {:1.5f}".format(lin_rmse, np.std(uni_lin_residuals)))


# In[114]:

np.savetxt("files/uni_lin_residuals.csv", uni_lin_residuals, delimiter=',', fmt='%1.6f')


# Since the root mean square error (RMSE) corresponds approximately to the standard deviation of the residuals, we can now say that the prediction error won't exceed 2 times the RMSE about 95% of the time ([68–95–99.7 rule](https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule)).

# ---
#
# ### Multivariate linear regression
# Let's now use more than one feature to fit our linear regression model, i.e. we'll fit it with the Apple, Microsoft, Amazon and Twitter stock information.

# In[29]:

tech_list = ['AAPL','MSFT','AMZN','TWTR','GOOG']    # nasdaq ?

end = datetime.now()
start = datetime(end.year - 4, end.month, end.day)

stocks = []
for t in tech_list:
    stocks.append(DataReader(t, 'google', start, end))

# TWTR!  start 2013-11-07


# In[567]:

stocks[4].size


# Let's fit a linear model with our features:

# In[31]:

import sklearn
from sklearn.linear_model import LinearRegression

X_stocks = DataFrame([stocks[0]['Close'].pct_change(),
                      stocks[1]['Close'].pct_change(),
                      stocks[2]['Close'].pct_change(),
                      stocks[3]['Close'].pct_change()])
X_stocks = X_stocks.transpose()
X_stocks.columns = tech_list[:-1]

Y_target = stocks[4]['Close'].ix['2013-11-07':].pct_change()

X_stocks = X_stocks.dropna()[-500:]
Y_target = Y_target.dropna()[-500:]

lreg = LinearRegression()
lreg.fit(X_stocks, Y_target)    # fits a linear model

print(' The estimated intercept coefficient is %.5f ' % lreg.intercept_)
print(' The number of coefficients used was %d ' % len(lreg.coef_))


# Let's build a common dataframe:

# In[88]:

X_stocks = pd.read_csv("files/X_stocks.csv", index_col=[0])
Y_target = pd.read_csv("files/Y_target.csv", index_col=[0], names=['Date','Target Value'])
X_stocks.head()


# And let's see the linear coefficient associated with each feature:

# In[89]:

coef_df = DataFrame(X_stocks.columns)
coef_df.columns = ['Feature']
coef_df["Coefficient Estimate"] = pd.Series(lreg.coef_)
coef_df


# Here we can see that the largest coefficient is associated with the Microsoft stock.
#
# Now let's split our data in order to perform cross validation, then we check our accuracy again through the RMSE.
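# As a side note, in recent scikit-learn releases `sklearn.cross_validation` has been replaced by `sklearn.model_selection`; a minimal sketch of the same split-and-score step with the newer API is shown here only as a reference.

# In[ ]:

# Sketch with the modern scikit-learn API (assumes X_stocks and Y_target as above).
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

Xtr, Xte, ytr, yte = train_test_split(X_stocks, Y_target, test_size=0.25, random_state=42)
model = LinearRegression().fit(Xtr, ytr)
rmse = np.sqrt(mean_squared_error(yte, model.predict(Xte)))
print("held-out RMSE: {:.5f}".format(rmse))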
# In[90]:

X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(X_stocks, Y_target)

# Print shapes of the training and testing data sets
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)


# In[99]:

lreg.fit(X_train, Y_train)

pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)

print("MSE on training set:\t %.6f" % np.mean((Y_train - pred_train) ** 2))
print("MSE on testing set:\t %.6f" % np.mean((Y_test - pred_test) ** 2))


# In[110]:

mlin_residuals = (Y_test - pred_test).values
mlin_RMSE = np.sqrt(np.mean(mlin_residuals ** 2))

print(" Multivariate linear regression \n\tRMSE : {:1.5f}\
\n\t std : {:1.5f}".format(mlin_RMSE, np.std(mlin_residuals)))


# In[115]:

np.savetxt("files/mlin_residuals.csv", mlin_residuals, delimiter=',', fmt='%1.6f')

#
# The difference between the observed value of the dependent variable (y) and the predicted value (h(x)) is called the **residual** (e). Each data point has one residual, so that *residual = ObservedValue − PredictedValue*.
# You can think of these residuals in the same way as the ε term we discussed earlier; in this case, however, multiple features are considered, so that we have the more general hypothesis
#
$ h_\theta (x) = \sum_{i=0}^{n}{\theta_i x_i} $
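# In vector notation, with the usual convention $x_0 = 1$ so that the intercept is absorbed into θ, this is simply $ h_\theta(x) = \theta^{T}x $, and the least-squares estimate has the closed form $ \hat\theta = (X^{T}X)^{-1}X^{T}y $ (the normal equations; `numpy.linalg.lstsq` solves the same problem numerically via an SVD rather than by forming $X^{T}X$).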
# A **residual plot** is a graph that shows the residuals on the vertical axis and the independent variable on the horizontal axis. If the points in a residual plot are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate.

# In[595]:

# Scatter plot the training data
train = plt.scatter(pred_train, (pred_train - Y_train), c='b', alpha=0.5)

# Scatter plot the testing data
test = plt.scatter(pred_test, (pred_test - Y_test), c='r', alpha=0.5)

# Plot a horizontal axis line at 0
plt.hlines(y=0, xmin=-0.015, xmax=0.015)

# Labels
plt.legend((train, test), ('Training','Test'), loc='lower left')
plt.title('Residual Plots')


# ## SVR
#
# In machine learning, support vector machines (SVMs) are supervised learning models with associated learning algorithms that analyze data and recognize patterns, used for classification and regression analysis. Given a set of training examples, each marked as belonging to one of two categories, an SVM training algorithm builds a model that assigns new examples into one category or the other, making it a non-probabilistic binary linear classifier. An SVM model is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall on.
#
# The advantages of support vector machines are:
#
# * Effective in high dimensional spaces.
# * Still effective in cases where the number of dimensions is greater than the number of samples.
# * Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
# * Versatile: different kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.
#
# The disadvantages of support vector machines include:
# * If the number of features is much greater than the number of samples, the method is likely to give poor performance.
# * SVMs do not directly provide probability estimates; these are calculated using an expensive five-fold cross-validation.
#
# This technique was developed in three major steps.
# 1. First, assuming that the two classes of training examples can be separated by a hyperplane, the optimal hyperplane is the one that separates the training examples with the widest margin [[1]](http://pubs.sciepub.com/jgg/2/3/9/index.html).
# 2. Second, kernel functions were incorporated into the maximum-margin models. Kernels allow the SVM to implicitly construct the optimal hyperplane in the feature space, and the resulting nonlinear model is important for modeling real data.
# 3. Third, in case the training examples are not linearly separable, Cortes and Vapnik [[2]](http://image.diku.dk/imagecanon/material/cortes_vapnik95.pdf) showed that a soft margin can be applied, allowing some examples to violate the margin condition.
#
# So, given training vectors $x_i \in \mathbb{R}^p$, $i = 1, \dots, n$, and a vector $y \in \mathbb{R}^n$, ε-SVR solves the following primal problem:
#
# $ \min_{w, b, \zeta, \zeta^*} \ \frac{1}{2} w^{T}w + C \sum_{i=1}^{n}(\zeta_i + \zeta_i^*) $
#
# subject to $ y_i - w^{T}\phi(x_i) - b \le \varepsilon + \zeta_i $, $ \ w^{T}\phi(x_i) + b - y_i \le \varepsilon + \zeta_i^* $, $ \ \zeta_i, \zeta_i^* \ge 0 $.
#
# As the constraints in the primal form are not convenient to handle, one conventionally resorts to the dual problem, which is again a quadratic program, but with much simpler constraints: box constraints plus a single linear equality constraint.
# The training examples can be divided into three categories according to the value of the dual variable $\alpha_i^*$:
#
# * If $\alpha_i^* = 0$, the corresponding training example does not affect the decision boundary; in fact it lies beyond the margin, that is, $y_i(\langle w, x_i \rangle + b) > 1$.
# * If $\alpha_i^*$ lies strictly between 0 and its upper bound, the training example lies on the margin.
# * If $\alpha_i^*$ equals its upper bound, the training example violates the margin.
#
# In the latter two cases, where $\alpha_i^* > 0$, the i-th training example is called a support vector.
#
# In the dual problem, *e* is the vector of all ones, *C* > 0 is the upper bound, and *Q* is an n-by-n positive semidefinite matrix with $Q_{ij} = K(x_i, x_j) = \phi(x_i)^{T}\phi(x_j)$, where $K$ is the kernel.
#
# #### REF:
#
# * [SVM on Stanford notes](http://cs229.stanford.edu/notes/cs229-notes3.pdf)
# * [Tutorial on support vector regression](http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=0D73512B5DE850E20FF80D1707D4D610?doi=10.1.1.114.4288&rep=rep1&type=pdf)
# * [Scikit SVR tips](http://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use)
# * [SVR on Wikipedia](https://en.wikipedia.org/wiki/Support_vector_machine#Regression)

# Let's now define our training and testing sets.
#
# Here you can also see the Python settings for the model, but we'll do the job in R, which has a very nice ad-hoc package ([e1071](https://cran.r-project.org/web/packages/e1071/index.html)).

# In[116]:

from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split

'''
C        Penalty parameter of the error term.
epsilon  Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which
         no penalty is associated in the training loss function with points predicted
         within a distance epsilon from the actual value.
kernel   Specifies the kernel type to be used in the algorithm. RBF = radial basis function.
gamma    Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. If gamma is 'auto' then 1/n_features will be used.
'''

svr_model = SVR(kernel='rbf', gamma='auto', C=1.0, epsilon=0.1, cache_size=500)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_stocks, Y_target, train_size=0.7, random_state=42)

print("Subset shapes: X: {}, {}, y: {}, {}".format(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))

# Fit the SVM model according to the given training data.
xxTrain = np.array(X_train.as_matrix())
yyTrain = np.array(Y_train.as_matrix())


# In[ ]:


# ### SVR in R
#
# The model used by the svm function is an epsilon-SVR. Here, the data points lie in between the two borders of the margin, which is maximized under suitable conditions to avoid outlier inclusion. I.e. we have an ε-insensitive loss function defined as $ |y - f(x)|_{\epsilon} := \max \left\{ 0, |y - f(x)| - \epsilon \right\} $, which gives our minimization problem (see above):
$ \min_{w} \left\{ \frac{1}{2}\|w\|^2 + \frac{C}{m}\sum_{i=1}^{m}{ |y_i - f(x_i)|_{\epsilon} } \right\} $
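# For completeness, here is a minimal sketch of how the scikit-learn `svr_model` defined above could be fitted and scored on the same split; this is only an illustrative cross-check, the analysis that follows is done in R with e1071.

# In[ ]:

# Sketch (not used for the reported results): fit the scikit-learn SVR defined
# above and compute the RMSE on the held-out 30% split.
svr_model.fit(xxTrain, yyTrain.ravel())
sk_svr_pred = svr_model.predict(np.array(X_test.as_matrix()))
sk_svr_rmse = np.sqrt(np.mean((np.array(Y_test.as_matrix()).ravel() - sk_svr_pred) ** 2))
print("scikit-learn SVR RMSE: {:.5f}".format(sk_svr_rmse))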
# #### Ref
# * [epsilon SVR](http://gandalf.psych.umn.edu/users/schrater/schrater_lab/courses/PattRecog09/RegressionII.pdf)
# * [CRAN svm demonstration](https://cran.r-project.org/web/packages/e1071/vignettes/svmdoc.pdf)
# * [R in jupyter](http://www.swegler.com/becky/blog/2014/08/03/ipython-ipython-notebook-anaconda-and-r-rpy2/)
# * [rpy2](http://rpy.sourceforge.net/rpy2/doc-2.4/html/interactive.html#module-rpy2.ipython.rmagic)

# In[5]:

# http://rpy.sourceforge.net/rpy2/doc-2.4/html/interactive.html#module-rpy2.ipython.rmagic
get_ipython().run_line_magic('load_ext', 'rpy2.ipython')
get_ipython().run_line_magic('load_ext', 'rmagic')


# In[6]:

from rpy2.robjects.packages import importr
utils = importr('utils')
utils.install_packages('e1071')


# In[117]:

get_ipython().run_cell_magic('R', '-i xxTrain,yyTrain,Y_target -o svrPredictionRMSE,SVR_error', 'library(e1071)\n\nrmse <- function(error)\n{\n sqrt(mean(error^2))\n}\n\nmodel <- svm( yyTrain ~ xxTrain )\npredictedY <- predict(model)\n\nSVR_error <- Y_target - predictedY\nsvrPredictionRMSE <- rmse( SVR_error)\n')


# In[118]:

print(" RMSE for the multivariate SVR : {:4.4f}".format(svrPredictionRMSE[0]))


# Could it be improved?
#
# In the previous cell we performed an epsilon-regression without setting any value for epsilon ( ϵ ), so it took the default value of 0.1. There is also a cost parameter which we can change to avoid overfitting (by default it is 1) [[svm doc](http://ugrad.stat.ubc.ca/R/library/e1071/html/svm.html)].

# In[36]:

get_ipython().run_cell_magic('R', '-i xxTrain,yyTrain', '# perform a grid search\ntuneResult <- tune(svm, yyTrain ~ xxTrain, ranges = list(epsilon = seq(0,1,0.05), cost = 2^(0:9)))\n\n# Draw the tuning graph\nplot(tuneResult)\n')


# Since the lower the error the better our model is, let's now zoom in on the darker region of the tuning heatmap.

# In[120]:

get_ipython().run_cell_magic('R', '-i xxTrain,yyTrain -o tuneResult', '# perform a grid search\ntuneResult <- tune(svm, yyTrain ~ xxTrain, ranges = list(epsilon = seq(0,1,0.05), cost = (1:10)))\n\n# Draw the tuning graph\nplot(tuneResult)\n')


# In[39]:

print(tuneResult)


# In[121]:

get_ipython().run_cell_magic('R', '-i xxTrain,yyTrain,Y_target -o tunedModelRMSE,SVR_error', '\ntunedModel <- tuneResult$best.model\ntunedModelY <- predict(tunedModel, xxTrain) \n \nSVR_error <- Y_target - tunedModelY \n \n# the tune method randomly shuffles the data\ntunedModelRMSE <- rmse(SVR_error) \n')


# In[65]:

print(" RMSE \n\tfor the tuned SVR \t= {:4.4f} \
\n\tfor the standard SVR \t= {:4.4f} ".format(tunedModelRMSE[0], svrPredictionRMSE[0]))

improvement = (1 - tunedModelRMSE[0]/svrPredictionRMSE[0]) * 100
print(" We have improved our model by {:.2f}%".format(improvement))


# Well, it seems the default values (epsilon=0.1 and cost=1) were already good for our model. It is still good practice to verify this and to try for an improvement, which we did obtain, even if only a small one.

# In[122]:

np.savetxt("files/SVR_error.csv", SVR_error, delimiter=',', fmt='%1.6f')


# ---
# ## Random Forest
#
# Random Forest is an **ensemble learning** technique. It is a hybrid of the bagging algorithm and the random subspace method, and uses decision trees as the base learner.
#
# Each tree is constructed from a bootstrap sample of the original dataset. To diversify the classifiers, at each branch in the tree the decision of which feature to split on is restricted to a random subset of size *n* from the full feature set. The random subset is chosen anew for each branching point.
# *n* is suggested to be *log(N + 1)*, where *N* is the size of the whole feature set.
#
# **Decision trees** are built in a top-down fashion, with a recursive partitioning that selects the root of the tree (and of its subtrees), splits the set of examples into disjoint sets and adds the corresponding nodes and branches to the tree. Typical attribute selection criteria use a function to measure the **impurity** of a node, that is, the degree to which the node contains only examples of a single class.
# Each tree is constructed using a different bootstrap sample from the original data. About one-third of the cases are left out of the bootstrap sample and not used in the construction of the kth tree [1].
#
# With CART decision trees the impurity measure used for classification is the Gini index, defined below, where S_i is the subset of the training examples S belonging to class c_i. For regression, the **MSE** is used instead.
#
$ Gini(S) = 1 - \sum_{i=1}^{c} {\left( \frac{|S_i|}{|S|} \right) }^2 $
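# As a quick worked example of the formula above (an illustrative addition): a node holding 8 examples of class A and 2 of class B has $Gini = 1 - \left( (8/10)^2 + (2/10)^2 \right) = 0.32$, while a pure node scores 0.

# In[ ]:

# Tiny helper mirroring the Gini formula above.
def gini(counts):
    total = float(sum(counts))
    return 1 - sum((c / total) ** 2 for c in counts)

print(gini([8, 2]), gini([5, 5]), gini([10, 0]))   # 0.32, 0.5, 0.0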
# After some trials I found I could significantly reduce the number of grown trees, from 500 (the default value [4]) to 60, without losing much accuracy: the RMSE worsens by about 8%, roughly $2.7\times10^{-5}$ in absolute value [3]. For the final comparison I nevertheless used the 300-tree random forest in order to obtain the minimum RMSE.
#
# #### Ref:
# 1. [Random forests - Leo Breiman and Adele Cutler](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm)
# 2. [Empirical comparison of supervised learning algorithms](https://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml06.pdf)
# 3. [Limiting the number of trees in random forests](http://lisa.ulb.ac.be/image/publifiles/100/Mcs2001_final.pdf)
# 4. [Random Forest - R doc](https://cran.r-project.org/web/packages/randomForest/randomForest.pdf)

# In[7]:

get_ipython().run_line_magic('R', "install.packages('randomForest')")


# Let's see a chart of variable importance as measured by the Random Forest.

# In[126]:

get_ipython().run_cell_magic('R', '-o rf_300,rf_mse,rf_residuals', 'library(randomForest)\nX_stocks= read.csv("files/X_stocks.csv", header=TRUE)\nY_target = read.csv("files/Y_target.csv", header=FALSE)\n\n# seeding the random sampling generator to have reproducible results\nset.seed(42)\n\nrf_300 = randomForest( matrix(Y_target[,2], nrow=500, byrow=TRUE), x=X_stocks[,c(2,3,4,5)], ntree=300, mtry=1, replace=TRUE)\nrf_mse = rf_300$mse\n\nrf_residuals = Y_target - predict(rf_300)\n\nvarImpPlot(rf_300)\n')


# In[27]:

rf_RMSE = np.sqrt(np.mean(rf_mse))
print(rf_RMSE)


# It is also interesting to note how the importance of each variable in the decision trees is proportional to its correlation with the target variable.

# In[138]:

rf_residuals = rf_residuals['V2']
np.savetxt("files/rf_residuals.csv", rf_residuals, delimiter=',', fmt='%1.6f')


# ---
# ## Conclusions
#
# Surprisingly, the best result (RMSE of about 0.007) is reached by the simple linear regression on the Google Trends data. As said, this was focused on finance-related searches over a 300-week period (70% of which was used to train the algorithm). Trying to predict percentage price changes per week instead of per day could have improved the accuracy. But since there was no linear correlation to begin with, this result should be disregarded.
#
# The second-lowest RMSE is obtained by the SVR, with a value of 0.01715 and a residual distribution centered on zero.
#
# The random forest follows closely, with almost the same RMSE (0.0174) and a residual distribution that is just a little more squeezed.
#
# The multivariate linear regression still performs similarly, returning an RMSE of about 0.02, but with a left-skewed residual distribution.

# In[144]:

RMSE_list = [lin_rmse, mlin_RMSE, tunedModelRMSE[0], rf_RMSE]

print("RMSE summary\n\t univ. linear regression on google trend:\t {:1.3f} \
\n\t multivariate linear regression:\t\t {:1.5f} \
\n\t SVR : \t\t\t\t\t\t {:1.5f} \
\n\t Random forest (300 trees):\t\t\t {:1.5f}".format(*RMSE_list))


# In[45]:

residuals_names = ['uni_lin_residuals', 'mlin_residuals', 'SVR_error', 'rf_residuals']
residuals = []

for r in residuals_names:
    path = "files/"+r+".csv"
    r_csv = pd.read_csv(path, header=None)
    residuals.append(r_csv)

fig, ax = plt.subplots(figsize=(15, 10))
ax.boxplot(residuals, patch_artist=True, notch=False, labels=residuals_names)
ax.set_title("RESIDUALS BOXPLOT COMPARISON")
ax.set_ylim(-0.06, 0.05)
plt.show()


# In[ ]:


# In[ ]: