#!/usr/bin/env python
# coding: utf-8

# # Stock Market Clustering with a KMeans algorithm
# > In this project, we apply a KMeans clustering algorithm to group companies with similar stock market movements. The original data comes from Yahoo Finance.
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Machine_Learning]
# - image: images/kmeans_stock_cluster.png

# ## Required Packages

# In[4]:

import sys
import datetime

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas_datareader import data
import sklearn

plt.rcParams['figure.figsize'] = (8, 8)

# ## Version check

# In[5]:

print('Python: {}'.format(sys.version))
print('Numpy: {}'.format(np.__version__))
print('Matplotlib: {}'.format(mpl.__version__))
print('Seaborn: {}'.format(sns.__version__))
print('Pandas: {}'.format(pd.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))

# ## Prepare dataset

# In[35]:

# Define the instruments to download
companies_dict = {
    'Amazon': 'AMZN',
    'Apple': 'AAPL',
    'Walgreen': 'WBA',
    'Northrop Grumman': 'NOC',
    'Boeing': 'BA',
    'Lockheed Martin': 'LMT',
    'McDonalds': 'MCD',
    'Intel': 'INTC',
    'Navistar': 'NAV',
    'IBM': 'IBM',
    'Texas Instruments': 'TXN',
    'MasterCard': 'MA',
    'Microsoft': 'MSFT',
    'General Electric': 'GE',
    'American Express': 'AXP',
    'Pepsi': 'PEP',
    'Coca Cola': 'KO',
    'Johnson & Johnson': 'JNJ',
    'Toyota': 'TM',
    'Honda': 'HMC',
    'Mitsubishi': 'MSBHY',
    'Sony': 'SNE',
    'Exxon': 'XOM',
    'Chevron': 'CVX',
    'Valero Energy': 'VLO',
    'Ford': 'F',
    'Bank of America': 'BAC',
}

# In[36]:

# Sort the (name, ticker) pairs by ticker symbol
companies = sorted(companies_dict.items(), key=lambda x: x[1])
print(companies)

# In[107]:

companies_sorted = [x[1] for x in companies]
companies_sorted

# In[108]:

# Define which online source to use
data_source = 'yahoo'

# Define the start and end dates
start_date = '2015-01-01'
end_date = '2017-12-31'

# Use pandas_datareader.data.DataReader to load the desired stock data
panel_data = data.DataReader(companies_sorted, data_source, start_date, end_date).unstack().unstack().T

# Print axes labels
print(panel_data.axes)

# In[109]:

# Select the open and close prices
stock_close = panel_data['Close']
stock_open = panel_data['Open']

print(stock_close.iloc[0])

# In[110]:

# Calculate daily stock movement (close minus open, per company per day)
stock_close = np.array(stock_close).T
stock_open = np.array(stock_open).T

row, col = stock_close.shape
print(row, col)

movements = np.zeros([row, col])

for i in range(0, row):
    movements[i, :] = np.subtract(stock_close[i, :], stock_open[i, :])

for i in range(0, len(companies)):
    print('Company: {}, Change: {}'.format(companies[i][0], sum(movements[i, :])))

# ## Visualization

# In[111]:

# Plot stock movements for the first two companies
plt.figure(figsize=(18, 16))
ax1 = plt.subplot(221)
plt.plot(movements[0][:]);
plt.title(companies[0][0]);

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(movements[1][:]);
plt.title(companies[1][0]);

# ## Preprocessing dataset

# In[112]:

from sklearn.preprocessing import Normalizer

# Create the normalizer (rescales each company's movement vector to unit norm)
normalizer = Normalizer()
new = normalizer.fit_transform(movements)

print(new.max())
print(new.min())
print(new.mean())

# In[113]:

# Plot normalized stock movements for the first two companies
plt.figure(figsize=(18, 16))
ax1 = plt.subplot(221)
plt.plot(new[0][:]);
plt.title(companies[0][0]);

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(new[1][:]);
plt.title(companies[1][0]);

# ## Build Pipeline

# In[126]:

from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

# Define normalizer
normalizer = Normalizer()

# Create a KMeans model - 10 clusters
kmeans = KMeans(n_clusters=10, max_iter=1000)

# Make a pipeline chaining normalizer and kmeans
pipeline = make_pipeline(normalizer, kmeans)

# In[127]:

# Fit pipeline to daily stock movements
pipeline.fit(movements)

# In[128]:

print(kmeans.inertia_)
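# `kmeans.inertia_` is the within-cluster sum of squared distances, and it always decreases as `n_clusters` grows, so the choice of 10 clusters above is a judgment call. One common way to sanity-check that choice is the elbow method: sweep candidate values of k and look for the point where the inertia curve flattens. A minimal sketch (KMeans is randomly initialized, so exact values vary between runs):

# In[ ]:

# Elbow plot over a range of candidate cluster counts
inertias = []
k_values = range(2, 16)

for k in k_values:
    # Re-fit a fresh normalizer + KMeans pipeline for each candidate k
    candidate = make_pipeline(Normalizer(), KMeans(n_clusters=k, max_iter=1000))
    candidate.fit(movements)
    inertias.append(candidate.named_steps['kmeans'].inertia_)

plt.figure(figsize=(8, 5))
plt.plot(list(k_values), inertias, 'o-');
plt.xlabel('Number of clusters (k)');
plt.ylabel('Inertia');
plt.title('Elbow curve for choosing k');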
# ## Clustering

# In[129]:

# Predict the cluster labels
labels = pipeline.predict(movements)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display df sorted by cluster label
print(df.sort_values('labels'))

# ## Dimensionality Reduction with PCA

# In[136]:

from sklearn.decomposition import PCA

# Visualize the results on PCA-reduced data
reduced_data = PCA(n_components=2).fit_transform(new)

# Run kmeans on the reduced data
# (no random_state is set, so cluster assignments can differ between runs)
kmeans = KMeans(n_clusters=10)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)
print(kmeans.inertia_)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values('labels'))

# ## Visualize clusters

# In[138]:

# Define step size of mesh
h = 0.01

# Plot the decision boundary (+- 1 for padding)
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using our trained model
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

# Define colorplot
cmap = plt.cm.Paired

# Plot figure
plt.figure(figsize=(10, 10))
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap, aspect='auto', origin='lower');
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5);

# Plot the centroids of each cluster as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3, color='w', zorder=10);

plt.title('K-Means clustering on Stock Market Movements (PCA-Reduced Data)');
plt.xlim(x_min, x_max);
plt.ylim(y_min, y_max);
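# The decision regions above show how the PCA plane is carved up, but they say nothing about how well-separated the clusters actually are. One common label-free diagnostic is the silhouette coefficient, which is near 1 for tight, well-separated clusters and near 0 for overlapping ones. A minimal sketch using scikit-learn's `silhouette_score`:

# In[ ]:

from sklearn.metrics import silhouette_score

# Mean silhouette coefficient over all companies in the PCA-reduced space.
# Note: KMeans is randomly initialized, so the score varies between runs.
score = silhouette_score(reduced_data, labels)
print('Silhouette score (k=10, PCA-reduced): {:.3f}'.format(score))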