#!/usr/bin/env python
# coding: utf-8

# # Stock Market Clustering with a KMeans algorithm
# > In this project, we apply a KMeans clustering algorithm to group companies with similar stock market movements. The original data comes from Yahoo Finance.
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Machine_Learning]
# - image: images/kmeans_stock_cluster.png

# ## Required Packages

# In[4]:

import sys
import datetime

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas_datareader import data
import sklearn

plt.rcParams['figure.figsize'] = (8, 8)

# ## Version check

# In[5]:

print('Python: {}'.format(sys.version))
print('Numpy: {}'.format(np.__version__))
print('Matplotlib: {}'.format(mpl.__version__))
print('Seaborn: {}'.format(sns.__version__))
print('Pandas: {}'.format(pd.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))

# ## Prepare dataset

# In[35]:

# Define the instruments to download
companies_dict = {
    'Amazon': 'AMZN',
    'Apple': 'AAPL',
    'Walgreen': 'WBA',
    'Northrop Grumman': 'NOC',
    'Boeing': 'BA',
    'Lockheed Martin': 'LMT',
    'McDonalds': 'MCD',
    'Intel': 'INTC',
    'Navistar': 'NAV',
    'IBM': 'IBM',
    'Texas Instruments': 'TXN',
    'MasterCard': 'MA',
    'Microsoft': 'MSFT',
    'General Electric': 'GE',
    'American Express': 'AXP',
    'Pepsi': 'PEP',
    'Coca Cola': 'KO',
    'Johnson & Johnson': 'JNJ',
    'Toyota': 'TM',
    'Honda': 'HMC',
    'Mitsubishi': 'MSBHY',
    'Sony': 'SNE',
    'Exxon': 'XOM',
    'Chevron': 'CVX',
    'Valero Energy': 'VLO',
    'Ford': 'F',
    'Bank of America': 'BAC',
}

# In[36]:

# Sort the (name, ticker) pairs by ticker symbol
companies = sorted(companies_dict.items(), key=lambda x: x[1])
print(companies)

# In[107]:

companies_sorted = [x[1] for x in companies]
companies_sorted

# In[108]:

# Define which online source to use
data_source = 'yahoo'

# Define the start and end dates
start_date = '2015-01-01'
end_date = '2017-12-31'

# Use pandas_datareader.data.DataReader to load the desired stock data
panel_data = data.DataReader(companies_sorted, data_source, start_date, end_date).unstack().unstack().T

# Print axes labels
print(panel_data.axes)

# In[109]:

# Select the open and close prices
stock_close = panel_data['Close']
stock_open = panel_data['Open']

print(stock_close.iloc[0])

# In[110]:

# Calculate daily stock movement (close minus open, per company per day)
stock_close = np.array(stock_close).T
stock_open = np.array(stock_open).T

row, col = stock_close.shape
print(row, col)

movements = np.zeros([row, col])

for i in range(0, row):
    movements[i, :] = np.subtract(stock_close[i, :], stock_open[i, :])

for i in range(0, len(companies)):
    print('Company: {}, Change: {}'.format(companies[i][0], sum(movements[i, :])))

# ## Visualization

# In[111]:

# Plot stock movements for the first two companies
plt.figure(figsize=(18, 16))
ax1 = plt.subplot(221)
plt.plot(movements[0][:]);
plt.title(companies[0][0]);

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(movements[1][:]);
plt.title(companies[1][0]);

# ## Preprocessing dataset

# In[112]:

from sklearn.preprocessing import Normalizer

# Create the normalizer (rescales each company's movement vector to unit norm)
normalizer = Normalizer()
new = normalizer.fit_transform(movements)

print(new.max())
print(new.min())
print(new.mean())

# In[113]:

# Plot normalized stock movements for the first two companies
plt.figure(figsize=(18, 16))
ax1 = plt.subplot(221)
plt.plot(new[0][:]);
plt.title(companies[0][0]);

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(new[1][:]);
plt.title(companies[1][0]);

# ## Build Pipeline

# In[126]:

from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

# Define normalizer
normalizer = Normalizer()

# Create a KMeans model - 10 clusters
kmeans = KMeans(n_clusters=10, max_iter=1000)

# Make a pipeline chaining normalizer and kmeans
pipeline = make_pipeline(normalizer, kmeans)

# In[127]:

# Fit pipeline to daily stock movements
pipeline.fit(movements)

# In[128]:

print(kmeans.inertia_)
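# `kmeans.inertia_` is the within-cluster sum of squared distances, and it always decreases as `n_clusters` grows, so the choice of 10 clusters above is a judgment call. One common way to sanity-check that choice is the elbow method: sweep candidate values of k and look for the point where the inertia curve flattens. A minimal sketch (KMeans is randomly initialized, so exact values vary between runs):

# In[ ]:

# Elbow plot over a range of candidate cluster counts
inertias = []
k_values = range(2, 16)

for k in k_values:
    # Re-fit a fresh normalizer + KMeans pipeline for each candidate k
    candidate = make_pipeline(Normalizer(), KMeans(n_clusters=k, max_iter=1000))
    candidate.fit(movements)
    inertias.append(candidate.named_steps['kmeans'].inertia_)

plt.figure(figsize=(8, 5))
plt.plot(list(k_values), inertias, 'o-');
plt.xlabel('Number of clusters (k)');
plt.ylabel('Inertia');
plt.title('Elbow curve for choosing k');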
# ## Clustering

# In[129]:

# Predict the cluster labels
labels = pipeline.predict(movements)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display df sorted by cluster label
print(df.sort_values('labels'))

# ## Dimensionality Reduction with PCA

# In[136]:

from sklearn.decomposition import PCA

# Visualize the results on PCA-reduced data
reduced_data = PCA(n_components=2).fit_transform(new)

# Run kmeans on the reduced data
# (no random_state is set, so cluster assignments can differ between runs)
kmeans = KMeans(n_clusters=10)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)
print(kmeans.inertia_)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values('labels'))

# ## Visualize clusters

# In[138]:

# Define step size of mesh
h = 0.01

# Plot the decision boundary (+- 1 for padding)
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using our trained model
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

# Define colorplot
cmap = plt.cm.Paired

# Plot figure
plt.figure(figsize=(10, 10))
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap, aspect='auto', origin='lower');
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5);

# Plot the centroids of each cluster as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3, color='w', zorder=10);

plt.title('K-Means clustering on Stock Market Movements (PCA-Reduced Data)');
plt.xlim(x_min, x_max);
plt.ylim(y_min, y_max);
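# The decision regions above show how the PCA plane is carved up, but they say nothing about how well-separated the clusters actually are. One common label-free diagnostic is the silhouette coefficient, which is near 1 for tight, well-separated clusters and near 0 for overlapping ones. A minimal sketch using scikit-learn's `silhouette_score`:

# In[ ]:

from sklearn.metrics import silhouette_score

# Mean silhouette coefficient over all companies in the PCA-reduced space.
# Note: KMeans is randomly initialized, so the score varies between runs.
score = silhouette_score(reduced_data, labels)
print('Silhouette score (k=10, PCA-reduced): {:.3f}'.format(score))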