# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause
%matplotlib inline
import datetime
import numpy as np
import matplotlib.pyplot as plt
# matplotlib.finance was removed in matplotlib 2.2, so the Yahoo downloader
# below no longer exists; pre-pickled data is loaded further down instead.
# try:
#     from matplotlib.finance import quotes_historical_yahoo
# except ImportError:
#     from matplotlib.finance import quotes_historical_yahoo_ochl as quotes_historical_yahoo
from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold
# Retrieve the data from the Internet
# Choose a reasonably calm time period (not too long ago, so that we get
# high-tech firms, and before the 2008 crash)
d1 = datetime.datetime(2003, 1, 1)
d2 = datetime.datetime(2008, 1, 1)
# The Kraft symbol has changed from KFT to MDLZ on Yahoo
symbol_dict = {
    'TOT': 'Total',
    'XOM': 'Exxon',
    'CVX': 'Chevron',
    'COP': 'ConocoPhillips',
    'VLO': 'Valero Energy',
    'MSFT': 'Microsoft',
    'IBM': 'IBM',
    'TWX': 'Time Warner',
    'CMCSA': 'Comcast',
    'CVC': 'Cablevision',
    'YHOO': 'Yahoo',
    'DELL': 'Dell',
    'HPQ': 'HP',
    'AMZN': 'Amazon',
    'TM': 'Toyota',
    'CAJ': 'Canon',
    'MTU': 'Mitsubishi',
    'SNE': 'Sony',
    'F': 'Ford',
    'HMC': 'Honda',
    'NAV': 'Navistar',
    'NOC': 'Northrop Grumman',
    'BA': 'Boeing',
    'KO': 'Coca-Cola',
    'MMM': '3M',
    'MCD': "McDonald's",
    'PEP': 'Pepsi',
    'MDLZ': 'Kraft Foods',
    'K': 'Kellogg',
    'UN': 'Unilever',
    'MAR': 'Marriott',
    'PG': 'Procter & Gamble',
    'CL': 'Colgate-Palmolive',
    'GE': 'General Electric',
    'WFC': 'Wells Fargo',
    'JPM': 'JPMorgan Chase',
    'AIG': 'AIG',
    'AXP': 'American Express',
    'BAC': 'Bank of America',
    'GS': 'Goldman Sachs',
    'AAPL': 'Apple',
    'SAP': 'SAP',
    'CSCO': 'Cisco',
    'TXN': 'Texas Instruments',
    'XRX': 'Xerox',
    'LMT': 'Lockheed Martin',
    'WMT': 'Wal-Mart',
    'WBA': 'Walgreen',
    'HD': 'Home Depot',
    'GSK': 'GlaxoSmithKline',
    'PFE': 'Pfizer',
    'SNY': 'Sanofi-Aventis',
    'NVS': 'Novartis',
    'KMB': 'Kimberly-Clark',
    'R': 'Ryder',
    'GD': 'General Dynamics',
    'RTN': 'Raytheon',
    'CVS': 'CVS',
    'CAT': 'Caterpillar',
    'DD': 'DuPont de Nemours'}
symbols, names = np.array(list(symbol_dict.items())).T
# quotes = [quotes_historical_yahoo(symbol, d1, d2, asobject=True)
#           for symbol in symbols]
# stock_open = np.array([q.open for q in quotes]).astype(float)
# stock_close = np.array([q.close for q in quotes]).astype(float)
# Pickle the open/close arrays for offline use
# import pickle
# output = open('/Users/chengjun/GitHub/cjc2016/data/stock_open.pkl', 'wb')
# pickle.dump(stock_open, output)
# output.close()
# output = open('/Users/chengjun/GitHub/cjc2016/data/stock_close.pkl', 'wb')
# pickle.dump(stock_close, output)
# output.close()
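# Hypothetical modern replacement (an assumption on my part, not the author's
# code): the Yahoo endpoint and matplotlib.finance are both gone, so one could
# rebuild the pickles from per-symbol CSV files holding 'Open' and 'Close'
# columns. The '../data/<symbol>.csv' path pattern is made up for illustration.
# import pandas as pd
# quotes = [pd.read_csv('../data/%s.csv' % symbol) for symbol in symbols]
# stock_open = np.array([q['Open'].values for q in quotes]).astype(float)
# stock_close = np.array([q['Close'].values for q in quotes]).astype(float)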
import pickle

# Load the pre-pickled open/close price arrays
# (encoding='bytes' lets Python 3 read pickles written under Python 2)
with open('../data/stock_open.pkl', 'rb') as data:
    stock_open = pickle.load(data, encoding='bytes')
with open('../data/stock_close.pkl', 'rb') as data:
    stock_close = pickle.load(data, encoding='bytes')
# The daily variations of the quotes are what carry most information
variation = stock_close - stock_open
variation
array([[ 0.12936386, -0.03172999,  0.20258755, ..., -0.14865278,  0.07157136, -0.44045379],
       [ 0.63269122,  0.0799928 ,  0.99630258, ..., -1.11184795, -0.75570651,  1.01629786],
       [ 0.34160551,  0.01423356,  0.20638595, ..., -0.79103983, -0.43507428, -0.57986219],
       ...,
       [ 0.91294843, -0.13416206,  0.29449672, ..., -0.62319818, -0.33666256, -0.18624163],
       [ 0.00920966, -0.05986028, -0.16116061, ..., -0.30019468, -0.02633234, -0.08426518],
       [ 0.06352973,  0.01270637, -0.01694387, ..., -0.19319241, -0.05201374, -0.63159692]])
# Sparse inverse covariance with cross-validated choice of the l1 penalty
# Learn a graphical structure from the correlations
edge_model = covariance.GraphLassoCV()  # renamed GraphicalLassoCV in scikit-learn >= 0.22
# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)
GraphLassoCV(alphas=4, assume_centered=False, cv=None, enet_tol=0.0001, max_iter=100, mode='cd', n_jobs=1, n_refinements=4, tol=0.0001, verbose=False)
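# A minimal sketch, not part of the original example: inspect the l1 penalty
# selected by cross-validation and how sparse the estimated precision is.
# The 0.02 cutoff mirrors the edge threshold used for the plot further below.
p = edge_model.precision_
iu = np.triu_indices_from(p, k=1)
print('Selected l1 penalty: %.4f' % edge_model.alpha_)
print('Near-zero off-diagonal entries: %.0f%%'
      % (100 * np.mean(np.abs(p[iu]) <= 0.02)))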
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()
for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
Cluster 1: Dell, General Electric, Toyota
Cluster 2: Goldman Sachs, Procter & Gamble, Chevron
Cluster 3: Mitsubishi, Navistar, 3M
Cluster 4: Kimberly-Clark, Amazon, General Dynamics
Cluster 5: Apple, Home Depot, Sanofi-Aventis, Total, HP
Cluster 6: AIG, McDonald's
Cluster 7: Time Warner, Cisco, Yahoo, Canon, Pfizer, Pepsi, Northrop Grumman, CVS, Boeing, Wal-Mart
Cluster 8: IBM, JPMorgan Chase, Kraft Foods
Cluster 9: SAP, Coca-Cola, Exxon, GlaxoSmithKline, Walgreen, Bank of America, ConocoPhillips, Sony, Honda, Marriott, Comcast, Novartis, Valero Energy, Raytheon
Cluster 10: Colgate-Palmolive, Ryder, Cablevision, Lockheed Martin, American Express, Unilever, Microsoft, Texas Instruments
Cluster 11: Ford, Kellogg, Caterpillar, Wells Fargo, DuPont de Nemours
Cluster 12: Xerox
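# A hedged sketch, not in the original: in scikit-learn >= 0.23,
# affinity_propagation accepts random_state, which makes the clustering
# reproducible (older versions draw tie-breaking noise from the global RNG).
# _, labels = cluster.affinity_propagation(edge_model.covariance_,
#                                          random_state=0)
# Collect the cluster memberships into a dict for downstream use
clusters = {i + 1: list(names[labels == i]) for i in range(n_labels + 1)}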
# Find a low-dimensional embedding for visualization: find the best position
# of the nodes (the stocks) on a 2D plane.
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initialized with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
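# Optional sanity check (my addition, not in the original example):
# LocallyLinearEmbedding stores the reconstruction error of the fit; a small
# value suggests the 2D layout preserves local neighborhoods reasonably well.
print('LLE reconstruction error: %g' % node_position_model.reconstruction_error_)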
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
# Normalize the precision matrix P into partial correlations:
# rho_ij = p_ij / sqrt(p_ii * p_jj)
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
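# A quick check, not in the original example: the normalization above is the
# textbook partial-correlation formula applied to the precision matrix P,
# up to sign. The usual definition is -p_ij / sqrt(p_ii * p_jj); only absolute
# values are used as edge weights below, so the sign does not matter here.
P = edge_model.precision_
assert np.isclose(partial_correlations[0, 1],
                  P[0, 1] / np.sqrt(P[0, 0] * P[1, 1]))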
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
            cmap=plt.cm.Spectral)
plt.show()
# Visualization
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
            cmap=plt.cm.Spectral)
# Plot the edges
start_idx, end_idx = np.where(non_zero)
# LineCollection expects a sequence of segments, where each segment is a
# list of (x, y) points: [(x0, y0), (x1, y1), ...]
segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.hot_r,
                    norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.Spectral(label / float(n_labels)),
                       alpha=.6))
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
         embedding[0].max() + .10 * embedding[0].ptp())
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
         embedding[1].max() + .03 * embedding[1].ptp())
plt.show()