#!/usr/bin/env python
# coding: utf-8

# # Unsupervised Learning: StockCenter
#
# Use K-means and Hierarchical clustering techniques to group stocks. StockCenter can use this information to compile a portfolio containing similarly performing securities across various sectors.
#
# Attributes:
#
# * `Ticker Symbol`: Stock abbreviation
# * `Security`: Company name
# * `GICS Sector`: Global Industry Classification Standard economic sector designation
# * `GICS Sub Industry`: GICS-designated economic sub-industry
# * `Current Price`: Stock price in USD
# * `Price Change`: Percent change in price (last 13 weeks)
# * `Volatility`: Standard deviation in price (last 13 weeks)
# * `ROE`: Net income divided by shareholder equity, where shareholder equity is a company's assets less debts
# * `Cash Ratio`: Cash and cash equivalents divided by total liabilities
# * `Net Cash Flow`: Difference between cash inflow and outflow in USD
# * `Net Income`: Company revenue minus expenses, interest, and taxes (USD)
# * `Earnings Per Share`: Net profit over outstanding common shares (USD/share)
# * `Estimated Shares Outstanding`: Shareholder-held stock
# * `P/E Ratio`: Stock price divided by earnings per share
# * `P/B Ratio`: Stock price divided by company book value per share, where book value is total assets less total liabilities

# ## Importing necessary libraries and data

# In[ ]:

get_ipython().system(' pip install -U scikit-learn')

# In[ ]:

import numpy as np
import pandas as pd

# visualization
from matplotlib import pyplot as plt
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from scipy.cluster.hierarchy import dendrogram

# normalization, distance, and correlation functions
from scipy.stats import zscore
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, cophenet
from sklearn.metrics import silhouette_score

# clustering
from sklearn.cluster import KMeans, AgglomerativeClustering

# In[ ]:

df = pd.read_csv('stock_data.csv')
data = df.copy()

# In[ ]:

sns.set_theme()

# ## Data Overview

# In[ ]:

data.sample(10, random_state=1)

# There are two unique identifiers for each record: ticker symbol and security. Both a general sector and a sub-industry are described as well.

# In[ ]:

data.info()

# Most of our attributes are numerical, with roughly twice as many floats as ints. There are no null entries.

# ## Exploratory Data Analysis (EDA)

# In[ ]:

data.describe().T

# * Many values are in the millions or billions.
# * Net cash flow and net income have some of the highest values, along with some of the lowest.
# * The mean P/E ratio is positive, while the mean P/B ratio is negative.

# In[ ]:

data.describe(include='object').T

# The most common sector in our data is industrials, and the most common sub-industry is Oil & Gas Exploration & Production.

# In[ ]:

# boxplots
data.plot(
    kind='box',
    subplots=True,
    sharey=False,
    layout=(3,4),
    figsize=(15,20));

# * Every attribute shows evidence of many outliers, indicating the broad spread of the data. (A quick count under the 1.5*IQR rule is sketched after this list.)
# * Net cash flow is fairly concentrated around 0, though the minimum and maximum are both massive.
# * The distribution of price change appears fairly balanced.
# * ROE is concentrated near 0, with roughly two pockets of data above.
# * Cash ratio has a single value far outside the rest of the data, as does P/B ratio.
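# To put a rough number on the first bullet, here is a minimal, illustrative sketch (not part of the original walkthrough) that counts boxplot-style outliers, i.e. points beyond 1.5 IQR of the quartiles, for each numeric column.

# In[ ]:

# count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] per numeric column
num = data.select_dtypes(np.number)
q1, q3 = num.quantile(0.25), num.quantile(0.75)
iqr = q3 - q1
((num < q1 - 1.5*iqr) | (num > q3 + 1.5*iqr)).sum()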
# In[ ]:

# histograms
plt.figure(figsize=(20,18))
for idx, col in enumerate(data.select_dtypes(np.number).columns):
    plt.subplot(4,3,idx+1)
    plt.title('Distribution of '+str(col), fontsize=14)
    sns.histplot(data[col], kde=True)
plt.tight_layout()

# * The distribution of stock prices is heavily right-skewed.
# * Volatility, ROE, cash ratio, estimated shares outstanding, and P/E ratio are also right-skewed.
# * Net cash flow is very tightly concentrated about 0 with long tails. Net income is similar, although not as tightly concentrated.
# * P/E ratio has a distinct second peak to the right of the mean.

# In[ ]:

plt.figure(figsize=(12,8))
plt.title('Correlation Matrix', fontsize=20)
sns.heatmap(
    data=data.corr(),
    vmin=-1,
    cmap='Spectral',
    annot=True);

# The matrix above shows the correlation between the different variables.
# * Net income is positively correlated with earnings per share and estimated shares outstanding.
# * Volatility is negatively correlated with price change, net income, and earnings per share. Earnings per share is also negatively correlated with ROE.
# * Earnings per share and current price are positively correlated.

# In[ ]:

sns.pairplot(data, diag_kind='kde', corner=True);

# There are no immediately visible trends in the paired scatterplots.

# In[ ]:

subind = data.groupby('GICS Sector')['GICS Sub Industry'].nunique()

plt.figure(figsize=(15,8))
plt.subplot(1,2,1)
plt.title('Number of Securities per Sector')
sns.countplot(
    x=data['GICS Sector'],
    order=data['GICS Sector'].value_counts().index)
plt.xticks(rotation=90)

plt.subplot(1,2,2)
plt.title('Number of Sub-industries per Sector')
sns.barplot(
    x=subind.index,
    y=subind.values,
    order=data['GICS Sector'].value_counts().index)
plt.xticks(rotation=90)
plt.show()

# On the left, we see the number of records per sector. The plot on the right shows the number of sub-industries in each sector.
# * Industrials has the most records in our data set, followed by financials, health care, and consumer discretionary.
# * Consumer discretionary, however, boasts the most sub-industries.

# In[ ]:

plt.figure(figsize=(8,5))
plt.title('Price Change by Sector')
sns.barplot(
    data=data,
    x='GICS Sector',
    y='Price Change',
    order=(data
           .groupby('GICS Sector')['Price Change']
           .mean()
           .sort_values(ascending=False)
           .index))
plt.xticks(rotation=90);

# This plot shows price change by GICS sector, sorted by mean price increase. Health care is the sector with the greatest mean price increase.

# In[ ]:

sns.boxplot(
    data=data,
    y='GICS Sector',
    x='Net Income',
    showfliers=False);

# Net income is generally positive. However, the energy sector seems to report largely negative net incomes!

# In[ ]:

energy_df = (pd.DataFrame(
    data.query("`GICS Sector`=='Energy'")['GICS Sub Industry'].value_counts())
    .rename(columns={'GICS Sub Industry':'no_records'}))
energy_df['mean_net_income'] = (
    data
    .query("`GICS Sector`=='Energy'")
    .groupby('GICS Sub Industry')['Net Income']
    .mean())
energy_df

# Looking closer, we find that one sub-industry is to blame: Oil & Gas Exploration & Production. This sub-industry has a very negative mean net income and by far the most records. It therefore outweighs the other sub-industries and drags the sector's distribution down into the negative net income range.
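# To confirm that this sub-industry dominates in aggregate and not just on average, a quick illustrative sketch (an addition, not part of the original analysis) totals net income per energy sub-industry alongside the counts and means already shown.

# In[ ]:

# count, mean, and total net income per energy sub-industry
(data
 .query("`GICS Sector`=='Energy'")
 .groupby('GICS Sub Industry')['Net Income']
 .agg(['count','mean','sum']))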
# In[ ]:

# cash ratio boxplots
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Cash Ratio by Sector')
sns.boxplot(
    data=data,
    x='GICS Sector',
    y='Cash Ratio',
    order=(data
           .groupby('GICS Sector')['Cash Ratio']
           .mean()
           .sort_values(ascending=False)
           .index),
    showfliers=True)
plt.xticks(rotation=90)

plt.subplot(1,2,2)
plt.title('same without fliers')
sns.boxplot(
    data=data,
    x='GICS Sector',
    y='Cash Ratio',
    order=(data
           .groupby('GICS Sector')['Cash Ratio']
           .mean()
           .sort_values(ascending=False)
           .index),
    showfliers=False)
plt.xticks(rotation=90)
plt.show()

# Here we see the cash ratio by GICS sector. The sectors are sorted by mean cash ratio, with IT highest and utilities lowest.
#
# We get a better idea of relative spread without the fliers, although it is precisely those fliers that bias the means. Put another way, that value around 1000 is part of the reason IT has the highest mean cash ratio.
#
# Financials and real estate are very tightly distributed. IT and health care, on the other hand, see a broad range of cash ratios. Utilities has a reliably low cash ratio. In the middle are consumer staples and energy; the former has a greater overall spread, but a narrower middle 50% than the latter.

# In[ ]:

# p/e ratio boxplots
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('P/E Ratio by Sector')
sns.boxplot(
    data=data,
    x='GICS Sector',
    y='P/E Ratio',
    order=(data
           .groupby('GICS Sector')['P/E Ratio']
           .mean()
           .sort_values(ascending=True)
           .index),
    showfliers=True)
plt.xticks(rotation=90)

plt.subplot(1,2,2)
plt.title('same without fliers')
sns.boxplot(
    data=data,
    x='GICS Sector',
    y='P/E Ratio',
    order=(data
           .groupby('GICS Sector')['P/E Ratio']
           .mean()
           .sort_values(ascending=True)
           .index),
    showfliers=False)
plt.xticks(rotation=90)
plt.show()

# Here are P/E ratios by sector, sorted by sector mean. A lower P/E ratio is generally considered better, as a high ratio indicates an overvalued stock.
# * Telecom has the lowest P/E ratio.
# * Energy has the highest P/E ratio and the greatest variability from the lower fence to the upper fence.
# * Consumer discretionary has one value far above the others. IT and health care also have a value floating far outside their range.
# * Utilities has P/E ratios packed between roughly 20 and 25. This range is favored by many investors because it is not too high and still realistic. Financials and industrials also have P/E ratios on the lower end, though with greater spreads.

# ## Data Preprocessing

# In[ ]:

data.duplicated().sum()

# There are no duplicate records in our data set.

# In[ ]:

data.isna().sum()

# Additionally, there are no records with null entries.

# In[ ]:

data.plot(
    kind='box',
    subplots=True,
    sharey=False,
    layout=(3,4),
    figsize=(15,20));

# We recall the boxplots from the EDA above to check for conspicuous outliers. The attributes Current Price, Cash Ratio, and P/B Ratio have prominent values on the high end.

# In[ ]:

data.query('`Current Price`>1000')

# Researching the market history of this security reveals that the recorded current price, while higher than our other observations, is completely reasonable. (Note: PCLN is now traded on the Nasdaq under the ticker symbol BKNG.)

# In[ ]:

data.query('`Cash Ratio`>800')

# A cash ratio of 958 is preposterous. [Research](https://www.macrotrends.net/stocks/charts/META/meta-platforms/current-ratio) indicates that Facebook's cash ratio has never risen above 14. As of now (January 2023), its maximum was achieved in Q3 2017 at a value of just over 13. Facebook's [current cash ratio](https://www.investing.com/equities/facebook-inc-ratios) is 2.57.
#
# That being said, around 50% of our data have a cash ratio above 50, far outside expectations for a data set of this size. We must conclude that there is a difference in definition or a systematic difference in calculation that inflates all the cash ratios. As we will be scaling this data, the relationships between records matter far more than the raw numbers, so we can look past this inconsistency.
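# As a quick check of the claim above, this small illustrative sketch computes the share of records whose cash ratio exceeds 50.

# In[ ]:

# fraction of records with a cash ratio above 50
(data['Cash Ratio'] > 50).mean()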
# In[ ]:

data.query('`P/B Ratio`>100')

# While ADS currently sees a P/B ratio far lower than the recorded value, its stock price history indicates there was a time when a ratio of 129 was possible.

# In[ ]:

# scale numerical data
attr = data.drop(
    data.select_dtypes('object').columns,
    axis=1
).apply(zscore)

# Lastly, we separate out the numerical features and scale each column with the z-score.

# ## Second EDA

# In[ ]:

plt.figure(figsize=(20,18))
for idx, col in enumerate(attr.columns):
    plt.subplot(4,3,idx+1)
    plt.title('Distribution of '+str(col), fontsize=14)
    sns.histplot(attr[col], kde=True)
plt.tight_layout()

# As we only scaled the data, there is no change to the distributions other than the axis scales. Note that each distribution now has a mean of 0 and a standard deviation of 1. This is especially apparent in the distribution of price change, which is approximately normal.

# ## K-means Clustering

# ### Exploration of k values

# In[ ]:

# elbow method visualization
km = KMeans(n_init=10, random_state=1)
vis = KElbowVisualizer(km, k=(1,10), timings=False, locate_elbow=False)
vis.fit(attr)
vis.show();

# The curve is fairly smooth, but k=3 and k=4 have slightly more bend than the rest in my estimation.

# In[ ]:

# compute silhouette scores
scores = []
for k in range(2,10):
    scores.append(silhouette_score(
        attr,
        KMeans(n_clusters=k,
               n_init=10,
               random_state=1).fit_predict(attr)))

# In[ ]:

plt.figure(figsize=(8,5))
plt.title('Silhouette Scores by Number of Clusters')
plt.plot(np.arange(2,10), scores, marker='D')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score');

# Looking at the silhouette scores for various numbers of clusters, we see the greatest score is achieved at k=3 clusters. After k=4, the values drop off sharply. At k=8, there is a brief rise.

# In[ ]:

# broad search silhouette
scores = []
for k in range(2,30):
    scores.append(silhouette_score(
        attr,
        KMeans(n_clusters=k,
               n_init=10,
               random_state=1).fit_predict(attr)))

plt.figure(figsize=(8,5))
plt.title('Silhouette Scores by Number of Clusters')
plt.plot(np.arange(2,30), scores, marker='D')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score');

# Although k=8 looked appreciably worse than k=3 in the last plot, this broad search reveals that the scores for k<10 are all quite good. Interestingly, there's a spike around k=13.
#
# To determine the best value for k, we will use the silhouette visualizer to study the quality of clusters.

# In[ ]:

# silhouette visualizer
for k in range(2,10):
    vis = SilhouetteVisualizer(
        KMeans(n_clusters=k,
               n_init=10,
               random_state=1)).fit(attr).show();

# The visualizer yields some interesting findings.
#
# * There is always one big group. This big group comprises observations with positive silhouette scores.
# * The smaller groups are largely composed of entries with negative silhouette scores.
# * Groups 0 and 1 in the k=3 model have just as many negative silhouette scores as positive ones. What's more, the worst (most negative) values in group 0 are greater in magnitude than the best (most positive) ones.
# * The model with the fewest negative observations is k=8: over 50% of records in each cluster have a positive silhouette score. (A per-cluster check of this claim is sketched after this list.)
# * The number of records in each of the smaller clusters varies. (See bar widths.)
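# The per-cluster claim for k=8 can be checked numerically with sklearn's `silhouette_samples`. A minimal illustrative sketch (the names `labels8` and `sil` are additions, not part of the original walkthrough):

# In[ ]:

# fraction of records with a positive silhouette score in each k=8 cluster
from sklearn.metrics import silhouette_samples

labels8 = KMeans(n_clusters=8, n_init=10, random_state=1).fit_predict(attr)
sil = pd.Series(silhouette_samples(attr, labels8))
sil.groupby(labels8).apply(lambda s: (s > 0).mean())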
# In[ ]:

scores[3-2] - scores[8-2]

# Despite its slightly higher silhouette score, k=3 appears to be the weaker choice: the largest cluster's separation from the others is inflating the silhouette score, much like skewed data biasing a mean above the median. What's more, the difference between the scores for k=3 and k=8 is only 0.0617.
#
# From the silhouette visualizer, we see that the best separation is achieved at k=8 clusters. Many of the clusters contain only records with positive silhouette scores, and negative scores do not comprise the majority of any cluster. We will further investigate k=8.

# ### k=8

# In[ ]:

# kmeans model
km = KMeans(n_clusters=8, n_init=10, random_state=1)
km.fit(attr)
km_group = km.predict(attr)
data['km_group'] = km_group

# In[ ]:

# boxplots of groups
for col in attr.columns:
    plt.figure(figsize=(16,5))
    plt.suptitle(str(col)+' by Cluster', fontsize=20)
    plt.subplot(1,2,1)
    plt.title('with outliers')
    sns.boxplot(data=data, x='km_group', y=col, showfliers=True)
    plt.xlabel('Cluster')
    plt.subplot(1,2,2)
    plt.title('outliers removed')
    sns.boxplot(data=data, x='km_group', y=col, showfliers=False)
    plt.xlabel('Cluster')
    plt.show()
    print('')

# In[ ]:

data.groupby('km_group').mean()

# ### Cluster Analysis

# There's a lot to absorb here, but let's start by highlighting group 5.
#
# * The current stock prices for securities in group 5 are comparable to some other groups, but are generally on the lower end.
# * Price change is low but positive, with a narrow spread. Low volatility means that this group contains fairly consistent securities.
# * Net cash flow is more variable here than in any other group, perhaps biased by several negative records. Note that there is also a highly positive record, but it lies so far outside the spread of the data that it is represented as a single dot above the upper fence of the boxplot.
# * Net income of group 5 securities is higher than in any other group!
# * This group has the second-highest mean earnings per share, outperformed only by group 1.
# * Very noticeably, group 5 has more shares outstanding than any other.
# * Group 5 also has some of the lowest P/E ratios, indicating that the stock price is not inflated relative to the company's performance. (In fact, this group has the lowest mean P/E ratio.)
#
# Let's take a closer look at the securities comprising group 5.

# In[ ]:

data.query('km_group==5')

# We find primarily big-name, multinational corporations here, including financial institutions, major technology and telecommunication corporations, and even common consumer brands such as Ford and Coca Cola. Note that banks are the major contributor to the negative skew in net cash flow.
#
# Turning to a more general cluster analysis, let's look at the number of records in each group.

# In[ ]:

data['km_group'].value_counts()

# As suggested by the silhouette visualizer, there is one big group (namely group 2) and many smaller groups. Group 6 has two records, the fewest of the bunch.

# In[ ]:

# group 2
data.query('km_group==2')['GICS Sector'].value_counts()

# Group 2 is comprised of securities from many sectors. Industrials are the most common, followed by financials. The sector with the fewest records in this group is telecom. (A sector-by-cluster crosstab is sketched below.)
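# To see the sector make-up of every k-means cluster at once, here is a small illustrative crosstab (a convenience addition, not part of the original walkthrough).

# In[ ]:

# rows: k-means clusters; columns: GICS sectors; entries: record counts
pd.crosstab(data['km_group'], data['GICS Sector'])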
# In[ ]:

# group 3
data.query('km_group==3')['GICS Sector'].value_counts()

# In[ ]:

data['GICS Sector'].value_counts()['Energy']

# Of the 30 energy securities in our data set, 21 have landed in group 3.

# In[ ]:

# non-energy records in group 3
(data
 .query('km_group==3')
 .loc[(data
       .query('km_group==3')['GICS Sector']
       .mask(lambda x: x=='Energy')
       .notnull())])

# The other entries in group 3 are split between industrials, materials, and information technology. These six records are shown above.

# In[ ]:

# group 0
data.query('km_group==0')['GICS Sector'].value_counts()

# Group 0 is comprised primarily of IT and health care securities. It generally performs around the middle of the pack, but it offers the highest cash ratios among the groups.

# In[ ]:

data.query('km_group==6')

# While group 6 is small, at least both records are from the energy sector. Other than that, their performance differs in almost every attribute.

# ### A Curiosity: k=3

# We briefly investigate the possibility of only three clusters.

# In[ ]:

# kmeans with k=3
km3 = KMeans(n_clusters=3, n_init=10, random_state=1)
km3.fit(attr)
km_group3 = km3.predict(attr)
data['km_group3'] = km_group3

# In[ ]:

for col in attr.columns:
    plt.figure(figsize=(8,5))
    plt.title(str(col)+' by Cluster')
    sns.boxplot(data=data, x='km_group3', y=col, showfliers=False)
    plt.xlabel('Cluster')
    plt.show()
    print('')

# With only three clusters, we see some features with separation and others with none. This model seems too coarse to distinguish meaningfully between groups of securities.

# ## Hierarchical Clustering

# ### Exploration of linkage methods, distance metrics, and number of clusters

# In[ ]:

# declaring variables
dist_metrics = ['euclidean','chebyshev','mahalanobis','minkowski','cityblock']
link_methods = ['single','average','complete','weighted']
pointwise = pdist(attr)
tab = pd.DataFrame(
    np.zeros((5,4)),
    index=dist_metrics,
    columns=link_methods)
max_coph = 0
max_coph_data = {'metric':'', 'linkage':''}

# We will explore many distance metrics and linkage methods to identify the pair with the highest cophenetic correlation.

# In[ ]:

# computing cophenetic correlations
for d in dist_metrics:
    for l in link_methods:
        z = linkage(attr, metric=d, method=l)
        c, _ = cophenet(z, pointwise)
        tab.loc[d,l] = c
        if c > max_coph:
            max_coph = c
            max_coph_data['metric'] = d
            max_coph_data['linkage'] = l

# In[ ]:

print(f'''The maximum cophenetic correlation is {max_coph}
with the {max_coph_data['metric']} metric and {max_coph_data['linkage']} linkage method.''')

# In[ ]:

tab

# The best score is achieved with the euclidean metric and average linkage. The table above lists the cophenetic correlation for every pair of metric and linkage method.
#
# Notice that the scores trend based on the linkage method, with the distance metric having far less effect. For example, the complete linkage method reliably led to lower cophenetic correlations across distance metrics.
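# Incidentally, the best pair can be read straight off the table without the running-max bookkeeping; a one-line illustrative alternative:

# In[ ]:

# (metric, linkage) pair with the highest cophenetic correlation
tab.stack().idxmax()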
# There are a few extra linkage methods that require the Euclidean metric. Let's look at the dendrograms for all Euclidean linkage methods.

# In[ ]:

link_methods = ['single','average','complete','weighted','median','centroid','ward']

# dendrogram plots
for l in link_methods:
    z = linkage(attr, metric='euclidean', method=l)
    c, _ = cophenet(z, pointwise)

    # full tree
    plt.figure(figsize=(18,10))
    plt.subplot(2,1,1)
    plt.title(f'{l.capitalize()} linkage method with cophenetic correlation {np.round(c,4)}',
              fontsize=20)
    dendrogram(z)
    plt.xticks([])

    # truncated
    plt.subplot(2,1,2)
    plt.title('Truncated dendrogram', fontsize=16)
    dendrogram(z, p=30, truncate_mode='lastp')
    plt.xticks([])
    plt.show()
    print('')

# The general clusterings for each method can be observed in the truncated dendrograms. These truncated plots more accurately represent how the clustering would look for our purposes, as we will certainly not choose a model with more than, say, twenty clusters.
# * One of the most organized dendrograms is single linkage; the hierarchies are quite clear there.
# * The average linkage method still offers the best cophenetic correlation, though centroid is a close second.
# * The Ward method has the lowest cophenetic correlation. Additionally, its dendrogram structure is noticeably different from the rest.
#
# Let's look closer at the dendrogram with Euclidean distance and average linkage.

# In[ ]:

z = linkage(attr, metric='euclidean', method='average')

plt.figure(figsize=(18,15))

# full dendrogram
plt.subplot(3,1,1)
plt.title('Dendrogram of Clustering with Average Linkage Method', fontsize=20)
dendrogram(z)
plt.axhline(y=8, color='black', linestyle='dashed', linewidth=3, label='8 clusters')
plt.axhline(y=6, color='black', linestyle='dotted', linewidth=3, label='14 clusters')
plt.xticks([])
plt.legend(loc='upper right', prop={'size':15})

# truncated 8
plt.subplot(3,1,2)
plt.title('8 clusters', fontsize=16)
dendrogram(z, p=8, truncate_mode='lastp')
plt.xticks([])

# truncated 14
plt.subplot(3,1,3)
plt.title('14 clusters', fontsize=16)
dendrogram(z, p=14, truncate_mode='lastp')
plt.xticks([])
plt.show()

# ### Comparing k values
#
# We start with eight clusters.

# In[ ]:

# 8 clusters
ag8 = AgglomerativeClustering(n_clusters=8, metric='euclidean', linkage='average')
ag8.fit(attr)
pd.Series(ag8.labels_).value_counts()

# It seems that most records are sorted into a single group. Is this true of models with a different number of clusters?

# In[ ]:

# plot of biggest cluster size
cl_size = []
for k in range(2,19):
    a = AgglomerativeClustering(
        n_clusters=k,
        metric='euclidean',
        linkage='average'
    ).fit(attr).labels_
    cl_size.append(
        pd.Series(a)
        .value_counts(sort=True)
        .reset_index(drop=True)[0])

plt.figure(figsize=(8,5))
plt.title('Largest Cluster Size by Number of Clusters', fontsize=16)
plt.plot(np.arange(2,19), cl_size, marker='D')
plt.xlabel('Number of Clusters')
plt.ylabel('Size of Largest Cluster');

# At best, hierarchical clustering is throwing 90% of the data into one group, and usually even more. This problem persists for every reasonably small choice of k. That is to say, there can be very little in the way of trends in these groupings, as one group is too large and the rest are too small.

# In[ ]:

# 14 clusters
ag14 = AgglomerativeClustering(n_clusters=14, metric='euclidean', linkage='average')
ag14.fit(attr)
pd.Series(ag14.labels_).value_counts()

# Our other candidate, k=14, does not fare much better. About 90% of the data is sorted into one cluster, and the majority of the other clusters are singletons or pairs.
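# As an aside, the same flat clustering can be obtained by cutting the tree encoded in the linkage matrix, which is exactly what the dashed and dotted lines in the dendrogram above depict. A minimal illustrative sketch using scipy's `fcluster` (assuming the average-linkage matrix `z` from above; labels may be permuted relative to `ag14.labels_`):

# In[ ]:

# cut the average-linkage tree into exactly 14 flat clusters
from scipy.cluster.hierarchy import fcluster

flat14 = fcluster(z, t=14, criterion='maxclust')
pd.Series(flat14).value_counts()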
# Let's look for trends in only the clusters with more than two records.

# In[ ]:

data['ag_group'] = ag14.labels_

# In[ ]:

# identifying the somewhat bigger clusters
idx = []
vc = pd.Series(ag14.labels_).value_counts().to_dict()
for key in vc.keys():
    if vc[key] > 2:
        idx.append(key)

data_cut = data.query('ag_group in @idx')

# We have cut out the records that don't land in one of the four biggest clusters obtained by agglomerative clustering.

# In[ ]:

# boxplots
for col in attr.columns:
    plt.figure(figsize=(8,5))
    plt.title(str(col)+' by Cluster', fontsize=20)
    sns.boxplot(data=data_cut.drop('km_group',axis=1),
                x='ag_group',
                y=col,
                showfliers=False)
    plt.xlabel('Cluster')
    plt.show()
    print('')

# Recall that group 0 has 7 records, group 1 has 5 records, group 2 has 307 records, and group 4 has 9 records.
#
# Looking at the top four clusters, we get meaningful trends and good separation of features.
# * Group 0 generally has the lowest current price.
# * Group 4 has an extremely narrow price change. Securities in group 4 very reliably gain value.
# * On the other hand, group 0 has a wide spread of negative values, meaning that its securities lose value by varying amounts. In addition, group 0 is highly volatile, meaning the spread of its price data over the last 13 weeks is greater.
# * Group 1 outperforms the rest on ROE by a considerable margin.
# * Cash ratios are fairly similar, differing primarily in spread.
# * Group 4 has a large spread in net cash flow, whereas the other three groups see a net cash flow around 0.
# * Group 4 has the highest net income, while group 0 has the lowest, with the latter almost always negative.
# * While group 2 has both positive and negative earnings per share, groups 1 and 4 see only positive values. Group 0, on the other hand, is all negative.
# * Group 4 differentiates itself with more estimated outstanding shares than the rest.
# * Group 0 has the highest P/E ratio, and group 2 has the greatest spread in P/B ratio.
#
# There is sufficient separation in the data to make fourteen clusters the best choice for a hierarchical model.

# In[ ]:

data_cut.query('ag_group==0')

# The worst performing cluster is certainly group 0. We see here that this group contains securities from the energy sector (and one from materials). These companies are involved in the extraction of natural resources from the earth, such as oil, gas, and copper.

# In[ ]:

data_cut.query('ag_group==4')

# Group 4, one of the better performing clusters, has a good mix of sectors. Current prices are low enough to be a good buy, and positive price change with low volatility indicates that these securities will appreciate in value.

# ## K-means vs Hierarchical Clustering

# ### The Big Clusters
#
# Both clustering algorithms yield one big cluster. Let's look first at how much the big ones overlap. Recall that `km_group` identifies the cluster assigned by the k-means model and `ag_group` is the cluster given by agglomerative hierarchical clustering.
# Note that the big groups happen to both be labelled 2.

# In[ ]:

# % overlap in big clusters
(data
 .query('ag_group==2')
 .query('km_group==2')
 .shape[0]
 / data
 .query('km_group==2')
 .shape[0])

# Over 99% of records in k-means cluster 2 were sorted into hierarchical cluster 2. This shows strong alignment: k-means cluster 2 is essentially a subset of hierarchical cluster 2.
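# The full overlap structure between the two models can be seen at a glance with a crosstab; a small illustrative addition:

# In[ ]:

# rows: k-means clusters; columns: agglomerative clusters; entries: record counts
pd.crosstab(data['km_group'], data['ag_group'])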
# In[ ]:

# left out record (and its group)
a = data.query('ag_group==2').index
b = data.query('km_group==2').index.isin(a)
c = data.query('km_group==2').loc[np.logical_not(b)]['ag_group'].values[0]
data.query('ag_group==@c')

# Only one record belonging to k-means cluster 2 was put in a different hierarchical cluster, namely Bank of America. It landed in its own singleton group in the agglomerative clustering model.
#
# But there are 307 records in hierarchical group 2, so which k-means clusters are the remaining records assigned to (those that didn't make it into the big k-means group)?

# In[ ]:

# km groups of the ag big group's overflow
(data
 .loc[data['km_group']
      .mask(lambda x: x==2)
      .notnull()]
 .query('ag_group==2')['km_group']
 .value_counts())

# The remaining records fall into groups 3, 0, 1, and 5.
#
# This can be summarized by saying that hierarchical clustering put 90% of the records into a single cluster, whereas k-means spread those same records across five clusters.

# ### Smaller Clusters

# In[ ]:

data_cut.query('ag_group==0')['km_group'].unique()

# In[ ]:

data.query('km_group==3')['ag_group'].unique()

# There's good overlap among some clusters. Every record in agglomerative cluster 0 is in k-means cluster 3. Conversely, every record in k-means cluster 3 falls into either ag. group 0 or 2.
#
# That is to say, ag. cluster 0 contains a subset of the records in k-means cluster 3.

# In[ ]:

data_cut.query('ag_group==4')['km_group'].unique()

# In[ ]:

data.query('km_group==5')['ag_group'].unique()

# Here again, ag. cluster 4 contains a subset of the records in k-means group 5.

# ### Cluster Qualities
#
# As was just discussed, there's much overlap in the big clusters. While k-means gave eight clusters, the agglomerative method yielded fourteen.
#
# Most of the groups obtained through hierarchical clustering contained fewer than five records. With so few records per cluster, we gain little insight into trends across securities. Moreover, a diversified portfolio could not be assembled from a single cluster, as there are not enough records per cluster to guarantee robust performance. We can glean some information from the four biggest clusters, but they are still dominated by the largest of the bunch.
#
# Where the k-means model excels is in producing more medium-sized clusters. This allows us to investigate more distinct trends and lends itself better to StockCenter's goal of assembling diversified, well-performing portfolios.

# ### Execution Time
#
# This is not actually a huge concern, as both clustering methods run quite quickly. Still, it is interesting to compare.

# In[ ]:

get_ipython().run_cell_magic('timeit', '', 'KMeans(n_clusters=8,n_init=10,random_state=1).fit(attr)\n')

# In[ ]:

get_ipython().run_cell_magic('timeit', '', "AgglomerativeClustering(\n    n_clusters=14,\n    metric='euclidean',\n    linkage='average').fit(attr)\n")

# We find that k-means takes roughly seven times longer to execute. Still, the run time is on the order of milliseconds, so no worries!

# ## Actionable Insights and Recommendations

# ### k-means
#
# Recall group 5 from k-means clustering, studied earlier.

# In[ ]:

data.query('km_group==5')

# There is a good mix of sectors in this group. Investing in a diverse group of sectors is important, as dips in the performance of one sector are mitigated by other, unaffected sectors.
#
# A portfolio comprised of securities from a single sector will not be resilient to disruptions in that sector. Consider, for example, how global shipping was upset by the vessel that was lodged in the Suez Canal. While global logistics companies struggled, there was no real impact on the energy sector or financials. Such diversity lends resiliency to a portfolio.
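# As a quick verification of the sector mix claimed above, an illustrative count of sectors within group 5:

# In[ ]:

# sector composition of k-means group 5
data.query('km_group==5')['GICS Sector'].value_counts()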
# In[ ]:

# km_group3 (from the k=3 curiosity) is also dropped so only true features are plotted
data.query('km_group==5').drop(['km_group','km_group3','ag_group'],axis=1).plot(
    kind='box',
    subplots=True,
    sharey=False,
    layout=(3,4),
    figsize=(15,20));

# * Stock prices in group 5 are fairly low, and the lower P/E ratio suggests these are not overvalued.
# * The volatility of stocks in group 5 is among the lowest of the groups.
# * From our cluster analysis earlier, we recall that group 5 has the stocks with the highest net income.
#
# I might advise investing in most of the stocks in group 5 to build a diverse portfolio. Perhaps I would leave out Pfizer and Coca Cola, as their P/E ratios are higher than the rest (25 and 28 respectively).

# In[ ]:

data.query('km_group==3')['GICS Sector'].value_counts()

# I would advise against building a portfolio entirely on group 3. This group is comprised almost entirely of energy securities. As was discussed before, a disruption in the energy industry would negatively impact all of these stocks, bringing down the value of the portfolio significantly.

# ### Hierarchical
#
# Let's look at group 4 from hierarchical clustering.

# In[ ]:

data.query('ag_group==4')

# * From our analyses earlier, recall that group 4 has a steady positive price change.
# * While the price is going up, the stocks are still affordable and are not overvalued (P/E ratio).
# * There is a good mix of GICS sectors in this group.
#
# This would be the group I would build a portfolio on. What's more, this group is a subset of the k-means group just described! In summary, both clustering methods identified a group with almost the same high-performing, diversified stocks.
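# As a closing sketch, the k-means shortlist suggested earlier could be pulled with a single query; the P/E cutoff of 25 is a hypothetical threshold chosen only to exclude Pfizer and Coca Cola, not part of the original analysis.

# In[ ]:

# group 5 securities below the (hypothetical) P/E cutoff of 25
data.query('km_group==5 and `P/E Ratio` < 25')[
    ['Ticker Symbol','Security','GICS Sector','P/E Ratio']]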