A summary of the lecture "Unsupervised Learning with scikit-learn", via DataCamp.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In the video, you saw NMF applied to transform a toy word-frequency array. Now it's your turn to apply NMF, this time using the tf-idf word-frequency array of Wikipedia articles, given as a sparse matrix `articles`. Here, fit the model and transform the articles. In the next exercise, you'll explore the result.
from scipy.sparse import csr_matrix
# Read the word-frequency table; the first CSV column supplies the row labels
documents = pd.read_csv('./dataset/wikipedia-vectors.csv', index_col=0)

# The column labels are the article titles
titles = documents.columns

# Store the values sparsely, then transpose so that each row is one article
word_by_article = csr_matrix(documents.values)
articles = word_by_article.T
from sklearn.decomposition import NMF
# Build a 6-component NMF model and fit it to the articles
# (fit() returns the estimator itself, so the two steps can be chained)
model = NMF(n_components=6).fit(articles)

# Project the same articles onto the learned components: nmf_features
nmf_features = model.transform(articles)

# Show the resulting feature array (one row per article, one column per component)
print(nmf_features)
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.40447144e-01] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.66581665e-01] [3.82052712e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.98630002e-01] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.81723960e-01] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.85497565e-01] [1.29288170e-02 1.37900639e-02 7.76326408e-03 3.34365996e-02 0.00000000e+00 3.34508155e-01] [0.00000000e+00 0.00000000e+00 2.06741971e-02 0.00000000e+00 6.04540794e-03 3.59046120e-01] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.90956931e-01] [1.54271421e-02 1.42828947e-02 3.76635009e-03 2.37026001e-02 2.62642981e-02 4.80754528e-01] [1.11736323e-02 3.13702678e-02 3.09484990e-02 6.56762061e-02 1.96694618e-02 3.38274818e-01] [0.00000000e+00 0.00000000e+00 5.30717612e-01 0.00000000e+00 2.83704029e-02 0.00000000e+00] [0.00000000e+00 0.00000000e+00 3.56508094e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00] [1.20125112e-02 6.50087569e-03 3.12244190e-01 6.09549744e-02 1.13871286e-02 1.92593939e-02] [3.93478571e-03 6.24483457e-03 3.42372089e-01 1.10728765e-02 0.00000000e+00 0.00000000e+00] [4.63812699e-03 0.00000000e+00 4.34913555e-01 0.00000000e+00 3.84308261e-02 3.08119905e-03] [0.00000000e+00 0.00000000e+00 4.83287460e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00] [5.65006510e-03 1.83547516e-02 3.76531712e-01 3.25342948e-02 0.00000000e+00 1.13329771e-02] [0.00000000e+00 0.00000000e+00 4.80912131e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00] [0.00000000e+00 9.01923006e-03 5.51006051e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00] [0.00000000e+00 0.00000000e+00 4.65968041e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00] [0.00000000e+00 1.14088418e-02 2.08654946e-02 5.17579649e-01 5.81458673e-02 1.37848139e-02] [0.00000000e+00 0.00000000e+00 0.00000000e+00 5.10290254e-01 
0.00000000e+00 0.00000000e+00] [0.00000000e+00 5.60141699e-03 0.00000000e+00 4.22226760e-01 0.00000000e+00 0.00000000e+00] [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.36592958e-01 0.00000000e+00 0.00000000e+00] [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.97911506e-01 0.00000000e+00 0.00000000e+00] [9.88376115e-02 8.60100028e-02 3.91034522e-03 3.80879401e-01 4.39283084e-04 5.22130114e-03] [0.00000000e+00 0.00000000e+00 0.00000000e+00 5.71962504e-01 0.00000000e+00 7.13513359e-03] [1.31466473e-02 1.04860275e-02 0.00000000e+00 4.68736079e-01 0.00000000e+00 1.16305318e-02] [3.84543550e-03 0.00000000e+00 0.00000000e+00 5.75501882e-01 0.00000000e+00 0.00000000e+00] [2.25241869e-03 1.38746694e-03 0.00000000e+00 5.27754407e-01 1.20275139e-02 1.49477806e-02] [0.00000000e+00 4.07574382e-01 1.85713967e-03 0.00000000e+00 2.96635743e-03 4.52315589e-04] [1.53418232e-03 6.08212140e-01 5.22275466e-04 6.24626335e-03 1.18454877e-03 4.40049387e-04] [5.38809700e-03 2.65034105e-01 5.38508926e-04 1.86857967e-02 6.38706684e-03 2.90092523e-03] [0.00000000e+00 6.44957364e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [0.00000000e+00 6.08946122e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [0.00000000e+00 3.43707347e-01 0.00000000e+00 0.00000000e+00 3.97828600e-03 0.00000000e+00] [6.10497459e-03 3.15333091e-01 1.54879481e-02 0.00000000e+00 5.06288085e-03 4.74315077e-03] [6.47362189e-03 2.13342287e-01 9.49492529e-03 4.56815320e-02 1.71929395e-02 9.52023189e-03] [7.99132601e-03 4.67625236e-01 0.00000000e+00 2.43337052e-02 0.00000000e+00 0.00000000e+00] [0.00000000e+00 6.42861446e-01 0.00000000e+00 2.35768628e-03 0.00000000e+00 0.00000000e+00] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.77121003e-01 0.00000000e+00] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.94295496e-01 0.00000000e+00] [0.00000000e+00 2.99081204e-04 2.14485182e-03 0.00000000e+00 3.81809252e-01 5.83752705e-03] [0.00000000e+00 0.00000000e+00 
0.00000000e+00 5.64485513e-03 5.42284829e-01 0.00000000e+00] [1.78055699e-03 7.84461186e-04 1.41627290e-02 4.59634651e-04 4.24336362e-01 0.00000000e+00] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.11432598e-01 0.00000000e+00] [0.00000000e+00 0.00000000e+00 3.28382958e-03 0.00000000e+00 3.72916714e-01 0.00000000e+00] [0.00000000e+00 2.62099570e-04 3.61103149e-02 2.32246874e-04 2.30529171e-01 0.00000000e+00] [1.12515562e-02 2.12341198e-03 1.60971826e-02 1.02447544e-02 3.25487703e-01 3.75864568e-02] [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.18991783e-01 3.57664717e-04] [3.08367803e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [3.68174824e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [3.97945914e-01 2.81721215e-02 3.67011224e-03 1.70005030e-02 1.95983506e-03 2.11635763e-02] [3.75795603e-01 2.07534002e-03 0.00000000e+00 3.72019376e-02 0.00000000e+00 5.85903599e-03] [4.38029361e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [4.57882228e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [2.75477966e-01 4.46985638e-03 0.00000000e+00 5.29463349e-02 0.00000000e+00 1.90989751e-02] [4.45195103e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.48742823e-03 0.00000000e+00] [2.92741164e-01 1.33673384e-02 1.14263020e-02 1.05161816e-02 1.87711505e-01 9.23926402e-03] [3.78267498e-01 1.43979557e-02 0.00000000e+00 9.84882180e-02 1.35911385e-02 0.00000000e+00]]
Now you will explore the NMF features you created in the previous exercise. A solution to the previous exercise has been pre-loaded, so the array `nmf_features` is available. Also available is a list `titles` giving the title of each Wikipedia article.
When investigating the features, notice that for both actors, NMF feature 3 has by far the highest value. This means that both articles are reconstructed using mainly NMF component 3 (zero-based index, matching the column label in the DataFrame). In the next video, you'll see why: NMF components represent topics (for instance, acting!).
# Label every row of NMF features with its article title: df
df = pd.DataFrame(nmf_features, index=titles)

# Print the feature rows of the two actors, in order
for actor in ('Anne Hathaway', 'Denzel Washington'):
    print(df.loc[actor])
0 0.003845 1 0.000000 2 0.000000 3 0.575502 4 0.000000 5 0.000000 Name: Anne Hathaway, dtype: float64 0 0.000000 1 0.005601 2 0.000000 3 0.422227 4 0.000000 5 0.000000 Name: Denzel Washington, dtype: float64
In this exercise, you'll check your understanding of how NMF reconstructs samples from its components using the NMF feature values. On the right are the components of an NMF model. If the NMF feature values of a sample are `[2, 1]`, then which of the following is most likely to represent the original sample? A pen and paper will help here! You have to apply the same technique Ben used in the video to reconstruct the sample `[0.1203 0.1764 0.3195 0.141]`.
# NMF feature values of the sample: one weight per component
sample_feature = np.array([2, 1])

# The NMF components, one per row
components = np.array([
    [1.0, 0.5, 0.0],
    [0.2, 0.1, 2.1],
])

# Reconstruct the sample as the feature-weighted sum of the component rows.
# (Transposing a 1-D array is a no-op, so the plain vector is used directly.)
sample_feature @ components
array([2.2, 1.1, 2.1])
In the video, you learned that when NMF is applied to documents, the components correspond to topics of documents, and the NMF features reconstruct the documents from the topics. Verify this for yourself for the NMF model that you built earlier using the Wikipedia articles. Previously, you saw that NMF feature 3 (zero-based) was high for the articles about actors Anne Hathaway and Denzel Washington. In this exercise, identify the topic of the corresponding NMF component.
The NMF model you built earlier is available as `model`, while `words` is a list of the words that label the columns of the word-frequency array.
After you are done, take a moment to recognise the topic that the articles about Anne Hathaway and Denzel Washington have in common!
# Load the vocabulary, one word per line. The file is UTF-8 encoded, so the
# encoding is given explicitly instead of relying on the platform default.
with open('./dataset/wikipedia-vocabulary-utf8.txt', encoding='utf-8') as f:
    words = f.read().splitlines()

# Create a DataFrame with one row per NMF component and one column per word: components_df
components_df = pd.DataFrame(model.components_, columns=words)

# Print the shape of the DataFrame
print(components_df.shape)

# Select row 3: component
component = components_df.iloc[3]

# Print the words with the largest weights in that component (nlargest defaults to 5)
print(component.nlargest())
(6, 13125) film 0.628104 award 0.253223 starred 0.245373 role 0.211528 actress 0.186465 Name: 3, dtype: float64
In the following exercises, you'll use NMF to decompose grayscale images into their commonly occurring patterns. Firstly, explore the image dataset and see how it is encoded as an array. You are given 100 images as a 2D array `samples`, where each row represents a single 13x8 image. The images in your dataset are pictures of an LED digital display.
# The CSV has no header row; each line holds the pixel values of one image
df = pd.read_csv('./dataset/lcd-digits.csv', header=None)

# Preview the first five rows
df.head(n=5)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 104 columns
# Work with the raw 2D array: one row per image
samples = df.values

# Take the first image (still flattened) and print it: digit
digit = samples[0]
print(digit)

# Fold the 104 values back into 13 rows of 8 pixels and print the result: bitmap
bitmap = digit.reshape((13, 8))
print(bitmap)

# Draw the bitmap in grayscale without smoothing, with a colorbar as a legend
plt.imshow(bitmap, interpolation='nearest', cmap='gray')
plt.colorbar()
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [[0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 1. 1. 1. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0.]]
<matplotlib.colorbar.Colorbar at 0x7fe9d065dc50>
Now use what you've learned about NMF to decompose the digits dataset. You are again given the digit images as a 2D array `samples`. This time, you are also provided with a function `show_as_image()` that displays the image encoded by any 1D array:
def show_as_image(sample):
    """Display the image encoded by a 1D array.

    ``sample`` is reshaped to 13 rows by 8 columns (so it must hold 104
    values) and drawn on a new figure in grayscale with a colorbar.
    """
    image = sample.reshape((13, 8))
    plt.figure()
    plt.imshow(image, interpolation='nearest', cmap='gray')
    plt.colorbar()
    plt.show()
def show_as_image(sample):
    """Display the image encoded by a 1D array.

    ``sample`` is reshaped to 13 rows by 8 columns (so it must hold 104
    values) and drawn on a new figure in grayscale with a colorbar.
    """
    bitmap = sample.reshape((13, 8))
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    # Show the figure explicitly so the function also works outside an
    # inline notebook backend, matching the function as specified for the exercise.
    plt.show()
# Build a 7-component NMF model: model
model = NMF(n_components=7)

# Learn the components and get the per-image feature values in one step: features
features = model.fit_transform(samples)

# Display every learned component as an image
for component in model.components_:
    show_as_image(component)

# Feature values of the first image: digit_features
digit_features = features[0]
print(digit_features)
[4.76823559e-01 0.00000000e+00 0.00000000e+00 5.90605054e-01 4.81559442e-01 0.00000000e+00 7.37568241e-16]
Unlike NMF, PCA doesn't learn the parts of things. Its components do not correspond to topics (in the case of documents) or to parts of images, when trained on images. Verify this for yourself by inspecting the components of a PCA model fit to the dataset of LED digit images from the previous exercise. The images are available as a 2D array `samples`. Also available is a modified version of the `show_as_image()` function which colors a pixel red if the value is negative.
After submitting the answer, notice that the components of PCA do not represent meaningful parts of images of LED digits!
from sklearn.decomposition import PCA
# Create a PCA instance with 7 components: model
model = PCA(n_components=7)

# Fit the model and project the samples in one step: features
features = model.fit_transform(samples)

# Display every principal component as an image
for component in model.components_:
    show_as_image(component)
Cosine similarity:
- Uses the angle between the lines
- Higher values mean more similar
- Maximum value is 1, when the angle is 0 degrees
In the video, you learned how to use NMF features and the cosine similarity to find similar articles. Apply this to your NMF model for popular Wikipedia articles, by finding the articles most similar to the article about the footballer Cristiano Ronaldo. The NMF features you obtained earlier are available as `nmf_features`, while `titles` is a list of the article titles.
from sklearn.preprocessing import normalize
# Scale the NMF feature rows with normalize(): norm_features
norm_features = normalize(nmf_features)

# Index the normalized features by article title: df
df = pd.DataFrame(norm_features, index=titles)

# Feature row of the reference article: article
article = df.loc['Cristiano Ronaldo']

# Dot product of every row with the reference row: similarities
similarities = df.dot(article)

# Display the five articles with the largest cosine similarity
print(similarities.nlargest(n=5))
Cristiano Ronaldo 1.000000 Franck Ribéry 0.999972 Radamel Falcao 0.999942 Zlatan Ibrahimović 0.999942 France national football team 0.999923 dtype: float64
In this exercise and the next, you'll use what you've learned about NMF to recommend popular music artists! You are given a sparse array `artists` whose rows correspond to artists and whose columns correspond to users. The entries give the number of times each artist was listened to by each user.
In this exercise, build a pipeline and transform the array into normalized NMF features. The first step in the pipeline, `MaxAbsScaler`, transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to. In the next exercise, you'll use the resulting normalized NMF features for recommendation!
from scipy.sparse import coo_matrix
# Load the (artist, user, playcount) records
df = pd.read_csv('./dataset/scrobbler-small-sample.csv')

# Order the records by artist, then by user (ascending is the default)
artists1 = df.sort_values(['artist_offset', 'user_offset'])

# Pull the three columns out as plain arrays: row index, column index, value
row_ind, col_ind, data1 = (
    np.array(artists1[column])
    for column in ('artist_offset', 'user_offset', 'playcount')
)

# Assemble the sparse matrix: rows are artists, columns are users
artists = coo_matrix((data1, (row_ind, col_ind)))
artists
<111x500 sparse matrix of type '<class 'numpy.int64'>' with 2894 stored elements in COOrdinate format>
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline
# The three pipeline stages, in the order they are applied:
# scaling of the raw play counts: scaler
scaler = MaxAbsScaler()
# a 20-component NMF model: nmf
nmf = NMF(n_components=20)
# normalization of the resulting NMF features: normalizer
normalizer = Normalizer()

# Chain the stages into one estimator: pipeline
pipeline = make_pipeline(scaler, nmf, normalizer)

# Fit all stages on the artists matrix and get the final output: norm_features
norm_features = pipeline.fit_transform(artists)
Suppose you were a big fan of Bruce Springsteen - which other musical artists might you like? Use your NMF features from the previous exercise and the cosine similarity to find similar musical artists. A solution to the previous exercise has been run, so `norm_features` is an array containing the normalized NMF features as rows. The names of the musical artists are available as the list `artist_names`.
# The file has no header row; it holds the artist names
df = pd.read_csv('./dataset/artists.csv', header=None)

# Flatten the values into a plain list of names. ravel() adapts to however
# many names the file holds, instead of hard-coding the count (111).
artist_names = df.values.ravel().tolist()
artist_names
['Massive Attack', 'Sublime', 'Beastie Boys', 'Neil Young', 'Dead Kennedys', 'Orbital', 'Miles Davis', 'Leonard Cohen', 'Van Morrison', 'NOFX', 'Rancid', 'Lamb', 'Korn', 'Dropkick Murphys', 'Bob Dylan', 'Eminem', 'Nirvana', 'Van Halen', 'Damien Rice', 'Elvis Costello', 'Everclear', 'Jimi Hendrix', 'PJ Harvey', 'Red Hot Chili Peppers', 'Ryan Adams', 'Soundgarden', 'The White Stripes', 'Madonna', 'Eric Clapton', 'Bob Marley', 'Dr. Dre', 'The Flaming Lips', 'Tom Waits', 'Moby', 'Cypress Hill', 'Garbage', 'Fear Factory', '50 Cent', 'Ani DiFranco', 'Matchbox Twenty', 'The Police', 'Eagles', 'Phish', 'Stone Temple Pilots', 'Black Sabbath', 'Britney Spears', 'Fatboy Slim', 'System of a Down', 'Simon & Garfunkel', 'Snoop Dogg', 'Aimee Mann', 'Less Than Jake', 'Rammstein', 'Reel Big Fish', 'The Prodigy', 'Pantera', 'Foo Fighters', 'The Beatles', 'Incubus', 'Audioslave', 'Bright Eyes', 'Machine Head', 'AC/DC', 'Dire Straits', 'Motörhead', 'Ramones', 'Slipknot', 'Me First and the Gimme Gimmes', 'Bruce Springsteen', 'Queens of the Stone Age', 'The Chemical Brothers', 'Bon Jovi', 'Goo Goo Dolls', 'Alice in Chains', 'Howard Shore', 'Barenaked Ladies', 'Anti-Flag', 'Nick Cave and the Bad Seeds', 'Static-X', 'Misfits', '2Pac', 'Sparta', 'Interpol', 'The Crystal Method', 'The Beach Boys', 'Goldfrapp', 'Bob Marley & the Wailers', 'Kylie Minogue', 'The Blood Brothers', 'Mirah', 'Ludacris', 'Snow Patrol', 'The Mars Volta', 'Yeah Yeah Yeahs', 'Iced Earth', 'Fiona Apple', 'Rilo Kiley', 'Rufus Wainwright', 'Flogging Molly', 'Hot Hot Heat', 'Dredg', 'Switchfoot', 'Tegan and Sara', 'Rage Against the Machine', 'Keane', 'Jet', 'Franz Ferdinand', 'The Postal Service', 'The Dresden Dolls', 'The Killers', 'Death From Above 1979']
# Index the normalized NMF features by artist name: df
df = pd.DataFrame(norm_features, index=artist_names)

# Feature row of the reference artist: artist
artist = df.loc['Bruce Springsteen']

# Dot product of every row with the reference row: similarities
similarities = df.dot(artist)

# Display the five artists with the highest cosine similarity
print(similarities.nlargest(n=5))
Bruce Springsteen 1.000000 Neil Young 0.956763 Van Morrison 0.874541 Leonard Cohen 0.866873 Bob Dylan 0.862504 dtype: float64