In this exercise you'll discover what makes a song unique by finding its most distinctive words using TF-IDF.
You will:
- load a dataset of song lyrics and preprocess the text,
- pick a song and count how often each word appears in it (term frequency),
- count how many songs in the dataset each of those words appears in (document frequency),
- combine the two into TF-IDF scores and inspect the song's most distinctive words.
# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/MyDrive/datasets/lyrics.csv')
# FIXUP DATA: strip zero-width-space characters from the song titles
df["Title"] = df["Title"].str.replace("\u200b", "")
# Look at the data structure
print(df.columns)
print(f"Total songs: {len(df)}")
print(df.head(3))
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Index(['Artist', 'Title', 'Album', 'Year', 'Date', 'Lyric'], dtype='object')
Total songs: 5981
     Artist            Title             Album    Year        Date  \
0  Dua Lipa        New Rules          Dua Lipa  2017.0  2017-06-02
1  Dua Lipa  Don’t Start Now  Future Nostalgia  2019.0  2019-11-01
2  Dua Lipa            IDGAF          Dua Lipa  2017.0  2017-06-02

                                               Lyric
0   one one one one one talkin' in my sleep at n...
1   if you don't wanna see me did a full 80 craz...
2  you call me all friendly tellin' me how much y...
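If you aren't running in Colab, you can skip the Drive mount and point pandas at a local copy of the file instead. The path below is just an assumption; adjust it to wherever your copy of the dataset lives.
# Optional alternative for non-Colab environments. The filename "lyrics.csv" is an
# assumption -- change it to match your local copy of the dataset.
import pandas as pd

df = pd.read_csv("lyrics.csv")
df["Title"] = df["Title"].str.replace("\u200b", "")  # same zero-width-space fixup as above
print(f"Total songs: {len(df)}")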
# JUST RUN THIS, no changes needed
STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
"has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
"to", "was", "were", "will", "with", "i", "you", "we", "they",
"me", "my", "your", "our", "their", "him", "her", "she"
}
def preprocess(text):
    """Convert text to a list of lowercase words, removing stop words"""
    # Convert to lowercase first
    text = text.lower()
    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s]+", text)
    # Keep only non-empty tokens that aren't stop words
    processed_tokens = []
    for token in tokens:
        if token and token not in STOP_WORDS:
            processed_tokens.append(token)
    return processed_tokens
# Test it
test_text = "Hello! How are you doing today?"
print(preprocess(test_text)) # Should print: ['hello', 'how', 'doing', 'today']
['hello', 'how', 'doing', 'today']
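One thing to notice: the regex only splits on commas, periods, exclamation marks, question marks, and whitespace, so contractions like "i'm" and "can't" survive as single tokens, and none of them are in STOP_WORDS. That is why "i'm" will show up near the top of the raw counts later on. If you wanted to filter contractions too, an optional variant could look like the sketch below (it is not applied here, so the counts later in the exercise stay unchanged).
# Optional variant (an assumption, not required for the exercise): also drop a few
# common contractions, which otherwise dominate raw term frequencies.
CONTRACTION_STOP_WORDS = STOP_WORDS | {"i'm", "you're", "it's", "don't", "can't"}

def preprocess_no_contractions(text):
    """Like preprocess(), but also drops a handful of common contractions."""
    tokens = re.split(r"[,\.\!\?\s]+", text.lower())
    return [t for t in tokens if t and t not in CONTRACTION_STOP_WORDS]

print(preprocess_no_contractions("I'm sure you're gonna like this"))
# ['sure', 'gonna', 'like', 'this']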
Write a get_song_lyrics function that looks up a song by artist and title and returns its lyric string
def get_song_lyrics(lyrics_df, artist, title):
    """Return the lyric string for the first song matching the given artist and title."""
    artist_df = lyrics_df[lyrics_df["Artist"] == artist]
    title_df = artist_df[artist_df["Title"] == title]
    return title_df['Lyric'].values[0]
# Test your function
artist = "Eminem" # Change to your choice!
title = "Rap God" # Change to your choice!
target_lyrics = get_song_lyrics(df, artist, title)
print(f"Found lyrics for {artist} - {title}")
print(f"First 200 chars: {target_lyrics[:200]}...")
Found lyrics for Eminem - Rap God
First 200 chars: look i was gonna go easy on you not to hurt your feelings but i'm only going to get this one chance six minutes six minutes something's wrong i can feel it six minutes slim shady you're on just a feel...
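The function above assumes the song exists in the dataset; if the artist or title is misspelled, .values[0] raises an IndexError. If you want a clearer failure message, a more defensive sketch (optional, the name is my own) might look like this:
# Optional, more defensive variant -- the exercise only needs the version above.
def get_song_lyrics_safe(lyrics_df, artist, title):
    """Return lyrics for (artist, title), or raise a readable error if no match is found."""
    match = lyrics_df[(lyrics_df["Artist"] == artist) & (lyrics_df["Title"] == title)]
    if match.empty:
        raise ValueError(f"No song found for {artist} - {title}")
    return match["Lyric"].values[0]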
Write a function to calculate word frequencies
def calculate_term_frequency(text):
    """Count how many times each preprocessed word appears in the text."""
    term_freq = {}
    # FIRST, preprocess the text
    processed_text = preprocess(text)
    for word in processed_text:
        term_freq[word] = term_freq.get(word, 0) + 1
    return term_freq
# Test your function by running the below
tf = calculate_term_frequency(target_lyrics)
print(f"Unique words: {len(tf)}")
print("Top 5 words:", sorted(tf.items(), key=lambda x: x[1], reverse=True)[:5])
Unique words: 579
Top 5 words: [("i'm", 28), ('but', 20), ('like', 16), ('rap', 13), ('get', 12)]
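As an optional sanity check, calculate_term_frequency does the same job as collections.Counter from the standard library, so the two should agree exactly:
# Optional cross-check: Counter over the preprocessed tokens gives the same counts.
from collections import Counter

counter_tf = Counter(preprocess(target_lyrics))
print(counter_tf == calculate_term_frequency(target_lyrics))  # True
print(counter_tf.most_common(5))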
Write a function that calculates how many documents (songs) each word appears in
def calculate_document_frequency(corpus, target_terms):
    """For each target term, count how many documents (songs) in the corpus contain it."""
    doc_freq = {}
    # FIRST, preprocess every document in the corpus into a list of token lists
    processed_corpus = []
    for doc in corpus:
        processed_corpus.append(preprocess(doc))
    for term in target_terms:
        for doc in processed_corpus:
            if term in doc:
                doc_freq[term] = doc_freq.get(term, 0) + 1
    return doc_freq
# Create corpus and calculate DF
corpus = df["Lyric"].tolist()
target_words = list(set(tf.keys())) # Unique words from our target song
print(f"Calculating document frequency for {len(target_words)} words...")
df_counts = calculate_document_frequency(corpus, target_words)
for term, doc_freq in list(df_counts.items())[:10]:
    print(f"{term}: {doc_freq}")
Calculating document frequency for 579 words...
zod: 2
wrong: 659
basically: 24
least: 156
mayweather's: 2
arm: 95
'cause: 2032
can't: 2068
crap: 32
it's: 2968
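This step is the slow one: for each of the ~600 target words it scans every one of the ~6,000 preprocessed songs, and checking term in doc against a list walks the whole list each time. An optional speed-up (a sketch, same counts) is to turn each song into a set of its unique words first, so each membership test is roughly constant time:
# Optional speed-up (a sketch, produces the same counts as the version above).
def calculate_document_frequency_fast(corpus, target_terms):
    """Document frequency using one set of unique words per song for fast lookups."""
    doc_sets = [set(preprocess(doc)) for doc in corpus]
    doc_freq = {}
    for term in target_terms:
        doc_freq[term] = sum(1 for words in doc_sets if term in words)
    return doc_freq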
Write a function to calculate the final TF-IDF scores: for each word, TF-IDF = (number of times the word appears in the song) × log(total number of songs ÷ number of songs containing the word).
def calculate_tfidf(term_freq, doc_freq, total_docs):
    """Combine term frequency and document frequency into a TF-IDF score per word."""
    tfidf = {}
    for term in term_freq:
        tf = term_freq[term]
        # IDF: words that appear in fewer songs get a larger weight
        idf = math.log(total_docs / doc_freq[term])
        tfidf[term] = tf * idf
    return tfidf
# Calculate TF-IDF
tfidf_scores = calculate_tfidf(tf, df_counts, len(corpus))
# Display results
sorted_scores = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop 20 most distinctive words in '{title}':")
for word, score in sorted_scores[:20]:
print(f" {word}: {score:.3f}")
Top 20 most distinctive words in 'Rap God':
 rap: 37.430
 nod: 37.269
 lookin': 31.204
 boy: 21.031
 god: 20.626
 beginnin': 19.497
 slap: 18.088
 asgard: 16.006
 box: 15.841
 doc: 15.423
 nascar: 14.174
 fab: 14.174
 minutes: 13.218
 i'm: 12.473
 front: 12.106
 motherfuckin': 11.702
 fame: 11.441
 satan: 11.401
 while: 11.317
 fuck: 11.008
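A note on the IDF: math.log(total_docs / doc_freq[term]) works here because every target word comes from a song that is itself in the corpus, so its document frequency is at least 1. If you ever score words against a corpus that might not contain them, a common variant adds 1 to both counts ("smoothed" IDF) to avoid dividing by zero. A sketch, whose scores differ slightly from the version above:
# Optional smoothed variant (a sketch): safe even for terms with document frequency 0.
def calculate_tfidf_smoothed(term_freq, doc_freq, total_docs):
    tfidf = {}
    for term, count in term_freq.items():
        idf = math.log((1 + total_docs) / (1 + doc_freq.get(term, 0))) + 1
        tfidf[term] = count * idf
    return tfidf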
Here's some code to help print out the most common words and the most distinctive words.
print("\nAnalysis:")
print(f"Tokens with highest TFIDF scores:")
print(f"Most common words:")
for term, tf_score in sorted(tf.items(), key=lambda x: x[1], reverse=True)[:5]:
print(f"- {term}: {tf_score}")
print(f"Most distinctive words:")
for term, tfidf_score in sorted_scores[:5]:
print(f"- {term}: {tfidf_score}")
print("Are they the same? What does this tell us about TF-IDF?")
Analysis:
Most common words:
- i'm: 28
- but: 20
- like: 16
- rap: 13
- get: 12
Most distinctive words:
- rap: 37.43001466205759
- nod: 37.26861844353934
- lookin': 31.203939538982407
- boy: 21.03074593847605
- god: 20.62596687202567
Are they the same? What does this tell us about TF-IDF?
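If you'd like to sanity-check the whole pipeline against a library, scikit-learn (preinstalled in Colab) provides TfidfVectorizer. It uses a smoothed IDF and L2-normalizes each song's scores by default, so the absolute numbers won't match ours, but the top distinctive words should overlap heavily. A sketch, assuming scikit-learn is available in your runtime:
# Optional comparison (a sketch): TfidfVectorizer smooths the IDF and L2-normalizes
# each row, so its scores differ from ours, but the ranking should look similar.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=preprocess, lowercase=False)
matrix = vectorizer.fit_transform(corpus)      # one row per song, in corpus order
vocab = vectorizer.get_feature_names_out()

target_pos = corpus.index(target_lyrics)       # row position of our target song
row = matrix[target_pos].toarray().ravel()
top_idx = row.argsort()[::-1][:10]
print([(vocab[i], round(float(row[i]), 3)) for i in top_idx])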