# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re

drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/MyDrive/datasets/lyrics.csv')

# Look at the data structure
print(df.columns)
print(f"Total songs: {len(df)}")
print(df.head(3))

# JUST RUN THIS, no changes needed
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with", "i", "you", "we", "they",
    "me", "my", "your", "our", "their", "him", "her", "she"
}

def preprocess(text):
    """Convert text to a list of lowercase words, removing stop words."""
    # Convert to lowercase first
    text = text.lower()
    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s]+", text)
    # Keep only non-empty tokens that aren't stop words
    processed_tokens = []
    for token in tokens:
        if token and token not in STOP_WORDS:
            processed_tokens.append(token)
    return processed_tokens

# Test it
test_text = "Hello! How are you doing today?"
print(preprocess(test_text))
# Should print: ['hello', 'how', 'doing', 'today']

def get_song_lyrics(lyrics_df, artist, title):
    # TODO: Your code here!
    # 1. Filter to rows where "Artist" equals artist
    # 2. From those rows, filter to where "Title" equals title
    # 3. Get the "Lyric" value (use .iloc[0]["Lyric"] or .values[0])
    # 4. Handle the case where the song isn't found
    pass

# Test your function
artist = "Dua Lipa"   # Change to your choice!
title = "New Rules"   # Change to your choice!
target_lyrics = get_song_lyrics(df, artist, title)
print(f"Found lyrics for {artist} - {title}")
print(f"First 200 chars: {target_lyrics[:200]}...")

def calculate_term_frequency(text):
    term_freq = {}
    # FIRST, preprocess the text
    processed_text = preprocess(text)
    # TODO: Your code here!
    # For each word in processed_text:
    # 1. If it is NOT in term_freq (remember `in term_freq`), set its count to 1
    # 2. If it IS in term_freq, increment its count by 1
    # 3. Return term_freq
    return term_freq

# Test your function by running the below
tf = calculate_term_frequency(target_lyrics)
print(f"Unique words: {len(tf)}")
print("Top 5 words:", sorted(tf.items(), key=lambda x: x[1], reverse=True)[:5])

def calculate_document_frequency(corpus, target_terms):
    doc_freq = {}
    # FIRST, preprocess the whole corpus into a list of token sets
    processed_corpus = []
    for doc in corpus:
        processed_corpus.append(set(preprocess(doc)))  # a set makes `in` checks fast
    # TODO: Your code here!
    # We've already preprocessed the corpus for you above!
    # Now, for each document in processed_corpus:
    # 1. For each word in target_terms:
    # 2. Check if the word is in the document set
    # 3. If yes, increment doc_freq[word]
    return doc_freq

# Create corpus and calculate DF
corpus = df["Lyric"].tolist()
target_words = list(set(tf.keys()))  # Unique words from our target song
print(f"Calculating document frequency for {len(target_words)} words...")
df_counts = calculate_document_frequency(corpus, target_words)
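# ------------------------------------------------------------------
# Reference sketches (peek only if you're stuck). These show ONE
# possible way to fill in the three TODOs above, assuming the CSV has
# "Artist", "Title", and "Lyric" columns as printed by df.columns.
# The `_sketch` names are just for illustration; your own solutions
# can look different.
# ------------------------------------------------------------------

def get_song_lyrics_sketch(lyrics_df, artist, title):
    """Possible solution: filter by artist, then title, then grab the lyric."""
    matches = lyrics_df[lyrics_df["Artist"] == artist]
    matches = matches[matches["Title"] == title]
    if len(matches) == 0:
        # Song not found: fail loudly so the test prints below make sense
        raise ValueError(f"No lyrics found for {artist} - {title}")
    return matches.iloc[0]["Lyric"]

def calculate_term_frequency_sketch(text):
    """Possible solution: count each token with a plain dict."""
    term_freq = {}
    for word in preprocess(text):
        if word not in term_freq:
            term_freq[word] = 1
        else:
            term_freq[word] += 1
    return term_freq

def calculate_document_frequency_sketch(corpus, target_terms):
    """Possible solution: for each document, bump the count of every
    target term that appears in it at least once."""
    doc_freq = {}
    for doc in corpus:
        doc_tokens = set(preprocess(doc))
        for word in target_terms:
            if word in doc_tokens:
                doc_freq[word] = doc_freq.get(word, 0) + 1
    return doc_freq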
def calculate_tfidf(term_freq, doc_freq, total_docs):
    tfidf = {}
    # TODO: Your code here!
    # For each word in term_freq:
    # 1. Calculate IDF = math.log(total_docs / doc_freq[word])
    # 2. Calculate TF-IDF = term_freq[word] * IDF
    # 3. Store the result in the tfidf dictionary
    return tfidf

# Calculate TF-IDF
tfidf_scores = calculate_tfidf(tf, df_counts, len(corpus))

# Display results
sorted_scores = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop 20 most distinctive words in '{title}':")
for word, score in sorted_scores[:20]:
    print(f"  {word}: {score:.3f}")

print("\nAnalysis:")
print("Most common words (highest raw term frequency):")
for term, tf_score in sorted(tf.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"- {term}: {tf_score}")
print("Most distinctive words (highest TF-IDF):")
for term, tfidf_score in sorted_scores[:5]:
    print(f"- {term}: {tfidf_score:.3f}")
print("Are they the same? What does this tell us about TF-IDF?")
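# ------------------------------------------------------------------
# Reference sketch for the TF-IDF step (one possible implementation,
# using the exact formula in the TODO above: idf = log(N / df)).
# `calculate_tfidf_sketch` is an illustrative name, not required.
# ------------------------------------------------------------------

def calculate_tfidf_sketch(term_freq, doc_freq, total_docs):
    """Possible solution: tfidf(word) = tf(word) * log(N / df(word))."""
    tfidf = {}
    for word, tf_count in term_freq.items():
        # Since the target song is itself part of the corpus, each of its
        # words should have doc_freq >= 1, so the division is safe.
        idf = math.log(total_docs / doc_freq[word])
        tfidf[word] = tf_count * idf
    return tfidf

# Expected pattern: common filler words tend to top the raw term-frequency
# list, while TF-IDF pushes up words that are frequent in this song but
# rare across the rest of the corpus.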