In this assignment, you'll analyze the emotional tone of songs using sentiment analysis. You'll discover which artists are the most positive or negative, and which songs have the strongest emotional content.
You will:
# JUST RUN THIS, no changes needed
# Setup cell: mounts Google Drive, loads the lyrics CSV into `df`,
# cleans two text columns, and prints a quick overview of the data.
from google.colab import drive
import pandas as pd
from textblob import TextBlob
# Make Google Drive files reachable under /content/gdrive
drive.mount('/content/gdrive')
# Every later cell works on this DataFrame
df = pd.read_csv('/content/gdrive/MyDrive/datasets/lyrics.csv')
# FIXUP DATA
# Remove U+200B (zero-width space) characters from the titles and lyrics.
# NOTE(review): presumably these would otherwise break exact title matching - confirm
df["Title"] = df["Title"].str.replace("\u200b", "")
df["Lyric"] = df["Lyric"].str.replace("\u200b", "")
# Look at the data
print(f"Total songs: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
# In a notebook the first rows are rendered only if this is the last
# expression in its cell
df.head()
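TextBlob gives us two measures: polarity, from -1.0 (most negative) to 1.0 (most positive), and subjectivity, from 0.0 (very objective) to 1.0 (very subjective).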
# Sentiment analysis demo on a handful of sample sentences
# JUST RUN THIS to see how it works
test_sentences = [
    "I love this amazing song!",
    "This music is terrible and boring.",
    "The song has three verses.",
    "I feel so happy when I hear this!",
]

for sentence in test_sentences:
    blob = TextBlob(sentence)
    # The sentiment result unpacks into its two scores
    polarity, subjectivity = blob.sentiment
    print(f"Text: '{sentence}'")
    print(f" Polarity: {polarity:.3f}")
    print(f" Subjectivity: {subjectivity:.3f}")
    print()
def get_song_lyrics(df, artist, title):
    """Return the lyrics of one song.

    Input:  df is the lyrics.csv dataframe (uses its 'Artist', 'Title'
            and 'Lyric' columns)
            artist is the artist you're looking for
            title is the title of the song
    Output: Returns the lyrics of the song. If several rows match, the
            first one is returned.
    Raises: ValueError if no row matches both the artist and the title.
    """
    # Keep only the rows where BOTH the artist and the title match exactly
    is_match = (df["Artist"] == artist) & (df["Title"] == title)
    matching_lyrics = df.loc[is_match, "Lyric"]

    # Fail loudly with a readable message instead of returning None, which
    # would only surface later as a confusing error in the sentiment code
    if matching_lyrics.empty:
        raise ValueError(f"No song found for artist {artist!r} with title {title!r}")

    return matching_lyrics.iloc[0]
def calculate_song_sentiment(lyrics):
    """Return the polarity score of a song's lyrics.

    Input:  lyrics is a string of song lyrics
    Output: Returns the polarity score (float between -1 and 1), or NaN
            when lyrics is not a string.
    """
    # A song with no lyrics (e.g. NaN from an empty CSV cell) is not text
    # TextBlob can analyze; NaN marks it as "no score" and is skipped by
    # pandas when averaging.
    if not isinstance(lyrics, str):
        return float("nan")

    # 1. Create a TextBlob object with the lyrics
    blob = TextBlob(lyrics)
    # 2-3. Get the sentiment polarity and return it
    return blob.sentiment.polarity
# Try both helpers on a single song
artist = "Dua Lipa"  # Change to your choice!
title = "New Rules"  # Change to your choice!

# Look up the lyrics first, then score them
lyrics = get_song_lyrics(df, artist, title)
sentiment = calculate_song_sentiment(lyrics)

# Report the song followed by its score
song_label = f"{artist} - {title}"
print(song_label)
print(f"Sentiment: {sentiment:.3f}")
print("(Negative < 0 < Positive)")
Use df.apply(...)
to calculate sentiment for every song.
df.apply(...)
takes a function AS AN ARGUMENT and applies it to every value in that column, returning a new column.
For example, if you wanted to make a new column of lyric lengths:
Use this pattern to create a new column for Sentiment
def get_lyrics_length(lyrics):
    """Return len(lyrics): the character count when lyrics is a string."""
    character_count = len(lyrics)
    return character_count
# Pass the function itself (no parentheses) so .apply can call it once per
# lyric; the name must match the definition: get_lyrics_length
df['lyric length'] = df['Lyric'].apply(get_lyrics_length)
def add_sentiment_column(df):
    """Add a 'sentiment' column holding each song's polarity score.

    Input:  df is the lyrics DataFrame (uses its 'Lyric' column)
    Output: Returns df with a new 'sentiment' column added. The column is
            added to the DataFrame that was passed in, and that same
            DataFrame is returned.
    """
    # Same pattern as the lyric-length example: hand the scoring function to
    # .apply so it is called once for every song's lyrics
    df['sentiment'] = df['Lyric'].apply(calculate_song_sentiment)
    return df
# Score every song in the dataset
df = add_sentiment_column(df)

# Confirm the new column exists by previewing a few rows
print("Sentiment column added!")
preview_columns = ['Artist', 'Title', 'sentiment']
print(df[preview_columns].head())
Use pandas' .sort_values(...)
method to sort the rows by sentiment.
# Lowest sentiment first (ascending is the default order)
sorted_ascending_df = df.sort_values("sentiment")
# Highest sentiment first
sorted_decending_df = df.sort_values("sentiment", ascending=False)
You can also loop over the dataframe to print:
for idx, row in df.iterrows():
    # Read the title from the current row; df['Title'] would be the whole
    # column and would print every title on every pass through the loop
    title = row['Title']
    print(f"Row number: {idx}")
    print(f"Title: {title}")
def find_extreme_songs(df, artist, n=5):
    """Print an artist's most positive and most negative songs.

    Input:  df is the DataFrame with sentiment column (also uses its
            'Artist' and 'Title' columns)
            artist is the artist name to analyze
            n is how many songs to show in each list
    Output: Prints the most positive and negative songs. Returns None.
    """
    # 1. Filter to just this artist's songs. Songs without a score (NaN)
    #    cannot be ranked, so they are left out.
    artist_songs = df[df["Artist"] == artist].dropna(subset=["sentiment"])
    if artist_songs.empty:
        print(f"No scored songs found for {artist}")
        return

    # 2-3. One descending sort serves both lists: the top of it holds the
    #      most positive songs, and the bottom (reversed, so the lowest score
    #      comes first) holds the most negative ones.
    ranked = artist_songs.sort_values(by="sentiment", ascending=False)
    most_positive = ranked.head(n)
    most_negative = ranked.tail(n).iloc[::-1]

    # 4. Print the top n for each
    for label, songs in (("positive", most_positive), ("negative", most_negative)):
        print(f"Most {label} songs by {artist}:")
        for title, score in zip(songs["Title"], songs["sentiment"]):
            print(f"  {score:+.3f}  {title}")
        print()
# Rank the songs of one artist of your choosing
my_artist = "Taylor Swift"  # Change to your choice!
find_extreme_songs(df=df, artist=my_artist)
Reminder, here's the code from our slides on performing GroupBy on a dataframe of books where we were getting the sum
of the 'qty'
column.
# Group the books by genre, then add up the 'qty' values inside each group
books_by_genre = books_df.groupby('genre')
qty_by_genre = books_by_genre['qty'].sum()

print("Total quantities by genre:")
# .items() yields (genre, summed qty) pairs from the resulting Series
for genre, total_qty in qty_by_genre.items():
    print(f" {genre}: {total_qty}")
def calculate_artist_sentiment(df):
    """Return the average sentiment of each artist, lowest first.

    Input:  df is the DataFrame with sentiment column (also uses its
            'Artist' column)
    Output: Returns a Series with average sentiment per artist, indexed by
            artist name and sorted from most negative to most positive, so
            .head() shows the most negative artists and .tail() the most
            positive.
    """
    # 1-2. Group by Artist and calculate the mean sentiment for each artist
    mean_by_artist = df.groupby("Artist")["sentiment"].mean()
    # 3-4. Sort by sentiment (ascending) and return the sorted Series
    return mean_by_artist.sort_values()
# Rank every artist by average sentiment
artist_sentiments = calculate_artist_sentiment(df)

# The highest averages sit at the end of the Series
print("Most Positive Artists:")
happiest_artists = artist_sentiments.tail(10)
print(happiest_artists)

# The lowest averages sit at the start of the Series
print("\nMost Negative Artists:")
saddest_artists = artist_sentiments.head(10)
print(saddest_artists)
Try to use all of the above to calculate sentiment by year for an artist.
def analyze_sentiment_by_year(df, artist):
    """Return one artist's average sentiment for each year.

    Input:  df is the DataFrame (uses its 'Artist', 'Year' and 'sentiment'
            columns), artist is the artist name
    Output: Returns DataFrame with year and average sentiment: one row per
            year, columns 'Year' and 'sentiment', sorted by year. The
            DataFrame is empty if the artist has no songs.
    """
    # 1. Filter to the artist
    artist_songs = df[df["Artist"] == artist]
    # 2-3. The year is already available as the Year column, so group by it
    #      and calculate the mean sentiment of each year
    mean_by_year = artist_songs.groupby("Year")["sentiment"].mean()
    # 4. Sort by year, then turn the Year index back into a regular column so
    #    the result can be plotted with x='Year', y='sentiment'
    return mean_by_year.sort_index().reset_index()
# Track how the chosen artist's average sentiment moves from year to year
artist_timeline = analyze_sentiment_by_year(df, my_artist)

timeline_heading = f"\n{my_artist}'s sentiment over time:"
print(timeline_heading)
print(artist_timeline)

# Bonus: Plot it!
# artist_timeline.plot(x='Year', y='sentiment', kind='line',
# title=f'{my_artist} Sentiment Timeline')