In [1]:

%load_ext watermark
%watermark -d -v -a 'Sebastian Raschka' 

Sebastian Raschka 15/12/2014 

CPython 2.7.8
IPython 2.1.0

Loading artists and titles from the Million Song Dataset into a Pandas DataFrame¶

In [2]:

import pandas as pd
import re

# NOTE: machine-specific absolute path; point it at your copy of the
# Million Song Dataset summary file.
store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
try:
    # Pull the two columns of interest out of the /metadata/songs node.
    artists = pd.Series(store.root.metadata.songs.cols.artist_name)
    titles = pd.Series(store.root.metadata.songs.cols.title)
finally:
    # Release the HDF5 file handle even when a node lookup raises
    # (e.g. NoSuchNodeError for a file with a different layout).
    store.close()

df = pd.concat([artists, titles], axis=1)
df.columns = ['artist', 'title']


def _strip_parenthesized(raw):
    """Decode a UTF-8 byte string and drop every '(...)' group from it.

    The whitespace in front of a removed group and any leading/trailing
    whitespace are removed as well, so that
    'Bleed (Album Version)' -> 'Bleed' (and not 'Bleed ').
    """
    return re.sub(r'\s*\([^)]*\)', '', raw.decode('utf-8')).strip()


for col in ('artist', 'title'):
    df.loc[:, col] = df.loc[:, col].apply(_strip_parenthesized)

df.tail()

---------------------------------------------------------------------------
NoSuchNodeError                           Traceback (most recent call last)
<ipython-input-2-b11d88fe4c33> in <module>()
      3 
      4 store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5')
----> 5 artists = pd.Series(store.root.metadata.songs.cols.artist_name)
      6 titles = pd.Series(store.root.metadata.songs.cols.title)
      7 store.close()

/Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in __getattr__(self, name)
    809             self._g_add_children_names()
    810             return mydict[name]
--> 811         return self._f_get_child(name)
    812 
    813     def __setattr__(self, name, value):

/Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in _f_get_child(self, childname)
    679         self._g_check_open()
    680 
--> 681         self._g_check_has_child(childname)
    682 
    683         childpath = join_path(self._v_pathname, childname)

/Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in _g_check_has_child(self, name)
    403             raise NoSuchNodeError(
    404                 "group ``%s`` does not have a child named ``%s``"
--> 405                 % (self._v_pathname, name))
    406         return node_type
    407 

NoSuchNodeError: group ``/`` does not have a child named ``metadata``

Save the artist-title table as SQLite3 for future analyses¶

In [3]:

import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
try:
    # to_sql uses if_exists='fail' by default, so re-running this cell
    # against an existing table raises; the finally clause makes sure the
    # connection is still closed in that case.
    df.to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    conn.close()

Code to scrape lyrics from the web¶

In [2]:

import urllib, re
import bs4
          
def songlyrics(artist, title):
    """Fetch the lyrics of a song from songlyrics.com.

    Parameters
    ----------
    artist, title : unicode or byte string
        Artist name and song title. Unicode input is encoded as UTF-8;
        input that is already a byte string is used as-is (encoding it a
        second time would fail for non-ASCII names on Python 2).

    Returns
    -------
    The markup of the element with id 'songLyricsDiv' with all tags
    stripped, or None if the page could not be fetched, the element is
    missing, or it is the site's empty placeholder paragraph.
    """
    if not isinstance(artist, bytes):
        artist = artist.encode('utf8', 'replace')
    if not isinstance(title, bytes):
        title = title.encode('utf8', 'replace')

    # URL slug: lower-case, spaces -> dashes, everything else percent-quoted
    artist = urllib.quote(artist.lower().replace(' ', '-'))
    title = urllib.quote(title.lower().replace(' ', '-'))

    try:
        response = urllib.urlopen('http://www.songlyrics.com/%s/%s-lyrics/' % (artist, title))
        try:
            text = response.read()
        finally:
            # always give the socket back, even if reading fails
            response.close()
    except Exception:
        # Best-effort scraper: any fetch problem means "no lyrics".
        # `Exception` (not a bare except) lets Ctrl-C interrupt long runs.
        return None

    soup = bs4.BeautifulSoup(text)
    matches = soup.findAll(attrs={'id': 'songLyricsDiv'})
    if not matches:
        return None

    markup = str(matches[0])
    # songlyrics.com serves an empty placeholder paragraph for unknown songs
    if markup.startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):
        return None

    # drop all HTML tags, keep the text in between
    return re.sub('<[^<]+?>', '', markup)


def lyricsmode(artist, title):
    """Fetch the lyrics of a song from lyricsmode.com.

    Parameters
    ----------
    artist, title : unicode or byte string
        Artist name and song title. Unicode input is encoded as UTF-8;
        input that is already a byte string is used as-is (encoding it a
        second time would fail for non-ASCII names on Python 2).

    Returns
    -------
    The markup of the element with id 'lyrics_text' with all tags
    stripped, or None if the artist is empty, the page could not be
    fetched, or the element is missing.
    """
    if not isinstance(artist, bytes):
        artist = artist.encode('utf8', 'replace')
    if not isinstance(title, bytes):
        title = title.encode('utf8', 'replace')

    # URL slug: lower-case, spaces -> underscores, everything else percent-quoted
    artist = urllib.quote(artist.lower().replace(' ', '_'))
    title = urllib.quote(title.lower().replace(' ', '_'))

    if not artist:
        # the URL needs the first character of the artist slug
        return None

    url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0], artist, title)
    try:
        response = urllib.urlopen(url)
        try:
            text = response.read()
        finally:
            # always give the socket back, even if reading fails
            response.close()
    except Exception:
        # Best-effort scraper: any fetch problem means "no lyrics".
        # `Exception` (not a bare except) lets Ctrl-C interrupt long runs.
        return None

    soup = bs4.BeautifulSoup(text)
    # lyricsmode places the lyrics in an element with an id of "lyrics_text"
    matches = soup.findAll(attrs={'id': 'lyrics_text'})
    if not matches:
        return None

    # drop all HTML tags, keep the text in between
    return re.sub('<[^<]+?>', '', str(matches[0]))

def get_lyrics(artist, title):
    """Look up lyrics on songlyrics.com, falling back to lyricsmode.com.

    The second site is only queried when the first one yields nothing
    (None or an empty result); whatever the last queried site returned
    is handed back to the caller.
    """
    return songlyrics(artist, title) or lyricsmode(artist, title)


# Smoke test 1: a well-known song, expected to print its lyrics.
test = get_lyrics(artist='Bob Dylan', title='Blowing in the wind')
print(test)

# Smoke test 2: a made-up artist/title pair, expected to print None.
test2 = get_lyrics(artist='test', title='test')
print(test2)

How many roads must a man walk down
Before you call him a man?
Yes, 'n' how many seas must a white dove sail
Before she sleeps in the sand?
Yes, 'n' how many times must the cannon balls fly
Before they're forever banned?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many times must a man look up
Before he can see the sky?
Yes, 'n' how many ears must one man have
Before he can hear people cry?
Yes, 'n' how many deaths will it take till he knows
That too many people have died?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind

How many years can a mountain exist
Before it's washed to the sea?
Yes, 'n' how many years can some people exist
Before they're allowed to be free?
Yes, 'n' how many times can a man turn his head
Pretending he just doesn't see?

The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind
None

Code to check if lyrics are English¶

As a rule of thumb, I assume that every song in which fewer than 50% of the distinct words are English (i.e., found in the NLTK English vocabulary) is non-English.

In [3]:

import nltk

def eng_ratio(text, english_vocab=None):
    """Return the fraction of distinct words in `text` that are English.

    Only purely alphabetic, whitespace-separated tokens are considered;
    they are lower-cased and de-duplicated before being looked up.

    Parameters
    ----------
    text : str or None
        The text (e.g. song lyrics) to score.
    english_vocab : set of lower-case words, optional
        Vocabulary to test against. Defaults to the NLTK `words` corpus,
        which is loaded on first use and cached on the function so that
        it is not rebuilt on every call.

    Returns
    -------
    float in [0.0, 1.0]; 0.0 when `text` is None/empty or contains no
    alphabetic tokens.
    """
    if not text:
        return 0.0

    if english_vocab is None:
        english_vocab = getattr(eng_ratio, '_vocab_cache', None)
        if english_vocab is None:
            english_vocab = set(w.lower() for w in nltk.corpus.words.words())
            eng_ratio._vocab_cache = english_vocab

    lowered = (w.lower() for w in text.split())
    text_vocab = set(w for w in lowered if w.isalpha())
    if not text_vocab:
        return 0.0

    common = text_vocab.intersection(english_vocab)
    return len(common) / float(len(text_vocab))
    
text = 'This is a test fahrrad'

print(eng_ratio(text))
lyr = get_lyrics('Pharrell','Happy')
# get_lyrics returns None when no lyrics were found (e.g. site unreachable);
# only score text that was actually fetched.
print(eng_ratio(lyr) if lyr else None)

0.8
0.986666666667

Annotating the language of the songs¶

The following labels are being used to annotate the songs:
0 = no lyrics
1 = likely English
2 = likely non-English

In [ ]:

# For a quick trial run on a small sample, uncomment the next line:
#df = df.loc[:3000, :]

# Every song starts out with an empty-string language label.
df['lang'] = ''
df.tail()

In [ ]:

import pyprind
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    # Hand the values over unchanged: the scraper functions behind
    # get_lyrics do the UTF-8 encoding themselves, and encoding here as
    # well breaks on non-ASCII names under Python 2.
    lyr = get_lyrics(artist=df.loc[row_id, 'artist'], title=df.loc[row_id, 'title'])

    # 0 = no lyrics, 1 = likely English, 2 = likely non-English
    if not lyr:
        label = 0
    elif eng_ratio(lyr) >= 0.5:
        label = 1
    else:
        label = 2
    df.loc[row_id, 'lang'] = label

    pbar.update()

0%                          100%
[                              ]

In [ ]:

# (rows, columns) of the songs labelled 1 = likely English
df.loc[df['lang'] == 1].shape

In [20]:

# Keep only the songs labelled 1 (likely English) and renumber the rows 0..n-1.
df = df.loc[df['lang'] == 1].reset_index(drop=True)
df.tail()

Out[20]:

	artist	title	lang
647	Suzanne Vega	Marlene On The Wall	1
648	Nelly	St. Louie	1
649	Larue	Reason	1
650	Liam Lynch	SOS	1
651	Oasis	Boy With The Blues	1

In [21]:

# Keep only songs labelled 1 (likely English); a no-op if df was already filtered.
df = df[df['lang'] == 1]

conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    # Positional slice: always the first 650 rows, regardless of how the
    # index is labelled (a label slice `.loc[:649]` only gives 650 rows
    # when the index happens to be 0..n-1).
    df.iloc[:650].to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    # close the connection even if the export raises
    conn.close()

Pick a random song from the database¶

In [6]:

import sqlite3

conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    cursor = conn.cursor()
    # ORDER BY RANDOM() LIMIT 1 lets SQLite pick one row at random
    sql = "SELECT artist,title FROM artist_title ORDER BY RANDOM() LIMIT 1;"
    cursor.execute(sql)
    result = cursor.fetchone()
finally:
    # close the connection even if the query raises
    conn.close()

# With the default text_factory sqlite3 already returns text as unicode;
# only byte strings need decoding (calling .decode on unicode fails for
# non-ASCII names).
artistname, songtitle = [
    value.decode('utf-8') if isinstance(value, bytes) else value
    for value in result
]
print('Arist: %s \nSong: %s' % (artistname, songtitle))

Arist: Matisyahu 
Song: Warrior

Check songs in the SQLite database¶

In [ ]:

import sys
import sqlite3

conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
c = conn.cursor()
# Work with raw byte strings: text columns are decoded explicitly below and
# the scraped lyrics are written back without conversion.
conn.text_factory = str

# NOTE(review): the queries below rely on `language` and `lyrics` columns in
# `artist_title`; this notebook does not show where they are created - confirm
# that the table has been altered accordingly before running.

try:
    # Count the unlabelled songs once. Every iteration labels exactly one
    # row, so the counter is decremented locally instead of re-running a
    # full-table COUNT(*) per song.
    c.execute("SELECT COUNT(*) from artist_title WHERE language IS NULL;")
    remaining = c.fetchone()[0]
    print('Started at: %s' % remaining)

    while remaining:
        # next song that has no language label yet
        c.execute("SELECT rowid,artist,title FROM artist_title WHERE language IS NULL LIMIT 1;")
        result = c.fetchone()
        if result is None:
            # nothing left to label
            break
        row_id = result[0]
        artistname = result[1].decode('utf-8')
        songtitle = result[2].decode('utf-8')

        lyr = get_lyrics(artist=artistname, title=songtitle)

        # 0 = no lyrics, 1 = likely English, 2 = likely non-English
        if not lyr:
            lang = 0
        elif eng_ratio(lyr) >= 0.5:
            lang = 1
        else:
            lang = 2

        c.execute("UPDATE artist_title SET language=?,lyrics=? WHERE rowid=?;",
                  (lang, lyr, row_id))
        # commit per song so an interrupted run loses no finished work
        conn.commit()

        remaining -= 1
        # '\r' rewrites the same output line on every update
        sys.stdout.write('\r')
        sys.stdout.write('Remaining: %s' % remaining)
        sys.stdout.flush()

finally:
    conn.commit()
    conn.close()

Started at: 988287
Remaining: 988243

In [ ]:

# Leftover scratch cell: only prints a fixed string and touches no notebook state.
print('test')

In [ ]: