%load_ext watermark
%watermark -d -v -a 'Sebastian Raschka'
Sebastian Raschka 15/12/2014 CPython 2.7.8 IPython 2.1.0
import pandas as pd
import re
# Read the artist-name and title columns from the HDF5 summary file.
# The file is opened read-only (mode='r') so that a wrong or missing path
# fails right here; with pandas' default append mode a missing file is created
# empty, which can then surface later as a NoSuchNodeError on
# ``store.root.metadata``.
store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5', mode='r')
try:
    songs = store.root.metadata.songs
    artists = pd.Series(songs.cols.artist_name)
    titles = pd.Series(songs.cols.title)
finally:
    # Release the file handle even when the expected nodes are missing.
    store.close()

df = pd.concat([artists, titles], axis=1)
df.columns = ['artist', 'title']

# Matches one parenthesised group together with its content.
_PARENTHESISED = re.compile(r'\([^)]*\)')


def _clean_name(raw):
    """Decode a UTF-8 byte string and drop parenthesised parts.

    The surrounding whitespace left behind by the removal is stripped as
    well, e.g. 'Bleed (Album Version)' -> 'Bleed' (not 'Bleed ').
    """
    return _PARENTHESISED.sub('', raw.decode('utf-8')).strip()


for column in ('artist', 'title'):
    df[column] = df[column].apply(_clean_name)
df.tail()
--------------------------------------------------------------------------- NoSuchNodeError Traceback (most recent call last) <ipython-input-2-b11d88fe4c33> in <module>() 3 4 store = pd.HDFStore('/Users/sebastian/Desktop/msd_summary_file.h5') ----> 5 artists = pd.Series(store.root.metadata.songs.cols.artist_name) 6 titles = pd.Series(store.root.metadata.songs.cols.title) 7 store.close() /Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in __getattr__(self, name) 809 self._g_add_children_names() 810 return mydict[name] --> 811 return self._f_get_child(name) 812 813 def __setattr__(self, name, value): /Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in _f_get_child(self, childname) 679 self._g_check_open() 680 --> 681 self._g_check_has_child(childname) 682 683 childpath = join_path(self._v_pathname, childname) /Users/sebastian/miniconda3/envs/py34/lib/python3.4/site-packages/tables/group.py in _g_check_has_child(self, name) 403 raise NoSuchNodeError( 404 "group ``%s`` does not have a child named ``%s``" --> 405 % (self._v_pathname, name)) 406 return node_type 407 NoSuchNodeError: group ``/`` does not have a child named ``metadata``
import sqlite3
# Write the artist/title table to an SQLite database.
# try/finally guarantees that the connection is closed even when the export
# raises, so the handle is never leaked.
conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
try:
    df.to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    conn.close()
import urllib, re
import bs4
def songlyrics(artist, title):
    """Fetch the lyrics of a song from songlyrics.com.

    Args:
        artist: Artist name as a text string.
        title: Song title as a text string.

    Returns:
        The content of the page element with id ``songLyricsDiv`` with all
        markup tags removed, or None when the page cannot be fetched, the
        element is missing, or the element is the empty placeholder.
    """
    artist = artist.encode('utf8', 'replace')
    title = title.encode('utf8', 'replace')
    # URL slug: lower-case, spaces replaced by dashes, then percent-quoted.
    artist = urllib.quote(artist.lower().replace(' ', '-'))
    title = urllib.quote(title.lower().replace(' ', '-'))
    url = 'http://www.songlyrics.com/%s/%s-lyrics/' % (artist, title)

    # Best effort: any failure while fetching means "no lyrics".
    # ``except Exception`` (instead of a bare ``except``) still lets
    # KeyboardInterrupt/SystemExit through, so a long scraping run can be
    # stopped by the user.
    try:
        response = urllib.urlopen(url)
        try:
            text = response.read()
        finally:
            # Always release the connection, also when read() fails.
            response.close()
    except Exception:
        return None

    soup = bs4.BeautifulSoup(text)
    matches = soup.findAll(attrs={'id': 'songLyricsDiv'})
    if not matches:
        return None

    try:
        markup = str(matches[0])
    except Exception:
        # Serialising the element failed; treat as "no lyrics".
        return None
    if markup.startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):
        # Empty placeholder element: the page exists but carries no lyrics.
        return None
    # Strip every markup tag, keeping only the text in between.
    return re.sub(r'<[^<]+?>', '', markup)
def lyricsmode(artist, title):
    """Fetch the lyrics of a song from lyricsmode.com.

    Args:
        artist: Artist name as a text string.
        title: Song title as a text string.

    Returns:
        The content of the page element with id ``lyrics_text`` with all
        markup tags removed, or None when the artist is empty, the page
        cannot be fetched, or the element is missing.
    """
    artist = artist.encode('utf8', 'replace')
    title = title.encode('utf8', 'replace')
    # URL slug: lower-case, spaces replaced by underscores, percent-quoted.
    artist = urllib.quote(artist.lower().replace(' ', '_'))
    title = urllib.quote(title.lower().replace(' ', '_'))
    if not artist:
        # The URL needs the artist's first character; nothing to look up.
        return None
    url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0], artist, title)

    # Best effort: any failure while fetching means "no lyrics".
    # ``except Exception`` (instead of a bare ``except``) still lets
    # KeyboardInterrupt/SystemExit through, so a long scraping run can be
    # stopped by the user.
    try:
        response = urllib.urlopen(url)
        try:
            text = response.read()
        finally:
            # Always release the connection, also when read() fails.
            response.close()
    except Exception:
        return None

    soup = bs4.BeautifulSoup(text)
    # The lyrics are looked up in the element whose id is "lyrics_text".
    matches = soup.findAll(attrs={'id': 'lyrics_text'})
    if not matches:
        return None
    try:
        # Strip every markup tag, keeping only the text in between.
        return re.sub(r'<[^<]+?>', '', str(matches[0]))
    except Exception:
        # Serialising the element failed; treat as "no lyrics".
        return None
def get_lyrics(artist, title):
    """Return lyrics from songlyrics(), falling back to lyricsmode().

    The second source is only queried when the first one yields nothing
    (None or an empty string); its result is returned as-is in that case.
    """
    return songlyrics(artist, title) or lyricsmode(artist, title)
# Lookup of a well-known song: prints the lyrics that were found.
test = get_lyrics(artist='Bob Dylan', title='Blowing in the wind')
print(test)

# Lookup of a made-up artist/title pair.
test2 = get_lyrics(artist='test', title='test')
print(test2)
How many roads must a man walk down Before you call him a man? Yes, 'n' how many seas must a white dove sail Before she sleeps in the sand? Yes, 'n' how many times must the cannon balls fly Before they're forever banned? The answer, my friend, is blowin' in the wind The answer is blowin' in the wind How many times must a man look up Before he can see the sky? Yes, 'n' how many ears must one man have Before he can hear people cry? Yes, 'n' how many deaths will it take till he knows That too many people have died? The answer, my friend, is blowin' in the wind The answer is blowin' in the wind How many years can a mountain exist Before it's washed to the sea? Yes, 'n' how many years can some people exist Before they're allowed to be free? Yes, 'n' how many times can a man turn his head Pretending he just doesn't see? The answer, my friend, is blowin' in the wind The answer is blowin' in the wind None
As a rule of thumb, I assume that a song is non-English if fewer than 50% of its words are found in the English vocabulary.
import nltk
# Lower-cased English vocabulary; built on first use and reused afterwards,
# because constructing the set from the NLTK word list on every call is
# needlessly expensive when many texts are scored.
_ENGLISH_VOCAB = None


def _english_vocab():
    """Return the lower-cased NLTK ``words`` corpus as a set (built once)."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = {w.lower() for w in nltk.corpus.words.words()}
    return _ENGLISH_VOCAB


def eng_ratio(text):
    """Return the fraction of distinct words in *text* that are English.

    The text is split on whitespace and lower-cased; tokens containing
    anything but letters (digits, attached punctuation, ...) are ignored and
    every remaining word is counted once.

    Args:
        text: The text to analyse.

    Returns:
        A float between 0.0 and 1.0: the number of distinct words found in
        the NLTK ``words`` corpus divided by the number of distinct words.
        0.0 when the text contains no purely alphabetic token.
    """
    text_vocab = {w.lower() for w in text.split() if w.lower().isalpha()}
    if not text_vocab:
        # Nothing to compare: skip loading the vocabulary altogether.
        return 0.0
    common = text_vocab.intersection(_english_vocab())
    return len(common) / float(len(text_vocab))
# Score a short hand-written sentence.
text = 'This is a test fahrrad'
print(eng_ratio(text))

# Score lyrics fetched from the web.
lyr = get_lyrics(artist='Pharrell', title='Happy')
print(eng_ratio(lyr))
0.8 0.986666666667
The following labels are being used to annotate the songs:
0 = no lyrics
1 = likely English
2 = likely non-English
#df = df.loc[:3000, :]
# Language label per song: 0 = no lyrics, 1 = likely English,
# 2 = likely non-English. Every row starts out with '' and is overwritten
# by the loop below.
df['lang'] = pd.Series('', index=df.index)
df.tail()

import pyprind

pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    # Hand the text values over unchanged: songlyrics()/lyricsmode() perform
    # the UTF-8 encoding themselves, and encoding here as well fails for
    # names that contain non-ASCII characters.
    lyr = get_lyrics(artist=df.loc[row_id, 'artist'],
                     title=df.loc[row_id, 'title'])
    if not lyr:
        label = 0
    elif eng_ratio(lyr) >= 0.5:
        label = 1
    else:
        label = 2
    df.loc[row_id, 'lang'] = label
    pbar.update()
0% 100% [ ]
# Keep only the songs labelled 1 and renumber the remaining rows
# consecutively starting at 0.
is_english = df['lang'] == 1
df[is_english].shape
df = df[is_english]
df.index = range(len(df))
df.tail()
artist | title | lang | |
---|---|---|---|
647 | Suzanne Vega | Marlene On The Wall | 1 |
648 | Nelly | St. Louie | 1 |
649 | Larue | Reason | 1 |
650 | Liam Lynch | SOS | 1 |
651 | Oasis | Boy With The Blues | 1 |
# Export a subset of the songs labelled 1 to its own SQLite database.
df = df[df['lang'] == 1]
conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    # .loc slices by index label and includes the end label, so ":649" keeps
    # the rows labelled up to and including 649 (650 rows when the index
    # runs 0..n-1). df is already filtered above; filtering it a second time
    # here would be redundant.
    df.loc[:649, :].to_sql(name='artist_title', con=conn)
    conn.commit()
finally:
    # Close the connection even when the export raises.
    conn.close()
import sqlite3
# Draw one random (artist, title) pair from the subset database and print it.
conn = sqlite3.connect('../../dataset/random_subsets/artist_title_650.sqlite')
try:
    cursor = conn.cursor()
    sql = "SELECT artist,title FROM artist_title ORDER BY RANDOM() LIMIT 1;"
    cursor.execute(sql)
    result = cursor.fetchone()
finally:
    # Close the connection even when the query raises.
    conn.close()


def _as_text(value):
    """Decode UTF-8 bytes to text; pass already-decoded text through.

    sqlite3 may hand back either bytes or text depending on the Python
    version and the connection's text_factory; calling .decode() on text
    fails (or breaks on non-ASCII characters).
    """
    return value.decode('utf-8') if isinstance(value, bytes) else value


if result is None:
    # fetchone() returns None for an empty table; fail with a clear message
    # instead of an opaque TypeError on result[0].
    raise ValueError('table artist_title contains no rows')
artistname = _as_text(result[0])
songtitle = _as_text(result[1])
print('Arist: %s \nSong: %s' % (artistname, songtitle))
Arist: Matisyahu Song: Warrior
import sys
import sqlite3
# Label every song in the database whose ``language`` is still NULL:
# 0 = no lyrics, 1 = likely English, 2 = likely non-English. The result is
# committed after each song, so an interrupted run continues where it
# stopped when this cell is executed again.
# NOTE(review): the queries need ``language`` and ``lyrics`` columns in
# ``artist_title``; the to_sql export visible in this file only writes
# ``artist`` and ``title`` - confirm where those columns are added.
conn = sqlite3.connect('../../dataset/million/artist_title.sqlite')
conn.text_factory = str
c = conn.cursor()


def _as_text(value):
    """Decode UTF-8 bytes to text; pass already-decoded text through."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


try:
    c.execute("SELECT COUNT(*) from artist_title WHERE language IS NULL;")
    remaining = c.fetchone()[0]
    print('Started at: %s' % remaining)

    while remaining:
        sql = "SELECT rowid,artist,title FROM artist_title WHERE language IS NULL LIMIT 1;"
        c.execute(sql)
        result = c.fetchone()
        if result is None:
            # No unlabelled row left although the counter is not yet zero.
            break
        row_id = result[0]
        artistname = _as_text(result[1])
        songtitle = _as_text(result[2])

        lyr = get_lyrics(artist=artistname, title=songtitle)
        if not lyr:
            lang = 0
        elif eng_ratio(lyr) >= 0.5:
            lang = 1
        else:
            lang = 2

        sql = "UPDATE artist_title SET language=?,lyrics=? WHERE rowid=?;"
        c.execute(sql, (lang, lyr, row_id))
        conn.commit()

        # Each update turns exactly one NULL into a label, so the counter is
        # decremented instead of re-running COUNT(*) over the whole table on
        # every iteration. Assumes this loop is the only writer - TODO
        # confirm if several processes label the same database.
        remaining -= 1
        sys.stdout.write('\r')
        sys.stdout.write('Remaining: %s' % remaining)
        sys.stdout.flush()
finally:
    conn.commit()
    conn.close()
Started at: 988287 Remaining: 988243
# NOTE(review): prints a fixed string; looks like a leftover scratch
# statement - confirm whether it can be removed.
print('test')