#!/usr/bin/env python
# coding: utf-8

# # Loading the OHHLA corpus
# This notebook shows a typical example of the data loading and preprocessing necessary for NLP. In this case we are loading a corpus downloaded from the Hip-Hop Lyrics webpage [www.ohhla.com](www.ohhla.com). Our primary goal is to provide a dataset loading function for the [language modelling](todo) chapter in this book.
#
# We provide the corpus in the `data` directory. As this notebook lives in a sub-directory itself, we access it via `../data`. Before preprocessing all files and providing *generic* loaders it is useful to inspect the format of the files based on a specific example file, and work on the loading process in this context. Here we look at `../data/ohhla/train/www.ohhla.com/anonymous/j_live/allabove/satisfy.jlv.txt.html`.

# In[2]:


with open('../data/ohhla/train/www.ohhla.com/anonymous/j_live/allabove/satisfy.jlv.txt.html', 'r') as f:
    # we use read().splitlines() instead of readlines() to skip newline characters
    lines = f.read().splitlines()
lines


# We first would like to remove everything outside of the `<pre>` tag, and then remove the meta information.

# In[3]:


def find_lyrics(lines, meta_lines=6):
    """Return the lyric lines found inside the ``<pre>`` element of a page.

    Lines before the opening ``<pre>`` tag and after the closing ``</pre>``
    tag are dropped, and the tags themselves are removed from the lines they
    appear on. The first ``meta_lines`` lines inside the element are then
    skipped as meta information.

    NOTE(review): the tag literals in this function had been reduced to
    empty strings (so every line matched the first branch and nothing was
    filtered); they are restored here to ``<pre>`` / ``</pre>`` as described
    by the surrounding notebook text -- confirm against the raw corpus files.

    Args:
        lines: iterable of strings, one per line of the page, without
            trailing newline characters.
        meta_lines: number of leading lines inside the ``<pre>`` element to
            discard. Defaults to 6, the value previously hard-coded.

    Returns:
        A list of strings holding the remaining lines.
    """
    filtered = []
    in_pre = False
    for line in lines:
        opens = '<pre>' in line
        closes = '</pre>' in line
        if opens:
            in_pre = True
        # A line carrying the closing tag is always kept (minus the tag),
        # mirroring the original branch that appended it unconditionally.
        if in_pre or closes:
            filtered.append(line.replace('<pre>', '').replace('</pre>', ''))
        # Checked after appending so that "<pre>text</pre>" on a single line
        # is both captured and closes the block again.
        if closes:
            in_pre = False
    return filtered[meta_lines:]
    
# Extract the lyric lines of the example file and preview the first ten of them.
lyrics = find_lyrics(lines)
lyrics[:10]


# Finally, we would like to convert the list of lines with newline characters to a single string, as this will be easier to process for our language models. We will also mark lyrical "bars" (lines) using a `BAR` tag to still capture the rhythmical structure in the song.

# In[4]:


# Wrap every lyric line in [BAR]...[/BAR] and concatenate everything into one string;
# the slice below only previews the first 500 characters.
string = '[BAR]' + '[/BAR][BAR]'.join(lyrics) + '[/BAR]'
string[:500]


# We are now ready to provide a loading function. 

# In[5]:


def load_song(file_name):
    """Read one song file and return it as a single ``[BAR]``-tagged string.

    The file is decoded as UTF-8 first and as cp1252 if that fails. Files
    whose name ends in ``html`` are passed through ``find_lyrics``; for any
    other file the first five lines are skipped and the rest is used as is.
    Every remaining line is wrapped in ``[BAR]...[/BAR]``.

    Args:
        file_name: path of the song file as a string.

    Returns:
        The tagged song text, or ``""`` (after printing a message) when the
        file cannot be decoded with either encoding.
    """
    for codec in ('utf-8', 'cp1252'):
        try:
            with open(file_name, 'r', encoding=codec) as handle:
                # read().splitlines() drops the newline characters that
                # readlines() would keep
                raw_lines = handle.read().splitlines()
                # some files are pure txt files for which the lyrics do not
                # need to be extracted from markup
                if file_name.endswith('html'):
                    bars = find_lyrics(raw_lines)
                else:
                    bars = raw_lines[5:]
                return '[BAR]' + '[/BAR][BAR]'.join(bars) + '[/BAR]'
        except UnicodeDecodeError:
            # fall through to the next candidate encoding
            continue
    print("Could not load " + file_name)
    return ""

        
    
# Load the example song through load_song and preview its first 500 characters.
song = load_song('../data/ohhla/train/www.ohhla.com/anonymous/j_live/allabove/satisfy.jlv.txt.html')
song[:500]


# Now we want to load several files from an album directory. 

# In[6]:


from os import listdir
from os.path import isfile, join

def load_album(path):
    """Load every song file that sits directly inside an album directory.

    Args:
        path: directory to scan (sub-directories are not descended into).

    Returns:
        A list with one ``load_song`` result per matching file, ordered by
        file name.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the song
    # order (and everything derived from it) reproducible across runs and
    # machines.
    # we filter out directories, and files that don't look like song files in OHHLA.
    onlyfiles = [join(path, f)
                 for f in sorted(listdir(path))
                 if isfile(join(path, f)) and 'txt' in f]
    return [load_song(f) for f in onlyfiles]

# Load all songs of one album directory and show the length (in characters) of each.
songs = load_album('../data/ohhla/train/www.ohhla.com/anonymous/j_live/SPTA/')
[len(s) for s in songs]


# We will also make it easy to load several albums. Then, for a few artists we provide shortcuts to the album directories we care about. 

# In[7]:


def load_albums(album_paths):
    """Load the songs of several album directories into one flat list.

    Args:
        album_paths: iterable of album directory paths, each handed to
            ``load_album``.

    Returns:
        A single list with the songs of all albums, in the order the albums
        were given.
    """
    collected = []
    for album_dir in album_paths:
        collected.extend(load_album(album_dir))
    return collected

# Directory under which the artist folders used below (e.g. j_live) are located.
top_dir = '../data/ohhla/train/www.ohhla.com/anonymous/'
# Album directories for the artist J-Live.
# NOTE(review): top_dir already ends with '/' and each suffix starts with '/',
# so the resulting paths contain a doubled separator ('anonymous//j_live/...').
j_live = [
    top_dir + '/j_live/allabove/',
    top_dir + '/j_live/bestpart/'
]
# Number of songs loaded from the two albums.
len(load_albums(j_live))


# It will be useful to convert a list of documents into a flat list of tokens. Based on the approach shown in the [tokenisation chapter](todo) we can do this as follows:

# In[17]:


import re
# Token pattern: the [BAR] / [/BAR] markers, runs of word characters and
# hyphens, a few English clitics ('m, 't, 'll, 've, 'd, 's) and a lone
# apostrophe. A raw string is used so that the regex escapes (\[ and \w) are
# not parsed as (invalid) Python string escapes.
token = re.compile(r"\[BAR\]|\[/BAR\]|[\w-]+|'m|'t|'ll|'ve|'d|'s|'")


def words(docs):
    """Flatten a collection of documents into one list of tokens.

    Args:
        docs: iterable of strings (e.g. songs as returned by ``load_song``).

    Returns:
        A list with every match of ``token`` in every document, in document
        order; characters that match no alternative (such as whitespace) are
        dropped.
    """
    return [word
            for doc in docs
            for word in token.findall(doc)]
# Tokenise the songs held in `songs` and preview the first 20 tokens.
song_words = words(songs)
song_words[:20]


# Finally we provide a function that can load all songs within a top-level directory.

# In[10]:


def load_all_songs(path):
    """Recursively load every song file found below a directory.

    Args:
        path: top-level directory to scan.

    Returns:
        A list of ``load_song`` results: first the song files located
        directly in ``path`` (ordered by name), then the songs of each
        sub-directory (also visited in name order).
    """
    # Local import: the file only imports isfile/join from os.path.
    from os.path import isdir

    song_files = []
    sub_dirs = []
    # One sorted pass over the directory: os.listdir order is arbitrary, and
    # sorting keeps the corpus order reproducible.
    for entry in sorted(listdir(path)):
        full_path = join(path, entry)
        if isfile(full_path):
            # same "looks like an OHHLA song file" filter as load_album
            if 'txt' in entry:
                song_files.append(full_path)
        elif isdir(full_path):
            # Only real directories are descended into; entries that are
            # neither file nor directory (e.g. dangling symlinks) would make
            # the recursive listdir call fail.
            sub_dirs.append(full_path)

    lyrics = [load_song(f) for f in song_files]
    sub_songs = [song for sub_path in sub_dirs for song in load_all_songs(sub_path)]
    return lyrics + sub_songs

# Count all songs found anywhere below the j_live artist directory.
len(load_all_songs("../data/ohhla/train/www.ohhla.com/anonymous/j_live/"))