pyword2vec single-thread experiment

In [1]:
import numpy as np
from collections import Counter
import mmap
import re
import networkx as nx
import math
import matplotlib.pyplot as plt
In [2]:
## Exponential (sigmoid) memoization table

MAX_EXP = 6
EXP_TABLE_SIZE = 1000
EXP_TABLE = np.arange(start = 0, stop = EXP_TABLE_SIZE, 
                      step = 1, dtype = np.float64)
EXP_TABLE = np.exp((EXP_TABLE / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1.)
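
The table memoizes the sigmoid function: entry x holds sigmoid((x / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP), so sigmoid(f) for any f in (-MAX_EXP, MAX_EXP) can be read off with the same index expression that appears later in the training loop. A quick sanity check of that lookup (1.5 is just an arbitrary test point):

## sanity check: the table lookup should approximate the exact sigmoid
f = 1.5
approx = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
exact = 1. / (1. + np.exp(-f))
print approx, exact  # should agree to roughly two decimal places
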
In [3]:
## HELPER FUNCTIONS
def file_to_wordstream(file_path):
    with open(file_path) as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        for match in re.finditer(r'(.*?)\s', mf):
            word = match.group(1)
            if word:
                yield word
                
def inspect_vocab_tree(vocab):
    g = nx.DiGraph()
    vocab_size = len(vocab)
    edges = set()
    for vw in vocab:
        tree_path = [i + vocab_size for i in vw['path']]
        tree_path = [str(i) if i >= vocab_size 
                         else "%d_%s(%d)" % (i, vocab[i]['word'], vocab[i]['count']) 
                     for i in tree_path]
        edges.update(zip(tree_path[:-1], tree_path[1:]))
    g.add_edges_from(edges)
    plt.figure(figsize=(16, 16))
    pos = nx.graphviz_layout(g, prog='dot')
    nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)
    return g
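
A small usage sketch for file_to_wordstream: it memory-maps a whitespace-separated text file and yields one word at a time, so any scratch file works for a quick look (the file path and contents below are just an illustration):

## hypothetical usage of file_to_wordstream on a throw-away file
with open('/tmp/tiny_corpus.txt', 'w') as fout:
    fout.write('the quick brown fox jumps over the lazy dog\n')
print list(file_to_wordstream('/tmp/tiny_corpus.txt'))
## expected: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
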
In [4]:
## HashedVocab

class HashedVocab(object):
    def __init__(self):
        self.HASH_SIZE = 30000000
        self.MIN_COUNT = 5
        self.vocab = []
        self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)
        self.vocab_hash.fill(-1)
    def fit(self, word_stream):
        counter = Counter(word_stream)
        self.vocab = map(lambda x: dict(zip(['word', 'count'], x)), 
                         filter(lambda x: x[1] > self.MIN_COUNT, 
                                    counter.most_common(len(counter))))
        if len(self.vocab) > self.HASH_SIZE * 0.7:
            raise RuntimeError('Vocab size too large, increase MIN_COUNT or increase HASH_SIZE')
        self.build_hash()
        self.build_huffman_tree()
    def search_for(self, word):
        word_hash = self.get_word_hash(word)
        while True:
            word_index = self.vocab_hash[word_hash]
            if word_index == -1:
                return -1
            elif word == self.vocab[word_index]['word']:
                return word_index
            else:
                word_hash = (word_hash + 1) % self.HASH_SIZE
    def __getitem__(self, word_index):
        return self.vocab[word_index]
    def __len__(self):
        return len(self.vocab)
    
    def build_hash(self):
        self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)
        self.vocab_hash.fill(-1)
        for word_index, vocab_word in enumerate(self.vocab):
            word = vocab_word['word']
            word_hash = self.get_word_hash(word)
            self.add_to_hash(word_hash, word_index)
    def get_word_hash(self, word):
        ## TOO SLOW
        #word_hash = sum([ord(c)*(257**i) 
        #             for i, c in zip(range(len(word))[::-1], word)])
        word_hash = 0
        for c in word:
            word_hash = word_hash * 257 + ord(c)
        word_hash %= self.HASH_SIZE
        return word_hash
    def add_to_hash(self, word_hash, word_index):
        while self.vocab_hash[word_hash] != -1:
            word_hash = (word_hash + 1) % self.HASH_SIZE
        self.vocab_hash[word_hash] = word_index
    def build_huffman_tree(self):
        """
        build binary Huffman Tree by word counts,
        the structure will be embedded in the vocab_word
        dict(word, count, path, code) in the vocab
        
        vocab_word['code'] will be the binary Huffman code of the word,
        derived from its frequency
        vocab_word['path'] will be the path from the root to the leaf,
        stored as node indices relative to vocab_size (the last entry is the leaf)
        """
        ## an arbitrary full binary tree with n leaves has at most n-1
        ## internal nodes. Note that the original C code sizes count,
        ## binary and parent_node as n*2+1 instead of n*2-1
        vocab_size = len(self.vocab)
        print "DEBUG:", vocab_size
        ## workhorse data structure for tree construction
        count = np.empty(vocab_size*2-1, dtype = np.int64)
        count.fill(1e15)
        count[:vocab_size] = [vw['count'] for vw in self.vocab]
        ## binary branch label (0/1) of each node within its parent
        binary = np.zeros(vocab_size*2-1, dtype = np.int8)
        ## parent of each node, used later to recover the path to the root
        parent_node = np.empty(vocab_size*2-1, dtype = np.int64)
        ## construct the tree by repeatedly merging the two smallest nodes:
        ## pos1 scans the (descending-count) leaves from right to left,
        ## pos2 scans the newly created internal nodes from left to right
        pos1, pos2 = vocab_size-1, vocab_size
        for a in xrange(vocab_size-1):
            ## min1i: first smallest remaining node
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min1i, pos1 = pos1, pos1-1
                else:
                    min1i, pos2 = pos2, pos2+1
            else:
                min1i, pos2 = pos2, pos2+1
            ## min2i: second smallest remaining node
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2i, pos1 = pos1, pos1-1
                else:
                    min2i, pos2 = pos2, pos2+1
            else:
                min2i, pos2 = pos2, pos2+1
            count[vocab_size + a] = count[min1i] + count[min2i]
            parent_node[min1i] = vocab_size + a
            parent_node[min2i] = vocab_size + a
            binary[min2i] = 1
        for a in xrange(vocab_size):
            ## back-trace from each leaf to the root to assemble its code and path
            b, i = a, 0
            code, path = [], []
            while True:
                code.append(binary[b])
                path.append(b)
                i += 1
                b = parent_node[b]
                if b == vocab_size * 2 - 2: break
            self.vocab[a]['path'] = [vocab_size - 2] + [p - vocab_size for p in path[::-1]]
            self.vocab[a]['code'] = code[::-1]
In [5]:
## Unigram Table
class UnigramTable(object):
    def __init__(self, hashed_vocab):
        self.TABLE_SIZE = int(1e8)
        self.table = np.empty(self.TABLE_SIZE, np.int64)
        self.power = 0.75
        vocab = hashed_vocab.vocab
        vocab_size = len(vocab)
        train_words_pow = sum(math.pow(w['count'], self.power) 
                              for w in vocab)
        i = 0
        d1 = math.pow(vocab[i]['count'], self.power) / train_words_pow
        for a in xrange(self.TABLE_SIZE):
            self.table[a] = i
            if a * 1. / self.TABLE_SIZE > d1:
                i += 1
                ## clamp before indexing so the last word absorbs any remainder
                if i >= vocab_size:
                    i = vocab_size - 1
                d1 += math.pow(vocab[i]['count'], self.power) / train_words_pow
    def __getitem__(self, index):
        return self.table[index]
    
In [17]:
## Learning Models
class Word2Vec(object):
    """
    Net Architecture: cbow / skip_gram
    Learning: hs / negative_sampling
    """
    def __init__(self, hashed_vocab, unigram_table, layer1_size,
                 net_type, learn_hs, learn_negative):
        """
        hashed_vocab: the fitted vocab to build on
        unigram_table: precomputed unigram table used for negative sampling
        layer1_size: the dimensionality of feature space
        net_type: {'cbow', 'skip_gram'}
        learn_hs : hierarchical softmax
        learn_negative: negative sampling
        """
        self.vocab = hashed_vocab
        self.table = unigram_table
        self.layer1_size = layer1_size
        self.net_type = net_type
        self.learn_hs = learn_hs
        self.learn_negative = learn_negative
        
        self.starting_alpha = 0.025
        self.sentence_len = 1000
        self.sampling = 1e-4
        self.window = 5 # max skip length between words
        self.negative = 5 # 5 - 10
        
        self.REAL_TYPE = np.float64
        self.syn0 = np.array([], dtype = self.REAL_TYPE)
        self.syn1 = np.array([], dtype = self.REAL_TYPE)
        self.syn1neg = np.array([], dtype = self.REAL_TYPE)
        
    def init_net(self):
        """
        syn0 - len(vocab) * layer1_size
        syn1 - len(vocab) * layer1_size
        syn1neg - len(vocab) * layer1_size
        """
        vocab_size = len(self.vocab)
        self.syn0 = np.random.uniform(low = -.5 / self.layer1_size, 
                                      high = .5 / self.layer1_size, 
                                      size = (vocab_size, self.layer1_size)).astype(self.REAL_TYPE)
        if self.learn_hs: 
            self.syn1 = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)
        if self.learn_negative:
            self.syn1neg = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)
    def fit(self, words):
        """
        word_stream: stream of words
        ntotal: total number of words in word_stream
        """
        self.init_net()
        
        ntotal = len(words)
        
        alpha = self.starting_alpha
        next_random = 0
        
        nprocessed = 0
        while nprocessed < ntotal:
            ## adjust learning rate alpha
            alpha = max(self.starting_alpha * (1 - nprocessed / (ntotal + 1.)), 
                        self.starting_alpha * 0.0001)
            ## refill the sentence
            sentence_index = []
            while nprocessed < ntotal and len(sentence_index) < self.sentence_len:
                ## sub-sampling of frequent words
                if nprocessed % 10000 == 0:
                    print 'progress:', nprocessed * 100. / ntotal, '%'
                word = words[nprocessed]
                word_index = self.vocab.search_for(word)
                nprocessed += 1
                if word_index == -1: continue
                word_count = self.vocab[word_index]['count']
                if self.sampling > 0:
                    ## keep a word with probability (sqrt(freq/t) + 1) * t/freq,
                    ## where freq = word_count / ntotal and t = self.sampling
                    ran = ((math.sqrt(word_count / (self.sampling * ntotal)) + 1)
                           * (self.sampling * ntotal) / word_count)
                    next_random = next_random * 25214903917 + 11
                    if ran < (next_random & 0xFFFF) / 65536.: continue
                sentence_index.append(word_index)
            ## for each word in sentence
            ## pivot is the current word index in sentence_index
            for ipivot, pivot in enumerate(sentence_index):
                next_random = next_random * 25214903917 + 11
                b = next_random % self.window
                neu1 = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
                neu1e = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
                if self.net_type == 'cbow':
                    ## accumulate neighbors into neu1
                    ## neighbors are sentence positions [ipivot-(window-b), ipivot+window-b]
                    left = max(0, ipivot-(self.window-b))
                    right = min(len(sentence_index)-1, ipivot+self.window-b)
                    ## map sentence positions to word indices, excluding the pivot itself
                    neighbors = sentence_index[left:ipivot] + sentence_index[ipivot+1:right+1]
                    ## NEU1 -- ACCUMULATION IN SENTENCE
                    ## IN -> HIDDEN
                    neu1 = np.sum(self.syn0[neighbors, :], axis = 0)
                    if self.learn_hs:
                        ## walk the Huffman path: path[:-1] are the internal nodes from
                        ## the root down to the leaf's parent; code gives the branch
                        ## taken at each of them
                        for parent_index, parent_code in zip(self.vocab[pivot]['path'][:-1], 
                                                             self.vocab[pivot]['code']):
                            ## f: dot product of the context vector neu1 and the
                            ## internal node's (class') output vector syn1
                            f = np.dot(neu1, self.syn1[parent_index, :])
                            if f <= -MAX_EXP or f >= MAX_EXP:
                                continue
                            else:
                                f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
                            g = (1 - parent_code - f) * alpha
                            neu1e += g * self.syn1[parent_index]
                            self.syn1[parent_index] += g * neu1
                    if self.learn_negative:
                        for d in xrange(self.negative + 1):
                            if d == 0:
                                target = pivot
                                label = 1
                            else:
                                next_random = next_random * 25214903917 + 11
                                target = self.table[(next_random >> 16) % self.table.TABLE_SIZE]
                                if (target == pivot): continue
                                label = 0
                            f = np.dot(neu1, self.syn1neg[target, :])
                            if f > MAX_EXP: 
                                g = (label - 1) * alpha
                            elif f < -MAX_EXP: 
                                g = (label - 0) * alpha
                            else: 
                                g = alpha * (label - EXP_TABLE[int((f + MAX_EXP) 
                                                                   * (EXP_TABLE_SIZE / MAX_EXP / 2))])
                            neu1e += g * self.syn1neg[target]
                            self.syn1neg[target] += g * neu1
                    ## HIDDEN -> IN
                    self.syn0[neighbors, :] += neu1e
                elif self.net_type == 'skip_gram':
                    ## skip_gram is not implemented in this single-thread experiment yet
                    pass

TESTING

In [7]:
## TEST HashedVocab
hashed_vocab = HashedVocab()
#hashed_vocab.fit(file_to_wordstream('data/text_simple'))
hashed_vocab.fit(file_to_wordstream('data/simple.txt'))
unigram_table = UnigramTable(hashed_vocab)
DEBUG: 1179
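
Negative samples are later drawn from the unigram table as table[(next_random >> 16) % TABLE_SIZE], so the share of table slots a word occupies should be roughly proportional to count ** 0.75. A rough check on the most frequent word (index 0):

## rough check: the table should over-represent frequent words per count ** 0.75
counts = np.array([vw['count'] for vw in hashed_vocab.vocab], dtype = np.float64)
expected = counts ** 0.75 / np.sum(counts ** 0.75)
empirical = np.mean(unigram_table.table == 0)
print 'expected share of word 0 :', expected[0]
print 'empirical share of slots :', empirical
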
In [8]:
for i, vw in enumerate(hashed_vocab.vocab):
    word = vw['word']
    assert i == hashed_vocab.search_for(word)
    
print hashed_vocab.search_for('alien')
print len(hashed_vocab.vocab)
#inspect_vocab_tree(hashed_vocab.vocab)
print hashed_vocab[10]['word']
for n in hashed_vocab[10]['path'][:-1]:
    print hashed_vocab[n]['word']
-1
1179
as
respect
practice
issued
texts
outside
private
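
The Huffman coding built by build_huffman_tree can also be spot-checked structurally: every word's path starts at the root (relative index vocab_size - 2), and code holds one bit per internal node on the path, so len(path) == len(code) + 1. A small sketch over the fitted vocab:

## structural spot-check of the Huffman codes
vocab_size = len(hashed_vocab)
for vw in hashed_vocab.vocab:
    assert len(vw['path']) == len(vw['code']) + 1
    assert vw['path'][0] == vocab_size - 2   # every path starts at the root
print 'most frequent word :', hashed_vocab[0]['word'], hashed_vocab[0]['code']
print 'least frequent word:', hashed_vocab[-1]['word'], hashed_vocab[-1]['code']
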
In [9]:
train_words = list(file_to_wordstream('data/simple.txt'))
In [18]:
word2vec = Word2Vec(hashed_vocab, unigram_table, 100, 'cbow', True, True)
word2vec.fit(train_words)
progress: 0.0 %
progress: 20.000400008 %
progress: 40.000800016 %
progress: 60.001200024 %
progress: 80.001600032 %
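
With training done, each row of word2vec.syn0 is the learned input vector of the word at that vocabulary index, so nearest neighbours can be read off with cosine similarity. A minimal sketch (most_similar is a hypothetical helper written here just for inspection; the query is simply the most frequent word in the vocab):

## nearest neighbours by cosine similarity over the learned input vectors syn0
def most_similar(model, vocab, word, topn = 5):
    idx = vocab.search_for(word)
    if idx == -1:
        return []
    vecs = model.syn0
    norms = np.sqrt(np.sum(vecs ** 2, axis = 1)) + 1e-10
    sims = np.dot(vecs, vecs[idx]) / (norms * norms[idx])
    best = np.argsort(-sims)
    return [(vocab[i]['word'], sims[i]) for i in best[:topn + 1] if i != idx][:topn]

query = hashed_vocab[0]['word']
print query, '->', most_similar(word2vec, hashed_vocab, query)
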
In [ ]: