Test tricks for pyword2vec

Main Change from v1:

  • decided to ignore "</s>" entirely: its handling is confusing and seems to be incorrectly implemented in the C version
In [1]:
from collections import namedtuple
import numpy as np
import mmap
import re
from IPython.display import display
import networkx as nx
import multiprocessing
import ctypes
from os import path
In [2]:
## control flags
flags = {
     'debug_mode': 2
    , 'hs': True
    , 'negative': True
    , 'train_file': 'data/text_simple'
    , 'num_threads': 10
}
In [3]:
## types
#VocabWord = {'word':??,'count':??, 'path':??, 'code':??}
REAL = np.float64
HASH_TYPE = np.int64
SHORT_INT = np.int8
In [4]:
## CONSTANTS
MAX_STRING = 100
EXP_TABLE_SIZE = 1000
MAX_EXP = 6
##
#VOCAB_MAX_SIZE = 1000
VOCAB_HASH_SIZE = 30000000
n_train_words = 0
##
MIN_COUNT = 5
MIN_REDUCE = 1
MAX_SENTENCE_LENGTH = 1000
MAX_CODE_LENGTH = 40
##
WINDOW = 5
LAYER1_SIZE = 100
ALPHA = 0.025
##
TABLE_SIZE = 1e8
In [5]:
## DATA STRUCTURE
vocab = []

vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
vocab_hash.fill(-1)

exp_table = np.arange(start = 0, stop = EXP_TABLE_SIZE, 
                      step = 1, dtype = REAL)
exp_table = np.exp((exp_table / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
exp_table = exp_table / (exp_table + 1.)


syn0 = np.array([], dtype = REAL)
syn1 = np.array([], dtype = REAL)
syn1neg = np.array([], dtype = REAL)
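The exp_table above precomputes the logistic sigmoid on EXP_TABLE_SIZE grid points spanning [-MAX_EXP, MAX_EXP), so training can turn a raw score into a probability with a single array lookup instead of calling exp(). A quick check of what the table stores (the score f = 2.0 below is just an illustrative value):

In [ ]:
## exp_table[i] == sigmoid(x_i) for x_i = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP
print exp_table[0], exp_table[EXP_TABLE_SIZE // 2], exp_table[-1]
## -> ~0.0025 (sigmoid(-6)), 0.5 (sigmoid(0)), ~0.9975 (sigmoid(+5.988))

## mapping an arbitrary score f in (-MAX_EXP, MAX_EXP) to a table index,
## mirroring the lookup the C version does inside its training loop
f = 2.0
print exp_table[int((f + MAX_EXP) / (2. * MAX_EXP) * EXP_TABLE_SIZE)]
## -> ~0.88, i.e. 1 / (1 + exp(-2))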
In [6]:
def get_word_hash(word):
    """Base-257 polynomial hash of word, modulo VOCAB_HASH_SIZE"""
    word_hash = sum([ord(c)*(257**i) 
                     for i, c in zip(range(len(word))[::-1], word)])
    word_hash %= VOCAB_HASH_SIZE
    return word_hash

def add_vocab_hash(word_hash, word_index):
    global vocab_hash
    while vocab_hash[word_hash] != -1:
        word_hash = (word_hash + 1) % VOCAB_HASH_SIZE
    vocab_hash[word_hash] = word_index
    
def search_vocab(word):
    """
    Search for word's vocab_index by using the vocab_hash
    """
    word_hash = get_word_hash(word)
    while True:
        word_index = vocab_hash[word_hash]
        ## not found
        if word_index == -1:
            return -1
        elif word == vocab[word_index]['word']:
            return word_index
        else:
            word_hash = (word_hash + 1) % VOCAB_HASH_SIZE
    return -1 # should never reach here

def reduce_vocab():
    """
    Reduce the vocabulary size by removing infrequent tokens
    """
    global vocab, vocab_hash, MIN_REDUCE
    ## in-place remove infrequent words
    a, b = 0, 0
    for a in xrange(len(vocab)):
        if vocab[a]['count'] > MIN_REDUCE:
            vocab[b]['count'] = vocab[a]['count']
            vocab[b]['word'] = vocab[a]['word']
            b += 1
    vocab = vocab[:b]
    ## reset the hash table
    vocab_hash.fill(-1)
    for word_index, vocab_word in enumerate(vocab):
        word_hash = get_word_hash(vocab_word['word'])
        add_vocab_hash(word_hash, word_index)
    MIN_REDUCE += 1

def add_word_to_vocab(word):
    """
    construct a VocabWord {'count', 'path', 'word', 'code'} 
    from word
    add vocab_word to vocab
    put its index to vocab_hash
    word_index: the index of vocab_word in vocab
    word_hash: the index of word_index in vocab_hash
    """
    global vocab, vocab_hash
    vocab_word = dict(count = 0, path = None, word = word, code = None)
    vocab.append(vocab_word)
    
    word_hash = get_word_hash(word)
    word_index = len(vocab)-1
    add_vocab_hash(word_hash, word_index)
    return word_index

def sort_vocab():
    """
    sort the vocabulary by word frequency;
    the original C version keeps </s> at the first position with
    count = 0 and leaves it unhashed (its slot in vocab_hash stays -1),
    but this port ignores </s> entirely (see the change note above)
    """
    global vocab, vocab_hash, n_train_words
    ## sort the vocab
    ## by word frequency in DESCENDING order
    vocab = sorted(vocab, key = lambda v: v['count'], 
                       reverse = True)
    ## re-initialize vocab_hash, reduce vocab
    vocab_hash.fill(-1)
    vocab_sz = len(vocab)
    n_train_words = 0
    for iword, vword in enumerate(vocab):
        ## discarding words less than MIN_COUNT
        if vword['count'] < MIN_COUNT:
            vocab_sz -= 1
        else: 
            word_hash = get_word_hash(vword['word'])
            add_vocab_hash(word_hash, iword)
            n_train_words += vword['count']
    ## truncate vocab
    vocab = vocab[:vocab_sz]

def read_word(fpath):
    """
    Lazily read words from a file (words separated by whitespace),
    using mmap (the OS virtual memory system) to map the file;
    ONLY TESTED on POSIX.
    Unlike the original C version, we do NOT insert </s> for every
    \n here (the training file contains no \n anyway)
    """
    with open(fpath) as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        for word in re.finditer(r'(.*?)\s', mf):
            w = word.group(1)
            if w: 
                yield w
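get_word_hash is a base-257 polynomial hash reduced modulo VOCAB_HASH_SIZE; add_vocab_hash and search_vocab resolve collisions by linear probing, stepping to the next slot until they hit a free slot (insert) or the matching word / an empty slot (lookup). A toy sketch of that probing behaviour on a hypothetical 8-slot table (tiny_table and tiny_probe_insert are illustrative only, not part of the port):

In [ ]:
## miniature open-addressing table: -1 means empty, otherwise a word index
tiny_table = [-1] * 8

def tiny_probe_insert(table, word_hash, word_index):
    ## same loop as add_vocab_hash, just on a small Python list
    while table[word_hash] != -1:
        word_hash = (word_hash + 1) % len(table)
    table[word_hash] = word_index

tiny_probe_insert(tiny_table, 3, 0)   ## first word hashes to slot 3
tiny_probe_insert(tiny_table, 3, 1)   ## second word collides -> lands in slot 4
print tiny_table                      ## -> [-1, -1, -1, 0, 1, -1, -1, -1]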
In [7]:
def learn_vocab_from_file(fpath):
    """
    fpath: path of train file,
    train file is a collection of words delimited by whitespace
    modify: vocab - np.array of dtype=vocab_word
            vocab_hash - hashed index of words in vocab
    """
    global vocab, vocab_hash
    ## preallocate the hash table
    vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
    vocab_hash.fill(-1)
    for i, word in enumerate(read_word(fpath)):
        if flags['debug_mode'] > 1 and i % 1000000 == 0:
            print "%iM" % (i/1000000)
        ## find the word's vocab_index by using vocab_hash
        word_index = search_vocab(word)
        ## add new word
        if word_index == -1:
            word_index = add_word_to_vocab(word)
            vocab[word_index]['count'] = 1
        else:
            vocab[word_index]['count'] += 1
        ## shrink the vocab when the hash table is over 70% full
        if len(vocab) > VOCAB_HASH_SIZE * 0.7:
            reduce_vocab()
    sort_vocab()
    if flags['debug_mode'] > 0:
        print 'Vocab Size: %i\nWords in train file: %d' % (len(vocab), i)
In [8]:
def init_net():
    """
    syn0 - vocab_size x LAYER1_SIZE input word vectors
    syn1 - vocab_size x LAYER1_SIZE hierarchical-softmax weights
    syn1neg - vocab_size x LAYER1_SIZE negative-sampling weights
    syn0 gets small uniform random values; syn1 / syn1neg start at zero
    All three live in multiprocessing shared memory (ctypes.c_float,
    i.e. float32) so the worker processes update the same arrays
    """
    global syn0, syn1, syn1neg
    vocab_size = len(vocab)
    shared_syn0_base = multiprocessing.Array(ctypes.c_float, 
                                             vocab_size * LAYER1_SIZE)
    syn0 = np.ctypeslib.as_array(shared_syn0_base.get_obj())
    syn0[:] = np.random.uniform(low = -.5 / LAYER1_SIZE,
                             high = .5 / LAYER1_SIZE,
                             size = vocab_size * LAYER1_SIZE)
    syn0 = syn0.reshape((vocab_size, LAYER1_SIZE))  ## reshape returns a new view, so re-bind it
    
    if flags['hs']:
        shared_syn1_base = multiprocessing.Array(ctypes.c_float, 
                                             vocab_size * LAYER1_SIZE)
        syn1 = np.ctypeslib.as_array(shared_syn1_base.get_obj())
        syn1 = syn1.reshape((vocab_size, LAYER1_SIZE))
    if flags['negative']:
        shared_syn1neg_base = multiprocessing.Array(ctypes.c_float, 
                                             vocab_size * LAYER1_SIZE)
        syn1neg = np.ctypeslib.as_array(shared_syn1neg_base.get_obj())
        syn1neg = syn1neg.reshape((vocab_size, LAYER1_SIZE))
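init_net backs the weight matrices with multiprocessing.Array so the training processes launched later all read and write the same memory, mirroring the lock-free shared weights of the C version's pthreads. A minimal sketch of that sharing pattern (the toy buffer and the bump_first_row worker are illustrative only, not part of the port):

In [ ]:
def bump_first_row(shared_base):
    ## re-wrap the shared ctypes buffer as a numpy array inside the child
    arr = np.ctypeslib.as_array(shared_base.get_obj())
    arr[:LAYER1_SIZE] += 1.0          ## the parent sees this write

toy_base = multiprocessing.Array(ctypes.c_float, 2 * LAYER1_SIZE)
toy_view = np.ctypeslib.as_array(toy_base.get_obj())
p = multiprocessing.Process(target = bump_first_row, args = (toy_base,))
p.start(); p.join()
print toy_view[:5]                    ## -> [ 1.  1.  1.  1.  1.]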
In [9]:
def create_binary_tree():
    """Huffman tree by word counts
    word['code'] will be the binary representation of word based on frequency
    word['path'] will be the path from root to leaf in the tree
    """
    global vocab
    ## a full binary tree with n leaves has exactly n-1 internal nodes,
    ## but the original C code allocates count, binary and parent_node
    ## with size n*2+1 instead of n*2-1
    
    ## since </s> is dropped here, vocab_size is the C version's vocab_size - 1
    vocab_size = len(vocab)
    ## count - tree construction based on count
    count = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    count.fill(1e15)
    count[:vocab_size] = [vw['count'] for vw in vocab]
    ## binary - boolean value of each node
    binary = np.zeros(vocab_size*2-1, dtype = SHORT_INT)
    ## parent_node
    parent_node = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    ## construct the tree
    pos1, pos2 = vocab_size-1, vocab_size
    for a in xrange(vocab_size-1):
        ## min1i
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min1i, pos1 = pos1, pos1-1
            else:
                min1i, pos2 = pos2, pos2+1
        else:
            min1i, pos2 = pos2, pos2+1
        ## min2i
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min2i, pos1 = pos1, pos1-1
            else:
                min2i, pos2 = pos2, pos2+1
        else:
            min2i, pos2 = pos2, pos2+1
        count[vocab_size + a] = count[min1i] + count[min2i]
        parent_node[min1i] = vocab_size + a
        parent_node[min2i] = vocab_size + a
        binary[min2i] = 1
    for a in xrange(vocab_size):
        b, i = a, 0
        code, path = [], []
        while True:
            code.append(binary[b])
            path.append(b)
            i += 1
            b = parent_node[b]
            if b == vocab_size * 2 - 2: break
        vocab[a]['path'] = [vocab_size - 2] + [p - vocab_size for p in path[::-1]]
        vocab[a]['code'] = code[::-1]
        
def inspect_vocab_tree(vocab):
    g = nx.DiGraph()
    vocab_size = len(vocab)
    edges = set()
    for vw in vocab:
        tree_path = [i + vocab_size for i in vw['path']]
        tree_path = [str(i) if i >= vocab_size 
                         else "%d_%s(%d)" % (i, vocab[i]['word'], vocab[i]['count']) 
                     for i in tree_path]
        edges.update(zip(tree_path[:-1], tree_path[1:]))
    g.add_edges_from(edges)
    figure(figsize=(16, 16))
    pos = nx.graphviz_layout(g, prog='dot')
    nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)
    return g
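After create_binary_tree runs, every vocab entry carries a Huffman code and its root-to-leaf path, so frequent words get short codes and the codes are prefix-free. The helper below is a sanity check I added (check_huffman is not part of the port); it is meant to be run after the create_binary_tree() call in the TESTING section below:

In [ ]:
def check_huffman(vocab):
    """Spot-check the structures produced by create_binary_tree"""
    codes = [''.join(str(b) for b in vw['code']) for vw in vocab]
    ## prefix property: in sorted order, no code is a prefix of its successor
    codes_sorted = sorted(codes)
    for c1, c2 in zip(codes_sorted, codes_sorted[1:]):
        assert not c2.startswith(c1)
    ## the path holds one more node than the code has bits
    for vw in vocab:
        assert len(vw['path']) == len(vw['code']) + 1
    ## the most frequent word never gets a longer code than the rarest
    assert len(vocab[0]['code']) <= len(vocab[-1]['code'])
    print 'Huffman checks passed for %d words' % len(vocab)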
In [10]:
def refill(mf, sent_len, end):
    """
    Read up to sent_len whitespace-delimited words from the mmap'd file mf,
    stopping at file offset `end`; returns the words as a list
    """
    nwords = 0
    sentence = []
    while True:
        c = mf.read(1)
        if not c: break            ## end of file
        if mf.tell() > end: break  ## end of this thread's chunk
        sentence.append(c)         ## accumulate characters of the current chunk
        if c == ' ': nwords += 1
        if nwords == sent_len: break
    return ''.join(sentence).split()

def train_model_thread(pid):
    """architecture: cbow / skip_gram
    learning: hs / negative_sampling
    running in multiprocessing pool
    """
    global syn0, syn1
    neu1 = np.empty(LAYER1_SIZE, dtype = REAL)
    neu1e = np.empty(LAYER1_SIZE, dtype = REAL)
    num_threads = flags['num_threads']
    fsize = path.getsize(flags['train_file'])
    with open(flags['train_file'], 'r') as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        fstart = fsize / num_threads * pid
        fend = fsize / num_threads * (pid + 1)
        ftell = fstart
        for word in re.finditer(r'(.*?)\s', mf[fstart:]):
            if ftell > fend: break
            w = word.group(1)
            ftell += len(word.group(0))
            pass  ## TODO: per-word training update (cbow / skip-gram with hs / negative sampling)
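The per-word update that is still missing above would, for skip-gram with hierarchical softmax, walk the Huffman path of the center word and move syn0/syn1 along the gradient. The sketch below is my reading of that single-pair update, assuming init_net() and create_binary_tree() have already run; hs_update_pair is not part of the port and it omits the C version's learning-rate decay, subsampling and cbow / negative-sampling branches:

In [ ]:
def hs_update_pair(center_idx, context_idx, alpha = ALPHA):
    """One skip-gram + hierarchical-softmax update for a (center, context) pair"""
    l1 = syn0[context_idx]                    ## input vector of the context word
    neu1e = np.zeros(LAYER1_SIZE, dtype = REAL)
    vw = vocab[center_idx]
    ## walk the internal nodes on the center word's Huffman path
    for node, code_bit in zip(vw['path'][:-1], vw['code']):
        f = np.dot(l1, syn1[node])            ## score against this internal node
        if f >= MAX_EXP or f <= -MAX_EXP:     ## skip saturated scores, as in C
            continue
        f = exp_table[int((f + MAX_EXP) / (2. * MAX_EXP) * EXP_TABLE_SIZE)]
        g = (1 - code_bit - f) * alpha        ## gradient times learning rate
        neu1e += g * syn1[node]               ## accumulate error for the input vector
        syn1[node] += g * l1                  ## update the internal-node vector
    syn0[context_idx] += neu1e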
TESTING

In [11]:
learn_vocab_from_file('data/text_simple')
0M
Vocab Size: 92
Words in train file: 2899
In [12]:
display(vocab)
display([vocab_hash[h] for h in [get_word_hash(w['word']) for w in vocab]])
[{'code': None, 'count': 141, 'path': None, 'word': 'the'},
 {'code': None, 'count': 112, 'path': None, 'word': 'acid'},
 {'code': None, 'count': 84, 'path': None, 'word': 'a'},
 {'code': None, 'count': 71, 'path': None, 'word': 'and'},
 {'code': None, 'count': 68, 'path': None, 'word': 'of'},
 {'code': None, 'count': 57, 'path': None, 'word': 'in'},
 {'code': None, 'count': 53, 'path': None, 'word': 'is'},
 {'code': None, 'count': 49, 'path': None, 'word': 'three'},
 {'code': None, 'count': 48, 'path': None, 'word': 'to'},
 {'code': None, 'count': 47, 'path': None, 'word': 'two'},
 {'code': None, 'count': 35, 'path': None, 'word': 'acids'},
 {'code': None, 'count': 34, 'path': None, 'word': 'are'},
 {'code': None, 'count': 33, 'path': None, 'word': 'one'},
 {'code': None, 'count': 33, 'path': None, 'word': 'e'},
 {'code': None, 'count': 30, 'path': None, 'word': 'zero'},
 {'code': None, 'count': 30, 'path': None, 'word': 'h'},
 {'code': None, 'count': 28, 'path': None, 'word': 'for'},
 {'code': None, 'count': 26, 'path': None, 'word': 'aq'},
 {'code': None, 'count': 26, 'path': None, 'word': 'k'},
 {'code': None, 'count': 23, 'path': None, 'word': 'an'},
 {'code': None, 'count': 21, 'path': None, 'word': 'that'},
 {'code': None, 'count': 19, 'path': None, 'word': 'asphalt'},
 {'code': None, 'count': 19, 'path': None, 'word': 'standards'},
 {'code': None, 'count': 18, 'path': None, 'word': 'o'},
 {'code': None, 'count': 18, 'path': None, 'word': 'as'},
 {'code': None, 'count': 17, 'path': None, 'word': 'abacus'},
 {'code': None, 'count': 17, 'path': None, 'word': 'this'},
 {'code': None, 'count': 16, 'path': None, 'word': 'by'},
 {'code': None, 'count': 16, 'path': None, 'word': 'with'},
 {'code': None, 'count': 16, 'path': None, 'word': 'can'},
 {'code': None, 'count': 15, 'path': None, 'word': 'or'},
 {'code': None, 'count': 15, 'path': None, 'word': 'five'},
 {'code': None, 'count': 15, 'path': None, 'word': 'four'},
 {'code': None, 'count': 15, 'path': None, 'word': 'six'},
 {'code': None, 'count': 14, 'path': None, 'word': 'base'},
 {'code': None, 'count': 14, 'path': None, 'word': 'nine'},
 {'code': None, 'count': 14, 'path': None, 'word': 'ansi'},
 {'code': None, 'count': 13, 'path': None, 'word': 'water'},
 {'code': None, 'count': 12, 'path': None, 'word': 'definition'},
 {'code': None, 'count': 12, 'path': None, 'word': 'found'},
 {'code': None, 'count': 11, 'path': None, 'word': 'be'},
 {'code': None, 'count': 11, 'path': None, 'word': 'ha'},
 {'code': None, 'count': 11, 'path': None, 'word': 'seven'},
 {'code': None, 'count': 11, 'path': None, 'word': 'eight'},
 {'code': None, 'count': 11, 'path': None, 'word': 'proton'},
 {'code': None, 'count': 11, 'path': None, 'word': 'which'},
 {'code': None, 'count': 10, 'path': None, 'word': 'used'},
 {'code': None, 'count': 10, 'path': None, 'word': 'dissociation'},
 {'code': None, 'count': 9, 'path': None, 'word': 'american'},
 {'code': None, 'count': 9, 'path': None, 'word': 'weak'},
 {'code': None, 'count': 9, 'path': None, 'word': 'form'},
 {'code': None, 'count': 8, 'path': None, 'word': 'they'},
 {'code': None, 'count': 8, 'path': None, 'word': 'use'},
 {'code': None, 'count': 8, 'path': None, 'word': 'some'},
 {'code': None, 'count': 8, 'path': None, 'word': 'from'},
 {'code': None, 'count': 8, 'path': None, 'word': 'lewis'},
 {'code': None, 'count': 8, 'path': None, 'word': 'most'},
 {'code': None, 'count': 8, 'path': None, 'word': 'strong'},
 {'code': None, 'count': 8, 'path': None, 'word': 'example'},
 {'code': None, 'count': 7, 'path': None, 'word': 'have'},
 {'code': None, 'count': 7, 'path': None, 'word': 'also'},
 {'code': None, 'count': 7, 'path': None, 'word': 'bases'},
 {'code': None, 'count': 7, 'path': None, 'word': 'it'},
 {'code': None, 'count': 7, 'path': None, 'word': 'constant'},
 {'code': None, 'count': 7, 'path': None, 'word': 'l'},
 {'code': None, 'count': 6, 'path': None, 'word': 'so'},
 {'code': None, 'count': 6, 'path': None, 'word': 's'},
 {'code': None, 'count': 6, 'path': None, 'word': 'more'},
 {'code': None, 'count': 6, 'path': None, 'word': 'organic'},
 {'code': None, 'count': 6, 'path': None, 'word': 'protons'},
 {'code': None, 'count': 6, 'path': None, 'word': 'orbital'},
 {'code': None, 'count': 6, 'path': None, 'word': 'its'},
 {'code': None, 'count': 6, 'path': None, 'word': 'anion'},
 {'code': None, 'count': 6, 'path': None, 'word': 'hydrochloric'},
 {'code': None, 'count': 6, 'path': None, 'word': 'donate'},
 {'code': None, 'count': 6, 'path': None, 'word': 'products'},
 {'code': None, 'count': 6, 'path': None, 'word': 'institute'},
 {'code': None, 'count': 5, 'path': None, 'word': 'when'},
 {'code': None, 'count': 5, 'path': None, 'word': 'called'},
 {'code': None, 'count': 5, 'path': None, 'word': 'was'},
 {'code': None, 'count': 5, 'path': None, 'word': 'solution'},
 {'code': None, 'count': 5, 'path': None, 'word': 'has'},
 {'code': None, 'count': 5, 'path': None, 'word': 'according'},
 {'code': None, 'count': 5, 'path': None, 'word': 'ion'},
 {'code': None, 'count': 5, 'path': None, 'word': 'substances'},
 {'code': None, 'count': 5, 'path': None, 'word': 'all'},
 {'code': None, 'count': 5, 'path': None, 'word': 'conjugate'},
 {'code': None, 'count': 5, 'path': None, 'word': 'these'},
 {'code': None, 'count': 5, 'path': None, 'word': 'such'},
 {'code': None, 'count': 5, 'path': None, 'word': 'sulfuric'},
 {'code': None, 'count': 5, 'path': None, 'word': 'tar'},
 {'code': None, 'count': 5, 'path': None, 'word': 'national'}]
[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91]
In [13]:
create_binary_tree()
In [14]:
display(vocab)
[{'code': [1, 1, 1, 1],
  'count': 141,
  'path': [90, 89, 87, 83, -92],
  'word': 'the'},
 {'code': [1, 0, 0, 1],
  'count': 112,
  'path': [90, 89, 86, 80, -91],
  'word': 'acid'},
 {'code': [0, 0, 1, 1],
  'count': 84,
  'path': [90, 88, 84, 77, -90],
  'word': 'a'},
 {'code': [1, 1, 1, 0, 1],
  'count': 71,
  'path': [90, 89, 87, 83, 75, -89],
  'word': 'and'},
 {'code': [1, 1, 0, 1, 1],
  'count': 68,
  'path': [90, 89, 87, 82, 74, -88],
  'word': 'of'},
 {'code': [1, 0, 1, 0, 0],
  'count': 57,
  'path': [90, 89, 86, 81, 71, -87],
  'word': 'in'},
 {'code': [1, 0, 0, 0, 0],
  'count': 53,
  'path': [90, 89, 86, 80, 70, -86],
  'word': 'is'},
 {'code': [0, 1, 1, 0, 1],
  'count': 49,
  'path': [90, 88, 85, 79, 68, -85],
  'word': 'three'},
 {'code': [0, 1, 1, 0, 0],
  'count': 48,
  'path': [90, 88, 85, 79, 68, -84],
  'word': 'to'},
 {'code': [0, 1, 0, 1, 0],
  'count': 47,
  'path': [90, 88, 85, 78, 67, -83],
  'word': 'two'},
 {'code': [1, 1, 1, 0, 0, 1],
  'count': 35,
  'path': [90, 89, 87, 83, 75, 62, -82],
  'word': 'acids'},
 {'code': [1, 1, 1, 0, 0, 0],
  'count': 34,
  'path': [90, 89, 87, 83, 75, 62, -81],
  'word': 'are'},
 {'code': [1, 1, 0, 1, 0, 0],
  'count': 33,
  'path': [90, 89, 87, 82, 74, 61, -80],
  'word': 'one'},
 {'code': [1, 1, 0, 0, 1, 1],
  'count': 33,
  'path': [90, 89, 87, 82, 73, 60, -79],
  'word': 'e'},
 {'code': [1, 0, 1, 1, 1, 0],
  'count': 30,
  'path': [90, 89, 86, 81, 72, 58, -78],
  'word': 'zero'},
 {'code': [1, 0, 1, 1, 0, 1],
  'count': 30,
  'path': [90, 89, 86, 81, 72, 57, -77],
  'word': 'h'},
 {'code': [1, 0, 1, 0, 1, 0],
  'count': 28,
  'path': [90, 89, 86, 81, 71, 56, -76],
  'word': 'for'},
 {'code': [0, 1, 1, 1, 1, 0],
  'count': 26,
  'path': [90, 88, 85, 79, 69, 54, -75],
  'word': 'aq'},
 {'code': [0, 1, 1, 1, 0, 1],
  'count': 26,
  'path': [90, 88, 85, 79, 69, 53, -74],
  'word': 'k'},
 {'code': [0, 1, 0, 0, 1, 0],
  'count': 23,
  'path': [90, 88, 85, 78, 66, 51, -73],
  'word': 'an'},
 {'code': [0, 0, 1, 0, 1, 0],
  'count': 21,
  'path': [90, 88, 84, 77, 65, 49, -72],
  'word': 'that'},
 {'code': [0, 0, 0, 1, 0, 0],
  'count': 19,
  'path': [90, 88, 84, 76, 64, 46, -71],
  'word': 'asphalt'},
 {'code': [0, 0, 0, 0, 1, 1],
  'count': 19,
  'path': [90, 88, 84, 76, 63, 45, -70],
  'word': 'standards'},
 {'code': [0, 0, 0, 0, 1, 0],
  'count': 18,
  'path': [90, 88, 84, 76, 63, 45, -69],
  'word': 'o'},
 {'code': [0, 0, 0, 0, 0, 1],
  'count': 18,
  'path': [90, 88, 84, 76, 63, 44, -68],
  'word': 'as'},
 {'code': [1, 1, 0, 1, 0, 1, 1],
  'count': 17,
  'path': [90, 89, 87, 82, 74, 61, 43, -67],
  'word': 'abacus'},
 {'code': [1, 1, 0, 1, 0, 1, 0],
  'count': 17,
  'path': [90, 89, 87, 82, 74, 61, 43, -66],
  'word': 'this'},
 {'code': [1, 1, 0, 0, 1, 0, 0],
  'count': 16,
  'path': [90, 89, 87, 82, 73, 60, 42, -65],
  'word': 'by'},
 {'code': [1, 1, 0, 0, 0, 1, 1],
  'count': 16,
  'path': [90, 89, 87, 82, 73, 59, 41, -64],
  'word': 'with'},
 {'code': [1, 1, 0, 0, 0, 1, 0],
  'count': 16,
  'path': [90, 89, 87, 82, 73, 59, 41, -63],
  'word': 'can'},
 {'code': [1, 0, 1, 1, 1, 1, 0],
  'count': 15,
  'path': [90, 89, 86, 81, 72, 58, 39, -62],
  'word': 'or'},
 {'code': [1, 0, 1, 1, 0, 0, 1],
  'count': 15,
  'path': [90, 89, 86, 81, 72, 57, 38, -61],
  'word': 'five'},
 {'code': [1, 0, 1, 1, 0, 0, 0],
  'count': 15,
  'path': [90, 89, 86, 81, 72, 57, 38, -60],
  'word': 'four'},
 {'code': [1, 0, 1, 0, 1, 1, 1],
  'count': 15,
  'path': [90, 89, 86, 81, 71, 56, 37, -59],
  'word': 'six'},
 {'code': [1, 0, 0, 0, 1, 1, 1],
  'count': 14,
  'path': [90, 89, 86, 80, 70, 55, 36, -58],
  'word': 'base'},
 {'code': [1, 0, 0, 0, 1, 1, 0],
  'count': 14,
  'path': [90, 89, 86, 80, 70, 55, 36, -57],
  'word': 'nine'},
 {'code': [1, 0, 0, 0, 1, 0, 1],
  'count': 14,
  'path': [90, 89, 86, 80, 70, 55, 35, -56],
  'word': 'ansi'},
 {'code': [0, 1, 1, 1, 1, 1, 0],
  'count': 13,
  'path': [90, 88, 85, 79, 69, 54, 34, -55],
  'word': 'water'},
 {'code': [0, 1, 1, 1, 0, 0, 0],
  'count': 12,
  'path': [90, 88, 85, 79, 69, 53, 33, -54],
  'word': 'definition'},
 {'code': [0, 1, 0, 1, 1, 1, 1],
  'count': 12,
  'path': [90, 88, 85, 78, 67, 52, 32, -53],
  'word': 'found'},
 {'code': [0, 1, 0, 0, 0, 1, 1],
  'count': 11,
  'path': [90, 88, 85, 78, 66, 50, 29, -52],
  'word': 'be'},
 {'code': [0, 1, 0, 0, 0, 1, 0],
  'count': 11,
  'path': [90, 88, 85, 78, 66, 50, 29, -51],
  'word': 'ha'},
 {'code': [0, 1, 0, 0, 0, 0, 1],
  'count': 11,
  'path': [90, 88, 85, 78, 66, 50, 28, -50],
  'word': 'seven'},
 {'code': [0, 1, 0, 0, 0, 0, 0],
  'count': 11,
  'path': [90, 88, 85, 78, 66, 50, 28, -49],
  'word': 'eight'},
 {'code': [0, 0, 1, 0, 1, 1, 1],
  'count': 11,
  'path': [90, 88, 84, 77, 65, 49, 27, -48],
  'word': 'proton'},
 {'code': [0, 0, 1, 0, 1, 1, 0],
  'count': 11,
  'path': [90, 88, 84, 77, 65, 49, 27, -47],
  'word': 'which'},
 {'code': [0, 0, 1, 0, 0, 1, 0],
  'count': 10,
  'path': [90, 88, 84, 77, 65, 48, 26, -46],
  'word': 'used'},
 {'code': [0, 0, 1, 0, 0, 0, 1],
  'count': 10,
  'path': [90, 88, 84, 77, 65, 48, 25, -45],
  'word': 'dissociation'},
 {'code': [0, 0, 0, 0, 0, 0, 1],
  'count': 9,
  'path': [90, 88, 84, 76, 63, 44, 21, -44],
  'word': 'american'},
 {'code': [0, 0, 0, 0, 0, 0, 0],
  'count': 9,
  'path': [90, 88, 84, 76, 63, 44, 21, -43],
  'word': 'weak'},
 {'code': [1, 1, 0, 0, 1, 0, 1, 1],
  'count': 9,
  'path': [90, 89, 87, 82, 73, 60, 42, 20, -42],
  'word': 'form'},
 {'code': [1, 1, 0, 0, 1, 0, 1, 0],
  'count': 8,
  'path': [90, 89, 87, 82, 73, 60, 42, 20, -41],
  'word': 'they'},
 {'code': [1, 1, 0, 0, 0, 0, 1, 1],
  'count': 8,
  'path': [90, 89, 87, 82, 73, 59, 40, 19, -40],
  'word': 'use'},
 {'code': [1, 1, 0, 0, 0, 0, 1, 0],
  'count': 8,
  'path': [90, 89, 87, 82, 73, 59, 40, 19, -39],
  'word': 'some'},
 {'code': [1, 1, 0, 0, 0, 0, 0, 1],
  'count': 8,
  'path': [90, 89, 87, 82, 73, 59, 40, 18, -38],
  'word': 'from'},
 {'code': [1, 1, 0, 0, 0, 0, 0, 0],
  'count': 8,
  'path': [90, 89, 87, 82, 73, 59, 40, 18, -37],
  'word': 'lewis'},
 {'code': [1, 0, 1, 1, 1, 1, 1, 1],
  'count': 8,
  'path': [90, 89, 86, 81, 72, 58, 39, 17, -36],
  'word': 'most'},
 {'code': [1, 0, 1, 1, 1, 1, 1, 0],
  'count': 8,
  'path': [90, 89, 86, 81, 72, 58, 39, 17, -35],
  'word': 'strong'},
 {'code': [1, 0, 1, 0, 1, 1, 0, 1],
  'count': 8,
  'path': [90, 89, 86, 81, 71, 56, 37, 16, -34],
  'word': 'example'},
 {'code': [1, 0, 1, 0, 1, 1, 0, 0],
  'count': 7,
  'path': [90, 89, 86, 81, 71, 56, 37, 16, -33],
  'word': 'have'},
 {'code': [1, 0, 0, 0, 1, 0, 0, 1],
  'count': 7,
  'path': [90, 89, 86, 80, 70, 55, 35, 15, -32],
  'word': 'also'},
 {'code': [1, 0, 0, 0, 1, 0, 0, 0],
  'count': 7,
  'path': [90, 89, 86, 80, 70, 55, 35, 15, -31],
  'word': 'bases'},
 {'code': [0, 1, 1, 1, 1, 1, 1, 1],
  'count': 7,
  'path': [90, 88, 85, 79, 69, 54, 34, 14, -30],
  'word': 'it'},
 {'code': [0, 1, 1, 1, 1, 1, 1, 0],
  'count': 7,
  'path': [90, 88, 85, 79, 69, 54, 34, 14, -29],
  'word': 'constant'},
 {'code': [0, 1, 1, 1, 0, 0, 1, 1],
  'count': 7,
  'path': [90, 88, 85, 79, 69, 53, 33, 13, -28],
  'word': 'l'},
 {'code': [0, 1, 1, 1, 0, 0, 1, 0],
  'count': 6,
  'path': [90, 88, 85, 79, 69, 53, 33, 13, -27],
  'word': 'so'},
 {'code': [0, 1, 0, 1, 1, 1, 0, 1],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 32, 12, -26],
  'word': 's'},
 {'code': [0, 1, 0, 1, 1, 1, 0, 0],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 32, 12, -25],
  'word': 'more'},
 {'code': [0, 1, 0, 1, 1, 0, 1, 1],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 31, 11, -24],
  'word': 'organic'},
 {'code': [0, 1, 0, 1, 1, 0, 1, 0],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 31, 11, -23],
  'word': 'protons'},
 {'code': [0, 1, 0, 1, 1, 0, 0, 1],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 31, 10, -22],
  'word': 'orbital'},
 {'code': [0, 1, 0, 1, 1, 0, 0, 0],
  'count': 6,
  'path': [90, 88, 85, 78, 67, 52, 31, 10, -21],
  'word': 'its'},
 {'code': [0, 1, 0, 0, 1, 1, 1, 1],
  'count': 6,
  'path': [90, 88, 85, 78, 66, 51, 30, 9, -20],
  'word': 'anion'},
 {'code': [0, 1, 0, 0, 1, 1, 1, 0],
  'count': 6,
  'path': [90, 88, 85, 78, 66, 51, 30, 9, -19],
  'word': 'hydrochloric'},
 {'code': [0, 1, 0, 0, 1, 1, 0, 1],
  'count': 6,
  'path': [90, 88, 85, 78, 66, 51, 30, 8, -18],
  'word': 'donate'},
 {'code': [0, 1, 0, 0, 1, 1, 0, 0],
  'count': 6,
  'path': [90, 88, 85, 78, 66, 51, 30, 8, -17],
  'word': 'products'},
 {'code': [0, 0, 1, 0, 0, 1, 1, 1],
  'count': 6,
  'path': [90, 88, 84, 77, 65, 48, 26, 7, -16],
  'word': 'institute'},
 {'code': [0, 0, 1, 0, 0, 1, 1, 0],
  'count': 5,
  'path': [90, 88, 84, 77, 65, 48, 26, 7, -15],
  'word': 'when'},
 {'code': [0, 0, 1, 0, 0, 0, 0, 1],
  'count': 5,
  'path': [90, 88, 84, 77, 65, 48, 25, 6, -14],
  'word': 'called'},
 {'code': [0, 0, 1, 0, 0, 0, 0, 0],
  'count': 5,
  'path': [90, 88, 84, 77, 65, 48, 25, 6, -13],
  'word': 'was'},
 {'code': [0, 0, 0, 1, 1, 1, 1, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 24, 5, -12],
  'word': 'solution'},
 {'code': [0, 0, 0, 1, 1, 1, 1, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 24, 5, -11],
  'word': 'has'},
 {'code': [0, 0, 0, 1, 1, 1, 0, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 24, 4, -10],
  'word': 'according'},
 {'code': [0, 0, 0, 1, 1, 1, 0, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 24, 4, -9],
  'word': 'ion'},
 {'code': [0, 0, 0, 1, 1, 0, 1, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 23, 3, -8],
  'word': 'substances'},
 {'code': [0, 0, 0, 1, 1, 0, 1, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 23, 3, -7],
  'word': 'all'},
 {'code': [0, 0, 0, 1, 1, 0, 0, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 23, 2, -6],
  'word': 'conjugate'},
 {'code': [0, 0, 0, 1, 1, 0, 0, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 47, 23, 2, -5],
  'word': 'these'},
 {'code': [0, 0, 0, 1, 0, 1, 1, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 46, 22, 1, -4],
  'word': 'such'},
 {'code': [0, 0, 0, 1, 0, 1, 1, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 46, 22, 1, -3],
  'word': 'sulfuric'},
 {'code': [0, 0, 0, 1, 0, 1, 0, 1],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 46, 22, 0, -2],
  'word': 'tar'},
 {'code': [0, 0, 0, 1, 0, 1, 0, 0],
  'count': 5,
  'path': [90, 88, 84, 76, 64, 46, 22, 0, -1],
  'word': 'national'}]
In [15]:
%pylab inline
g = inspect_vocab_tree(vocab)
Populating the interactive namespace from numpy and matplotlib
In [16]:
init_net()
/usr/lib/python2.7/dist-packages/numpy/ctypeslib.py:411: RuntimeWarning: Item size computed from the PEP 3118 buffer format string does not match the actual item size.
  return array(obj, copy=False)