In [1]:

import io
import re
import warnings

import artm

warnings.filterwarnings('ignore')

This is how to build a vocab file from a collection in Vowpal Wabbit format

In [2]:

# Folder passed to BigARTM as the batch destination, and the path of the
# input collection in Vowpal Wabbit format.
target_folder = 'batches'
vw_file_path = 'vw.txt'

In [3]:

# Build a BatchVectorizer over the Vowpal Wabbit collection; target_folder is
# handed over as the destination folder for the batches.
batch_vectorizer = artm.BatchVectorizer(
    target_folder=target_folder,
    data_format='vowpal_wabbit',
    data_path=vw_file_path,
)

In [4]:

# Create an empty dictionary, then gather it from the data stored in
# target_folder (the folder given to the BatchVectorizer above).
dictionary = artm.Dictionary()
dictionary.gather(data_path=target_folder)

In [5]:

# The text dump of the dictionary lives next to the batches.
dictionary_path = '/'.join([target_folder, 'dictionary.txt'])

In [6]:

# Write the gathered dictionary as a text file at dictionary_path.
dictionary.save_text(dictionary_path=dictionary_path)

In [7]:

# Convert the text dump of the dictionary into a vocab file: for every entry
# keep only its first two comma-separated fields, joined by a single space.
vocab_path = target_folder + '/vocab.txt'

# save_text() was called with its default encoding (UTF-8 per the BigARTM
# docs), so both files are opened as UTF-8 explicitly rather than with the
# platform default, which breaks on non-ASCII tokens under non-UTF-8 locales.
# io.open accepts `encoding` on Python 2 and Python 3 alike.
with io.open(dictionary_path, 'r', encoding='utf-8') as dictionary_file, \
        io.open(vocab_path, 'w', encoding='utf-8') as vocab_file:
    # The first two lines of the dump are header lines, not entries.
    dictionary_file.readline()
    dictionary_file.readline()
    for line in dictionary_file:
        # Drop the line ending up front so a short line cannot smuggle a
        # newline into the output, and skip blank lines entirely.
        entry = line.rstrip('\r\n')
        if not entry:
            continue
        # Only the first two fields are needed; the separator is a literal,
        # so a plain string split is enough.
        fields = entry.split(', ', 2)
        vocab_file.write(' '.join(fields[:2]) + '\n')

In [ ]: