In [1]:
import io
import re
import warnings

import artm

warnings.filterwarnings('ignore')

This notebook shows how to build a vocabulary (vocab) file from a collection stored in Vowpal Wabbit format.

In [2]:
# Folder that the later cells read batches from and write the dictionary and
# vocab files into.
target_folder = 'batches'
# Input collection file, passed to the batch vectorizer as Vowpal Wabbit data.
vw_file_path = 'vw.txt'
In [3]:
# Hand the Vowpal Wabbit file to BigARTM's batch vectorizer, pointing it at
# target_folder for the batches (the dictionary is gathered from that same
# folder in a later cell).
batch_vectorizer = artm.BatchVectorizer(
    data_path=vw_file_path,
    data_format='vowpal_wabbit',
    target_folder=target_folder,
)
In [4]:
# Create a dictionary object and gather its contents from target_folder,
# the folder the batch vectorizer was pointed at.
# NOTE(review): per the BigARTM docs gather() collects tokens from the batches
# in that folder -- confirm for the installed version.
dictionary = artm.Dictionary()
dictionary.gather(data_path=target_folder)
In [5]:
# Location of the text dump of the dictionary, kept inside target_folder.
dictionary_path = target_folder + '/dictionary.txt'
In [6]:
# Write the dictionary to dictionary_path as text; the vocab cell reads this
# file back line by line.
dictionary.save_text(dictionary_path=dictionary_path)
In [7]:
# Build the vocab file: for every dictionary record keep only its first two
# fields (presumably token and class_id -- verify against the BigARTM text
# dictionary format) and write them space-separated, one record per line.
vocab_path = target_folder + '/vocab.txt'

# io.open accepts `encoding` on both Python 2 and 3. Without an explicit
# encoding the files are decoded with the locale's default codec, which can
# corrupt or reject non-ASCII tokens on non-UTF-8 systems.
# NOTE(review): assumes save_text() emits UTF-8 -- confirm against BigARTM docs.
with io.open(dictionary_path, 'r', encoding='utf-8') as dictionary_file, \
        io.open(vocab_path, 'w', encoding='utf-8') as vocab_file:
    # The first two lines are not token records (presumably the dictionary
    # summary line and the column header), so discard them.
    dictionary_file.readline()
    dictionary_file.readline()
    for line in dictionary_file:
        # Drop the line terminator so it cannot leak into the last kept field.
        record = line.rstrip('\n')
        if not record:
            # Skip blank lines instead of emitting empty vocab entries.
            continue
        # Records are ', '-separated; a plain str.split is enough for a
        # constant separator.
        fields = record.split(', ')
        vocab_file.write(' '.join(fields[:2]) + '\n')
In [ ]: