RusAge

Data from RusAge (https://www.kaggle.com/oldaandozerskaya/fiction-corpus-for-agebased-text-classification)

RusAge: Corpus for Age-Based Text Classification. It contains previews of Russian fiction books, each labeled with an age rating.

In [ ]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk.corpus import PlaintextCorpusReader
from nltk.text import Text

In [ ]:
# Load every .txt file under the abstracts directory as one plain-text corpus.
# The file-id pattern is a regular expression, so it is written as a raw
# string: the backslash stays literal without relying on an unrecognized
# string escape, which newer Python versions warn about.
corpus = PlaintextCorpusReader("./rusage/abstracts/", r'.*\.txt', encoding='utf-8')
corpus_tokens = corpus.words()
# Peek at the first ten tokens as a quick sanity check.
print(corpus_tokens[:10])
In [ ]:
# Identifiers of all documents the corpus reader picked up.
files = corpus.fileids()

# Print one identifier per line.
for file_id in files:
    print(file_id)
In [ ]:
# Frequency table over every raw token in the corpus (nothing filtered yet).
raw_words = corpus.words()
freq = nltk.FreqDist(raw_words)

# Show the ten most frequent tokens together with their counts.
print("Common Words:", freq.most_common(10))
In [ ]:
# Tokens to discard during preprocessing: NLTK's Russian stop-word list
# followed by each character of string.punctuation.
russian_stopwords = [*stopwords.words("russian"), *string.punctuation]

#Preprocess function
def preprocess_text(words, stop_words=None):
    """Return the tokens of *words* that are not stop words.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional iterable of tokens to discard. When omitted,
            the module-level ``russian_stopwords`` collection is used.

    Returns:
        A new list holding the surviving tokens in their original order.

    Note:
        Tokens are compared exactly as given (case-sensitive), so a token
        that differs from a stop word only in letter case is kept.
    """
    if stop_words is None:
        stop_words = russian_stopwords

    # Build the lookup set once: membership tests against a list cost
    # O(len(list)) per token, which adds up over an entire corpus.
    excluded = set(stop_words)

    return [w for w in words if w not in excluded]


# Filter the full corpus token stream through preprocess_text.
preprocess_corpus = preprocess_text(words=corpus.words())
In [ ]:
# Frequency table over the preprocessed (filtered) token list.
freq = nltk.FreqDist(preprocess_corpus)

# Show the twenty most frequent remaining tokens with their counts.
print("Common Words:", freq.most_common(20))

# Count for one specific word form. Indexing the distribution (instead of
# calling .get) reports 0 rather than None when the word never occurs.
print("Specific Word: ", freq["истории"])
In [ ]:
# Wrap the raw token sequence in an NLTK Text object for exploration.
t = Text(corpus_tokens)

# Print up to 20 concordance lines (occurrences in context) of this word form.
t.concordance(word="Толстого", lines=20)