Data from RusAge (https://www.kaggle.com/oldaandozerskaya/fiction-corpus-for-agebased-text-classification)
RusAge: Corpus for Age-Based Text Classification — previews of Russian fiction books with age-rating labels.
import nltk
# Download the NLTK "punkt" resource.
# NOTE(review): nothing in the visible code obviously needs it — confirm before removing.
nltk.download('punkt')
# Download the NLTK stop-word lists; stopwords.words("russian") below reads them.
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk.corpus import PlaintextCorpusReader
from nltk.text import Text
# Load every .txt file under ./rusage/abstracts/ as one UTF-8 plaintext corpus.
# The file pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid string escape.
corpus = PlaintextCorpusReader("./rusage/abstracts/", r'.*\.txt', encoding='utf-8')
corpus_tokens = corpus.words()
print(corpus_tokens[:10])

# List the files that matched the pattern.
files = corpus.fileids()
for f in files:
    print(f)

# Token frequencies over the unfiltered corpus.
freq = nltk.FreqDist(corpus_tokens)
print("Common Words:", freq.most_common(10))

# Tokens to drop during preprocessing: Russian stop words plus ASCII punctuation.
russian_stopwords = stopwords.words("russian") + list(string.punctuation)
#Preprocess function
def preprocess_text(words, stop_words=None):
    """Return the tokens of *words* that are not stop words, in original order.

    Args:
        words: Iterable of tokens to filter.
        stop_words: Optional iterable of tokens to drop. When omitted, the
            module-level ``russian_stopwords`` list is used.

    Returns:
        A new list with the surviving tokens; duplicates are kept. The
        comparison is an exact, case-sensitive match.
    """
    if stop_words is None:
        stop_words = russian_stopwords
    # Build the lookup set once: membership tests against a list are linear,
    # and this check runs for every token passed in.
    excluded = frozenset(stop_words)
    return [w for w in words if w not in excluded]
# Recount frequencies after removing stop words and punctuation.
preprocess_corpus = preprocess_text(corpus_tokens)
freq = nltk.FreqDist(preprocess_corpus)
print("Common Words:", freq.most_common(20))

# Count of one specific word form. Indexing the counter (rather than .get)
# reports 0 instead of None when the word does not occur.
print("Specific Word: ", freq["истории"])

# Show up to 20 concordance lines for 'Толстого' using the unfiltered tokens.
t = Text(corpus_tokens)
t.concordance('Толстого', lines=20)