In [1]:

# Small example corpora used throughout the notebook: three Bengali sentences
# and three English sentences.
sentence_bn = [
    'আমরা আজ ল্যাব ক্লাস এ নিউরাল নেটওয়ার্ক প্রয়োগ করব',
    'আমাদের প্রোগ্রামিং অনেক ভালো লাগে',
    'আমরা সবাই এই কোর্সে ভালো মার্ক পেতে চাই',
]
# 'programking' (sic) is kept as-is: the recorded vocabulary outputs contain this token.
sentence_en = [
    'We do not know how to implement a neural network',
    'We don\'t like programking at all',
    'We are in great doubt about our marks in this course',
]

In [2]:

import nltk
import warnings

# Install a global "ignore" filter: every Python warning raised from here on
# (including library deprecation warnings) is silenced for the whole session.
warnings.filterwarnings("ignore")
# Fetch the 'punkt' tokenizer data through NLTK's downloader.
# NOTE(review): presumably required by word_tokenize in later cells — confirm
# the resource name against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

Out[2]:

True

In [3]:

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the English sentences and build the document-term
# count matrix: one row per sentence, one column per vocabulary term.
vec = CountVectorizer()
model = vec.fit_transform(sentence_en)

# Terms in column order of `model`. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so .tolist() keeps vocab_list a plain
# list of Python strings (and the printed output a plain list).
vocab_list = vec.get_feature_names_out().tolist()
print(vocab_list)

# Corpus-wide frequency of each term: sum the per-sentence counts over rows.
count_list = model.toarray().sum(axis=0)

['about', 'all', 'are', 'at', 'course', 'do', 'don', 'doubt', 'great', 'how', 'implement', 'in', 'know', 'like', 'marks', 'network', 'neural', 'not', 'our', 'programking', 'this', 'to', 'we']

In [4]:

# Map each vocabulary term to its corpus-wide count. int() converts the NumPy
# integer scalars to Python ints so the dict prints as {'about': 1, ...} on
# every NumPy version (NumPy 2 renders scalars as np.int64(1)).
print({term: int(count) for term, count in zip(vocab_list, count_list)})

{'about': 1, 'all': 1, 'are': 1, 'at': 1, 'course': 1, 'do': 1, 'don': 1, 'doubt': 1, 'great': 1, 'how': 1, 'implement': 1, 'in': 2, 'know': 1, 'like': 1, 'marks': 1, 'network': 1, 'neural': 1, 'not': 1, 'our': 1, 'programking': 1, 'this': 1, 'to': 1, 'we': 3}

In [5]:

from nltk import word_tokenize

# Build the Bengali vocabulary with NLTK's word_tokenize instead of
# CountVectorizer's built-in regex tokenizer. token_pattern=None states
# explicitly that the default regex is unused when a custom tokenizer is given
# (otherwise scikit-learn emits a warning about the ignored pattern).
vec_new = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
vec_new.fit(sentence_bn)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns a NumPy array, hence .tolist() for a plain list.
vocab_list_bn = vec_new.get_feature_names_out().tolist()
print(vocab_list_bn)
# Term -> column-index mapping learned by fit().
print(vec_new.vocabulary_)

['অনেক', 'আজ', 'আমরা', 'আমাদের', 'এ', 'এই', 'করব', 'কোর্সে', 'ক্লাস', 'চাই', 'নিউরাল', 'নেটওয়ার্ক', 'পেতে', 'প্রোগ্রামিং', 'প্রয়োগ', 'ভালো', 'মার্ক', 'লাগে', 'ল্যাব', 'সবাই']
{'আমরা': 2, 'আজ': 1, 'ল্যাব': 18, 'ক্লাস': 8, 'এ': 4, 'নিউরাল': 10, 'নেটওয়ার্ক': 11, 'প্রয়োগ': 14, 'করব': 6, 'আমাদের': 3, 'প্রোগ্রামিং': 13, 'অনেক': 0, 'ভালো': 15, 'লাগে': 17, 'সবাই': 19, 'এই': 5, 'কোর্সে': 7, 'মার্ক': 16, 'পেতে': 12, 'চাই': 9}

In [6]:

# Tokenization

# Sentences to tokenize: the Bengali corpus plus one additional sentence.
new_sentence = sentence_bn + ['এইখানে আমরা নতুন আরো একটি বাক্য রাখলাম']

# One token list per sentence, in the same order as new_sentence.
tokenized_bn = [word_tokenize(sentence) for sentence in new_sentence]

print(tokenized_bn)

[['আমরা', 'আজ', 'ল্যাব', 'ক্লাস', 'এ', 'নিউরাল', 'নেটওয়ার্ক', 'প্রয়োগ', 'করব'], ['আমাদের', 'প্রোগ্রামিং', 'অনেক', 'ভালো', 'লাগে'], ['আমরা', 'সবাই', 'এই', 'কোর্সে', 'ভালো', 'মার্ক', 'পেতে', 'চাই'], ['এইখানে', 'আমরা', 'নতুন', 'আরো', 'একটি', 'বাক্য', 'রাখলাম']]

In [8]:

from sklearn.preprocessing import LabelEncoder

# Student names to encode; 'নয়ন' appears twice, so both occurrences receive
# the same integer label.
std_names = [
    'জিয়াদ',
    'নয়ন',
    'কিশোর',
    'বিলাস',
    'ইমতিয়াজুল',
    'মনিরুল',
    'নয়ন',
]

# Fit the encoder on the names and map each one to its integer class label.
encoder = LabelEncoder()
name_labels = encoder.fit_transform(std_names)

# dense representation: one integer per name
print(name_labels)

[2 3 1 4 0 5 3]

In [9]:

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# OneHotEncoder expects 2-D input of shape (n_samples, n_features), so turn the
# label vector into a single column. -1 lets NumPy infer the number of rows
# from the data, so the cell keeps working if the list of names changes.
name_labels = name_labels.reshape(-1, 1)

# fit_transform returns a SciPy sparse matrix; toarray() densifies it so the
# one-hot rows can be printed.
sparse_rep = encoder.fit_transform(name_labels).toarray()
print(sparse_rep)

[[0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]]