import sys
!{sys.executable} -m pip install --user pandas numpy sklearn nltk
Requirement already satisfied: pandas in /usr/lib64/python3.8/site-packages (0.25.3) Requirement already satisfied: numpy in /usr/lib64/python3.8/site-packages (1.18.4) Requirement already satisfied: sklearn in /home/pasha/.local/lib/python3.8/site-packages (0.0) Requirement already satisfied: nltk in /home/pasha/.local/lib/python3.8/site-packages (3.4.5) Requirement already satisfied: python-dateutil>=2.6.1 in /usr/lib/python3.8/site-packages (from pandas) (2.8.0) Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3.8/site-packages (from pandas) (2020.1) Requirement already satisfied: scikit-learn in /home/pasha/.local/lib/python3.8/site-packages (from sklearn) (0.22.2.post1) Requirement already satisfied: six in /usr/lib/python3.8/site-packages (from nltk) (1.14.0) Requirement already satisfied: joblib>=0.11 in /home/pasha/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (0.14.1) Requirement already satisfied: scipy>=0.17.0 in /usr/lib64/python3.8/site-packages (from scikit-learn->sklearn) (1.4.1)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Define constants
tfidf = TfidfVectorizer(stop_words='english')
# By https://ru.stackoverflow.com/questions/995616/Как-сделать-tf-idf-для-русских-текстов
#import nltk
#from nltk.corpus import stopwords as nltk_stopwords
#nltk.download('stopwords')
#stopwords = set(nltk_stopwords.words('russian') )
#tfidf = TfidfVectorizer(stop_words=stopwords)
# Minimal example:
articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.Ru', index_col='№')
# articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.En', index_col='№')
# Full real file:
# articles = pd.read_csv('RDC-135_articles_golden_set_mapping.csv', index_col='№')
articles
Article | |
---|---|
№ | |
1 | Раз, два, три, четыре |
2 | вышел зайчик погулять |
3 | вдруг охотник выбегает |
4 | прямо в зайчик стреляет |
5 | пуляет прямо в зайчик |
tfidf_matrix = tfidf.fit_transform(articles['Article'])
tfidf_matrix.toarray() # or res.todense()
array([[0. , 0. , 0. , 0.5 , 0. , 0. , 0. , 0. , 0. , 0.5 , 0. , 0.5 , 0.5 ], [0. , 0. , 0.63907044, 0. , 0.42799292, 0. , 0.63907044, 0. , 0. , 0. , 0. , 0. , 0. ], [0.57735027, 0.57735027, 0. , 0. , 0. , 0.57735027, 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0. , 0. , 0.69015927, 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0.69015927, 0. , 0. , 0. , 0. ]])
tfidf_matrix.shape
(5, 13)
tfidf.stop_words
'english'
# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html
# The X mentioned above (tfidf_matrix in this notebook) holds the TF-IDF values of every document in the corpus. It is a large sparse matrix.
# Now,
tfidf.get_feature_names()
# this gives you the list of all tokens, n-grams or words. For the first document in your corpus, (continued further below)
['вдруг', 'выбегает', 'вышел', 'два', 'зайчик', 'охотник', 'погулять', 'прямо', 'пуляет', 'раз', 'стреляет', 'три', 'четыре']
# By https://stackoverflow.com/questions/37593293/what-is-the-simplest-way-to-get-tfidf-with-pandas-dataframe#comment72191707_37593408
# v.get_feature_names() will give you the list of feature names.
# v.vocabulary_ will give you a dict with feature names as keys and their index in the matrix produced as values.
tfidf.vocabulary_
{'раз': 9, 'два': 3, 'три': 11, 'четыре': 12, 'вышел': 2, 'зайчик': 4, 'погулять': 6, 'вдруг': 0, 'охотник': 5, 'выбегает': 1, 'прямо': 7, 'стреляет': 10, 'пуляет': 8}
# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html
# это дает вам список всех токенов или n-граммов или слов. Для первого документа в вашем корпусе,
# Позволяет распечатать их:
def print_word_ratings_by_document(doc=0):
    """Print each term of one document together with its TF-IDF weight.

    Reads the module-level ``tfidf`` vectorizer and ``tfidf_matrix``.

    Parameters
    ----------
    doc : int, default 0
        0-based row position of the document in ``tfidf_matrix``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for old releases that lack it.
    try:
        feature_names = tfidf.get_feature_names_out()
    except AttributeError:
        feature_names = tfidf.get_feature_names()
    # nonzero() on the single-row slice returns (row_indices, column_indices);
    # the column indices identify the terms that occur in this document.
    feature_index = tfidf_matrix[doc, :].nonzero()[1]
    for column in feature_index:
        print(feature_names[column], tfidf_matrix[doc, column])
# Print the term weights of every document.
# Rows of tfidf_matrix follow the row order of `articles`, so walk the rows by
# position instead of deriving the position from the '№' label (label - 1),
# which is only correct when the labels are exactly 1..n in order.
for position in range(len(articles)):
    print("### doc {}: ###".format(position))
    print_word_ratings_by_document(position)
### doc 0: ### четыре 0.5 три 0.5 два 0.5 раз 0.5 ### doc 1: ### погулять 0.6390704413963749 зайчик 0.42799292268317357 вышел 0.6390704413963749 ### doc 2: ### выбегает 0.5773502691896258 охотник 0.5773502691896258 вдруг 0.5773502691896258 ### doc 3: ### стреляет 0.6901592662889633 прямо 0.5568161504458247 зайчик 0.46220770413113277 ### doc 4: ### пуляет 0.6901592662889633 прямо 0.5568161504458247 зайчик 0.46220770413113277
# https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018
# Top 10 most popular words:
N=10
# Sum each column (one column per term), argsort the sums ascending, flatten,
# reverse to descending order and keep the first N column indices.
idx = np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N]
# Map the selected column indices back to the terms themselves.
top_10_words = np.array(tfidf.get_feature_names())[idx].tolist()
top_10_words
['зайчик', 'прямо', 'стреляет', 'пуляет', 'погулять', 'вышел', 'охотник', 'выбегает', 'вдруг', 'четыре']
tfidf_matrix.shape
(5, 13)
tfidf_matrix.todense()
matrix([[0. , 0. , 0. , 0.5 , 0. , 0. , 0. , 0. , 0. , 0.5 , 0. , 0.5 , 0.5 ], [0. , 0. , 0.63907044, 0. , 0.42799292, 0. , 0.63907044, 0. , 0. , 0. , 0. , 0. , 0. ], [0.57735027, 0.57735027, 0. , 0. , 0. , 0.57735027, 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0. , 0. , 0.69015927, 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0.69015927, 0. , 0. , 0. , 0. ]])
# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html#numpy.sum
tfidf_matrix.sum(axis=0) # Сумма по столбцам
matrix([[0.57735027, 0.57735027, 0.63907044, 0.5 , 1.35240833, 0.57735027, 0.63907044, 1.1136323 , 0.69015927, 0.5 , 0.69015927, 0.5 , 0.5 ]])
# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
tfidf_matrix.sum(axis=0).argsort(axis=1) # Возвращает *индексы*, по возрастанию значений элементов
matrix([[ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4]])
# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ravel.html
np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))
array([ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4])
np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1] # Reverse list (DESC)
array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12, 11, 9, 3])
np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N] # Take top N elements
array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])
np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[-N:][::-1]
array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])
similarities = (tfidf_matrix * tfidf_matrix.T).A[-1,:-1]
similarities
array([0. , 0.19782163, 0. , 0.52368019])
# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
# numpy.argmax(a, axis=None, out=None)
# Returns the indices of the maximum values along an axis.
max_sim_position = np.argmax(similarities)
max_sim_position
3
max_sim = max(similarities)
max_sim
0.52368018715548
similarities[max_sim_position]
0.52368018715548
tfidf_matrix.todense()
matrix([[0. , 0. , 0. , 0.5 , 0. , 0. , 0. , 0. , 0. , 0.5 , 0. , 0.5 , 0.5 ], [0. , 0. , 0.63907044, 0. , 0.42799292, 0. , 0.63907044, 0. , 0. , 0. , 0. , 0. , 0. ], [0.57735027, 0.57735027, 0. , 0. , 0. , 0.57735027, 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0. , 0. , 0.69015927, 0. , 0. ], [0. , 0. , 0. , 0. , 0.4622077 , 0. , 0. , 0.55681615, 0.69015927, 0. , 0. , 0. , 0. ]])
tfidf_matrix.shape
(5, 13)
type(tfidf_matrix)
scipy.sparse.csr.csr_matrix
mul = tfidf_matrix * tfidf_matrix.T
mul.todense()
matrix([[1. , 0. , 0. , 0. , 0. ], [0. , 1. , 0. , 0.19782163, 0.19782163], [0. , 0. , 1. , 0. , 0. ], [0. , 0.19782163, 0. , 1. , 0.52368019], [0. , 0.19782163, 0. , 0.52368019, 1. ]])
mul.todense().shape
(5, 5)
scores = mul.A
scores
array([[1. , 0. , 0. , 0. , 0. ], [0. , 1. , 0. , 0.19782163, 0.19782163], [0. , 0. , 1. , 0. , 0. ], [0. , 0.19782163, 0. , 1. , 0.52368019], [0. , 0.19782163, 0. , 0.52368019, 1. ]])
# We look for the maximum below, so rule out each document matching itself:
np.fill_diagonal(scores, -1)
scores
array([[-1. , 0. , 0. , 0. , 0. ], [ 0. , -1. , 0. , 0.19782163, 0.19782163], [ 0. , 0. , -1. , 0. , 0. ], [ 0. , 0.19782163, 0. , -1. , 0.52368019], [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])
type(scores)
numpy.ndarray
scores_df = pd.DataFrame(scores, index=articles.index)
scores_df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
№ | |||||
1 | -1.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
2 | 0.0 | -1.000000 | 0.0 | 0.197822 | 0.197822 |
3 | 0.0 | 0.000000 | -1.0 | 0.000000 | 0.000000 |
4 | 0.0 | 0.197822 | 0.0 | -1.000000 | 0.523680 |
5 | 0.0 | 0.197822 | 0.0 | 0.523680 | -1.000000 |
# By https://gist.github.com/RZachLamberty/1ed47cd0e2d0d968f7cdbd3d53a50f4c
# you can calculate cosine similarity easily given this
(tfidf_matrix @ tfidf_matrix.T).todense()
matrix([[1. , 0. , 0. , 0. , 0. ], [0. , 1. , 0. , 0.19782163, 0.19782163], [0. , 0. , 1. , 0. , 0. ], [0. , 0.19782163, 0. , 1. , 0.52368019], [0. , 0.19782163, 0. , 0.52368019, 1. ]])
# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.A.html
# Return self as an ndarray object.
# Equivalent to np.asarray(self)
mul.A
array([[1. , 0. , 0. , 0. , 0. ], [0. , 1. , 0. , 0.19782163, 0.19782163], [0. , 0. , 1. , 0. , 0. ], [0. , 0.19782163, 0. , 1. , 0.52368019], [0. , 0.19782163, 0. , 0.52368019, 1. ]])
mul.A[-1,:-1] # Last row without its last element
array([0. , 0.19782163, 0. , 0.52368019])
mul.A[-1,0:] # Последняя строка
array([0. , 0.19782163, 0. , 0.52368019, 1. ])
mul.A[-1,:-1] # Last row without its last element.
# The matrix is symmetric about its main diagonal (the two halves mirror each other) and holds the similarity of every document with every other one.
# The last element is dropped because it is the document's match with itself, when we look for the best duplicate
# of the last document in the set (this is what the get_similarities function from GOJI did).
# So what really has to be dropped is the element at the index of the document being compared!
array([0. , 0.19782163, 0. , 0.52368019])
# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!
forDocNo=1 # Expect match doc 3<>4
scores
array([[-1. , 0. , 0. , 0. , 0. ], [ 0. , -1. , 0. , 0.19782163, 0.19782163], [ 0. , 0. , -1. , 0. , 0. ], [ 0. , 0.19782163, 0. , -1. , 0.52368019], [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])
s = scores[forDocNo]
max_sim_position = np.argmax(s)
(max_sim_position, s[max_sim_position])
(3, 0.19782162617776308)
articles
# Should be matched:
# id № Dup№
# 0 1 -
# 1 2 (4 and 5)
# 2 3 -
# 3 4 5
# 4 5 4
Article | |
---|---|
№ | |
1 | Раз, два, три, четыре |
2 | вышел зайчик погулять |
3 | вдруг охотник выбегает |
4 | прямо в зайчик стреляет |
5 | пуляет прямо в зайчик |
scores
array([[-1. , 0. , 0. , 0. , 0. ], [ 0. , -1. , 0. , 0.19782163, 0.19782163], [ 0. , 0. , -1. , 0. , 0. ], [ 0. , 0.19782163, 0. , -1. , 0.52368019], [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])
scores_df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
№ | |||||
1 | -1.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
2 | 0.0 | -1.000000 | 0.0 | 0.197822 | 0.197822 |
3 | 0.0 | 0.000000 | -1.0 | 0.000000 | 0.000000 |
4 | 0.0 | 0.197822 | 0.0 | -1.000000 | 0.523680 |
5 | 0.0 | 0.197822 | 0.0 | 0.523680 | -1.000000 |
scores_df.loc[3]
0 0.0 1 0.0 2 -1.0 3 0.0 4 0.0 Name: 3, dtype: float64
scores_df.iloc[3]
0 0.000000 1 0.197822 2 0.000000 3 -1.000000 4 0.523680 Name: 4, dtype: float64
# By https://stackoverflow.com/questions/26658240/getting-the-index-of-a-row-in-a-pandas-apply-function/48819898#48819898
# index available as row.name
def most_similar(row):
    """Find the document most similar to the one described by ``row``.

    Looks up the row of the module-level ``scores_df`` labelled ``row.name``.

    Parameters
    ----------
    row : pandas.Series
        One row of a frame as passed by ``DataFrame.apply(axis=1)``; only its
        ``name`` (the index label) is used.

    Returns
    -------
    tuple
        ``(label, score)`` where ``score`` is the highest value in the looked-up
        row and ``label`` is the ``scores_df`` index label of the document at
        that column, or ``-1`` when the score is not greater than zero.
    """
    own_scores = scores_df.loc[row.name]
    best_score = own_scores.max()
    # idxmax() yields a column label of scores_df; the columns are the default
    # 0..n-1 range, so the label doubles as a position into the index.
    best_doc = scores_df.index[own_scores.idxmax()]
    if best_score > 0:
        return (best_doc, best_score)
    return (-1, best_score)
# articles['Max DUP score docId'] = articles.apply(lambda i: np.argmax(scores), axis=1)
# articles['Max DUP score docId'] = articles.apply(lambda i: type(i.index), axis=1)
# most_similar returns a (doc id, score) tuple for each row; result_type='expand'
# spreads that tuple over the two new columns.
articles[['Max DUP score docId', 'Max TF/IDF DUP score']] = articles.apply(most_similar, axis=1, result_type='expand')
# Store the doc id column as int32.
# NOTE(review): presumably the expanded column comes back as float - confirm.
articles['Max DUP score docId'] = articles['Max DUP score docId'].astype('int32')
articles
Article | Max DUP score docId | Max TF/IDF DUP score | |
---|---|---|---|
№ | |||
1 | Раз, два, три, четыре | -1 | 0.000000 |
2 | вышел зайчик погулять | 4 | 0.197822 |
3 | вдруг охотник выбегает | -1 | 0.000000 |
4 | прямо в зайчик стреляет | 5 | 0.523680 |
5 | пуляет прямо в зайчик | 4 | 0.523680 |
( scores_df[3], scores_df.loc[3] )
(№ 1 0.000000 2 0.197822 3 0.000000 4 -1.000000 5 0.523680 Name: 3, dtype: float64, 0 0.0 1 0.0 2 -1.0 3 0.0 4 0.0 Name: 3, dtype: float64)
By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#52363890, and by https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#comment106834440_16242202 (for column naming)
# articles[['a', 'b']] = articles.apply(lambda i: [1, 2], axis=1, result_type='expand')
# articles
def dump_xml_file(row):
    """Write one article's text to ``articles/<file name>`` as a ``.txt`` file.

    Parameters
    ----------
    row : mapping-like
        For example a pandas Series passed by ``DataFrame.apply(axis=1)``.
        Must provide ``'File name'`` (every ``.xml`` in it is replaced by
        ``.txt``) and ``'Article'`` (the text to write).

    Raises
    ------
    KeyError
        If ``row`` has no ``'File name'`` or ``'Article'`` entry.
    """
    import os  # local import keeps this cell self-contained

    file_name = row['File name']
    print(file_name)
    target = 'articles/' + file_name.replace('.xml', '.txt')
    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    # Explicit UTF-8: the articles contain Cyrillic text and the platform
    # default encoding may not be able to represent it. Plain 'w' is enough
    # because the file is never read back here.
    with open(target, 'w', encoding='utf-8') as file:
        file.write(row['Article'])
# Dump every article to its own text file.
# The minimal sample has no 'File name' column, which made this cell fail with
# KeyError: 'File name'. NOTE(review): presumably only the full data set
# carries that column - confirm against the real CSV.
if 'File name' in articles.columns:
    # Plain loop instead of DataFrame.apply: the call is made purely for its
    # side effect (writing files), so no result object is needed.
    for _, article_row in articles.iterrows():
        dump_xml_file(article_row)
else:
    print("No 'File name' column in articles - nothing to dump")
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) /usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py in get_value(self, series, key) 4735 try: -> 4736 return libindex.get_value_box(s, key) 4737 except IndexError: pandas/_libs/index.pyx in pandas._libs.index.get_value_box() pandas/_libs/index.pyx in pandas._libs.index.get_value_at() pandas/_libs/util.pxd in pandas._libs.util.get_value_at() pandas/_libs/util.pxd in pandas._libs.util.validate_indexer() TypeError: 'str' object cannot be interpreted as an integer During handling of the above exception, another exception occurred: KeyError Traceback (most recent call last) <ipython-input-51-38f66aea0c91> in <module> 5 file.write(row['Article']) 6 ----> 7 articles.apply(dump_xml_file, axis=1) /usr/lib64/python3.8/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds) 6926 kwds=kwds, 6927 ) -> 6928 return op.get_result() 6929 6930 def applymap(self, func): /usr/lib64/python3.8/site-packages/pandas/core/apply.py in get_result(self) 184 return self.apply_raw() 185 --> 186 return self.apply_standard() 187 188 def apply_empty_result(self): /usr/lib64/python3.8/site-packages/pandas/core/apply.py in apply_standard(self) 290 291 # compute the result using the series generator --> 292 self.apply_series_generator() 293 294 # wrap results /usr/lib64/python3.8/site-packages/pandas/core/apply.py in apply_series_generator(self) 319 try: 320 for i, v in enumerate(series_gen): --> 321 results[i] = self.f(v) 322 keys.append(v.name) 323 except Exception as e: <ipython-input-51-38f66aea0c91> in dump_xml_file(row) 1 def dump_xml_file(row): 2 print(type(row)) ----> 3 print(row['File name']) 4 with open('articles/' + row['File name'].replace('.xml', '.txt'), 'w+') as file: 5 file.write(row['Article']) /usr/lib64/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key) 1069 key = 
com.apply_if_callable(key, self) 1070 try: -> 1071 result = self.index.get_value(self, key) 1072 1073 if not is_scalar(result): /usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py in get_value(self, series, key) 4742 raise InvalidIndexError(key) 4743 else: -> 4744 raise e1 4745 except Exception: # pragma: no cover 4746 raise e1 /usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py in get_value(self, series, key) 4728 k = self._convert_scalar_indexer(k, kind="getitem") 4729 try: -> 4730 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) 4731 except KeyError as e1: 4732 if len(self) > 0 and (self.holds_integer() or self.is_boolean()): pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: ('File name', 'occurred at index 1')