This notebook contains the code from the blog post "How sklearn Computes TF-IDF".
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
documents = [
"低头亲吻我的左手",
"换取被宽恕的承诺",
"老旧管风琴在角落",
"一直一直一直伴奏",
]
documents = [" ".join(jieba.cut(item)) for item in documents]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\secsi\AppData\Local\Temp\jieba.cache
Loading model cost 0.909 seconds.
Prefix dict has been built succesfully.
documents
['低头 亲吻 我 的 左手', '换取 被 宽恕 的 承诺', '老旧 管风琴 在 角落', '一直 一直 一直 伴奏']
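Before working through the numbers by hand, it helps to look at the raw term counts; here is a minimal sketch using the CountVectorizer imported above (the cv and counts names are introduced just for this walkthrough, and token_pattern is relaxed so single-character tokens survive, for the reason explained before the TfidfVectorizer call further down):
cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
counts = cv.fit_transform(documents)
# Document 4: 一直 (column 0) occurs 3 times, 伴奏 (column 2) once
counts.toarray()[3]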
Take the word 一直 as an example.
# idf(一直, D): sklearn's smoothed idf is ln((1 + n) / (1 + df)) + 1,
# with n = 4 documents here and df(一直) = 1
np.log((1+4)/(1+1)) + 1
1.916290731874155
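Vectorizing the same formula over the two document frequencies that actually occur in this corpus (一直 appears in 1 document, 的 in 2) already previews both distinct values of idf_ shown further down:
n = 4  # number of documents
df = np.array([1, 2])  # document frequencies of 一直 and 的
np.log((1 + n) / (1 + df)) + 1
array([1.91629073, 1.51082562])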
# tfidf(一直, document 4, D) = tf * idf, where tf = 3 (一直 occurs three times in document 4)
3 * 1.916290731874155
5.748872195622465
# Unnormalized tf-idf vector of document 4 (一直 at index 0, 伴奏 at index 2; order matches vocabulary_ below)
no_norm = np.array([5.748872195622465, 0, 1.916290731874155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# L2-normalized tf-idf vector of document 4
normed = no_norm / np.sqrt(sum(no_norm ** 2))
normed
array([0.9486833 , 0. , 0.31622777, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
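The hand-computed vector can also be reproduced with the CountVectorizer / TfidfTransformer pair imported at the top; TfidfTransformer's defaults (smooth_idf=True, norm='l2') are exactly the smoothing and L2 normalization applied above. A minimal sketch, reusing counts from the earlier cell:
transformer = TfidfTransformer()  # defaults: smooth_idf=True, norm='l2'
tfidf = transformer.fit_transform(counts)
tfidf.toarray()[3]  # document 4
array([0.9486833 , 0.        , 0.31622777, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ])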
# By default sklearn drops some tokens even when stop_words=None: the default
# token_pattern, (?u)\b\w\w+\b, only matches tokens of two or more word
# characters, so single-character tokens such as 我 and 的 are silently discarded.
# See the discussion at https://github.com/scikit-learn/scikit-learn/issues/10756
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b', strip_accents=False)
X = vectorizer.fit_transform(documents)
vectorizer.vocabulary_
{'低头': 3, '亲吻': 1, '我': 7, '的': 10, '左手': 6, '换取': 9, '被': 13, '宽恕': 5, '承诺': 8, '老旧': 12, '管风琴': 11, '在': 4, '角落': 14, '一直': 0, '伴奏': 2}
vectorizer.idf_
array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073])
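Only 的 (index 10) appears in more than one document: with df = 2 its idf is ln(5/3) + 1 ≈ 1.5108, lower than the ln(5/2) + 1 ≈ 1.9163 shared by all the single-document terms.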
X.toarray()
array([[0.        , 0.46516193, 0.        , 0.46516193, 0.        , 0.        , 0.46516193, 0.46516193, 0.        , 0.        , 0.36673901, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        , 0.46516193, 0.        , 0.        , 0.46516193, 0.46516193, 0.36673901, 0.        , 0.        , 0.46516193, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.5       , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.5       , 0.5       , 0.        , 0.5       ],
       [0.9486833 , 0.        , 0.31622777, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ]])
vectorizer.stop_words_
set()
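An empty stop_words_ confirms nothing was filtered out. As a last sanity check, the row sklearn produced for document 4 matches the hand-computed vector:
np.allclose(X.toarray()[3], normed)
True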