This notebook contains the code from the blog post "How sklearn Computes TF-IDF".
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
documents = [
"低头亲吻我的左手",
"换取被宽恕的承诺",
"老旧管风琴在角落",
"一直一直一直伴奏",
]
documents = [" ".join(jieba.cut(item)) for item in documents]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\secsi\AppData\Local\Temp\jieba.cache
Loading model cost 0.909 seconds.
Prefix dict has been built succesfully.
documents
['低头 亲吻 我 的 左手', '换取 被 宽恕 的 承诺', '老旧 管风琴 在 角落', '一直 一直 一直 伴奏']
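Before working through the numbers by hand, it helps to look at the raw term counts; here is a minimal sketch using the CountVectorizer imported above (the cv and counts names are introduced just for this walkthrough, and token_pattern is relaxed so single-character tokens survive, for the reason explained before the TfidfVectorizer call further down):
cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
counts = cv.fit_transform(documents)
# Document 4: 一直 (column 0) occurs 3 times, 伴奏 (column 2) once
counts.toarray()[3]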
Take the word 一直 as an example.
# idf(一直, D): sklearn's smoothed idf is ln((1 + n) / (1 + df)) + 1,
# with n = 4 documents here and df(一直) = 1
np.log((1+4)/(1+1)) + 1
1.916290731874155
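Vectorizing the same formula over the two document frequencies that actually occur in this corpus (一直 appears in 1 document, 的 in 2) already previews both distinct values of idf_ shown further down:
n = 4  # number of documents
df = np.array([1, 2])  # document frequencies of 一直 and 的
np.log((1 + n) / (1 + df)) + 1
array([1.91629073, 1.51082562])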
# tfidf(一直, document 4, D) = tf * idf, where tf = 3 (一直 occurs three times in document 4)
3 * 1.916290731874155
5.748872195622465
# Unnormalized tf-idf vector of document 4 (一直 at index 0, 伴奏 at index 2; order matches vocabulary_ below)
no_norm = np.array([5.748872195622465, 0, 1.916290731874155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# L2-normalized tf-idf vector of document 4
normed = no_norm / np.sqrt(sum(no_norm ** 2))
normed
array([0.9486833 , 0. , 0.31622777, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
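The hand-computed vector can also be reproduced with the CountVectorizer / TfidfTransformer pair imported at the top; TfidfTransformer's defaults (smooth_idf=True, norm='l2') are exactly the smoothing and L2 normalization applied above. A minimal sketch, reusing counts from the earlier cell:
transformer = TfidfTransformer()  # defaults: smooth_idf=True, norm='l2'
tfidf = transformer.fit_transform(counts)
tfidf.toarray()[3]  # document 4
array([0.9486833 , 0.        , 0.31622777, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ])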
# By default sklearn drops some tokens even when stop_words=None: the default
# token_pattern, (?u)\b\w\w+\b, only matches tokens of two or more word
# characters, so single-character tokens such as 我 and 的 are silently discarded.
# See the discussion at https://github.com/scikit-learn/scikit-learn/issues/10756
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b', strip_accents=False)
X = vectorizer.fit_transform(documents)
vectorizer.vocabulary_
{'低头': 3, '亲吻': 1, '我': 7, '的': 10, '左手': 6, '换取': 9, '被': 13, '宽恕': 5, '承诺': 8, '老旧': 12, '管风琴': 11, '在': 4, '角落': 14, '一直': 0, '伴奏': 2}
vectorizer.idf_
array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073])
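Only 的 (index 10) appears in more than one document: with df = 2 its idf is ln(5/3) + 1 ≈ 1.5108, lower than the ln(5/2) + 1 ≈ 1.9163 shared by all the single-document terms.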
X.toarray()
array([[0.        , 0.46516193, 0.        , 0.46516193, 0.        , 0.        , 0.46516193, 0.46516193, 0.        , 0.        , 0.36673901, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        , 0.46516193, 0.        , 0.        , 0.46516193, 0.46516193, 0.36673901, 0.        , 0.        , 0.46516193, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.5       , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.5       , 0.5       , 0.        , 0.5       ],
       [0.9486833 , 0.        , 0.31622777, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ]])
vectorizer.stop_words_
set()
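An empty stop_words_ confirms nothing was filtered out. As a last sanity check, the row sklearn produced for document 4 matches the hand-computed vector:
np.allclose(X.toarray()[3], normed)
True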