🚩 Create a free WhyLabs account to get more value out of whylogs!
Did you know you can store, visualize, and monitor whylogs profiles with the WhyLabs Observability Platform? Sign up for a free WhyLabs account to leverage the power of whylogs and WhyLabs together!
In this example, we'll look at how we might use whylogs to monitor a document summarization task.
We'll use NLTK and BeautifulSoup to do some of the basic NLP tasks, so let's install the packages we'll need now.
%pip install nltk
%pip install bs4
%pip install whylogs[embeddings]
We'll use the NLTK Reuters corpus as the documents to summarize. As a trivial summarization algorithm, we'll pull out the sentence that contains a document's highest log-entropy weighted term and use it as the summary. Let's start by computing the term-frequency index for the corpus, along with each term's global frequency and entropy. We'll use NLTK's stemming, stopping, and tokenization for those calculations, but return the unaltered sentence as the summary.
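In matrix terms, if $A$ is the term-document count matrix we build below (one row per stemmed term, one column for each of the $N$ training documents), the weighting this section computes is

$$
gf_i = \sum_j A_{ij}, \qquad
p_{ij} = \frac{A_{ij}}{gf_i}, \qquad
g_i = 1 + \frac{1}{\log N} \sum_j p_{ij} \log p_{ij},
$$

and each term $i$ in document $j$ then gets the log-entropy weight $w_{ij} = g_i \log(A_{ij} + 1)$.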
from typing import Any, Dict, List, Optional, Set
import nltk
import numpy as np
from nltk.corpus import reuters
from bs4 import BeautifulSoup
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')
STEMMER = nltk.stem.PorterStemmer()
# the NLTK tokenizer produces some junk tokens, so add them to the stopwords
STOPWORDS = set(nltk.corpus.stopwords.words("english") + [
    ".",
    ",",
    "<",
    ">",
    "'s",
    "''",
    "``",
])
def delete_headline(text: str) -> str:
    '''
    NLTK's sentence tokenizer includes the headline in the first sentence
    if we don't manually exclude it.
    '''
    lines = text.split("\n")
    return "\n".join(lines[1:]) if len(lines) > 1 else text
def global_freq(A: np.ndarray) -> np.ndarray:
    '''Sum the columns of the term-frequency index to get term global frequencies'''
    gf = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            gf[i] += A[i, j]
    return gf
def entropy(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    '''Compute the term entropy'''
    g = np.zeros(A.shape[0])
    logN = np.log(A.shape[1])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            p_ij = A[i, j] / gf[i]
            g[i] += p_ij * np.log(p_ij) if p_ij != 0 else 0
        g[i] = 1 + g[i] / logN
    return g
def get_raw_tokens(file) -> List[str]:
    '''
    The raw NLTK documents contain a few HTML entities, so we'll use BeautifulSoup
    to decode them, then apply the NLTK word tokenizer. Skip the headline.
    '''
    raw = BeautifulSoup(delete_headline(reuters.raw(file)), "html.parser").get_text()
    return [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]
def get_vocabulary(file) -> Set[str]:
    '''
    Returns the set of stemmed terms in the specified Reuters article (excluding headline).
    '''
    tokens = get_raw_tokens(file)  # already casefolded and stopped
    return {STEMMER.stem(t) for t in tokens}
file_ids = reuters.fileids()
train_files = [id for id in file_ids if id.startswith("train")][:500]
vocab: Set[str] = set()
for file in train_files:
    vocab.update(get_vocabulary(file))
ndocs = len(train_files)
vocab_size = len(vocab)
print(f"{ndocs} articles {vocab_size} vocabulary")
500 articles 6275 vocabulary
It will also be handy to have mappings back and forth between each term (as a string) and the term's row in the term-frequency matrix. Let's build those up.
vocab_map: Dict[str, int] = dict()
rev_map: List[str] = [''] * vocab_size
for i, t in enumerate(vocab):
    vocab_map[t] = i
    rev_map[i] = t

index = np.zeros((vocab_size, ndocs))
for col, id in enumerate(train_files):
    tokens = get_raw_tokens(id)
    stemmed = [STEMMER.stem(t) for t in tokens]
    for term in stemmed:
        index[vocab_map[term], col] += 1
gf = global_freq(index)
g = entropy(index, gf)
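For reference, here's an optional vectorized sketch of the same two calculations (the names global_freq_vec and entropy_vec are ours, not part of the example); it should produce the same gf and g arrays as the loops above, just faster.

def global_freq_vec(A: np.ndarray) -> np.ndarray:
    '''Vectorized equivalent of global_freq(): sum term counts across documents.'''
    return A.sum(axis=1)

def entropy_vec(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    '''Vectorized equivalent of entropy().'''
    p = A / gf[:, np.newaxis]
    with np.errstate(divide="ignore", invalid="ignore"):
        plogp = np.where(p > 0, p * np.log(p), 0.0)
    return 1.0 + plogp.sum(axis=1) / np.log(A.shape[1])

assert np.allclose(global_freq_vec(index), gf)
assert np.allclose(entropy_vec(index, gf), g)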
Now we have the inputs we need to compute the term weights, so we can implement our summarization algorithm. But since we want to monitor our summarization process with whylogs, we'll need to do a little whylogs setup before we start summarizing.
By default, whylogs uses a TransientLogger that produces a new profile for every log() call. For our example, it's nicer to aggregate all the logging into a single profile, so we'll create a simple PersistentLogger to do that.
from whylogs.api.logger.logger import Logger
from whylogs.core import DatasetProfile, DatasetSchema
from whylogs.core.configs import SummaryConfig
from whylogs.core.dataset_profile import logger as dp_logger  # we'll raise its log level below, since it complains about vector values
from whylogs.core.preprocessing import ListView, PreprocessedColumn
from whylogs.core.resolvers import MetricSpec, ResolverSpec, STANDARD_RESOLVER
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.stubs import pd
from whylogs.core.view.column_profile_view import ColumnProfileView
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric
class PersistentLogger(Logger):
    def __init__(self, schema: Optional[DatasetSchema] = None):
        super().__init__(schema)
        self._current_profile = DatasetProfile(schema=self._schema)

    def _get_matching_profiles(
        self,
        obj: Any = None,
        *,
        pandas: Optional[pd.DataFrame] = None,
        row: Optional[Dict[str, Any]] = None,
        schema: Optional[DatasetSchema] = None,
    ) -> List[DatasetProfile]:
        if schema and schema is not self._schema:
            raise ValueError(
                "You cannot pass a DatasetSchema to an instance of PersistentLogger.log() "
                "because the schema is set once at instantiation; use TimedRollingLogger(schema) instead."
            )
        return [self._current_profile]
We also need to attach the BagOfWordsMetric to the columns that represent our input articles and output summaries. We log each document as a list of its tokens.
from logging import ERROR
dp_logger.setLevel(ERROR)
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(
        column_name="article_bow",
        metrics=[MetricSpec(BagOfWordsMetric)],
    ),
    ResolverSpec(
        column_name="summary_bow",
        metrics=[MetricSpec(BagOfWordsMetric)],
    ),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)
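As a quick, optional sanity check, logging a single token list should update the BagOfWordsMetric on the matching column. The sketch below uses a separate throwaway logger (rather than why) so the aggregated profile we build next isn't polluted; the tokens are made up for illustration.

check_logger = PersistentLogger(schema=schema)
check_profile = check_logger.log(obj={"article_bow": ["interest", "rates", "rose", "sharply"]})
# doc_length's count tracks how many documents were logged to this column, so we expect 1 here
print(check_profile.view().get_column("article_bow").to_summary_dict()["nlp_bow/doc_length:counts/n"])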
Now we're finally ready to do some summarization! We'll compute the log entropy weighted term vector for each article as a whole, then use NLTK's sentence tokenizer to split it into sentences. The first sentence that contains the word with the highest weight in the document will be our summary.
profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    # print(raw.split('\n')[0])  # print article headline
    # print(raw)  # print the whole input article
    raw = delete_headline(raw)
    tokens = [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]
    stemmed = [STEMMER.stem(t) for t in tokens]

    # build the article's log-entropy weighted term vector and track its highest-weighted term
    doc_vec = np.zeros(vocab_size)
    for term in stemmed:
        doc_vec[vocab_map[term]] += 1
    max_weight = -1
    max_term = ""
    for i in range(vocab_size):
        doc_vec[i] = g[i] * np.log(doc_vec[i] + 1.0)
        if doc_vec[i] > max_weight:
            max_weight = doc_vec[i]
            max_term = rev_map[i]

    # the first sentence containing the highest-weighted term becomes the summary
    sentences = nltk.sent_tokenize(raw)
    max_sentence = ""
    for sentence in sentences:
        tokenized = [t.casefold() for t in nltk.word_tokenize(sentence) if t.casefold() not in STOPWORDS]
        stemmed = [STEMMER.stem(t) for t in tokenized]
        if max_term in stemmed:
            max_sentence = sentence
            profile = why.log(obj={"article_bow": tokens, "summary_bow": tokenized})
            break
    # max_sentence = max_sentence.replace("\n", " ")
    # print(f"{max_weight} {max_term}: {max_sentence}")
We've logged the full articles as the article_bow column and the summaries as the summary_bow column. Now let's grab the profile from the logger and take a look at it.
def dump_summary(view: ColumnProfileView) -> None:
    summary = view.to_summary_dict()
    keys = [
        "nlp_bow/doc_length:counts/n",
        "nlp_bow/doc_length:distribution/mean",
        "nlp_bow/doc_length:distribution/stddev",
        "nlp_bow/doc_length:distribution/max",
        "nlp_bow/doc_length:distribution/min",
        "nlp_bow/doc_length:distribution/median",
        "nlp_bow/term_length:counts/n",
        "nlp_bow/term_length:distribution/mean",
        "nlp_bow/term_length:distribution/stddev",
        "nlp_bow/term_length:distribution/max",
        "nlp_bow/term_length:distribution/min",
        "nlp_bow/term_length:distribution/median",
    ]
    for key in keys:
        print(f" {key}: {summary[key]}")
    print(f" frequent terms: {[t.value for t in summary['nlp_bow/frequent_terms:frequent_items/frequent_strings'][:10]]}")
view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
article_bow:
 nlp_bow/doc_length:counts/n: 500
 nlp_bow/doc_length:distribution/mean: 88.38000000000004
 nlp_bow/doc_length:distribution/stddev: 89.40470907065252
 nlp_bow/doc_length:distribution/max: 504.0
 nlp_bow/doc_length:distribution/min: 1.0
 nlp_bow/doc_length:distribution/median: 59.0
 nlp_bow/term_length:counts/n: 44190
 nlp_bow/term_length:distribution/mean: 5.906223127404392
 nlp_bow/term_length:distribution/stddev: 2.5306350762162584
 nlp_bow/term_length:distribution/max: 24.0
 nlp_bow/term_length:distribution/min: 1.0
 nlp_bow/term_length:distribution/median: 6.0
 frequent terms: ['said', 'mln', 'dlrs', 'pct', 'vs', 'billion', 'year', 'cts', 'would', 'u.s.']

summary_bow:
 nlp_bow/doc_length:counts/n: 500
 nlp_bow/doc_length:distribution/mean: 21.554000000000002
 nlp_bow/doc_length:distribution/stddev: 14.143095074153782
 nlp_bow/doc_length:distribution/max: 176.0
 nlp_bow/doc_length:distribution/min: 1.0
 nlp_bow/doc_length:distribution/median: 18.0
 nlp_bow/term_length:counts/n: 10777
 nlp_bow/term_length:distribution/mean: 5.419690080727475
 nlp_bow/term_length:distribution/stddev: 2.5998033619617535
 nlp_bow/term_length:distribution/max: 21.0
 nlp_bow/term_length:distribution/min: 1.0
 nlp_bow/term_length:distribution/median: 5.0
 frequent terms: ['vs', 'mln', 'said', 'cts', 'loss', 'net', 'dlrs', 'shr', 'inc', 'billion']
As expected, the summary documents are much shorter than the original articles, and there is both overlap and difference in the most frequent terms of the articles and the summaries. As a second experiment, let's profile individual sentences instead of whole documents: we'll log each article sentence as one column (original_bow) and the same sentences randomly split at a comma as another (split_bow), then compare the resulting bag-of-words metrics.
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(
        column_name="original_bow",
        metrics=[MetricSpec(BagOfWordsMetric)],
    ),
    ResolverSpec(
        column_name="split_bow",
        metrics=[MetricSpec(BagOfWordsMetric)],
    ),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)
import random

profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    raw = delete_headline(raw)
    sentences = nltk.sent_tokenize(raw)
    for sentence in sentences:
        tokens = [t.casefold() for t in nltk.word_tokenize(sentence)]
        why.log(obj={"original_bow": np.array(tokens)})  # logged as a NumPy array rather than a plain list
        phrases = sentence.split(",")
        if len(phrases) > 1:
            # pick a random split point; it can land at either end of the sentence,
            # so one half may be (nearly) empty
            index = random.randint(0, len(phrases))
            left = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[:index]) + ".")]
            right = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[index:]))]
            why.log(obj={"split_bow": left})
            profile = why.log(obj={"split_bow": right})
        else:
            profile = why.log(obj={"split_bow": tokens})
view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
original_bow:
 nlp_bow/doc_length:counts/n: 0
 nlp_bow/doc_length:distribution/mean: 0.0
 nlp_bow/doc_length:distribution/stddev: 0.0
 nlp_bow/doc_length:distribution/max: nan
 nlp_bow/doc_length:distribution/min: nan
 nlp_bow/doc_length:distribution/median: None
 nlp_bow/term_length:counts/n: 0
 nlp_bow/term_length:distribution/mean: 0.0
 nlp_bow/term_length:distribution/stddev: 0.0
 nlp_bow/term_length:distribution/max: nan
 nlp_bow/term_length:distribution/min: nan
 nlp_bow/term_length:distribution/median: None
 frequent terms: []

split_bow:
 nlp_bow/doc_length:counts/n: 4545
 nlp_bow/doc_length:distribution/mean: 16.67216721672163
 nlp_bow/doc_length:distribution/stddev: 13.849889480931045
 nlp_bow/doc_length:distribution/max: 207.0
 nlp_bow/doc_length:distribution/min: 0.0
 nlp_bow/doc_length:distribution/median: 15.0
 nlp_bow/term_length:counts/n: 75775
 nlp_bow/term_length:distribution/mean: 4.389495216100277
 nlp_bow/term_length:distribution/stddev: 2.7034094987711406
 nlp_bow/term_length:distribution/max: 24.0
 nlp_bow/term_length:distribution/min: 1.0
 nlp_bow/term_length:distribution/median: 4.0
 frequent terms: ['the', '.', ',', 'to', 'of', 'in', 'said', 'and', 'a', 'mln']
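Finally, if you want to keep these profiles around, for example to upload to WhyLabs as suggested at the top, you can serialize the profile view to disk. This is a minimal sketch assuming the DatasetProfileView.write() method available in whylogs 1.x; the file name is arbitrary.

# persist the aggregated profile locally (file name is just an example)
profile.view().write("reuters_nlp_profile.bin")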