%pip install nltk
%pip install bs4
%pip install whylogs[embeddings]

from typing import Any, Dict, List, Optional, Set

import nltk
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import reuters

nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

STEMMER = nltk.stem.PorterStemmer()

# the NLTK tokenizer produces some junk tokens, so add them to the stopwords
STOPWORDS = set(
    nltk.corpus.stopwords.words("english")
    + [".", ",", "<", ">", "'s", "''", "``"]
)


def delete_headline(text: str) -> str:
    '''
    NLTK's sentence tokenizer includes the headline in the first sentence
    if we don't manually exclude it.
    '''
    lines = text.split("\n")
    return "\n".join(lines[1:]) if len(lines) > 1 else text


def global_freq(A: np.ndarray) -> np.ndarray:
    '''Sum the columns of the term-frequency index to get the global term frequencies.'''
    gf = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            gf[i] += A[i, j]
    return gf


def entropy(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    '''Compute the log-entropy term weights: g_i = 1 + (sum_j p_ij * log(p_ij)) / log(ndocs).'''
    g = np.zeros(A.shape[0])
    logN = np.log(A.shape[1])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            p_ij = A[i, j] / gf[i]
            g[i] += p_ij * np.log(p_ij) if p_ij != 0 else 0
        g[i] = 1 + g[i] / logN
    return g


def get_raw_tokens(file) -> List[str]:
    '''
    The raw NLTK documents contain a few HTML entities, so we'll use BeautifulSoup
    to decode them, then apply the NLTK word tokenizer. Skip the headline.
    '''
    raw = BeautifulSoup(delete_headline(reuters.raw(file)), "html.parser").get_text()
    return [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]


def get_vocabulary(file) -> Set[str]:
    '''
    Returns the set of stemmed terms in the specified Reuters article (excluding the headline).
    '''
    tokens = get_raw_tokens(file)
    return {STEMMER.stem(t) for t in tokens}


file_ids = reuters.fileids()
train_files = [id for id in file_ids if id.startswith("train")][:500]

vocab: Set[str] = set()
for file in train_files:
    vocab.update(get_vocabulary(file))

ndocs = len(train_files)
vocab_size = len(vocab)
print(f"{ndocs} articles, {vocab_size} vocabulary terms")

# map each term to a row of the term-document matrix, and back again
vocab_map: Dict[str, int] = dict()
rev_map: List[str] = [''] * vocab_size
for i, t in enumerate(vocab):
    vocab_map[t] = i
    rev_map[i] = t

# term-document count matrix: one row per stemmed term, one column per article
index = np.zeros((vocab_size, ndocs))
for col, id in enumerate(train_files):
    tokens = get_raw_tokens(id)
    stemmed = [STEMMER.stem(t) for t in tokens]
    for term in stemmed:
        index[vocab_map[term], col] += 1

gf = global_freq(index)
g = entropy(index, gf)
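# Optional sanity check (not part of the original pipeline): the same global
# frequencies and log-entropy weights can be computed with vectorized NumPy and
# compared against the loops above. The 2x3 term-document matrix below is made
# up purely for illustration.
_toy = np.array([[1.0, 0.0, 2.0],
                 [3.0, 1.0, 1.0]])                        # 2 terms x 3 documents
_toy_gf = _toy.sum(axis=1)                                # row sums = global term frequencies
_p = _toy / _toy_gf[:, None]                              # p_ij = A_ij / gf_i
_logp = np.log(_p, out=np.zeros_like(_p), where=_p > 0)   # treat 0 * log(0) as 0
_toy_g = 1.0 + (_p * _logp).sum(axis=1) / np.log(_toy.shape[1])
assert np.allclose(_toy_gf, global_freq(_toy))
assert np.allclose(_toy_g, entropy(_toy, _toy_gf))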
from whylogs.api.logger.logger import Logger
from whylogs.core import DatasetProfile, DatasetSchema
from whylogs.core.configs import SummaryConfig
from whylogs.core.dataset_profile import logger as dp_logger  # it warns about vector values; silenced below
from whylogs.core.preprocessing import ListView, PreprocessedColumn
from whylogs.core.resolvers import MetricSpec, ResolverSpec, STANDARD_RESOLVER
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.stubs import pd
from whylogs.core.view.column_profile_view import ColumnProfileView
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric


class PersistentLogger(Logger):
    def __init__(self, schema: Optional[DatasetSchema] = None):
        super().__init__(schema)
        self._current_profile = DatasetProfile(schema=self._schema)

    def _get_matching_profiles(
        self,
        obj: Any = None,
        *,
        pandas: Optional[pd.DataFrame] = None,
        row: Optional[Dict[str, Any]] = None,
        schema: Optional[DatasetSchema] = None,
    ) -> List[DatasetProfile]:
        if schema and schema is not self._schema:
            raise ValueError(
                "You cannot pass a DatasetSchema to an instance of PersistentLogger.log() "
                "because the schema is set once when it is instantiated; "
                "please use TimedRollingLogger(schema) instead."
            )
        return [self._current_profile]


from logging import ERROR
dp_logger.setLevel(ERROR)

resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name="article_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
    ResolverSpec(column_name="summary_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    # print(raw.split('\n')[0])   # print the article headline
    # print(raw)                  # print the whole input article
    raw = delete_headline(raw)
    tokens = [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]
    stemmed = [STEMMER.stem(t) for t in tokens]

    # raw term counts for this article
    doc_vec = np.zeros(vocab_size)
    for term in stemmed:
        doc_vec[vocab_map[term]] += 1

    # apply the log-entropy weighting and track the highest-weighted term
    max_weight = -1
    max_term = ""
    for i in range(vocab_size):
        doc_vec[i] = g[i] * np.log(doc_vec[i] + 1.0)
        if doc_vec[i] > max_weight:
            max_weight = doc_vec[i]
            max_term = rev_map[i]

    # the "summary" is the first sentence that contains the highest-weighted term
    sentences = nltk.sent_tokenize(raw)
    max_sentence = ""
    for sentence in sentences:
        tokenized = [t.casefold() for t in nltk.word_tokenize(sentence) if t.casefold() not in STOPWORDS]
        stemmed = [STEMMER.stem(t) for t in tokenized]
        if max_term in stemmed:
            max_sentence = sentence
            profile = why.log(obj={"article_bow": tokens, "summary_bow": tokenized})
            break

    # max_sentence = max_sentence.replace("\n", " ")
    # print(f"{max_weight} {max_term}: {max_sentence}")


def dump_summary(view: ColumnProfileView) -> None:
    summary = view.to_summary_dict()
    keys = [
        "nlp_bow/doc_length:counts/n",
        "nlp_bow/doc_length:distribution/mean",
        "nlp_bow/doc_length:distribution/stddev",
        "nlp_bow/doc_length:distribution/max",
        "nlp_bow/doc_length:distribution/min",
        "nlp_bow/doc_length:distribution/median",
        "nlp_bow/term_length:counts/n",
        "nlp_bow/term_length:distribution/mean",
        "nlp_bow/term_length:distribution/stddev",
        "nlp_bow/term_length:distribution/max",
        "nlp_bow/term_length:distribution/min",
        "nlp_bow/term_length:distribution/median",
    ]
    for key in keys:
        print(f" {key}: {summary[key]}")
    print(f" frequent terms: {[t.value for t in summary['nlp_bow/frequent_terms:frequent_items/frequent_strings'][:10]]}")


view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
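# Optional follow-up (a sketch, not part of the original walkthrough): it uses
# view.get_column() and the same summary keys as dump_summary() above to contrast
# the two columns directly. Since "summary_bow" holds single sentences, its mean
# bag size should be much smaller than that of the full articles in "article_bow".
_article = view.get_column("article_bow").to_summary_dict()
_summary = view.get_column("summary_bow").to_summary_dict()
print("mean tokens per article:", _article["nlp_bow/doc_length:distribution/mean"])
print("mean tokens per summary:", _summary["nlp_bow/doc_length:distribution/mean"])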
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name="original_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
    ResolverSpec(column_name="split_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

import random

profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    raw = delete_headline(raw)
    sentences = nltk.sent_tokenize(raw)
    for sentence in sentences:
        tokens = [t.casefold() for t in nltk.word_tokenize(sentence)]
        why.log(obj={"original_bow": np.array(tokens)})

        phrases = sentence.split(",")
        if len(phrases) > 1:
            # split at a random interior comma so both fragments are non-empty
            split_at = random.randint(1, len(phrases) - 1)
            left = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[:split_at]) + ".")]
            right = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[split_at:]))]
            why.log(obj={"split_bow": left})
            profile = why.log(obj={"split_bow": right})
        else:
            profile = why.log(obj={"split_bow": tokens})

view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
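# Optional follow-up (a sketch, not part of the original walkthrough): splitting
# sentences at commas shifts the doc_length distribution but should leave the
# frequent terms largely intact, so comparing the top frequent items of the two
# columns (the same summary key used in dump_summary() above) is one quick check.
_orig_top = {
    t.value
    for t in view.get_column("original_bow").to_summary_dict()[
        "nlp_bow/frequent_terms:frequent_items/frequent_strings"
    ][:10]
}
_split_top = {
    t.value
    for t in view.get_column("split_bow").to_summary_dict()[
        "nlp_bow/frequent_terms:frequent_items/frequent_strings"
    ][:10]
}
print(f"shared top terms: {sorted(_orig_top & _split_top)}")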