%pip install nltk
%pip install bs4
%pip install whylogs[embeddings]

from typing import Any, Dict, List, Optional, Set

import nltk
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import reuters

nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

STEMMER = nltk.stem.PorterStemmer()

# the NLTK tokenizer produces some junk tokens, so add them to the stopwords
STOPWORDS = set(
    nltk.corpus.stopwords.words("english")
    + [".", ",", "<", ">", "'s", "''", "``"]
)


def delete_headline(text: str) -> str:
    '''
    NLTK's sentence tokenizer includes the headline in the first sentence
    if we don't manually exclude it.
    '''
    lines = text.split("\n")
    return "\n".join(lines[1:]) if len(lines) > 1 else text


def global_freq(A: np.ndarray) -> np.ndarray:
    '''Sum the columns of the term-frequency index to get the global term frequencies.'''
    gf = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            gf[i] += A[i, j]
    return gf


def entropy(A: np.ndarray, gf: np.ndarray) -> np.ndarray:
    '''Compute the log-entropy term weights: g_i = 1 + (sum_j p_ij * log(p_ij)) / log(ndocs).'''
    g = np.zeros(A.shape[0])
    logN = np.log(A.shape[1])
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            p_ij = A[i, j] / gf[i]
            g[i] += p_ij * np.log(p_ij) if p_ij != 0 else 0
        g[i] = 1 + g[i] / logN
    return g


def get_raw_tokens(file) -> List[str]:
    '''
    The raw NLTK documents contain a few HTML entities, so we'll use BeautifulSoup
    to decode them, then apply the NLTK word tokenizer. Skip the headline.
    '''
    raw = BeautifulSoup(delete_headline(reuters.raw(file)), "html.parser").get_text()
    return [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]


def get_vocabulary(file) -> Set[str]:
    '''
    Returns the set of stemmed terms in the specified Reuters article (excluding the headline).
    '''
    tokens = get_raw_tokens(file)
    return {STEMMER.stem(t) for t in tokens}


file_ids = reuters.fileids()
train_files = [id for id in file_ids if id.startswith("train")][:500]

vocab: Set[str] = set()
for file in train_files:
    vocab.update(get_vocabulary(file))

ndocs = len(train_files)
vocab_size = len(vocab)
print(f"{ndocs} articles, {vocab_size} vocabulary terms")

# map each term to a row of the term-document matrix, and back again
vocab_map: Dict[str, int] = dict()
rev_map: List[str] = [''] * vocab_size
for i, t in enumerate(vocab):
    vocab_map[t] = i
    rev_map[i] = t

# term-document count matrix: one row per stemmed term, one column per article
index = np.zeros((vocab_size, ndocs))
for col, id in enumerate(train_files):
    tokens = get_raw_tokens(id)
    stemmed = [STEMMER.stem(t) for t in tokens]
    for term in stemmed:
        index[vocab_map[term], col] += 1

gf = global_freq(index)
g = entropy(index, gf)
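# Optional sanity check (not part of the original pipeline): the same global
# frequencies and log-entropy weights can be computed with vectorized NumPy and
# compared against the loops above. The 2x3 term-document matrix below is made
# up purely for illustration.
_toy = np.array([[1.0, 0.0, 2.0],
                 [3.0, 1.0, 1.0]])                        # 2 terms x 3 documents
_toy_gf = _toy.sum(axis=1)                                # row sums = global term frequencies
_p = _toy / _toy_gf[:, None]                              # p_ij = A_ij / gf_i
_logp = np.log(_p, out=np.zeros_like(_p), where=_p > 0)   # treat 0 * log(0) as 0
_toy_g = 1.0 + (_p * _logp).sum(axis=1) / np.log(_toy.shape[1])
assert np.allclose(_toy_gf, global_freq(_toy))
assert np.allclose(_toy_g, entropy(_toy, _toy_gf))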
from whylogs.api.logger.logger import Logger
from whylogs.core import DatasetProfile, DatasetSchema
from whylogs.core.configs import SummaryConfig
from whylogs.core.dataset_profile import logger as dp_logger  # it warns about vector values; silenced below
from whylogs.core.preprocessing import ListView, PreprocessedColumn
from whylogs.core.resolvers import MetricSpec, ResolverSpec, STANDARD_RESOLVER
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.stubs import pd
from whylogs.core.view.column_profile_view import ColumnProfileView
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric


class PersistentLogger(Logger):
    def __init__(self, schema: Optional[DatasetSchema] = None):
        super().__init__(schema)
        self._current_profile = DatasetProfile(schema=self._schema)

    def _get_matching_profiles(
        self,
        obj: Any = None,
        *,
        pandas: Optional[pd.DataFrame] = None,
        row: Optional[Dict[str, Any]] = None,
        schema: Optional[DatasetSchema] = None,
    ) -> List[DatasetProfile]:
        if schema and schema is not self._schema:
            raise ValueError(
                "You cannot pass a DatasetSchema to an instance of PersistentLogger.log() "
                "because the schema is set once when it is instantiated; "
                "please use TimedRollingLogger(schema) instead."
            )
        return [self._current_profile]


from logging import ERROR
dp_logger.setLevel(ERROR)

resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name="article_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
    ResolverSpec(column_name="summary_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    # print(raw.split('\n')[0])   # print the article headline
    # print(raw)                  # print the whole input article
    raw = delete_headline(raw)
    tokens = [t.casefold() for t in nltk.word_tokenize(raw) if t.casefold() not in STOPWORDS]
    stemmed = [STEMMER.stem(t) for t in tokens]

    # raw term counts for this article
    doc_vec = np.zeros(vocab_size)
    for term in stemmed:
        doc_vec[vocab_map[term]] += 1

    # apply the log-entropy weighting and track the highest-weighted term
    max_weight = -1
    max_term = ""
    for i in range(vocab_size):
        doc_vec[i] = g[i] * np.log(doc_vec[i] + 1.0)
        if doc_vec[i] > max_weight:
            max_weight = doc_vec[i]
            max_term = rev_map[i]

    # the "summary" is the first sentence that contains the highest-weighted term
    sentences = nltk.sent_tokenize(raw)
    max_sentence = ""
    for sentence in sentences:
        tokenized = [t.casefold() for t in nltk.word_tokenize(sentence) if t.casefold() not in STOPWORDS]
        stemmed = [STEMMER.stem(t) for t in tokenized]
        if max_term in stemmed:
            max_sentence = sentence
            profile = why.log(obj={"article_bow": tokens, "summary_bow": tokenized})
            break

    # max_sentence = max_sentence.replace("\n", " ")
    # print(f"{max_weight} {max_term}: {max_sentence}")


def dump_summary(view: ColumnProfileView) -> None:
    summary = view.to_summary_dict()
    keys = [
        "nlp_bow/doc_length:counts/n",
        "nlp_bow/doc_length:distribution/mean",
        "nlp_bow/doc_length:distribution/stddev",
        "nlp_bow/doc_length:distribution/max",
        "nlp_bow/doc_length:distribution/min",
        "nlp_bow/doc_length:distribution/median",
        "nlp_bow/term_length:counts/n",
        "nlp_bow/term_length:distribution/mean",
        "nlp_bow/term_length:distribution/stddev",
        "nlp_bow/term_length:distribution/max",
        "nlp_bow/term_length:distribution/min",
        "nlp_bow/term_length:distribution/median",
    ]
    for key in keys:
        print(f" {key}: {summary[key]}")
    print(f" frequent terms: {[t.value for t in summary['nlp_bow/frequent_terms:frequent_items/frequent_strings'][:10]]}")


view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
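# Optional follow-up (a sketch, not part of the original walkthrough): it uses
# view.get_column() and the same summary keys as dump_summary() above to contrast
# the two columns directly. Since "summary_bow" holds single sentences, its mean
# bag size should be much smaller than that of the full articles in "article_bow".
_article = view.get_column("article_bow").to_summary_dict()
_summary = view.get_column("summary_bow").to_summary_dict()
print("mean tokens per article:", _article["nlp_bow/doc_length:distribution/mean"])
print("mean tokens per summary:", _summary["nlp_bow/doc_length:distribution/mean"])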
resolvers = STANDARD_RESOLVER + [
    ResolverSpec(column_name="original_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
    ResolverSpec(column_name="split_bow", metrics=[MetricSpec(BagOfWordsMetric)]),
]
schema = DeclarativeSchema(resolvers)
why = PersistentLogger(schema=schema)

import random

profile = None
for file in train_files:
    raw = BeautifulSoup(reuters.raw(file), 'html.parser').get_text()
    raw = delete_headline(raw)
    sentences = nltk.sent_tokenize(raw)
    for sentence in sentences:
        tokens = [t.casefold() for t in nltk.word_tokenize(sentence)]
        why.log(obj={"original_bow": np.array(tokens)})

        phrases = sentence.split(",")
        if len(phrases) > 1:
            # split at a random interior comma so both fragments are non-empty
            split_at = random.randint(1, len(phrases) - 1)
            left = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[:split_at]) + ".")]
            right = [t.casefold() for t in nltk.word_tokenize(", ".join(phrases[split_at:]))]
            why.log(obj={"split_bow": left})
            profile = why.log(obj={"split_bow": right})
        else:
            profile = why.log(obj={"split_bow": tokens})

view = profile.view()
columns = view.get_columns()
for col_name, col_view in columns.items():
    print(f"{col_name}:")
    dump_summary(col_view)
    print()
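# Optional follow-up (a sketch, not part of the original walkthrough): splitting
# sentences at commas shifts the doc_length distribution but should leave the
# frequent terms largely intact, so comparing the top frequent items of the two
# columns (the same summary key used in dump_summary() above) is one quick check.
_orig_top = {
    t.value
    for t in view.get_column("original_bow").to_summary_dict()[
        "nlp_bow/frequent_terms:frequent_items/frequent_strings"
    ][:10]
}
_split_top = {
    t.value
    for t in view.get_column("split_bow").to_summary_dict()[
        "nlp_bow/frequent_terms:frequent_items/frequent_strings"
    ][:10]
}
print(f"shared top terms: {sorted(_orig_top & _split_top)}")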