# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re
from collections import Counter

drive.mount('/content/gdrive')

# Load the "woke" grants dataset
woke_grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/woke_grants.tsv", delimiter="\t")
woke_grant_ids = woke_grants_df.dropna(subset="AWARD ID")["AWARD ID"]

# Load all NSF grants from 2022
grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/FY2022_049_Assistance_Full_20250109_1.csv",
                        on_bad_lines='skip', low_memory=False)

# Add a boolean "is_woke" column
grants_df["is_woke"] = grants_df["award_id_fain"].isin(woke_grant_ids)

# Print dataset info
print(f"Total grants: {len(grants_df)}")
print(f"Labeled 'woke': {grants_df['is_woke'].sum()}")
print(f"Percentage: {100 * grants_df['is_woke'].mean():.1f}%")

# Uncomment to sample the dataset for development / testing purposes
#grants_df = grants_df.sample(2000)

grants_df.head(10)


# JUST RUN THIS, no changes needed
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with", "i", "you", "we", "they",
    "this", "their", "our", "or", "but", "if", "then", "so", "such"
}

def preprocess(text):
    """Convert text to a set of unique lowercase words, removing stop words and very short tokens"""
    if pd.isna(text):
        return set()

    # Convert to lowercase
    text = str(text).lower()

    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s\(\)\[\];:\"']+", text)

    # Keep only non-empty tokens that aren't stop words
    processed_tokens = []
    for token in tokens:
        # Remove any remaining punctuation from edges
        token = token.strip("-/")
        if token and token not in STOP_WORDS and len(token) > 2:
            processed_tokens.append(token)

    return set(processed_tokens)

# Test it
test_text = "This research investigates climate change impacts!"
print(preprocess(test_text))
# Should print a set containing: 'research', 'investigates', 'climate', 'change', 'impacts' (order may vary)


def get_grant_description(grants_df, grant_id):
    return grants_df[grants_df['award_id_fain'] == grant_id]['prime_award_base_transaction_description'].values[0]

# Test your function
sample_id = grants_df['award_id_fain'].iloc[0]
description = get_grant_description(grants_df, sample_id)
if pd.notna(description):  # guard against grants with a missing description
    print(f"Sample description: {description[:200]}...")

# Look at some examples of "woke" grants
print("\nExamples of labeled grants:")
woke_examples = grants_df[grants_df['is_woke'] == True].head(3)
for _, row in woke_examples.iterrows():
    print(f"\n{row['recipient_name']}: {row['prime_award_base_transaction_description'][:150]}...")


def calculate_subset_term_frequency(descriptions):
    # Input:  descriptions is a list of description strings
    # Output: Returns dictionary mapping word -> number of descriptions containing it
    #         (preprocess() deduplicates words within each description)
    subset_tf = {}
    for grant in descriptions:
        for term in preprocess(grant):
            subset_tf[term] = subset_tf.get(term, 0) + 1
    return subset_tf

# Calculate TF for "woke" grants
woke_descriptions = grants_df[grants_df['is_woke'] == True]['prime_award_base_transaction_description'].tolist()
print(f"Calculating term frequency for {len(woke_descriptions)} 'woke' grants...")
woke_tf = calculate_subset_term_frequency(woke_descriptions)
print(f"Unique terms in subset: {len(woke_tf)}")
print("Top 10 terms by frequency:", sorted(woke_tf.items(), key=lambda x: x[1], reverse=True)[:10])


def calculate_document_frequency(corpus, target_terms):
    # Input:  corpus is a list of all description strings
    #         target_terms is a set of terms to check
    # Output: Returns dictionary mapping term -> number of documents containing it
    preprocessed_corpus = []
    for doc in corpus:
        preprocessed_corpus.append(preprocess(doc))
    doc_freq = {}
    for term in target_terms:
        for doc in preprocessed_corpus:
            if term in doc:
                doc_freq[term] = doc_freq.get(term, 0) + 1
    return doc_freq

# Create corpus and calculate DF
all_descriptions = grants_df['prime_award_base_transaction_description'].tolist()
target_terms = set(woke_tf.keys())
print(f"Calculating document frequency for {len(target_terms)} terms...")
print(f"Corpus size: {len(all_descriptions)} grants")
df_counts = calculate_document_frequency(all_descriptions, target_terms)


def calculate_subset_tfidf(subset_tf, doc_freq, total_docs):
    # Input:  subset_tf is dictionary of term frequencies in the subset
    #         doc_freq is dictionary of document frequencies in full corpus
    #         total_docs is total number of documents in corpus
    # Output: Returns dictionary mapping term -> Subset TF-IDF score
    # Note: every subset term occurs in the corpus at least once, so doc_freq[term] >= 1
    tfidf = {}
    for term in subset_tf:
        idf = math.log(total_docs / doc_freq[term])
        tfidf[term] = subset_tf[term] * idf
    return tfidf

# Calculate Subset TF-IDF
subset_tfidf_scores = calculate_subset_tfidf(woke_tf, df_counts, len(all_descriptions))

# Display results
sorted_scores = sorted(subset_tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print("\nTop 30 most distinctive terms in 'woke' grants:")
for term, score in sorted_scores[:30]:
    print(f"  {term}: {score:.3f}")
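

# OPTIONAL: a minimal sanity check of the pipeline on a few made-up descriptions.
# The toy strings below are illustrative assumptions, not real grant data; this cell
# just reuses the functions defined above to confirm the scoring behaves as expected.
toy_corpus = [
    "Broadening participation of underrepresented students in STEM research",
    "Outreach to underrepresented students through community STEM workshops",
    "Research on superconducting quantum materials",
    "New measurements of quantum materials research",
]
toy_subset = toy_corpus[:2]  # pretend the first two descriptions are the labeled subset

toy_tf = calculate_subset_term_frequency(toy_subset)
toy_df = calculate_document_frequency(toy_corpus, set(toy_tf.keys()))
toy_scores = calculate_subset_tfidf(toy_tf, toy_df, len(toy_corpus))

# "research" also appears outside the subset, so it should score lower than
# subset-specific terms like "underrepresented"; a term found in every document
# would get log(1) = 0 and drop out entirely.
print("Toy scores:", sorted(toy_scores.items(), key=lambda x: x[1], reverse=True))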