# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re
from collections import Counter

drive.mount('/content/gdrive')

# Load the "woke" grants dataset
woke_grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/woke_grants.tsv", delimiter="\t")
woke_grant_ids = woke_grants_df.dropna(subset="AWARD ID")["AWARD ID"]

# Load all NSF grants from 2022
grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/FY2022_049_Assistance_Full_20250109_1.csv",
                        on_bad_lines='skip', low_memory=False)

# Add a boolean "is_woke" column
grants_df["is_woke"] = grants_df["award_id_fain"].isin(woke_grant_ids)

# Print dataset info
print(f"Total grants: {len(grants_df)}")
print(f"Labeled 'woke': {grants_df['is_woke'].sum()}")
print(f"Percentage: {100 * grants_df['is_woke'].mean():.1f}%")

# Visualize the dataframe
grants_df.head()


# JUST RUN THIS, no changes needed
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he",
    "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were",
    "will", "with", "i", "you", "we", "they", "this", "their", "our", "or",
    "but", "if", "then", "so", "such"
}

def preprocess(text):
    """Convert text to a list of lowercase words, removing stop words."""
    if pd.isna(text):
        return []

    # Convert to lowercase
    text = str(text).lower()

    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s\(\)\[\];:\"']+", text)

    # Keep only non-empty tokens that aren't stop words and are longer than 2 characters
    processed_tokens = []
    for token in tokens:
        # Remove any remaining punctuation from the edges
        token = token.strip("-/")
        if token and token not in STOP_WORDS and len(token) > 2:
            processed_tokens.append(token)

    return processed_tokens

# Test it
test_text = "This research investigates climate change impacts!"
print(preprocess(test_text))
# Should print: ['research', 'investigates', 'climate', 'change', 'impacts']


def get_grant_description(grants_df, grant_id):
    # Input:  grants_df is the DataFrame, grant_id is the award ID to find
    # Output: Returns the description string (or None if not found)

    # TODO: Your code here!
    # 1. Filter to rows where award_id_fain equals grant_id
    # 2. Get the prime_award_base_transaction_description value
    # 3. Handle the case where the grant isn't found
    pass
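
# If you get stuck, below is one possible sketch of get_grant_description, not the
# only correct answer. It uses the award_id_fain and
# prime_award_base_transaction_description columns shown above; the _sketch name is
# just illustrative, to keep it separate from the version you write yourself.
def get_grant_description_sketch(grants_df, grant_id):
    # Filter to rows whose award ID matches
    matches = grants_df[grants_df["award_id_fain"] == grant_id]
    if len(matches) == 0:
        # Grant not found
        return None
    # Return the description from the first matching row
    return matches["prime_award_base_transaction_description"].iloc[0]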
# Test your function
sample_id = grants_df['award_id_fain'].iloc[0]
description = get_grant_description(grants_df, sample_id)
if description:
    print(f"Sample description: {description[:200]}...")

# Look at some examples of "woke" grants
print("\nExamples of labeled grants:")
woke_examples = grants_df[grants_df['is_woke'] == True].head(3)
for _, row in woke_examples.iterrows():
    print(f"\n{row['recipient_name']}: {row['prime_award_base_transaction_description'][:150]}...")


def calculate_subset_term_frequency(subset_descriptions):
    # Input:  subset_descriptions is a list of NSF grant description strings (our subset of documents)
    # Output: Returns a dictionary mapping word -> total count across all descriptions
    subset_tf = {}

    # TODO: Your code here!
    # 1. For each grant description in the subset:
    # 2.   preprocess the description
    # 3.   for each term in the preprocessed description:
    # 4.     add the count of that term to subset_tf

    return subset_tf

# Calculate TF for "woke" grants
woke_descriptions = grants_df[grants_df['is_woke'] == True]['prime_award_base_transaction_description'].tolist()
print(f"Calculating term frequency for {len(woke_descriptions)} 'woke' grants...")
woke_tf = calculate_subset_term_frequency(woke_descriptions)
print(f"Unique terms in subset: {len(woke_tf)}")
print("Top 10 terms by frequency:", sorted(woke_tf.items(), key=lambda x: x[1], reverse=True)[:10])


def calculate_document_frequency(corpus, target_terms):
    # Input:  corpus is a list of all NSF grant description strings (all of our documents)
    #         target_terms is a set of terms to check
    # Output: Returns a dictionary mapping term -> number of documents containing it
    doc_freq = {}

    # TODO: Your code here!
    # 1. Create a new empty list, preprocessed_corpus
    # 2. For each document in corpus:
    # 3.   preprocess the document
    # 4.   append the preprocessed doc to preprocessed_corpus
    # 5. For each term in target_terms:
    # 6.   For each preprocessed doc in preprocessed_corpus:
    # 7.     If the term is in the doc, increment doc_freq for that term

    return doc_freq

# Create corpus and calculate DF
all_descriptions = grants_df['prime_award_base_transaction_description'].tolist()
target_terms = set(woke_tf.keys())

print(f"Calculating document frequency for {len(target_terms)} terms...")
print(f"Corpus size: {len(all_descriptions)} grants")
df_counts = calculate_document_frequency(all_descriptions, target_terms)


def calculate_subset_tfidf(subset_tf, doc_freq, total_docs):
    # Input:  subset_tf is a dictionary of term frequencies in the subset
    #         doc_freq is a dictionary of document frequencies in the full corpus
    #         total_docs is the total number of documents in the corpus
    # Output: Returns a dictionary mapping term -> Subset TF-IDF score
    tfidf = {}

    # TODO: Your code here!
    # For each term in subset_tf:
    # 1. Calculate IDF = math.log(total_docs / doc_freq[term])
    # 2. Calculate Subset TF-IDF = subset_tf[term] * IDF
    # 3. Store the score in the tfidf dictionary

    return tfidf

# Calculate Subset TF-IDF
subset_tfidf_scores = calculate_subset_tfidf(woke_tf, df_counts, len(all_descriptions))

# Display results
sorted_scores = sorted(subset_tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop 30 highest-scoring terms among 'woke' grants:")
for term, score in sorted_scores[:30]:
    print(f" {term}: {score:.3f}")
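
# The sketches below are one possible way to fill in the three TODO functions above;
# they are illustrative, not the only correct solutions. They rely on the imports and
# the preprocess() helper defined earlier, and the *_sketch names are just to keep
# them separate from the versions you write yourself.
#
# As a made-up example of the score they compute: a term appearing 50 times in the
# subset and in 100 of 10,000 corpus documents would score 50 * log(10000/100) ≈ 230.

def calculate_subset_term_frequency_sketch(subset_descriptions):
    # Count every preprocessed token across all descriptions in the subset
    subset_tf = {}
    for description in subset_descriptions:
        for term in preprocess(description):
            subset_tf[term] = subset_tf.get(term, 0) + 1
    return subset_tf

def calculate_document_frequency_sketch(corpus, target_terms):
    # Preprocess each document once; a set is equivalent to the list in the
    # pseudocode for membership checks, just faster
    preprocessed_corpus = [set(preprocess(document)) for document in corpus]
    doc_freq = {}
    for term in target_terms:
        # Count how many documents contain the term at least once
        doc_freq[term] = sum(1 for doc in preprocessed_corpus if term in doc)
    return doc_freq

def calculate_subset_tfidf_sketch(subset_tf, doc_freq, total_docs):
    # Subset TF-IDF = (term count in subset) * log(total docs / docs containing term)
    # doc_freq[term] is at least 1 here, because every subset term also appears
    # somewhere in the full corpus
    tfidf = {}
    for term, count in subset_tf.items():
        idf = math.log(total_docs / doc_freq[term])
        tfidf[term] = count * idf
    return tfidf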