# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re
from collections import Counter

drive.mount('/content/gdrive')

# Load the "woke" grants dataset
woke_grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/woke_grants.tsv", delimiter="\t")
woke_grant_ids = woke_grants_df.dropna(subset="AWARD ID")["AWARD ID"]

# Load all NSF grants from 2022
grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/FY2022_049_Assistance_Full_20250109_1.csv",
                        on_bad_lines='skip', low_memory=False)

# Add a boolean "is_woke" column
grants_df["is_woke"] = grants_df["award_id_fain"].isin(woke_grant_ids)

# Print dataset info
print(f"Total grants: {len(grants_df)}")
print(f"Labeled 'woke': {grants_df['is_woke'].sum()}")
print(f"Percentage: {100 * grants_df['is_woke'].mean():.1f}%")

# Uncomment to sample the dataset for development / testing purposes
#grants_df = grants_df.sample(2000)

grants_df.head(10)


# JUST RUN THIS, no changes needed
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with", "i", "you", "we", "they",
    "this", "their", "our", "or", "but", "if", "then", "so", "such"
}

def preprocess(text):
    """Convert text to a set of unique lowercase words, removing stop words and very short tokens"""
    if pd.isna(text):
        return set()

    # Convert to lowercase
    text = str(text).lower()

    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s\(\)\[\];:\"']+", text)

    # Keep only non-empty tokens that aren't stop words
    processed_tokens = []
    for token in tokens:
        # Remove any remaining punctuation from edges
        token = token.strip("-/")
        if token and token not in STOP_WORDS and len(token) > 2:
            processed_tokens.append(token)

    return set(processed_tokens)

# Test it
test_text = "This research investigates climate change impacts!"
print(preprocess(test_text))
# Should print a set containing: 'research', 'investigates', 'climate', 'change', 'impacts' (order may vary)


def get_grant_description(grants_df, grant_id):
    return grants_df[grants_df['award_id_fain'] == grant_id]['prime_award_base_transaction_description'].values[0]

# Test your function
sample_id = grants_df['award_id_fain'].iloc[0]
description = get_grant_description(grants_df, sample_id)
if pd.notna(description):  # guard against grants with a missing description
    print(f"Sample description: {description[:200]}...")

# Look at some examples of "woke" grants
print("\nExamples of labeled grants:")
woke_examples = grants_df[grants_df['is_woke'] == True].head(3)
for _, row in woke_examples.iterrows():
    print(f"\n{row['recipient_name']}: {row['prime_award_base_transaction_description'][:150]}...")


def calculate_subset_term_frequency(descriptions):
    # Input:  descriptions is a list of description strings
    # Output: Returns dictionary mapping word -> number of descriptions containing it
    #         (preprocess() deduplicates words within each description)
    subset_tf = {}
    for grant in descriptions:
        for term in preprocess(grant):
            subset_tf[term] = subset_tf.get(term, 0) + 1
    return subset_tf

# Calculate TF for "woke" grants
woke_descriptions = grants_df[grants_df['is_woke'] == True]['prime_award_base_transaction_description'].tolist()
print(f"Calculating term frequency for {len(woke_descriptions)} 'woke' grants...")
woke_tf = calculate_subset_term_frequency(woke_descriptions)
print(f"Unique terms in subset: {len(woke_tf)}")
print("Top 10 terms by frequency:", sorted(woke_tf.items(), key=lambda x: x[1], reverse=True)[:10])


def calculate_document_frequency(corpus, target_terms):
    # Input:  corpus is a list of all description strings
    #         target_terms is a set of terms to check
    # Output: Returns dictionary mapping term -> number of documents containing it
    preprocessed_corpus = []
    for doc in corpus:
        preprocessed_corpus.append(preprocess(doc))
    doc_freq = {}
    for term in target_terms:
        for doc in preprocessed_corpus:
            if term in doc:
                doc_freq[term] = doc_freq.get(term, 0) + 1
    return doc_freq

# Create corpus and calculate DF
all_descriptions = grants_df['prime_award_base_transaction_description'].tolist()
target_terms = set(woke_tf.keys())
print(f"Calculating document frequency for {len(target_terms)} terms...")
print(f"Corpus size: {len(all_descriptions)} grants")
df_counts = calculate_document_frequency(all_descriptions, target_terms)


def calculate_subset_tfidf(subset_tf, doc_freq, total_docs):
    # Input:  subset_tf is dictionary of term frequencies in the subset
    #         doc_freq is dictionary of document frequencies in full corpus
    #         total_docs is total number of documents in corpus
    # Output: Returns dictionary mapping term -> Subset TF-IDF score
    # Note: every subset term occurs in the corpus at least once, so doc_freq[term] >= 1
    tfidf = {}
    for term in subset_tf:
        idf = math.log(total_docs / doc_freq[term])
        tfidf[term] = subset_tf[term] * idf
    return tfidf

# Calculate Subset TF-IDF
subset_tfidf_scores = calculate_subset_tfidf(woke_tf, df_counts, len(all_descriptions))

# Display results
sorted_scores = sorted(subset_tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print("\nTop 30 most distinctive terms in 'woke' grants:")
for term, score in sorted_scores[:30]:
    print(f"  {term}: {score:.3f}")
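

# OPTIONAL: a minimal sanity check of the pipeline on a few made-up descriptions.
# The toy strings below are illustrative assumptions, not real grant data; this cell
# just reuses the functions defined above to confirm the scoring behaves as expected.
toy_corpus = [
    "Broadening participation of underrepresented students in STEM research",
    "Outreach to underrepresented students through community STEM workshops",
    "Research on superconducting quantum materials",
    "New measurements of quantum materials research",
]
toy_subset = toy_corpus[:2]  # pretend the first two descriptions are the labeled subset

toy_tf = calculate_subset_term_frequency(toy_subset)
toy_df = calculate_document_frequency(toy_corpus, set(toy_tf.keys()))
toy_scores = calculate_subset_tfidf(toy_tf, toy_df, len(toy_corpus))

# "research" also appears outside the subset, so it should score lower than
# subset-specific terms like "underrepresented"; a term found in every document
# would get log(1) = 0 and drop out entirely.
print("Toy scores:", sorted(toy_scores.items(), key=lambda x: x[1], reverse=True))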