# JUST RUN THIS, no changes needed
from google.colab import drive
import pandas as pd
import math
import re
from collections import Counter

drive.mount('/content/gdrive')

# Load the "woke" grants dataset
woke_grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/woke_grants.tsv", delimiter="\t")
woke_grant_ids = woke_grants_df.dropna(subset="AWARD ID")["AWARD ID"]

# Load all NSF grants from 2022
grants_df = pd.read_csv("/content/gdrive/MyDrive/datasets/FY2022_049_Assistance_Full_20250109_1.csv",
                        on_bad_lines='skip', low_memory=False)

# Add a boolean "is_woke" column
grants_df["is_woke"] = grants_df["award_id_fain"].isin(woke_grant_ids)

# Print dataset info
print(f"Total grants: {len(grants_df)}")
print(f"Labeled 'woke': {grants_df['is_woke'].sum()}")
print(f"Percentage: {100 * grants_df['is_woke'].mean():.1f}%")

# Visualize the dataframe
grants_df.head()


# JUST RUN THIS, no changes needed
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he",
    "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were",
    "will", "with", "i", "you", "we", "they", "this", "their", "our", "or",
    "but", "if", "then", "so", "such"
}

def preprocess(text):
    """Convert text to a list of lowercase words, removing stop words."""
    if pd.isna(text):
        return []

    # Convert to lowercase
    text = str(text).lower()

    # Split on punctuation and whitespace
    tokens = re.split(r"[,\.\!\?\s\(\)\[\];:\"']+", text)

    # Keep only non-empty tokens that aren't stop words and are longer than 2 characters
    processed_tokens = []
    for token in tokens:
        # Remove any remaining punctuation from the edges
        token = token.strip("-/")
        if token and token not in STOP_WORDS and len(token) > 2:
            processed_tokens.append(token)

    return processed_tokens

# Test it
test_text = "This research investigates climate change impacts!"
print(preprocess(test_text))
# Should print: ['research', 'investigates', 'climate', 'change', 'impacts']


def get_grant_description(grants_df, grant_id):
    # Input:  grants_df is the DataFrame, grant_id is the award ID to find
    # Output: Returns the description string (or None if not found)

    # TODO: Your code here!
    # 1. Filter to rows where award_id_fain equals grant_id
    # 2. Get the prime_award_base_transaction_description value
    # 3. Handle the case where the grant isn't found
    pass
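
# If you get stuck, below is one possible sketch of get_grant_description, not the
# only correct answer. It uses the award_id_fain and
# prime_award_base_transaction_description columns shown above; the _sketch name is
# just illustrative, to keep it separate from the version you write yourself.
def get_grant_description_sketch(grants_df, grant_id):
    # Filter to rows whose award ID matches
    matches = grants_df[grants_df["award_id_fain"] == grant_id]
    if len(matches) == 0:
        # Grant not found
        return None
    # Return the description from the first matching row
    return matches["prime_award_base_transaction_description"].iloc[0]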
# Test your function
sample_id = grants_df['award_id_fain'].iloc[0]
description = get_grant_description(grants_df, sample_id)
if description:
    print(f"Sample description: {description[:200]}...")

# Look at some examples of "woke" grants
print("\nExamples of labeled grants:")
woke_examples = grants_df[grants_df['is_woke'] == True].head(3)
for _, row in woke_examples.iterrows():
    print(f"\n{row['recipient_name']}: {row['prime_award_base_transaction_description'][:150]}...")


def calculate_subset_term_frequency(subset_descriptions):
    # Input:  subset_descriptions is a list of NSF grant description strings (our subset of documents)
    # Output: Returns a dictionary mapping word -> total count across all descriptions
    subset_tf = {}

    # TODO: Your code here!
    # 1. For each grant description in the subset:
    # 2.   preprocess the description
    # 3.   for each term in the preprocessed description:
    # 4.     add the count of that term to subset_tf

    return subset_tf

# Calculate TF for "woke" grants
woke_descriptions = grants_df[grants_df['is_woke'] == True]['prime_award_base_transaction_description'].tolist()
print(f"Calculating term frequency for {len(woke_descriptions)} 'woke' grants...")
woke_tf = calculate_subset_term_frequency(woke_descriptions)
print(f"Unique terms in subset: {len(woke_tf)}")
print("Top 10 terms by frequency:", sorted(woke_tf.items(), key=lambda x: x[1], reverse=True)[:10])


def calculate_document_frequency(corpus, target_terms):
    # Input:  corpus is a list of all NSF grant description strings (all of our documents)
    #         target_terms is a set of terms to check
    # Output: Returns a dictionary mapping term -> number of documents containing it
    doc_freq = {}

    # TODO: Your code here!
    # 1. Create a new empty list, preprocessed_corpus
    # 2. For each document in corpus:
    # 3.   preprocess the document
    # 4.   append the preprocessed doc to preprocessed_corpus
    # 5. For each term in target_terms:
    # 6.   For each preprocessed doc in preprocessed_corpus:
    # 7.     If the term is in the doc, increment doc_freq for that term

    return doc_freq

# Create corpus and calculate DF
all_descriptions = grants_df['prime_award_base_transaction_description'].tolist()
target_terms = set(woke_tf.keys())

print(f"Calculating document frequency for {len(target_terms)} terms...")
print(f"Corpus size: {len(all_descriptions)} grants")
df_counts = calculate_document_frequency(all_descriptions, target_terms)


def calculate_subset_tfidf(subset_tf, doc_freq, total_docs):
    # Input:  subset_tf is a dictionary of term frequencies in the subset
    #         doc_freq is a dictionary of document frequencies in the full corpus
    #         total_docs is the total number of documents in the corpus
    # Output: Returns a dictionary mapping term -> Subset TF-IDF score
    tfidf = {}

    # TODO: Your code here!
    # For each term in subset_tf:
    # 1. Calculate IDF = math.log(total_docs / doc_freq[term])
    # 2. Calculate Subset TF-IDF = subset_tf[term] * IDF
    # 3. Store the score in the tfidf dictionary

    return tfidf

# Calculate Subset TF-IDF
subset_tfidf_scores = calculate_subset_tfidf(woke_tf, df_counts, len(all_descriptions))

# Display results
sorted_scores = sorted(subset_tfidf_scores.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop 30 highest-scoring terms among 'woke' grants:")
for term, score in sorted_scores[:30]:
    print(f" {term}: {score:.3f}")
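
# The sketches below are one possible way to fill in the three TODO functions above;
# they are illustrative, not the only correct solutions. They rely on the imports and
# the preprocess() helper defined earlier, and the *_sketch names are just to keep
# them separate from the versions you write yourself.
#
# As a made-up example of the score they compute: a term appearing 50 times in the
# subset and in 100 of 10,000 corpus documents would score 50 * log(10000/100) ≈ 230.

def calculate_subset_term_frequency_sketch(subset_descriptions):
    # Count every preprocessed token across all descriptions in the subset
    subset_tf = {}
    for description in subset_descriptions:
        for term in preprocess(description):
            subset_tf[term] = subset_tf.get(term, 0) + 1
    return subset_tf

def calculate_document_frequency_sketch(corpus, target_terms):
    # Preprocess each document once; a set is equivalent to the list in the
    # pseudocode for membership checks, just faster
    preprocessed_corpus = [set(preprocess(document)) for document in corpus]
    doc_freq = {}
    for term in target_terms:
        # Count how many documents contain the term at least once
        doc_freq[term] = sum(1 for doc in preprocessed_corpus if term in doc)
    return doc_freq

def calculate_subset_tfidf_sketch(subset_tf, doc_freq, total_docs):
    # Subset TF-IDF = (term count in subset) * log(total docs / docs containing term)
    # doc_freq[term] is at least 1 here, because every subset term also appears
    # somewhere in the full corpus
    tfidf = {}
    for term, count in subset_tf.items():
        idf = math.log(total_docs / doc_freq[term])
        tfidf[term] = count * idf
    return tfidf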