#!/usr/bin/env python # coding: utf-8 #
# # Created by [Nathan Kelber](http://nkelber.com) and Ted Lawless for [JSTOR Labs](https://labs.jstor.org/) under [Creative Commons CC BY License](https://creativecommons.org/licenses/by/4.0/)
# For questions/comments/improvements, email nathan.kelber@ithaka.org.
# ____ # **Exploring Word Frequencies** # # **Description:** # This [notebook](https://docs.tdm-pilot.org/key-terms/#jupyter-notebook) shows how to find the most common words in a # [dataset](https://docs.tdm-pilot.org/key-terms/#dataset). The following processes are described: # # * Using the `tdm_client` to create a Pandas DataFrame # * Filtering based on a pre-processed ID list # * Filtering based on a [stop words list](https://docs.tdm-pilot.org/key-terms/#stop-words) # * Using a `Counter()` object to get the most common words # # **Difficulty:** Intermediate # # **Completion time:** 60 minutes # # **Knowledge Required:** # * Python Basics ([Start Python Basics I](./python-basics-1.ipynb)) # # **Knowledge Recommended:** # # * [Working with Dataset Files](./working-with-dataset-files.ipynb) # * [Pandas I](./pandas-1.ipynb) # * [Counter Objects](./counter-objects.ipynb) # * [Creating a Stopwords List](./creating-stopwords-list.ipynb) # # **Data Format:** [JSON Lines (.jsonl)](https://docs.tdm-pilot.org/key-terms/#jsonl) # # **Libraries Used:** # * **[tdm_client](https://docs.tdm-pilot.org/key-terms/#tdm-client)** to collect, unzip, and read our dataset # * **[NLTK](https://docs.tdm-pilot.org/key-terms/#nltk)** to help [clean](https://docs.tdm-pilot.org/key-terms/#clean-data) up our dataset # * [Counter](https://docs.tdm-pilot.org/key-terms/#python-counter) from **Collections** to help sum up our word frequencies # # **Research Pipeline:** # # 1. Build a dataset # 2. Create a "Pre-Processing CSV" with [Exploring Metadata](./exploring-metadata.ipynb) (Optional) # 3. Create a "Custom Stopwords List" with [Creating a Stopwords List](./creating-stopwords-list.ipynb) (Optional) # 4. Complete the word frequencies analysis with this notebook # ___ # # Import your dataset # # We'll use the tdm_client library to automatically retrieve the dataset in the JSON file format. # # Enter a [dataset ID](https://docs.tdm-pilot.org/key-terms/#dataset-ID) in the next code cell. 
#
# If you don't have a dataset ID, you can:
# * Use the sample dataset ID already in the code cell
# * [Create a new dataset](https://tdm-pilot.org/builder)
# * [Use a dataset ID from other pre-built sample datasets](https://tdm-pilot.org/dataset/dashboard)

# In[ ]:


# `dataset_id` holds the ID of the dataset to analyze.
# The default dataset is Shakespeare Quarterly, 1950-present
dataset_id = "7e41317e-740f-e86a-4729-20dab492e925"


# Next, import the `tdm_client`, passing the `dataset_id` as an argument using the `get_dataset` method.

# In[ ]:


# Importing your dataset with a dataset ID
import tdm_client

# `get_dataset` returns a reference to the dataset matching `dataset_id`
# (a gzipped JSON lines file) that `tdm_client.dataset_reader` can iterate.
dataset_file = tdm_client.get_dataset(dataset_id)


# # Apply Pre-Processing Filters (if available)
# If you completed pre-processing with the "Exploring Metadata and Pre-processing" notebook, you can use your CSV file of dataset IDs to automatically filter the dataset. Your pre-processed CSV file must be in the `data/` folder (relative to the working directory) and be named `pre-processed_<dataset ID>.csv`.

# In[ ]:


# Look for a pre-processed CSV file of filtered dataset IDs.
# Without one, the analysis runs on the full dataset and may
# take longer to complete.
import pandas as pd
import os

pre_processed_file_name = f'data/pre-processed_{dataset_id}.csv'

if not os.path.exists(pre_processed_file_name):
    # No CSV: later cells check this flag and skip ID filtering.
    use_filtered_list = False
    print('No pre-processed CSV file found. Full dataset will be used.')
else:
    df = pd.read_csv(pre_processed_file_name)
    # The "id" column lists the documents that survived pre-processing.
    filtered_id_list = df["id"].tolist()
    use_filtered_list = True
    print('Pre-Processed CSV found. Successfully read in ' + str(len(df)) + ' documents.')


# # Extract the Unigram Counts from the dataset JSON file
#
# We pulled in our dataset using a `dataset_id`.
# The file, which resides in the datasets/ folder, is a compressed JSON Lines file (jsonl.gz) that contains all the metadata information found in the metadata CSV *plus* the textual data necessary for analysis including:
#
# * Unigram Counts
# * Bigram Counts
# * Trigram Counts
# * Full-text (if available)
#
# To complete our analysis, we are going to pull out the unigram counts for each document and store them in a Counter() object. We will import `Counter` which will allow us to use Counter() objects for counting unigrams. Then we will initialize an empty Counter() object `word_frequency` to hold all of our unigram counts.

# In[ ]:


# Import Counter()
from collections import Counter

# Create an empty Counter object called `word_frequency`
word_frequency = Counter()


# In[ ]:


# Gather unigramCounts from documents in `filtered_id_list` if it is available

# Build the ID lookup once, as a set, so that each per-document membership
# test is a constant-time hash lookup rather than a scan of the whole list.
filtered_ids = set(filtered_id_list) if use_filtered_list is True else None

for document in tdm_client.dataset_reader(dataset_file):
    if filtered_ids is not None:
        document_id = document['id']
        # Skip documents not in our filtered_id_list
        if document_id not in filtered_ids:
            continue
    # Fall back to an empty dict so a document with missing (or null)
    # unigram counts contributes nothing instead of raising on `.items()`.
    unigrams = document.get("unigramCount") or {}
    # Counter.update() with a mapping adds its counts to the running totals.
    word_frequency.update(unigrams)

# Print success message
if use_filtered_list is True:
    print('Unigrams have been collected for documents in filtered_id_list')
else:
    print('Unigrams have been collected for all documents without filtering')


# # Find Most Common Unigrams
# Now that we have a list of the frequency of all the unigrams in our corpus, we need to sort them to find which are most common

# In[ ]:


# Show the 25 most frequent unigrams, padded into a 20-character column
for gram, count in word_frequency.most_common(25):
    print(gram.ljust(20), count)


# # Clean Up Tokens
#
# We have successfully created a word frequency list. There are a couple small issues, however, that we still need to address:
# 1.
#    There are many [function words](https://docs.tdm-pilot.org/key-terms/#function-words), words like "the", "in", and "of" that are grammatically important but do not carry as much semantic meaning as [content words](https://docs.tdm-pilot.org/key-terms/#content-words), such as nouns and verbs.
# 2. The words represented here are actually case-sensitive [strings](https://docs.tdm-pilot.org/key-terms/#string). That means that the string "the" is different from the string "The". You may notice this in your results above.
#
# To solve these issues, we need to find a way to remove common [function words](https://docs.tdm-pilot.org/key-terms/#function-words) and combine [strings](https://docs.tdm-pilot.org/key-terms/#string) that may have capital letters in them. We can solve these issues by:
#
# 1. Using a [stopwords](https://docs.tdm-pilot.org/key-terms/#stop-words) list to remove common [function words](https://docs.tdm-pilot.org/key-terms/#function-words)
# 2. Lowercasing all the characters in each string to combine our counts

# ## Load Stopwords List
#
# If you have created a stopword list in the stopwords notebook, we will import it here. (You can always modify the CSV file to add or subtract words then reload the list.) Otherwise, we'll load the NLTK [stopwords](https://docs.tdm-pilot.org/key-terms/#stop-words) list automatically.
# In[ ]:


# Load a custom data/stop_words.csv if available
# Otherwise, load the nltk stopwords list in English

# Create an empty Python list to hold the stopwords
stop_words = []

# The filename of the custom data/stop_words.csv file
stopwords_list_filename = 'data/stop_words.csv'

if os.path.exists(stopwords_list_filename):
    import csv
    # newline='' lets the csv module do its own newline handling.
    with open(stopwords_list_filename, 'r', newline='') as f:
        # The stopwords are the fields of the first row; an empty file
        # yields an empty list rather than an IndexError.
        stop_words = next(csv.reader(f), [])
    print('Custom stopwords list loaded from CSV')
else:
    # Load the NLTK stopwords list
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    print('NLTK stopwords list loaded')


# ## Apply Processing
# In addition to using a stopwords list, we will clean up the tokens by lowercasing all tokens and combining them. This will combine tokens with different capitalization such as "quarterly" and "Quarterly." We will also remove any tokens that are not purely alphabetic (for example, tokens containing digits or punctuation).

# In[ ]:


# Gather unigramCounts from documents in `filtered_id_list` if available
# and apply the processing.

transformed_word_frequency = Counter()

# Build both lookups once, as sets, so the per-token and per-document
# membership tests are hash lookups rather than scans of a list.
# Tokens are lowercased before comparison, so the stop words are lowercased
# too; otherwise a capitalized custom entry could never match.
stop_word_set = {word.lower() for word in stop_words}
filtered_ids = set(filtered_id_list) if use_filtered_list is True else None

for document in tdm_client.dataset_reader(dataset_file):
    if filtered_ids is not None:
        document_id = document['id']
        # Skip documents not in our filtered_id_list
        if document_id not in filtered_ids:
            continue
    # Fall back to an empty dict so a document with missing (or null)
    # unigram counts contributes nothing instead of raising on `.items()`.
    unigrams = document.get("unigramCount") or {}
    for gram, count in unigrams.items():
        clean_gram = gram.lower()
        # Drop stop words
        if clean_gram in stop_word_set:
            continue
        # Drop tokens containing anything other than letters
        if not clean_gram.isalpha():
            continue
        transformed_word_frequency[clean_gram] += count


# Finally, we will display the 25 most common words by using the `.most_common()` method on the `Counter()` object.

# In[ ]:


# Print the most common processed unigrams and their counts
for gram, count in transformed_word_frequency.most_common(25):
    print(gram.ljust(20), count)


# In[ ]: