# Import spacy
import spacy
# Load spaCy visualizer
from spacy import displacy
# Import pandas DataFrame packages
import pandas as pd
# Import graphing package
import plotly.graph_objects as go
import plotly.express as px
# Import drive and files to facilitate file uploads
from google.colab import files
# Select multiple text files to upload from local folder (Colab upload widget;
# returns a dict mapping filename -> raw file bytes)
uploaded_files = files.upload()
type(uploaded_files)
# Add files into DataFrame: one row per uploaded file, indexed by filename
paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')
paper_df.head()
# Reset index and add column names to make wrangling easier
paper_df = paper_df.reset_index()
paper_df.columns = ["Filename", "Text"]
paper_df.head()
# Convert papers from bytes to strings
paper_df['Text'] = paper_df['Text'].str.decode('utf-8')
paper_df.head()
# Collapse runs of whitespace to single spaces and trim the ends.
# NOTE: the regex must be a raw string (r'\s+'); '\s' in a plain string is an
# invalid escape sequence and raises a warning on recent Python versions.
paper_df['Text'] = paper_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
paper_df.head()
# Upload csv with essay metadata (Colab upload widget)
metadata = files.upload()
metadata_df = pd.read_csv('metadata.csv')
# Drop columns that are entirely empty
metadata_df = metadata_df.dropna(axis=1, how='all')
metadata_df.head()
# Remove .txt from title of each paper.
# regex=False makes the '.' literal: as a regex pattern, '.txt' would also
# match e.g. 'atxt', and older pandas versions defaulted to regex=True.
paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)
# Rename column from paper ID to Filename so it matches paper_df's key column
metadata_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)
# Merge metadata and papers into new DataFrame (inner join on Filename)
# Will only keep rows where both essay and metadata are present
final_paper_df = metadata_df.merge(paper_df,on='Filename')
# Print DataFrame
final_paper_df.head()
# --- Commented-out alternative: same workflow for a local (non-Colab) environment, kept for reference ---
# # Install and import spacy
# !pip install spaCy
# # Import spacy
# import spacy
# # Install English language model
# !spacy download en_core_web_sm
# # Import os to upload documents and metadata
# import os
# # Load spaCy visualizer
# from spacy import displacy
# # Import pandas DataFrame packages
# import pandas as pd
# # Import graphing package
# import plotly.graph_objects as go
# import plotly.express as px
# # Create empty lists for file names and contents
# texts = []
# file_names = []
# # Iterate through each file in the path
# for _file_name in os.listdir('path_to_directory'):
# # Look for only text files
# if _file_name.endswith('.txt'):
# # Append contents of each text file to text list
# texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())
# # Append name of each file to file name list
# file_names.append(_file_name)
# # Create dictionary object associating each file name with its text
# d = {'Filename':file_names,'Text':texts}
# # Turn dictionary into a dataframe
# paper_df = pd.DataFrame(d)
# paper_df.head()
# # Remove extra spaces from papers
# paper_df['Text'] = paper_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
# paper_df.head()
# metadata_df = pd.read_csv('path_to_directory/metadata.csv')
# metadata_df.head()
# # Remove .txt from title of each paper
# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)
# # Rename column from paper ID to Title
# metadata_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)
# # Merge metadata and papers into new DataFrame
# # Will only keep rows where both essay and metadata are present
# final_paper_df = metadata_df.merge(paper_df,on='Filename')
# # Print DataFrame
# final_paper_df.head()
# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')
# Show which components the pipeline runs (tagger, parser, ner, ...)
print(nlp.pipe_names)
# Define an example sentence
sentence = "This is 'an' example? sentence"
# Run the pipeline on the sentence to obtain a Doc object
doc = nlp(sentence)
# Print each token's text alongside its coarse part-of-speech tag
for token in doc:
    print(token.text, token.pos_)
# Helper that runs the full spaCy pipeline on one raw text string
def process_text(text):
    """Return the spaCy Doc produced by running `nlp` over `text`."""
    return nlp(text)
# Run the pipeline over every essay so each row gains a Doc object
final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Return the surface text of every token in `doc` as a list.

    Bug fixed: the original returned from inside the loop, so only the
    first token's text was ever produced. Now all tokens are collected,
    matching the corrected definition that follows.
    """
    return [token.text for token in doc]
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Collect the surface text of each token in `doc` into a list."""
    texts = []
    for token in doc:
        texts.append(token.text)
    return texts
# Run the token retrieval function on the doc objects in the dataframe
final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)
final_paper_df.head()
# Side-by-side view of raw text vs. extracted tokens; .copy() makes this an
# independent frame so edits here won't touch final_paper_df
tokens = final_paper_df[['Text', 'Tokens']].copy()
tokens.head()
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    """Return the lemma (dictionary form) of every token in `doc`."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas
# Run the lemma retrieval function on the doc objects in the dataframe
final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)
# Compare how often the exact token "write" appears vs. its lemma:
# lemmatization folds forms like "writes"/"wrote"/"writing" into "write",
# so the lemma count should be higher.
# (Original code concatenated str() results onto a placeholder-free f-string;
# rewritten as single f-strings producing byte-identical output.)
token_write_count = final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()
lemma_write_count = final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()
print(f'"Write" appears in the text tokens column {token_write_count} times.')
print(f'"Write" appears in the lemmas column {lemma_write_count} times.')
# Define a function to retrieve part-of-speech tags from a doc object
# (the original header comment mistakenly said "lemmas")
def get_pos(doc):
    """Return a (coarse_pos, fine_tag) tuple for every token in `doc`."""
    pairs = []
    for token in doc:
        # token.pos_ is the coarse universal POS; token.tag_ the fine-grained tag
        pairs.append((token.pos_, token.tag_))
    return pairs
# Apply the POS retrieval function to each Doc and store results in a new column
final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)
# Create a list of part of speech tags
list(final_paper_df['POS'])
# Look up spaCy's human-readable description of the fine-grained tag "IN"
spacy.explain("IN")
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    """Return the text of every token tagged as a proper noun (PROPN)."""
    nouns = []
    for token in doc:
        if token.pos_ == 'PROPN':
            nouns.append(token.text)
    return nouns
# Apply function to Doc column and store resulting proper nouns in new column
final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)
# Inspect the proper nouns found in two sample essays (rows 3 and 163)
list(final_paper_df.loc[[3, 163], 'Proper_Nouns'])
# Grab the Doc object for the essay in row 5
doc = final_paper_df['Doc'][5]
# Segment the essay into sentences and keep the first one
sentence = list(doc.sents)[0]
# Render the dependency parse of that sentence inline in the notebook
displacy.render(sentence, style="dep", jupyter=True)
# Define function that drops stopwords and returns the remaining token texts.
# NOTE: despite its name, this removes stopwords rather than extracting them;
# the name is kept so existing callers keep working.
def extract_stopwords(doc):
    """Return the text of each token in `doc` that is not a spaCy stopword."""
    kept = []
    for token in doc:
        if token.text not in nlp.Defaults.stop_words:
            kept.append(token.text)
    return kept
#Create list of tokens without stopwords
final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)
#Rejoin the remaining (non-stopword) tokens into one space-separated string per essay
final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]
#Re-run the nlp pipeline on the stopword-free text to get new Doc objects
final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)
# Take the stopword-free Doc for the essay in row 5
doc = final_paper_df['Doc_NoStops'][5]
sentences = list(doc.sents)
sentence = sentences[0]
# visualize the dependency parse tree for the sentence
displacy.render(sentence, style='dep', jupyter=True)
# Define function to extract noun phrases from Doc object
def extract_noun_phrases(doc):
    """Return the text of every noun chunk (base noun phrase) in `doc`."""
    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
    return phrases
# Apply function to Doc column and store resulting noun phrases in new column
final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)
# Inspect the noun phrases found in the first essay
final_paper_df['Noun_Phrases'][0]
# Get every named-entity label the ner pipeline component can assign
labels = nlp.get_pipe("ner").labels
# Print each label with spaCy's human-readable description
for label in labels:
    print(label + ' : ' + spacy.explain(label))
# Define function to extract named-entity labels from doc objects
def extract_named_entities(doc):
    """Return the entity label (e.g. PERSON, DATE) of each entity span in `doc`."""
    found = []
    for ent in doc.ents:
        found.append(ent.label_)
    return found
# Apply function to Doc column and store resulting named entities in new column
final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['Named_Entities']
# Define function to extract the entity spans themselves from doc objects
# (this redefinition replaces the label-only version above)
def extract_named_entities(doc):
    """Return the entity Span objects found in `doc` as a list."""
    return list(doc.ents)
# Apply function to Doc column and store resulting entity spans in new column
final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['NE_Words']
# Grab the Doc object for the essay in row 1 (the second paper)
doc = final_paper_df['Doc'][1]
# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)
# Save DataFrame as csv (in Google Drive)
# Use this step only to save csv to your computer's working directory
final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')
# Download csv to your computer from Google Drive
files.download('MICUSP_papers_with_spaCy_tags.csv')
# Create doc object from single sentence
doc = nlp("This is 'an' example? sentence")
# Print counts of each part of speech in sentence (keyed by numeric POS id)
print(doc.count_by(spacy.attrs.POS))
# Store dictionary with indexes and POS counts in a variable
num_pos = doc.count_by(spacy.attrs.POS)
dictionary = {}
# Translate each numeric POS id into its label (NOUN, VERB, ...) via the vocab
for k, v in sorted(num_pos.items()):
    dictionary[doc.vocab[k].text] = v
dictionary
# Create new DataFrame for analysis purposes.
# .copy() makes this an independent frame: without it, the column assignment
# later (pos_analysis_df['C_POS'] = ...) writes into a view of final_paper_df
# and raises pandas' SettingWithCopyWarning.
pos_analysis_df = final_paper_df[['Filename','DISCIPLINE', 'Doc']].copy()
# Create list to store each dictionary
num_list = []
# Define a function that counts coarse POS tags in a doc and appends the
# resulting {label: count} dictionary to the module-level num_list.
# Returns None; output accumulates via the side effect on num_list.
def get_pos_tags(doc):
    counts = {}
    num_pos = doc.count_by(spacy.attrs.POS)
    for k, v in sorted(num_pos.items()):
        counts[doc.vocab[k].text] = v
    num_list.append(counts)
# Apply function to each doc object in DataFrame
# (get_pos_tags returns None, so C_POS is a column of Nones; the real
# per-essay dictionaries accumulate in num_list)
pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)
# Create new dataframe with part of speech counts, one row per essay
pos_counts = pd.DataFrame(num_list)
columns = list(pos_counts.columns)
# Add discipline of each paper as the first column of the new dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)
pos_counts
# Get average part of speech counts used in papers of each discipline,
# rounded to the nearest whole number, with the index reset for readability
average_pos_df = (
    pos_counts.groupby(['DISCIPLINE'])
    .mean()
    .round(0)
    .reset_index()
)
# Show dataframe
average_pos_df
# Use plotly to plot average adjective/verb/numeral use per discipline
fig = px.bar(average_pos_df, x="DISCIPLINE", y=["ADJ", 'VERB', "NUM"], title="Average Part-of-Speech Use in Papers Written by Biology and English Students", barmode='group')
fig.show()
# Create list to store each dictionary
tag_num_list = []
# Define a function that counts fine-grained POS tags (TAG) in a doc and
# appends the resulting {tag: count} dictionary to tag_num_list.
# Returns None; output accumulates via the side effect on tag_num_list.
def get_fine_pos_tags(doc):
    counts = {}
    num_tag = doc.count_by(spacy.attrs.TAG)
    for k, v in sorted(num_tag.items()):
        counts[doc.vocab[k].text] = v
    tag_num_list.append(counts)
# Apply function to each doc object in DataFrame
# (get_fine_pos_tags returns None; results accumulate in tag_num_list)
pos_analysis_df['F_POS'] = pos_analysis_df['Doc'].apply(get_fine_pos_tags)
# Create new dataframe with fine-grained part of speech counts, one row per essay
tag_counts = pd.DataFrame(tag_num_list)
columns = list(tag_counts.columns)
# Add discipline of each paper as the first column of the new dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)
# Get average fine-grain part of speech counts per discipline, rounded to the
# nearest whole number, with the index reset for readability
average_tag_df = (
    tag_counts.groupby(['DISCIPLINE'])
    .mean()
    .round(0)
    .reset_index()
)
# Show dataframe
average_tag_df
# Use plotly to plot past/present verb tag use per discipline
fig = px.bar(average_tag_df, x="DISCIPLINE", y=["VBD", 'VBP', 'VBZ'], title="Average Verb Tense Usage Differences in Biology and English Student Writing", barmode='group')
fig.show()
# Create new DataFrame for analysis purposes.
# .copy() makes this an independent frame: without it, the column assignment
# on the next line writes into a view of final_paper_df and raises pandas'
# SettingWithCopyWarning.
ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']].copy()
# Convert named entity lists to strings so we can count specific entities
ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))
# Get the number of each type of entity in each paper.
# str.count does substring matching on the space-joined label string; the four
# labels below don't overlap each other, so the counts are unambiguous.
person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')
loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')
date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')
woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')
# Append named entity counts to new DataFrame, one row per essay
ner_counts_df = pd.DataFrame()
ner_counts_df['Genre'] = ner_analysis_df["PAPER TYPE"]
ner_counts_df['PERSON_Counts'] = person_counts
ner_counts_df['LOC_Counts'] = loc_counts
ner_counts_df['DATE_Counts'] = date_counts
ner_counts_df['WORK_OF_ART_Counts'] = woa_counts
ner_counts_df.head()
# Calculate average usage of each named entity type per genre, rounded to the
# nearest whole number, with the index reset for readability
average_ner_df = (
    ner_counts_df.groupby(['Genre'])
    .mean()
    .round(0)
    .reset_index()
)
average_ner_df
# Use plotly to plot named-entity usage per genre
fig = px.bar(average_ner_df, x="Genre", y=["PERSON_Counts", 'LOC_Counts', "DATE_Counts", 'WORK_OF_ART_Counts'], title="Average Named Entity Usage Across Student Paper Genres", barmode='group')
fig.show()
# DATE Named Entities
# (the bare name "DATE" and the fused heading above were notebook-export
# artifacts; kept here as comments so the script remains runnable)
# Define function to extract words tagged as "date" named entities from doc objects
def extract_date_named_entities(doc):
return [ent for ent in doc.ents if ent.label_ == 'DATE']
# Get all date entity spans and apply to new column of DataFrame
ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)
# Make list of date entities a comma-separated string so we can count word frequencies
ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]
# Keep only rows where some column equals 'Proposal' (i.e. proposal papers)
date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]
# Count the frequency of each whitespace-separated word across these essays
date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()
# Get top 10 most common words and their frequencies
date_word_frequencies[:10]
# Keep only rows where some column equals 'Critique/Evaluation' (i.e. critique papers)
date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]
# Count the frequency of each whitespace-separated word across these essays
date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()
# Get top 10 most common words and their frequencies
date_word_frequencies[:10]