# Import spacy
import spacy
# Load spaCy visualizer
from spacy import displacy
# Import pandas DataFrame packages
import pandas as pd
# Import graphing package
import plotly.graph_objects as go
import plotly.express as px
# Import drive and files to facilitate file uploads
from google.colab import files
# Select multiple text files to upload from local folder (Colab upload widget;
# returns a dict mapping filename -> raw file bytes)
uploaded_files = files.upload()
type(uploaded_files)
# Add files into DataFrame: one row per uploaded file, indexed by filename
paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')
paper_df.head()
# Reset index and add column names to make wrangling easier
paper_df = paper_df.reset_index()
paper_df.columns = ["Filename", "Text"]
paper_df.head()
# Convert papers from bytes to strings
paper_df['Text'] = paper_df['Text'].str.decode('utf-8')
paper_df.head()
# Collapse runs of whitespace to single spaces and trim the ends.
# NOTE: the regex must be a raw string (r'\s+'); '\s' in a plain string is an
# invalid escape sequence and raises a warning on recent Python versions.
paper_df['Text'] = paper_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
paper_df.head()
# Upload csv with essay metadata (Colab upload widget)
metadata = files.upload()
metadata_df = pd.read_csv('metadata.csv')
# Drop columns that are entirely empty
metadata_df = metadata_df.dropna(axis=1, how='all')
metadata_df.head()
# Remove .txt from title of each paper.
# regex=False makes the '.' literal: as a regex pattern, '.txt' would also
# match e.g. 'atxt', and older pandas versions defaulted to regex=True.
paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)
# Rename column from paper ID to Filename so it matches paper_df's key column
metadata_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)
# Merge metadata and papers into new DataFrame (inner join on Filename)
# Will only keep rows where both essay and metadata are present
final_paper_df = metadata_df.merge(paper_df,on='Filename')
# Print DataFrame
final_paper_df.head()
# --- Commented-out alternative: same workflow for a local (non-Colab) environment, kept for reference ---
# # Install and import spacy
# !pip install spaCy
# # Import spacy
# import spacy
# # Install English language model
# !spacy download en_core_web_sm
# # Import os to upload documents and metadata
# import os
# # Load spaCy visualizer
# from spacy import displacy
# # Import pandas DataFrame packages
# import pandas as pd
# # Import graphing package
# import plotly.graph_objects as go
# import plotly.express as px
# # Create empty lists for file names and contents
# texts = []
# file_names = []
# # Iterate through each file in the path
# for _file_name in os.listdir('path_to_directory'):
# # Look for only text files
# if _file_name.endswith('.txt'):
# # Append contents of each text file to text list
# texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())
# # Append name of each file to file name list
# file_names.append(_file_name)
# # Create dictionary object associating each file name with its text
# d = {'Filename':file_names,'Text':texts}
# # Turn dictionary into a dataframe
# paper_df = pd.DataFrame(d)
# paper_df.head()
# # Remove extra spaces from papers
# paper_df['Text'] = paper_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
# paper_df.head()
# metadata_df = pd.read_csv('path_to_directory/metadata.csv')
# metadata_df.head()
# # Remove .txt from title of each paper
# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)
# # Rename column from paper ID to Title
# metadata_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)
# # Merge metadata and papers into new DataFrame
# # Will only keep rows where both essay and metadata are present
# final_paper_df = metadata_df.merge(paper_df,on='Filename')
# # Print DataFrame
# final_paper_df.head()
# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')
# Show which components the pipeline runs (tagger, parser, ner, ...)
print(nlp.pipe_names)
# Define an example sentence
sentence = "This is 'an' example? sentence"
# Run the pipeline on the sentence to obtain a Doc object
doc = nlp(sentence)
# Print each token's text alongside its coarse part-of-speech tag
for token in doc:
    print(token.text, token.pos_)
# Helper that runs the full spaCy pipeline on one raw text string
def process_text(text):
    """Return the spaCy Doc produced by running `nlp` over `text`."""
    return nlp(text)
# Run the pipeline over every essay so each row gains a Doc object
final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Return the surface text of every token in `doc` as a list.

    Bug fixed: the original returned from inside the loop, so only the
    first token's text was ever produced. Now all tokens are collected,
    matching the corrected definition that follows.
    """
    return [token.text for token in doc]
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Collect the surface text of each token in `doc` into a list."""
    texts = []
    for token in doc:
        texts.append(token.text)
    return texts
# Run the token retrieval function on the doc objects in the dataframe
final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)
final_paper_df.head()
# Side-by-side view of raw text vs. extracted tokens; .copy() makes this an
# independent frame so edits here won't touch final_paper_df
tokens = final_paper_df[['Text', 'Tokens']].copy()
tokens.head()
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    """Return the lemma (dictionary form) of every token in `doc`."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas
# Run the lemma retrieval function on the doc objects in the dataframe
final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)
# Compare how often the exact token "write" appears vs. its lemma:
# lemmatization folds forms like "writes"/"wrote"/"writing" into "write",
# so the lemma count should be higher.
# (Original code concatenated str() results onto a placeholder-free f-string;
# rewritten as single f-strings producing byte-identical output.)
token_write_count = final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()
lemma_write_count = final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()
print(f'"Write" appears in the text tokens column {token_write_count} times.')
print(f'"Write" appears in the lemmas column {lemma_write_count} times.')
# Define a function to retrieve part-of-speech tags from a doc object
# (the original header comment mistakenly said "lemmas")
def get_pos(doc):
    """Return a (coarse_pos, fine_tag) tuple for every token in `doc`."""
    pairs = []
    for token in doc:
        # token.pos_ is the coarse universal POS; token.tag_ the fine-grained tag
        pairs.append((token.pos_, token.tag_))
    return pairs
# Apply the POS retrieval function to each Doc and store results in a new column
final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)
# Create a list of part of speech tags
list(final_paper_df['POS'])
# Look up spaCy's human-readable description of the fine-grained tag "IN"
spacy.explain("IN")
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    """Return the text of every token tagged as a proper noun (PROPN)."""
    nouns = []
    for token in doc:
        if token.pos_ == 'PROPN':
            nouns.append(token.text)
    return nouns
# Apply function to Doc column and store resulting proper nouns in new column
final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)
# Inspect the proper nouns found in two sample essays (rows 3 and 163)
list(final_paper_df.loc[[3, 163], 'Proper_Nouns'])
# Grab the Doc object for the essay in row 5
doc = final_paper_df['Doc'][5]
# Segment the essay into sentences and keep the first one
sentence = list(doc.sents)[0]
# Render the dependency parse of that sentence inline in the notebook
displacy.render(sentence, style="dep", jupyter=True)
# Define function that drops stopwords and returns the remaining token texts.
# NOTE: despite its name, this removes stopwords rather than extracting them;
# the name is kept so existing callers keep working.
def extract_stopwords(doc):
    """Return the text of each token in `doc` that is not a spaCy stopword."""
    kept = []
    for token in doc:
        if token.text not in nlp.Defaults.stop_words:
            kept.append(token.text)
    return kept
#Create list of tokens without stopwords
final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)
#Rejoin the remaining (non-stopword) tokens into one space-separated string per essay
final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]
#Re-run the nlp pipeline on the stopword-free text to get new Doc objects
final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)
# Take the stopword-free Doc for the essay in row 5
doc = final_paper_df['Doc_NoStops'][5]
sentences = list(doc.sents)
sentence = sentences[0]
# visualize the dependency parse tree for the sentence
displacy.render(sentence, style='dep', jupyter=True)
# Define function to extract noun phrases from Doc object
def extract_noun_phrases(doc):
    """Return the text of every noun chunk (base noun phrase) in `doc`."""
    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
    return phrases
# Apply function to Doc column and store resulting noun phrases in new column
final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)
# Inspect the noun phrases found in the first essay
final_paper_df['Noun_Phrases'][0]
# Get every named-entity label the ner pipeline component can assign
labels = nlp.get_pipe("ner").labels
# Print each label with spaCy's human-readable description
for label in labels:
    print(label + ' : ' + spacy.explain(label))
# Define function to extract named-entity labels from doc objects
def extract_named_entities(doc):
    """Return the entity label (e.g. PERSON, DATE) of each entity span in `doc`."""
    found = []
    for ent in doc.ents:
        found.append(ent.label_)
    return found
# Apply function to Doc column and store resulting named entities in new column
final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['Named_Entities']
# Define function to extract the entity spans themselves from doc objects
# (this redefinition replaces the label-only version above)
def extract_named_entities(doc):
    """Return the entity Span objects found in `doc` as a list."""
    return list(doc.ents)
# Apply function to Doc column and store resulting entity spans in new column
final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['NE_Words']
# Grab the Doc object for the essay in row 1 (the second paper)
doc = final_paper_df['Doc'][1]
# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)
# Save DataFrame as csv (in Google Drive)
# Use this step only to save csv to your computer's working directory
final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')
# Download csv to your computer from Google Drive
files.download('MICUSP_papers_with_spaCy_tags.csv')
# Create doc object from single sentence
doc = nlp("This is 'an' example? sentence")
# Print counts of each part of speech in sentence (keyed by numeric POS id)
print(doc.count_by(spacy.attrs.POS))
# Store dictionary with indexes and POS counts in a variable
num_pos = doc.count_by(spacy.attrs.POS)
dictionary = {}
# Translate each numeric POS id into its label (NOUN, VERB, ...) via the vocab
for k, v in sorted(num_pos.items()):
    dictionary[doc.vocab[k].text] = v
dictionary
# Create new DataFrame for analysis purposes.
# .copy() makes this an independent frame: without it, the column assignment
# later (pos_analysis_df['C_POS'] = ...) writes into a view of final_paper_df
# and raises pandas' SettingWithCopyWarning.
pos_analysis_df = final_paper_df[['Filename','DISCIPLINE', 'Doc']].copy()
# Create list to store each dictionary
num_list = []
# Define a function that counts coarse POS tags in a doc and appends the
# resulting {label: count} dictionary to the module-level num_list.
# Returns None; output accumulates via the side effect on num_list.
def get_pos_tags(doc):
    counts = {}
    num_pos = doc.count_by(spacy.attrs.POS)
    for k, v in sorted(num_pos.items()):
        counts[doc.vocab[k].text] = v
    num_list.append(counts)
# Apply function to each doc object in DataFrame
# (get_pos_tags returns None, so C_POS is a column of Nones; the real
# per-essay dictionaries accumulate in num_list)
pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)
# Create new dataframe with part of speech counts, one row per essay
pos_counts = pd.DataFrame(num_list)
columns = list(pos_counts.columns)
# Add discipline of each paper as the first column of the new dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)
pos_counts
# Get average part of speech counts used in papers of each discipline,
# rounded to the nearest whole number, with the index reset for readability
average_pos_df = (
    pos_counts.groupby(['DISCIPLINE'])
    .mean()
    .round(0)
    .reset_index()
)
# Show dataframe
average_pos_df
# Use plotly to plot average adjective/verb/numeral use per discipline
fig = px.bar(average_pos_df, x="DISCIPLINE", y=["ADJ", 'VERB', "NUM"], title="Average Part-of-Speech Use in Papers Written by Biology and English Students", barmode='group')
fig.show()
# Create list to store each dictionary
tag_num_list = []
# Define a function that counts fine-grained POS tags (TAG) in a doc and
# appends the resulting {tag: count} dictionary to tag_num_list.
# Returns None; output accumulates via the side effect on tag_num_list.
def get_fine_pos_tags(doc):
    counts = {}
    num_tag = doc.count_by(spacy.attrs.TAG)
    for k, v in sorted(num_tag.items()):
        counts[doc.vocab[k].text] = v
    tag_num_list.append(counts)
# Apply function to each doc object in DataFrame
# (get_fine_pos_tags returns None; results accumulate in tag_num_list)
pos_analysis_df['F_POS'] = pos_analysis_df['Doc'].apply(get_fine_pos_tags)
# Create new dataframe with fine-grained part of speech counts, one row per essay
tag_counts = pd.DataFrame(tag_num_list)
columns = list(tag_counts.columns)
# Add discipline of each paper as the first column of the new dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)
# Get average fine-grain part of speech counts per discipline, rounded to the
# nearest whole number, with the index reset for readability
average_tag_df = (
    tag_counts.groupby(['DISCIPLINE'])
    .mean()
    .round(0)
    .reset_index()
)
# Show dataframe
average_tag_df
# Use plotly to plot past/present verb tag use per discipline
fig = px.bar(average_tag_df, x="DISCIPLINE", y=["VBD", 'VBP', 'VBZ'], title="Average Verb Tense Usage Differences in Biology and English Student Writing", barmode='group')
fig.show()
# Create new DataFrame for analysis purposes.
# .copy() makes this an independent frame: without it, the column assignment
# on the next line writes into a view of final_paper_df and raises pandas'
# SettingWithCopyWarning.
ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']].copy()
# Convert named entity lists to strings so we can count specific entities
ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))
# Get the number of each type of entity in each paper.
# str.count does substring matching on the space-joined label string; the four
# labels below don't overlap each other, so the counts are unambiguous.
person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')
loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')
date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')
woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')
# Append named entity counts to new DataFrame, one row per essay
ner_counts_df = pd.DataFrame()
ner_counts_df['Genre'] = ner_analysis_df["PAPER TYPE"]
ner_counts_df['PERSON_Counts'] = person_counts
ner_counts_df['LOC_Counts'] = loc_counts
ner_counts_df['DATE_Counts'] = date_counts
ner_counts_df['WORK_OF_ART_Counts'] = woa_counts
ner_counts_df.head()
# Calculate average usage of each named entity type per genre, rounded to the
# nearest whole number, with the index reset for readability
average_ner_df = (
    ner_counts_df.groupby(['Genre'])
    .mean()
    .round(0)
    .reset_index()
)
average_ner_df
# Use plotly to plot named-entity usage per genre
fig = px.bar(average_ner_df, x="Genre", y=["PERSON_Counts", 'LOC_Counts', "DATE_Counts", 'WORK_OF_ART_Counts'], title="Average Named Entity Usage Across Student Paper Genres", barmode='group')
fig.show()
# DATE Named Entities
# (the bare name "DATE" and the fused heading above were notebook-export
# artifacts; kept here as comments so the script remains runnable)
# Define function to extract words tagged as "date" named entities from doc objects
def extract_date_named_entities(doc):
return [ent for ent in doc.ents if ent.label_ == 'DATE']
# Get all date entity spans and apply to new column of DataFrame
ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)
# Make list of date entities a comma-separated string so we can count word frequencies
ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]
# Keep only rows where some column equals 'Proposal' (i.e. proposal papers)
date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]
# Count the frequency of each whitespace-separated word across these essays
date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()
# Get top 10 most common words and their frequencies
date_word_frequencies[:10]
# Keep only rows where some column equals 'Critique/Evaluation' (i.e. critique papers)
date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]
# Count the frequency of each whitespace-separated word across these essays
date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()
# Get top 10 most common words and their frequencies
date_word_frequencies[:10]