import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import numpy as np
# Story definition
from common_variables import *
from common_text_analysis import story2set, ps
# All functions for text analysis are in this script, load when code is run
%load_ext autoreload
%autoreload 1
%aimport common_functions
from common_data_processing import *
# Read the data
exp = pd.read_csv(path_final_data)

# Separate into chain and network.
# Both selections are explicit copies, so the column added below (and anything
# done to the frames afterwards) acts on independent frames rather than on an
# ambiguous slice of `exp`.
chain_exp = exp.loc[exp["network_type"] == "chain"].copy()
# "replication" is not unique, so "transmission_id" is used as the ID instead
chain_exp["replication"] = chain_exp["transmission_id"]
network_exp = exp.loc[exp["network_type"] == "network"].copy()

# Clean data. `process_file` is an imported helper; it is handed the frame, the
# path of the cleaned file, and a flag telling it whether the frame holds chains.
process_file(chain_exp, f'{path_data_files}/data_processing/cleaned_chain_exp.csv', chain=True)
process_file(network_exp, f'{path_data_files}/data_processing/cleaned_network_exp.csv', chain=False)
# Reload the cleaned, tab-separated files and label each one with its condition
network_exp = pd.read_csv(
    f"{path_data_files}/data_processing/cleaned_network_exp.csv", sep="\t"
).assign(condition="Network")
chain_exp = pd.read_csv(
    f"{path_data_files}/data_processing/cleaned_chain_exp.csv", sep="\t"
).assign(condition="Chain")

# Stack the network rows on top of the chain rows with a fresh 0..n-1 index,
# then order by condition (descending, so "Network" comes before "Chain"),
# replicate and layer.
results = (
    pd.concat([network_exp, chain_exp])
    .reset_index(drop=True)
    .sort_values(by=["condition", "rep", "layer_n"], ascending=[False, True, True])
)
# Manual inspection (+ viz) to see if bots were able to skip the bot detection.
# The replicates listed here were found that way: show their stories once,
# then drop them. The row count is printed before and after the removal.
bot_reps = {"44-1", "4-0", "0-1"}
is_bot_rep = results["rep"].isin(bot_reps)

print(len(results))
display(results.loc[is_bot_rep, ["layer_n", "rep", "story_merged"]].values)
results = results.loc[~is_bot_rep]
print(len(results))
864
array([[1, '0-1', 'Its long been known that drawing something helps a person remember it. a new study shows that drawing is superior to activities such as reading or writing because it forces the person to process information in multiple ways; visually, kinesthetically, and semantically. across a series of experiments, researchers found drawing information to be a powerful way to boost memory, increasing recall by nearly double. Myra Fernandes, Jeffrey Warms, and Melissa Meade are experts in the science of memory how people encode, retain, and recall information. at the university of waterloo, they conducted experiments to better understand how activities such as writing, looking at pictures, listening to lectures, drawing, and visualizing images affect a students ability to remember information. in an early experiment, they asked undergraduate students to study lists of common terms words like truck and pear and then either writes down or illustrate those words.'], [2, '0-1', 'Drawing something helps people remember it. Drawing is better than reading or writing because it causes a person to process information in many different ways. Drawing has been found to increase memory and recall. Researchers conducted experiments with university students where they asked them to process information in different ways, (reading, writing, drawing, and looking at pictures).'], [3, '0-1', "By drawing certain things, people can remember them better than if they were to read or write it. 
Based on research and experiments, drawing has been shown to increase people's ability to remember things."], [4, '0-1', 'by drawing things, people can better remember things than reading and writing.'], [5, '0-1', 'By drawing, people can better remember what they read in a text'], [6, '0-1', 'Taking the time to draw out the details of a person makes it easier to remember the details in their features for the future.'], [1, '4-0', "During the history of mankind, people did not live a long life and that it was because of events that caused a premature death such as being attacked by wild animals or dead causing injuries during the industrial revolution. With time and help of science there was the invention of penicillin or antibiotics that helped fight against infections. Population started living longer, however there is a resistance against antibiotics as there is a continuous race to develop antibiotics for each infection causing bacteria. With time the more the antibiotics are used the more trained is getting the bacteria to fight it. If we don't stop taking antibiotics for common infections that could be easily be treated with regular medication the antibiotics are going to be useless. Just like the fight against the global warming and climate change, we need to take those same small steps to fight for antibiotics against the infection causing bacteria."], [2, '4-0', 'The article was about humans living longer over time due to the development of antibiotics to fight bacterial infections. The article says antibiotics are being overused when there are other medications to fight infection and, therefore, the antibiotics are growing less effective and we need to fight to combat their overuse when unneeded.'], [3, '4-0', 'The article was about how antibiotics are growing resistant to our immune systems and we need to use other options to help fight off infections. 
'], [4, '4-0', "There is no way I can reconstruct an article based on a single sentence summary of it as presented on the previous page. I understand it was about antibiotic resistant infections or something like that, but I have almost no useful knowledge of the article itself. Who wrote it, what experts they spoke to, when it was written, anything that would enable me to do more than repeat the summary as I have just done is completely missing. This is wholly absurd and frankly a waste of everyone's time. I hope you're happy, because I'm not."], [5, '4-0', 'I cannot reconstruct the original article with the given information, I can only reiterate that the previous participant complained about not being able to summarize the article.'], [6, '4-0', 'I cannot duplicate the original article, but can only recite what I remember. '], [1, '44-1', 'Antibiotics such as penicillin have saved many lives since they have been discovered. Infections that were once a death sentence were able to be cured. But over time, bacteria has developed resistance to these drugs, and once again, infections are becoming more dangerous. The drugs that we used to treat these infections are becoming less effective.'], [2, '44-1', 'Drugs like Penacilin are used to cure ilnesses that were once a death sentence. However bacteria has become more resistent making the drugs less effective. These medications that once worked are now not. '], [3, '44-1', 'Penicillin is used to treat infections, but they become more resistant to the antibiotic due to misuse and overuse.'], [4, '44-1', 'Penicillin is used to treat infections but it can become less effective with a lot of use or misuse. '], [5, '44-1', 'it is effective for experiments'], [6, '44-1', 'it is effective for results']], dtype=object)
846
# Persist the cleaned, combined data so the next analysis can load it.
# `to_csv` runs with its defaults, so the DataFrame index is written as well.
combined_data_path = f'{path_data_files}/data_final/cleaned_combined_data.csv'
results.to_csv(combined_data_path)
# Quick look at the first rows
results.head()
layer_n | rep | story1 | story2 | story3 | story_merged | condition | |
---|---|---|---|---|---|---|---|
0 | 1 | 1 | Through history, most people didn't die of can... | Through history, most people didn't die of can... | Through history, most people didn't die of can... | People didn't use to die of heart disease and ... | Network |
1 | 1 | 1 | Through history, most people didn't die of can... | Through history, most people didn't die of can... | Through history, most people didn't die of can... | People didn't use to die or heart disease or c... | Network |
2 | 1 | 1 | Through history, most people didn't die of can... | Through history, most people didn't die of can... | Through history, most people didn't die of can... | Alexander Fleming discovered penicillin in 192... | Network |
3 | 2 | 1 | People didn't use to die of heart disease and ... | People didn't use to die or heart disease or c... | Alexander Fleming discovered penicillin in 192... | Before the discovery of lifesaving antibiotics... | Network |
4 | 2 | 1 | People didn't use to die of heart disease and ... | People didn't use to die or heart disease or c... | Alexander Fleming discovered penicillin in 192... | People didn't used to die of cancer or heart d... | Network |
# Embed the stories, then project the embeddings.
# NOTE(review): both helpers are imported from elsewhere. Judging by the
# recorded warnings they run t-SNE and UMAP and print a 2-column shape for
# each - confirm against their definitions.
story_embeddings_path = f"{path_text_embeddings}/story_embeddings"
emb = create_embeddings(
    results,
    transformer_model=transformer_model,
    path=story_embeddings_path,
)

projection_path_prefix = f"{path_text_embeddings}/X_story_embedded_"
project_embeddings(emb, path=projection_path_prefix)
/Users/garci061/miniforge3/envs/rumor/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn(
(847, 2)
/Users/garci061/miniforge3/envs/rumor/lib/python3.9/site-packages/umap/umap_.py:1943: UserWarning: n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.") OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
(847, 2)
# Create dataframe to calculate frequencies
# original story as set of words (stemmed)
set_story_original = set(story2set(story_original, stop_words=eng_stopwords))
set_story_original = {ps.stem(word) for word in set_story_original}


def _stem_and_filter(story):
    """Return the stemmed words of `story` that also occur in the original story.

    `story2set` is called with ``create_set=False`` so repeated words are kept
    (its result is iterated as a sequence of words). Each word is stemmed with
    `ps`, and only stems present in `set_story_original` are returned, as a list.
    """
    words = story2set(story, create_set=False, stop_words=eng_stopwords)
    stems = [ps.stem(word) for word in words]
    return [stem for stem in stems if stem in set_story_original]


# create sets and stem words of the transmitted stories
story_columns = ["story1", "story2", "story3", "story_merged"]
# .copy(): the story columns are overwritten below, so work on an independent
# frame instead of a column selection of `results`.
results_prob = results[story_columns + ["layer_n", "rep", "condition"]].copy()
for s in story_columns:
    # one pass per column: split into words -> stem -> keep original-story stems
    results_prob[s] = results_prob[s].apply(_stem_and_filter)
# tidy text structure (one word|replicate|condition per row)
# Explicit import: `Counter` is otherwise only in scope if a star import provides it.
from collections import Counter

all_persons = []
for _, row in results_prob.iterrows():
    # The three stories this participant read (lists of stemmed words)
    read_stories = [row["story1"], row["story2"], row["story3"]]

    # Total occurrences of each word over the three stories
    word_counts = Counter(word for story in read_stories for word in story)
    if not word_counts:
        # None of the stories contains a word of the original story: nothing to record
        continue

    # Number of stories (out of three) in which each word occurs at least once.
    # It has exactly the same keys as `word_counts`, so it cannot be empty here.
    story_counts = Counter(word for story in read_stories for word in set(story))

    # Number of words read. For chains and for the first layer the total is
    # divided by 3 - presumably because story1-3 hold the same single story
    # there (TODO confirm against the data-processing step).
    n_words_read = sum(len(story) for story in read_stories)
    if row["condition"] == "Chain" or row["layer_n"] == 1:
        n_words_read /= 3

    # One row per distinct word, in order of first appearance
    person = pd.DataFrame({
        "word": list(word_counts),
        "number_observed": list(word_counts.values()),
        "number_stories_observed": [story_counts[word] for word in word_counts],
    })
    # A word counts as transmitted when it appears in the story the participant wrote
    person["transmitted"] = person["word"].isin(set(row["story_merged"]))
    person["number_words_read"] = n_words_read
    person["condition"] = row["condition"]
    person["layer_n"] = row["layer_n"]
    person["rep"] = row["rep"]
    all_persons.append(person)

# Number of participants that contributed at least one word
print(len(all_persons))
all_persons = pd.concat(all_persons)
all_persons["transmitted"] = all_persons["transmitted"].astype(int)
# First-layer rows of the network condition are relabelled as "Chain"
is_first_network_layer = (all_persons["condition"] == "Network") & (all_persons["layer_n"] == 1)
all_persons.loc[is_first_network_layer, "condition"] = "Chain"
# make mean, and fraction of stories. The columns are replaced as a whole so
# the integer counts become floats without relying on in-place upcasting via .loc.
for col in ["number_observed", "number_stories_observed"]:
    all_persons[col] = all_persons[col] / 3
len(all_persons)
846
45283
# Persist the word-level transmission table so the next analysis can load it.
# `to_csv` runs with its defaults, so the DataFrame index is written as well.
word_level_path = f'{path_data_files}/data_processing/transmissions_word_level.csv'
all_persons.to_csv(word_level_path)