#Random word list generator
from random import choices

#Vocabulary that the examples below sample from
_words = [
    'hello',
    'goodbye',
    'this',
    'th@t',
    'whenever',
    'wherever',
    'cogdog',
]
#Draw ten words at random, with replacement, so repeats are expected
choices(_words, k=10)
['th@t', 'this', 'goodbye', 'cogdog', 'th@t', 'whenever', 'goodbye', 'whenever', 'th@t', 'wherever']
%%capture
try:
import pandas as pd
except:
!pip install pandas
#Create a simple dataframe with two random word lists
import pandas as pd
#Each column holds 100 random draws; col2 samples from the same words plus 'gotcha',
#so 'gotcha' can only ever appear in col2
df = pd.DataFrame({'col1':choices(_words, k=100), 'col2':choices(_words+['gotcha'], k=100)})
#We can save the data to a csv file...
#(index=False leaves the row index out of the file)
df.to_csv('mywords.csv', index=False)
#Or preview it
df.head()
col1 | col2 | |
---|---|---|
0 | th@t | hello |
1 | whenever | th@t |
2 | th@t | goodbye |
3 | cogdog | whenever |
4 | wherever | th@t |
#Here's what it looks like as csv
#(shell command: print the first 3 lines of the file - the header plus two rows)
!head -n 3 mywords.csv
col1,col2 th@t,hello whenever,th@t
#Load the csv into another dataframe - to show we can
df2 = pd.read_csv('mywords.csv')
#Preview the reloaded data; it should match the preview of df
df2.head()
col1 | col2 | |
---|---|---|
0 | th@t | hello |
1 | whenever | th@t |
2 | th@t | goodbye |
3 | cogdog | whenever |
4 | wherever | th@t |
%%capture
#Install wordcloud package
try:
import wordcloud
except:
!pip install wordcloud
%%capture
try:
import matplotlib
except:
!pip install matplotlib
#Required graphics package
import matplotlib.pyplot as plt
#...and magic to display results inline in the notebook...
%matplotlib inline
from wordcloud import WordCloud
# Generate a word cloud image from col1; the column's words are joined into one space-separated string
# NOTE: this assignment rebinds the name `wordcloud` to the generated cloud object
wordcloud = WordCloud().generate(' '.join(df2['col1'].tolist()))
# The trailing ';' keeps the notebook from echoing imshow's return value
plt.imshow(wordcloud, interpolation='bilinear');
# Same again for col2
wordcloud = WordCloud().generate(' '.join(df2['col2'].tolist()))
plt.imshow(wordcloud, interpolation='bilinear');
If you have sentences, they can be split...
It's particularly easy if the split is regular. For example:
#Build a combined string of the form @<col1>/<col2> to practise splitting on
df['col3a'] = '@'+df['col1']+ '/' + df['col2']
df.head()
col1 | col2 | col3a | |
---|---|---|---|
0 | th@t | hello | @th@t/hello |
1 | whenever | th@t | @whenever/th@t |
2 | th@t | goodbye | @th@t/goodbye |
3 | cogdog | whenever | @cogdog/whenever |
4 | wherever | th@t | @wherever/th@t |
#We can split a string in a column and then expand it over a couple of columns
#n=1 splits on the first '/' only, so each value yields at most two parts.
#n is passed by keyword because pandas 2.0+ accepts only the pattern positionally
df[['col3b','col3c']] = df['col3a'].str.split('/', n=1, expand=True)
df.head()
col1 | col2 | col3a | col3b | col3c | |
---|---|---|---|---|---|
0 | th@t | hello | @th@t/hello | @th@t | hello |
1 | whenever | th@t | @whenever/th@t | @whenever | th@t |
2 | th@t | goodbye | @th@t/goodbye | @th@t | goodbye |
3 | cogdog | whenever | @cogdog/whenever | @cogdog | whenever |
4 | wherever | th@t | @wherever/th@t | @wherever | th@t |
Want the username without the @?
#We could do a trivial replace, but we can also regex to be more precise
#https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html
#regex=True is required: pandas 2.0+ treats the pattern as a literal string by default,
#in which case '^@' would match nothing and the leading @ would stay in place
df['col3b'] = df['col3b'].str.replace(r'^@', '', regex=True)
df.head()
col1 | col2 | col3a | col3b | col3c | |
---|---|---|---|---|---|
0 | th@t | hello | @th@t/hello | th@t | hello |
1 | whenever | th@t | @whenever/th@t | whenever | th@t |
2 | th@t | goodbye | @th@t/goodbye | th@t | goodbye |
3 | cogdog | whenever | @cogdog/whenever | cogdog | whenever |
4 | wherever | th@t | @wherever/th@t | wherever | th@t |