#!/usr/bin/env python
# coding: utf-8

# Notebook export: builds two columns of randomly chosen words, round-trips
# them through a CSV file, draws a word cloud for each column, and then shows
# how to split and clean string columns with the pandas `.str` accessor.
#
# NOTE: get_ipython() is not defined in this file; it is supplied by
# IPython/Jupyter, so the script has to be run from within that environment.

# In[28]:


# Random word list generator
from random import choices

_words = ["hello", "goodbye", "this", "th@t", 'whenever', 'wherever', 'cogdog']
# Sample ten words (with replacement) as a quick demonstration.
choices(_words, k=10)


# In[29]:


# Install pandas only when it cannot be imported. Catch ImportError rather
# than using a bare `except:` so that Ctrl-C and unrelated failures are not
# silently turned into a pip install.
get_ipython().run_cell_magic('capture', '', 'try:\n import pandas as pd\nexcept ImportError:\n !pip install pandas \n')


# In[30]:


# Create a simple dataframe with two random word lists
import pandas as pd

df = pd.DataFrame({'col1': choices(_words, k=100),
                   'col2': choices(_words + ['gotcha'], k=100)})

# We can save the data to a csv file...
df.to_csv('mywords.csv', index=False)

# Or preview it
df.head()


# In[31]:


# Here's what it looks like as csv
get_ipython().system('head -n 3 mywords.csv')


# In[32]:


# Load the csv into another dataframe - to show we can
df2 = pd.read_csv('mywords.csv')
df2.head()


# In[33]:


get_ipython().run_cell_magic('capture', '', '#Install wordcloud package\ntry:\n import wordcloud\nexcept ImportError:\n !pip install wordcloud\n')


# In[34]:


get_ipython().run_cell_magic('capture', '', 'try:\n import matplotlib\nexcept ImportError:\n !pip install matplotlib\n')


# In[35]:


# Required graphics package
import matplotlib.pyplot as plt

# ...and magic to display results inline in the notebook...
get_ipython().run_line_magic('matplotlib', 'inline')


# In[36]:


from wordcloud import WordCloud

# Generate a word cloud image from all the words in col1, joined by spaces.
wordcloud = WordCloud().generate(' '.join(df2['col1'].tolist()))
# The trailing semicolon keeps the notebook from echoing the return value.
plt.imshow(wordcloud, interpolation='bilinear');


# In[37]:


# Same again for col2.
wordcloud = WordCloud().generate(' '.join(df2['col2'].tolist()))
plt.imshow(wordcloud, interpolation='bilinear');


# If you have sentences, they can be split...
#
# It's particularly easy if the split is regular. For example:

# In[38]:


# Build a "@<col1>/<col2>" string column to practise splitting on.
df['col3a'] = '@' + df['col1'] + '/' + df['col2']
df.head()


# In[39]:


# We can split a string in a column and then expand it over a couple of columns.
# `n` is passed by keyword: recent pandas releases accept only `pat`
# positionally, and n=1 guarantees at most two resulting columns.
df[['col3b', 'col3c']] = df['col3a'].str.split('/', n=1, expand=True)
df.head()


# Want username without the `@`?

# In[40]:


# We could do a trivial replace, but we can also regex to be more precise
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.replace.html
# regex=True is explicit because newer pandas defaults to literal matching,
# which would leave the leading '@' in place; '^' anchors the match so that
# only a leading '@' is removed (e.g. '@th@t' -> 'th@t').
df['col3b'] = df['col3b'].str.replace(r'^@', '', regex=True)
df.head()


# In[ ]: