#!/usr/bin/env python
# coding: utf-8

# In[28]:


# Random word list generator: draw words (with replacement) from a small vocabulary

from random import choices

# Vocabulary reused by the later cells that build the example dataframe
_words = [
    "hello",
    "goodbye",
    "this",
    "th@t",
    "whenever",
    "wherever",
    "cogdog",
]

# Sample ten words to preview what the generator produces
choices(_words, k=10)


# In[29]:


# Install pandas only when it cannot be imported. Output is hidden by %%capture.
# `except ImportError` keeps unrelated failures (e.g. KeyboardInterrupt) visible,
# and `%pip` installs into the environment of the running kernel.
get_ipython().run_cell_magic('capture', '', 'try:\n    import pandas as pd\nexcept ImportError:\n    %pip install pandas\n')


# In[30]:


# Build a simple two-column dataframe, each column holding 100 randomly drawn words

import pandas as pd

df = pd.DataFrame(
    dict(
        col1=choices(_words, k=100),
        # col2 draws from the same vocabulary extended with the extra word 'gotcha'
        col2=choices(_words + ['gotcha'], k=100),
    )
)

# Persist the frame as a csv file (the index is not written)...
df.to_csv('mywords.csv', index=False)

# ...or preview the first few rows
df.head()


# In[31]:


# Here's what the saved file looks like as raw csv:
# run the shell command `head` to print the first three lines of mywords.csv
get_ipython().system('head -n 3 mywords.csv')


# In[32]:


# Load the csv into another dataframe - to show the saved file can be read back
df2 = pd.read_csv('mywords.csv')
# Preview the first rows of the reloaded frame
df2.head()


# In[33]:


# Install the wordcloud package only when it cannot be imported (output hidden by %%capture).
# `except ImportError` avoids swallowing unrelated errors, and `%pip` targets the kernel's environment.
get_ipython().run_cell_magic('capture', '', '#Install wordcloud package\ntry:\n    import wordcloud\nexcept ImportError:\n    %pip install wordcloud\n')


# In[34]:


# Install matplotlib only when it cannot be imported (output hidden by %%capture).
# `except ImportError` avoids swallowing unrelated errors, and `%pip` targets the kernel's environment.
get_ipython().run_cell_magic('capture', '', 'try:\n    import matplotlib\nexcept ImportError:\n    %pip install matplotlib\n')


# In[35]:


# Required graphics package
import matplotlib.pyplot as plt
# ...and magic to display results inline in the notebook...
get_ipython().run_line_magic('matplotlib', 'inline')


# In[36]:


from wordcloud import WordCloud


def show_wordcloud(words):
    """Draw a word cloud built from an iterable of words.

    The words are joined with single spaces into one text, a ``WordCloud``
    with default settings is generated from that text, and the resulting
    image is drawn with ``plt.imshow`` using bilinear interpolation.

    Parameters
    ----------
    words : iterable of str
        The words to visualise, e.g. a dataframe column of strings.

    Returns
    -------
    The object returned by ``plt.imshow``.
    """
    # Local name `cloud` avoids rebinding `wordcloud`, which is also the package name
    cloud = WordCloud().generate(' '.join(words))
    return plt.imshow(cloud, interpolation='bilinear')


# Generate a word cloud image for the first column
show_wordcloud(df2['col1'].tolist());


# In[37]:


# ...and another one for the second column
show_wordcloud(df2['col2'].tolist());


# If a column holds longer strings (sentences, for example), they can be split into parts...
# 
# It's particularly easy if the parts are always separated by the same delimiter. For example:

# In[38]:


# Add a combined column of the form "@<col1>/<col2>" to use as splitting material
df['col3a'] = ('@' + df['col1']) + ('/' + df['col2'])
# Preview the frame with the new column
df.head()


# In[39]:


# We can split a string in a column and then expand it over a couple of columns.
# n=1 splits on the first '/' only, so exactly two parts come back;
# it is passed by keyword because newer pandas versions do not accept it positionally.
df[['col3b','col3c']] = df['col3a'].str.split('/', n=1, expand=True)
df.head()


# Want username without the `@`?

# In[40]:


# We could do a trivial replace, but we can also regex to be more precise:
# '^@' only matches an '@' at the very start of the string, so an '@' elsewhere is left alone.
# regex=True is explicit because newer pandas versions treat the pattern as a literal by default.
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html
df['col3b'] = df['col3b'].str.replace('^@', '', regex=True)
df.head()


# In[ ]: