#!/usr/bin/env python
# coding: utf-8

# ## Step 1: Install required libraries for this task
# ### ONLY install once, next time, just run from Cell [2]

# In[ ]:


## Run this cell to install the libraries needed
# you only need to install ONCE

get_ipython().system('pip install fuzzywuzzy')
get_ipython().system('pip install python-Levenshtein')


# ### Step 2: Import necessary libraries

# In[3]:


import pandas as pd
from fuzzywuzzy import fuzz


# ### Step 3: before you start, please make sure the column is named 'company' in both Excel lists. 
# 
# ### Edit the paths below, tell the programme which TWO excel lists you want to check against

# In[2]:


## Bring in the 2 lists that you need to compare
## Turn the lists into 2 dataframes

system_list = pd.read_excel(r'C:\Users\hyyiu\Desktop\similar_names\companies_on_jb\test\system_companies.xlsx')
new_list = pd.read_excel(r'C:\Users\hyyiu\Desktop\similar_names\companies_on_jb\test\new_companies.xlsx')


# In[3]:


## Preview the head rows of System List
system_list.head()


# In[4]:


## Preview the head rows of New List
new_list.head()


# ### Step 4 - Edit the similarity ratio (out of 100%) you want below before running the cell. When it finishes, a new Excel file will be saved to the same folder path.
# 
# #### Please make sure the column name is always 'company' in both Excel lists.

# In[5]:


## Please make sure the column name is always 'company' in both excel lists.

x = np.array(np.meshgrid(system_list['company'].values, new_list['company'].values)).T.reshape(-1,2)
checked_names_df = pd.DataFrame(x)

checked_names_df.columns = ['System List', 'New List']

checked_names_df['Ratio'] = [fuzz.ratio(*i) for i in map(tuple, x)]


print (checked_names_df[checked_names_df.Ratio > 80]) # <---- UPDATE THE RATIO YOU WANT HERE!

checked_names_df[checked_names_df.Ratio > 80].to_excel('results.xlsx') # <---- UPDATE THE RATIO YOU WANT HERE!


# In[ ]: