#!/usr/bin/env python # coding: utf-8 # ## Step 1: Install required libraries for this task # ### ONLY install once, next time, just run from Cell [2] # In[ ]: ## Run this cell to install the libraries needed # you only need to install ONCE get_ipython().system('pip install fuzzywuzzy') get_ipython().system('pip install python-Levenshtein') # ### Step 2: Import necessary libraries # In[3]: import pandas as pd from fuzzywuzzy import fuzz # ### Step 3: before you start, please make sure the column is named 'company' in both Excel lists. # # ### Edit the paths below, tell the programme which TWO excel lists you want to check against # In[2]: ## Bring in the 2 lists that you need to compare ## Turn the lists into 2 dataframes system_list = pd.read_excel(r'C:\Users\hyyiu\Desktop\similar_names\companies_on_jb\test\system_companies.xlsx') new_list = pd.read_excel(r'C:\Users\hyyiu\Desktop\similar_names\companies_on_jb\test\new_companies.xlsx') # In[3]: ## Preview the head rows of System List system_list.head() # In[4]: ## Preview the head rows of New List new_list.head() # ### Step 4 - Edit the similarity ratio (out of 100%) you want below before running the cell. When it finishes, a new Excel file will be saved to the same folder path. # # #### Please make sure the column name is always 'company' in both Excel lists. # In[5]: ## Please make sure the column name is always 'company' in both excel lists. x = np.array(np.meshgrid(system_list['company'].values, new_list['company'].values)).T.reshape(-1,2) checked_names_df = pd.DataFrame(x) checked_names_df.columns = ['System List', 'New List'] checked_names_df['Ratio'] = [fuzz.ratio(*i) for i in map(tuple, x)] print (checked_names_df[checked_names_df.Ratio > 80]) # <---- UPDATE THE RATIO YOU WANT HERE! checked_names_df[checked_names_df.Ratio > 80].to_excel('results.xlsx') # <---- UPDATE THE RATIO YOU WANT HERE! # In[ ]: