#!/usr/bin/env python
# coding: utf-8
"""Seed data for sorting header text into topic groups.

Loads ``header.csv`` and builds, for each topic (citation, run,
installation, ...), a list of WordNet synsets that act as reference
senses for that topic.
"""

# In[3]:

import pandas as pd
from textblob import Word

# In[4]:

# The rest of the script reads the "Header" column of this file.
headers = pd.read_csv("header.csv")

# In[9]:

# Each list holds the reference senses for one topic group.
# NOTE(review): the numeric indices pick individual senses out of
# ``Word(...).synsets``; presumably they were chosen by hand against a
# specific WordNet release -- confirm before changing the corpus version.
citation = [
    Word("citation").synsets[2],
    Word("reference").synsets[1],
    Word("cite").synsets[3],
]
run = [
    Word("run").synsets[9],
    Word("run").synsets[34],
    Word("execute").synsets[4],
]
install = [
    Word("installation").synsets[0],
    Word("install").synsets[0],
    Word("setup").synsets[1],
    Word("prepare").synsets[0],
    Word("preparation").synsets[0],
    Word("manual").synsets[0],
    Word("guide").synsets[2],
    Word("guide").synsets[9],
]
download = [Word("download").synsets[0]]
requirement = [
    Word("requirement").synsets[2],
    Word("prerequisite").synsets[0],
    Word("prerequisite").synsets[1],
    Word("dependency").synsets[0],
    Word("dependent").synsets[0],
]
contact = [Word("contact").synsets[9]]
description = [
    Word("description").synsets[0],
    Word("description").synsets[1],
    Word("introduction").synsets[3],
    Word("introduction").synsets[6],
    Word("basics").synsets[0],
    Word("initiation").synsets[1],
    Word("start").synsets[0],
    Word("start").synsets[4],
    Word("started").synsets[0],
    Word("started").synsets[1],
    Word("started").synsets[7],
    Word("started").synsets[8],
    Word("overview").synsets[0],
    Word("summary").synsets[0],
    Word("summary").synsets[2],
]
contributor = [Word("contributor").synsets[0]]
documentation = [Word("documentation").synsets[1]]
# NOTE: this name shadows the interactive ``license`` builtin; it is kept
# because the group registration further down refers to it by this name.
license = [
    Word("license").synsets[3],
    Word("license").synsets[0],
]
usage = [
    Word("usage").synsets[0],
    Word("example").synsets[0],
    Word("example").synsets[5],
    Word("implement").synsets[1],
    Word("implementation").synsets[1],
    Word("demo").synsets[1],
    Word("tutorial").synsets[0],
    Word("tutorial").synsets[1],
]
update = [
    Word("updating").synsets[0],
    Word("updating").synsets[3],
]
issues = [
    Word("issues").synsets[0],
    Word("errors").synsets[5],
    Word("problems").synsets[0],
    Word("problems").synsets[2],
]
support = [
    Word("support").synsets[7],
    Word("help").synsets[0],
    Word("help").synsets[9],
    Word("report").synsets[0],
    Word("report").synsets[6],
]

# Maps group name -> list of reference synsets; filled in further down.
group = {}
def find_sim(wordlist, wd):
    """Return the highest path similarity between ``wd`` and ``wordlist``.

    Args:
        wordlist: Iterable of reference senses belonging to one group.
        wd: Sense to compare; must provide ``path_similarity(other)``.

    Returns:
        The largest similarity value found, or 0 when ``wordlist`` is
        empty or every comparison yields ``None``.
    """
    # Evaluate each comparison exactly once, then ignore undefined results.
    scores = (wd.path_similarity(sense) for sense in wordlist)
    return max((score for score in scores if score is not None), default=0)


def match_group(word_syn, group, threshold):
    """Pick the group whose reference senses best match any sense of a word.

    Args:
        word_syn: Iterable of candidate senses for a single word.
        group: Mapping of group name to its list of reference senses.
        threshold: Similarity a group must strictly exceed to be eligible.

    Returns:
        The name of the best-scoring group, or "" when no group scores
        strictly above ``threshold``.
    """
    best_score = 0
    best_group = ""
    for sense in word_syn:
        for key, senses in group.items():
            score = find_sim(senses, sense)
            # Strict comparisons: on a tie the first group seen is kept.
            if score > threshold and score > best_score:
                best_group = key
                best_score = score
    return best_group


if __name__ == "__main__":
    # Register every topic under its group name. The key order is kept as
    # is because match_group resolves ties in favour of the earlier key.
    group.update({
        "citation": citation,
        "download": download,
        "run": run,
        "installation": install,
        "requirement": requirement,
        "contact": contact,
        "description": description,
        "contributor": contributor,
        "documentation": documentation,
        "license": license,
        "usage": usage,
        "update": update,
        "issues": issues,
        "support": support,
    })

    # In[19]:

    # Gather matches in a plain list and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for h in headers["Header"]:
        # The first space-separated token of each header is skipped
        # (presumably a heading marker -- verify against header.csv).
        for s in h.split(" ")[1:]:
            synn = Word(s).synsets
            if not synn:
                continue
            bestgroup = match_group(synn, group, 0.6)
            if bestgroup:
                # One row per matching word, so a header may repeat.
                rows.append({"Header": h, "Group": bestgroup})

    datadf = pd.DataFrame(rows, columns=["Header", "Group"])
    print(datadf)
    datadf.to_csv("header_groups.csv", index=False)