!pip install transformers

import json
import urllib.request

import numpy as np
import pandas as pd
import torch


def getEntries(url):
    request = urllib.request.Request(url, None, {'User-Agent': 'test'})
    response = urllib.request.urlopen(request)
    return json.loads(response.read())

df_disease_symptoms = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/dataset.csv")
df_disease_symptoms = df_disease_symptoms.fillna("")
df_disease_symptoms.head(5)

df_symptom_description = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/symptom_Description.csv")
df_symptom_description = df_symptom_description.fillna("")
df_symptom_description.head(5)

df_symptom_precaution = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/symptom_precaution.csv")
df_symptom_precaution = df_symptom_precaution.fillna("")
df_symptom_precaution.head(5)


def get_df_as_map(df):
    """Map each row's first column to its non-empty, de-duplicated remaining values."""
    disease_list = df.to_numpy().tolist()
    disease_map = {}
    for disease_row in disease_list:
        disease = disease_row[0]
        # print(disease)
        if disease not in disease_map:
            disease_map[disease] = []
        for symptom in disease_row[1:]:
            if symptom != "" and symptom not in disease_map[disease]:
                disease_map[disease].append(symptom)
    return disease_map

df_disease_symptoms_map = get_df_as_map(df_disease_symptoms)
df_symptom_precaution_map = get_df_as_map(df_symptom_precaution)
df_symptom_description_map = get_df_as_map(df_symptom_description)

# Question templates. For each list, the first two templates feed the
# training set and the remaining two the validation set.
df_disease_symptoms_questions = ["What is problem if,I have", "What happens if,I feel",
                                 "What does it show when I have a problem as", "What happend when my problem is"]
df_symptom_precaution_questions = ["What must I do", "What is treatment", "What can I do", "What do you advice"]
df_symptom_description_questions = ["What is", "How do you define", "What happens when you are", "How do you understand"]

# Build one context paragraph per disease: description + symptoms + precautions.
disease_text_map = {}
for disease in df_disease_symptoms_map.keys():
    try:
        symptoms = df_disease_symptoms_map[disease]
        precautions = df_symptom_precaution_map[disease]
        description = df_symptom_description_map[disease.strip()]
        desc = description[0]
        symptoms_text = " ".join(symptoms)
        precautions_text = " ".join(precautions)
        text = desc
        # text += "The name of disease is " + disease
        text += " is an illness when you have " + symptoms_text + " ."
        text += " You must " + precautions_text + " ."
        # print(text)
        disease_text_map[disease] = [text, desc, symptoms_text, precautions_text]
    except KeyError:
        print("Error in disease", disease)
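# Sanity check (a small sketch, not part of the original notebook): every
# disease_text_map entry should be a 4-item list of
# [context, description, symptoms_text, precautions_text].
print(len(disease_text_map), "diseases with generated contexts")
for entry in disease_text_map.values():
    assert len(entry) == 4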
train_contexts = []
train_questions = []
train_answers = []
val_contexts = []
val_questions = []
val_answers = []

from textwrap import wrap

# Inspect the generated text for the first disease.
disease = list(disease_text_map.keys())[0]
text = disease_text_map[disease][0]
desc = disease_text_map[disease][1]
symptoms_text = disease_text_map[disease][2]
precautions_text = disease_text_map[disease][3]
print("text :", "\n".join(wrap(text, 120)))
print("-----")
print("desc :", "\n".join(wrap(desc, 120)))
print("--")
print("symptoms_text : ", symptoms_text)
print("precautions_text : ", precautions_text)

# Build SQuAD-style (context, question, answer) triples: the first two
# templates of each question list go to training, the rest to validation.
for disease in disease_text_map.keys():
    text = disease_text_map[disease][0]
    desc = disease_text_map[disease][1]
    symptoms_text = disease_text_map[disease][2]
    precautions_text = disease_text_map[disease][3]

    # for index_symptom, symptom_description_question in enumerate(df_symptom_description_questions):
    #     if index_symptom < 2:
    #         train_contexts.append(text)
    #         train_questions.append(symptom_description_question + " " + disease)
    #         train_answers.append({'text': desc, 'answer_start': text.find(desc)})
    #     else:
    #         val_contexts.append(text)
    #         val_questions.append(symptom_description_question + " " + disease)
    #         val_answers.append({'text': desc, 'answer_start': text.find(desc)})

    for index_description, symptom_description_question in enumerate(df_disease_symptoms_questions):
        if index_description < 2:
            train_contexts.append(text)
            train_questions.append(symptom_description_question + " " + symptoms_text + " ? ")
            train_answers.append({'text': disease, 'answer_start': text.find(disease)})
        else:
            val_contexts.append(text)
            val_questions.append(symptom_description_question + " " + symptoms_text + " ? ")
            val_answers.append({'text': disease, 'answer_start': text.find(disease)})

    for index_precaution, symptom_precaution_question in enumerate(df_symptom_precaution_questions):
        if index_precaution < 2:
            train_contexts.append(text)
            train_questions.append(symptom_precaution_question + " for " + disease + " ? ")
            train_answers.append({'text': precautions_text, 'answer_start': text.find(precautions_text)})
        else:
            val_contexts.append(text)
            val_questions.append(symptom_precaution_question + " for " + disease + " ? ")
            val_answers.append({'text': precautions_text, 'answer_start': text.find(precautions_text)})
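# Sketch (not in the original notebook): verify that each answer really
# occurs at the recorded character offset, since text.find() returns -1 on
# a miss and add_end_idx below only repairs offsets that are off by 1-2.
bad = [a for a, c in zip(train_answers, train_contexts)
       if c[a['answer_start']:a['answer_start'] + len(a['text'])] != a['text']]
print(len(bad), "train answers with a mismatched answer_start")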
") val_answers.append( {'text': precautions_text, 'answer_start': text.find(precautions_text)} ) def dump_index(index): print("train_questions",train_questions[index]) print("train_answers",train_answers[index]) print("val_question:",val_questions[index]) print("val_answers:",val_answers[index]) dump_index(0) #dump_index(1) dump_index(2) dump_index(3) #for ii in range(len(train_questions)): # print(ii,"-----------------") # dump_index(ii) #for ii in range(len(train_questions)): # print(ii,"-----------------") # dump_index(ii) import pandas as pd df = pd.DataFrame() df["train_questions"] =train_questions df["train_answers"] =train_answers df["val_questions"] =val_questions df["val_answers"] =val_answers df print("\n ".join( train_questions[10:16]) ) print(train_answers[10:16]) print("\n ".join( val_questions[0:6]) ) print(val_answers[0:6]) print("text :","\n".join(wrap(train_contexts[3], 120)) ) print("\nQuestion:",train_questions[3]) print(train_answers[3]) print(train_contexts[len(train_contexts)-1]) print("\n\ntrain_questions:",train_questions[len(train_contexts)-1]) print(train_answers[len(train_contexts)-1]) print(len(train_contexts)) print(len(train_questions)) print(len(train_answers)) print(len(val_contexts)) print(len(val_questions)) print(len(val_answers)) #TODO def add_end_idx(answers, contexts): for answer, context in zip(answers, contexts): gold_text = answer['text'] start_idx = answer['answer_start'] end_idx = start_idx + len(gold_text) # sometimes squad answers are off by a character or two – fix this if context[start_idx:end_idx] == gold_text: answer['answer_end'] = end_idx elif context[start_idx-1:end_idx-1] == gold_text: answer['answer_start'] = start_idx - 1 answer['answer_end'] = end_idx - 1 # When the gold label is off by one character elif context[start_idx-2:end_idx-2] == gold_text: answer['answer_start'] = start_idx - 2 answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters add_end_idx(train_answers, train_contexts) add_end_idx(val_answers, val_contexts) print(train_contexts[0]) print("\n\ntrain_questions:",train_questions[0]) print(train_answers[0]) starts = [t["answer_start"] for t in train_answers] ends = [t["answer_end"] for t in train_answers] print("starts",starts) print("ends",ends) #[t["answer_start"] for t in train_answers] #[t["answer_end"] for t in train_answers] starts0s = [t for t in starts if t == 0] print(len(starts0s)) #Open to see question and context #for q,a in zip( val_questions, val_answers): # print(":",q) # print( a["text"]) #Open comment to see question and answer #for q,a in zip( train_questions, train_answers): # print(q) # print( a["text"]) print(type(train_contexts)) print(type(train_questions)) print(train_contexts[0]) print(train_questions[0] ) from transformers import DistilBertTokenizerFast tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True) val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True) tokenizer def add_token_positions(encodings, answers): start_positions = [] end_positions = [] for i in range(len(answers)): start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1)) # if start position is None, the answer passage has been truncated if start_positions[-1] is None: #print("start_positions minus,",answers[i]) start_positions[-1] = tokenizer.model_max_length 
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # If the start position is None, the answer passage has been truncated.
        if start_positions[-1] is None:
            # print("start_positions minus,", answers[i])
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            # print("end_positions minus,", answers[i])
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
len(train_encodings["start_positions"])


class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = CustomDataset(train_encodings)
val_dataset = CustomDataset(val_encodings)

from transformers import DistilBertForQuestionAnswering

model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

from time import time

import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
_ = model.to(device)


def eval_loader(batch_size=16, dump_equals=False):
    t1 = time()
    model.eval()
    acc = []
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    dump_count = 0
    eval_data = []
    for batch in val_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            # Make predictions.
            outputs = model(input_ids, attention_mask=attention_mask)
            # Get the top prediction with argmax.
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)
            # Calculate accuracy for start and end positions separately.
            acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
            acc.append(((end_pred == end_true).sum() / len(end_pred)).item())
            eval_data.append([start_pred, start_true, end_pred, end_true])

            # Dumping matched spans assumes batch_size=1, where the tensor
            # comparisons reduce to single booleans.
            if dump_equals and dump_count < 14 and start_pred == start_true and end_pred == end_true:
                dump_count += 1
                tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][0])
                pred = get_text_atrange(tokens, start_pred[0], end_pred[0])
                print("tokens", pred)
                print("start_pred == start_true", start_pred, start_true)
                print("end_pred == end_true", end_pred, end_true)
                print("--")

    # Average accuracy over all batches.
    acc = sum(acc) / len(acc)
    t2 = time()
    elapsed = t2 - t1
    print('Elapsed time for eval %f seconds.' % elapsed)
    return acc, eval_data


def train_model():
    t1 = time()
    model.train()
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    optim = AdamW(model.parameters(), lr=5e-5)
    outputs_model = []
    for epoch in range(1):
        for batch in train_loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions, end_positions=end_positions)
            if len(outputs_model) == 0:
                outputs_model.append(outputs)
            loss = outputs[0]
            loss.backward()
            optim.step()
    t2 = time()
    elapsed = t2 - t1
    print('Elapsed time for training %f seconds.' % elapsed)
    return outputs_model
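# Where the loss above comes from (a sketch of the idea, not the exact
# transformers source): the QA head emits one start logit and one end logit
# per token, and the loss is the mean of two cross-entropy terms against
# the gold start/end token positions.
import torch.nn.functional as F

def qa_loss_sketch(start_logits, end_logits, start_positions, end_positions):
    start_loss = F.cross_entropy(start_logits, start_positions)
    end_loss = F.cross_entropy(end_logits, end_positions)
    return (start_loss + end_loss) / 2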
# eval_data rows hold [start_pred, start_true, end_pred, end_true].
def plot_eval_old(eval_data):
    pred_x = []
    pred_y = []
    true_x = []
    true_y = []
    for e_data in eval_data:
        if e_data[3].item() - e_data[1].item() > 30:
            continue
        pred_x.append(e_data[0].item())
        true_x.append(e_data[1].item())
        pred_y.append(e_data[2].item())
        true_y.append(e_data[3].item())

    plt.scatter(pred_x, pred_y, color="blue", label="prediction")
    plt.scatter(true_x, true_y, color="orange", label="true")
    plt.legend()
    plt.show()

    plt.plot(pred_x, pred_y, color="blue", label="prediction", linestyle='-')
    plt.plot(true_x, true_y, color="orange", label="true")
    plt.legend()
    plt.show()


def plot_eval(eval_data):
    fig, axes = plt.subplots(3, 1)
    fig.set_size_inches(11, 8)
    axes[0].set_title("Prediction")
    axes[1].set_title("Actual")
    axes[2].set_title("Combined")
    for index, e_data in enumerate(eval_data):
        start_pred, end_pred = e_data[0], e_data[2]
        start_true, end_true = e_data[1], e_data[3]
        if abs(start_true - end_true) < 100:
            axes[0].plot([index, index], [start_pred, end_pred], color="blue", linestyle='-')
            axes[1].plot([index, index], [start_true, end_true], color="orange")
            axes[2].plot([index * 2, index * 2], [start_pred, end_pred], color="blue", linestyle='-')
            axes[2].plot([index * 2 + 1, index * 2 + 1], [start_true, end_true], color="orange")
    fig.tight_layout()

acc, eval_data = eval_loader(1, dump_equals=False)
plot_eval(eval_data)

# Open if you want to see the output:
# outputs_model = train_model()
# outputs_model


def get_text_atrange(tokens, begin, end):
    """Recombine the WordPiece tokens in [begin, end] into a readable string."""
    print("begin,end", begin, end)
    answer = ""
    for i in range(begin, end + 1):
        if tokens[i][0:2] == '##':
            # If it's a subword token, recombine it with the previous token.
            answer += tokens[i][2:]
        else:
            # Otherwise, add a space and then the token.
            answer += ' ' + tokens[i]
    return answer


# Used to combine tokens carrying the ## prefix.
def get_tokens_at_text(tokens):
    answer = ""
    for token in tokens:
        if token[0:2] == '##':
            # If it's a subword token, recombine it with the previous token.
            answer += token[2:]
        else:
            # Otherwise, add a space and then the token.
            answer += ' ' + token
    return answer
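# Quick check of the subword-merging helper above, on made-up WordPiece
# tokens (a hypothetical example, not taken from the dataset):
sample_tokens = ["stop", "irregular", "##ity", "in", "food"]
print(get_tokens_at_text(sample_tokens))  # -> ' stop irregularity in food'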
def get_top_answers(possible_starts, possible_ends, input_ids):
    answers = []
    for start, end in zip(possible_starts, possible_ends):
        # +1 so the end token is included in the span.
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end + 1]))
        answers.append(answer)
    return answers


def answer_question(question, context, topN):
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    model_out = model(**inputs.to(device))
    answer_start_scores = model_out["start_logits"]
    answer_end_scores = model_out["end_logits"]

    # The topN highest-scoring start and end positions.
    possible_starts = np.argsort(answer_start_scores.cpu().detach().numpy()).flatten()[::-1][:topN]
    possible_ends = np.argsort(answer_end_scores.cpu().detach().numpy()).flatten()[::-1][:topN]

    # Get the best answer: argmax of the start scores, and argmax of the end
    # scores + 1 for the most likely end of the answer.
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    answers = get_top_answers(possible_starts, possible_ends, input_ids)
    return {"answer": answer, "answer_start": answer_start, "answer_end": answer_end, "input_ids": input_ids,
            "answer_start_scores": answer_start_scores, "answer_end_scores": answer_end_scores, "inputs": inputs,
            "answers": answers, "possible_starts": possible_starts, "possible_ends": possible_ends}


def plot_possible_answer(answer_map, expected_start, expected_end, title_postfix=""):
    start_scores = answer_map["answer_start_scores"]
    end_scores = answer_map["answer_end_scores"]
    tokens_ind = [tokenizer.decode(t) for t in answer_map["input_ids"]]

    fig, axes = plt.subplots(2, 1)
    y_start = start_scores.cpu().detach().numpy().flatten()
    y_end = end_scores.cpu().detach().numpy().flatten()

    axes[0].bar(tokens_ind, y_start)
    axes[0].set_title("start scores( " + str(len(y_start)) + ") " + title_postfix)
    axes[0].figure.set_size_inches(10, 5)
    axes[0].xaxis.set_tick_params(rotation=90)
    axes[0].axvline(expected_start, color="yellow")

    axes[1].bar(tokens_ind, y_end, color="orange")
    axes[1].set_title("end scores( " + str(len(y_end)) + ") " + title_postfix)
    axes[1].axvline(expected_end, color="red")
    axes[1].xaxis.set_tick_params(rotation=90)

    axes[0].autoscale(tight=True)
    axes[1].autoscale(tight=True)
    fig.tight_layout()

eval_loader(batch_size=1, dump_equals=True)

# No training yet, so accuracy is still low.
acc, eval_data = eval_loader(1, dump_equals=False)
plot_eval(eval_data)


def dump_validation(index):
    print(val_contexts[index])
    print(val_questions[index])

dump_validation(5)
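# The expected answer span below is given as hardcoded token indices. A
# sketch of how one could derive them instead (find_expected_span is a
# hypothetical helper; sequence_index=1 refers to the context because
# encode_plus receives the question first):
def find_expected_span(question, context, answer_text):
    enc = tokenizer.encode_plus(question, context, add_special_tokens=True)
    char_start = context.find(answer_text)
    start_tok = enc.char_to_token(char_start, sequence_index=1)
    end_tok = enc.char_to_token(char_start + len(answer_text) - 1, sequence_index=1)
    return start_tok, end_tok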
# Pick a validation question; note the stored questions end with " ? ".
index = val_questions.index("What do you advice for Fungal infection ? ")
question = val_questions[index]
context = val_contexts[index]
topN = 5
expected_start = 70  # 109
expected_end = 84  # 127
answer_steps = []

# Before any training.
print("question", question)
answer_map = answer_question(question, context, topN)
answer_steps.append(answer_map)
plot_possible_answer(answer_map, expected_start, expected_end, "Not trained")

tokens = tokenizer.convert_ids_to_tokens(answer_map["input_ids"])
for index, t in enumerate(tokens):
    if index % 10 == 0:
        print("")
    print(f"{index} ) {t}", end=" ")

# First epoch of training.
_ = train_model()
print("question", question)
answer_map = answer_question(question, context, topN)
answer_steps.append(answer_map)
print("\n".join(get_top_answers(answer_map["possible_starts"], answer_map["possible_ends"], answer_map["input_ids"])))
plot_possible_answer(answer_map, expected_start, expected_end)
acc, eval_data = eval_loader(1, dump_equals=False)
print("accuracy eval :", acc)
plot_eval(eval_data)

# Second epoch.
_ = train_model()
print("Question", question)
answer_map = answer_question(question, context, topN)
answer_steps.append(answer_map)
print("\n".join(get_top_answers(answer_map["possible_starts"], answer_map["possible_ends"], answer_map["input_ids"])))
plot_possible_answer(answer_map, expected_start, expected_end)
acc, eval_data = eval_loader(1, dump_equals=False)
print("accuracy eval :", acc)
plot_eval(eval_data)

# Third epoch.
_ = train_model()
acc, eval_data = eval_loader(1, dump_equals=False)
print("accuracy eval :", acc)
plot_eval(eval_data)
print("question", question)
answer_map = answer_question(question, context, topN)
answer_steps.append(answer_map)
print("\n".join(get_top_answers(answer_map["possible_starts"], answer_map["possible_ends"], answer_map["input_ids"])))
plot_possible_answer(answer_map, expected_start, expected_end)

# Fourth epoch.
_ = train_model()
acc, eval_data = eval_loader(1, dump_equals=False)
print("accuracy eval :", acc)
plot_eval(eval_data)
print(answer_map["possible_starts"])
print(answer_map["possible_ends"])
print("question", question)
answer_map = answer_question(question, context, topN)
answer_steps.append(answer_map)
print("\n".join(get_top_answers(answer_map["possible_starts"], answer_map["possible_ends"], answer_map["input_ids"])))
plot_possible_answer(answer_map, expected_start, expected_end)

# Plot the score evolution across all recorded steps.
for index, a_map in enumerate(answer_steps):
    print("\n\nStep ", index + 1)
    plot_possible_answer(a_map, expected_start, expected_end, "Step" + str(index + 1))
    plt.show()

# answer_map["answer"]
print("\n".join(get_top_answers(answer_map["possible_starts"], answer_map["possible_ends"], answer_map["input_ids"])))
print("s", answer_map["possible_starts"])
print("e", answer_map["possible_ends"])

# Locate the validation index of a template question (again, stored
# questions end with " ? ").
# template_question = "What do you advice for GERD ? "
template_question = "What is treatment for Dengue ? "
for index, q in enumerate(val_questions):
    if q == template_question:
        print(index)

sub_contexts = val_contexts[166:168]
sub_questions = val_questions[166:168]
sub_answers = val_answers[166:168]
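# A possible follow-up (a sketch, not in the original notebook): run the
# fine-tuned model over the small validation slice above and compare the
# predicted spans with the gold answers.
for q, c, a in zip(sub_questions, sub_contexts, sub_answers):
    result = answer_question(q, c, topN)
    print("Q:", q)
    print("predicted:", result["answer"])
    print("expected :", a["text"])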