import json

!pip install transformers

import torch
import pandas as pd
import numpy as np

import urllib.request

def getEntries ( url):
    
    request = urllib.request.Request (url   , None, {'User-Agent' : 'test'})
    response = urllib.request.urlopen (request)
    
    return json.loads(response.read())


df_disease_symptoms  = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/dataset.csv")
df_disease_symptoms = df_disease_symptoms.fillna("")
df_disease_symptoms.head( 5 )

df_symptom_description  = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/symptom_Description.csv")
df_symptom_description = df_symptom_description.fillna("")
df_symptom_description.head( 5 )

df_symptom_precaution  = pd.read_csv("https://raw.githubusercontent.com/mcelikkaya/medium_articles2/main/symptom_precaution.csv")
df_symptom_precaution = df_symptom_precaution.fillna("")
df_symptom_precaution.head( 5 )

def get_df_as_map(df):
    disease_list = df.to_numpy().tolist()
    diease_map = {}
    for disease_row in disease_list:
        disease = disease_row[0]
        #print(disease)
        if disease not in diease_map.keys():
            diease_map[disease] = []        
        for symptom in disease_row[1:]:
            if "" != symptom and symptom not in diease_map[disease]:            
                diease_map[disease].append(symptom)
    return diease_map

df_disease_symptoms_map = get_df_as_map( df_disease_symptoms)
df_symptom_precaution_map = get_df_as_map( df_symptom_precaution)
df_symptom_description_map = get_df_as_map( df_symptom_description)


df_disease_symptoms_questions = ["What is problem if,I have","What happens if,I feel",
                                 "What does it show when I have a problem as","What happend when my problem is"]
df_symptom_precaution_questions = ["What must I do","What is treatment","What can I do","What do you advice"]
df_symptom_description_questions =["What is","How do you define","What happens when you are","How do you understand"]

disease_text_map = {}
for disease in df_disease_symptoms_map.keys():
    try:
        #print(disease)
        symptoms = df_disease_symptoms_map[disease]
        precautions = df_symptom_precaution_map[disease]
        description = df_symptom_description_map[disease.strip()]    
        
        desc = description[0]
        symptoms_text = " ".join(symptoms )
        precautions_text =  " ".join(precautions )
        text =   desc #
        text +=  "The name of disease is "+ disease 
        text += " is an illness when you have " + symptoms_text + " ."
        text += " You must " + precautions_text + " ."
        #print(text)
        disease_text_map[disease] = [text,desc,symptoms_text,precautions_text]
    except:
        print("Error in disease",disease)

train_contexts = []
train_questions = []
train_answers = []

val_contexts = []
val_questions = []
val_answers = []

from textwrap import wrap

disease = list( disease_text_map.keys() )[0]
text = disease_text_map[disease][0]
desc = disease_text_map[disease][1]
symptoms_text = disease_text_map[disease][2]
precautions_text = disease_text_map[disease][3]
print("text :","\n".join(wrap(text, 120)) )
print("-----")
print("desc :","\n".join(wrap(desc, 120))) 
print("--")
print("symptoms_text : ",symptoms_text)
print("precautions_text : ",precautions_text)


for disease in disease_text_map.keys():
    text = disease_text_map[disease][0]
    desc = disease_text_map[disease][1]
    symptoms_text = disease_text_map[disease][2]
    precautions_text = disease_text_map[disease][3]
    
    
    #for index_symptom,symptom_description_question in enumerate( df_symptom_description_questions):
    #    if index_symptom < 2:
    #      train_contexts.append( text )
    #      train_questions.append( symptom_description_question + " " + disease )
    #      train_answers.append( {'text': desc, 'answer_start': text.find(desc)} )
    #    else:
    #      val_contexts.append( text )
    #      val_questions.append( symptom_description_question + " " + disease )
    #      val_answers.append( {'text': desc, 'answer_start': text.find(desc)} )  
        
    for index_description,symptom_description_question in enumerate(df_disease_symptoms_questions):
        if index_description < 2:
          train_contexts.append( text )
          train_questions.append( symptom_description_question + " " + symptoms_text + " ? " )
          train_answers.append( {'text': disease, 'answer_start': text.find(disease)} ) 
        else:
          val_contexts.append( text )
          val_questions.append( symptom_description_question + " " + symptoms_text + " ? ")
          val_answers.append( {'text': disease, 'answer_start': text.find(disease)} )    
        
    for index_precaution,symptom_precaution_question in enumerate(df_symptom_precaution_questions):
        if index_precaution < 2: 
          train_contexts.append( text )
          train_questions.append( symptom_precaution_question + " for " + disease + " ? ")
          train_answers.append( {'text': precautions_text, 'answer_start': text.find(precautions_text)} ) 
        else:
          val_contexts.append( text )
          val_questions.append( symptom_precaution_question + " for " + disease + " ? ")
          val_answers.append( {'text': precautions_text, 'answer_start': text.find(precautions_text)} )   

def dump_index(index):
  print("train_questions",train_questions[index])  
  print("train_answers",train_answers[index])
  print("val_question:",val_questions[index])
  print("val_answers:",val_answers[index])
  

dump_index(0)

#dump_index(1)
dump_index(2)

dump_index(3)

#for ii in range(len(train_questions)):
#  print(ii,"-----------------")
#  dump_index(ii)

#for ii in range(len(train_questions)):
#  print(ii,"-----------------")
#  dump_index(ii)

import pandas as pd
df = pd.DataFrame()
df["train_questions"] =train_questions 
df["train_answers"] =train_answers 
df["val_questions"] =val_questions 
df["val_answers"] =val_answers  
df

print("\n ".join( train_questions[10:16]) )
print(train_answers[10:16])


print("\n ".join( val_questions[0:6]) )
print(val_answers[0:6])

print("text :","\n".join(wrap(train_contexts[3], 120)) )
print("\nQuestion:",train_questions[3])
print(train_answers[3])

print(train_contexts[len(train_contexts)-1])
print("\n\ntrain_questions:",train_questions[len(train_contexts)-1])
print(train_answers[len(train_contexts)-1])


print(len(train_contexts))
print(len(train_questions))
print(len(train_answers))

print(len(val_contexts))
print(len(val_questions))
print(len(val_answers))

#TODO
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

print(train_contexts[0])
print("\n\ntrain_questions:",train_questions[0])
print(train_answers[0])

starts = [t["answer_start"] for t in train_answers]
ends = [t["answer_end"] for t in train_answers]
print("starts",starts)
print("ends",ends)
#[t["answer_start"] for t in train_answers]
#[t["answer_end"] for t in train_answers]
starts0s = [t for t in starts if t == 0]
print(len(starts0s))

#Open to see question and context
#for q,a in zip( val_questions, val_answers):
#  print(":",q)
#  print( a["text"])

#Open comment to see question and answer
#for q,a in zip( train_questions, train_answers):
#  print(q)
#  print( a["text"])

print(type(train_contexts))
print(type(train_questions))
print(train_contexts[0])
print(train_questions[0] )

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

tokenizer

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:          
            #print("start_positions minus,",answers[i])
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            #print("end_positions minus,",answers[i])
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

len(train_encodings["start_positions"])

import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = CustomDataset(train_encodings)
val_dataset = CustomDataset(val_encodings)

from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

_ = model.to(device)

def eval_loader(batch_size=16,dump_equals=False):
    t1 = time()
    model.eval()
    acc = []
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    dump_count = 0
    eval_data = []
    for batch in val_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)
            # make predictions
            outputs = model(input_ids, attention_mask=attention_mask)
            # get top prediction with argmax
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)
            # calculate accuracy 
            acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
            acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

            eval_data.append( [start_pred , start_true,end_pred , end_true ])

            if dump_equals and dump_count < 14 and start_pred == start_true and end_pred == end_true:
              dump_count += 1
              tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][0])
              pred = get_text_atrange(  tokens,start_pred[0],end_pred[0] )
              print("tokens",pred )
              print("start_pred == start_true",start_pred , start_true)
              print("end_pred == end_true",end_pred , end_true)
              print("--")
    # calculate  accuracy in total
    acc = sum(acc)/len(acc)
    t2 = time()      
    elapsed = t2 - t1
    print('Elapsed time for eval %f seconds.' % elapsed)
    return acc ,eval_data

from time import time
import matplotlib.pyplot as plt

def train_model():
  t1 = time()
  model.train()
  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
  optim = AdamW(model.parameters(), lr=5e-5)
  outputs_model = []
  for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        if len(outputs_model) == 0:
          outputs_model.append(outputs)
        loss = outputs[0]
        loss.backward()
        optim.step()
  t2 = time()      
  elapsed = t2 - t1
  print('Elapsed time for training %f seconds.' % elapsed)
  return outputs_model

#start_pred , start_true,end_pred , end_true
def plot_eval_old(eval_data):
  pred_x = []
  pred_y = []
  true_x = []
  true_y = []
  for e_data in eval_data:
    if e_data[3].item() - e_data[1].item() > 30:
      continue
    pred_x.append( e_data[0].item() )
    true_x.append( e_data[1].item() )
    pred_y.append( e_data[2].item() )
    true_y.append( e_data[3].item() )
  plt.scatter(pred_x,pred_y,color="blue",label="prediction")
  plt.scatter(true_x,true_y,color="orange",label="true")
  plt.legend()  
  plt.show()
  plt.plot(pred_x,pred_y,color="blue",label="prediction",linestyle='-')
  plt.plot(true_x,true_y,color="orange",label="true")
  plt.legend()
  plt.show()

def plot_eval(eval_data):
  fig, axes = plt.subplots(3,1)
  fig.set_size_inches(11, 8)
  axes[0].set_title("Prediction")
  axes[1].set_title("Actual")
  axes[2].set_title("Combined")
  for index,e_data in enumerate(eval_data):
    start_pred , end_pred =  e_data[0] , e_data[2]
    start_true , end_true =  e_data[1] , e_data[3]
    
    if abs(start_true - end_true) < 100:
      axes[0].plot([index,index],[start_pred,end_pred],color="blue",linestyle='-')
      axes[1].plot([index,index],[start_true,end_true],color="orange")  

      axes[2].plot([index*2,index*2],[start_pred,end_pred],color="blue",linestyle='-')
      axes[2].plot([index*2+1,index*2+1],[start_true,end_true],color="orange")  

      #axes[3].plot([index*2+1,index*2+1],[start_true,end_true],color="orange")  
      #axes[3].plot([index*2,index*2],[start_pred,end_pred],color="blue",linestyle='-')
      
  fig.tight_layout()    

      
acc,eval_data = eval_loader(1,dump_equals=False)
plot_eval( eval_data)

#Open if you want to see output
#outputs_model = train_model()
#outputs_model


def get_text_atrange(tokens,begin,end):
  print("begin,end",begin,end)
  answer = ""
  for i in range(begin, end + 1):    
    if tokens[i][0:2] == '##': # If it's a subword token, then recombine it with the previous token.
        answer += tokens[i][2:]    
    else: # Otherwise, add a space then the token.
        answer += ' ' + tokens[i]    
  return answer

#used to combine tokens with prefix ##
def get_tokens_at_text(tokens):
  
  answer = ""
  for token in tokens:    
    if token[0:2] == '##': # If it's a subword token, then recombine it with the previous token.
        answer += token[2:]    
    else: # Otherwise, add a space then the token.
        answer += ' ' + token   
  return answer      

import numpy as np
def get_top_answers(possible_starts,possible_ends,input_ids):
  answers = []
  for start,end in zip(possible_starts,possible_ends):
    #+1 for end
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end+1]))
    answers.append( answer )
  return answers  

def answer_question(question,context,topN):

    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    model_out = model(**inputs.to(device))
     
    answer_start_scores = model_out["start_logits"]
    answer_end_scores = model_out["end_logits"]

    possible_starts = np.argsort(answer_start_scores.cpu().detach().numpy()).flatten()[::-1][:topN]
    possible_ends = np.argsort(answer_end_scores.cpu().detach().numpy()).flatten()[::-1][:topN]
    
    #get best answer
    answer_start = torch.argmax(answer_start_scores)  
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    answers = get_top_answers(possible_starts,possible_ends,input_ids )

    return { "answer":answer,"answer_start":answer_start,"answer_end":answer_end,"input_ids":input_ids,
            "answer_start_scores":answer_start_scores,"answer_end_scores":answer_end_scores,"inputs":inputs,"answers":answers,
            "possible_starts":possible_starts,"possible_ends":possible_ends}

import matplotlib.pyplot as plt

def plot_possible_answer(answer_map,expected_start,expected_end,title_postfix=""):
  start_scores = answer_map["answer_start_scores"]
  end_scores = answer_map["answer_end_scores"]
  tokens = tokenizer.decode( answer_map["input_ids"] ).split(" ") 
  #print("tokens",len(tokens),"---",len(answer_map["input_ids"]))
  tokens_ind = [ tokenizer.decode(t) for t in answer_map["input_ids"] ]
  #print("tokens_ind",len(tokens_ind))
  #print("tokens_ind",tokens_ind )
  fig,axes = plt.subplots(2,1)
  y_start = start_scores.cpu().detach().numpy().flatten()
  x_start = [i for i in range(len(y_start))]

  y_end = end_scores.cpu().detach().numpy().flatten()
  x_end = [i for i in range(len(y_end))]

  axes[0].bar(tokens_ind,y_start)
  axes[0].set_title("start scores( "+ str( len( y_start ) ) +") " + title_postfix )
  axes[0].figure.set_size_inches(10, 5)
  #axes[0].xaxis.set_labels( tokens_ind )
  axes[0].xaxis.set_tick_params(rotation=90)
  axes[0].axvline(expected_start,color="yellow")
  #print("axes[0].get_xticklabels()",axes[0].get_xticklabels())
  #axes[0].get_xticklabels()[0].set_color("yellow")
  #axes[0].set_xticklabels(tokens_ind, rotation = 90, ha="right")
  #axes[0].get_xticklabels()[expected_start].set_color("yellow")
  
  axes[1].bar(tokens_ind,y_end, color="orange")
  axes[1].set_title("end scores( "+ str( len( y_end ) ) +") " + title_postfix )
  axes[1].axvline(expected_end,color="red")
  axes[1].xaxis.set_tick_params(rotation=90)
  #axes[1].get_xticklabels()[expected_end].set_color("red")

  axes[0].autoscale(tight=True)
  axes[1].autoscale(tight=True)
  fig.tight_layout()
  #return fig   ,axes[0].get_xticklabels()         

eval_loader(batch_size=1,dump_equals=True)
#no training yet so accuracy so low

acc,eval_data = eval_loader(1,dump_equals=False)
plot_eval( eval_data)

def dump_validation(index):
  print( val_contexts[index] )
  print( val_questions[index] )

dump_validation(5)


index = val_questions.index("What do you advice for Fungal infection")
question = val_questions[index] 
context = val_contexts[index] 
topN = 5
expected_start = 70 #109
expected_end = 84 #127

answer_steps = []

#print("question",question)
#get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"])


print("question",question)
answer_map = answer_question(question,context,topN)
answer_steps.append( answer_map )
plot_possible_answer(answer_map,expected_start,expected_end,"Not trained")

tokens = tokenizer.convert_ids_to_tokens(answer_map["input_ids"])
for index,t in enumerate(tokens):
  if index % 10 == 0:
    print("")
  print(f"{index} ) {t}", end =" ")

_ = train_model()

print("question",question)
answer_map = answer_question(question,context,topN)
answer_steps.append( answer_map )
print( "\n".join( get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"]) ) )
plot_possible_answer(answer_map,expected_start,expected_end)

acc,eval_data = eval_loader(1,dump_equals=False)
print("accuracy eval :",acc)
plot_eval( eval_data)

_ = train_model()

print("Question",question)
answer_map = answer_question(question,context,topN)
answer_steps.append( answer_map )
print( "\n".join( get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"]) ) )
plot_possible_answer(answer_map,expected_start,expected_end)

acc,eval_data = eval_loader(1,dump_equals=False)
print("accuracy eval :",acc)
plot_eval( eval_data)

_ = train_model()
acc,eval_data = eval_loader(1,dump_equals=False)
print("accuracy eval :",acc)
plot_eval( eval_data)

print("question",question)
answer_map = answer_question(question,context,topN)
answer_steps.append( answer_map )
print( "\n".join( get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"]) ) )
plot_possible_answer(answer_map,expected_start,expected_end)

_ = train_model()
acc,eval_data = eval_loader(1,dump_equals=False)
print("accuracy eval :",acc)
plot_eval( eval_data)

print( answer_map["possible_starts"] )
print( answer_map["possible_ends"] )


print("question",question)
answer_map = answer_question(question,context,topN)
answer_steps.append( answer_map )
print( "\n".join( get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"]) ) )
plot_possible_answer(answer_map,expected_start,expected_end)


for index,a_map in enumerate(answer_steps):
  print("\n\nStep ",index+1)
  plot_possible_answer(a_map,expected_start,expected_end,"Step"+str(index+1))
  plt.show()
  
  
#answer_map["answer"]
print( "\n".join( get_top_answers(answer_map["possible_starts"],answer_map["possible_ends"],answer_map["input_ids"]) ) )
print("s",answer_map["possible_starts"])
print("e",answer_map["possible_ends"])

#template_question = "What do you advice for GERD"
template_question = "What is treatment for Dengue"
for index,q in enumerate(val_questions):
  if q == template_question:
    print( index )

sub_contexts = val_contexts[166:168]
sub_questions = val_questions[166:168]
sub_answers = val_answers[166:168]