#!/usr/bin/env python
# coding: utf-8

# In[1]:


# %pip install -r requirements.txt


# In[1]:


from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
from transformers import LxmertForQuestionAnswering, LxmertTokenizer
import wget
import pickle
import os

# URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg",
URL = "https://vqa.cloudcv.org/media/test2014/COCO_test2014_000000262567.jpg"
OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
ATTR_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt"
GQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/gqa/trainval_label2ans.json"
VQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/vqa/trainval_label2ans.json"


# for visualizing output
def showarray(a, fmt="jpeg"):
    a = np.uint8(np.clip(a, 0, 255))
    f = io.BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))


# In[2]:


# load object, attribute, and answer labels
objids = utils.get_data(OBJ_URL)
attrids = utils.get_data(ATTR_URL)
gqa_answers = utils.get_data(GQA_URL)
vqa_answers = utils.get_data(VQA_URL)


# In[3]:


# load models and model components
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)

image_preprocess = Preprocess(frcnn_cfg)

lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
lxmert_gqa = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-gqa-uncased")
lxmert_vqa = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-vqa-uncased")


# In[6]:


# image viz
frcnn_visualizer = SingleImageViz(URL, id2obj=objids, id2attr=attrids)

# run frcnn
images, sizes, scales_yx = image_preprocess(URL)
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)

# add boxes and labels to the image
frcnn_visualizer.draw_boxes(
    output_dict.get("boxes"),
    output_dict.pop("obj_ids"),
    output_dict.pop("obj_probs"),
    output_dict.pop("attr_ids"),
    output_dict.pop("attr_probs"),
)
showarray(frcnn_visualizer._get_buffer())


# In[7]:


test_questions_for_url1 = [
    "Where is this scene?",
    "what is the man riding?",
    "What is the man wearing?",
    "What is the color of the horse?",
]
test_questions_for_url2 = [
    "Where is the cat?",
    "What is near the disk?",
    "What is the color of the table?",
    "What is the color of the cat?",
    "What is the shape of the monitor?",
]

# Very important that the boxes are normalized
normalized_boxes = output_dict.get("normalized_boxes")
features = output_dict.get("roi_features")

for test_question in test_questions_for_url2:
    # run lxmert
    test_question = [test_question]

    inputs = lxmert_tokenizer(
        test_question,
        padding="max_length",
        max_length=20,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # run lxmert(s)
    output_gqa = lxmert_gqa(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        visual_feats=features,
        visual_pos=normalized_boxes,
        token_type_ids=inputs.token_type_ids,
        output_attentions=False,
    )
    output_vqa = lxmert_vqa(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        visual_feats=features,
        visual_pos=normalized_boxes,
        token_type_ids=inputs.token_type_ids,
        output_attentions=False,
    )

    # get prediction
    pred_vqa = output_vqa["question_answering_score"].argmax(-1)
    pred_gqa = output_gqa["question_answering_score"].argmax(-1)
    print("Question:", test_question)
    print("prediction from LXMERT GQA:", gqa_answers[pred_gqa])
    print("prediction from LXMERT VQA:", vqa_answers[pred_vqa])


# In[ ]:
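
# In[ ]:


# A minimal sketch, not part of the original demo: the same questions can be run as a single
# batch instead of looping one at a time, since the tokenizer accepts a list of strings and the
# single image's visual features only need to be repeated along the batch dimension.
# Variable names below (batch_inputs, batch_features, batch_boxes, batch_preds) are illustrative
# additions; the GQA model can be batched the same way.
batch_inputs = lxmert_tokenizer(
    test_questions_for_url2,
    padding="max_length",
    max_length=20,
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)

n = len(test_questions_for_url2)
batch_features = features.expand(n, -1, -1)  # (n, num_boxes, feat_dim)
batch_boxes = normalized_boxes.expand(n, -1, -1)  # (n, num_boxes, 4)

batch_output_vqa = lxmert_vqa(
    input_ids=batch_inputs.input_ids,
    attention_mask=batch_inputs.attention_mask,
    visual_feats=batch_features,
    visual_pos=batch_boxes,
    token_type_ids=batch_inputs.token_type_ids,
    output_attentions=False,
)

# one answer index per question in the batch
batch_preds = batch_output_vqa["question_answering_score"].argmax(-1)
for question, pred in zip(test_questions_for_url2, batch_preds):
    print(question, "->", vqa_answers[pred.item()])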