#!/usr/bin/env python
# coding: utf-8

# # Images extraction and analysis from a PDF file using Azure Open AI GPT4-Vision

# %pip install Spire.Pdf

import base64
import datetime
import json
import os
import shutil
import sys  # was missing: sys.version is used below (previously only worked via the spire star-imports)
import time

import openai
import requests
from dotenv import load_dotenv
from PIL import Image as PILimage
from spire.pdf.common import *
from spire.pdf import *


def check_openai_version():
    """
    Check the installed Azure OpenAI client version.

    Prints the installed ``openai`` package version and warns when it is
    older than 1.0.0 (the version this script's API surface requires).
    Returns None; purely informational.
    """
    installed_version = openai.__version__
    try:
        # Compare on the major version number. The previous approach,
        # float(installed_version[:3]), misparses versions such as
        # "1.35.7" (-> 1.3) and breaks on a two-digit major ("10.")
        major_version = int(installed_version.split(".")[0])
    except ValueError:
        print("Invalid OpenAI version format")
        return

    print(f"Installed OpenAI version: {installed_version}")

    if major_version < 1:
        print("[Warning] You should upgrade OpenAI to have version >= 1.0.0")
        print("To upgrade, run: %pip install openai --upgrade")
    else:
        print(f"[OK] OpenAI version {installed_version} is >= 1.0.0")


check_openai_version()

print(f"Today is {datetime.datetime.today().strftime('%d-%b-%Y %H:%M:%S')}")
print(f"Python version: {sys.version}")
print(f"OpenAI version: {openai.__version__}")

# ## Azure Open AI

# Credentials / endpoints are read from the azure.env dotenv file.
load_dotenv("azure.env")

# Azure Open AI (plain attribute assignments; the previous
# "openai.api_type: str = ..." annotation on an attribute target was a no-op)
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")

# Azure AI Vision (aka Azure Computer Vision)
azure_aivision_endpoint = os.getenv("azure_aivision_endpoint")
azure_aivision_key = os.getenv("azure_aivision_key")

model = "GPT4Vision"  # This is the deployed name of your GPT4 Vision model from the Azure Open AI studio

indexname = "pdf-docs"

# ## Directory to save extracted images from the PDF file
# Recreated from scratch on every run so stale images never linger.
images_dir = "images"

if os.path.exists(images_dir):
    shutil.rmtree(images_dir)

os.makedirs(images_dir, exist_ok=True)

# ## Running the extraction of all the images from the PDF file

pdf_file = "ios17.pdf"

get_ipython().system('ls $pdf_file -lh')
# Reading the pdf file and extracting every embedded image
doc = PdfDocument()
doc.LoadFromFile(pdf_file)

images_list = []
nb = 0
print(f"Extracting images from PDF file {pdf_file}\n")

# For each page
for p in range(doc.Pages.Count):
    page = doc.Pages.get_Item(p)
    idx = 1
    # Extract any image from the page and save each image to a png file
    for image in page.ExtractImages():
        # (fixed: the original had a redundant double assignment
        # "image_file = (image_file) = ...")
        image_file = (
            f"{images_dir}/{os.path.splitext(os.path.basename(pdf_file))[0]}"
            f"_page_{p + 1:03}_nb_{idx:02}.png"
        )
        image.Save(image_file, ImageFormat.get_Png())
        print(f"{nb + 1:04} Saving image to {image_file}")
        images_list.append(image_file)
        idx += 1
        nb += 1

doc.Close()
print("\nDone")
print(f"{nb} images were saved.")

print(f"Number of extracted images from the PDF file = {len(images_list)}")

# ## Viewing all the extracted images

get_ipython().system('ls $images_dir -lh')

get_ipython().run_cell_magic('javascript', 'Python', 'OutputArea.auto_scroll_threshold = 9999\n')

for pos, extracted_file in enumerate(images_list, start=1):
    print(f"{pos:02} Extracted image file: {extracted_file}")
    display(PILimage.open(extracted_file))
print("\nEnd")


# ## Analysing these extracted images with GPT4 Vision

def GPT4V_with_AzureAIVision(image_file, prompt, disp=False):
    """
    GPT-4 Turbo with vision and Azure AI enhancements.

    Sends the image plus the prompt to the deployed GPT-4 Vision model with
    the Azure AI Vision (OCR + grounding) enhancements enabled, and prints
    the model answer and token usage.

    Args:
        image_file: Path to the image file to analyse.
        prompt: The user prompt sent alongside the image.
        disp: When True, return the parsed JSON results on success.

    Returns:
        The parsed response dict when disp is True and the call succeeded,
        otherwise None.
    """
    # Testing if image file exists.
    # Fixed: the original only printed the error and fell through,
    # so open() below crashed with FileNotFoundError anyway.
    if not os.path.exists(image_file):
        print(f"[Error] Image file {image_file} does not exist.")
        return None

    # Endpoint
    base_url = f"{openai.api_base}/openai/deployments/{model}"
    gpt4vision_endpoint = (
        f"{base_url}/extensions/chat/completions?api-version=2023-12-01-preview"
    )

    # Header
    headers = {"Content-Type": "application/json", "api-key": openai.api_key}

    # Encoded image — context manager so the file handle is always closed
    # (the original leaked the handle from a bare open()).
    with open(image_file, "rb") as image_fh:
        base_64_encoded_image = base64.b64encode(image_fh.read()).decode("ascii")

    # Context
    context = "You are an AI assistant. You analyse an image and make some answers based on a prompt. Always reply in English."

    # Payload
    json_data = {
        "model": "gpt-4-vision-preview",
        "enhancements": {"ocr": {"enabled": True}, "grounding": {"enabled": True}},
        "dataSources": [
            {
                "type": "AzureComputerVision",
                "endpoint": azure_aivision_endpoint,
                "key": azure_aivision_key,
                "indexName": indexname,
            }
        ],
        "messages": [
            {"role": "system", "content": context},
            {"role": "user", "content": [prompt, {"image": base_64_encoded_image}]},
        ],
        "max_tokens": 4000,
        "temperature": 0.2,
        "top_p": 1,
    }

    # Response
    response = requests.post(
        gpt4vision_endpoint, headers=headers, data=json.dumps(json_data)
    )

    # Testing the status code from the model response
    if response.status_code == 200:
        now = str(datetime.datetime.today().strftime("%d-%b-%Y %H:%M:%S"))
        print(f"Analysis of image: {image_file}")
        results = json.loads(response.text)
        print("\033[1;31;34m")
        print(results["choices"][0]["message"]["content"])
        prompt_tokens = results["usage"]["prompt_tokens"]
        completion_tokens = results["usage"]["completion_tokens"]
        total_tokens = results["usage"]["total_tokens"]
        print("\n\033[1;31;32mDone:", now)
        print(
            f"Prompt tokens = {prompt_tokens} | Completion tokens = {completion_tokens} \
| Total tokens = {total_tokens}"
        )
        print("\n[Note] These results are generated by an AI")
        print("\033[0m")
        if disp:
            return results
    elif response.status_code == 429:
        # Throttled by the service: surface the error payload to the caller.
        print(
            "[429 Error] Too many requests. Please wait a couple of seconds and try again.\n"
        )
        print(json.loads(response.text))
    else:
        print(f"[Error] Error code: {response.status_code}\n")
        print(json.loads(response.text))
    return None


print("Analysing all extracted images with Azure Open AI GPT4 Turbo with Vision\n")

for pos, extracted_file in enumerate(images_list, start=1):
    # Print image filename
    print(f"{pos:02} Extracted image file: {extracted_file}")
    # Display image
    display(PILimage.open(extracted_file))
    # Run the GPT4 Turbo Vision model
    GPT4V_with_AzureAIVision(
        extracted_file,
        "Describe this in one line and generate some hashtags and emojis.",
    )
    # Pause to avoid 429 errors
    time.sleep(30)

print("\nEnd")