Generating data for embedding training materials¶

In this notebook we will download training materials in PDF format, extract the text of every page and save the pages as PNG files. We will then use OpenAI's text embeddings and UMAP dimensionality reduction to get a simple embedding of training materials contents.

pip install PyPDF2 pdf2image
conda install umap-learn openai

In [1]:

import os
import requests
import json
from pdf2image import convert_from_path
import PyPDF2
import numpy as np
import pandas as pd
from openai import OpenAI
import umap
import stackview as sv
from PIL import Image
import tempfile

We will use trainng materials about Bio-image Data Science, which is licensed CC-BY 4.0 by Robert Haase.

In [2]:

def download_pdfs_from_zenodo(record_id):
    """Download PDFs from Zenodo record."""
    base_url = f"https://zenodo.org/api/records/{record_id}"
    response = requests.get(base_url)
    data = response.json()
    
    if not os.path.exists('downloads'):
        os.makedirs('downloads')
    
    files_info = []
    for file in data['files']:
        if file['key'].endswith('.pdf'):
            download_url = file['links']['self']
            filename = record_id + "_" + file['key']
            filepath = os.path.join('downloads', filename)

            if not os.path.exists(filepath):
                # Download file
                response = requests.get(download_url)
                with open(filepath, 'wb') as f:
                    f.write(response.content)
            
            files_info.append({'filename': filename, 'url': download_url})
    
    return files_info


# Download PDFs
files_info = download_pdfs_from_zenodo('12623730')
files_info

Out[2]:

[{'filename': '12623730_14_Summary.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/14_Summary.pdf/content'},
 {'filename': '12623730_10_function_calling.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/10_function_calling.pdf/content'},
 {'filename': '12623730_11_prompteng_rag_finetuning.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/11_prompteng_rag_finetuning.pdf/content'},
 {'filename': '12623730_12_Vision_models.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/12_Vision_models.pdf/content'},
 {'filename': '12623730_09_Deep_Learning.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/09_Deep_Learning.pdf/content'},
 {'filename': '12623730_08_Sup_Unsup_Machine_Learning.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/08_Sup_Unsup_Machine_Learning.pdf/content'},
 {'filename': '12623730_03_RSM_Image_Processing.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/03_RSM_Image_Processing.pdf/content'},
 {'filename': '12623730_01_Introduction_BIDS_2024.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/01_Introduction_BIDS_2024.pdf/content'},
 {'filename': '12623730_13_quality_assurance.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/13_quality_assurance.pdf/content'},
 {'filename': '12623730_02_Introduction_RDM_2024.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/02_Introduction_RDM_2024.pdf/content'},
 {'filename': '12623730_04_Image_segmentation.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/04_Image_segmentation.pdf/content'},
 {'filename': '12623730_05_Surface_Recon_QA.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/05_Surface_Recon_QA.pdf/content'},
 {'filename': '12623730_07_distributed_gpu_computing.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/07_distributed_gpu_computing.pdf/content'},
 {'filename': '12623730_06_Chatbots.pdf',
  'url': 'https://zenodo.org/api/records/12623730/files/06_Chatbots.pdf/content'}]

Next we go through all pages, save them as PDF and take the text to create embedding vectors.

In [3]:

def resize_image(image, height):
    """
    Resize the image to the specified height while maintaining aspect ratio.

    Parameters
    ----------
    image : PIL.Image.Image
        The image to resize.
    height : int
        The desired height in pixels.
    
    Returns
    -------
    PIL.Image.Image
        The resized image.
    """
    aspect_ratio = image.width / image.height
    new_width = int(aspect_ratio * height)
    return image.resize((new_width, height), Image.LANCZOS)

def process_pdf(pdf_info):
    """Process PDF file to extract images and text."""
    filename = pdf_info['filename']
    filepath = os.path.join('downloads', filename)
    base_name = os.path.splitext(filename)[0]
    
    if not os.path.exists('downloads'):
        os.makedirs('downloads')
    if not os.path.exists('downloads'):
        os.makedirs('downloads')

    # Set your OpenAI API key
    client = OpenAI()
    
    # Convert PDF pages to images
    images = [resize_image(i, height=300) for i in convert_from_path(filepath)]
    
    # Extract text from PDF
    pdf_reader = PyPDF2.PdfReader(filepath)
    
    page_data = []
    
    for i, image in enumerate(images):
        # Save image
        png_filename = f"{base_name}_{i}.png"
        png_path = os.path.join('downloads', png_filename)
        image.save(png_path)
        
        # Save text
        txt_filename = f"{base_name}_{i}.txt"
        txt_path = os.path.join('downloads', txt_filename)
        text = pdf_reader.pages[i].extract_text()
        if not os.path.exists(txt_filename):
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)
            
        # Get embedding
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"
        )
        embedding_vector = response.data[0].embedding
        
        page_data.append({
            'filename': filename,
            'url': pdf_info['url'],
            'page_index': i,
            'text': text,
            'png_filename': png_filename,
            'txt_filename': txt_filename,
            'embedding_vector': embedding_vector
        })
     
    return page_data

# Process all PDFs
all_page_data = []
for pdf_info in files_info:
    all_page_data.extend(process_pdf(pdf_info))

C:\Users\rober\AppData\Local\Temp\ipykernel_12900\2891063455.py:19: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead.
  return image.resize((new_width, height), Image.LANCZOS)

The result will be saved as dataframe.

In [4]:

# Create DataFrame
df = pd.DataFrame(all_page_data)
df

Out[4]:

	filename	url	page_index	text	png_filename	txt_filename	embedding_vector
0	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	0	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_0.png	12623730_14_Summary_0.txt	[-0.01753188483417034, 0.009571048431098461, 0...
1	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	1	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_1.png	12623730_14_Summary_1.txt	[0.001144174369983375, 0.008919398300349712, -...
2	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	2	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_2.png	12623730_14_Summary_2.txt	[0.01131830457597971, 0.033359214663505554, 0....
3	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	3	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_3.png	12623730_14_Summary_3.txt	[0.018105685710906982, 0.026488685980439186, 0...
4	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	4	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_4.png	12623730_14_Summary_4.txt	[-0.027609605342149734, 0.0015738429501652718,...
...	...	...	...	...	...	...	...
858	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	29	Slide 30\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_29.png	12623730_06_Chatbots_29.txt	[-0.011728818528354168, -0.0007099526119418442...
859	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	30	Slide 31\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_30.png	12623730_06_Chatbots_30.txt	[-0.007209372241050005, 0.004134070128202438, ...
860	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	31	Slide 32\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_31.png	12623730_06_Chatbots_31.txt	[-0.014446760527789593, 0.013194024562835693, ...
861	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	32	Slide 33\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_32.png	12623730_06_Chatbots_32.txt	[-0.035361308604478836, -0.001887816353701055,...
862	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	33	Slide 34\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_33.png	12623730_06_Chatbots_33.txt	[-0.006428151857107878, 0.016945311799645424, ...

863 rows × 7 columns

We then perform dimensionality reduction on the embedding vectors and add two new columns to the dataset: UMAP0 and UMAP1.

In [5]:

# Convert embedding vectors to numpy array for UMAP
embeddings = np.array(df['embedding_vector'].tolist())

# Apply UMAP
reducer = umap.UMAP(n_components=2, random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

df['UMAP0'] = umap_embeddings[:, 0]
df['UMAP1'] = umap_embeddings[:, 1]

df

C:\Users\rober\miniforge3\envs\devbio-napari-env\Lib\site-packages\umap\umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")

Out[5]:

	filename	url	page_index	text	png_filename	txt_filename	embedding_vector	UMAP0	UMAP1
0	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	0	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_0.png	12623730_14_Summary_0.txt	[-0.01753188483417034, 0.009571048431098461, 0...	2.785299	5.125338
1	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	1	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_1.png	12623730_14_Summary_1.txt	[0.001144174369983375, 0.008919398300349712, -...	1.759109	5.196022
2	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	2	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_2.png	12623730_14_Summary_2.txt	[0.01131830457597971, 0.033359214663505554, 0....	1.605859	6.084491
3	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	3	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_3.png	12623730_14_Summary_3.txt	[0.018105685710906982, 0.026488685980439186, 0...	1.581907	6.084695
4	12623730_14_Summary.pdf	https://zenodo.org/api/records/12623730/files/...	4	Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/...	12623730_14_Summary_4.png	12623730_14_Summary_4.txt	[-0.027609605342149734, 0.0015738429501652718,...	2.163119	7.161102
...	...	...	...	...	...	...	...	...	...
858	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	29	Slide 30\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_29.png	12623730_06_Chatbots_29.txt	[-0.011728818528354168, -0.0007099526119418442...	3.970344	5.525424
859	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	30	Slide 31\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_30.png	12623730_06_Chatbots_30.txt	[-0.007209372241050005, 0.004134070128202438, ...	5.693106	7.587674
860	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	31	Slide 32\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_31.png	12623730_06_Chatbots_31.txt	[-0.014446760527789593, 0.013194024562835693, ...	4.285961	5.682843
861	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	32	Slide 33\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_32.png	12623730_06_Chatbots_32.txt	[-0.035361308604478836, -0.001887816353701055,...	6.068371	4.506485
862	12623730_06_Chatbots.pdf	https://zenodo.org/api/records/12623730/files/...	33	Slide 34\nRobert Haase\n@haesleinhuepf\nBIDS L...	12623730_06_Chatbots_33.png	12623730_06_Chatbots_33.txt	[-0.006428151857107878, 0.016945311799645424, ...	4.330415	5.363225

863 rows × 9 columns

In [6]:

import stackview
stackview.scatterplot(df, column_x="UMAP0", column_y="UMAP1")

Out[6]:

HBox(children=(VBox(children=(VBox(children=(HBox(children=(Label(value='Axes '), Dropdown(index=7, layout=Lay…

In [8]:

df["selection"]

Out[8]:

0      False
1      False
2      False
3      False
4      False
       ...  
858    False
859    False
860    False
861    False
862    False
Name: selection, Length: 863, dtype: bool

Finally, we save the data in two yml files: One complete set including the embedding vectors and another one with only the UMAP columns.

In [9]:

import yaml

# Convert DataFrame to dictionary
data_dict = df.to_dict()

# Save as YAML file
with open('data.yml', 'w') as file:
    yaml.dump(data_dict, file)

print("DataFrame saved as data.yml")

DataFrame saved as data.yml

In [10]:

import yaml

# Convert DataFrame to dictionary
data_dict = df[["filename", "png_filename", "text", "url", "page_index", "UMAP0", "UMAP1"]].to_dict()

# Save as YAML file
with open('data_png_umap.yml', 'w') as file:
    yaml.dump(data_dict, file)

print("DataFrame saved as data.yml")

DataFrame saved as data.yml

In [ ]:

# Read YAML file
with open('data.yml', 'r') as file:
    loaded_dict = yaml.safe_load(file)

# Convert dictionary back to DataFrame 
loaded_df = pd.DataFrame(loaded_dict)

# Show first few rows of the loaded DataFrame
loaded_df.head()

In [ ]: