In this notebook we will download training materials in PDF format, extract the text of every page and save the pages as PNG files. We will then use OpenAI's text embeddings and UMAP dimensionality reduction to obtain a simple two-dimensional embedding of the training materials' contents.
pip install PyPDF2 pdf2image
conda install umap-learn openai
import os
import requests
import json
from pdf2image import convert_from_path
import PyPDF2
import numpy as np
import pandas as pd
from openai import OpenAI
import umap
import stackview as sv
from PIL import Image
import tempfile
We will use training materials about Bio-image Data Science, which are licensed CC-BY 4.0 by Robert Haase.
def download_pdfs_from_zenodo(record_id, timeout=60):
    """
    Download all PDF files attached to a Zenodo record into ``downloads/``.

    Files that already exist locally are not downloaded again.

    Parameters
    ----------
    record_id : str or int
        Identifier of the Zenodo record.
    timeout : float, optional
        Seconds to wait for the server on each HTTP request.

    Returns
    -------
    list of dict
        One entry per PDF with the keys ``'filename'`` (local file name,
        prefixed with the record id) and ``'url'`` (download link).

    Raises
    ------
    requests.HTTPError
        If the record metadata or one of the files cannot be retrieved.
    """
    # Accept integer ids as well; the id is concatenated into file names below.
    record_id = str(record_id)
    base_url = f"https://zenodo.org/api/records/{record_id}"
    response = requests.get(base_url, timeout=timeout)
    # Fail here on 404/5xx instead of with a KeyError on data['files'].
    response.raise_for_status()
    data = response.json()

    os.makedirs('downloads', exist_ok=True)

    files_info = []
    for file in data['files']:
        if not file['key'].endswith('.pdf'):
            continue

        download_url = file['links']['self']
        filename = record_id + "_" + file['key']
        filepath = os.path.join('downloads', filename)

        if not os.path.exists(filepath):
            # Write to a temporary name first so that an interrupted download
            # is never mistaken for a complete, cached file on the next run.
            partial_path = filepath + '.part'
            with requests.get(download_url, stream=True, timeout=timeout) as download:
                download.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in download.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, filepath)

        # Recorded for every PDF, whether freshly downloaded or already cached.
        files_info.append({'filename': filename, 'url': download_url})

    return files_info
# Download every PDF attached to Zenodo record 12623730 into the 'downloads' folder
files_info = download_pdfs_from_zenodo('12623730')
# Bare expression so the notebook displays the list of {'filename', 'url'} entries
files_info
[{'filename': '12623730_14_Summary.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/14_Summary.pdf/content'}, {'filename': '12623730_10_function_calling.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/10_function_calling.pdf/content'}, {'filename': '12623730_11_prompteng_rag_finetuning.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/11_prompteng_rag_finetuning.pdf/content'}, {'filename': '12623730_12_Vision_models.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/12_Vision_models.pdf/content'}, {'filename': '12623730_09_Deep_Learning.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/09_Deep_Learning.pdf/content'}, {'filename': '12623730_08_Sup_Unsup_Machine_Learning.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/08_Sup_Unsup_Machine_Learning.pdf/content'}, {'filename': '12623730_03_RSM_Image_Processing.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/03_RSM_Image_Processing.pdf/content'}, {'filename': '12623730_01_Introduction_BIDS_2024.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/01_Introduction_BIDS_2024.pdf/content'}, {'filename': '12623730_13_quality_assurance.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/13_quality_assurance.pdf/content'}, {'filename': '12623730_02_Introduction_RDM_2024.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/02_Introduction_RDM_2024.pdf/content'}, {'filename': '12623730_04_Image_segmentation.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/04_Image_segmentation.pdf/content'}, {'filename': '12623730_05_Surface_Recon_QA.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/05_Surface_Recon_QA.pdf/content'}, {'filename': '12623730_07_distributed_gpu_computing.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/07_distributed_gpu_computing.pdf/content'}, {'filename': '12623730_06_Chatbots.pdf', 'url': 'https://zenodo.org/api/records/12623730/files/06_Chatbots.pdf/content'}]
Next we go through all pages, save them as PNG images and use their text to create embedding vectors.
def resize_image(image, height):
    """
    Resize the image to the specified height while maintaining aspect ratio.

    Parameters
    ----------
    image : PIL.Image.Image
        The image to resize.
    height : int
        The desired height in pixels.

    Returns
    -------
    PIL.Image.Image
        The resized image.
    """
    aspect_ratio = image.width / image.height
    # Extremely narrow images would otherwise round down to a width of 0,
    # which Pillow rejects.
    new_width = max(1, int(aspect_ratio * height))
    # Pillow 9.1+ exposes the filters on Image.Resampling; using it avoids the
    # DeprecationWarning for Image.LANCZOS. Older versions fall back to Image.
    resampling = getattr(Image, "Resampling", Image)
    return image.resize((new_width, height), resampling.LANCZOS)
def process_pdf(pdf_info):
    """
    Split a downloaded PDF into pages and compute a text embedding per page.

    For every page, a PNG rendering (300 pixels high) and the extracted text
    are stored in the ``downloads`` folder, and the text is sent to the OpenAI
    embeddings endpoint.

    Parameters
    ----------
    pdf_info : dict
        Entry with the keys ``'filename'`` (name of the PDF inside
        ``downloads``) and ``'url'``.

    Returns
    -------
    list of dict
        One record per page with the keys ``'filename'``, ``'url'``,
        ``'page_index'``, ``'text'``, ``'png_filename'``, ``'txt_filename'``
        and ``'embedding_vector'``.
    """
    filename = pdf_info['filename']
    filepath = os.path.join('downloads', filename)
    base_name = os.path.splitext(filename)[0]

    os.makedirs('downloads', exist_ok=True)

    # The client picks up the API key from the OPENAI_API_KEY environment
    # variable; no key is passed explicitly here.
    client = OpenAI()

    # Convert PDF pages to images
    images = [resize_image(i, height=300) for i in convert_from_path(filepath)]

    # Extract text from PDF
    pdf_reader = PyPDF2.PdfReader(filepath)

    page_data = []
    for i, image in enumerate(images):
        # Save image
        png_filename = f"{base_name}_{i}.png"
        png_path = os.path.join('downloads', png_filename)
        image.save(png_path)

        # Save text
        txt_filename = f"{base_name}_{i}.txt"
        txt_path = os.path.join('downloads', txt_filename)
        text = pdf_reader.pages[i].extract_text()
        # Check the path inside 'downloads' (not the bare file name, which
        # would be looked up relative to the working directory).
        if not os.path.exists(txt_path):
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)

        # Get embedding
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"
        )
        embedding_vector = response.data[0].embedding

        page_data.append({
            'filename': filename,
            'url': pdf_info['url'],
            'page_index': i,
            'text': text,
            'png_filename': png_filename,
            'txt_filename': txt_filename,
            'embedding_vector': embedding_vector
        })

    return page_data
# Run every downloaded PDF through process_pdf and pool the per-page records
all_page_data = []
for pdf_info in files_info:
    all_page_data += process_pdf(pdf_info)
C:\Users\rober\AppData\Local\Temp\ipykernel_12900\2891063455.py:19: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead. return image.resize((new_width, height), Image.LANCZOS)
The result will be stored in a pandas DataFrame.
# Create DataFrame: one row per PDF page, one column per key of the page records
df = pd.DataFrame(all_page_data)
# Bare expression so the notebook renders the table
df
filename | url | page_index | text | png_filename | txt_filename | embedding_vector | |
---|---|---|---|---|---|---|---|
0 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 0 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_0.png | 12623730_14_Summary_0.txt | [-0.01753188483417034, 0.009571048431098461, 0... |
1 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 1 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_1.png | 12623730_14_Summary_1.txt | [0.001144174369983375, 0.008919398300349712, -... |
2 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 2 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_2.png | 12623730_14_Summary_2.txt | [0.01131830457597971, 0.033359214663505554, 0.... |
3 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 3 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_3.png | 12623730_14_Summary_3.txt | [0.018105685710906982, 0.026488685980439186, 0... |
4 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 4 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_4.png | 12623730_14_Summary_4.txt | [-0.027609605342149734, 0.0015738429501652718,... |
... | ... | ... | ... | ... | ... | ... | ... |
858 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 29 | Slide 30\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_29.png | 12623730_06_Chatbots_29.txt | [-0.011728818528354168, -0.0007099526119418442... |
859 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 30 | Slide 31\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_30.png | 12623730_06_Chatbots_30.txt | [-0.007209372241050005, 0.004134070128202438, ... |
860 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 31 | Slide 32\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_31.png | 12623730_06_Chatbots_31.txt | [-0.014446760527789593, 0.013194024562835693, ... |
861 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 32 | Slide 33\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_32.png | 12623730_06_Chatbots_32.txt | [-0.035361308604478836, -0.001887816353701055,... |
862 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 33 | Slide 34\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_33.png | 12623730_06_Chatbots_33.txt | [-0.006428151857107878, 0.016945311799645424, ... |
863 rows × 7 columns
We then perform dimensionality reduction on the embedding vectors and add two new columns to the dataset: UMAP0 and UMAP1.
# Stack the per-page embedding lists into one 2D array (one row per page)
embeddings = np.asarray(df['embedding_vector'].to_list())

# Project the embeddings to two dimensions with a fixed random seed
reducer = umap.UMAP(random_state=42, n_components=2)
umap_embeddings = reducer.fit_transform(embeddings)

# Store both coordinates next to the page metadata
df['UMAP0'], df['UMAP1'] = umap_embeddings[:, 0], umap_embeddings[:, 1]
df
C:\Users\rober\miniforge3\envs\devbio-napari-env\Lib\site-packages\umap\umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
filename | url | page_index | text | png_filename | txt_filename | embedding_vector | UMAP0 | UMAP1 | |
---|---|---|---|---|---|---|---|---|---|
0 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 0 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_0.png | 12623730_14_Summary_0.txt | [-0.01753188483417034, 0.009571048431098461, 0... | 2.785299 | 5.125338 |
1 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 1 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_1.png | 12623730_14_Summary_1.txt | [0.001144174369983375, 0.008919398300349712, -... | 1.759109 | 5.196022 |
2 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 2 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_2.png | 12623730_14_Summary_2.txt | [0.01131830457597971, 0.033359214663505554, 0.... | 1.605859 | 6.084491 |
3 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 3 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_3.png | 12623730_14_Summary_3.txt | [0.018105685710906982, 0.026488685980439186, 0... | 1.581907 | 6.084695 |
4 | 12623730_14_Summary.pdf | https://zenodo.org/api/records/12623730/files/... | 4 | Robert Haase\n@haesleinhuepf\nBIDS Lecture 14/... | 12623730_14_Summary_4.png | 12623730_14_Summary_4.txt | [-0.027609605342149734, 0.0015738429501652718,... | 2.163119 | 7.161102 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
858 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 29 | Slide 30\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_29.png | 12623730_06_Chatbots_29.txt | [-0.011728818528354168, -0.0007099526119418442... | 3.970344 | 5.525424 |
859 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 30 | Slide 31\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_30.png | 12623730_06_Chatbots_30.txt | [-0.007209372241050005, 0.004134070128202438, ... | 5.693106 | 7.587674 |
860 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 31 | Slide 32\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_31.png | 12623730_06_Chatbots_31.txt | [-0.014446760527789593, 0.013194024562835693, ... | 4.285961 | 5.682843 |
861 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 32 | Slide 33\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_32.png | 12623730_06_Chatbots_32.txt | [-0.035361308604478836, -0.001887816353701055,... | 6.068371 | 4.506485 |
862 | 12623730_06_Chatbots.pdf | https://zenodo.org/api/records/12623730/files/... | 33 | Slide 34\nRobert Haase\n@haesleinhuepf\nBIDS L... | 12623730_06_Chatbots_33.png | 12623730_06_Chatbots_33.txt | [-0.006428151857107878, 0.016945311799645424, ... | 4.330415 | 5.363225 |
863 rows × 9 columns
import stackview
# Plot the pages in UMAP space; the call returns a widget that the notebook displays.
# NOTE(review): the widget presumably writes the user's selection into a
# "selection" column of df (read in the next cell) - confirm against stackview docs.
stackview.scatterplot(df, column_x="UMAP0", column_y="UMAP1")
HBox(children=(VBox(children=(VBox(children=(HBox(children=(Label(value='Axes '), Dropdown(index=7, layout=Lay…
# Inspect the boolean "selection" column. No visible code creates it, so it is
# presumably added by the scatterplot widget above - TODO confirm.
df["selection"]
0 False 1 False 2 False 3 False 4 False ... 858 False 859 False 860 False 861 False 862 False Name: selection, Length: 863, dtype: bool
Finally, we save the data in two yml files: One complete set including the embedding vectors and another one with only the UMAP columns.
import yaml

# Full table, embedding vectors included, as {column: {row index: value}}
data_dict = df.to_dict()

# Write the mapping out as YAML
with open('data.yml', mode='w') as file:
    yaml.dump(data_dict, stream=file)

print("DataFrame saved as data.yml")
DataFrame saved as data.yml
import yaml

# Reduced table: page metadata and UMAP coordinates only, without the
# embedding vectors
data_dict = df[["filename", "png_filename", "text", "url", "page_index", "UMAP0", "UMAP1"]].to_dict()

# Save as YAML file
with open('data_png_umap.yml', 'w') as file:
    yaml.dump(data_dict, file)

# Report the file that was actually written (the message used to say data.yml)
print("DataFrame saved as data_png_umap.yml")
DataFrame saved as data.yml
# Load the full export back from disk; safe_load only builds plain Python objects
with open('data.yml') as file:
    loaded_dict = yaml.safe_load(file)

# Rebuild the table from the loaded mapping
loaded_df = pd.DataFrame(loaded_dict)

# Preview the first five rows
loaded_df.head(n=5)