#!/usr/bin/env python
# coding: utf-8

# ## Notebook 1: PDF Pre-processing
#
# In this series, we will go from a PDF to a podcast using all open models.
#
# The first step on the way to the podcast is producing a script; right now our logic is:
# - Use any PDF on any topic
# - Prompt the `Llama-3.2-3B-Instruct` model to process it into a text file
# - Re-write this into a podcast transcript in the next notebook.
#
# In this notebook, we will upload a PDF and save it into a `.txt` file using the `PyPDF2` library; later we will process chunks from the text file using our featherlight model.

# Most of us shift-enter past the comments only to realise later that we need to install libraries. For the few that read the instructions, please remember to do so:

# In[1]:

#!pip install PyPDF2
#!pip install rich ipywidgets

# Assuming you have a PDF uploaded on the same machine, please set the path for the file.
#
# Also, if you want to flex your GPU, please switch to a bigger model, although the featherlight models work perfectly well for this task:

# In[2]:

pdf_path = './resources/2106.09685v2.pdf'

DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"

# In[3]:

import PyPDF2
from typing import Optional
import os
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings('ignore')

# Let's make sure we don't stub our toe by checking that the file exists:

# In[4]:

def validate_pdf(file_path: str) -> bool:
    if not os.path.exists(file_path):
        print(f"Error: File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        print("Error: File is not a PDF")
        return False
    return True

# Convert the PDF to a `.txt` file. This simply reads and dumps the contents of the file. We set the maximum number of characters to 100k.
#
# For people converting their favorite novels into a podcast: you will have to add extra logic for inputs that exceed the Llama models' context length of 128k tokens (see the optional token-count sketch after the extraction function below).

# In[5]:

def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)

            # Get total number of pages
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            # Iterate through all pages
            for page_num in range(num_pages):
                # Extract text from page
                page = pdf_reader.pages[page_num]
                text = page.extract_text()

                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit
                    remaining_chars = max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {max_chars} character limit at page {page_num + 1}")
                    break

                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")

            final_text = '\n'.join(extracted_text)
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text

    except PyPDF2.errors.PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None
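# Not part of the original notebook, but worth flagging here: for book-length sources, a rough
# token count tells you whether the cleaned text will later fit in the 128k-token context window.
# This is only a minimal sketch; the `count_llama_tokens` helper name and the 128k budget are
# assumptions you can adjust.

# In[ ]:

def count_llama_tokens(text: str, model_name: str = DEFAULT_MODEL, context_limit: int = 128000) -> int:
    """Roughly count tokens with the model's own tokenizer and warn if the text won't fit."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_tokens = len(tokenizer.encode(text))
    if num_tokens > context_limit:
        print(f"Warning: {num_tokens} tokens exceeds the {context_limit}-token context window")
    return num_tokens

# e.g. count_llama_tokens(extracted_text) once the extraction cell below has run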
# Helper function to grab meta info about our PDF:

# In[6]:

# Get PDF metadata
def get_pdf_metadata(file_path: str) -> Optional[dict]:
    if not validate_pdf(file_path):
        return None
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata
            }
            return metadata
    except Exception as e:
        print(f"Error extracting metadata: {str(e)}")
        return None

# Finally, we can run our logic to extract the details from the file:

# In[7]:

# Extract metadata first
print("Extracting metadata...")
metadata = get_pdf_metadata(pdf_path)
if metadata:
    print("\nPDF Metadata:")
    print(f"Number of pages: {metadata['num_pages']}")
    print("Document info:")
    for key, value in metadata['metadata'].items():
        print(f"{key}: {value}")

# Extract text
print("\nExtracting text...")
extracted_text = extract_text_from_pdf(pdf_path)

# Display first 500 characters of extracted text as preview
if extracted_text:
    print("\nPreview of extracted text (first 500 characters):")
    print("-" * 50)
    print(extracted_text[:500])
    print("-" * 50)
    print(f"\nTotal characters extracted: {len(extracted_text)}")

# Optional: Save the extracted text to a file
if extracted_text:
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"\nExtracted text has been saved to {output_file}")

# ### Llama Pre-Processing
#
# Now let's indulge our distaste for writing regex and use that as justification for an LLM instead.
#
# At this point, we have a text file extracted from the PDF of a paper. PDF extracts are generally messy because of stray characters, formatting artifacts, LaTeX, tables, etc.
#
# One way to handle this would be regex (a rough sketch of that route appears after the system prompt below, for comparison); instead, we can prompt the featherlight Llama models to clean up the text for us.
#
# Please try changing the `SYS_PROMPT` below to see what improvements you can make:

# In[8]:

device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

Please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPITALISATION ETC LIKES

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""
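# For comparison, here is roughly what the regex route mentioned above could look like. This is
# only a minimal sketch: the `rough_regex_cleanup` name and the patterns are illustrative
# assumptions, not the approach this notebook actually uses. It is fast but brittle, which is
# exactly why we hand the job to the model instead.

# In[ ]:

import re

def rough_regex_cleanup(raw_text: str) -> str:
    """Crude baseline cleanup; each pattern is illustrative rather than exhaustive."""
    text = re.sub(r'\$[^$]*\$', ' ', raw_text)         # drop inline LaTeX math like $x^2$
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', ' ', text)  # drop simple LaTeX commands like \cite{...}
    text = re.sub(r'-\n', '', text)                     # re-join words hyphenated across line breaks
    text = re.sub(r'\s+', ' ', text)                    # collapse newlines and repeated whitespace
    return text.strip()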
# Instead of having the model process the entire file at once, as you noticed in the prompt, we will pass it chunks of the file.
#
# One issue with chunking by a raw character count is that we can cut words in half, so instead we chunk at word boundaries:

# In[9]:

def create_word_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at word boundaries close to the target chunk size.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word) + 1  # +1 for the space
        if current_length + word_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Let's load in the model and start processing the text chunks:

# In[11]:

accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
model, tokenizer = accelerator.prepare(model, tokenizer)

# In[12]:

def process_chunk(text_chunk, chunk_num):
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]

    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=512,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (slicing the decoded string by len(prompt)
    # is unreliable because special tokens are stripped during decoding)
    processed_text = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True
    ).strip()

    # Print chunk information for monitoring
    #print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
    print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
    print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
    print(f"{'='*90}\n")

    return processed_text

# In[13]:

INPUT_FILE = "./extracted_text.txt"  # Replace with your file path
CHUNK_SIZE = 1000  # Adjust chunk size if needed

# Read the file
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

chunks = create_word_bounded_chunks(text, CHUNK_SIZE)
num_chunks = len(chunks)

# In[14]:

num_chunks

# In[15]:

# Process the file with ordered output
# Create output file name
output_file = f"clean_{os.path.basename(INPUT_FILE)}"

# In[16]:

with open(output_file, 'w', encoding='utf-8') as out_file:
    processed_text = ""
    for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        # Process chunk and append to complete text
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"

        # Write chunk immediately to file
        out_file.write(processed_chunk + "\n")
        out_file.flush()

# Let's print out the final processed version to make sure things look good:

# In[17]:

print("\nProcessing complete!")
print(f"Input file: {INPUT_FILE}")
print(f"Output file: {output_file}")
print(f"Total chunks processed: {num_chunks}")

# Preview the beginning and end of the complete processed text
print("\nPreview of final processed text:")
print("\nBEGINNING:")
print(processed_text[:1000])
print("\n...\n\nEND:")
print(processed_text[-1000:])
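# Optional, and not part of the original flow: a quick sanity check comparing the raw extraction
# with the cleaned file, so we know the next notebook has something usable to read. File sizes
# are only a crude proxy; eyeball the preview above for actual quality.

# In[ ]:

if os.path.exists(INPUT_FILE) and os.path.exists(output_file):
    print(f"Raw extraction: {os.path.getsize(INPUT_FILE)} bytes ({INPUT_FILE})")
    print(f"Cleaned output: {os.path.getsize(output_file)} bytes ({output_file})")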
# ### Next Notebook: Transcript Writer
#
# Now that we have the pre-processed text ready, we can move on to converting it into a transcript in the next notebook.

# In[18]:

#fin

# In[ ]: