#!/usr/bin/env python
# coding: utf-8

# The first step is to import the libraries and set the OpenAI API key and
# endpoint. You'll need to set the following environment variables:
#
# - `AZURE_OPENAI_API_KEY` - Your OpenAI API key
# - `AZURE_OPENAI_ENDPOINT` - Your OpenAI endpoint

# In[ ]:

import os

import pandas as pd
import openai
from openai.embeddings_utils import cosine_similarity, get_embedding

# Embedding model and search configuration.
OPENAI_EMBEDDING_ENGINE = "text-embedding-ada-002"
SIMILARITIES_RESULTS_THRESHOLD = 0.75
DATASET_NAME = "embedding_index_3m.json"

# Azure OpenAI credentials come from the environment; a KeyError here means
# a required variable is not set.
openai.api_type = "azure"
openai.api_key = os.environ["AZURE_OPENAI_API_KEY"]
openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
openai.api_version = "2023-07-01-preview"

OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ[
    "AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME"
]

# Next, we are going to load the Embedding Index into a Pandas Dataframe. The
# Embedding Index is stored in a JSON file called `embedding_index_3m.json`.
# The Embedding Index contains the Embeddings for each of the YouTube
# transcripts up until late Oct 2023.

# In[ ]:

def load_dataset(source: str) -> pd.DataFrame:
    """Load the video-session embedding index from a JSON file.

    The ``text`` column is dropped when present (``errors="ignore"`` makes
    the drop a no-op if the column is absent), and missing values are
    replaced with empty strings.

    :param source: path to the JSON embedding-index file
    :return: the index as a DataFrame, without the ``text`` column
    """
    # Load the video session index
    pd_vectors = pd.read_json(source)
    return pd_vectors.drop(columns=["text"], errors="ignore").fillna("")

# Next, we are going to create a function called `get_videos` that will search
# the Embedding Index for the query. The function will return the top 5 videos
# that are most similar to the query. The function works as follows:
#
# 1. First, a copy of the Embedding Index is created.
# 2. Next, the Embedding for the query is calculated using the OpenAI
#    Embedding API.
# 3. Then a new column is created in the Embedding Index called `similarity`.
#    The `similarity` column contains the cosine similarity between the query
#    Embedding and the Embedding for each video segment.
# 4. Next, the Embedding Index is filtered by the `similarity` column.
# The Embedding Index is filtered to only include videos that have a cosine
# similarity greater than or equal to 0.75.
# 5. Finally, the Embedding Index is sorted by the `similarity` column and the
#    top 5 videos are returned.

# In[ ]:

def get_videos(
    query: str, dataset: pd.DataFrame, rows: int
) -> pd.DataFrame:
    """Search the embedding index for segments similar to *query*.

    :param query: free-text search query
    :param dataset: embedding index with an ``ada_v2`` embedding column
    :param rows: maximum number of results to return
    :return: up to *rows* segments with similarity >= the threshold,
        sorted by descending ``similarity``
    """
    # create a copy of the dataset so the caller's frame is not mutated
    video_vectors = dataset.copy()

    # get the embeddings for the query (same model used to build the index)
    query_embeddings = get_embedding(query, OPENAI_EMBEDDING_ENGINE)

    # create a new column with the calculated similarity for each row
    video_vectors["similarity"] = video_vectors["ada_v2"].apply(
        lambda x: cosine_similarity(query_embeddings, x)
    )

    # filter the videos by similarity threshold
    mask = video_vectors["similarity"] >= SIMILARITIES_RESULTS_THRESHOLD
    video_vectors = video_vectors[mask].copy()

    # sort by similarity and return the top rows
    # (the original applied .head(rows) twice; once is sufficient)
    return video_vectors.sort_values(by="similarity", ascending=False).head(rows)


# This function is very simple, it just prints out the results of the search query.

# In[ ]:

def display_results(videos: pd.DataFrame, query: str):
    """Print title, summary snippet, YouTube link, similarity and speakers
    for every video row in *videos*."""

    def _gen_yt_url(video_id: str, seconds: int) -> str:
        """Build a YouTube deep link that starts playback at *seconds*."""
        return f"https://youtu.be/{video_id}?t={seconds}"

    print(f"\nVideos similar to '{query}':")
    for _, row in videos.iterrows():
        youtube_url = _gen_yt_url(row["videoId"], row["seconds"])
        print(f" - {row['title']}")
        # show only the first 15 words of the summary
        print(f" Summary: {' '.join(row['summary'].split()[:15])}...")
        print(f" YouTube: {youtube_url}")
        print(f" Similarity: {row['similarity']}")
        print(f" Speakers: {row['speaker']}")
# The user is then prompted to enter another query. This process continues
# until the user enters `exit`.
#
# 
#
# You will be prompted to enter a query. Enter a query and press enter. The
# application will return a list of videos that are relevant to the query. The
# application will also return a link to the place in the video where the
# answer to the question is located.
#
# Here are some queries to try out:
#
# - What is Azure Machine Learning?
# - How do convolutional neural networks work?
# - What is a neural network?
# - Can I use Jupyter Notebooks with Azure Machine Learning?
# - What is ONNX?

# In[ ]:

# Load the embedding index once up front.
pd_vectors = load_dataset(DATASET_NAME)

# Read queries from user input until the user types "exit".
while (query := input("Enter a query: ")) != "exit":
    videos = get_videos(query, pd_vectors, 5)
    display_results(videos, query)

#