The first step is to import the libraries and set the OpenAI API key and endpoint. You'll need to set the following environment variables:
- `AZURE_OPENAI_API_KEY` - Your OpenAI API key
- `AZURE_OPENAI_ENDPOINT` - Your OpenAI endpoint

```python
import os

import pandas as pd
import openai
from openai.embeddings_utils import cosine_similarity, get_embedding

OPENAI_EMBEDDING_ENGINE = "text-embedding-ada-002"
SIMILARITIES_RESULTS_THRESHOLD = 0.75
DATASET_NAME = "embedding_index_3m.json"

openai.api_type = "azure"
openai.api_key = os.environ["AZURE_OPENAI_API_KEY"]
openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
openai.api_version = "2023-07-01-preview"

OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ[
    "AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME"
]
```
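If you prefer not to export these variables in your shell, one common alternative is to keep them in a `.env` file and load them with the optional `python-dotenv` package. This is a convenience sketch, not part of the lesson code; run it before the configuration cell above:

```python
# Optional: load the AZURE_OPENAI_* variables from a local .env file
# instead of exporting them in the shell. Requires `pip install python-dotenv`.
from dotenv import load_dotenv

load_dotenv()  # reads a .env file from the current directory, if one exists
```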
Next, we are going to load the Embedding Index into a Pandas DataFrame. The Embedding Index is stored in a JSON file called `embedding_index_3m.json`. It contains the Embeddings for each of the YouTube transcripts up until late October 2023.
```python
def load_dataset(source: str) -> pd.core.frame.DataFrame:
    # Load the video session index
    pd_vectors = pd.read_json(source)
    return pd_vectors.drop(columns=["text"], errors="ignore").fillna("")
```
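For orientation, a quick sanity check after loading confirms the DataFrame has the columns the rest of the lesson relies on. The column names listed here are inferred from the code later in this lesson, so treat them as assumptions:

```python
# Quick sanity check of the loaded index. Column names are inferred
# from the code that follows, not documented in the dataset itself.
pd_vectors = load_dataset(DATASET_NAME)
print(pd_vectors.columns.tolist())  # expect videoId, title, summary, speaker, seconds, ada_v2
print(f"{len(pd_vectors)} video segments loaded")
```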
Next, we are going to create a function called `get_videos` that will search the Embedding Index for the query. The function returns the top 5 videos that are most similar to the query. The function works as follows:

1. First, a copy of the Embedding Index is created.
2. Next, the Embedding for the query is calculated using the OpenAI Embedding API.
3. Then a new column called `similarity` is created in the Embedding Index. The `similarity` column contains the cosine similarity between the query Embedding and the Embedding for each video segment.
4. Next, the Embedding Index is filtered by the `similarity` column. Only videos with a cosine similarity greater than or equal to 0.75 are included.
5. Finally, the Embedding Index is sorted by the `similarity` column and the top 5 videos are returned.

```python
def get_videos(
    query: str, dataset: pd.core.frame.DataFrame, rows: int
) -> pd.core.frame.DataFrame:
    # create a copy of the dataset
    video_vectors = dataset.copy()

    # get the embedding for the query
    query_embeddings = get_embedding(query, OPENAI_EMBEDDING_ENGINE)

    # create a new column with the calculated similarity for each row
    video_vectors["similarity"] = video_vectors["ada_v2"].apply(
        lambda x: cosine_similarity(query_embeddings, x)
    )

    # filter out videos below the similarity threshold
    mask = video_vectors["similarity"] >= SIMILARITIES_RESULTS_THRESHOLD
    video_vectors = video_vectors[mask].copy()

    # sort the videos by similarity, most similar first
    video_vectors = video_vectors.sort_values(by="similarity", ascending=False)

    # return the top rows
    return video_vectors.head(rows)
```
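A note on compatibility: `openai.embeddings_utils` was removed in the 1.x rewrite of the `openai` package, so the import at the top of this lesson only works on pre-1.0 versions. If that import fails for you, the same cosine similarity can be computed directly with NumPy. A minimal drop-in sketch, usable in the `apply` call above in place of `cosine_similarity`:

```python
import numpy as np


def cosine_similarity_np(a, b) -> float:
    # cosine(a, b) = (a . b) / (||a|| * ||b||)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
```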
The `display_results` function is very simple; it just prints out the results of the search query.
```python
def display_results(videos: pd.core.frame.DataFrame, query: str):
    def _gen_yt_url(video_id: str, seconds: int) -> str:
        """Build a YouTube URL that starts at the given offset in seconds."""
        return f"https://youtu.be/{video_id}?t={seconds}"

    print(f"\nVideos similar to '{query}':")
    for _, row in videos.iterrows():
        youtube_url = _gen_yt_url(row["videoId"], row["seconds"])
        print(f" - {row['title']}")
        print(f"   Summary: {' '.join(row['summary'].split()[:15])}...")
        print(f"   YouTube: {youtube_url}")
        print(f"   Similarity: {row['similarity']}")
        print(f"   Speakers: {row['speaker']}")
```
The final piece ties everything together:

1. The Embedding Index is loaded into a Pandas DataFrame.
2. The user is prompted to enter a query.
3. The `get_videos` function is called to search the Embedding Index for the query.
4. The `display_results` function is called to display the results to the user.
5. The user is prompted for the next query; this repeats until the user enters `exit`.

You will be prompted to enter a query. Enter a query and press enter. The application will return a list of videos that are relevant to the query, along with a link to the place in each video where the answer to the question is located.

Run the loop below, then try out some queries of your own:
```python
pd_vectors = load_dataset(DATASET_NAME)

# get the user query from input
while True:
    query = input("Enter a query: ")
    if query == "exit":
        break
    videos = get_videos(query, pd_vectors, 5)
    display_results(videos, query)
```
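If you want to run a search without the interactive loop, for example from a script or a test, a one-off invocation looks like this (the query string is just an illustrative placeholder, not from the lesson):

```python
# One-off search outside the interactive loop.
pd_vectors = load_dataset(DATASET_NAME)
results = get_videos("how do neural networks work", pd_vectors, 5)
display_results(results, "how do neural networks work")
```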