Skip to content

Commit

Permalink
Youtube (#25)
Browse files Browse the repository at this point in the history
* ✅Setup LLM
✅Load Youtube video transcripts

* ✅Retrieving the Youtube Transcripts
✅Summarizing the transcript using map_reduce and stuff
✅Saving the original as well as summarized transcript

* ✅Retrieving the Youtube Transcripts
✅Summarizing the transcript using map_reduce and stuff
✅Saving the original as well as summarized transcript

* ✅Successful ingestion of Transcripts
✅Successful retrieval of transcripts from Pinecone
✅Use Pinecone as a retriever chain
✅Integrating Langfuse for tracing
✅Using Langfuse Callback as LangChain Callbacks

* ✅Updated requirements.txt
  • Loading branch information
arkapatra31 authored Jan 22, 2025
1 parent 9a09ce8 commit 3b5d28f
Show file tree
Hide file tree
Showing 21 changed files with 1,332 additions and 560 deletions.
3 changes: 3 additions & 0 deletions llms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from llms.openai_llm import llm

# Public names exported by ``from llms import *``.
# ``__all__`` must list names as strings; listing the object itself makes a
# star-import raise TypeError.
__all__ = ["llm"]
9 changes: 9 additions & 0 deletions llms/openai_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a local .env file into the process environment before
# the client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Shared ChatOpenAI instance: deterministic output (temperature 0) on gpt-4o-mini
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", verbose=True)

# ``__all__`` entries must be strings, not the exported objects themselves.
__all__ = ["llm"]
Binary file modified requirements.txt
Binary file not shown.
53 changes: 1 addition & 52 deletions synthetic_data/config/Cars_config.json
Original file line number Diff line number Diff line change
@@ -1,52 +1 @@
{
"car_id ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"make ": {
"data_type": "string",
"example_data": "['Honda', 'Hyundai', 'BMW']",
"foreign_key": 0
},
"model ": {
"data_type": "string",
"example_data": "['City', 'i20', 'X1']",
"foreign_key": 0
},
"year ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"color ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"engine_type ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"price ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"owner_id ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"description ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
},
"dealer_id ": {
"data_type": "string",
"example_data": "[]",
"foreign_key": 0
}
}
{"car_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "make ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "model ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "year ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "color ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "mileage ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "price ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "engine_type ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "transmission ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "fuel_type ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "service_history_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "description ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "created_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "updated_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "showroom_id": {"data_type": "string", "example_data": "[]", "foreign_key": 1}}
2 changes: 1 addition & 1 deletion synthetic_data/config/Showroom_config.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"showroom_id ": {"data_type": "string", "example_data": "[]"}, "showroom_name ": {"data_type": "string", "example_data": "[]"}, "location ": {"data_type": "string", "example_data": "[]"}, "manager_id ": {"data_type": "string", "example_data": "[]"}, "contact_number ": {"data_type": "string", "example_data": "[]"}, "opening_hours ": {"data_type": "string", "example_data": "[]"}, "capacity ": {"data_type": "string", "example_data": "[]"}, "description ": {"data_type": "string", "example_data": "[]"}, "brand_id ": {"data_type": "string", "example_data": "[]"}, "city_id ": {"data_type": "string", "example_data": "[]"}}
{"showroom_id ": {"data_type": "string", "example_data": "['SHRM12589', 'SHRM55896']", "foreign_key": 0}, "showroom_name ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "location ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "manager_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "contact_number ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "opening_hours ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "capacity ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "description ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "brand_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "created_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}}
1,502 changes: 1,001 additions & 501 deletions synthetic_data/data/products/Cars_data.csv

Large diffs are not rendered by default.

107 changes: 101 additions & 6 deletions synthetic_data/data/products/Showroom_data.csv

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions youtube_transcripts/UI/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from dotenv import load_dotenv
import streamlit as st
from youtube_transcripts import retrieval_chain
from youtube_transcripts.langfuse.callbackHandler import langfuseHandler

# Configure the page (title, YouTube-style camera icon, centered layout)
st.set_page_config(page_title="Ask Youtube", page_icon="📹", layout="centered")
st.title("Q&A")

# Free-text question typed by the user
question = st.text_input("Enter your question here")

# The "Ask" button is only rendered once a question has been entered; the
# short-circuiting ``and`` preserves that (st.button is not called otherwise).
if question and st.button("Ask"):
    # Attach the Langfuse handler so this run is traced under a fixed name
    trace_config = {
        "callbacks": [langfuseHandler],
        "run_name": "langfuse-trace-qa",
    }

    # Invoke the retrieval chain (Pinecone vector store + QA prompt from LangChain Hub)
    result = retrieval_chain.invoke({"input": question}, config=trace_config)

    st.write(result["answer"])
7 changes: 7 additions & 0 deletions youtube_transcripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from youtube_transcripts.openai_llm import llm
from youtube_transcripts.retrieve_transcripts.retriever import retrieval_chain
from youtube_transcripts.transcripts_loaders.video_transcripts_loaders import load_youtube_transcripts
from youtube_transcripts.summarizer.export_transcript import export_transcript_text
from youtube_transcripts.langfuse.callbackHandler import langfuseHandler

# Public API for ``from youtube_transcripts import *``.
# Entries must be strings: listing the objects themselves makes a star-import
# raise TypeError. ``langfuseHandler`` is imported above for re-export, so it
# is listed alongside the other names.
__all__ = [
    "llm",
    "load_youtube_transcripts",
    "export_transcript_text",
    "retrieval_chain",
    "langfuseHandler",
]
39 changes: 39 additions & 0 deletions youtube_transcripts/ingest_transcript/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from dotenv import load_dotenv
import os
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import OllamaEmbeddings
from youtube_transcripts import llm, export_transcript_text
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load environment variables from a local .env file
load_dotenv()

# Resolve the transcript relative to this file rather than the current working
# directory, so the script behaves the same no matter where it is launched from.
path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..", "summarizer", "long_video", "transcript.txt",
)
transcript = export_transcript_text(path)

# NOTE(review): this value is not passed to PineconeVectorStore below;
# presumably the client reads PINECONE_API_KEY from the environment — confirm.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_index = "youtube-transcripts"
try:
    # export_transcript_text returns "" when the file is missing or unreadable;
    # stop here instead of sending an empty batch to the embedding model.
    if not transcript.strip():
        raise ValueError(f"No transcript text loaded from {path}; nothing to ingest")

    # Separators are tried in order and "" always matches, so it must come
    # LAST. With "" first the splitter cuts per character and never tries the
    # coarser separators, producing chunks that break mid-word.
    textSplitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=0,
        length_function=len,
        keep_separator=True,
        separators=["\n", " ", ""],
    )

    chunks = textSplitter.split_text(transcript)

    embeddings = OllamaEmbeddings(model="llama3:latest", num_gpu=1)

    # Embed the chunks and upsert them into the Pinecone index
    vector_store = PineconeVectorStore.from_texts(
        index_name=pinecone_index,
        embedding=embeddings,
        texts=chunks,
    )

    print("Pinecone Vector Store Created")

except Exception as e:
    # Best-effort script: report the failure instead of dumping a traceback
    print(f"Error: {e}")
13 changes: 13 additions & 0 deletions youtube_transcripts/langfuse/callbackHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler

# Load environment variables from a local .env file so the keys below resolve
load_dotenv()

# Langfuse callback handler built from LANGFUSE_SECRET_KEY, LANGFUSE_PUBLIC_KEY
# and LANGFUSE_HOST. os.getenv returns None for any variable that is unset.
langfuseHandler = CallbackHandler(
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    host=os.getenv("LANGFUSE_HOST"),
)

# ``__all__`` entries must be strings, not the exported objects themselves.
__all__ = ["langfuseHandler"]
9 changes: 9 additions & 0 deletions youtube_transcripts/openai_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a local .env file into the process environment before
# the client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Shared OpenAI chat model: temperature 0 on gpt-4o-mini, with stream_usage enabled
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", verbose=True, stream_usage=True)

# ``__all__`` entries must be strings, not the exported objects themselves.
__all__ = ["llm"]
31 changes: 31 additions & 0 deletions youtube_transcripts/retrieve_transcripts/retriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from langchain.chains.combine_documents import create_stuff_documents_chain
from youtube_transcripts import llm
from langchain import hub
from langchain_community.embeddings import OllamaEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.chains.retrieval import create_retrieval_chain


# Name of the Pinecone index that holds the transcript chunks.
# NOTE(review): no API key is passed explicitly in this module; presumably
# PineconeVectorStore reads PINECONE_API_KEY from the environment — confirm.
pinecone_index = "youtube-transcripts"

# Pull the prompt for the QA chain from LangChain Hub
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Initialise embeddings. Queries must be embedded with the same model that was
# used when the chunks were ingested, otherwise similarity search is meaningless.
embeddings = OllamaEmbeddings(model="llama3:latest")

# Initialise the vector store on the existing index
vector_store = PineconeVectorStore(index_name=pinecone_index, embedding=embeddings)

# Chain that stuffs the retrieved documents into the QA prompt for the LLM
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

# Retrieval chain: fetch documents from Pinecone, then answer with the chain above
retrieval_chain = create_retrieval_chain(
    retriever=vector_store.as_retriever(),
    combine_docs_chain=combine_docs_chain,
)

# ``__all__`` entries must be strings, not the exported objects themselves.
__all__ = ["retrieval_chain"]
19 changes: 19 additions & 0 deletions youtube_transcripts/summarizer/export_transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

def export_transcript_text(file_path: str) -> str:
    """Return the full text of the file at *file_path*.

    This is a best-effort reader: problems are reported on stdout rather than
    raised, and an empty string is returned in every failure case.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents, or ``""`` if the file does not exist or cannot be
        read.
    """
    # Try the read directly instead of checking existence first: this avoids
    # the race between the check and the open, and keeps the same messages.
    try:
        with open(file_path, "r") as file:
            return file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")

    return ""


# ``__all__`` entries must be strings; listing the function object itself makes
# ``from ... import *`` raise TypeError.
__all__ = ["export_transcript_text"]
1 change: 1 addition & 0 deletions youtube_transcripts/summarizer/long_video/summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
In this course, Lance Martin from LangChain teaches how to implement Retrieval-Augmented Generation (RAG) to enhance large language models (LLMs) using private data. The curriculum covers the entire RAG pipeline, including data indexing, document retrieval, and answer generation, while exploring techniques like query translation and multi-query approaches. Advanced methods such as hierarchical indexing and corrective RAG are also discussed to improve retrieval accuracy. The course aims to equip learners with practical skills to build effective RAG systems that integrate public and private data.
1 change: 1 addition & 0 deletions youtube_transcripts/summarizer/long_video/transcript.txt

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions youtube_transcripts/summarizer/long_video/transcript_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from youtube_transcripts import llm, load_youtube_transcripts
from langchain.chains.summarize import load_summarize_chain

# Fetch the transcript documents of the YouTube video
documents = load_youtube_transcripts()

# The loader yields nothing usable when the transcript could not be fetched;
# stop with a clear message instead of failing later with an opaque TypeError.
if not documents:
    raise SystemExit("No transcript could be loaded; nothing to summarize.")

# Save the raw transcript; the ``with`` block closes the file, so no explicit
# close() call is needed.
with open("transcript.txt", "w") as f:
    f.writelines(doc.page_content for doc in documents)

# Summarize with map_reduce: each chunk is summarized, then the partial
# summaries are combined
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)

# NOTE(review): ``run`` appears to be deprecated in newer LangChain releases in
# favour of ``invoke`` — confirm against the pinned version before migrating.
response = chain.run(documents)

# Save the summary in a text file
with open("summary.txt", "w") as f:
    f.write(response)
1 change: 1 addition & 0 deletions youtube_transcripts/summarizer/short_video/summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
In this course, Lance Martin, a software engineer at LangChain, teaches how to implement Retrieval-Augmented Generation (RAG) from scratch. RAG combines custom data with large language models (LLMs) to enhance their capabilities, especially given that most data is private while LLMs are trained on public data. The course covers the entire RAG pipeline, including indexing external data, retrieval of relevant documents, and generation of answers based on those documents. Key techniques discussed include query translation, routing to appropriate data sources, and query construction for various databases. Advanced methods such as multi-query approaches, hierarchical indexing (Raptor), and corrective RAG are also explored to improve retrieval accuracy and efficiency. The course emphasizes the importance of integrating private data into LLMs and the evolving landscape of RAG technology as LLMs' context windows expand.
1 change: 1 addition & 0 deletions youtube_transcripts/summarizer/short_video/transcript.txt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import List
from langchain_core.documents import Document
from youtube_transcripts import llm, load_youtube_transcripts
from langchain.chains.summarize import load_summarize_chain

# Fetch the transcript documents of the YouTube video
documents = load_youtube_transcripts()

# The loader yields nothing usable when the transcript could not be fetched;
# stop with a clear message instead of failing later with an opaque TypeError.
if not documents:
    raise SystemExit("No transcript could be loaded; nothing to summarize.")

# Save the raw transcript; the ``with`` block closes the file, so no explicit
# close() call is needed.
with open("transcript.txt", "w") as f:
    f.writelines(doc.page_content for doc in documents)

# Summarize with the "stuff" strategy: all documents go into a single prompt,
# which suits a short video
chain = load_summarize_chain(llm, chain_type="stuff", verbose=True)

# NOTE(review): ``run`` appears to be deprecated in newer LangChain releases in
# favour of ``invoke`` — confirm against the pinned version before migrating.
response = chain.run(documents)

# Save the summary in a text file
with open("summary.txt", "w") as f:
    f.write(response)
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat

def load_youtube_transcripts(video_id: str = "sVcwVQRHIc8"):
    """Load the English transcript of a YouTube video.

    This is a best-effort loader: every failure is reported on stdout rather
    than raised.

    Args:
        video_id: YouTube video id to load. Defaults to the video this project
            was originally built around, so existing zero-argument calls keep
            working.

    Returns:
        The non-empty result of ``YoutubeLoader.load()``, or ``None`` when
        loading failed or produced nothing.
    """
    try:
        # Request the English transcript as plain text
        yt_loader = YoutubeLoader(
            video_id=video_id,
            language=["en"],
            translation="en",
            transcript_format=TranscriptFormat.TEXT,
        )

        transcript = yt_loader.load()

        if transcript:
            return transcript

    except ImportError as ie:
        print(f"Import error: {ie}")
    except ValueError as ve:
        print(f"Value error: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # Explicit so callers can see that a failed or empty load yields None
    return None


# ``__all__`` entries must be strings; listing the function object itself makes
# ``from ... import *`` raise TypeError.
__all__ = ["load_youtube_transcripts"]

0 comments on commit 3b5d28f

Please sign in to comment.