-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* ✅Setup LLM ✅Load Youtube video transcripts * ✅Retrieving the Youtube Transcripts ✅Summarizing the transcript using map_reduce and stuff ✅Saving the original as well as summarized transcript * ✅Retrieving the Youtube Transcripts ✅Summarizing the transcript using map_reduce and stuff ✅Saving the original as well as summarized transcript * ✅Successful ingestion of Transcripts ✅Successful retrieval of transcripts from Pinecone ✅Use Pinecone as a retriever chain ✅Integrating Langfuse for tracing ✅Using Langfuse Callback as LangChain Callbacks * ✅Updated requirements.txt
- Loading branch information
1 parent
9a09ce8
commit 3b5d28f
Showing
21 changed files
with
1,332 additions
and
560 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from llms.openai_llm import llm | ||
|
||
# __all__ entries must be strings naming the public attributes; listing the
# object itself makes `from <package> import *` raise TypeError.
__all__ = ["llm"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from dotenv import load_dotenv | ||
from langchain_openai import ChatOpenAI | ||
|
||
# Load variables from a local .env file into the process environment before
# the client is built (presumably where the OpenAI credentials live — confirm).
load_dotenv()

# Initialize ChatOpenAI: temperature 0, model "gpt-4o-mini", verbose enabled.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", verbose=True)

# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = ["llm"]
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1 @@ | ||
{ | ||
"car_id ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"make ": { | ||
"data_type": "string", | ||
"example_data": "['Honda', 'Hyundai', 'BMW']", | ||
"foreign_key": 0 | ||
}, | ||
"model ": { | ||
"data_type": "string", | ||
"example_data": "['City', 'i20', 'X1']", | ||
"foreign_key": 0 | ||
}, | ||
"year ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"color ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"engine_type ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"price ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"owner_id ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"description ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
}, | ||
"dealer_id ": { | ||
"data_type": "string", | ||
"example_data": "[]", | ||
"foreign_key": 0 | ||
} | ||
} | ||
{"car_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "make ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "model ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "year ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "color ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "mileage ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "price ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "engine_type ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "transmission ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "fuel_type ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "service_history_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "description ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "created_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "updated_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "showroom_id": {"data_type": "string", "example_data": "[]", "foreign_key": 1}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
{"showroom_id ": {"data_type": "string", "example_data": "[]"}, "showroom_name ": {"data_type": "string", "example_data": "[]"}, "location ": {"data_type": "string", "example_data": "[]"}, "manager_id ": {"data_type": "string", "example_data": "[]"}, "contact_number ": {"data_type": "string", "example_data": "[]"}, "opening_hours ": {"data_type": "string", "example_data": "[]"}, "capacity ": {"data_type": "string", "example_data": "[]"}, "description ": {"data_type": "string", "example_data": "[]"}, "brand_id ": {"data_type": "string", "example_data": "[]"}, "city_id ": {"data_type": "string", "example_data": "[]"}} | ||
{"showroom_id ": {"data_type": "string", "example_data": "['SHRM12589', 'SHRM55896']", "foreign_key": 0}, "showroom_name ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "location ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "manager_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "contact_number ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "opening_hours ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "capacity ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "description ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "brand_id ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}, "created_at ": {"data_type": "string", "example_data": "[]", "foreign_key": 0}} |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from dotenv import load_dotenv | ||
import streamlit as st | ||
from youtube_transcripts import retrieval_chain | ||
from youtube_transcripts.langfuse.callbackHandler import langfuseHandler | ||
|
||
# Page chrome: browser-tab title, camera icon, centered layout.
st.set_page_config(page_title="Ask Youtube", page_icon="📹", layout="centered")
st.title("Q&A")

# Free-text box for the user's question
question = st.text_input("Enter your question here")

# The "Ask" button is only rendered once a question has been typed; the
# short-circuit `and` keeps st.button() from being called for empty input.
if question and st.button("Ask"):
    chain_input = {"input": question}
    # Attach the Langfuse callback handler and a fixed run name to this call.
    run_config = {
        "callbacks": [langfuseHandler],
        "run_name": "langfuse-trace-qa",
    }
    # Invoke the retrieval chain and show the "answer" field of its result.
    response = retrieval_chain.invoke(chain_input, config=run_config)
    st.write(response["answer"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from youtube_transcripts.openai_llm import llm | ||
from youtube_transcripts.retrieve_transcripts.retriever import retrieval_chain | ||
from youtube_transcripts.transcripts_loaders.video_transcripts_loaders import load_youtube_transcripts | ||
from youtube_transcripts.summarizer.export_transcript import export_transcript_text | ||
from youtube_transcripts.langfuse.callbackHandler import langfuseHandler | ||
|
||
# __all__ entries must be strings naming the public attributes; listing the
# objects themselves makes `from youtube_transcripts import *` raise TypeError.
__all__ = [
    "llm",
    "load_youtube_transcripts",
    "export_transcript_text",
    "retrieval_chain",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from dotenv import load_dotenv | ||
import os | ||
from langchain_pinecone import PineconeVectorStore | ||
from langchain_community.embeddings import OllamaEmbeddings | ||
from youtube_transcripts import llm, export_transcript_text | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter | ||
|
||
load_dotenv()

# NOTE: relative path — it resolves against the current working directory, so
# the script has to be launched from the directory this path is relative to.
path = "../summarizer/long_video/transcript.txt"
transcript = export_transcript_text(path)

# Read here but not passed on below; presumably PineconeVectorStore picks the
# key up from the environment itself — TODO confirm.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_index = "youtube-transcripts"

if not transcript:
    # export_transcript_text returns "" when the file is missing or unreadable;
    # there is nothing to embed in that case, so skip the upload entirely.
    print(f"Error: no transcript text available at {path}; nothing to ingest")
else:
    try:
        # Separators are tried in order and the empty string always matches,
        # so it must come last: with "" first the text is split per character
        # and the "\n" separator is never reached.
        textSplitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=0,
            length_function=len,
            keep_separator=True,
            separators=["\n", ""],
        )

        chunks = textSplitter.split_text(transcript)

        embeddings = OllamaEmbeddings(model="llama3:latest", num_gpu=1)

        # Create new Pinecone Vector Store from the transcript chunks
        vector_store = PineconeVectorStore.from_texts(
            index_name=pinecone_index,
            embedding=embeddings,
            texts=chunks,
        )

        print("Pinecone Vector Store Created")

    except Exception as e:
        # Top-level script boundary: report the failure instead of a traceback.
        print(f"Error: {e}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import os | ||
from dotenv import load_dotenv | ||
from langfuse.callback import CallbackHandler | ||
|
||
# Load variables from a local .env file so the os.getenv() lookups below can
# see them.
load_dotenv()

# Langfuse callback handler configured from LANGFUSE_* environment variables.
# os.getenv returns None for any variable that is not set.
langfuseHandler = CallbackHandler(
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    host=os.getenv("LANGFUSE_HOST"),
)

# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = ["langfuseHandler"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from dotenv import load_dotenv | ||
from langchain_openai import ChatOpenAI | ||
|
||
# Load variables from a local .env file into the process environment before
# the client is built (presumably where the OpenAI credentials live — confirm).
load_dotenv()

# Initialize OpenAI LLM: temperature 0, model "gpt-4o-mini", verbose enabled,
# stream_usage turned on.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", verbose=True, stream_usage=True)

# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = ["llm"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from langchain.chains.combine_documents import create_stuff_documents_chain | ||
from youtube_transcripts import llm | ||
from langchain import hub | ||
from langchain_community.embeddings import OllamaEmbeddings | ||
from langchain_pinecone import PineconeVectorStore | ||
from langchain.chains.retrieval import create_retrieval_chain | ||
|
||
|
||
# Name of the Pinecone index that holds the transcript chunks.
# NOTE(review): no API key is passed explicitly below; presumably the Pinecone
# client reads it from the environment — confirm.
pinecone_index = "youtube-transcripts"

# Pull the prompt for qa-chain from LangChain Hub
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Initialise Embeddings
embeddings = OllamaEmbeddings(model="llama3:latest")

# Initialise vector store on the existing index
vector_store = PineconeVectorStore(index_name=pinecone_index, embedding=embeddings)

# Create the chain of type document stuff
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

# Create the Retrieval Chain: the vector store acts as retriever and the
# documents it returns are handed to the combine-documents chain.
retrieval_chain = create_retrieval_chain(
    retriever=vector_store.as_retriever(), combine_docs_chain=combine_docs_chain
)

# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = [
    "retrieval_chain",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
|
||
def export_transcript_text(file_path: str, encoding: "str | None" = None):
    """Return the full text content of the file at *file_path*.

    Failures are reported on stdout rather than raised, and an empty string
    is returned in that case so callers can test the result for truthiness.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        The file content, or ``""`` when the file is missing or unreadable.
    """
    try:
        # Open directly instead of checking os.path.exists first: this avoids
        # the check-then-open race and a second filesystem lookup.
        with open(file_path, "r", encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except (OSError, ValueError) as e:
        # OSError: directories, permission problems, I/O errors.
        # ValueError: undecodable bytes (UnicodeDecodeError) or invalid paths.
        print(f"An error occurred while reading the file: {e}")

    return ""


# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = ["export_transcript_text"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
In this course, Lance Martin from LangChain teaches how to implement Retrieval-Augmented Generation (RAG) to enhance large language models (LLMs) using private data. The curriculum covers the entire RAG pipeline, including data indexing, document retrieval, and answer generation, while exploring techniques like query translation and multi-query approaches. Advanced methods such as hierarchical indexing and corrective RAG are also discussed to improve retrieval accuracy. The course aims to equip learners with practical skills to build effective RAG systems that integrate public and private data. |
Large diffs are not rendered by default.
Oops, something went wrong.
20 changes: 20 additions & 0 deletions
20
youtube_transcripts/summarizer/long_video/transcript_summarizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from youtube_transcripts import llm, load_youtube_transcripts | ||
from langchain.chains.summarize import load_summarize_chain | ||
|
||
# Fetch the transcript of a YouTube video. Despite its name, `summary` holds
# the transcript documents returned by the loader, not a summary.
summary = load_youtube_transcripts()

# The loader may return nothing when the transcript cannot be fetched; stop
# with a clear message instead of failing later while iterating the result.
if not summary:
    raise SystemExit("No transcript could be loaded; nothing to summarize.")

# Save the actual transcript in a text file. The `with` block closes the file
# on exit, so no explicit close() is needed.
with open("transcript.txt", "w") as f:
    f.writelines(doc.page_content for doc in summary)

# Summarize the transcript using the map_reduce summarization chain
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)

response = chain.run(summary)

# Save the response in a text file
with open("summary.txt", "w") as f:
    f.write(response)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
In this course, Lance Martin, a software engineer at LangChain, teaches how to implement Retrieval-Augmented Generation (RAG) from scratch. RAG combines custom data with large language models (LLMs) to enhance their capabilities, especially given that most data is private while LLMs are trained on public data. The course covers the entire RAG pipeline, including indexing external data, retrieval of relevant documents, and generation of answers based on those documents. Key techniques discussed include query translation, routing to appropriate data sources, and query construction for various databases. Advanced methods such as multi-query approaches, hierarchical indexing (Raptor), and corrective RAG are also explored to improve retrieval accuracy and efficiency. The course emphasizes the importance of integrating private data into LLMs and the evolving landscape of RAG technology as LLMs' context windows expand. |
Large diffs are not rendered by default.
Oops, something went wrong.
22 changes: 22 additions & 0 deletions
22
youtube_transcripts/summarizer/short_video/transcript_summarizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from typing import List | ||
from langchain_core.documents import Document | ||
from youtube_transcripts import llm, load_youtube_transcripts | ||
from langchain.chains.summarize import load_summarize_chain | ||
|
||
# Fetch the transcript of a YouTube video. Despite its name, `summary` holds
# the transcript documents returned by the loader, not a summary.
summary = load_youtube_transcripts()

# The loader may return nothing when the transcript cannot be fetched; stop
# with a clear message instead of failing later while iterating the result.
if not summary:
    raise SystemExit("No transcript could be loaded; nothing to summarize.")

# Save the actual transcript in a text file. The `with` block closes the file
# on exit, so no explicit close() is needed.
with open("transcript.txt", "w") as f:
    f.writelines(doc.page_content for doc in summary)

# Summarize the transcript using the "stuff" summarization chain
chain = load_summarize_chain(llm, chain_type="stuff", verbose=True)

response = chain.run(summary)

# Save the response in a text file
with open("summary.txt", "w") as f:
    f.write(response)
25 changes: 25 additions & 0 deletions
25
youtube_transcripts/transcripts_loaders/video_transcripts_loaders.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from langchain_community.document_loaders import YoutubeLoader | ||
from langchain_community.document_loaders.youtube import TranscriptFormat | ||
|
||
def load_youtube_transcripts(video_id: str = "sVcwVQRHIc8"):
    """Load the English transcript of a YouTube video.

    Errors are reported on stdout rather than raised.

    Args:
        video_id: YouTube video id to load. Defaults to the id this module
            originally hard-coded.

    Returns:
        Whatever ``YoutubeLoader.load()`` produced when it is non-empty;
        otherwise an empty list (also on any error), so callers can iterate
        the result without a ``None`` check.
    """
    try:
        # Load the transcript of a YouTube video
        yt_loader = YoutubeLoader(
            video_id=video_id,
            language=["en"],
            translation="en",
            transcript_format=TranscriptFormat.TEXT,
        )

        # Get the transcript
        transcript = yt_loader.load()

        if transcript:
            return transcript

    except ImportError as ie:
        print(f"Import error: {ie}")
    except ValueError as ve:
        print(f"Value error: {ve}")
    except Exception as e:
        # Boundary handler: any other failure is reported, never propagated.
        print(f"An unexpected error occurred: {e}")

    # Failure or empty result: return an empty list instead of an implicit None.
    return []


# __all__ entries must be strings; a non-string entry makes
# `from <module> import *` raise TypeError.
__all__ = ["load_youtube_transcripts"]