-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtranscript_processing_util.py
144 lines (115 loc) · 6.85 KB
/
transcript_processing_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
from typing import Union, List, Optional, Dict
from youtube_chapters_extractor import load_transcript
from fpdf import FPDF
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
import tiktoken
from langchain.vectorstores import Chroma
def convert_string_to_pdf(input_str: str, filename: str = "output.pdf", font_path: str = "ARIALUNI.TTF") -> None:
"""
Converts a string to a PDF file.
:param input_str: The string to be converted to PDF.
:param filename: The name of the output PDF file. Defaults to "output.pdf".
:param font_path: The path to the font file supporting Unicode characters. Defaults to "ARIALUNI.TTF".
"""
pdf = FPDF()
pdf.add_page()
pdf.add_font("ArialUnicode", style="", fname=font_path, uni=True)
pdf.set_font("ArialUnicode", size=12)
pdf.multi_cell(0, 10, input_str)
pdf.output(filename)
def count_tokens_in_string(string: str, encoding_name: str) -> int:
"""
Counts the number of tokens in a given string.
:param string: The input string to count tokens.
:param encoding_name: The encoding name for the token counting model.
:return: The number of tokens in the input string.
"""
encoding = tiktoken.encoding_for_model(encoding_name)
return len(encoding.encode(string))
def calculate_openai_api_cost(input_text: str, output_text: str, model_name: str) -> float:
"""
Calculates the cost of OpenAI API calls based on the input and output text.
:param input_text: The input text string.
:param output_text: The output text string.
:param model_name: The name of the model used for the API call.
:return: The cost of the API call in USD.
"""
in_costs = {'gpt-3.5-turbo-16k': 0.003, 'gpt-3.5-turbo': 0.0015}
out_costs = {'gpt-3.5-turbo-16k': 0.004, 'gpt-3.5-turbo': 0.002}
input_tokens = count_tokens_in_string(input_text, model_name)
output_tokens = count_tokens_in_string(output_text, model_name)
input_cost = input_tokens * in_costs[model_name] / 1000.
output_cost = output_tokens * out_costs[model_name] / 1000.
return input_cost + output_cost
def chunk_transcript(document: Union[str, bytes]) -> List[Document]:
"""
Splits the document into chunks.
:param document: The input document string or bytes.
:return: A list of Document objects containing the chunks of the input document.
"""
if document is None or not isinstance(document, (str, bytes)):
return []
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", " "], chunk_size=2000, chunk_overlap=200)
segmented_documents = text_splitter.split_text(document)
return [Document(page_content=t) for t in segmented_documents]
def display_embedding_cost(texts: List[Document]) -> None:
"""
Displays the embedding cost of the texts in USD.
:param texts: A list of Document objects containing the texts.
"""
enc = tiktoken.encoding_for_model('text-embedding-ada-002')
total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
print(f'Total Tokens: {total_tokens}')
print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
def initialize_vector_store(video_url: str) -> Optional[Chroma]:
"""
Initializes the vector store from the transcript of the video URL.
:param video_url: The URL of the video to load the transcript.
:return: The initialized Chroma object containing the vector store or None if unsuccessful.
"""
result = load_transcript(video_url)
if result is None or not isinstance(result, (str, bytes)):
return None
docs = chunk_transcript(result)
if not docs:
return None
return Chroma.from_documents(docs, OpenAIEmbeddings())
def generate_video_summary(model_name: str, chapter: str, vector_store: Chroma, summary_type: str) -> Dict[str, str]:
"""
Generates a summary of the video transcript.
:param model_name: The name of the model used for summarization.
:param chapter: The chapter of the video to summarize.
:param vector_store: The Chroma object containing the vector store.
:param summary_type: The type of summary, either "bullet points" or "paragraph".
:return: A dictionary containing the chapter and the generated summary.
"""
if vector_store is None:
return {}
if summary_type not in ["bullet points", "paragraph"]:
raise ValueError("Invalid summary_type. Accepted values are 'bullet points' or 'paragraph'.")
llm = ChatOpenAI(temperature=0, model=model_name)
system_template = f"""
You are tasked with generating a concise, clear, and relevant summary based on a provided discussion transcript. Please pay meticulous attention to the following guidelines to ensure the highest quality summary:
- **Word Limit:** Your summary must strictly adhere to a maximum of 100 words. Any excess will render the summary invalid. Be succinct and choose words that convey maximum meaning.
- **Format:** Employ {summary_type} format, focusing on clarity and succinctness to relay explicit details, distinctive insights, and specific points from the transcript.
- **Relevance:** Only include pertinent and topic-specific information and insights. Eliminate any generalized, unrelated, or superfluous content.
- **References & Names:** Should there be any references, list them briefly by title and author. Use participants’ known names if provided.
**Mandatory Compliance:**
- Confine your summary to information explicitly stated in the transcript, excluding any form of assumptions, inferences, or embellishments.
- Do not allude to the presence or absence of references or speculate on further exploration unless specific resources are explicitly cited in the transcript.
```{{context}}```
**Urgent Reminder:** Precision, focus, and adherence to the 100-word limit are imperative. Every word should significantly contribute to the understanding of the topic. Non-compliance with the guidelines and word limit will lead to summary rejection. Please review your summary carefully to ensure strict adherence to the word limit and guidelines.
"""
messages = [SystemMessagePromptTemplate.from_template(system_template), HumanMessagePromptTemplate.from_template("{question}")]
chat_prompt = ChatPromptTemplate.from_messages(messages)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever(), chain_type_kwargs={'prompt': chat_prompt})
expanded_topic = qa.run(chapter)
return {'chapter': chapter, 'summary': expanded_topic}
if __name__ == "__main__":
None