-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.py
159 lines (129 loc) · 6.46 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import streamlit as st
import os
import fitz # PyMuPDF
from pptx import Presentation # For handling PPTX files
import docx # For handling DOCX files
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
# Passkey that unlocks the app.
# Prefer supplying it through the SPARKDOCS_PASSKEY environment variable so the
# secret does not have to live in source control. The literal is kept only as a
# backward-compatible fallback (it is also used when the variable is set but
# empty, so an empty passkey can never authenticate).
# NOTE(review): the fallback value is already exposed in the repository history
# and should be rotated.
correct_passkey = os.environ.get("SPARKDOCS_PASSKEY") or "godsparky1237"
def read_pdf(file):
    """Extract the plain text of every page of a PDF.

    Args:
        file: Binary file-like object; its ``read()`` result is handed to
            PyMuPDF as the raw PDF stream (the app passes the uploaded file).

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # Using the document as a context manager guarantees it is closed (and its
    # buffers released) even if text extraction fails part-way through.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def read_pptx(file):
    """Gather the text of every text-bearing shape in a PPTX presentation.

    Args:
        file: Object handed unchanged to ``Presentation`` (the app passes the
            uploaded file).

    Returns:
        The ``text`` of each shape that exposes one, visited slide by slide,
        each followed by a newline.
    """
    deck = Presentation(file)
    pieces = []
    for slide in deck.slides:
        # Shapes without a ``text`` attribute are skipped.
        pieces.extend(
            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
        )
    return "".join(pieces)
def read_docx(file):
    """Return the text of all paragraphs of a DOCX document.

    Args:
        file: Object handed unchanged to ``docx.Document`` (the app passes the
            uploaded file).

    Returns:
        Every paragraph's text in document order, each terminated by a newline.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)
def split_text(text, chunk_size=5000000, chunk_overlap=100):
    """Split *text* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The full document text.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for *text*.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)
def ask_question(question, text_chunks, google_api_key):
    """Answer *question* from the document chunks using a Gemini chat model.

    Chunks are pre-filtered with a simple keyword match: a chunk is kept when
    it contains (case-insensitively, as a substring) at least one word of the
    question. If nothing matches, every chunk is used. The model is queried
    once per selected chunk.

    Args:
        question: The user's question.
        text_chunks: Document text chunks to use as context.
        google_api_key: API key passed to ``ChatGoogleGenerativeAI``.

    Returns:
        The model's answers for the selected chunks, separated by a blank
        line; an empty string when there are no chunks.
    """
    model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key=google_api_key)

    # Drop punctuation so that e.g. "pandas?" still matches "pandas".
    keywords = re.sub(r'[^\w\s]', '', question.lower()).split()

    relevant_chunks = []
    for chunk in text_chunks:
        chunk_lower = chunk.lower()  # lower-case once per chunk, not per keyword
        if any(keyword in chunk_lower for keyword in keywords):
            relevant_chunks.append(chunk)

    # Fall back to the whole document when the keyword filter finds nothing
    # (this is also the path taken for an empty question).
    if not relevant_chunks:
        relevant_chunks = text_chunks

    # Every instruction line ends with "\n": adjacent string literals are
    # concatenated as-is, so a missing newline fuses two instructions together.
    prompt_template = (
        "system: You are an outstanding AI tutor known for your expertise and clarity. Your role is to assist students by providing clear, concise, and comprehensive answers to their assignment questions using the provided content from the uploaded document. Use your extensive knowledge only if necessary.\n"
        "user: Context: {context}\n\n"
        "Question: {question}\n\n"
        "assistant: \n"
        "Your answers should be:\n"
        "- Well-structured\n"
        "- Very detailed\n"
        "- Comprehensive for the topic\n"
        "- contain short and easy code snippets if needed in context to the topic of the document\n"
        "- Include relevant examples\n"
        "- Easy to understand for everyone\n"
        "- Free of any repetitive content and unnecessary length\n"
        "- Appropriate for an undergrad engineering AI and Data Science student\n"
        "Begin the answer by writing the question and then the answer\n"
        "Provide a clear, concise and comprehensive answer based on the PDF context"
    )

    answers = []
    for context in relevant_chunks:
        prompt = prompt_template.format(context=context, question=question)
        message = HumanMessage(content=prompt)
        # Stream the reply and stitch the pieces back into one answer.
        answers.append("".join(piece.content for piece in model.stream([message])))

    # Separate per-chunk answers so they do not run into each other.
    return "\n\n".join(answers)
# Use the wide layout and set a custom browser-tab title and icon.
st.set_page_config(layout="wide", page_title="SparkDocs", page_icon=":robot_face:")

# Inject the custom stylesheet. The encoding is pinned so that non-ASCII
# characters in the CSS are decoded the same way on every platform.
with open("style.css", encoding="utf-8") as css:
    st.markdown(f'<style>{css.read()}</style>', unsafe_allow_html=True)
# --- Authentication gate -----------------------------------------------------
# The flag lives in session state so it survives Streamlit's script reruns.
if 'authenticated' not in st.session_state:
    st.session_state.authenticated = False

if not st.session_state.authenticated:
    st.title("Welcome to SparkDocs")
    passkey = st.text_input("🚀 Ready to dive into document analysis? Authenticate with the passkey (hint: it's definitely not 'password123' 😜)", type="password")
    if st.button("Submit"):
        if passkey == correct_passkey:
            st.session_state.authenticated = True
            st.success("Authentication successful! Welcome aboard! 🎉")
            st.rerun()  # Rerun so the main content is rendered
        else:
            st.error("Authentication failed. Please enter a valid passkey. 🛑")
    # NOTE(review): the original indentation was lost; the logo is assumed to
    # belong to the login page as a whole - confirm.
    st.image("cool logo.png", width=600, caption="Developed by Sparky Labs")
else:
    # --- Main application ----------------------------------------------------
    api_key = st.text_input("Enter your Google API key", type="password")
    if not api_key:
        st.warning("Please enter your API key to continue.")
    else:
        st.title("SparkDocs")
        st.subheader("AI-Powered Document Q&A System")
        st.write("Upload a PDF, PPTX, or DOCX file, ask questions, and get answers in context with the document content.")
        uploaded_file = st.file_uploader("Upload Document", type=["pdf", "pptx", "docx"])
        if uploaded_file is not None:
            # Pick the reader from the MIME type reported for the upload.
            file_type = uploaded_file.type
            if file_type == "application/pdf":
                doc_text = read_pdf(uploaded_file)
            elif file_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
                doc_text = read_pptx(uploaded_file)
            elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                doc_text = read_docx(uploaded_file)
            else:
                st.error("Unsupported file type!")
                doc_text = None  # distinguishes "unsupported" from "no text found"

            if doc_text is None:
                pass  # the error above has already been shown
            elif not doc_text.strip():
                # Give feedback instead of silently showing nothing when the
                # document yields no text.
                st.warning("No text could be extracted from this document.")
            else:
                text_chunks = split_text(doc_text)
                question = st.text_input("Enter your question:")
                if st.button("Ask"):
                    if not question.strip():
                        # Do not spend an API call on an empty question.
                        st.warning("Please enter a question first.")
                    else:
                        with st.spinner("Processing..."):
                            answer = ask_question(question, text_chunks, api_key)
                        st.markdown(answer)