Skip to content

Commit

Permalink
fix: some bug
Browse files Browse the repository at this point in the history
  • Loading branch information
3Alan committed May 7, 2023
1 parent 491982f commit 1e1b8e9
Show file tree
Hide file tree
Showing 14 changed files with 170 additions and 52,063 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Deploy on Vercel and Railway
- 🤖 Ask a question with your docs
- 📝 Summarize docs
- 🖍️ Highlight source
- 📤 Upload docs (.pdf,.md)
- 📤 Upload docs .pdf,.md(best support)
- 💾 Data saved locally
- 💰 Token usage tracker
- 🐳 Dockerize
Expand Down
4 changes: 3 additions & 1 deletion client/src/components/chatWindow/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ import Message from './Message';

interface ChatWindowProps {
fileName: string;
fullFileName: string;
className?: string;
onReplyComplete: (data: any) => void;
onSourceClick: (data: any) => void;
}

const ChatWindow: FC<ChatWindowProps> = ({
fileName,
fullFileName,
className,
onReplyComplete,
onSourceClick
Expand Down Expand Up @@ -68,7 +70,7 @@ const ChatWindow: FC<ChatWindowProps> = ({

if (summarize) {
res = await fetchRequest('/api/summarize', {
index: fileName,
file: fullFileName,
openAiKey
});
} else {
Expand Down
1 change: 1 addition & 0 deletions client/src/pages/Home.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ const Home = () => {
</div>

<ChatWindow
fullFileName={currentFile?.fullName || ''}
fileName={currentFile?.name.split(currentFile.ext)[0] || ''}
className="flex flex-col"
onReplyComplete={handleHighLight}
Expand Down
4 changes: 2 additions & 2 deletions server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ def handle_error(error):

@app.route("/api/summarize", methods=["GET"])
def summarize_index():
index_name = request.args.get("index")
file = request.args.get("file")
open_ai_key = request.args.get("openAiKey")
if open_ai_key:
os.environ["OPENAI_API_KEY"] = open_ai_key

UnstructuredReader = download_loader("UnstructuredReader")
loader = UnstructuredReader()
documents = loader.load_data(file=Path(f"./{staticPath}/file/{index_name}"))
documents = loader.load_data(file=Path(f"./{staticPath}/file/{file}"))
index = GPTListIndex.from_documents(documents)

# predictor cost
Expand Down
4 changes: 2 additions & 2 deletions server/create_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def create_index(filepath, filename) -> int:

if ext == ".pdf":
loader = CJKPDFReader()
documents = loader.load_data(file=filepath)
documents = loader.load_data(filepath=filepath, filename=filename)
elif ext == ".md":
with open(filepath, "r", encoding="utf-8") as f:
file_text = f.read()
Expand All @@ -23,7 +23,7 @@ def create_index(filepath, filename) -> int:
# TODO: 利用 langchain splitter重写 https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/markdown.html
# 直接将markdown分段再分别转化成html,最后将所有html拼接起来并加上chunk_id
loader = CustomReader()
documents = loader.load_data(html=html, file=filepath)
documents = loader.load_data(html=html, filename=name)
elif ext == ".html":
# TODO:
pass
Expand Down
4 changes: 2 additions & 2 deletions server/custom_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

def load_data(self, html, file) -> List[Document]:
def load_data(self, html, filename) -> List[Document]:
soup = BeautifulSoup(html, "html.parser")
current_chunk_text = ""
current_chunk_id = 1
Expand Down Expand Up @@ -109,7 +109,7 @@ def load_data(self, html, file) -> List[Document]:
current_chunk_id += 1

# 保存修改后的HTML文件
with open(file, "w", encoding="utf-8") as f:
with open(f"{staticPath}/file/{filename}.html", "w", encoding="utf-8") as f:
f.write(str(soup))

return document_list
9 changes: 7 additions & 2 deletions server/pdf_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Read PDF files."""

import shutil
from pathlib import Path
from typing import Any, List

Expand All @@ -9,6 +10,8 @@

# https://github.com/emptycrown/llama-hub/blob/main/loader_hub/file/cjk_pdf/base.py

staticPath = "static"


class CJKPDFReader(BaseReader):
"""CJK PDF reader.
Expand All @@ -20,7 +23,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

def load_data(self, file: Path) -> List[Document]:
def load_data(self, filepath: Path, filename) -> List[Document]:
"""Parse file."""

# Import pdfminer
Expand All @@ -42,7 +45,7 @@ def load_data(self, file: Path) -> List[Document]:
# Create a PDF interpreter
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Open the PDF file
fp = open(file, "rb")
fp = open(filepath, "rb")
# Create a list to store the text of each page
document_list = []
# Extract text from each page
Expand All @@ -66,4 +69,6 @@ def load_data(self, file: Path) -> List[Document]:
# Close the device
device.close()

shutil.copy2(filepath, f"{staticPath}/file/{filename}")

return document_list
149 changes: 149 additions & 0 deletions server/static/file/AA-README.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
<h1 data-chunk_id="chunk-1">DocsMind</h1>
<p data-chunk_id="chunk-1">DocsMind is an open-source project that allows you to chat with your docs.</p>
<p data-chunk_id="chunk-1"><img alt="Stack" src="https://skillicons.dev/icons?i=vite,react,ts,tailwind,flask"/></p>
<h2 data-chunk_id="chunk-2">Demo</h2>
<p data-chunk_id="chunk-2"><a href="https://docs-mind.alanwang.site/">Demo Site</a></p>
<p data-chunk_id="chunk-2">Deploy on Vercel and Railway</p>
<blockquote data-chunk_id="chunk-2">
<p><strong>Warning</strong></p>
<p>Because Railway's free plan provides only 500 hours per month, the demo becomes unavailable around the 21st day of each month. Please clone the project and run it locally during that time.</p>
</blockquote>
<h2 data-chunk_id="chunk-3">Features</h2>
<ul data-chunk_id="chunk-3">
<li>🤖 Ask a question with your docs</li>
<li>📝 Summarize docs</li>
<li>🖍️ Highlight source</li>
<li>📤 Upload docs .pdf,.md(best support)</li>
<li>💾 Data saved locally</li>
<li>💰 Token usage tracker</li>
<li>🐳 Dockerize</li>
</ul>
<h2 data-chunk_id="chunk-4">Future Development</h2>
<ul data-chunk_id="chunk-4">
<li>[ ] Chat mode</li>
<li>[ ] Dark mode</li>
<li>[ ] Improve the UI (Skeleton,Loading)</li>
<li>[ ] / command (/fetch /summarize)</li>
<li>[ ] Reduce the size of the server image.</li>
<li>[ ] Support for more docs formats: txt...</li>
<li>[ ] Download docs from the internet</li>
<li>[ ] Markdown-formatted message</li>
<li>[ ] i18n</li>
<li>[ ] Desktop application</li>
</ul>
<p data-chunk_id="chunk-4">If you find this project helpful, please consider giving it a star 🌟</p>
<h2 data-chunk_id="chunk-5">Environment Variables</h2>
<table data-chunk_id="chunk-5">
<thead>
<tr>
<th>Name</th>
<th>Description</th>
<th>Optional</th>
</tr>
</thead>
<tbody>
<tr>
<td>OPENAI_API_KEY</td>
<td>sk-xxx</td>
<td></td>
</tr>
<tr>
<td>OPENAI_PROXY</td>
<td>will replace https://api.openai.com/v1</td>
<td></td>
</tr>
<tr>
<td>VITE_SERVICES_URL</td>
<td>backend url for frontend code</td>
<td></td>
</tr>
<tr>
<td>VITE_DISABLED_UPLOAD</td>
<td>DISABLED_UPLOAD</td>
<td></td>
</tr>
</tbody>
</table>
<h2 data-chunk_id="chunk-6">Q&amp;A</h2>
<h3 data-chunk_id="chunk-7">How to run locally?</h3>
<blockquote data-chunk_id="chunk-7">
<p><strong>Warning</strong></p>
<p>Please check whether you can access OpenAI in your region; refer to the <a href="https://github.com/3Alan/DocsMind/issues/3#issuecomment-1511470063">issue</a> for more information.</p>
</blockquote>
<ol data-chunk_id="chunk-7">
<li>Create .env(Optional)</li>
</ol>
<p data-chunk_id="chunk-7">Create a <code>.env</code> file and copy the contents of <code>.env.example</code> to modify it.</p>
<ol data-chunk_id="chunk-7" start="2">
<li>Run App</li>
</ol>
<div class="highlight" data-chunk_id="chunk-7"><pre><span></span><code>docker-compose<span class="w"> </span>up<span class="w"> </span>-d
</code></pre></div>
<p data-chunk_id="chunk-7">Please add <code>--build</code> to rebuild the image after each code update.</p>
<div class="highlight" data-chunk_id="chunk-7"><pre><span></span><code>docker-compose<span class="w"> </span>up<span class="w"> </span>-d<span class="w"> </span>--build
</code></pre></div>
<p data-chunk_id="chunk-7">Now you can access the app at <code>http://localhost:8081</code>.</p>
<h3 data-chunk_id="chunk-8">Local Development</h3>
<details data-chunk_id="chunk-8">
<summary>Detail</summary>

#### Create .env(Optional)

Create a `.env` file and copy the contents of `.env.example` to modify it.

#### Run Frontend

1. Install dependencies

<div class="highlight"><pre><span></span><code>yarn
</code></pre></div>

2. Run app

<div class="highlight"><pre><span></span><code>yarn dev
</code></pre></div>

#### Run Backend

You need a Python environment.

1. Create virtual environment

<div class="highlight"><pre><span></span><code>cd server
python -m venv .venv
</code></pre></div>

2. Activate virtual environment

Windows

<div class="highlight"><pre><span></span><code>.venv\Scripts\activate
</code></pre></div>

macOS

<div class="highlight"><pre><span></span><code>. .venv/bin/activate
</code></pre></div>

3. Install dependencies

<div class="highlight"><pre><span></span><code>pip install -r requirements.txt
</code></pre></div>

4. Run Services

<div class="highlight"><pre><span></span><code>flask run --reload --port=8080
</code></pre></div>

#### Install pdf2htmlEX for PDF conversion

<div class="highlight"><pre><span></span><code>docker<span class="w"> </span>pull<span class="w"> </span>pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-alpine-3.12.0-x86_64
</code></pre></div>

Set an alias

<div class="highlight"><pre><span></span><code><span class="nb">alias</span><span class="w"> </span><span class="nv">pdf2htmlEX</span><span class="o">=</span><span class="s1">'docker run -ti --rm -v "`pwd`":/pdf -w /pdf pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-alpine-3.12.0-x86_64'</span>
</code></pre></div>
</details>
<h2 data-chunk_id="chunk-9">Buy me a coffee</h2>
<p data-chunk_id="chunk-9"><img height="300" src="https://raw.githubusercontent.com/3Alan/images/master/img/%E5%BE%AE%E4%BF%A1%E6%94%AF%E4%BB%98%E5%AE%9D%E4%BA%8C%E5%90%88%E4%B8%80%E6%94%B6%E6%AC%BE%E7%A0%81.jpg"/></p>
35 changes: 0 additions & 35 deletions server/static/file/no-match-heading-test.html

This file was deleted.

1 change: 1 addition & 0 deletions server/static/index/AA-README.json

Large diffs are not rendered by default.

Loading

0 comments on commit 1e1b8e9

Please sign in to comment.