-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
46 lines (41 loc) · 1011 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from idna import check_nfc
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
# Document-type key -> directory loader class (the class itself, not an
# instance). Only "pdf" uses the PDF-specific loader; every other type
# falls back to the generic DirectoryLoader.
loader_mapping = dict(
    pdf=PyPDFDirectoryLoader,
    markdown=DirectoryLoader,
    text=DirectoryLoader,
    html=DirectoryLoader,
)
# Document-type key -> file-name suffix (leading dot included).
# The keys are the same four used by loader_mapping.
file_extensions = dict(
    pdf=".pdf",
    markdown=".md",
    text=".txt",
    html=".html",
)
# Document-type key -> pre-built RecursiveCharacterTextSplitter.
#
# "text", "markdown" and "html" are configured identically (chunk_size=1000,
# chunk_overlap=100, sizes measured with len, add_start_index=True), but each
# key still receives its own splitter instance. "pdf" uses smaller chunks
# (800 / 80), passes is_separator_regex=False explicitly and does not set
# add_start_index.
splitter_mapping = {
    **{
        doc_kind: RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
            add_start_index=True,
        )
        for doc_kind in ("text", "markdown", "html")
    },
    "pdf": RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    ),
}