-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
119 lines (95 loc) · 4.84 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
import requests
import time
from unstructured.partition.auto import partition
from about import intro
def extract_text(uploaded_file, **kwargs):
    """
    Extract text from an uploaded file locally, inside the Streamlit server.

    The file is handed to ``partition`` and the string form of every returned
    element is joined with a blank line between elements.

    :param uploaded_file: The file uploaded by the user
    :param kwargs: Extraction settings, forwarded unchanged to ``partition``
    :return: The extracted text as one string, or None if extraction failed
             (the error is shown on the page in that case)
    """
    try:
        parsed = partition(file=uploaded_file, **kwargs)
        # One paragraph per element: blank-line separated
        return "\n\n".join(map(str, parsed))
    except Exception as err:
        # Report the problem in the UI instead of crashing the app
        st.error(f"Failed to extract text: {err}")
        return None
def send_file_to_api(uploaded_file, api_key, **kwargs):
    """
    Send a file to the unstructured.io API, which performs the extraction remotely.

    The file and the extraction settings are posted together as one multipart form.

    :param uploaded_file: The file uploaded by the user
    :param api_key: The API key entered by the user, sent in the
                    ``unstructured-api-key`` header
    :param kwargs: Extraction settings. Required keys: ``pdf_infer_table_structure``,
                   ``xml_keep_tags``, ``include_page_breaks``, ``encoding``,
                   ``strategy``, ``output_format`` and ``ocr_languages``.
                   An optional ``timeout`` key overrides the request timeout.
    :return: The decoded JSON body; the raw response text if the body is not
             JSON; or None if the request failed (the error is shown on the page)
    :raises KeyError: If one of the required settings is missing from kwargs
    """
    url = 'https://api.unstructured.io/general/v0.0.33/general'
    headers = {'accept': 'application/json', 'unstructured-api-key': api_key}

    # Without a timeout, requests waits forever on a stalled server.
    # (connect, read) in seconds; the read limit is generous because remote
    # extraction of large documents can be slow.
    timeout = kwargs.get('timeout', (10, 600))

    # Prepare the files parameter for the API request.
    # Plain form fields must be given as (None, value) tuples so that requests
    # sends them without a filename; only 'files' carries the actual upload.
    files = {
        'pdf_infer_table_structure': (None, kwargs['pdf_infer_table_structure']),
        'xml_keep_tags': (None, kwargs['xml_keep_tags']),
        'include_page_breaks': (None, kwargs['include_page_breaks']),
        'encoding': (None, kwargs['encoding']),
        'strategy': (None, kwargs['strategy']),
        'output_format': (None, kwargs['output_format']),
        'files': uploaded_file,
        'gz_uncompressed_content_type': (None, ''),
        'ocr_languages': (None, kwargs['ocr_languages']),
        'coordinates': (None, ''),
        'hi_res_model_name': (None, ''),
    }

    try:
        response = requests.post(url, headers=headers, files=files, timeout=timeout)
        response.raise_for_status()  # raise exception if request was unsuccessful
        try:
            return response.json()
        except ValueError:
            # The body is not JSON (presumably the case when output_format is
            # 'text/csv' - confirm against the API); hand back the raw text
            # instead of reporting a successful request as a failure.
            return response.text
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to send file to API: {e}")  # Display error message on Streamlit
        return None
def boolean_to_string(value):
    """
    Convert a flag into the string form used for the extraction settings.

    :param value: Any value; only its truthiness is used
    :return: 'true' when value is truthy, otherwise an empty string
    """
    if value:
        return 'true'
    return ''
def main():
    """
    The main function for the Streamlit app.

    Renders the intro, the extraction options and the file uploader, then
    extracts the uploaded document (remotely through the unstructured.io API
    when an API key is set, locally otherwise) and shows a preview of the result.

    :return: None.
    """
    intro()

    # Open the options expander whenever the API checkbox is ticked,
    # so the API key field inside it is visible.
    if st.session_state.get("enable_api"):
        st.session_state.advance = True

    active_online_api = st.checkbox("Use unstructured.io API?", help="Please note: The file will be uploaded and extracted through the unstructured.io server.", key="enable_api")
    unstructured_api_input = None

    with st.expander("Advance Options", expanded=st.session_state.get("advance", False)):
        if active_online_api:
            # NOTE(review): the deployment's own key from st.secrets is pre-filled
            # in a plain text field, so every visitor can read it - confirm this
            # sharing is intended.
            unstructured_api_input = st.text_input('Input your API key:', st.secrets["UNSTRUCTURED_API_KEY"], help="You can use mine first, you can also request your own API key here: https://unstructured.io/api-key/#get-api-key")

        settings = {
            'strategy': st.radio("Choose the strategy", ('auto', 'hi_res', 'fast', 'ocr_only'), horizontal=True, index=1),
            'pdf_infer_table_structure': boolean_to_string(st.checkbox('pdf_infer_table_structure')),
            'xml_keep_tags': boolean_to_string(st.checkbox('xml_keep_tags')),
            'include_page_breaks': boolean_to_string(st.checkbox('include_page_breaks')),
            'encoding': st.text_input('encoding', 'utf_8'),
            # Tesseract language codes are three letters ('eng'); 'en' is not one
            'ocr_languages': st.text_input('ocr_languages', 'eng'),
            'output_format': st.radio("Choose the output format", ('text/json', 'text/csv'), horizontal=True, index=0),
        }
        st.info("For more information visit: https://unstructured-io.github.io/unstructured/api.html")

    uploaded_file = st.file_uploader("Upload your document. Accept (HTML, PDF, CSV, PNG, PPTX, and more)")

    if uploaded_file is not None:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments
        start_time = time.perf_counter()
        with st.spinner("Extracting document. This may take a while⏳"):
            try:
                # An empty or missing API key falls back to local extraction
                if unstructured_api_input:
                    texts = send_file_to_api(uploaded_file=uploaded_file, api_key=unstructured_api_input, **settings)
                else:
                    texts = extract_text(uploaded_file=uploaded_file, **settings)

                # Both helpers return None after reporting a failure themselves;
                # skip the preview then instead of rendering an empty result.
                if texts is not None:
                    st.write('### Preview:')
                    st.text_area('Preview textbox', texts, height=500)
                    st.divider()
                    with st.expander("See Preview Text"):
                        st.write(texts)
            except Exception as e:
                st.error(f"Failed to process document: {e}")

        execution_time = time.perf_counter() - start_time  # Calculate the execution time
        st.write(f"Execution time: {execution_time} seconds")  # Display the execution time
if __name__ == '__main__':
    # Start the app only when this file is executed directly, not when imported
    main()