Skip to content

Commit

Permalink
accelerate PDF extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
tuhahaha committed Dec 12, 2024
1 parent fd17df1 commit 5aa0070
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions qwen_agent/tools/simple_doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ def parse_pdf(pdf_path: str, extract_image: bool = False) -> List[dict]:
from pdfminer.layout import LTImage, LTRect, LTTextContainer

doc = []
+ import pdfplumber
+ pdf = pdfplumber.open(pdf_path)
for i, page_layout in enumerate(extract_pages(pdf_path)):
page = {'page_num': page_layout.pageid, 'content': []}

Expand All @@ -212,7 +214,7 @@ def parse_pdf(pdf_path: str, extract_image: bool = False) -> List[dict]:
for element in elements:
if isinstance(element, LTRect):
if not tables:
- tables = extract_tables(pdf_path, i)
+ tables = extract_tables(pdf, i)
if table_num < len(tables):
table_string = table_converter(tables[table_num])
table_num += 1
Expand Down Expand Up @@ -298,9 +300,7 @@ def get_font(element):
return []


- def extract_tables(pdf_path, page_num):
- import pdfplumber
- pdf = pdfplumber.open(pdf_path)
+ def extract_tables(pdf, page_num):
table_page = pdf.pages[page_num]
tables = table_page.extract_tables()
return tables
Expand Down

0 comments on commit 5aa0070

Please sign in to comment.