Skip to content

Commit a6889ba

Browse files
committed
fix(scraper): add id column for single entry scraping
1 parent 111506e commit a6889ba

File tree

1 file changed

+23
-10
lines changed

1 file changed

+23
-10
lines changed

npiai/tools/web/scraper/app.py

+23-10
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class ParsedResult:
5656

5757

5858
__ID_COLUMN__ = Column(
59-
name="id",
59+
name="[[@item_id]]",
6060
type="text",
6161
description="Unique identifier for each item",
6262
prompt="Fill in the unique identifier for the corresponding <section> that represents the item",
@@ -561,17 +561,20 @@ async def _llm_summarize(
561561
The summarized items as a list of dictionaries.
562562
"""
563563

564-
if scraping_type == "list-like":
565-
prompt = MULTI_COLUMN_SCRAPING_PROMPT
566-
output_columns = [__ID_COLUMN__, *output_columns]
567-
else:
568-
prompt = SINGLE_COLUMN_SCRAPING_PROMPT
564+
prompt = (
565+
MULTI_COLUMN_SCRAPING_PROMPT
566+
if scraping_type == "list-like"
567+
else SINGLE_COLUMN_SCRAPING_PROMPT
568+
)
569+
570+
# add id column to the output columns
571+
output_columns_with_id = [__ID_COLUMN__, *output_columns]
569572

570573
messages = [
571574
ChatCompletionSystemMessageParam(
572575
role="system",
573576
content=prompt.format(
574-
column_defs=json.dumps(output_columns, ensure_ascii=False)
577+
column_defs=json.dumps(output_columns_with_id, ensure_ascii=False)
575578
),
576579
),
577580
ChatCompletionUserMessageParam(
@@ -619,9 +622,19 @@ async def _llm_summarize(
619622

620623
results = []
621624

622-
for row in csv.DictReader(final_response_content.splitlines()):
623-
index = int(row.pop("id"))
624-
results.append(SummaryItem(hash=parsed_result.hashes[index], values=row))
625+
try:
626+
for row in csv.DictReader(final_response_content.splitlines()):
627+
index = int(row.pop(__ID_COLUMN__["name"]))
628+
results.append(
629+
SummaryItem(
630+
hash=parsed_result.hashes[index],
631+
values=row,
632+
)
633+
)
634+
except Exception as e:
635+
await ctx.send_error_message(
636+
f"[{self.name}] Error parsing the response: {str(e)}"
637+
)
625638

626639
return results
627640

0 commit comments

Comments
 (0)