from .types import (
    Column,
    SourceItem,
-    SummaryItem,
-    SummaryChunk,
+    Row,
+    RowBatch,
)

__INDEX_COLUMN__ = Column(
@@ -44,10 +44,12 @@ class BaseScraper(FunctionTool, ABC):
    infer_prompt: str = DEFAULT_COLUMN_INFERENCE_PROMPT

    @abstractmethod
-    async def init_data(self, ctx: Context): ...
+    async def init_data(self, ctx: Context):
+        ...

    @abstractmethod
-    async def next_items(self, ctx: Context, count: int) -> List[SourceItem] | None: ...
+    async def next_items(self, ctx: Context, count: int) -> List[SourceItem] | None:
+        ...

    async def summarize_stream(
        self,
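Side note on this hunk: the two abstract methods above are the whole contract a concrete scraper has to fulfil. The sketch below is illustrative only and not part of the change; the ListScraper name, the fixed page list, and the "content" field of SourceItem are assumptions (only the "hash" key is visible later in this diff), and the usual imports from this module are assumed to be in scope.

class ListScraper(BaseScraper):
    """Illustrative only: serves SourceItem batches from a fixed in-memory list."""

    _pages: List[str] = ["<html>page one</html>", "<html>page two</html>"]
    _cursor: int = 0

    async def init_data(self, ctx: Context):
        # a real scraper would open the page or fetch its source here
        self._cursor = 0

    async def next_items(self, ctx: Context, count: int) -> List[SourceItem] | None:
        batch = self._pages[self._cursor : self._cursor + count]
        if not batch:
            # returning None tells summarize_stream the source is exhausted
            return None
        self._cursor += len(batch)
        # "content" is an assumed field name; only the "hash" key appears in this diff
        return [SourceItem(hash=str(hash(page)), content=page) for page in batch]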
@@ -56,101 +58,110 @@ async def summarize_stream(
        batch_size: int = 1,
        limit: int = -1,
        concurrency: int = 1,
-    ) -> AsyncGenerator[SummaryChunk, None]:
+        row_offset: int = 0,
+    ) -> AsyncGenerator[RowBatch, None]:
        """
        Summarize the content of a webpage into a csv table represented as a stream of item objects.

        Args:
+            row_offset: Row offset of the first batch in the entire task.
            ctx: NPi context.
            output_columns: The columns of the output table. If not provided, use the `infer_columns` function to infer the columns.
-            batch_size: The number of items to summarize in each batch. Default is 1.
-            limit: The maximum number of items to summarize. If -1, all items are summarized.
+            batch_size: The number of rows to summarize in each batch. Default is 1.
+            limit: The maximum number of rows to summarize. If -1, all rows are summarized.
            concurrency: The number of concurrent tasks to run. Default is 1.

        Returns:
-            A stream of items. Each item is a dictionary with keys corresponding to the column names and values corresponding to the column values.
+            A stream of rows. Each row is a dictionary with keys corresponding to the column names and values corresponding to the column values.
        """
        if limit == 0:
            return

        await self.init_data(ctx)

-        # total items summarized
-        count = 0
-        # remaining items to summarize, excluding the items being summarized
-        remaining = limit
-        # batch index
-        batch_index = 0
+        total_row_summarized = 0
+        # remaining rows to summarize, excluding the rows being summarized
+        remaining_rows = limit
+        batch_no = 0

        lock = asyncio.Lock()

-        no_count_index = 0
+        row_number_count = 0

-        async def run_batch(results_queue: asyncio.Queue[SummaryChunk]):
-            nonlocal count, no_count_index, remaining, batch_index
+        # TODO
+        # 1. one task to retrieve html items
+        # 2. one task to summarize html items

-            if limit != -1 and remaining <= 0:
+        async def run_batch(results_queue: asyncio.Queue[RowBatch]):
+            nonlocal total_row_summarized, row_number_count, remaining_rows, batch_no
+
+            if limit != -1 and remaining_rows <= 0:
                return

            async with lock:
-                current_index = batch_index
-                batch_index += 1
+                current_batch = batch_no
+                batch_no += 1

-                # calculate the number of items to summarize in the current batch
+                # calculate the number of rows to summarize in the current batch
                requested_count = (
-                    min(batch_size, remaining) if limit != -1 else batch_size
+                    min(batch_size, remaining_rows) if limit != -1 else batch_size
                )
-                # reduce the remaining count by the number of items in the current batch
+                # reduce the remaining count by the number of rows in the current batch
                # so that the other tasks will not exceed the limit
-                remaining -= requested_count
+                remaining_rows -= requested_count

            data = await self.next_items(ctx=ctx, count=requested_count)

            if not data:
-                await ctx.send_debug_message(f"[{self.name}] No more items found")
+                await ctx.send_debug_message(f"[{self.name}] No more rows found")
                return

            # await ctx.send_debug_message(
            #     f"[{self.name}] Parsed markdown: {parsed_result.markdown}"
            # )

            async with lock:
-                no_index = no_count_index
-                no_count_index += len(data)
+                current_batch_row_number_offset = row_number_count
+                row_number_count += len(data)

-            items = await self._summarize_llm_call(
+            rows = await self._summarize_llm_call(
                ctx=ctx,
                items=data,
                output_columns=output_columns,
            )

-            await ctx.send_debug_message(f"[{self.name}] Summarized {len(items)} items")
+            await ctx.send_debug_message(f"[{self.name}] Summarized {len(rows)} rows")
            #
-            # if not items:
-            #     await ctx.send_debug_message(f"[{self.name}] No items summarized")
+            # if not rows:
+            #     await ctx.send_debug_message(f"[{self.name}] No rows summarized")
            #     return

            async with lock:
-                items_slice = items[:requested_count] if limit != -1 else items
+                items_slice = rows[:requested_count] if limit != -1 else rows
                summarized_count = len(items_slice)
-                count += summarized_count
-                # recalculate the remaining count in case summary returned fewer items than requested
+                total_row_summarized += summarized_count
+                # recalculate the remaining count in case summary returned fewer rows than requested
                if summarized_count < requested_count:
-                    remaining += requested_count - summarized_count
+                    remaining_rows += requested_count - summarized_count
+
+                count = 1
+                for row in items_slice:
+                    row["row_no"] = current_batch_row_number_offset + row_offset + count
+                    count += 1

            await results_queue.put(
-                SummaryChunk(
-                    index=no_index,
-                    batch_id=current_index,
+                RowBatch(
+                    offset=current_batch_row_number_offset + row_offset,
+                    batch_id=current_batch,
                    items=items_slice,
                )
            )

            await ctx.send_debug_message(
-                f"[{self.name}] Summarized {count} items in total"
+                f"[{self.name}] Summarized {total_row_summarized} rows in total"
            )

-            if limit == -1 or remaining > 0:
+            if limit == -1 or remaining_rows > 0:
                await run_batch(results_queue)

        async for chunk in concurrent_task_runner(run_batch, concurrency):
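Reviewer note on the numbering introduced in this hunk: current_batch_row_number_offset counts the source items handed out before the current batch, row_offset shifts the whole task, and row_no is 1-based on top of both, so a batch's offset plus a row's position inside items reproduces its row_no. The consumption sketch below is hedged and not part of the diff; scraper, ctx, and columns are placeholders, the module's own types are assumed importable, and dict-style access to RowBatch mirrors the dict-style row["row_no"] writes above but is an assumption.

async def collect_rows(
    scraper: BaseScraper, ctx: Context, columns: List[Column]
) -> List[Row]:
    rows: List[Row] = []
    async for batch in scraper.summarize_stream(
        ctx=ctx,
        output_columns=columns,
        batch_size=5,
        limit=20,
        concurrency=2,
        row_offset=100,  # e.g. 100 rows were already emitted by an earlier run
    ):
        # the batch built from the first five source items carries offset 100 and
        # row_no values 101..105; the next five yield offset 105 and 106..110
        rows.extend(batch["items"])
    return rows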
@@ -268,7 +279,7 @@ async def _summarize_llm_call(
        ctx: Context,
        items: List[SourceItem],
        output_columns: List[Column],
-    ) -> List[SummaryItem]:
+    ) -> List[Row]:
        """
        Summarize the content of a webpage into a table using LLM.

@@ -309,9 +320,9 @@ async def _summarize_llm_call(
        async for row in llm_summarize(ctx.llm, messages):
            index = int(row.pop(__INDEX_COLUMN__["name"]))
            results.append(
-                SummaryItem(
+                Row(
                    hash=items[index]["hash"],
-                    index=index,
+                    original_data_index=index,
                    values=row,
                )
            )
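For context, the .types module itself is outside this diff, so the following is only a hedged reconstruction of what the renamed types appear to look like from their usage above; TypedDict, the optional row_no field, and the value types are inferred, not confirmed.

from typing import Any, Dict, List, TypedDict


class Row(TypedDict, total=False):
    hash: str                  # hash of the source item the row was built from
    original_data_index: int   # index of that item within the summarized batch
    values: Dict[str, Any]     # column name -> summarized value
    row_no: int                # absolute 1-based row number, set by summarize_stream


class RowBatch(TypedDict):
    offset: int        # absolute offset of the first row in this batch (row_offset included)
    batch_id: int      # sequential batch number assigned in run_batch
    items: List[Row]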