diff --git a/pydantic_ai_slim/pydantic_ai/evals/__init__.py b/pydantic_ai_slim/pydantic_ai/evals/__init__.py index 3461dc0e0..5b6d6f0f8 100644 --- a/pydantic_ai_slim/pydantic_ai/evals/__init__.py +++ b/pydantic_ai_slim/pydantic_ai/evals/__init__.py @@ -59,9 +59,9 @@ async def function_i_want_to_evaluate(x: int, deps: str) -> int: eval_case.record_label('sentiment', 'positive') eval_case.record_label('new_label', 'world') - baseline_eval.print_report() - new_eval.print_report() - new_eval.print_diff(baseline=baseline_eval, include_removed_cases=True) + baseline_eval.print_report(include_input=True, include_output=True) + new_eval.print_report(include_input=True, include_output=True) + new_eval.print_diff(baseline=baseline_eval, include_input=True, include_output=True, include_removed_cases=True) if __name__ == '__main__': diff --git a/pydantic_ai_slim/pydantic_ai/evals/evals.py b/pydantic_ai_slim/pydantic_ai/evals/evals.py index a36785d11..de8e54947 100644 --- a/pydantic_ai_slim/pydantic_ai/evals/evals.py +++ b/pydantic_ai_slim/pydantic_ai/evals/evals.py @@ -47,6 +47,7 @@ def print_report( include_input: bool = False, include_output: bool = False, include_total_duration: bool = False, + include_averages: bool = True, input_config: RenderValueConfig | None = None, output_config: RenderValueConfig | None = None, score_configs: dict[str, RenderNumberConfig] | None = None, @@ -60,6 +61,7 @@ def print_report( include_input=include_input, include_output=include_output, include_total_duration=include_total_duration, + include_averages=include_averages, input_config=input_config, output_config=output_config, score_configs=score_configs, @@ -78,6 +80,7 @@ def print_diff( include_output: bool = False, include_total_duration: bool = False, include_removed_cases: bool = False, + include_averages: bool = True, input_config: RenderValueConfig | None = None, output_config: RenderValueConfig | None = None, score_configs: dict[str, RenderNumberConfig] | None = None, @@ -93,6 +96,7 @@ def print_diff( include_output=include_output, include_total_duration=include_total_duration, include_removed_cases=include_removed_cases, + include_averages=include_averages, input_config=input_config, output_config=output_config, score_configs=score_configs, @@ -220,11 +224,11 @@ def as_report_case(self) -> EvalReportCase: case_id=self.case_id, case_input=self.case_input, case_output=self.case_output, - task_duration=_get_span_duration(self.task_span), - total_duration=_get_span_duration(self.case_span), scores=self.scores, metrics=self.metrics, labels=self.labels, + task_duration=_get_span_duration(self.task_span), + total_duration=_get_span_duration(self.case_span), ) diff --git a/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py b/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py index 4bac61d5a..c8adbfa7c 100644 --- a/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py +++ b/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py @@ -115,38 +115,35 @@ def default_render_number_diff(old: float | int, new: float | int) -> str | None # Compute the raw difference. delta = new - old - diff_str = _format_signed(delta, ABS_SIG_FIGS) + abs_diff_str = _render_signed(delta, ABS_SIG_FIGS) + rel_diff_str = _render_relative(new, old, BASE_THRESHOLD) + if rel_diff_str is None: + return abs_diff_str + else: + return f'{abs_diff_str} / {rel_diff_str}' - # If we cannot compute a relative change, return just the diff. - if old == 0: - return diff_str - # For very small base values with huge changes, drop the relative indicator. 
- if abs(old) < BASE_THRESHOLD and abs(delta) > MULTIPLIER_DROP_FACTOR * abs(old): - return diff_str +def default_render_duration(seconds: float) -> str: + """Format a duration given in seconds to a string. - # Compute the relative change as a percentage. - rel_change = (delta / old) * 100 - perc_str = f'{rel_change:+.{PERC_DECIMALS}f}%' - # If the percentage rounds to 0.0%, return only the absolute difference. - if perc_str in ('+0.0%', '-0.0%'): - return diff_str + If the duration is less than 1 millisecond, show microseconds. + If it's less than one second, show milliseconds. + Otherwise, show seconds. + """ + return _render_duration(seconds, False) - # Decide whether to use percentage style or multiplier style. - if abs(delta) / abs(old) <= 1: - # Percentage style. - return f'{diff_str} / {perc_str}' + +def default_render_duration_diff(old: float, new: float) -> str: + """Format a duration difference (in seconds) with an explicit sign.""" + abs_diff_str = _render_duration(new - old, True) + rel_diff_str = _render_relative(new, old, BASE_THRESHOLD) + if rel_diff_str is None: + return abs_diff_str else: - # Multiplier style. - multiplier = new / old - if abs(multiplier) < MULTIPLIER_ONE_DECIMAL_THRESHOLD: - mult_str = f'{multiplier:+.1f}x' - else: - mult_str = f'{multiplier:+.0f}x' - return f'{diff_str} / {mult_str}' + return f'{abs_diff_str} / {rel_diff_str}' -def _format_signed(val: float, sig_figs: int = ABS_SIG_FIGS) -> str: +def _render_signed(val: float, sig_figs: int) -> str: """Format a number with a fixed number of significant figures. If the result does not use scientific notation and lacks a decimal point, @@ -158,7 +155,39 @@ def _format_signed(val: float, sig_figs: int = ABS_SIG_FIGS) -> str: return f"{'+' if val >= 0 else '-'}{s}" -def default_render_duration(seconds: float) -> str: +def _render_relative(new: float, base: float, small_base_threshold: float) -> str | None: + # If we cannot compute a relative change, return just the diff. + if base == 0: + return None + + delta = new - base + + # For very small base values with huge changes, drop the relative indicator. + if abs(base) < small_base_threshold and abs(delta) > MULTIPLIER_DROP_FACTOR * abs(base): + return None + + # Compute the relative change as a percentage. + rel_change = (delta / base) * 100 + perc_str = f'{rel_change:+.{PERC_DECIMALS}f}%' + # If the percentage rounds to 0.0%, return only the absolute difference. + if perc_str in ('+0.0%', '-0.0%'): + return None + + # Decide whether to use percentage style or multiplier style. + if abs(delta) / abs(base) <= 1: + # Percentage style. + return perc_str + else: + # Multiplier style. + multiplier = new / base + if abs(multiplier) < MULTIPLIER_ONE_DECIMAL_THRESHOLD: + mult_str = f'{multiplier:.1f}x' + else: + mult_str = f'{multiplier:.0f}x' + return mult_str + + +def _render_duration(seconds: float, signed: bool) -> str: """Format a duration given in seconds to a string. If the duration is less than 1 millisecond, show microseconds. @@ -177,26 +206,7 @@ def default_render_duration(seconds: float) -> str: else: value = seconds unit = 's' - return f'{value:.{precision}f}{unit}' - - -def default_render_duration_diff(old: float, new: float) -> str: - """Format a duration difference (in seconds) with an explicit sign. - - Uses the same unit as format_duration. 
- """ - diff = new - old - precision = 1 - abs_diff = abs(diff) - if abs_diff < 1e-3: - value = diff * 1_000_000 - unit = 'µs' - if abs(value) >= 1: - precision = 0 - elif abs_diff < 1: - value = diff * 1_000 - unit = 'ms' + if signed: + return f'{value:+.{precision}f}{unit}' else: - value = diff - unit = 's' - return f'{value:+.{precision}f}{unit}' + return f'{value:.{precision}f}{unit}' diff --git a/pydantic_ai_slim/pydantic_ai/evals/reports.py b/pydantic_ai_slim/pydantic_ai/evals/reports.py index 0aeeee302..8c6d94ee5 100644 --- a/pydantic_ai_slim/pydantic_ai/evals/reports.py +++ b/pydantic_ai_slim/pydantic_ai/evals/reports.py @@ -1,5 +1,6 @@ from __future__ import annotations as _annotations +from collections import defaultdict from collections.abc import Mapping from dataclasses import dataclass from typing import Any, Callable, Literal, Protocol, TypeVar @@ -20,6 +21,8 @@ __all__ = ('EvalReport', 'EvalReportCase', 'EvalRenderer', 'RenderValueConfig', 'RenderNumberConfig') MISSING_VALUE_STR = '[i][/i]' +EMPTY_CELL_STR = '-' +EMPTY_AGGREGATE_CELL_STR = '' class RenderValueConfig(TypedDict, total=False): @@ -274,6 +277,46 @@ def render_diff(self, name: str | None, old: T_contra | None, new: T_contra | No ) +class EvalReportCaseAggregate(BaseModel): + name: str + + scores: dict[str, float | int] + metrics: dict[str, float | int] + task_duration: float + total_duration: float + + @staticmethod + def average(cases: list[EvalReportCase]) -> EvalReportCaseAggregate: + """Produce a synthetic "summary" case by averaging quantitative attributes.""" + num_cases = len(cases) + if num_cases == 0: + raise ValueError('Cannot summarize an empty list of cases') + + def _averages_by_name(values_by_name: list[dict[str, int | float]]) -> dict[str, float]: + counts_by_name: dict[str, int] = defaultdict(int) + sums_by_name: dict[str, float] = defaultdict(float) + for values in values_by_name: + for name, value in values.items(): + counts_by_name[name] += 1 + sums_by_name[name] += value + return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name} + + average_task_duration = sum(case.task_duration for case in cases) / num_cases + average_total_duration = sum(case.total_duration for case in cases) / num_cases + + average_scores: dict[str, float] = _averages_by_name([case.scores for case in cases]) + # TODO: Aggregate labels, showing the percentage occurrences of each label + average_metrics: dict[str, float] = _averages_by_name([case.metrics for case in cases]) + + return EvalReportCaseAggregate( + name='Averages', + scores=average_scores, + metrics=average_metrics, + task_duration=average_task_duration, + total_duration=average_total_duration, + ) + + class EvalReportCase(BaseModel): """A single case in an evaluation report.""" @@ -281,11 +324,11 @@ class EvalReportCase(BaseModel): case_input: dict[str, Any] case_output: Any - task_duration: float - total_duration: float # includes scoring time scores: dict[str, float | int] metrics: dict[str, float | int] labels: dict[str, bool | str] + task_duration: float + total_duration: float # includes scoring time class EvalReport(BaseModel): @@ -301,11 +344,12 @@ def console_table( include_output: bool = False, include_total_duration: bool = False, include_removed_cases: bool = False, + include_averages: bool = True, + input_config: RenderValueConfig | None = None, + output_config: RenderValueConfig | None = None, score_configs: dict[str, RenderNumberConfig] | None = None, label_configs: dict[str, RenderValueConfig] | None = None, 
metric_configs: dict[str, RenderNumberConfig] | None = None, - input_config: RenderValueConfig | None = None, - output_config: RenderValueConfig | None = None, duration_config: RenderNumberConfig | None = None, ) -> Table: """Print a diff table comparing the baseline and new EvalReport. @@ -317,11 +361,12 @@ def console_table( include_output=include_output, include_total_duration=include_total_duration, include_removed_cases=include_removed_cases, + include_averages=include_averages, + input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})}, + output_config=output_config or _DEFAULT_VALUE_CONFIG, score_configs=score_configs or {}, label_configs=label_configs or {}, metric_configs=metric_configs or {}, - input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})}, - output_config=output_config or _DEFAULT_VALUE_CONFIG, duration_config=duration_config or _DEFAULT_DURATION_CONFIG, ) if baseline is None: @@ -342,12 +387,11 @@ class EvalCaseRenderer: include_metrics: bool include_total_duration: bool + input_renderer: _ValueRenderer + output_renderer: _ValueRenderer score_renderers: dict[str, _NumberRenderer] label_renderers: dict[str, _ValueRenderer] metric_renderers: dict[str, _NumberRenderer] - - input_renderer: _ValueRenderer - output_renderer: _ValueRenderer duration_renderer: _NumberRenderer def build_base_table(self, title: str) -> Table: @@ -372,10 +416,10 @@ def build_row(self, case: EvalReportCase) -> list[str]: row = [case.case_id] if self.include_input: - row.append(self.input_renderer.render_value(None, case.case_input) or MISSING_VALUE_STR) + row.append(self.input_renderer.render_value(None, case.case_input) or EMPTY_CELL_STR) if self.include_output: - row.append(self.output_renderer.render_value(None, case.case_output) or MISSING_VALUE_STR) + row.append(self.output_renderer.render_value(None, case.case_output) or EMPTY_CELL_STR) if self.include_scores: row.append(self._render_dict(case.scores, self.score_renderers)) @@ -389,10 +433,33 @@ def build_row(self, case: EvalReportCase) -> list[str]: row.append(self._render_durations(case)) return row + def build_aggregate_row(self, aggregate: EvalReportCaseAggregate) -> list[str]: + """Build a table row for an aggregated case.""" + row = [f'[b i]{aggregate.name}[/]'] + + if self.include_input: + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_output: + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_scores: + row.append(self._render_dict(aggregate.scores, self.score_renderers)) + + if self.include_labels: + # TODO: Aggregate labels, showing the percentage occurrences of each label + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_metrics: + row.append(self._render_dict(aggregate.metrics, self.metric_renderers)) + + row.append(self._render_durations(aggregate)) + return row + def build_diff_row( self, - baseline: EvalReportCase, new_case: EvalReportCase, + baseline: EvalReportCase, ) -> list[str]: """Build a table row for a given case ID.""" assert baseline.case_id == new_case.case_id, 'This should only be called for matching case IDs' @@ -400,13 +467,13 @@ def build_diff_row( if self.include_input: input_diff = ( - self.input_renderer.render_diff(None, baseline.case_input, new_case.case_input) or MISSING_VALUE_STR + self.input_renderer.render_diff(None, baseline.case_input, new_case.case_input) or EMPTY_CELL_STR ) row.append(input_diff) if self.include_output: output_diff = ( - self.output_renderer.render_diff(None, baseline.case_output, new_case.case_output) or MISSING_VALUE_STR + 
self.output_renderer.render_diff(None, baseline.case_output, new_case.case_output) or EMPTY_CELL_STR ) row.append(output_diff) @@ -426,7 +493,37 @@ def build_diff_row( return row - def _render_durations(self, case: EvalReportCase) -> str: + def build_diff_aggregate_row( + self, + new: EvalReportCaseAggregate, + baseline: EvalReportCaseAggregate, + ) -> list[str]: + """Build a table row for a given case ID.""" + assert baseline.name == new.name, 'This should only be called for aggregates with matching names' + row = [f'[b i]{baseline.name}[/]'] + + if self.include_input: + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_output: + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_scores: + scores_diff = self._render_dicts_diff(baseline.scores, new.scores, self.score_renderers) + row.append(scores_diff) + + if self.include_labels: + row.append(EMPTY_AGGREGATE_CELL_STR) + + if self.include_metrics: + metrics_diff = self._render_dicts_diff(baseline.metrics, new.metrics, self.metric_renderers) + row.append(metrics_diff) + + row.append(self._render_durations_diff(baseline, new)) + + return row + + def _render_durations(self, case: EvalReportCase | EvalReportCaseAggregate) -> str: """Build the diff string for a duration value.""" case_durations: dict[str, float] = {'task': case.task_duration} if self.include_total_duration: @@ -437,7 +534,11 @@ def _render_durations(self, case: EvalReportCase) -> str: include_names=self.include_total_duration, ) - def _render_durations_diff(self, base_case: EvalReportCase, new_case: EvalReportCase) -> str: + def _render_durations_diff( + self, + base_case: EvalReportCase | EvalReportCaseAggregate, + new_case: EvalReportCase | EvalReportCaseAggregate, + ) -> str: """Build the diff string for a duration value.""" base_case_durations: dict[str, float] = {'task': base_case.task_duration} new_case_durations: dict[str, float] = {'task': new_case.task_duration} @@ -468,7 +569,7 @@ def _render_dicts_diff( new_val = new_dict.get(key) rendered = renderers[key].render_diff(key if include_names else None, old_val, new_val) diff_lines.append(rendered) - return '\n'.join(diff_lines) if diff_lines else MISSING_VALUE_STR + return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR @staticmethod def _render_dict( @@ -481,24 +582,27 @@ def _render_dict( for key, val in case_dict.items(): rendered = renderers[key].render_value(key if include_names else None, val) diff_lines.append(rendered) - return '\n'.join(diff_lines) if diff_lines else MISSING_VALUE_STR + return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR @dataclass class EvalRenderer: """A class for rendering an EvalReport or the diff between two EvalReports.""" + # Columns to include include_input: bool include_output: bool include_total_duration: bool + + # Rows to include include_removed_cases: bool + include_averages: bool + input_config: RenderValueConfig + output_config: RenderValueConfig score_configs: dict[str, RenderNumberConfig] label_configs: dict[str, RenderValueConfig] metric_configs: dict[str, RenderNumberConfig] - - input_config: RenderValueConfig - output_config: RenderValueConfig duration_config: RenderNumberConfig def include_scores(self, report: EvalReport, baseline: EvalReport | None = None): @@ -523,11 +627,11 @@ def _baseline_cases_to_include(self, report: EvalReport, baseline: EvalReport) - return [case for case in baseline.cases if case.case_id in report_case_ids] def _get_case_renderer(self, report: EvalReport, baseline: EvalReport | None = None) -> EvalCaseRenderer: + 
input_renderer = _ValueRenderer.from_config(self.input_config) + output_renderer = _ValueRenderer.from_config(self.output_config) score_renderers = self._infer_score_renderers(report, baseline) label_renderers = self._infer_label_renderers(report, baseline) metric_renderers = self._infer_metric_renderers(report, baseline) - input_renderer = _ValueRenderer.from_config(self.input_config) - output_renderer = _ValueRenderer.from_config(self.output_config) duration_renderer = _NumberRenderer.infer_from_config( self.duration_config, 'duration', [x.task_duration for x in self._all_cases(report, baseline)] ) @@ -539,11 +643,11 @@ def _get_case_renderer(self, report: EvalReport, baseline: EvalReport | None = N include_labels=self.include_labels(report, baseline), include_metrics=self.include_metrics(report, baseline), include_total_duration=self.include_total_duration, + input_renderer=input_renderer, + output_renderer=output_renderer, score_renderers=score_renderers, label_renderers=label_renderers, metric_renderers=metric_renderers, - input_renderer=input_renderer, - output_renderer=output_renderer, duration_renderer=duration_renderer, ) @@ -552,11 +656,18 @@ def build_table(self, report: EvalReport) -> Table: table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}') for case in report.cases: table.add_row(*case_renderer.build_row(case)) + + if self.include_averages: + average = EvalReportCaseAggregate.average(report.cases) + table.add_row(*case_renderer.build_aggregate_row(average)) return table def build_diff_table(self, report: EvalReport, baseline: EvalReport) -> Table: - report_cases_by_id = {case.case_id: case for case in report.cases} - baseline_cases_by_id = {case.case_id: case for case in self._baseline_cases_to_include(report, baseline)} + report_cases = report.cases + baseline_cases = self._baseline_cases_to_include(report, baseline) + + report_cases_by_id = {case.case_id: case for case in report_cases} + baseline_cases_by_id = {case.case_id: case for case in baseline_cases} diff_cases: list[tuple[EvalReportCase, EvalReportCase]] = [] removed_cases: list[EvalReportCase] = [] @@ -577,7 +688,7 @@ def build_diff_table(self, report: EvalReport, baseline: EvalReport) -> Table: case_renderer = self._get_case_renderer(report, baseline) table = case_renderer.build_base_table(f'Evaluation Diff: {baseline.name} → {report.name}') for baseline_case, new_case in diff_cases: - table.add_row(*case_renderer.build_diff_row(baseline_case, new_case)) + table.add_row(*case_renderer.build_diff_row(new_case, baseline_case)) for case in added_cases: row = case_renderer.build_row(case) row[0] = f'[green]+ Added Case[/]\n{row[0]}' @@ -586,6 +697,12 @@ def build_diff_table(self, report: EvalReport, baseline: EvalReport) -> Table: row = case_renderer.build_row(case) row[0] = f'[red]- Removed Case[/]\n{row[0]}' table.add_row(*row) + + if self.include_averages: + report_average = EvalReportCaseAggregate.average(report_cases) + baseline_average = EvalReportCaseAggregate.average(baseline_cases) + table.add_row(*case_renderer.build_diff_aggregate_row(report_average, baseline_average)) + return table def _infer_score_renderers(self, report: EvalReport, baseline: EvalReport | None) -> dict[str, _NumberRenderer]:
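
As a rough usage sketch against the classes and signatures added above (import paths are inferred from the file layout in this diff, and the exact rendered strings depend on module constants such as BASE_THRESHOLD and MULTIPLIER_ONE_DECIMAL_THRESHOLD that sit outside these hunks), the new "Averages" aggregation and the consolidated duration rendering behave roughly like this:

from pydantic_ai.evals.render_numbers import default_render_duration_diff
from pydantic_ai.evals.reports import EvalReportCase, EvalReportCaseAggregate

cases = [
    EvalReportCase(
        case_id='case-1',
        case_input={'x': 1},
        case_output=2,
        scores={'accuracy': 1.0},
        metrics={'tokens': 100},
        labels={'sentiment': 'positive'},
        task_duration=0.5,
        total_duration=0.6,
    ),
    EvalReportCase(
        case_id='case-2',
        case_input={'x': 2},
        case_output=4,
        scores={'accuracy': 0.0, 'f1': 0.8},
        metrics={'tokens': 300},
        labels={'sentiment': 'negative'},
        task_duration=1.5,
        total_duration=1.8,
    ),
]

# EvalReportCaseAggregate.average() averages each score/metric only over the
# cases that actually report it:
#   accuracy -> 0.5 (both cases), f1 -> 0.8 (one case), tokens -> 200.0
aggregate = EvalReportCaseAggregate.average(cases)
print(aggregate.scores, aggregate.metrics)
print(aggregate.task_duration, aggregate.total_duration)  # 1.0 1.2

# The renderers append this synthetic row, titled 'Averages', to the bottom of
# both the summary and diff tables whenever include_averages=True, which is the
# new default on print_report/print_diff/console_table.

# Duration diffs now go through the shared _render_duration/_render_relative
# helpers, so the rendered value is '<signed absolute> / <relative>' whenever a
# relative change is shown, e.g. something like '+1.0s / 3x' here (the exact
# formatting depends on the thresholds mentioned above):
print(default_render_duration_diff(0.5, 1.5))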