Add averages and do some more cleanup

pydantic · Feb 17, 2025 · 1ba636e · 1ba636e
1 parent 4091b5a
commit 1ba636e
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 81 deletions.
diff --git a/pydantic_ai_slim/pydantic_ai/evals/__init__.py b/pydantic_ai_slim/pydantic_ai/evals/__init__.py
@@ -59,9 +59,9 @@ async def function_i_want_to_evaluate(x: int, deps: str) -> int:
                 eval_case.record_label('sentiment', 'positive')
                 eval_case.record_label('new_label', 'world')
 
-    baseline_eval.print_report()
-    new_eval.print_report()
-    new_eval.print_diff(baseline=baseline_eval, include_removed_cases=True)
+    baseline_eval.print_report(include_input=True, include_output=True)
+    new_eval.print_report(include_input=True, include_output=True)
+    new_eval.print_diff(baseline=baseline_eval, include_input=True, include_output=True, include_removed_cases=True)
 
 
 if __name__ == '__main__':

diff --git a/pydantic_ai_slim/pydantic_ai/evals/evals.py b/pydantic_ai_slim/pydantic_ai/evals/evals.py
@@ -47,6 +47,7 @@ def print_report(
         include_input: bool = False,
         include_output: bool = False,
         include_total_duration: bool = False,
+        include_averages: bool = True,
         input_config: RenderValueConfig | None = None,
         output_config: RenderValueConfig | None = None,
         score_configs: dict[str, RenderNumberConfig] | None = None,
@@ -60,6 +61,7 @@ def print_report(
                 include_input=include_input,
                 include_output=include_output,
                 include_total_duration=include_total_duration,
+                include_averages=include_averages,
                 input_config=input_config,
                 output_config=output_config,
                 score_configs=score_configs,
@@ -78,6 +80,7 @@ def print_diff(
         include_output: bool = False,
         include_total_duration: bool = False,
         include_removed_cases: bool = False,
+        include_averages: bool = True,
         input_config: RenderValueConfig | None = None,
         output_config: RenderValueConfig | None = None,
         score_configs: dict[str, RenderNumberConfig] | None = None,
@@ -93,6 +96,7 @@ def print_diff(
                 include_output=include_output,
                 include_total_duration=include_total_duration,
                 include_removed_cases=include_removed_cases,
+                include_averages=include_averages,
                 input_config=input_config,
                 output_config=output_config,
                 score_configs=score_configs,
@@ -220,11 +224,11 @@ def as_report_case(self) -> EvalReportCase:
             case_id=self.case_id,
             case_input=self.case_input,
             case_output=self.case_output,
-            task_duration=_get_span_duration(self.task_span),
-            total_duration=_get_span_duration(self.case_span),
             scores=self.scores,
             metrics=self.metrics,
             labels=self.labels,
+            task_duration=_get_span_duration(self.task_span),
+            total_duration=_get_span_duration(self.case_span),
         )
 
 

diff --git a/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py b/pydantic_ai_slim/pydantic_ai/evals/render_numbers.py
@@ -115,38 +115,35 @@ def default_render_number_diff(old: float | int, new: float | int) -> str | None
 
     # Compute the raw difference.
     delta = new - old
-    diff_str = _format_signed(delta, ABS_SIG_FIGS)
+    abs_diff_str = _render_signed(delta, ABS_SIG_FIGS)
+    rel_diff_str = _render_relative(new, old, BASE_THRESHOLD)
+    if rel_diff_str is None:
+        return abs_diff_str
+    else:
+        return f'{abs_diff_str} / {rel_diff_str}'
 
-    # If we cannot compute a relative change, return just the diff.
-    if old == 0:
-        return diff_str
 
-    # For very small base values with huge changes, drop the relative indicator.
-    if abs(old) < BASE_THRESHOLD and abs(delta) > MULTIPLIER_DROP_FACTOR * abs(old):
-        return diff_str
+def default_render_duration(seconds: float) -> str:
+    """Format a duration given in seconds to a string.
 
-    # Compute the relative change as a percentage.
-    rel_change = (delta / old) * 100
-    perc_str = f'{rel_change:+.{PERC_DECIMALS}f}%'
-    # If the percentage rounds to 0.0%, return only the absolute difference.
-    if perc_str in ('+0.0%', '-0.0%'):
-        return diff_str
+    If the duration is less than 1 millisecond, show microseconds.
+    If it's less than one second, show milliseconds.
+    Otherwise, show seconds.
+    """
+    return _render_duration(seconds, False)
 
-    # Decide whether to use percentage style or multiplier style.
-    if abs(delta) / abs(old) <= 1:
-        # Percentage style.
-        return f'{diff_str} / {perc_str}'
+
+def default_render_duration_diff(old: float, new: float) -> str:
+    """Format a duration difference (in seconds) with an explicit sign."""
+    abs_diff_str = _render_duration(new - old, True)
+    rel_diff_str = _render_relative(new, old, BASE_THRESHOLD)
+    if rel_diff_str is None:
+        return abs_diff_str
     else:
-        # Multiplier style.
-        multiplier = new / old
-        if abs(multiplier) < MULTIPLIER_ONE_DECIMAL_THRESHOLD:
-            mult_str = f'{multiplier:+.1f}x'
-        else:
-            mult_str = f'{multiplier:+.0f}x'
-        return f'{diff_str} / {mult_str}'
+        return f'{abs_diff_str} / {rel_diff_str}'
 
 
-def _format_signed(val: float, sig_figs: int = ABS_SIG_FIGS) -> str:
+def _render_signed(val: float, sig_figs: int) -> str:
     """Format a number with a fixed number of significant figures.
 
     If the result does not use scientific notation and lacks a decimal point,
@@ -158,7 +155,39 @@ def _format_signed(val: float, sig_figs: int = ABS_SIG_FIGS) -> str:
     return f"{'+' if val >= 0 else '-'}{s}"
 
 
-def default_render_duration(seconds: float) -> str:
+def _render_relative(new: float, base: float, small_base_threshold: float) -> str | None:
+    # If the base is zero, a relative change is undefined; return None so the caller shows only the absolute diff.
+    if base == 0:
+        return None
+
+    delta = new - base
+
+    # For very small base values with huge changes, drop the relative indicator.
+    if abs(base) < small_base_threshold and abs(delta) > MULTIPLIER_DROP_FACTOR * abs(base):
+        return None
+
+    # Compute the relative change as a percentage.
+    rel_change = (delta / base) * 100
+    perc_str = f'{rel_change:+.{PERC_DECIMALS}f}%'
+    # If the percentage rounds to 0.0%, return None so that only the absolute difference is shown.
+    if perc_str in ('+0.0%', '-0.0%'):
+        return None
+
+    # Decide whether to use percentage style or multiplier style.
+    if abs(delta) / abs(base) <= 1:
+        # Percentage style.
+        return perc_str
+    else:
+        # Multiplier style.
+        multiplier = new / base
+        if abs(multiplier) < MULTIPLIER_ONE_DECIMAL_THRESHOLD:
+            mult_str = f'{multiplier:.1f}x'
+        else:
+            mult_str = f'{multiplier:.0f}x'
+        return mult_str
+
+
+def _render_duration(seconds: float, signed: bool) -> str:
     """Format a duration given in seconds to a string.
 
     If the duration is less than 1 millisecond, show microseconds.
@@ -177,26 +206,7 @@ def default_render_duration(seconds: float) -> str:
     else:
         value = seconds
         unit = 's'
-    return f'{value:.{precision}f}{unit}'
-
-
-def default_render_duration_diff(old: float, new: float) -> str:
-    """Format a duration difference (in seconds) with an explicit sign.
-
-    Uses the same unit as format_duration.
-    """
-    diff = new - old
-    precision = 1
-    abs_diff = abs(diff)
-    if abs_diff < 1e-3:
-        value = diff * 1_000_000
-        unit = 'µs'
-        if abs(value) >= 1:
-            precision = 0
-    elif abs_diff < 1:
-        value = diff * 1_000
-        unit = 'ms'
+    if signed:
+        return f'{value:+.{precision}f}{unit}'
     else:
-        value = diff
-        unit = 's'
-    return f'{value:+.{precision}f}{unit}'
+        return f'{value:.{precision}f}{unit}'