Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: breaking, add status enum for evaluations #2169

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions genkit-tools/common/src/eval/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export function enrichResultsWithScoring(
evaluator,
score: s.score,
scoreId: s.id,
status: s.status,
rationale: s.details?.reasoning,
error: s.error,
traceId: scoredSample.traceId,
Expand Down
3 changes: 3 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,13 @@ export type EvalInput = z.infer<typeof EvalInputSchema>;
export const EvalInputDatasetSchema = z.array(EvalInputSchema);
export type EvalInputDataset = z.infer<typeof EvalInputDatasetSchema>;

/**
 * Zod enum of the status strings an evaluation metric may carry:
 * 'UNKNOWN', 'PASS' or 'FAIL'. Used for the optional `status` field of
 * `EvalMetricSchema`.
 */
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);

export const EvalMetricSchema = z.object({
evaluator: z.string(),
scoreId: z.string().optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
status: EvalStatusEnumSchema.optional(),
rationale: z.string().optional(),
error: z.string().optional(),
traceId: z.string().optional(),
Expand Down
7 changes: 6 additions & 1 deletion genkit-tools/common/src/types/evaluators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
*/
import { z } from 'zod';

//
// IMPORTANT: Keep this file in sync with js/ai/src/evaluator.ts!
//

/**
 * Zod enum of the status strings a score may carry: 'UNKNOWN', 'PASS' or
 * 'FAIL'. Used for the optional `status` field of `ScoreSchema`.
 */
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
export const ScoreSchema = z.object({
id: z
.string()
Expand All @@ -31,7 +36,7 @@ export const ScoreSchema = z.object({
)
.optional(),
score: z.number().optional(),
// TODO: use StatusSchema
status: EvalStatusEnumSchema.optional(),
error: z.string().optional(),
details: z
.object({
Expand Down
73 changes: 62 additions & 11 deletions js/ai/src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ export const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
});
export type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;

/**
 * Zod enum of the status strings a score may carry: 'UNKNOWN', 'PASS' or
 * 'FAIL'. Used for the optional `status` field of `ScoreSchema` and as the
 * return type of `StatusOverrideFnSchema`.
 */
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);

/**
 * Enum that indicates if an evaluation has passed or failed.
 *
 * Each member's value is the same string literal accepted by
 * `EvalStatusEnumSchema`, so enum members can be used wherever a score
 * `status` is expected.
 */
export enum EvalStatusEnum {
// Fallback applied by `augementStatus` when the status override function
// returns a nullish value.
UNKNOWN = 'UNKNOWN',
PASS = 'PASS',
// Also set by `defineEvaluator` on the response recorded for a test case
// whose evaluation threw.
FAIL = 'FAIL',
}

export const ScoreSchema = z.object({
id: z
.string()
Expand All @@ -46,7 +55,7 @@ export const ScoreSchema = z.object({
)
.optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
// TODO: use StatusSchema
status: EvalStatusEnumSchema.optional(),
error: z.string().optional(),
details: z
.object({
Expand Down Expand Up @@ -76,13 +85,28 @@ export const EvalResponseSchema = z.object({
});
export type EvalResponse = z.infer<typeof EvalResponseSchema>;

export const EvalResponsesSchema = z.array(EvalResponseSchema);
export type EvalResponses = z.infer<typeof EvalResponsesSchema>;
/**
 * Schema for a status override callback: a function that takes one score
 * (shaped by `ScoreSchema`) and returns one of the `EvalStatusEnumSchema`
 * values.
 */
const StatusOverrideFnSchema = z
.function()
.args(ScoreSchema)
.returns(EvalStatusEnumSchema);
/** Function type inferred from `StatusOverrideFnSchema`. */
export type StatusOverrideFn = z.infer<typeof StatusOverrideFnSchema>;
/**
 * Base options object for evaluators.
 *
 * The only field declared here is the optional `statusOverrideFn`;
 * `.passthrough()` lets evaluator-specific option keys pass through
 * validation instead of being stripped. The schema as a whole is optional,
 * so `undefined` is accepted.
 *
 * NOTE(review): `defineEvaluator` calls `.optional()` on this schema again
 * when no `configSchema` is supplied; that second call looks redundant —
 * confirm.
 */
export const BaseEvalOptionsSchema = z
.object({
statusOverrideFn: StatusOverrideFnSchema.optional(),
})
.passthrough()
.optional();
/** Options type inferred from `BaseEvalOptionsSchema`. */
export type BaseEvalOptions = z.infer<typeof BaseEvalOptionsSchema>;

/**
 * Output schema of an evaluator action: an array of `EvalResponseSchema`
 * entries.
 */
export const EvalActionResponseSchema = z.array(EvalResponseSchema);
/** Array-of-responses type inferred from `EvalActionResponseSchema`. */
export type EvalResponses = z.infer<typeof EvalActionResponseSchema>;

export type EvaluatorFn<
EvalDataPoint extends
typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
CustomOptions extends
typeof BaseEvalOptionsSchema = typeof BaseEvalOptionsSchema,
> = (
input: z.infer<EvalDataPoint>,
evaluatorOptions?: z.infer<CustomOptions>
Expand All @@ -91,7 +115,7 @@ export type EvaluatorFn<
export type EvaluatorAction<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
> = Action<typeof EvalRequestSchema, typeof EvalResponsesSchema> & {
> = Action<typeof EvalRequestSchema, typeof EvalActionResponseSchema> & {
__dataPointType?: DataPoint;
__configSchema?: CustomOptions;
};
Expand All @@ -100,7 +124,7 @@ function withMetadata<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
>(
evaluator: Action<typeof EvalRequestSchema, typeof EvalResponsesSchema>,
evaluator: Action<typeof EvalRequestSchema, typeof EvalActionResponseSchema>,
dataPointType?: DataPoint,
configSchema?: CustomOptions
): EvaluatorAction<DataPoint, CustomOptions> {
Expand Down Expand Up @@ -133,7 +157,8 @@ export function defineEvaluator<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
EvalDataPoint extends
typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,
EvaluatorOptions extends z.ZodTypeAny = z.ZodTypeAny,
EvaluatorOptions extends
typeof BaseEvalOptionsSchema = typeof BaseEvalOptionsSchema,
>(
registry: Registry,
options: {
Expand All @@ -160,10 +185,10 @@ export function defineEvaluator<
dataset: options.dataPointType
? z.array(options.dataPointType)
: z.array(BaseDataPointSchema),
options: options.configSchema ?? z.unknown(),
options: options.configSchema ?? BaseEvalOptionsSchema.optional(),
evalRunId: z.string(),
}),
outputSchema: EvalResponsesSchema,
outputSchema: EvalActionResponseSchema,
metadata: metadata,
},
async (i) => {
Expand Down Expand Up @@ -199,7 +224,14 @@ export function defineEvaluator<
testCaseOutput.spanId = spanId;
testCaseOutput.traceId = traceId;
metadata.output = testCaseOutput;
evalResponses.push(testCaseOutput);
evalResponses.push(
i.options?.statusOverrideFn
? augementStatus(
testCaseOutput,
i.options?.statusOverrideFn
)
: testCaseOutput
);
return testCaseOutput;
} catch (e) {
evalResponses.push({
Expand All @@ -209,8 +241,10 @@ export function defineEvaluator<
testCaseId: datapoint.testCaseId,
evaluation: {
error: `Evaluation of test case ${datapoint.testCaseId} failed: \n${(e as Error).stack}`,
status: EvalStatusEnum.FAIL,
},
});
// Throw to mark the span as failed.
throw e;
}
}
Expand All @@ -228,7 +262,7 @@ export function defineEvaluator<
const ewm = withMetadata(
evaluator as any as Action<
typeof EvalRequestSchema,
typeof EvalResponsesSchema
typeof EvalActionResponseSchema
>,
options.dataPointType,
options.configSchema
Expand Down Expand Up @@ -297,3 +331,20 @@ export function evaluatorRef<
): EvaluatorReference<CustomOptionsSchema> {
return { ...options };
}

/**
 * Recomputes the `status` of every score in an evaluator response using a
 * caller-supplied override function.
 *
 * `response.evaluation` may be a single score or an array of scores. The
 * result keeps whichever shape the input had: a single score stays a single
 * score and an array stays an array, including arrays with exactly one
 * element (which were previously collapsed into a bare score object).
 *
 * @param response - Evaluator output for one test case. It is not mutated.
 * @param statusOverrideFn - Invoked once per score; its return value becomes
 *   that score's status. A nullish return falls back to
 *   `EvalStatusEnum.UNKNOWN`.
 * @returns A shallow copy of `response` whose score(s) carry the new status.
 */
function augementStatus(
  response: EvalResponse,
  statusOverrideFn: StatusOverrideFn
): EvalResponse {
  // Copy the score and overwrite only its status, so the caller's objects
  // are left untouched.
  const withStatus = (score: Parameters<StatusOverrideFn>[0]) => ({
    ...score,
    status: statusOverrideFn(score) ?? EvalStatusEnum.UNKNOWN,
  });

  return {
    ...response,
    evaluation: Array.isArray(response.evaluation)
      ? response.evaluation.map((score) => withStatus(score))
      : withStatus(response.evaluation),
  };
}
1 change: 1 addition & 0 deletions js/ai/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ export {
} from './embedder.js';
export {
BaseDataPointSchema,
EvalStatusEnum,
evaluate,
evaluatorRef,
type EvalResponses,
Expand Down
6 changes: 5 additions & 1 deletion js/genkit/src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@
export {
BaseDataPointSchema,
BaseEvalDataPointSchema,
BaseEvalOptionsSchema,
EvalActionResponseSchema,
EvalResponseSchema,
EvalResponsesSchema,
EvalStatusEnum,
EvaluatorInfoSchema,
ScoreSchema,
evaluatorRef,
type BaseDataPoint,
type BaseEvalDataPoint,
type BaseEvalOptions,
type Dataset,
type EvalResponse,
type EvalResponses,
Expand All @@ -33,4 +36,5 @@ export {
type EvaluatorParams,
type EvaluatorReference,
type Score,
type StatusOverrideFn,
} from '@genkit-ai/ai/evaluator';
5 changes: 3 additions & 2 deletions js/genkit/src/genkit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ import {
z,
} from '@genkit-ai/core';
import { HasRegistry } from '@genkit-ai/core/registry';
import { BaseEvalDataPointSchema } from './evaluator.js';
import { BaseEvalDataPointSchema, BaseEvalOptionsSchema } from './evaluator.js';
import { logger } from './logging.js';
import { GenkitPlugin } from './plugin.js';
import { Registry } from './registry.js';
Expand Down Expand Up @@ -415,7 +415,8 @@ export class Genkit implements HasRegistry {
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
EvalDataPoint extends
typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,
EvaluatorOptions extends z.ZodTypeAny = z.ZodTypeAny,
EvaluatorOptions extends
typeof BaseEvalOptionsSchema = typeof BaseEvalOptionsSchema,
>(
options: {
name: string;
Expand Down
Loading
Loading