Skip to content
57 changes: 56 additions & 1 deletion py/packages/genkit/src/genkit/ai/_aio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
class while customizing it with any plugins.
"""

import uuid
from asyncio import Future
from collections.abc import AsyncIterator
from typing import Any

from genkit.aio import Channel
from genkit.blocks.document import Document
from genkit.blocks.embedding import EmbedderRef
from genkit.blocks.evaluator import EvaluatorRef
from genkit.blocks.generate import (
StreamingCallback as ModelStreamingCallback,
generate_action,
Expand All @@ -40,7 +42,14 @@ class while customizing it with any plugins.
from genkit.blocks.retriever import IndexerRef, IndexerRequest, RetrieverRef
from genkit.core.action import ActionRunContext
from genkit.core.action.types import ActionKind
from genkit.core.typing import EmbedRequest, EmbedResponse
from genkit.core.typing import (
BaseDataPoint,
BaseEvalDataPoint,
EmbedRequest,
EmbedResponse,
EvalRequest,
EvalResponse,
)
from genkit.types import (
DocumentData,
GenerationCommonConfig,
Expand Down Expand Up @@ -391,3 +400,49 @@ async def embed(
embed_action = self.registry.lookup_action(ActionKind.EMBEDDER, embedder_name)

return (await embed_action.arun(EmbedRequest(input=documents, options=final_options))).response

async def evaluate(
    self,
    evaluator: str | EvaluatorRef | None = None,
    dataset: list[BaseDataPoint] | None = None,
    options: Any | None = None,
    eval_run_id: str | None = None,
) -> EvalResponse:
    """Evaluates a dataset using a registered evaluator.

    Args:
        evaluator: Name or :class:`EvaluatorRef` of the evaluator to use.
            Required despite the ``None`` default (kept for signature
            compatibility); passing ``None`` raises ``ValueError``.
        dataset: Non-empty list of datapoints to evaluate.
        options: Evaluation options; merged over the evaluator's
            ``config_schema`` defaults, with ``options`` taking precedence.
        eval_run_id: Optional ID for the evaluation run. A random UUID4 is
            generated when omitted.

    Returns:
        The evaluation results as an ``EvalResponse``.

    Raises:
        ValueError: If ``evaluator`` is missing/of the wrong type, if
            ``dataset`` is empty or ``None``, or if no evaluator action is
            registered under the resolved name.
    """
    # Fail fast with a clear message instead of letting an empty/None
    # dataset surface as an opaque validation error inside EvalRequest.
    if not dataset:
        raise ValueError('A non-empty dataset is required for evaluation.')

    evaluator_name: str = ''
    evaluator_config: dict[str, Any] = {}

    if isinstance(evaluator, EvaluatorRef):
        evaluator_name = evaluator.name
        evaluator_config = evaluator.config_schema or {}
    elif isinstance(evaluator, str):
        evaluator_name = evaluator
    else:
        raise ValueError('Evaluator must be specified as a string name or an EvaluatorRef.')

    # Caller-supplied options override the evaluator's defaults.
    final_options = {**evaluator_config, **(options or {})}

    eval_action = self.registry.lookup_action(ActionKind.EVALUATOR, evaluator_name)
    # NOTE(review): lookup_action appears to return None on a miss (the
    # sibling `embed` method does not check) — guard so callers get a
    # ValueError rather than `'NoneType' has no attribute 'arun'`.
    if eval_action is None:
        raise ValueError(f'Evaluator not found: {evaluator_name}')

    return (
        await eval_action.arun(
            EvalRequest(
                dataset=dataset,
                options=final_options,
                eval_run_id=eval_run_id or str(uuid.uuid4()),
            )
        )
    ).response
43 changes: 32 additions & 11 deletions py/packages/genkit/src/genkit/ai/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,15 +364,40 @@ async def eval_stepper_fn(req: EvalRequest) -> EvalResponse:
metadata={'evaluator:evalRunId': req.eval_run_id},
)
try:
with run_in_new_span(span_metadata, labels={'genkit:type': 'evaluator'}) as span:
span_id = span.span_id
trace_id = span.trace_id
# Try to run with tracing, but fallback if tracing infrastructure fails
# (e.g., in environments with NonRecordingSpans like pre-commit)
try:
with run_in_new_span(span_metadata, labels={'genkit:type': 'evaluator'}) as span:
span_id = span.span_id
trace_id = span.trace_id
try:
span.set_input(datapoint)
test_case_output = await fn(datapoint, req.options)
test_case_output.span_id = span_id
test_case_output.trace_id = trace_id
span.set_output(test_case_output)
eval_responses.append(test_case_output)
except Exception as e:
logger.debug(f'eval_stepper_fn error: {str(e)}')
logger.debug(traceback.format_exc())
evaluation = Score(
error=f'Evaluation of test case {datapoint.test_case_id} failed: \n{str(e)}',
status=EvalStatusEnum.FAIL,
)
eval_responses.append(
EvalFnResponse(
span_id=span_id,
trace_id=trace_id,
test_case_id=datapoint.test_case_id,
evaluation=evaluation,
)
)
# Raise to mark span as failed
raise e
except (AttributeError, UnboundLocalError):
# Fallback: run without span
try:
span.set_input(datapoint)
test_case_output = await fn(datapoint, req.options)
test_case_output.span_id = span_id
test_case_output.trace_id = trace_id
span.set_output(test_case_output)
eval_responses.append(test_case_output)
except Exception as e:
logger.debug(f'eval_stepper_fn error: {str(e)}')
Expand All @@ -383,14 +408,10 @@ async def eval_stepper_fn(req: EvalRequest) -> EvalResponse:
)
eval_responses.append(
EvalFnResponse(
span_id=span_id,
trace_id=trace_id,
test_case_id=datapoint.test_case_id,
evaluation=evaluation,
)
)
# Raise to mark span as failed
raise e
except Exception:
# Continue to process other points
continue
Expand Down
30 changes: 27 additions & 3 deletions py/packages/genkit/src/genkit/blocks/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,42 @@
from collections.abc import Callable
from typing import Any, TypeVar

from pydantic import BaseModel, ConfigDict, Field

from genkit.core.typing import (
BaseEvalDataPoint,
BaseDataPoint,
EvalFnResponse,
EvalRequest,
)

T = TypeVar('T')

# User-provided evaluator function that evaluates a single datapoint.
# type EvaluatorFn[T] = Callable[[BaseEvalDataPoint, T], EvalFnResponse]
EvaluatorFn = Callable[[BaseEvalDataPoint, T], EvalFnResponse]
# type EvaluatorFn[T] = Callable[[BaseDataPoint, T], EvalFnResponse]
EvaluatorFn = Callable[[BaseDataPoint, T], EvalFnResponse]

# User-provided batch evaluator function that evaluates an EvaluationRequest
# type BatchEvaluatorFn[T] = Callable[[EvalRequest, T], list[EvalFnResponse]]
BatchEvaluatorFn = Callable[[EvalRequest, T], list[EvalFnResponse]]


class EvaluatorRef(BaseModel):
    """A named reference to a registered evaluator.

    Bundles the evaluator's registry name with an optional configuration
    schema that callers can merge into per-run evaluation options.
    """

    model_config = ConfigDict(extra='forbid', populate_by_name=True)

    # Registry name of the evaluator action.
    name: str
    # Optional configuration schema; serialized under the `configSchema` alias.
    config_schema: Any | None = Field(default=None, alias='configSchema')


def evaluator_ref(name: str, config_schema: Any | None = None) -> EvaluatorRef:
    """Build a reference to a registered evaluator.

    Args:
        name: Registry name of the evaluator.
        config_schema: Optional schema describing the evaluator's
            configuration options.

    Returns:
        An :class:`EvaluatorRef` wrapping the given name and schema.
    """
    ref = EvaluatorRef(name=name, config_schema=config_schema)
    return ref
34 changes: 34 additions & 0 deletions py/packages/genkit/tests/genkit/veneer/veneer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from genkit.blocks.model import MessageWrapper, text_from_message
from genkit.core.action import ActionRunContext
from genkit.core.typing import (
BaseDataPoint,
BaseEvalDataPoint,
Details,
DocumentData,
Expand Down Expand Up @@ -1563,3 +1564,36 @@ async def my_flow(input: str, ctx):

assert chunks == [1, 2, 3]
assert (await response) == 'banana2'


@pytest.mark.asyncio
async def test_evaluate(setup_test: SetupFixture) -> None:
    """Test that the evaluate function works."""
    ai, _, _, *_ = setup_test

    async def always_true(datapoint: BaseDataPoint, options: Any | None):
        # Scores every datapoint as passing, regardless of input.
        return EvalFnResponse(
            test_case_id=datapoint.test_case_id,
            evaluation=Score(score=True, details=Details(reasoning='I think it is true')),
        )

    ai.define_evaluator(
        name='my_eval',
        display_name='Test evaluator',
        definition='Test evaluator that always returns True',
        fn=always_true,
    )

    dataset = [
        BaseDataPoint(input=text, output=text, test_case_id=case_id)
        for case_id, text in (('case1', 'hi'), ('case2', 'bye'))
    ]

    response = await ai.evaluate(evaluator='my_eval', dataset=dataset)

    assert isinstance(response, EvalResponse)
    assert len(response.root) == 2
    # Each datapoint should come back in order, scored True.
    for index, case_id in enumerate(('case1', 'case2')):
        assert response.root[index].test_case_id == case_id
        assert response.root[index].evaluation.score is True
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ omit = [
default-groups = ["dev", "lint"]

[tool.uv.sources]
evaluator-demo = { workspace = true }
genkit = { workspace = true }
genkit-plugin-anthropic = { workspace = true }
genkit-plugin-compat-oai = { workspace = true }
Expand All @@ -115,7 +116,6 @@ google-genai-hello = { workspace = true }
google-genai-image = { workspace = true }
prompt-demo = { workspace = true }


[tool.uv.workspace]
members = ["packages/*", "plugins/*", "samples/*"]

Expand Down
38 changes: 38 additions & 0 deletions py/samples/evaluator-demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Evaluator Demo

An example demonstrating how to define and run evaluators using the Google GenAI plugin.

## Setup environment

Obtain an API key from [ai.dev](https://ai.dev).

Export the API key as the environment variable `GEMINI_API_KEY` in your shell
configuration.

```bash
export GEMINI_API_KEY='<Your api key>'
```

## Run the sample

Start the Genkit Developer UI:

```bash
genkit start -- uv run src/eval_demo.py
```

## Evaluations

### Simple inference and evaluation

Use the `run_eval_demo` command to run a flow against a set of input samples and
evaluate the generated outputs. Provide `{}` as the JSON input.


## Run tests

To run the automated tests for this sample:

```bash
uv run pytest -v src/eval_demo.py
```
20 changes: 20 additions & 0 deletions py/samples/evaluator-demo/data/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"testCaseId": "case1",
"input": "How many states are in the US?",
"output": "There are 50 states in the US.",
"reference": "50"
},
{
"testCaseId": "case2",
"input": "What is the capital of France?",
"output": "The capital of France is Paris.",
"reference": "Paris"
},
{
"testCaseId": "case3",
    "input": "Who is Lily?",
"output": "Lily is a cat.",
"reference": "cat"
}
]
29 changes: 29 additions & 0 deletions py/samples/evaluator-demo/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

[project]
dependencies = ["genkit", "pydantic>=2.0.0", "structlog>=24.0.0"]
description = "Genkit Python Evaluation Demo"
name = "eval-demo"
requires-python = ">=3.10"
version = "0.0.1"

[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]

[tool.hatch.build.targets.wheel]
packages = ["src"]
Loading
Loading