Skip to content
57 changes: 56 additions & 1 deletion py/packages/genkit/src/genkit/ai/_aio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
class while customizing it with any plugins.
"""

import uuid
from asyncio import Future
from collections.abc import AsyncIterator
from typing import Any

from genkit.aio import Channel
from genkit.blocks.document import Document
from genkit.blocks.embedding import EmbedderRef
from genkit.blocks.evaluator import EvaluatorRef
from genkit.blocks.generate import (
StreamingCallback as ModelStreamingCallback,
generate_action,
Expand All @@ -40,7 +42,14 @@ class while customizing it with any plugins.
from genkit.blocks.retriever import IndexerRef, IndexerRequest, RetrieverRef
from genkit.core.action import ActionRunContext
from genkit.core.action.types import ActionKind
from genkit.core.typing import EmbedRequest, EmbedResponse
from genkit.core.typing import (
BaseDataPoint,
BaseEvalDataPoint,
EmbedRequest,
EmbedResponse,
EvalRequest,
EvalResponse,
)
from genkit.types import (
DocumentData,
GenerationCommonConfig,
Expand Down Expand Up @@ -391,3 +400,49 @@ async def embed(
embed_action = self.registry.lookup_action(ActionKind.EMBEDDER, embedder_name)

return (await embed_action.arun(EmbedRequest(input=documents, options=final_options))).response

async def evaluate(
    self,
    evaluator: str | EvaluatorRef | None = None,
    dataset: list[BaseDataPoint] | None = None,
    options: Any | None = None,
    eval_run_id: str | None = None,
) -> EvalResponse:
    """Evaluates a dataset using a registered evaluator.

    Args:
        evaluator: Name or :class:`EvaluatorRef` of the evaluator to use.
            Required despite the ``None`` default (kept for signature
            compatibility); passing ``None`` raises ``ValueError``.
        dataset: Non-empty list of datapoints to evaluate.
        options: Evaluation options; merged over the evaluator's
            ``config_schema`` defaults, with ``options`` taking precedence.
        eval_run_id: Optional ID for the evaluation run. A random UUID4 is
            generated when omitted.

    Returns:
        The evaluation results as an ``EvalResponse``.

    Raises:
        ValueError: If ``evaluator`` is missing/of the wrong type, if
            ``dataset`` is empty or ``None``, or if no evaluator action is
            registered under the resolved name.
    """
    # Fail fast with a clear message instead of letting an empty/None
    # dataset surface as an opaque validation error inside EvalRequest.
    if not dataset:
        raise ValueError('A non-empty dataset is required for evaluation.')

    evaluator_name: str = ''
    evaluator_config: dict[str, Any] = {}

    if isinstance(evaluator, EvaluatorRef):
        evaluator_name = evaluator.name
        evaluator_config = evaluator.config_schema or {}
    elif isinstance(evaluator, str):
        evaluator_name = evaluator
    else:
        raise ValueError('Evaluator must be specified as a string name or an EvaluatorRef.')

    # Caller-supplied options override the evaluator's defaults.
    final_options = {**evaluator_config, **(options or {})}

    eval_action = self.registry.lookup_action(ActionKind.EVALUATOR, evaluator_name)
    # NOTE(review): lookup_action appears to return None on a miss (the
    # sibling `embed` method does not check) — guard so callers get a
    # ValueError rather than `'NoneType' has no attribute 'arun'`.
    if eval_action is None:
        raise ValueError(f'Evaluator not found: {evaluator_name}')

    return (
        await eval_action.arun(
            EvalRequest(
                dataset=dataset,
                options=final_options,
                eval_run_id=eval_run_id or str(uuid.uuid4()),
            )
        )
    ).response
43 changes: 32 additions & 11 deletions py/packages/genkit/src/genkit/ai/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,15 +364,40 @@ async def eval_stepper_fn(req: EvalRequest) -> EvalResponse:
metadata={'evaluator:evalRunId': req.eval_run_id},
)
try:
with run_in_new_span(span_metadata, labels={'genkit:type': 'evaluator'}) as span:
span_id = span.span_id
trace_id = span.trace_id
# Try to run with tracing, but fallback if tracing infrastructure fails
# (e.g., in environments with NonRecordingSpans like pre-commit)
try:
with run_in_new_span(span_metadata, labels={'genkit:type': 'evaluator'}) as span:
span_id = span.span_id
trace_id = span.trace_id
try:
span.set_input(datapoint)
test_case_output = await fn(datapoint, req.options)
test_case_output.span_id = span_id
test_case_output.trace_id = trace_id
span.set_output(test_case_output)
eval_responses.append(test_case_output)
except Exception as e:
logger.debug(f'eval_stepper_fn error: {str(e)}')
logger.debug(traceback.format_exc())
evaluation = Score(
error=f'Evaluation of test case {datapoint.test_case_id} failed: \n{str(e)}',
status=EvalStatusEnum.FAIL,
)
eval_responses.append(
EvalFnResponse(
span_id=span_id,
trace_id=trace_id,
test_case_id=datapoint.test_case_id,
evaluation=evaluation,
)
)
# Raise to mark span as failed
raise e
except (AttributeError, UnboundLocalError):
# Fallback: run without span
try:
span.set_input(datapoint)
test_case_output = await fn(datapoint, req.options)
test_case_output.span_id = span_id
test_case_output.trace_id = trace_id
span.set_output(test_case_output)
eval_responses.append(test_case_output)
except Exception as e:
logger.debug(f'eval_stepper_fn error: {str(e)}')
Expand All @@ -383,14 +408,10 @@ async def eval_stepper_fn(req: EvalRequest) -> EvalResponse:
)
eval_responses.append(
EvalFnResponse(
span_id=span_id,
trace_id=trace_id,
test_case_id=datapoint.test_case_id,
evaluation=evaluation,
)
)
# Raise to mark span as failed
raise e
except Exception:
# Continue to process other points
continue
Expand Down
30 changes: 27 additions & 3 deletions py/packages/genkit/src/genkit/blocks/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,42 @@
from collections.abc import Callable
from typing import Any, TypeVar

from pydantic import BaseModel, ConfigDict, Field

from genkit.core.typing import (
BaseEvalDataPoint,
BaseDataPoint,
EvalFnResponse,
EvalRequest,
)

T = TypeVar('T')

# User-provided evaluator function that evaluates a single datapoint.
# type EvaluatorFn[T] = Callable[[BaseEvalDataPoint, T], EvalFnResponse]
EvaluatorFn = Callable[[BaseEvalDataPoint, T], EvalFnResponse]
# type EvaluatorFn[T] = Callable[[BaseDataPoint, T], EvalFnResponse]
EvaluatorFn = Callable[[BaseDataPoint, T], EvalFnResponse]

# User-provided batch evaluator function that evaluates an EvaluationRequest
# type BatchEvaluatorFn[T] = Callable[[EvalRequest, T], list[EvalFnResponse]]
BatchEvaluatorFn = Callable[[EvalRequest, T], list[EvalFnResponse]]


class EvaluatorRef(BaseModel):
    """A named reference to a registered evaluator.

    Bundles the evaluator's registry name with an optional configuration
    schema that callers can merge into per-run evaluation options.
    """

    model_config = ConfigDict(extra='forbid', populate_by_name=True)

    # Registry name of the evaluator action.
    name: str
    # Optional configuration schema; serialized under the `configSchema` alias.
    config_schema: Any | None = Field(default=None, alias='configSchema')


def evaluator_ref(name: str, config_schema: Any | None = None) -> EvaluatorRef:
    """Build a reference to a registered evaluator.

    Args:
        name: Registry name of the evaluator.
        config_schema: Optional schema describing the evaluator's
            configuration options.

    Returns:
        An :class:`EvaluatorRef` wrapping the given name and schema.
    """
    ref = EvaluatorRef(name=name, config_schema=config_schema)
    return ref
34 changes: 34 additions & 0 deletions py/packages/genkit/tests/genkit/veneer/veneer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from genkit.blocks.model import MessageWrapper, text_from_message
from genkit.core.action import ActionRunContext
from genkit.core.typing import (
BaseDataPoint,
BaseEvalDataPoint,
Details,
DocumentData,
Expand Down Expand Up @@ -1563,3 +1564,36 @@ async def my_flow(input: str, ctx):

assert chunks == [1, 2, 3]
assert (await response) == 'banana2'


@pytest.mark.asyncio
async def test_evaluate(setup_test: SetupFixture) -> None:
    """Test that the evaluate function works."""
    ai, _, _, *_ = setup_test

    async def always_true(datapoint: BaseDataPoint, options: Any | None):
        # Scores every datapoint as passing, regardless of input.
        return EvalFnResponse(
            test_case_id=datapoint.test_case_id,
            evaluation=Score(score=True, details=Details(reasoning='I think it is true')),
        )

    ai.define_evaluator(
        name='my_eval',
        display_name='Test evaluator',
        definition='Test evaluator that always returns True',
        fn=always_true,
    )

    dataset = [
        BaseDataPoint(input=text, output=text, test_case_id=case_id)
        for case_id, text in (('case1', 'hi'), ('case2', 'bye'))
    ]

    response = await ai.evaluate(evaluator='my_eval', dataset=dataset)

    assert isinstance(response, EvalResponse)
    assert len(response.root) == 2
    # Each datapoint should come back in order, scored True.
    for index, case_id in enumerate(('case1', 'case2')):
        assert response.root[index].test_case_id == case_id
        assert response.root[index].evaluation.score is True
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ omit = [
default-groups = ["dev", "lint"]

[tool.uv.sources]
evaluator-demo = { workspace = true }
genkit = { workspace = true }
genkit-plugin-anthropic = { workspace = true }
genkit-plugin-compat-oai = { workspace = true }
Expand All @@ -115,7 +116,6 @@ google-genai-hello = { workspace = true }
google-genai-image = { workspace = true }
prompt-demo = { workspace = true }


[tool.uv.workspace]
members = ["packages/*", "plugins/*", "samples/*"]

Expand Down
38 changes: 38 additions & 0 deletions py/samples/evaluator-demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Evaluator Demo

An example demonstrating how to define and run evaluators using the Google GenAI plugin.

## Setup environment

Obtain an API key from [ai.dev](https://ai.dev).

Export the API key as the environment variable `GEMINI_API_KEY` in your shell
configuration.

```bash
export GEMINI_API_KEY='<Your api key>'
```

## Run the sample

Start the Genkit Developer UI:

```bash
genkit start -- uv run src/eval_demo.py
```

## Evaluations

### Simple inference and evaluation

Use the `run_eval_demo` command to run a flow against a set of input samples and
evaluate the generated outputs. Provide `{}` as the JSON input.


## Run tests

To run the automated tests for this sample:

```bash
uv run pytest -v src/eval_demo.py
```
20 changes: 20 additions & 0 deletions py/samples/evaluator-demo/data/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"testCaseId": "case1",
"input": "How many states are in the US?",
"output": "There are 50 states in the US.",
"reference": "50"
},
{
"testCaseId": "case2",
"input": "What is the capital of France?",
"output": "The capital of France is Paris.",
"reference": "Paris"
},
{
"testCaseId": "case3",
    "input": "Who is Lily?",
"output": "Lily is a cat.",
"reference": "cat"
}
]
29 changes: 29 additions & 0 deletions py/samples/evaluator-demo/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

[project]
dependencies = ["genkit", "pydantic>=2.0.0", "structlog>=24.0.0"]
description = "Genkit Python Evaluation Demo"
name = "eval-demo"
requires-python = ">=3.10"
version = "0.0.1"

[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]

[tool.hatch.build.targets.wheel]
packages = ["src"]
Loading
Loading