Commit 7b4abaa

fix(py): add more test flows in evaluator demo to match JS SDK (#4015)
Co-authored-by: Mengqin Shen <mengqin@google.com>
1 parent b903baf commit 7b4abaa

File tree: 19 files changed (+698, -140 lines)


py/packages/genkit/src/genkit/core/reflection.py

Lines changed: 4 additions & 4 deletions
@@ -185,7 +185,7 @@ def send_chunk(chunk):
 
         async def run_fn():
             return await action.arun_raw(
-                raw_input=payload['input'],
+                raw_input=payload.get('input'),
                 on_chunk=send_chunk,
                 context=context,
             )
@@ -216,7 +216,7 @@ async def run_fn():
     try:
 
         async def run_fn():
-            return await action.arun_raw(raw_input=payload['input'], context=context)
+            return await action.arun_raw(raw_input=payload.get('input'), context=context)
 
         output = run_async(loop, run_fn)
 
@@ -413,7 +413,7 @@ async def send_chunk(chunk):
            yield f'{out}\n'
 
        output = await action.arun_raw(
-            raw_input=payload['input'],
+            raw_input=payload.get('input'),
            on_chunk=send_chunk,
            context=context,
        )
@@ -462,7 +462,7 @@ async def run_standard_action(
        A JSONResponse with the action result or error.
    """
    try:
-        output = await action.arun_raw(raw_input=payload['input'], context=context)
+        output = await action.arun_raw(raw_input=payload.get('input'), context=context)
        response = {
            'result': dump_dict(output.response),
            'telemetry': {'traceId': output.trace_id},
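The recurring change in this file swaps dictionary indexing for `dict.get`, so a request whose payload omits `input` is passed through as `None` instead of raising `KeyError`. A minimal standalone sketch of the difference (not the SDK code, just the dictionary behavior being relied on):

```python
# Behavior of ['input'] vs .get('input') on a payload without an input field.
payload = {}  # e.g. an action invoked with no input

value = payload.get('input')   # -> None; the action can treat this as "no input"
print(value)

try:
    payload['input']           # raises KeyError and would surface as a server error
except KeyError as exc:
    print(f'KeyError: {exc}')
```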

py/plugins/dev-local-vectorstore/src/genkit/plugins/dev_local_vectorstore/indexer.py

Lines changed: 3 additions & 1 deletion
@@ -20,6 +20,7 @@
 from hashlib import md5
 
 from genkit.blocks.document import Document
+from genkit.blocks.retriever import IndexerRequest
 from genkit.codec import dump_json
 from genkit.types import DocumentData, Embedding
 
@@ -30,7 +31,8 @@
 
 
 class DevLocalVectorStoreIndexer(LocalVectorStoreAPI):
-    async def index(self, docs: list[DocumentData]) -> None:
+    async def index(self, request: IndexerRequest) -> None:
+        docs = request.documents
         data = self._load_filestore()
         tasks = []
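The local vector store indexer now receives a request object and unwraps its `documents` field instead of taking the document list directly. A rough sketch of the old versus new call shape, using stand-in types (the `documents` attribute comes from the diff; the real `IndexerRequest` constructor and document fields are assumptions):

```python
from dataclasses import dataclass, field

# Stand-in types for illustration only; the real classes live in the genkit packages.
@dataclass
class StubDocument:
    text: str

@dataclass
class StubIndexerRequest:
    documents: list[StubDocument] = field(default_factory=list)

async def index(request: StubIndexerRequest) -> None:
    docs = request.documents  # the unwrapping step the diff adds
    print(f'indexing {len(docs)} documents')

# Old shape: await indexer.index([StubDocument('hello')])
# New shape: await indexer.index(StubIndexerRequest(documents=[StubDocument('hello')]))
```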

py/plugins/evaluators/src/genkit/plugins/evaluators/plugin_api.py

Lines changed: 69 additions & 28 deletions
@@ -16,6 +16,7 @@
 
 
 import json
+import os
 import re
 from collections.abc import Callable
 from typing import Any
@@ -37,6 +38,12 @@
 from genkit.types import BaseEvalDataPoint, EvalFnResponse, EvalStatusEnum, Score
 
 
+def _get_prompt_path(filename: str) -> str:
+    """Get absolute path to a prompt file in the prompts directory."""
+    plugin_dir = os.path.dirname(os.path.abspath(__file__))
+    return os.path.join(plugin_dir, '..', '..', '..', '..', 'prompts', filename)
+
+
 def evaluators_name(name: str) -> str:
     """Create an evaluators plugin name.
 
@@ -67,8 +74,10 @@ class GenkitEvaluators(Plugin):
 
     name = 'genkitEval'
 
-    def __init__(self, params: PluginOptions):
+    def __init__(self, params: PluginOptions | list[MetricConfig]):
         """Initialize Genkit Evaluators plugin."""
+        if isinstance(params, list):
+            params = PluginOptions(root=params)
         self.params = params
 
     def initialize(self, ai: Genkit) -> None:
@@ -84,20 +93,18 @@ def _configure_evaluator(self, ai: Genkit, param: MetricConfig):
 
                 async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
                     assert datapoint.output is not None, 'output is required'
-                    assert datapoint.reference is not None, 'reference is required'
-                    assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
                     output_string = (
                         datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
                     )
                     input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
-                    prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
+                    prompt_function = await load_prompt_file(_get_prompt_path('faithfulness_long_form.prompt'))
                     context = ' '.join(json.dumps(e) for e in datapoint.context)
                     prompt = await render_text(
                         prompt_function, {'input': input_string, 'output': output_string, 'context': context}
                     )
 
                     response = await ai.generate(
-                        model=param.judge,
+                        model=param.judge.name,
                         prompt=prompt,
                         config=param.config,
                         output_schema=AnswerRelevancyResponseSchema,
@@ -107,54 +114,90 @@ async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
                     return fill_scores(datapoint, Score(score=score, status=status), param.status_override_fn)
 
                 ai.define_evaluator(
-                    name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
+                    name=evaluators_name(str(GenkitMetricType.ANSWER_RELEVANCY).lower()),
                     display_name='Answer Relevancy',
                     definition='Assesses how pertinent the generated answer is to the given prompt',
                     fn=_relevancy_eval,
                 )
             case GenkitMetricType.FAITHFULNESS:
+                # Cache for prompts (loaded on first use)
+                _faithfulness_prompts = {}
 
                 async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
                     assert datapoint.output is not None, 'output is required'
-                    assert datapoint.reference is not None, 'reference is required'
-                    assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
                     output_string = (
                         datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
                     )
                     input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
-                    prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
-                    context = ' '.join(json.dumps(e) for e in datapoint.context)
+                    context_list = [(json.dumps(e) if not isinstance(e, str) else e) for e in (datapoint.context or [])]
+
+                    # Lazy load and cache prompts
+                    if 'longform' not in _faithfulness_prompts:
+                        _faithfulness_prompts['longform'] = await load_prompt_file(
+                            _get_prompt_path('faithfulness_long_form.prompt')
+                        )
+                    if 'nli' not in _faithfulness_prompts:
+                        _faithfulness_prompts['nli'] = await load_prompt_file(
+                            _get_prompt_path('faithfulness_nli.prompt')
+                        )
+
+                    # Step 1: Extract statements
                     prompt = await render_text(
-                        prompt_function, {'input': input_string, 'output': output_string, 'context': context}
+                        _faithfulness_prompts['longform'], {'question': input_string, 'answer': output_string}
                     )
-
                     longform_response = await ai.generate(
-                        model=param.judge_llm,
+                        model=param.judge.name,
                         prompt=prompt,
-                        config=param.config,
+                        config=param.judge_config,
                         output_schema=LongFormResponseSchema,
                     )
+                    statements = (
+                        longform_response.output.get('statements', [])
+                        if isinstance(longform_response.output, dict)
+                        else (longform_response.output.statements if longform_response.output else [])
+                    )
+                    if not statements:
+                        raise ValueError('No statements returned')
 
-                    prompt_function = await load_prompt_file('../../prompts/faithfulness_nli.prompt')
-                    context = ' '.join(json.dumps(e) for e in datapoint.context)
+                    # Step 2: NLI Check
+                    all_statements = '\n'.join([f'statement: {s}' for s in statements])
+                    all_context = '\n'.join(context_list)
                     prompt = await render_text(
-                        prompt_function, {'input': input_string, 'output': output_string, 'context': context}
+                        _faithfulness_prompts['nli'], {'context': all_context, 'statements': all_statements}
                     )
 
-                    longform_response = await ai.generate(
-                        model=param.judge_llm,
+                    nli_response = await ai.generate(
+                        model=param.judge.name,
                         prompt=prompt,
-                        config=param.config,
-                        output_schema=LongFormResponseSchema,
+                        config=param.judge_config,
+                        output_schema=NliResponse,
+                    )
+
+                    nli_output = nli_response.output
+                    if isinstance(nli_output, dict):
+                        responses = nli_output.get('responses', [])
+                    else:
+                        responses = nli_output.responses if nli_output else []
+
+                    if not responses:
+                        raise ValueError('Evaluator response empty')
+
+                    # Handle both dict and object responses
+                    faithful_count = sum(
+                        1 for r in responses if (r.get('verdict') if isinstance(r, dict) else r.verdict)
                     )
+                    score_val = faithful_count / len(responses)
+                    reasoning = '; '.join([r.get('reason', '') if isinstance(r, dict) else r.reason for r in responses])
+                    status = EvalStatusEnum.PASS_ if score_val > 0.5 else EvalStatusEnum.FAIL
 
-                    status = EvalStatusEnum.PASS_ if longform_response else EvalStatusEnum.FAIL
                     return fill_scores(
-                        datapoint, Score(score=longform_response, status=status), param.status_override_fn
+                        datapoint,
+                        Score(score=score_val, status=status, details={'reasoning': reasoning}),
+                        param.status_override_fn,
                     )
 
                 ai.define_evaluator(
-                    name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
+                    name=evaluators_name(str(GenkitMetricType.FAITHFULNESS).lower()),
                     display_name='Faithfulness',
                     definition='Measures the factual consistency of the generated answer against the given context',
                     fn=_faithfulness_eval,
@@ -164,20 +207,18 @@ async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
 
                 async def _maliciousness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
                     assert datapoint.output is not None, 'output is required'
-                    assert datapoint.reference is not None, 'reference is required'
-                    assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
                     output_string = (
                         datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
                     )
                     input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
-                    prompt_function = await load_prompt_file('../../prompts/maliciousness.prompt')
+                    prompt_function = await load_prompt_file(_get_prompt_path('maliciousness.prompt'))
                     context = ' '.join(json.dumps(e) for e in datapoint.context)
                     prompt = await render_text(
                         prompt_function, {'input': input_string, 'output': output_string, 'context': context}
                     )
 
                     score = await ai.generate(
-                        model=param.judge_llm,
+                        model=param.judge.name,
                         prompt=prompt,
                         config=param.config,
                         output_schema=MaliciousnessResponseSchema,
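The rewritten faithfulness evaluator is now a two-step judge: one call extracts discrete statements from the answer, a second call returns an NLI verdict per statement, and the score is the fraction of supported statements, with a pass above 0.5. A tiny standalone sketch of that scoring step (dummy verdicts, not the plugin code):

```python
# Standalone sketch of the scoring logic introduced in the diff.
responses = [
    {'verdict': True, 'reason': 'Backed by the provided context.'},
    {'verdict': False, 'reason': 'Not found in the context.'},
    {'verdict': True, 'reason': 'Paraphrases the context.'},
]

faithful_count = sum(1 for r in responses if r['verdict'])
score_val = faithful_count / len(responses)          # 2/3 ~= 0.67
status = 'PASS' if score_val > 0.5 else 'FAIL'
reasoning = '; '.join(r['reason'] for r in responses)
print(score_val, status)
print(reasoning)
```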
Lines changed: 61 additions & 17 deletions
@@ -1,38 +1,82 @@
-# Evaluator Demo
+# Evaluation in Genkit
 
-An example demonstrating running flows using the Google GenAI plugin.
+This sample demonstrates the different evaluation features using Genkit Python SDK.
 
-## Setup environment
+Note: This sample focuses on evaluation features in Genkit, by utilizing the official Genkit Evaluators plugin. If you are interested in writing your own custom evaluator, please check the `custom/test_evaluator` defined in `src/index.py`.
 
-Obtain an API key from [ai.dev](https://ai.dev).
-
-Export the API key as env variable `GEMINI\_API\_KEY` in your shell
-configuration.
+## Setup and start the sample
 
 ```bash
-export GEMINI_API_KEY='<Your api key>'
+
+# Start the Genkit Dev UI
+genkit start -- uv run samples/evaluator-demo/src/index.py
+# This command should output the link to the Genkit Dev UI.
 ```
 
-## Run the sample
+The rest of the commands in this guide can be run in a separate terminal or directly in the Dev UI.
 
-Start the Genkit Developer UI:
+### Initial Setup
 
 ```bash
-genkit start -- uv run src/eval_demo.py
+# Index "docs/cat-handbook.pdf" to start
+# testing Genkit evaluation features. Please see
+# src/setup.py for more details.
+
+genkit flow:run setup
 ```
 
 ## Evaluations
 
-### Simple inference and evaluation
+### Running Evaluations via CLI
+
+Use the `eval:flow` command to run a flow against a dataset and evaluate the outputs:
+
+```bash
+# Evaluate with a specific evaluator
+genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=custom/test_evaluator
+
+# Evaluate with multiple evaluators
+genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=genkitEval/faithfulness --evaluator=genkitEval/maliciousness
 
-Use the `run_eval_demo` command to run a flow against a set of input samples and
-evaluate the generated outputs. Input (JSON) = "{}"
+# Evaluate with all available evaluators (omit --evaluator flag)
+genkit eval:flow pdf_qa --input data/cat_adoption_questions.json
+```
+
+### Running Evaluations in Dev UI
 
+1. Navigate to the **Evaluations** tab in the Dev UI
+2. Click **"Run Evaluation"** or **"New Evaluation"**
+3. Configure:
+   - **Flow**: Select the flow to evaluate (e.g., `pdf_qa`)
+   - **Dataset**: Upload or select a JSON file (e.g., `data/cat_adoption_questions.json`)
+   - **Evaluators**: Select one or more evaluators:
+     - `custom/test_evaluator` - Random evaluator for testing (fast, no LLM calls)
+     - `genkitEval/faithfulness` - Checks if output is faithful to context
+     - `genkitEval/maliciousness` - Detects harmful content
+     - `genkitEval/answer_relevancy` - Checks if answer is relevant to question
+4. Click **"Run"**
+5. View results in the Evaluations tab
 
-## Run tests
+### Programmatic Evaluation
 
-To run the automated tests for this sample:
+The `dog_facts_eval` flow demonstrates running evaluations from code. See `src/eval_in_code.py` for implementation details.
 
 ```bash
-uv run pytest -v src/eval_demo.py
+# Run programmatic evaluation
+genkit flow:run dog_facts_eval
 ```
+
+**Note:** The `dog_facts_eval` flow evaluates 20 test cases with the faithfulness metric, making 40 LLM API calls. This takes approximately 5 minutes to complete.
+
+## Available Flows
+
+- **setup**: Indexes the default PDF document (`docs/cat-handbook.pdf`) into the vector store
+- **index_pdf**: Indexes a specified PDF file (defaults to `docs/cat-wiki.pdf`)
+- **pdf_qa**: RAG flow that answers questions based on indexed PDF documents. It requires `setup` flow run first.
+- **simple_structured**: Simple flow with structured input/output
+- **simple_echo**: Simple echo flow
+- **dog_facts_eval**: Programmatic evaluation flow using the faithfulness metric on a dog facts dataset
+
+## Reference
+
+For more details on using Genkit evaluations, please refer to the official [Genkit documentation](https://firebase.google.com/docs/genkit/evaluation).
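The note's call count lines up with the faithfulness evaluator added in this commit, which issues two judge calls per datapoint (statement extraction, then the NLI verdict). A quick back-of-the-envelope check:

```python
# Rough arithmetic behind the README's "40 LLM API calls" note, assuming the
# two generate() calls per datapoint that the faithfulness evaluator makes.
test_cases = 20
judge_calls_per_case = 2  # step 1: extract statements, step 2: NLI check
print(test_cases * judge_calls_per_case)  # 40
```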

py/samples/evaluator-demo/data/capra-test.json

Lines changed: 7 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{"input":"What are typical cat behaviors?"}
+{"input":"What supplies do you need when bringing home a new cat?"}
+{"input":"How often should you trim your cat's nails?"}
+{"input":"What are some plants that are toxic to cats?"}
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+[
+  {
+    "input": "What are typical cat behaviors?",
+    "reference": "Cats like to purr, push things away and cuddle."
+  },
+  {
+    "input": "What supplies do you need when bringing home a new cat?",
+    "reference": "Litter box, cat food and plenty of yarn"
+  },
+  {
+    "input": "How often should you trim your cat's nails?",
+    "reference": "Trim your cat's nails only when you feel like they're overgrown"
+  },
+  {
+    "input": "What are some plants that are toxic to cats?",
+    "reference": "I don't know, maybe poison ivy?"
+  }
+]
