
Commit 9b2e5e5

DARREN OBERST authored and committed
updating tests for models and prompts
1 parent a3043e6 commit 9b2e5e5

File tree

3 files changed (+158 lines, -34 lines)

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
"""Tests the execution of a multi-step Agent process using multiple SLIM models."""

from llmware.agents import LLMfx


def test_multistep_agent_process():

    # sample customer transcript
    customer_transcript = "My name is Michael Jones, and I am a long-time customer. " \
                          "The Mixco product is not working currently, and it is having a negative impact " \
                          "on my business, as we can not deliver our products while it is down. " \
                          "This is the fourth time that I have called. My account number is 93203, and " \
                          "my user name is mjones. Our company is based in Tampa, Florida."

    # create an agent using the LLMfx class
    agent = LLMfx()

    agent.load_work(customer_transcript)

    # load tools individually
    agent.load_tool("sentiment")
    agent.load_tool("ner")

    # load multiple tools at once
    agent.load_tool_list(["emotions", "topics", "intent", "tags", "ratings", "answer"])

    # start deploying tools and running various analytics

    # first, conduct three 'soft skills' initial assessments using three different models
    agent.sentiment()
    agent.emotions()
    agent.intent()

    # alternative way to execute a tool, passing the tool name as a string
    agent.exec_function_call("ratings")

    # call multiple tools concurrently
    agent.exec_multitool_function_call(["ner", "topics", "tags"])

    # the 'answer' tool is a quantized question-answering model - ask an 'inline' question
    # the optional 'key' assigns the output to a dictionary key for easy consolidation
    agent.answer("What is a short summary?", key="summary")

    # prompting tool to ask a quick question as part of the analytics
    response = agent.answer("What is the customer's account number and user name?", key="customer_info")

    # 'unload_tool' releases a tool from memory
    agent.unload_tool("ner")
    agent.unload_tool("topics")

    # at the end of processing, show the report that was automatically aggregated by key
    report = agent.show_report()

    # display a summary of the activity in the process
    activity_summary = agent.activity_summary()

    # list of the responses gathered
    for i, entries in enumerate(agent.response_list):
        print("update: response analysis: ", i, entries)
        assert entries is not None

    assert activity_summary is not None
    assert agent.journal is not None
    assert report is not None

    output = {"report": report, "activity_summary": activity_summary, "journal": agent.journal}

    return output
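
For quick local runs outside a test runner, the function above can also be called directly. The short sketch below is added here for illustration and is not part of the committed test; it only assumes that llmware and the SLIM tool models are installed locally, and it reuses the output dictionary returned by the test.

if __name__ == "__main__":

    # run the agent workflow directly and inspect the aggregated outputs
    results = test_multistep_agent_process()

    print("report: ", results["report"])
    print("activity_summary: ", results["activity_summary"])
    print("journal: ", results["journal"])
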
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
"""This runs a benchmark test dataset against a series of prompts. It can be used to test any model type with a
longer-running series of prompts, as well as the fact-checking capability."""

import time
import random

from llmware.prompts import Prompt
from datasets import load_dataset


def load_rag_benchmark_tester_dataset():

    """ Loads the benchmark dataset used in the prompt test. """

    dataset_name = "llmware/rag_instruct_benchmark_tester"
    print(f"\n > Loading RAG dataset '{dataset_name}'...")
    dataset = load_dataset(dataset_name)

    test_set = []
    for i, samples in enumerate(dataset["train"]):
        test_set.append(samples)

    return test_set


# Run the benchmark test
def test_prompt_rag_benchmark():

    test_dataset = load_rag_benchmark_tester_dataset()

    # SELECTED MODELS
    selected_test_models = ["llmware/bling-1b-0.1", "llmware/bling-1.4b-0.1", "llmware/bling-falcon-1b-0.1",
                            "llmware/bling-tiny-llama-v0",
                            "bling-phi-3-gguf", "bling-answer-tool", "dragon-yi-answer-tool",
                            "dragon-llama-answer-tool", "dragon-mistral-answer-tool"]

    # randomly select one model from the list
    r = random.randint(0, len(selected_test_models) - 1)
    model_name = selected_test_models[r]

    print(f"\n > Loading model '{model_name}'")
    prompter = Prompt().load_model(model_name)

    print(f"\n > Running RAG Benchmark Test against '{model_name}' - 200 questions")

    for i, entry in enumerate(test_dataset):

        start_time = time.time()

        prompt = entry["query"]
        context = entry["context"]
        response = prompter.prompt_main(prompt, context=context, prompt_name="default_with_context", temperature=0.3)

        assert response is not None

        # print results
        time_taken = round(time.time() - start_time, 2)
        print("\n")
        print(f"{i + 1}. llm_response - {response['llm_response']}")
        print(f"{i + 1}. gold_answer - {entry['answer']}")
        print(f"{i + 1}. time_taken - {time_taken}")

        # fact checking
        fc = prompter.evidence_check_numbers(response)
        sc = prompter.evidence_comparison_stats(response)
        sr = prompter.evidence_check_sources(response)

        for fc_entry in fc:
            for f, facts in enumerate(fc_entry["fact_check"]):
                print(f"{i + 1}. fact_check - {f} {facts}")

        for sc_entry in sc:
            print(f"{i + 1}. comparison_stats - {sc_entry['comparison_stats']}")

        for sr_entry in sr:
            for s, source in enumerate(sr_entry["source_review"]):
                print(f"{i + 1}. source - {s} {source}")

    return 0
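
For faster iteration while developing, a single benchmark record can be exercised with the same calls used in the loop above. The sketch below is illustrative only and not part of the committed test; it assumes a local llmware install, picks one model name from the list above, and trims the dataset to its first record (each record carries the query, context, and answer fields used in the test).

if __name__ == "__main__":

    # spot-check one record before committing to the full 200-question run
    sample = load_rag_benchmark_tester_dataset()[0]

    prompter = Prompt().load_model("bling-answer-tool")
    response = prompter.prompt_main(sample["query"], context=sample["context"],
                                    prompt_name="default_with_context", temperature=0.3)

    print("llm_response: ", response["llm_response"])
    print("gold_answer: ", sample["answer"])
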

tests/prompts/test_fact_checking.py

Lines changed: 0 additions & 34 deletions
This file was deleted.
