"""Runs a benchmark test dataset as a series of prompts against a selected model.  It can be used to test
any model type over a longer-running series of prompts, as well as the fact-checking capability. """


import time
import random

from llmware.prompts import Prompt
from datasets import load_dataset


def load_rag_benchmark_tester_dataset():
    """ Loads the benchmark dataset used in the prompt test. """

    dataset_name = "llmware/rag_instruct_benchmark_tester"
    print(f"\n > Loading RAG dataset '{dataset_name}'...")
    dataset = load_dataset(dataset_name)

    # flatten the 'train' split into a simple list of samples
    test_set = [sample for sample in dataset["train"]]

    return test_set


# Run the benchmark test
def test_prompt_rag_benchmark():

    test_dataset = load_rag_benchmark_tester_dataset()

    # SELECTED MODELS - one will be chosen at random below
    selected_test_models = ["llmware/bling-1b-0.1", "llmware/bling-1.4b-0.1", "llmware/bling-falcon-1b-0.1",
                            "llmware/bling-tiny-llama-v0",
                            "bling-phi-3-gguf", "bling-answer-tool", "dragon-yi-answer-tool",
                            "dragon-llama-answer-tool", "dragon-mistral-answer-tool"]

    # randomly select one model from the list
    model_name = random.choice(selected_test_models)

    print(f"\n > Loading model '{model_name}'")
    prompter = Prompt().load_model(model_name)

    print(f"\n > Running RAG Benchmark Test against '{model_name}' - {len(test_dataset)} questions")
    for i, entry in enumerate(test_dataset):

        start_time = time.time()

        prompt = entry["query"]
        context = entry["context"]
        response = prompter.prompt_main(prompt, context=context, prompt_name="default_with_context", temperature=0.3)

        assert response is not None

        # Print results
        time_taken = round(time.time() - start_time, 2)
        print("\n")
        print(f"{i + 1}. llm_response - {response['llm_response']}")
        print(f"{i + 1}. gold_answer - {entry['answer']}")
        print(f"{i + 1}. time_taken - {time_taken}")

        # Fact checking - compare the response against the context passage
        fc = prompter.evidence_check_numbers(response)
        sc = prompter.evidence_comparison_stats(response)
        sr = prompter.evidence_check_sources(response)

        for fc_entry in fc:
            for f, facts in enumerate(fc_entry["fact_check"]):
                print(f"{i + 1}. fact_check - {f} {facts}")

        for sc_entry in sc:
            print(f"{i + 1}. comparison_stats - {sc_entry['comparison_stats']}")

        for sr_entry in sr:
            for s, source in enumerate(sr_entry["source_review"]):
                print(f"{i + 1}. source - {s} {source}")

    return 0
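

# optional entry point - a minimal convenience hook (an addition, assuming direct execution is useful) so the
# benchmark can also be launched as a script, in addition to being collected by a test runner such as pytest
if __name__ == "__main__":

    test_prompt_rag_benchmark()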