Skip to content

Commit 7d1eb1b

Browse files
authored
Merge pull request llmware-ai#700 from llmware-ai/update-tests-embeddings
updating embedding tests
2 parents 700903e + c7a6328 commit 7d1eb1b

File tree

9 files changed

+168
-169
lines changed

9 files changed

+168
-169
lines changed

‎tests/embeddings/__init__.py‎

Whitespace-only changes.

‎tests/embeddings/reset_milvus.py‎

Lines changed: 0 additions & 7 deletions
This file was deleted.

‎tests/embeddings/test_all_embedding_dbs.py‎

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
2+
""" Provides a set of short tests for specific vector DB. """
3+
14
import os
25
import pytest
36
import time
@@ -12,53 +15,66 @@
1215
from llmware.resources import CloudBucketManager
1316
from tests.embeddings.utils import qdrant_installed
1417

18+
1519
def test_unsupported_embedding_db():

    """ Confirms that asking EmbeddingHandler for an unrecognized vector DB name raises
    UnsupportedEmbeddingDatabaseException with the expected message. """

    embedding_db = "milvusXYZ"  # Bad Embedding DB Name

    # return values are not needed - the call is only expected to raise
    with pytest.raises(UnsupportedEmbeddingDatabaseException) as excinfo:
        EmbeddingHandler(library=None).create_new_embedding(embedding_db=embedding_db, model=None)

    # checked after the 'with' block exits - a statement placed after the raising call
    # inside the block would never be executed
    assert str(excinfo.value) == f"'{embedding_db}' is not a supported vector embedding database"
2126

27+
2228
def test_milvus_embedding_and_query():

    """ Builds a library from the 'SmallLibrary' sample folder, runs generic_embedding_and_query
    against "milvus", asserts the result is non-empty, then deletes the library. """

    samples_root = Setup().load_sample_files()

    milvus_library = Library().create_new_library("test_embedding_milvus")
    small_library_folder = os.path.join(samples_root, "SmallLibrary")
    milvus_library.add_files(small_library_folder)

    query_output = generic_embedding_and_query(milvus_library, "milvus")
    assert len(query_output) > 0

    # clean up the test library once the check has passed
    milvus_library.delete_library(confirm_delete=True)
2936

37+
3038
def test_neo4j_embedding_and_query():

    """ Builds a library from the 'SmallLibrary' sample folder, runs generic_embedding_and_query
    against "neo4j", asserts the result is non-empty, then deletes the library. """

    samples_root = Setup().load_sample_files()

    neo4j_library = Library().create_new_library("test_embedding_neo4j")
    small_library_folder = os.path.join(samples_root, "SmallLibrary")
    neo4j_library.add_files(small_library_folder)

    query_output = generic_embedding_and_query(neo4j_library, "neo4j")
    assert len(query_output) > 0

    # clean up the test library once the check has passed
    neo4j_library.delete_library(confirm_delete=True)
3746

47+
3848
def test_chromadb_embedding_and_query():

    """ Builds a library from the 'SmallLibrary' sample folder, runs generic_embedding_and_query
    against "chromadb", asserts the result is non-empty, then deletes the library. """

    sample_files_path = Setup().load_sample_files()

    # the library name is specific to this test - previously it reused "test_embedding_neo4j",
    # which collides with the library created by the neo4j test
    library = Library().create_new_library("test_embedding_chromadb")
    library.add_files(os.path.join(sample_files_path, "SmallLibrary"))

    results = generic_embedding_and_query(library, "chromadb")
    assert len(results) > 0

    library.delete_library(confirm_delete=True)
4556

57+
4658
def test_faiss_embedding_and_query():

    """ Builds a library from the 'SmallLibrary' sample folder, runs generic_embedding_and_query
    against "faiss", asserts the result is non-empty, then deletes the library. """

    samples_root = Setup().load_sample_files()

    faiss_library = Library().create_new_library("test_embedding_faiss")
    small_library_folder = os.path.join(samples_root, "SmallLibrary")
    faiss_library.add_files(small_library_folder)

    query_output = generic_embedding_and_query(faiss_library, "faiss")
    assert len(query_output) > 0

    # clean up the test library once the check has passed
    faiss_library.delete_library(confirm_delete=True)
5366

67+
5468
def test_lancedb_embedding_and_query():

    """ Builds a library from the 'SmallLibrary' sample folder, runs generic_embedding_and_query
    against "lancedb", asserts the result is non-empty, then deletes the library. """

    samples_root = Setup().load_sample_files()

    lancedb_library = Library().create_new_library("test_embedding_lancedb")
    small_library_folder = os.path.join(samples_root, "SmallLibrary")
    lancedb_library.add_files(small_library_folder)

    query_output = generic_embedding_and_query(lancedb_library, "lancedb")
    assert len(query_output) > 0

    # clean up the test library once the check has passed
    lancedb_library.delete_library(confirm_delete=True)
6176

77+
6278
@pytest.mark.skipif(not qdrant_installed(), reason="Qdrant client is not installed")
6379
def test_qdrant_embedding_and_query():
6480
os.environ["USER_MANAGED_QDRANT_LOCATION"] = ":memory:"
@@ -87,7 +103,9 @@ def test_qdrant_embedding_and_query():
87103
# assert len(results) > 0
88104
# library.delete_library(confirm_delete=True)
89105

106+
90107
def generic_embedding_and_query(library, embedding_db):
108+
91109
# Run the embeddings (only of first 3 docs )
92110
model=ModelCatalog().load_model("mini-lm-sbert")
93111
embedding_handler = EmbeddingHandler(library=library)

‎tests/embeddings/test_all_sentence_transformer_models.py‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
2+
""" Test for pulling (all) embedding models from Sentence Transformer catalog. """
3+
14
import os
2-
import pytest
5+
import pytest
36
import time
47

58
from llmware.library import Library
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
2+
""" Tests that embedding model is loaded and yielding a structurally correct embedding vector. """
3+
4+
5+
from llmware.models import ModelCatalog
6+
7+
8+
def test_embedding_model_local_load():

    """ Loads every 'HFEmbeddingModel' entry returned by ModelCatalog().list_embedding_models() and
    checks that the first vector produced by .embedding() has a length equal to the model card's
    'embedding_dims' value. """

    emb_models = ModelCatalog().list_embedding_models()

    test_text = ("This is just a sample text to confirm that the embedding model is loading and correctly "
                 "converting into a structurally accurate embedding vector.")

    for model_card in emb_models:

        # only the HFEmbeddingModel family is exercised by this test
        if model_card["model_family"] not in ["HFEmbeddingModel"]:
            continue

        print(f"\nloading model - {model_card['model_name']} - embedding dims - {model_card['embedding_dims']}")

        model = ModelCatalog().load_model(model_card["model_name"])

        embedding_vector = model.embedding(test_text)

        assert embedding_vector is not None

        print("created vector successfully with dimensions: ", embedding_vector[0].shape)

        assert embedding_vector[0].shape[0] == model_card['embedding_dims']

    # note: no return value - pytest expects test functions to return None
32+
33+

‎tests/embeddings/test_embeddings.py‎

Lines changed: 0 additions & 100 deletions
This file was deleted.
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
2+
""" Test for embedding vector creation and storage in a selected vector DB with selected embedding model. """
3+
4+
5+
import os
6+
from llmware.library import Library
7+
from llmware.retrieval import Query
8+
from llmware.setup import Setup
9+
from llmware.status import Status
10+
from llmware.configs import LLMWareConfig
11+
12+
13+
def setup_library(library_name):

    """ Self-contained helper that creates a test library named library_name, prints its embedding
    status, loads the sample files, and parses the 'Agreements' folder into the library.
    Returns the library object. """

    # Step 1 - the library is the main 'organizing construct' in llmware, so it is created first
    print ("\nupdate: Creating library: {}".format(library_name))
    new_library = Library().create_new_library(library_name)

    # snapshot of the embedding status 'before' any embedding is installed
    status_before = new_library.get_embedding_status()
    print("embedding record - before embedding ", status_before)

    # Step 2 - fetch the sample files through .load_sample_files()
    #   --note: pass 'over_write=True' instead if the sample files need to be refreshed
    print ("update: Downloading Sample Files")
    samples_root = Setup().load_sample_files(over_write=False)

    # Step 3 - hand the 'Agreements' sample folder to .add_files for parsing and text chunking
    print("update: Parsing and Text Indexing Files")
    agreements_folder = os.path.join(samples_root, "Agreements")
    new_library.add_files(input_folder_path=agreements_folder,
                          chunk_size=400, max_chunk_size=600, smart_chunking=1)

    return new_library
41+
42+
43+
def test_install_vector_embeddings():

    """ End-to-end check of embedding creation: builds a test library with setup_library, installs a
    "mini-lm-sbert" embedding on "chromadb", runs a semantic query, and asserts that both the query
    results and the library embedding status are not None. """

    LLMWareConfig().set_active_db("sqlite")

    library = setup_library("test_emb_install_09123")

    # select vector db that you would like to test
    vector_db = "chromadb"

    LLMWareConfig().set_vector_db(vector_db)

    # select embedding model
    embedding_model = "mini-lm-sbert"

    library_name = library.library_name

    print(f"\nupdate: Starting the Embedding: "
          f"library - {library_name} - "
          f"vector_db - {vector_db} - "
          f"model - {embedding_model}")

    # *** this is the one key line of code to create the embedding ***
    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db,batch_size=100)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Start using the new vector embeddings with Query
    sample_query = "incentive compensation"
    print("\n\nupdate: Run a sample semantic/vector query: {}".format(sample_query))

    # queries are constructed by creating a Query object, and passing a library as input
    query_results = Query(library).semantic_query(sample_query, result_count=20)

    assert query_results is not None

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys

        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # to see all of the dictionary keys returned, uncomment the line below
        # print("update: query_results - all - ", i, entries)

        # for display purposes only, we will only show the first 125 characters of the text
        if len(text) > 125: text = text[0:125] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format( i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)

    # lets take a look at the library embedding status again at the end to confirm embeddings were created
    embedding_record = library.get_embedding_status()

    assert embedding_record is not None

    print("\nupdate: embedding record - ", embedding_record)

    # note: no return value - pytest expects test functions to return None
109+
110+
111+
112+
113+

0 commit comments

Comments
 (0)