feng-huang
diff --git a/tests/embeddings/__init__.py b/tests/embeddings/__init__.py
diff --git a/tests/embeddings/reset_milvus.py b/tests/embeddings/reset_milvus.py
Lines changed: 0 additions & 7 deletions
diff --git a/tests/embeddings/test_all_embedding_dbs.py b/tests/embeddings/test_all_embedding_dbs.py
Lines changed: 18 additions & 0 deletions
diff --git a/tests/embeddings/test_all_sentence_transformer_models.py b/tests/embeddings/test_all_sentence_transformer_models.py
Lines changed: 4 additions & 1 deletion
diff --git a/tests/embeddings/test_embedding_model_load.py b/tests/embeddings/test_embedding_model_load.py
Lines changed: 33 additions & 0 deletions
diff --git a/tests/embeddings/test_embeddings.py b/tests/embeddings/test_embeddings.py
Lines changed: 0 additions & 100 deletions
diff --git a/tests/embeddings/test_install_embeddings.py b/tests/embeddings/test_install_embeddings.py
Lines changed: 113 additions & 0 deletions
@@ -1,3 +1,6 @@
+
+""" Provides a set of short tests for specific vector DB. """
+
 import os
 import pytest 
 import time
@@ -12,53 +15,66 @@
 from llmware.resources import CloudBucketManager
 from tests.embeddings.utils import qdrant_installed
 
+
 def test_unsupported_embedding_db():
+
+    """ Checks that asking for an embedding with an unrecognized vector DB name raises
+    UnsupportedEmbeddingDatabaseException carrying the expected message. """
+
     embedding_db = "milvusXYZ"  # Bad Embedding DB Name
     with pytest.raises(UnsupportedEmbeddingDatabaseException) as excinfo:
         embedding_handler = EmbeddingHandler(library=None)
         embedding_summary = embedding_handler.create_new_embedding(embedding_db=embedding_db, model=None)
     assert str(excinfo.value) == f"'{embedding_db}' is not a supported vector embedding database" 
 
+
 def test_milvus_embedding_and_query():
+
+    """ Creates a library from the SmallLibrary sample files, runs generic_embedding_and_query against
+    "milvus", expects a non-empty result, then deletes the library. """
+
     sample_files_path = Setup().load_sample_files()
     library = Library().create_new_library("test_embedding_milvus")
     library.add_files(os.path.join(sample_files_path,"SmallLibrary"))
     results = generic_embedding_and_query(library, "milvus")
     assert len(results) > 0
     library.delete_library(confirm_delete=True)
 
+
 def test_neo4j_embedding_and_query():
+
+    """ Creates a library from the SmallLibrary sample files, runs generic_embedding_and_query against
+    "neo4j", expects a non-empty result, then deletes the library. """
+
     sample_files_path = Setup().load_sample_files()
     library = Library().create_new_library("test_embedding_neo4j")
     library.add_files(os.path.join(sample_files_path,"SmallLibrary"))
     results = generic_embedding_and_query(library, "neo4j")
     assert len(results) > 0
     library.delete_library(confirm_delete=True)
 
+
 def test_chromadb_embedding_and_query():
+
+    """ Creates a library from the SmallLibrary sample files, runs generic_embedding_and_query against
+    "chromadb", expects a non-empty result, then deletes the library.
+
+    The library gets its own chromadb-specific name, so that this test does not create and delete
+    the library used by the neo4j test. """
+
     sample_files_path = Setup().load_sample_files()
-    library = Library().create_new_library("test_embedding_neo4j")
+    library = Library().create_new_library("test_embedding_chromadb")
     library.add_files(os.path.join(sample_files_path,"SmallLibrary"))
     results = generic_embedding_and_query(library, "chromadb")
     assert len(results) > 0
     library.delete_library(confirm_delete=True)
 
+
 def test_faiss_embedding_and_query():
+
+    """ Creates a library from the SmallLibrary sample files, runs generic_embedding_and_query against
+    "faiss", expects a non-empty result, then deletes the library. """
+
     sample_files_path = Setup().load_sample_files()
     library = Library().create_new_library("test_embedding_faiss")
     library.add_files(os.path.join(sample_files_path,"SmallLibrary"))
     results = generic_embedding_and_query(library, "faiss")
     assert len(results) > 0
     library.delete_library(confirm_delete=True)
 
+
 def test_lancedb_embedding_and_query():
+
+    """ Creates a library from the SmallLibrary sample files, runs generic_embedding_and_query against
+    "lancedb", expects a non-empty result, then deletes the library. """
+
     sample_files_path = Setup().load_sample_files()
     library = Library().create_new_library("test_embedding_lancedb")
     library.add_files(os.path.join(sample_files_path,"SmallLibrary"))
     results = generic_embedding_and_query(library, "lancedb")
     assert len(results) > 0
     library.delete_library(confirm_delete=True)
 
+
 @pytest.mark.skipif(not qdrant_installed(), reason="Qdrant client is not installed")
 def test_qdrant_embedding_and_query():
     os.environ["USER_MANAGED_QDRANT_LOCATION"] = ":memory:"
@@ -87,7 +103,9 @@ def test_qdrant_embedding_and_query():
 #     assert len(results) > 0
 #     library.delete_library(confirm_delete=True)
 
+
 def generic_embedding_and_query(library, embedding_db):
+
     # Run the embeddings (only of first 3 docs ) 
     model=ModelCatalog().load_model("mini-lm-sbert")
     embedding_handler = EmbeddingHandler(library=library)
 
@@ -1,5 +1,8 @@
+
+""" Test for pulling (all) embedding models from Sentence Transformer catalog. """
+
 import os
-import pytest 
+import pytest
 import time
 
 from llmware.library import Library
 
@@ -0,0 +1,33 @@
+
+""" Tests that embedding model is loaded and yielding a structurally correct embedding vector. """
+
+
+from llmware.models import ModelCatalog
+
+
+def test_embedding_model_local_load():
+
+    """ Loads every "HFEmbeddingModel" entry listed by ModelCatalog().list_embedding_models(), embeds a
+    short sample text with it, and checks that the first returned vector has as many elements as the
+    'embedding_dims' value on the model card.
+
+    The test ends without returning a value, since pytest flags test functions that return anything
+    other than None. """
+
+    emb_models = ModelCatalog().list_embedding_models()
+
+    test_text = ("This is just a sample text to confirm that the embedding model is loading and correctly "
+                 "converting into a structurally accurate embedding vector.")
+
+    for model_card in emb_models:
+
+        #   only model cards in the HFEmbeddingModel family are exercised here
+        if model_card["model_family"] != "HFEmbeddingModel":
+            continue
+
+        print(f"\nloading model - {model_card['model_name']} - embedding dims - {model_card['embedding_dims']}")
+
+        model = ModelCatalog().load_model(model_card["model_name"])
+
+        embedding_vector = model.embedding(test_text)
+
+        assert embedding_vector is not None
+
+        #   plain string literal - the message has nothing to interpolate
+        print("created vector successfully with dimensions: ", embedding_vector[0].shape)
+
+        #   structural check: vector length must match the dimension declared on the model card
+        assert embedding_vector[0].shape[0] == model_card['embedding_dims']
+
+
@@ -0,0 +1,113 @@
+
+""" Test for embedding vector creation and storage in a selected vector DB with selected embedding model. """
+
+
+import os
+from llmware.library import Library
+from llmware.retrieval import Query
+from llmware.setup import Setup
+from llmware.status import Status
+from llmware.configs import LLMWareConfig
+
+
+def setup_library(library_name):
+
+    """ Self-contained helper that builds a test library: creates the library, prints its embedding
+    status, loads the sample files, parses the "Agreements" folder into the library and returns it. """
+
+    #   Step 1 - create the library, the main 'organizing construct' in llmware
+    creating_msg = "\nupdate: Creating library: {}".format(library_name)
+    print(creating_msg)
+
+    new_library = Library().create_new_library(library_name)
+
+    #   embedding status as reported 'before' any embedding is installed
+    status_before = new_library.get_embedding_status()
+    print("embedding record - before embedding ", status_before)
+
+    #   Step 2 - fetch the sample files through .load_sample_files()
+    #   --note: set 'over_write=True' if the sample files need to be refreshed
+    print("update: Downloading Sample Files")
+
+    samples_root = Setup().load_sample_files(over_write=False)
+
+    #   Step 3 - hand the sample documents folder to ".add_files" for parsing and text chunking
+    print("update: Parsing and Text Indexing Files")
+
+    agreements_folder = os.path.join(samples_root, "Agreements")
+    new_library.add_files(input_folder_path=agreements_folder,
+                          chunk_size=400, max_chunk_size=600, smart_chunking=1)
+
+    return new_library
+
+
+def test_install_vector_embeddings():
+
+    """ Builds a test library with setup_library, installs an embedding on it using the selected
+    vector DB and embedding model, runs a sample semantic query, and checks that both the query
+    results and the library's embedding status are not None.
+
+    The test ends without returning a value, since pytest flags test functions that return anything
+    other than None. """
+
+    LLMWareConfig().set_active_db("sqlite")
+
+    library = setup_library("test_emb_install_09123")
+
+    #   select vector db that you would like to test
+    vector_db = "chromadb"
+
+    LLMWareConfig().set_vector_db(vector_db)
+
+    #   select embedding model
+    embedding_model = "mini-lm-sbert"
+
+    library_name = library.library_name
+
+    print(f"\nupdate: Starting the Embedding: "
+          f"library - {library_name} - "
+          f"vector_db - {vector_db} - "
+          f"model - {embedding_model}")
+
+    #   *** this is the one key line of code to create the embedding ***
+    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db, batch_size=100)
+
+    #   note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
+    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
+    update = Status().get_embedding_status(library_name, embedding_model)
+    print("update: Embeddings Complete - Status() check at end of embedding - ", update)
+
+    #   start using the new vector embeddings with Query
+    sample_query = "incentive compensation"
+    print("\n\nupdate: Run a sample semantic/vector query: {}".format(sample_query))
+
+    #   queries are constructed by creating a Query object, and passing a library as input
+    query_results = Query(library).semantic_query(sample_query, result_count=20)
+
+    assert query_results is not None
+
+    for i, entries in enumerate(query_results):
+
+        #   each query result is a dictionary with many useful keys
+
+        text = entries["text"]
+        document_source = entries["file_source"]
+        page_num = entries["page_num"]
+        vector_distance = entries["distance"]
+
+        #   to see all of the dictionary keys returned, uncomment the line below
+        #   print("update: query_results - all - ", i, entries)
+
+        #   for display purposes only, we will only show the first 125 characters of the text
+        if len(text) > 125:
+            text = text[0:125] + " ... "
+
+        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
+              .format(i, document_source, page_num, vector_distance))
+
+        print("update: text sample - ", text)
+
+    #   look at the library embedding status again at the end to confirm embeddings were created
+    embedding_record = library.get_embedding_status()
+
+    assert embedding_record is not None
+
+    print("\nupdate:  embedding record - ", embedding_record)
+
+
+
+
+