gongwork
diff --git a/‎examples/docs2vecs_with_milvus-contracts.py‎
Lines changed: 80 additions & 0 deletions b/‎examples/docs2vecs_with_milvus-contracts.py‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎examples/docs2vecs_with_milvus-un_resolutions.py‎
Lines changed: 81 additions & 0 deletions b/‎examples/docs2vecs_with_milvus-un_resolutions.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎examples/extract_pdf_tables.py‎
Lines changed: 59 additions & 0 deletions b/‎examples/extract_pdf_tables.py‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎examples/parsing.py‎
Lines changed: 6 additions & 2 deletions b/‎examples/parsing.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎llmware/setup.py‎
Lines changed: 4 additions & 2 deletions b/‎llmware/setup.py‎
Lines changed: 4 additions & 2 deletions
@@ -0,0 +1,80 @@
+
+#                   *** FAST START to create vector embeddings from documents ***
+#
+#   docs2vecs_with_milvus-contracts - parses, text chunks and embeds legal contracts
+#   the sample documents (~80 legal template contracts) can be pulled down from a public S3 repo with the command:
+#           sample_files_path = Setup().load_sample_files()
+
+#   note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
+
+
+import os
+from llmware.library import Library
+from llmware.retrieval import Query
+from llmware.setup import Setup
+from llmware.status import Status
+
+
+def parse_and_generate_vector_embeddings(library_name):
+    """Parse sample contracts into a new library, embed them, and print a sample semantic query."""
+
+    # Step 0 - Configuration - consumed in Step 4 when the embeddings are installed
+    vector_db = "milvus"
+    embedding_model = "industry-bert-contracts"
+
+    # Step 1 - Create the library - the main 'organizing construct' in llmware
+    step1_msg = "\nupdate: Step 1 - Creating library: {}".format(library_name)
+    print(step1_msg)
+    library = Library().create_new_library(library_name)
+
+    # Step 2 - Pull down the sample files from S3 via .load_sample_files()
+    #   --note: pass 'over_write=True' instead if the sample files need refreshing
+    print("update: Step 2 - Downloading Sample Files")
+    sample_files_path = Setup().load_sample_files(over_write=False)
+
+    # Step 3 - hand the contracts sub-folder of the download to ".add_files"
+    #   this method parses all of the documents, text chunks, and captures in MongoDB
+    print("update: Step 3 - Parsing and Text Indexing Files")
+    contracts_folder = os.path.join(sample_files_path, "AgreementsLarge")
+    library.add_files(input_folder_path=contracts_folder)
+
+    # Step 4 - Install the embeddings
+    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))
+
+    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)
+
+    # note: when llmware runs inside a larger application, Status() can be polled for real-time progress
+    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
+    update = Status().get_embedding_status(library_name, embedding_model)
+    print("update: Embeddings Complete - Status() check at end of embedding - ", update)
+
+    # Step 5 - start using the new vector embeddings with Query
+    sample_query = "incentive compensation"
+    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))
+
+    query_results = Query(library).semantic_query(sample_query, result_count=20)
+
+    for rank, hit in enumerate(query_results):
+
+        # each query result is a dictionary - pull out the fields shown below
+        snippet = hit["text"]
+        source_doc = hit["file_source"]
+        page = hit["page_num"]
+        distance = hit["distance"]
+
+        #  for display purposes only, show at most the first 125 characters of the text
+        if len(snippet) > 125:
+            snippet = snippet[:125] + " ... "
+
+        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
+              .format(rank, source_doc, page, distance))
+
+        print("update: text sample - ", snippet)
+
+
+if __name__ == "__main__":
+    # script entry point - runs the parse / embed / query example defined above
+    # pick any name for the library
+    user_selected_name = "contracts"
+    parse_and_generate_vector_embeddings(user_selected_name)
+
@@ -0,0 +1,81 @@
+
+#                   *** FAST START to create vector embeddings from documents ***
+#
+#   docs2vecs_with_milvus-un_resolutions - parses, text chunks and embeds 500 United Nations (UN) Resolutions
+#   the sample documents (500 PDFs - 2-15 pages each) can be pulled down from a public S3 repo with the command:
+#           sample_files_path = Setup().load_sample_files()
+
+#   note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
+
+
+import os
+from llmware.library import Library
+from llmware.retrieval import Query
+from llmware.setup import Setup
+from llmware.status import Status
+
+
+def parse_and_generate_vector_embeddings(library_name):
+    """Parse the UN Resolutions samples into a new library, embed them, and print a sample semantic query."""
+
+    # Step 0 - Configuration - consumed in Step 4 when the embeddings are installed
+    vector_db = "milvus"
+    embedding_model = "mini-lm-sbert"
+
+    # Step 1 - Create the library - the main 'organizing construct' in llmware
+    step1_msg = "\nupdate: Step 1 - Creating library: {}".format(library_name)
+    print(step1_msg)
+    library = Library().create_new_library(library_name)
+
+    # Step 2 - Pull down the sample files from S3 via .load_sample_files()
+    #   --note: pass 'over_write=True' instead if the sample files need refreshing
+    print("update: Step 2 - Downloading Sample Files")
+    sample_files_path = Setup().load_sample_files(over_write=False)
+
+    # Step 3 - hand the resolutions sub-folder of the download to ".add_files"
+    #   this method parses all of the documents, text chunks, and captures in MongoDB
+    print("update: Step 3 - Parsing and Text Indexing Files")
+    resolutions_folder = os.path.join(sample_files_path, "UN-Resolutions-500")
+    library.add_files(input_folder_path=resolutions_folder)
+
+    # Step 4 - Install the embeddings
+    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))
+
+    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)
+
+    # note: when llmware runs inside a larger application, Status() can be polled for real-time progress
+    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
+    update = Status().get_embedding_status(library_name, embedding_model)
+    print("update: Embeddings Complete - Status() check at end of embedding - ", update)
+
+    # Step 5 - start using the new vector embeddings with Query
+    sample_query = "sustainability issues impacting women"
+    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))
+
+    query_results = Query(library).semantic_query(sample_query, result_count=20)
+
+    for rank, hit in enumerate(query_results):
+
+        # each query result is a dictionary - pull out the fields shown below
+        snippet = hit["text"]
+        source_doc = hit["file_source"]
+        page = hit["page_num"]
+        distance = hit["distance"]
+
+        #  for display purposes only, show at most the first 125 characters of the text
+        if len(snippet) > 125:
+            snippet = snippet[:125] + " ... "
+
+        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
+              .format(rank, source_doc, page, distance))
+
+        print("update: text sample - ", snippet)
+
+
+if __name__ == "__main__":
+    # script entry point - runs the parse / embed / query example defined above
+    # pick any name for the library
+    user_selected_name = "un_resolutions500"
+    parse_and_generate_vector_embeddings(user_selected_name)
+
+
@@ -0,0 +1,59 @@
+
+
+#                   *** FAST START - extract financial tables from PDF documents ***
+#
+#   extract_pdf_tables - shows end-to-end flow to automatically extract tables from PDFs
+#   the sample documents (~15 financial documents - mostly 10Ks and annual reports) are available in public S3 repo
+#   note: this example assumes that you have installed MongoDB - see instructions in README
+#
+#   this example is also reviewed in the llmware YouTube video 'Extract Tables from PDFs'
+#   Check out this video on the llmware YouTube channel at:  https://www.youtube.com/watch?v=YYcimVQEgO8&t=4s
+
+
+import os
+
+from llmware.library import Library
+from llmware.retrieval import Query
+from llmware.setup import Setup
+from llmware.configs import LLMWareConfig
+
+
+def extract_pdf_tables(library_name):
+    """Parse the FinDocs sample PDFs into a new library, then export its blocks and 'amazon' tables; returns 0."""
+
+    #   Step 1 - create library
+    print("\nupdate: Step 1- create library - {}".format(library_name))
+    lib = Library().create_new_library(library_name)
+
+    #   Step 2 - pull down the sample files (or insert your own files here)
+    #   --note: 'over_write=True' re-pulls the sample files even if the local folder already exists
+    print("update: Step 2 - pull sample files - FinDocs")
+
+    sample_files_path = Setup().load_sample_files(over_write=True)
+
+    #   Step 3 - parse and extract all of the content from the Financial Documents
+    print("update: Step 3 - parse, text chunk and text index the documents")
+
+    parsing_output = lib.add_files(input_folder_path=os.path.join(sample_files_path, "FinDocs"))
+
+    #   review the parsing output summary info - all of the text and table blocks are in Mongo collection
+    print("update: parsing_output - ", parsing_output)
+
+    #   Step 4 - export all of the content into .jsonl files with metadata
+    output_fp = LLMWareConfig().get_tmp_path()
+    print("update: Step 4 - exporting all blocks into file path - ", output_fp)
+
+    lib.export_library_to_jsonl_file(output_fp, "{}_export.jsonl".format(library_name))
+
+    #   Step 5 - export tables as csv files using the query 'amazon'
+    print("update: Step 5 - exporting all tables with 'amazon' as csv files into file path - ", output_fp)
+
+    Query(lib).export_all_tables(query="amazon", output_fp=output_fp)
+
+    return 0
+
+
+if __name__ == "__main__":
+    # script entry point - runs the table extraction example on a library named "pdf_table_lib_example"
+    p = extract_pdf_tables("pdf_table_lib_example")
+
@@ -20,8 +20,12 @@ def parsing_files_into_library(library_name):
 
     # Load the llmware sample files
     print (f"\n > Loading the llmware sample files...")
-    sample_files_path = Setup().load_sample_files()
-    pdf_file_path = os.path.join(sample_files_path,"UN-Resolutions-76th")
+
+    # note: if you have used this example previously, UN-Resolutions-500 is the new path
+    #   -- to pull updated sample files, set: 'over_write=True'
+
+    sample_files_path = Setup().load_sample_files(over_write=False)
+    pdf_file_path = os.path.join(sample_files_path,"UN-Resolutions-500")
     office_file_path = os.path.join(sample_files_path,"Agreements")
 
     # Add files from a local path (this will pull in all supported file types:
 
@@ -30,7 +30,7 @@
 class Setup:
 
     @staticmethod
-    def load_sample_files():
+    def load_sample_files(over_write=False):
 
         #   changed name from demo to 'sample_files'
         #   simplified:  no user config - pulls into llmware_path
@@ -44,7 +44,9 @@ def load_sample_files():
         if not os.path.exists(sample_files_path):
             os.makedirs(sample_files_path,exist_ok=True)
         else:
-            logging.info("update: sample_files path already exists - %s ", sample_files_path)
+            if not over_write:
+                logging.info("update: sample_files path already exists - %s ", sample_files_path)
+                return sample_files_path
 
         # pull from sample files bucket
         bucket_name = LLMWareConfig().get_config("llmware_sample_files_bucket")