Skip to content

Commit 5c78bd7

Browse files
authored
Merge pull request llmware-ai#151 from llmware-ai/new-embedding-and-parsing-examples
updating examples for parsing and embedding
2 parents 4717d33 + e7bc6b4 commit 5c78bd7

File tree

5 files changed

+230
-4
lines changed

5 files changed

+230
-4
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
2+
# *** FAST START to create vector embeddings from documents ***
3+
#
4+
# docs2vecs_with_milvus-contracts - parses, text chunks and embeds legal contracts
5+
# the sample documents (~80 legal template contracts) can be pulled down from a public S3 repo with the command:
6+
# sample_files_path = Setup().load_sample_files()
7+
8+
# note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
9+
10+
11+
import os
12+
from llmware.library import Library
13+
from llmware.retrieval import Query
14+
from llmware.setup import Setup
15+
from llmware.status import Status
16+
17+
18+
def parse_and_generate_vector_embeddings(library_name):
    """Parse the sample contracts, embed them, and run one sample semantic query.

    Creates a new library called ``library_name``, loads the llmware sample
    files, adds the "AgreementsLarge" folder to the library, installs
    embeddings using the "industry-bert-contracts" model in the "milvus"
    vector db, and prints the results of a sample query.

    Args:
        library_name: name to give the newly created library.
    """

    # Step 0 - Configuration - we will use these in Step 4 to install the embeddings
    embedding_model = "industry-bert-contracts"
    vector_db = "milvus"

    # for display purposes only - longest text sample that is printed untruncated
    max_display_chars = 125

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print ("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    # --note: if you need to refresh the sample files, set 'over_write=True'
    print ("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    # this method parses all of the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    contracts_path = os.path.join(sample_files_path, "AgreementsLarge")
    library.add_files(input_folder_path=contracts_path)

    # Step 4 - Install the embeddings
    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))

    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    # --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Step 5 - start using the new vector embeddings with Query
    sample_query = "incentive compensation"
    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))

    query_results = Query(library).semantic_query(sample_query, result_count=20)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # for display purposes only, show just the first max_display_chars characters of the text
        if len(text) > max_display_chars:
            text = text[:max_display_chars] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format( i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)
73+
74+
75+
if __name__ == "__main__":

    # any library name will work here - "contracts" is just the sample choice
    chosen_library_name = "contracts"
    parse_and_generate_vector_embeddings(chosen_library_name)
80+
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
2+
# *** FAST START to create vector embeddings from documents ***
3+
#
4+
# docs2vecs_with_milvus-un_resolutions - parses, text chunks and embeds 500 United Nations (UN) Resolutions
5+
# the sample documents (500 PDFs - 2-15 pages each) can be pulled down from a public S3 repo with the command:
6+
# sample_files_path = Setup().load_sample_files()
7+
8+
# note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
9+
10+
11+
import os
12+
from llmware.library import Library
13+
from llmware.retrieval import Query
14+
from llmware.setup import Setup
15+
from llmware.status import Status
16+
17+
18+
def parse_and_generate_vector_embeddings(library_name):
    """Parse the sample UN resolutions, embed them, and run one sample semantic query.

    Creates a new library called ``library_name``, loads the llmware sample
    files, adds the "UN-Resolutions-500" folder to the library, installs
    embeddings using the "mini-lm-sbert" model in the "milvus" vector db,
    and prints the results of a sample query.

    Args:
        library_name: name to give the newly created library.
    """

    # Step 0 - Configuration - we will use these in Step 4 to install the embeddings
    embedding_model = "mini-lm-sbert"
    vector_db = "milvus"

    # for display purposes only - longest text sample that is printed untruncated
    max_display_chars = 125

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print ("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    # --note: if you need to refresh the sample files, set 'over_write=True'
    print ("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    # this method parses all of the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    resolutions_path = os.path.join(sample_files_path, "UN-Resolutions-500")
    library.add_files(input_folder_path=resolutions_path)

    # Step 4 - Install the embeddings
    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))

    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    # --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Step 5 - start using the new vector embeddings with Query
    sample_query = "sustainability issues impacting women"
    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))

    query_results = Query(library).semantic_query(sample_query, result_count=20)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # for display purposes only, show just the first max_display_chars characters of the text
        if len(text) > max_display_chars:
            text = text[:max_display_chars] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format( i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)
73+
74+
75+
if __name__ == "__main__":

    # any library name will work here - "un_resolutions500" is just the sample choice
    chosen_library_name = "un_resolutions500"
    parse_and_generate_vector_embeddings(chosen_library_name)
80+
81+

‎examples/extract_pdf_tables.py‎

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
3+
# *** FAST START - extract financial tables from PDF documents ***
4+
#
5+
# extract_pdf_tables - shows end-to-end flow to automatically extract tables from PDFs
6+
# the sample documents (~15 financial documents - mostly 10Ks and annual reports) are available in public S3 repo
7+
# note: this example assumes that you have installed MongoDB - see instructions in README
8+
#
9+
# this example is also reviewed in the llmware YouTube video 'Extract Tables from PDFs'
10+
# Check out this video on the llmware YouTube channel at: https://www.youtube.com/watch?v=YYcimVQEgO8&t=4s
11+
12+
13+
import os
14+
15+
from llmware.library import Library
16+
from llmware.retrieval import Query
17+
from llmware.setup import Setup
18+
from llmware.configs import LLMWareConfig
19+
20+
21+
def extract_pdf_tables(library_name):
    """Parse the sample financial documents and export their blocks and tables.

    Creates a new library called ``library_name``, pulls the llmware sample
    files, adds the "FinDocs" folder to the library, exports the library to a
    .jsonl file in the llmware tmp path, and exports tables for the query
    "amazon" into that same path.

    Args:
        library_name: name to give the newly created library; also used as
            the prefix of the exported "<library_name>_export.jsonl" file.

    Returns:
        0 after all steps have run.
    """

    # Step 1 - create library
    print("\nupdate: Step 1- create library - {}".format(library_name))

    lib = Library().create_new_library(library_name)

    # Step 2 - pull down the sample files (or insert your own files here)
    # --note: 'over_write=True' is passed here, so the sample files are pulled again even if
    #         a local copy already exists; use 'over_write=False' to reuse a previous download
    print("update: Step 2 - pull sample files - FinDocs")

    sample_files_path = Setup().load_sample_files(over_write=True)

    # Step 3 - parse and extract all of the content from the Financial Documents
    print("update: Step 3 - parse, text chunk and text index the documents")

    fin_docs_path = os.path.join(sample_files_path, "FinDocs")
    parsing_output = lib.add_files(input_folder_path=fin_docs_path)

    # review the parsing output summary info - all of the text and table blocks are in Mongo collection
    print("update: parsing_output - ", parsing_output)

    # Step 4 - export all of the content into .jsonl files with metadata
    output_fp = LLMWareConfig().get_tmp_path()
    print("update: Step 4 - exporting all blocks into file path - ", output_fp)

    # the return value of the export call is not needed by this example
    lib.export_library_to_jsonl_file(output_fp, "{}_export.jsonl".format(library_name))

    # Step 5 - export as csv files all of the tables found with the query 'amazon'
    print("update: Step 5 - exporting all tables with 'amazon' as csv files into file path - ", output_fp)

    # the return value of the export call is not needed by this example
    Query(lib).export_all_tables(query="amazon", output_fp=output_fp)

    return 0
54+
55+
56+
if __name__ == "__main__":

    # run the end-to-end table extraction flow against a sample library name
    example_library_name = "pdf_table_lib_example"
    p = extract_pdf_tables(example_library_name)
59+

‎examples/parsing.py‎

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@ def parsing_files_into_library(library_name):
2020

2121
# Load the llmware sample files
2222
print (f"\n > Loading the llmware sample files...")
23-
sample_files_path = Setup().load_sample_files()
24-
pdf_file_path = os.path.join(sample_files_path,"UN-Resolutions-76th")
23+
24+
# note: if you have used this example previously, UN-Resolutions-500 is new path
25+
# -- to pull updated sample files, set: 'over_write=True'
26+
27+
sample_files_path = Setup().load_sample_files(over_write=False)
28+
pdf_file_path = os.path.join(sample_files_path,"UN-Resolutions-500")
2529
office_file_path = os.path.join(sample_files_path,"Agreements")
2630

2731
# Add files from a local path (this will pull in all supported file types:

‎llmware/setup.py‎

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
class Setup:
3131

3232
@staticmethod
33-
def load_sample_files():
33+
def load_sample_files(over_write=False):
3434

3535
# changed name from demo to 'sample_files'
3636
# simplified: no user config - pulls into llmware_path
@@ -44,7 +44,9 @@ def load_sample_files():
4444
if not os.path.exists(sample_files_path):
4545
os.makedirs(sample_files_path,exist_ok=True)
4646
else:
47-
logging.info("update: sample_files path already exists - %s ", sample_files_path)
47+
if not over_write:
48+
logging.info("update: sample_files path already exists - %s ", sample_files_path)
49+
return sample_files_path
4850

4951
# pull from sample files bucket
5052
bucket_name = LLMWareConfig().get_config("llmware_sample_files_bucket")

0 commit comments

Comments
 (0)