@@ -1320,37 +1320,46 @@ def aggregate_text(self, qr_list):
13201320
13211321 return text_agg , meta_agg
13221322
1323- def document_lookup (self , doc_id = "" , file_source = "" ):
1324-
1325- """ Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
1326- returns all of the non-image text and table blocks in the document. """
1323+ def document_lookup (self , doc_id = "" , file_source = "" , include_images = False ):
1324+ """
1325+ Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
1326+ returns all of the text and table blocks in the document. Image blocks are included only when include_images is True.
1327+
1328+ Parameters:
1329+ doc_id (str): Document ID.
1330+ file_source (str): Source file name.
1331+ include_images (bool): Whether to include images in the result. Defaults to False.
1332+
1333+ Returns:
1334+ list: Matching document blocks sorted by block_ID, or an empty list if nothing is found.
1335+ """
13271336
13281337 if doc_id :
13291338 kv_dict = {"doc_ID" : doc_id }
13301339 elif file_source :
13311340 kv_dict = {"file_source" : file_source }
13321341 else :
1333- raise RuntimeError ("Query document_lookup method requires as input either a document ID or "
1334- "the name of a file already parsed in the library " )
1342+ raise RuntimeError (
1343+ "Query document_lookup method requires as input either a document ID or "
1344+ "the name of a file already parsed in the library"
1345+ )
13351346
13361347 output = CollectionRetrieval (self .library_name , account_name = self .account_name ).filter_by_key_dict (kv_dict )
13371348
13381349 if len (output ) == 0 :
13391350 logger .warning (f"update: Query - document_lookup - nothing found - { doc_id } - { file_source } " )
1340- result = []
1341-
1342- return result
1351+ return []
13431352
13441353 output_final = []
13451354
1346- # exclude images to avoid potential duplicate text
13471355 for entries in output :
1348- if entries ["content_type" ] != "image" :
1356+ # Filter out images if include_images is False
1357+ if include_images or entries ["content_type" ] != "image" :
13491358 entries .update ({"matches" : []})
13501359 entries .update ({"page_num" : entries ["master_index" ]})
13511360 output_final .append (entries )
13521361
1353- output_final = sorted (output_final , key = lambda x :x ["block_ID" ], reverse = False )
1362+ output_final = sorted (output_final , key = lambda x : x ["block_ID" ], reverse = False )
13541363
13551364 return output_final
13561365
0 commit comments