GitSentinel
diff --git a/examples/Parsing/ocr_embedded_doc_images.py b/examples/Parsing/ocr_embedded_doc_images.py
Lines changed: 9 additions & 2 deletions
diff --git a/examples/Parsing/parse_pdf_by_ocr.py b/examples/Parsing/parse_pdf_by_ocr.py
Lines changed: 20 additions & 2 deletions
diff --git a/llmware/requirements.txt b/llmware/requirements.txt
Lines changed: 0 additions & 2 deletions
diff --git a/setup.py b/setup.py
Lines changed: 0 additions & 2 deletions
@@ -7,13 +7,15 @@
     B.  run an OCR against the images to derive the text from the image using the OCR
     C.  insert the text into the database library collection for subsequent retrieval.
 
+    Note: this example uses additional python dependencies:
+
+        -- pip3 install pytesseract
+
     Note: this example uses an OCR engine, which is outside of the core llmware package.  To install on Ubuntu:
 
         -- sudo apt install tesseract-ocr
         -- sudo apt install libtesseract-dev
 
-        -- pip3 install pytesseract   [should already be installed with llmware requirements.txt]
-
     [Other platforms:
         -- Mac: brew install tesseract
         -- Windows:   GUI download installer - see UB-Mannheim @ www.github.com/UB-Mannheim/tesseract/wiki
@@ -47,6 +49,11 @@
 from llmware.resources import CollectionRetrieval, CollectionWriter
 from llmware.parsers import ImageParser
 
+from importlib import util
+if not util.find_spec("pytesseract"):
+    print("\nto run this example requires additional dependencies, including pytesseract - see comments above in "
+          "this script.  to install pytesseract:  pip3 install pytesseract.")
+
 
 def ocr_images_in_library(library_name, add_new_text_block=False, chunk_size=400, min_chars=10):
 
 
@@ -1,7 +1,21 @@
 
 """ This example demonstrates how to parse PDF documents consisting of scanned pages using OCR
-      1. Note: uses pdf2image library - requires separate install locally of lib tesseract and poppler
-      2. This is a useful fall-back for scanned documents, if not possible to parse digitally
+
+    Parsing a PDF by OCR is much slower than a digital parse and loses metadata - but it is a
+    necessary fall-back for many 'paper-scanned' PDFs, or in the relatively rare cases in which
+    digital parsing is not successful.
+
+    NOTE:  there are several dependencies that must be installed to run this example:
+
+    pip install:
+        -- pip3 install pytesseract
+        -- pip3 install pdf2image
+
+    core libraries:
+        -- tesseract: e.g., (Mac OS) - brew install tesseract or (Linux) - sudo apt install tesseract-ocr
+        -- poppler:   e.g., (Mac OS) - brew install poppler or (Linux) - sudo apt-get install -y poppler-utils
+                     for Windows download see - https://poppler.freedesktop.org/
+
 """
 
 import os
@@ -10,6 +24,10 @@
 from llmware.parsers import Parser
 from llmware.setup import Setup
 
+from importlib import util
+if not util.find_spec("pytesseract") or not util.find_spec("pdf2image"):
+    print("\nto run this example, please install pytesseract and pdf2image - and there may be core libraries "
+          "that need to be installed as well - see comments above more details.")
 
 def parsing_pdf_by_ocr ():
 
 
@@ -3,10 +3,8 @@ datasets==2.15.0
 huggingface-hub==0.19.4  
 numpy>=1.23.2
 openai>=1.0
-pdf2image==1.16.0
 pymilvus>=2.3.0
 pymongo>=4.7.0
-pytesseract==0.3.10
 sentence-transformers==2.2.2
 tabulate==0.9.0
 tokenizers>=0.15.0
 
@@ -58,10 +58,8 @@ def glob_fix(package_name, glob):
         'huggingface-hub==0.19.4',
         'numpy>=1.23.2',
         'openai>=1.0.0',
-        'pdf2image==1.16.0',
         'pymilvus>=2.3.0',
         'pymongo>=4.7.0',
-        'pytesseract==0.3.10',
         'sentence-transformers==2.2.2',
         'tabulate==0.9.0',
         'tokenizers>=0.15.0',