Skip to content

Commit

Permalink
feat: Deletion script fix to remove entities (#173)
Browse files Browse the repository at this point in the history
Co-authored-by: Eyal Ben Ivri <[email protected]>
  • Loading branch information
eyalbenivri and Eyal Ben Ivri authored Dec 12, 2024
1 parent 95e7d72 commit 2f8936b
Show file tree
Hide file tree
Showing 23 changed files with 1,994 additions and 201 deletions.
2 changes: 1 addition & 1 deletion components/doc-classifier/src/Procfile
Original file line number Diff line number Diff line change
@@ -1 +1 @@
web: python3 main.py
web: python3 doc_classifier_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import logging
import logging.config
import os
import sys
from typing import Optional
Expand All @@ -31,6 +32,37 @@
documentai, # type: ignore # pylint: disable = no-name-in-module # pylint: disable = import-error
)

# Logging setup for this entry point: one console handler writing to stdout
# with a compact "[LEVEL|module|Lline] timestamp: message" layout.
logging_config = {
    "version": 1,
    # Keep loggers created by already-imported libraries enabled.
    "disable_existing_loggers": False,
    "formatters": {
        "simple": {
            "format": "[%(levelname)s|%(module)s|L%(lineno)d] %(asctime)s: %(message)s",
            "datefmt": "%Y-%m-%dT%H:%M:%S%z",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            # The handler filters at INFO even though the root logger accepts
            # DEBUG, so DEBUG records are dropped by this handler.
            "level": "INFO",
            "formatter": "simple",
            "stream": "ext://sys.stdout",
        }
    },
    # The root logger is configured through the dedicated top-level "root" key
    # of the dictConfig schema. Listing it under "loggers" as a logger *named*
    # "root" only reaches the real root logger on Python >= 3.9; on older
    # interpreters it creates an unrelated child logger and leaves the real
    # root (which module loggers propagate to) without the console handler.
    "root": {
        "level": "DEBUG",
        "handlers": [
            "console",
        ],
    },
}

logging.config.dictConfig(logging_config)
# No logging.basicConfig() call here: basicConfig does nothing once the root
# logger has a handler, which dictConfig has just installed.
logger = logging.getLogger(__name__)

USER_AGENT = "cloud-solutions/eks-docai-v1"


Expand All @@ -42,7 +74,7 @@ def batch_classify_documents(
gcs_output_uri: str,
processor_version_id: Optional[str] = None,
field_mask: Optional[str] = None,
timeout: int = 400,
timeout: int = 1000,
):
"""Function for processing PDF documents in batch"""
# You must set the `api_endpoint` if you use a location other than "us".
Expand Down Expand Up @@ -84,19 +116,19 @@ def batch_classify_documents(

# BatchProcess returns a Long Running Operation (LRO)
operation = client.batch_process_documents(request)
logging.info(f"Started batch process; {operation.metadata=};")
logger.info(f"Started batch process; {operation.metadata=};")

# Continually polls the operation until it is complete.
# This could take some time for larger files
# Format: projects/{project_id}/locations/{location}/operations/{operation_id}
try:
logging.info(
logger.info(
f"Waiting for operation {operation.operation.name} to " f"complete..."
)
operation.result(timeout=timeout)
# Catch exception when operation doesn't finish before timeout
except (RetryError, InternalServerError) as e:
logging.error(e.message)
logger.error(e.message)

# NOTE: Can also use callbacks for asynchronous processing
#
Expand Down Expand Up @@ -134,12 +166,12 @@ def batch_classify_documents(
f"{GCS_INPUT_PREFIX=}, "
f"{GCS_OUTPUT_URI=}"
)
logging.error(message)
logger.error(message)
sys.exit(1)

try:
logging.info(f"Starting Task #{TASK_INDEX} (att. {TASK_ATTEMPT}.")
logging.info(
logger.info(f"Starting Task #{TASK_INDEX} (att. {TASK_ATTEMPT}.")
logger.info(
f"{PROCESSOR_ID=}, "
f"{PROJECT_ID=}, "
f"{LOCATION=}, "
Expand All @@ -153,7 +185,7 @@ def batch_classify_documents(
gcs_input_prefix=GCS_INPUT_PREFIX,
gcs_output_uri=GCS_OUTPUT_URI,
)
logging.info(f"Completed Task #{TASK_INDEX} (att. {TASK_ATTEMPT}.")
logger.info(f"Completed Task #{TASK_INDEX} (att. {TASK_ATTEMPT}.")
except Exception as e:
logging.error(f"Task Index {TASK_INDEX} (att. {TASK_ATTEMPT} failed!" f"{e}")
logger.error(f"Task Index {TASK_INDEX} (att. {TASK_ATTEMPT} failed!" f"{e}")
sys.exit(1)
1 change: 1 addition & 0 deletions components/doc-deletion/src/Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
web: python3 doc_deletion_main.py
Loading

0 comments on commit 2f8936b

Please sign in to comment.