Merge pull request #116 from wri/develop
Develop to master
jterry64 authored Feb 1, 2023
2 parents f27bc2d + c9ecba5 · commit 93ff657
Showing 3 changed files with 22 additions and 19 deletions.
src/Dockerfile: 1 addition, 1 deletion

@@ -11,7 +11,7 @@ COPY . $WORKDIR
 # installing dependencies to build package
 RUN pip install . -t python
 
-# change 31852
+# change 31855
 
 # Precompile all python packages and remove .py files
 RUN find python/ -type f -name '*.pyc' -print0 | xargs -0 rm -rf
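The only change here bumps the marker comment from 31852 to 31855, most likely to invalidate Docker's layer cache from this line down so the steps below rerun. The visible RUN line clears stale .pyc files before the precompile the next comment describes; the compile-and-strip commands themselves sit below the shown context. A hedged stdlib sketch of that precompile-and-strip idea (paths illustrative, not the image's actual RUN chain):

    import compileall
    import pathlib

    # Byte-compile the tree; legacy=True writes foo.pyc next to foo.py
    # instead of under __pycache__, so the sources can be deleted.
    compileall.compile_dir("python/", legacy=True, quiet=1)

    # Ship only the compiled files: drop the .py sources.
    for source in pathlib.Path("python/").rglob("*.py"):
        source.unlink()
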
src/datapump/jobs/geotrellis.py: 21 additions, 17 deletions
@@ -192,8 +192,8 @@ def upload(self):
                         table.partitions.dict()
                         if table.partitions
                         else table.partitions,
-                        table.longitude_field,
-                        table.latitude_field,
+                        longitude_field=table.longitude_field,
+                        latitude_field=table.latitude_field,
                     )
                 else:
                     client.append(table.dataset, table.version, table.source_uri)
@@ -208,8 +208,8 @@ def upload(self):
                     table.cluster.dict() if table.cluster else table.cluster,
                     table.table_schema,
                     table.partitions.dict() if table.partitions else table.partitions,
-                    table.longitude_field,
-                    table.latitude_field,
+                    longitude_field=table.longitude_field,
+                    latitude_field=table.latitude_field,
                 )
 
     def check_upload(self) -> JobStatus:
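Both call sites in upload() now pass the coordinate fields by keyword instead of by position. A minimal sketch of what that buys, using a hypothetical stand-in for the DataApiClient method (the real signature is not visible in these hunks):

    # Hypothetical signature; only longitude_field/latitude_field are
    # taken from the diff, everything else is assumed.
    def create_vector_version(dataset, version, source_uri, schema=None,
                              partitions=None, longitude_field=None,
                              latitude_field=None):
        ...

    # Passed positionally, the coordinate values silently shift onto the
    # wrong parameters if the signature gains or reorders arguments;
    # keywords pin each value to its parameter name.
    create_vector_version("some_dataset", "v1", "s3://bucket/file.tsv",
                          longitude_field="longitude",
                          latitude_field="latitude")
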
@@ -278,18 +278,20 @@ def _get_result_tables(self) -> List[AnalysisResultTable]:
         bucket, prefix = get_s3_path_parts(result_path)
 
         LOGGER.debug(f"Looking for analysis results at {result_path}")
-        resp = get_s3_client().list_objects_v2(Bucket=bucket, Prefix=prefix)
-
-        LOGGER.debug(resp)
-
-        if "Contents" not in resp:
-            raise AssertionError("No results found in S3")
-
-        keys = [
-            item["Key"]
-            for item in resp["Contents"]
-            if item["Key"].endswith(".csv") and "download" not in item["Key"]
-        ]
+        paginator = get_s3_client().get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
+
+        keys = []
+        for page in pages:
+            if "Contents" not in page:
+                raise AssertionError("No results found in S3")
+
+            page_keys = [
+                item["Key"]
+                for item in page["Contents"]
+                if item["Key"].endswith(".csv") and "download" not in item["Key"]
+            ]
+            keys += page_keys
 
         result_tables = [
             self._get_result_table(bucket, path, list(files))
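The old code issued a single list_objects_v2 call, which returns at most 1,000 objects per response, so larger result sets were silently truncated; the new code walks every page through a boto3 paginator. A self-contained sketch of the same pattern (the function name and the bare boto3 client are illustrative; the diff goes through the module's get_s3_client helper):

    import boto3

    def list_result_csvs(bucket: str, prefix: str) -> list:
        # The paginator keeps issuing continuation requests until the
        # listing under the prefix is exhausted.
        paginator = boto3.client("s3").get_paginator("list_objects_v2")
        keys = []
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for item in page.get("Contents", []):
                key = item["Key"]
                if key.endswith(".csv") and "download" not in key:
                    keys.append(key)
        return keys

Note that the committed loop instead raises AssertionError when a page lacks a Contents key, preserving the old "no results found" failure mode.
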
@@ -551,6 +553,7 @@ def _get_field_type(self, field, is_whitelist=False):
             or field.endswith("__perc")
             or field.endswith("__year")
             or field.endswith("__week")
+            or field.endswith("__decile")
             or field == "adm1"
             or field == "adm2"
         ):
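This hunk adds __decile to the suffixes that _get_field_type groups with __perc, __year, __week and the adm1/adm2 names; the type that branch assigns lies outside the visible lines. A distilled sketch of just the suffix test (names illustrative):

    GROUPED_SUFFIXES = ("__perc", "__year", "__week", "__decile")

    def in_grouped_branch(field: str) -> bool:
        # str.endswith accepts a tuple, so one call covers every suffix.
        return field.endswith(GROUPED_SUFFIXES) or field in ("adm1", "adm2")
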
@@ -861,7 +864,7 @@ def _configurations(self, worker_count: str) -> List[Dict[str, Any]]:
             "spark.driver.cores": "1",
             "spark.executor.cores": "1",
             "spark.yarn.executor.memoryOverhead": "1G",
-            "spark.dynamicAllocation.enabled": "false"
+            "spark.dynamicAllocation.enabled": "false",
         }
 
         if self.geotrellis_version >= "2.0.0":
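The hunk itself only adds a trailing comma after the last Spark property, but the dict is worth a gloss: _configurations returns EMR configuration objects. A hedged sketch of how a flat spark-defaults dict like this one is conventionally wrapped for EMR's RunJobFlow API; the wrapping is an assumption, since the surrounding method body is not shown:

    spark_defaults = {
        "spark.driver.cores": "1",
        "spark.executor.cores": "1",
        "spark.yarn.executor.memoryOverhead": "1G",
        "spark.dynamicAllocation.enabled": "false",
    }

    # EMR takes a list of {Classification, Properties} objects;
    # "spark-defaults" feeds spark-defaults.conf on the cluster.
    configurations = [
        {"Classification": "spark-defaults", "Properties": spark_defaults},
    ]
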
@@ -905,6 +908,7 @@ def _configurations(self, worker_count: str) -> List[Dict[str, Any]]:
 class FireAlertsGeotrellisJob(GeotrellisJob):
     alert_type: str
     alert_sources: Optional[List[str]] = []
+    timeout_sec = 43200
 
     FIRE_SOURCE_DEFAULT_PATHS: Dict[str, str] = {
         "viirs": f"s3://{GLOBALS.s3_bucket_data_lake}/nasa_viirs_fire_alerts/v1/vector/epsg-4326/tsv",
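timeout_sec = 43200 gives fire-alert jobs a 12-hour budget (43200 s = 12 × 3600 s). Assuming GeotrellisJob is a pydantic v1 model, as the pydantic~=1.7.2 pin in setup.py suggests, the bare assignment overrides the default of a field annotated on the base class. A minimal sketch of that override (the base default of 3600 is invented for illustration):

    from typing import List, Optional
    from pydantic import BaseModel

    class GeotrellisJob(BaseModel):
        timeout_sec: int = 3600  # hypothetical base default

    class FireAlertsGeotrellisJob(GeotrellisJob):
        alert_type: str
        alert_sources: Optional[List[str]] = []
        timeout_sec = 43200  # 12 hours, overriding the inherited default

    job = FireAlertsGeotrellisJob(alert_type="viirs")
    assert job.timeout_sec == 43200
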
src/setup.py: 0 additions, 1 deletion

@@ -14,7 +14,6 @@
         "google-cloud-storage~=2.1.0",
         "pyshp~=2.1.0",
         "pydantic~=1.7.2",
-        "smart-open~=4.0.1",
         "retry~=0.9.2",
     ], # noqa: E231
 )
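The dependency list drops smart-open~=4.0.1. The surviving pins use the compatible-release operator: pydantic~=1.7.2 means >=1.7.2,<1.8.0, i.e. patch updates only. A sketch of the surrounding setup() call; the package name and version are placeholders, and only install_requires is taken from the diff:

    from setuptools import find_packages, setup

    setup(
        name="datapump",   # assumed from the repository
        version="0.0.0",   # placeholder
        packages=find_packages(),
        install_requires=[
            "google-cloud-storage~=2.1.0",
            "pyshp~=2.1.0",
            "pydantic~=1.7.2",
            "retry~=0.9.2",
        ],  # noqa: E231
    )
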
