From abb685c0ebe3c9245a6c220e2805f66554a73ab2 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Thu, 24 Oct 2024 18:26:35 +0200
Subject: [PATCH 1/4] Add GitHub Action for build and publish

---
 .github/workflows/release.yml | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 .github/workflows/release.yml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..3814680
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,70 @@
+name: Release on Google Cloud Run
+
+on:
+  push:
+    branches:
+      - main
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: verifier-alliance/parquet-export
+  GCP_PROJECT: verifier-alliance
+  WORKLOAD_IDENTITY_PROVIDER: projects/1064646032521/locations/global/workloadIdentityPools/github-actions/providers/vera-github-actions
+  SERVICE_ACCOUNT: vera-cloud-run-deployer@verifier-alliance.iam.gserviceaccount.com
+  JOB_NAME: parquet-export-${{ github.ref_name }}
+  REGION: europe-west3
+  GCP_REPOSITORY: ghcr
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+      id-token: write
+      attestations: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: true
+
+      - name: Authenticate to GCP
+        uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ env.SERVICE_ACCOUNT }}
+
+      - id: deploy
+        uses: google-github-actions/deploy-cloudrun@v2
+        with:
+          job: ${{ env.JOB_NAME }}
+          region: ${{ env.REGION }}
+          image: ${{ env.REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT }}/${{ env.GCP_REPOSITORY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}@${{ steps.push.outputs.digest }}
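Note: the deploy step above pins the Cloud Run job image by digest, so the job runs exactly the image that was built and attested earlier in the workflow even if the branch tag is later moved. A minimal Python sketch of how that image reference is assembled (values mirror the workflow's env block; the digest is a placeholder for steps.push.outputs.digest, and the `ghcr` repository is presumably an Artifact Registry remote repository mirroring ghcr.io):

    # Illustrative only -- mirrors the expression in the deploy step.
    region = "europe-west3"
    project = "verifier-alliance"
    repository = "ghcr"  # assumed: Artifact Registry remote repo proxying ghcr.io
    image_name = "verifier-alliance/parquet-export"
    ref_name = "main"  # github.ref_name
    digest = "sha256:..."  # placeholder for steps.push.outputs.digest

    # The tag part is informational; the digest pins the exact image deployed.
    image = f"{region}-docker.pkg.dev/{project}/{repository}/{image_name}:{ref_name}@{digest}"
    print(image)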
From 1e91533dcdbac97c1d62c96862511a0497b50e08 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Mon, 12 Aug 2024 17:42:24 +0200
Subject: [PATCH 2/4] Update config.py for database schema changes

---
 config.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/config.py b/config.py
index 50c2396..51beea2 100644
--- a/config.py
+++ b/config.py
@@ -6,7 +6,12 @@
         'name': 'code',
         'datatypes': {
             'code_hash': 'object',
-            'code': 'object'
+            'code': 'object',
+            'code_hash_keccak': 'object',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 10000,
         'num_chunks_per_file': 10
@@ -16,7 +21,11 @@
         'datatypes': {
             'id': 'string',
             'creation_code_hash': 'object',
-            'runtime_code_hash': 'object'
+            'runtime_code_hash': 'object',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10
@@ -31,7 +40,11 @@
             'block_number': 'Int64',
             'transaction_index': 'Int32',
             'deployer': 'object',
-            'contract_id': 'string'
+            'contract_id': 'string',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10
@@ -75,7 +88,9 @@
             'creation_transformations': 'json',
             'runtime_match': 'bool',
             'runtime_values': 'json',
-            'runtime_transformations': 'json'
+            'runtime_transformations': 'json',
+            'runtime_metadata_match': 'bool',
+            'creation_metadata_match': 'bool'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10

From 550c8cc4dc9eb1294adef0e8ce975264b2ea4395 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Fri, 25 Oct 2024 16:48:15 +0200
Subject: [PATCH 3/4] Update config for schema changes (sources in separate
 table)

---
 config.py | 28 ++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/config.py b/config.py
index 51beea2..326d566 100644
--- a/config.py
+++ b/config.py
@@ -62,7 +62,6 @@
             'language': 'string',
             'name': 'string',
             'fully_qualified_name': 'string',
-            'sources': 'json',
             'compiler_settings': 'json',
             'compilation_artifacts': 'json',
             'creation_code_hash': 'object',
@@ -71,7 +70,32 @@
             'runtime_code_artifacts': 'json'
         },
         'chunk_size': 1000,
-        'num_chunks_per_file': 5
+        'num_chunks_per_file': 10
+    },
+    {
+        'name': 'compiled_contracts_sources',
+        'datatypes': {
+            'id': 'string',
+            'compilation_id': 'string',
+            'source_hash': 'object',
+            'path': 'string'
+        },
+        'chunk_size': 100000,
+        'num_chunks_per_file': 10
+    },
+    {
+        'name': 'sources',
+        'datatypes': {
+            'source_hash': 'object',
+            'source_hash_keccak': 'object',
+            'content': 'string',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
+        },
+        'chunk_size': 1000,
+        'num_chunks_per_file': 10
     },
     {
         'name': 'verified_contracts',
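Note: patch 3 normalizes source files out of compiled_contracts: the embedded 'sources' JSON column is replaced by a sources table keyed by source_hash plus a compiled_contracts_sources join table (compilation_id, source_hash, path). A hedged sketch of how a consumer of the exported files could rebuild the old path-to-content mapping; the file names below are hypothetical instances of the exporter's <table>_<start>_<end>_zstd.parquet naming:

    import pandas as pd

    # Hypothetical chunk file names -- adjust to the actual exported ranges.
    links = pd.read_parquet("compiled_contracts_sources_0_100000_zstd.parquet")
    sources = pd.read_parquet("sources_0_1000_zstd.parquet")

    # path -> content per compilation, mirroring the old embedded JSON shape.
    merged = links.merge(sources[["source_hash", "content"]], on="source_hash")
    per_compilation = merged.groupby("compilation_id").apply(
        lambda g: dict(zip(g["path"], g["content"]))
    )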
From 979b78a5ceb71ea9e14e5fcf06d17680455229d9 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Thu, 5 Sep 2024 12:46:16 +0200
Subject: [PATCH 4/4] Declare schema for pyarrow table

Fixes a problem where the column type was not inferable because all
entries had a NULL value for that column
---
 main.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 41f961d..801a639 100644
--- a/main.py
+++ b/main.py
@@ -130,6 +130,28 @@ def process_df(df, dtypes):
             df[col] = df[col].astype(pd.UInt16Dtype() if dtype == 'UInt16' else dtype)
     return df
 
+def get_pyarrow_type(dt):
+    match dt:
+        case 'bool':
+            return pa.bool_()
+        case 'Int32':
+            return pa.int32()
+        case 'Int64':
+            return pa.int64()
+        case 'string':
+            return pa.string()
+        case 'object':
+            return pa.binary()
+        case 'datetime64[ns]':
+            return pa.timestamp('ns')
+        case 'json':
+            return pa.string()
+        case _:
+            raise ValueError(f"Type not supported: {dt}")
+
+def get_pyarrow_schema(dtypes):
+    return pa.schema([pa.field(col, get_pyarrow_type(dt)) for col, dt in dtypes.items()])
+
 def upload_to_s3(file_path, bucket_name, object_name):
     logger.info(f"Uploading {object_name} to S3")
     if os.getenv("DEBUG"):
@@ -156,6 +178,7 @@
 def fetch_and_write(table_config, engine):
     table_name = table_config['name']
     dtypes = table_config['datatypes']
+    schema = get_pyarrow_schema(dtypes)
     chunk_size = table_config['chunk_size']
     if os.getenv('DEBUG'):
         logger.debug(f"DEBUG: Setting chunk_size to 1/100 of {chunk_size} = {chunk_size // 100}")
@@ -191,7 +214,7 @@
 
             logger.info(f"Processed chunk {chunk_counter} of file {file_counter}")
             logger.info(f"DataFrame size: {df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")
-            chunk_table = pa.Table.from_pandas(df)  # Convert the dataframe to a PyArrow table
+            chunk_table = pa.Table.from_pandas(df, schema=schema)  # Convert the dataframe to a PyArrow table
 
             if writer is None:
                 # file name: contracts_0_10000_zstd.parquet, contracts_10000_20000_zstd.parquet, etc.
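Note: the declared schema matters because pandas represents an all-NULL column with the object dtype, and pyarrow then infers the unusable Arrow type 'null' for it. A self-contained sketch of the failure mode this patch fixes (assumes only pandas and pyarrow; the column names are illustrative):

    import pandas as pd
    import pyarrow as pa

    # A chunk in which every row has NULL for 'runtime_match'.
    df = pd.DataFrame({
        "id": pd.Series(["a", "b"], dtype="string"),
        "runtime_match": pd.Series([None, None], dtype="object"),
    })

    # Without a declared schema, the column type is inferred as 'null'.
    print(pa.Table.from_pandas(df).schema.field("runtime_match").type)  # null

    # Declaring the schema up front, as get_pyarrow_schema() now does,
    # keeps the column typed consistently as bool across every chunk file.
    schema = pa.schema([
        pa.field("id", pa.string()),
        pa.field("runtime_match", pa.bool_()),
    ])
    print(pa.Table.from_pandas(df, schema=schema).schema.field("runtime_match").type)  # bool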