From abb685c0ebe3c9245a6c220e2805f66554a73ab2 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Thu, 24 Oct 2024 18:26:35 +0200
Subject: [PATCH 1/4] Add GitHub Action for build and publish

---
 .github/workflows/release.yml | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 .github/workflows/release.yml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..3814680
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,70 @@
+name: Release on Google Cloud Run
+
+on:
+  push:
+    branches:
+      - main
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: verifier-alliance/parquet-export
+  GCP_PROJECT: verifier-alliance
+  WORKLOAD_IDENTITY_PROVIDER: projects/1064646032521/locations/global/workloadIdentityPools/github-actions/providers/vera-github-actions
+  SERVICE_ACCOUNT: vera-cloud-run-deployer@verifier-alliance.iam.gserviceaccount.com
+  JOB_NAME: parquet-export-${{ github.ref_name }}
+  REGION: europe-west3
+  GCP_REPOSITORY: ghcr
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+      id-token: write
+      attestations: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: true
+
+      - name: Authenticate to GCP
+        uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ env.SERVICE_ACCOUNT }}
+
+      - id: deploy
+        uses: google-github-actions/deploy-cloudrun@v2
+        with:
+          job: ${{ env.JOB_NAME }}
+          region: ${{ env.REGION }}
+          image: ${{ env.REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT }}/${{ env.GCP_REPOSITORY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}@${{ steps.push.outputs.digest }}
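Note: the deploy step above pins the Cloud Run job image by digest, so the job runs exactly the image that was built and attested earlier in the workflow even if the branch tag is later moved. A minimal Python sketch of how that image reference is assembled (values mirror the workflow's env block; the digest is a placeholder for steps.push.outputs.digest, and the `ghcr` repository is presumably an Artifact Registry remote repository mirroring ghcr.io):

    # Illustrative only -- mirrors the expression in the deploy step.
    region = "europe-west3"
    project = "verifier-alliance"
    repository = "ghcr"  # assumed: Artifact Registry remote repo proxying ghcr.io
    image_name = "verifier-alliance/parquet-export"
    ref_name = "main"  # github.ref_name
    digest = "sha256:..."  # placeholder for steps.push.outputs.digest

    # The tag part is informational; the digest pins the exact image deployed.
    image = f"{region}-docker.pkg.dev/{project}/{repository}/{image_name}:{ref_name}@{digest}"
    print(image)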
From 1e91533dcdbac97c1d62c96862511a0497b50e08 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Mon, 12 Aug 2024 17:42:24 +0200
Subject: [PATCH 2/4] Update config.py for database schema changes

---
 config.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/config.py b/config.py
index 50c2396..51beea2 100644
--- a/config.py
+++ b/config.py
@@ -6,7 +6,12 @@
         'name': 'code',
         'datatypes': {
             'code_hash': 'object',
-            'code': 'object'
+            'code': 'object',
+            'code_hash_keccak': 'object',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 10000,
         'num_chunks_per_file': 10
@@ -16,7 +21,11 @@
         'datatypes': {
             'id': 'string',
             'creation_code_hash': 'object',
-            'runtime_code_hash': 'object'
+            'runtime_code_hash': 'object',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10
@@ -31,7 +40,11 @@
             'block_number': 'Int64',
             'transaction_index': 'Int32',
             'deployer': 'object',
-            'contract_id': 'string'
+            'contract_id': 'string',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10
@@ -75,7 +88,9 @@
             'creation_transformations': 'json',
             'runtime_match': 'bool',
             'runtime_values': 'json',
-            'runtime_transformations': 'json'
+            'runtime_transformations': 'json',
+            'runtime_metadata_match': 'bool',
+            'creation_metadata_match': 'bool'
         },
         'chunk_size': 100000,
         'num_chunks_per_file': 10

From 550c8cc4dc9eb1294adef0e8ce975264b2ea4395 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Fri, 25 Oct 2024 16:48:15 +0200
Subject: [PATCH 3/4] Update config for schema changes (sources in separate
 table)

---
 config.py | 28 ++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/config.py b/config.py
index 51beea2..326d566 100644
--- a/config.py
+++ b/config.py
@@ -62,7 +62,6 @@
             'language': 'string',
             'name': 'string',
             'fully_qualified_name': 'string',
-            'sources': 'json',
             'compiler_settings': 'json',
             'compilation_artifacts': 'json',
             'creation_code_hash': 'object',
@@ -71,7 +70,32 @@
             'runtime_code_artifacts': 'json'
         },
         'chunk_size': 1000,
-        'num_chunks_per_file': 5
+        'num_chunks_per_file': 10
+    },
+    {
+        'name': 'compiled_contracts_sources',
+        'datatypes': {
+            'id': 'string',
+            'compilation_id': 'string',
+            'source_hash': 'object',
+            'path': 'string'
+        },
+        'chunk_size': 100000,
+        'num_chunks_per_file': 10
+    },
+    {
+        'name': 'sources',
+        'datatypes': {
+            'source_hash': 'object',
+            'source_hash_keccak': 'object',
+            'content': 'string',
+            'created_at': 'datetime64[ns]',
+            'updated_at': 'datetime64[ns]',
+            'created_by': 'string',
+            'updated_by': 'string'
+        },
+        'chunk_size': 1000,
+        'num_chunks_per_file': 10
     },
     {
         'name': 'verified_contracts',
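Note: patch 3 normalizes source files out of compiled_contracts: the embedded 'sources' JSON column is replaced by a sources table keyed by source_hash plus a compiled_contracts_sources join table (compilation_id, source_hash, path). A hedged sketch of how a consumer of the exported files could rebuild the old path-to-content mapping; the file names below are hypothetical instances of the exporter's <table>_<start>_<end>_zstd.parquet naming:

    import pandas as pd

    # Hypothetical chunk file names -- adjust to the actual exported ranges.
    links = pd.read_parquet("compiled_contracts_sources_0_100000_zstd.parquet")
    sources = pd.read_parquet("sources_0_1000_zstd.parquet")

    # path -> content per compilation, mirroring the old embedded JSON shape.
    merged = links.merge(sources[["source_hash", "content"]], on="source_hash")
    per_compilation = merged.groupby("compilation_id").apply(
        lambda g: dict(zip(g["path"], g["content"]))
    )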
From 979b78a5ceb71ea9e14e5fcf06d17680455229d9 Mon Sep 17 00:00:00 2001
From: Manuel Wedler
Date: Thu, 5 Sep 2024 12:46:16 +0200
Subject: [PATCH 4/4] Declare schema for pyarrow table

Fixes a problem where the column type was not inferable because all
entries had a NULL value for that column
---
 main.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 41f961d..801a639 100644
--- a/main.py
+++ b/main.py
@@ -130,6 +130,28 @@ def process_df(df, dtypes):
             df[col] = df[col].astype(pd.UInt16Dtype() if dtype == 'UInt16' else dtype)
     return df
 
+def get_pyarrow_type(dt):
+    match dt:
+        case 'bool':
+            return pa.bool_()
+        case 'Int32':
+            return pa.int32()
+        case 'Int64':
+            return pa.int64()
+        case 'string':
+            return pa.string()
+        case 'object':
+            return pa.binary()
+        case 'datetime64[ns]':
+            return pa.timestamp('ns')
+        case 'json':
+            return pa.string()
+        case _:
+            raise ValueError(f"Type not supported: {dt}")
+
+def get_pyarrow_schema(dtypes):
+    return pa.schema([pa.field(col, get_pyarrow_type(dt)) for col, dt in dtypes.items()])
+
 def upload_to_s3(file_path, bucket_name, object_name):
     logger.info(f"Uploading {object_name} to S3")
     if os.getenv("DEBUG"):
@@ -156,6 +178,7 @@
 def fetch_and_write(table_config, engine):
     table_name = table_config['name']
     dtypes = table_config['datatypes']
+    schema = get_pyarrow_schema(dtypes)
     chunk_size = table_config['chunk_size']
     if os.getenv('DEBUG'):
         logger.debug(f"DEBUG: Setting chunk_size to 1/100 of {chunk_size} = {chunk_size // 100}")
@@ -191,7 +214,7 @@
 
             logger.info(f"Processed chunk {chunk_counter} of file {file_counter}")
             logger.info(f"DataFrame size: {df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")
-            chunk_table = pa.Table.from_pandas(df)  # Convert the dataframe to a PyArrow table
+            chunk_table = pa.Table.from_pandas(df, schema=schema)  # Convert the dataframe to a PyArrow table
 
             if writer is None:
                 # file name: contracts_0_10000_zstd.parquet, contracts_10000_20000_zstd.parquet, etc.
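Note: the declared schema matters because pandas represents an all-NULL column with the object dtype, and pyarrow then infers the unusable Arrow type 'null' for it. A self-contained sketch of the failure mode this patch fixes (assumes only pandas and pyarrow; the column names are illustrative):

    import pandas as pd
    import pyarrow as pa

    # A chunk in which every row has NULL for 'runtime_match'.
    df = pd.DataFrame({
        "id": pd.Series(["a", "b"], dtype="string"),
        "runtime_match": pd.Series([None, None], dtype="object"),
    })

    # Without a declared schema, the column type is inferred as 'null'.
    print(pa.Table.from_pandas(df).schema.field("runtime_match").type)  # null

    # Declaring the schema up front, as get_pyarrow_schema() now does,
    # keeps the column typed consistently as bool across every chunk file.
    schema = pa.schema([
        pa.field("id", pa.string()),
        pa.field("runtime_match", pa.bool_()),
    ])
    print(pa.Table.from_pandas(df, schema=schema).schema.field("runtime_match").type)  # bool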