feat(openchallenges): add CI workflow for updating the DB dump files on a daily cadence (#2489)
Sage-Bionetworks · Feb 13, 2024 · 73aeb0b · 73aeb0b
1 parent b5c9bce
commit 73aeb0b
Showing 3 changed files with 281 additions and 0 deletions.
diff --git a/.github/workflows/update-oc-db-csv-files.yml b/.github/workflows/update-oc-db-csv-files.yml
@@ -0,0 +1,58 @@
+name: Update OpenChallenges DB files
+
+# Regenerates the OpenChallenges CSV dump files and proposes the result as a
+# pull request on the openchallenges/db-update branch.
+on:
+  schedule:
+    - cron: "0 0 * * *"  # daily at 00:00 UTC
+  workflow_dispatch:
+
+jobs:
+  update:
+    runs-on: ubuntu-latest
+
+    steps:
+
+      # schematic only supports Python 3.9 and 3.10, so we will need
+      # to specifically use one of these versions. (The schematic step
+      # itself is currently commented out further down.)
+      - name: Install system dependencies
+        run: |
+          # -y: never wait for an interactive confirmation on the runner.
+          sudo add-apt-repository -y ppa:deadsnakes/ppa
+          sudo apt-get update
+          sudo apt-get install -y pip python3.10-venv libcurl4-openssl-dev
+
+      - uses: actions/checkout@v4
+        with:
+          # NOTE(review): this workflow is only triggered by `schedule` and
+          # `workflow_dispatch`, so the pull_request context is presumably
+          # empty here and checkout falls back to its default ref. Confirm,
+          # then drop or replace this input.
+          ref: ${{ github.event.pull_request.head.ref }}
+          persist-credentials: false
+          fetch-depth: 0
+
+      # Materialize the service-account key as service_account.json, the
+      # file name update_db_csv.py passes to gspread.service_account().
+      - name: Create Google Client credentials file
+        uses: jsdaniell/create-json@v1.2.3
+        with:
+          name: "service_account.json"
+          json: ${{ secrets.GOOGLE_CLIENT_JSON }}
+
+      # Run from the repository root: the script writes to paths that are
+      # relative to the current working directory.
+      - name: Update dump files
+        run: |
+          python3 -m pip install --upgrade pip
+          pip install gspread pandas numpy
+          python3 apps/openchallenges/db-update/update_db_csv.py
+
+      # - name: Install schematic and validate files
+      #   shell: bash
+      #   run: |
+      #     python3.10 -m venv .venv
+      #     chmod 755 .venv/bin/activate
+      #     source .venv/bin/activate
+      #     pip3.10 install schematicpy
+
+      # Export TODAY (YYYY-MM-DD) for the PR title and commit message below.
+      - name: Get current date
+        run: |
+          echo "TODAY=$(date +"%Y-%m-%d")" >> "$GITHUB_ENV"
+
+      - name: Push changes, then create or update pull request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          title: "chore(openchallenges): ${{ env.TODAY }} DB update"
+          body: Daily OC database update(s)
+          labels: sonar-scan-approved
+          commit-message: "${{ env.TODAY }}: add latest CSV dump files"
+          branch: openchallenges/db-update
diff --git a/.gitignore b/.gitignore
@@ -55,6 +55,7 @@ npm-debug.log
 yarn-error.log
 testem.log
 /typings
+service_account.json
 
 # System Files
 .DS_Store

diff --git a/apps/openchallenges/db-update/update_db_csv.py b/apps/openchallenges/db-update/update_db_csv.py
@@ -0,0 +1,222 @@
+import os
+import csv
+import gspread
+import numpy as np
+import pandas as pd
+
+# Title of the Google Sheets workbook that main() opens as the data source.
+GOOGLE_SHEET_TITLE = "OpenChallenges Data"
+# Destination folders for the generated CSV files. Both are relative paths,
+# so they resolve against the current working directory at run time.
+CHALLENGE_FOLDER = "apps/openchallenges/challenge-service/src/main/resources/db"
+ORGANIZATION_FOLDER = "apps/openchallenges/organization-service/src/main/resources/db"
+
+
+def output_csv(df, output_filename, output_folder="", print_row=False):
+    """Write ``df`` to ``output_folder/output_filename`` as a CSV file.
+
+    Every value is wrapped in double-quotes (``csv.QUOTE_ALL``). The
+    dataframe index is written as the first column only when ``print_row``
+    is true; by default it is left out.
+    """
+    destination = os.path.join(output_folder, output_filename)
+    df.to_csv(destination, index=print_row, quoting=csv.QUOTE_ALL)
+
+
+def _truncate(text, limit, keep):
+    """Return ``text[:keep] + "..."`` when ``text`` is longer than ``limit``."""
+    return text[:keep] + "..." if len(text) > limit else text
+
+
+def _collect_flagged(df, flag_labels, label_column):
+    """Turn boolean flag columns into one row per (challenge, label).
+
+    ``flag_labels`` maps a flag column of ``df`` to the label emitted for
+    every row whose flag equals the string "TRUE". The result has the
+    columns ``challenge_id``, ``created_at`` and ``label_column``, is sorted
+    by challenge and then by the order of the labels in ``flag_labels``,
+    and carries a 1-based index.
+    """
+    flagged = pd.concat(
+        [
+            df[df[flag] == "TRUE"][["id", "created_at"]].assign(
+                **{label_column: label}
+            )
+            for flag, label in flag_labels.items()
+        ]
+    ).rename(columns={"id": "challenge_id"})
+    # A categorical keeps the labels in declaration order when sorting,
+    # instead of alphabetical order.
+    flagged[label_column] = pd.Categorical(
+        flagged[label_column], categories=list(flag_labels.values())
+    )
+    flagged = flagged.sort_values(["challenge_id", label_column])
+    flagged.index = np.arange(1, len(flagged) + 1)
+    return flagged
+
+
+def get_challenge_data(wks, sheet_name="challenges"):
+    """Get challenges data and clean up as needed.
+
+    Reads every record of the ``sheet_name`` worksheet of ``wks`` and
+    derives three dataframes from it.
+
+    Output:
+        - challenges: the challenge columns, with surrounding whitespace
+          stripped, newlines replaced by spaces, over-long headlines and
+          descriptions shortened, and "\\N" written for rows whose
+          ``_platform`` is "Other" and for empty start/end dates
+        - challenge incentives: one row per challenge and enabled incentive
+        - challenge submission types: one row per challenge and enabled
+          submission type
+    """
+    df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
+    df.loc[df._platform == "Other", "platform"] = "\\N"
+
+    challenges = df[
+        [
+            "id",
+            "slug",
+            "name",
+            "headline",
+            "description",
+            "avatar_url",
+            "website_url",
+            "status",
+            "platform",
+            "doi",
+            "start_date",
+            "end_date",
+            "created_at",
+            "updated_at",
+        ]
+    ]
+    # NOTE(review): the last replace() is not a regex replace, so it only
+    # rewrites cells whose whole value is a single quote; quotes inside
+    # longer text are left as-is. Confirm whether that is intended.
+    challenges = (
+        challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
+        .replace(r"\n", " ", regex=True)
+        .replace("'", "''")
+    )
+    challenges["headline"] = (
+        challenges["headline"].astype(str).apply(_truncate, args=(80, 76))
+    )
+    challenges["description"] = (
+        challenges["description"].astype(str).apply(_truncate, args=(1000, 995))
+    )
+    for date_column in ("start_date", "end_date"):
+        challenges.loc[challenges[date_column] == "", date_column] = "\\N"
+
+    # Both derived tables are built from the raw sheet data (df), not from
+    # the cleaned-up challenges frame.
+    incentives = _collect_flagged(
+        df,
+        {
+            "monetary_incentive": "monetary",
+            "publication_incentive": "publication",
+            "speaking_incentive": "speaking_engagement",
+            "other_incentive": "other",
+        },
+        "incentives",
+    )
+    sub_types = _collect_flagged(
+        df,
+        {
+            "file_submission": "prediction_file",
+            "container_submission": "container_image",
+            "notebook_submission": "notebook",
+            "other_submission": "other",
+        },
+        "submission_types",
+    )
+
+    return (
+        challenges,
+        incentives[["incentives", "challenge_id", "created_at"]],
+        sub_types[["submission_types", "challenge_id", "created_at"]],
+    )
+
+
+def get_challenge_categories(wks, sheet_name="challenge_category"):
+    """Return the id, challenge_id and category columns of the category sheet.
+
+    Missing values are replaced with empty strings before the columns are
+    selected.
+    """
+    records = wks.worksheet(sheet_name).get_all_records()
+    categories = pd.DataFrame(records).fillna("")
+    return categories[["id", "challenge_id", "category"]]
+
+
+def get_platform_data(wks, sheet_name="platforms"):
+    """Return the publishable columns of every public platform.
+
+    Only rows whose ``_public`` value is the string "TRUE" are kept.
+    """
+    wanted_columns = [
+        "id",
+        "slug",
+        "name",
+        "avatar_url",
+        "website_url",
+        "created_at",
+        "updated_at",
+    ]
+    records = wks.worksheet(sheet_name).get_all_records()
+    platforms = pd.DataFrame(records).fillna("")
+    public_platforms = platforms[platforms._public == "TRUE"]
+    return public_platforms[wanted_columns]
+
+
+def get_organization_data(wks, sheet_name="organizations"):
+    """Return the cleaned-up rows of every public organization.
+
+    Only rows whose ``_public`` value is the string "TRUE" are kept.
+    Surrounding whitespace is stripped, newlines become spaces, and
+    descriptions longer than 1000 characters are cut to their first 995
+    characters followed by "...".
+    """
+    wanted_columns = [
+        "id",
+        "name",
+        "login",
+        "avatar_url",
+        "website_url",
+        "description",
+        "challenge_count",
+        "created_at",
+        "updated_at",
+        "acronym",
+    ]
+
+    def shorten(text):
+        if len(text) > 1000:
+            return text[:995] + "..."
+        return text
+
+    sheet = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
+    public_rows = sheet[sheet._public == "TRUE"][wanted_columns]
+
+    stripped = public_rows.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
+    single_line = stripped.replace(r"\n", " ", regex=True)
+    organizations = single_line.replace("'", "''")
+
+    organizations["description"] = (
+        organizations["description"].astype(str).apply(shorten)
+    )
+    return organizations
+
+
+def get_roles(wks, sheet_name="contribution_role"):
+    """Return the contribution-role sheet without its helper columns.
+
+    Missing values are replaced with empty strings and the ``_challenge``
+    and ``_organization`` columns are dropped.
+    """
+    records = wks.worksheet(sheet_name).get_all_records()
+    roles = pd.DataFrame(records).fillna("")
+    return roles.drop(columns=["_challenge", "_organization"])
+
+
+def get_edam_terms(wks, sheet_name="edam_terms"):
+    """Return the EDAM terms sheet restricted to its DB columns.
+
+    Missing values are replaced with empty strings before the columns are
+    selected.
+    """
+    wanted_columns = [
+        "id",
+        "edam_id",
+        "name",
+        "subclass_of",
+        "created_at",
+        "updated_at",
+    ]
+    records = wks.worksheet(sheet_name).get_all_records()
+    terms = pd.DataFrame(records).fillna("")
+    return terms[wanted_columns]
+
+
+def get_edam_annotations(wks, sheet_name="challenge_data"):
+    """Return the challenge EDAM-annotation sheet without its helper columns.
+
+    Missing values are replaced with empty strings and the ``_challenge``
+    and ``_edam_name`` columns are dropped.
+    """
+    records = wks.worksheet(sheet_name).get_all_records()
+    annotations = pd.DataFrame(records).fillna("")
+    return annotations.drop(columns=["_challenge", "_edam_name"])
+
+
+def main(gc):
+    """Regenerate the CSV dump files from the Google Sheets workbook.
+
+    ``gc`` is the client used to open the workbook named by
+    GOOGLE_SHEET_TITLE. Platform, category, challenge, incentive and
+    submission-type files go to CHALLENGE_FOLDER, the organization file goes
+    to ORGANIZATION_FOLDER, and the contribution roles are written to both.
+
+    NOTE(review): get_edam_terms() and get_edam_annotations() are not called
+    here, so no EDAM files are produced -- confirm that is intended.
+    """
+    workbook = gc.open(GOOGLE_SHEET_TITLE)
+
+    output_csv(
+        get_platform_data(workbook),
+        "platforms.csv",
+        output_folder=CHALLENGE_FOLDER,
+    )
+
+    # Both services ship their own copy of the contribution roles.
+    roles = get_roles(workbook)
+    for folder in (CHALLENGE_FOLDER, ORGANIZATION_FOLDER):
+        output_csv(roles, "contribution_roles.csv", output_folder=folder)
+
+    output_csv(
+        get_challenge_categories(workbook),
+        "categories.csv",
+        output_folder=CHALLENGE_FOLDER,
+    )
+    output_csv(
+        get_organization_data(workbook),
+        "organizations.csv",
+        output_folder=ORGANIZATION_FOLDER,
+    )
+
+    challenges, incentives, sub_types = get_challenge_data(workbook)
+    output_csv(challenges, "challenges.csv", output_folder=CHALLENGE_FOLDER)
+    # These two tables are written with their 1-based index as first column.
+    indexed_outputs = (
+        (incentives, "incentives.csv"),
+        (sub_types, "submission_types.csv"),
+    )
+    for frame, filename in indexed_outputs:
+        output_csv(
+            frame, filename, output_folder=CHALLENGE_FOLDER, print_row=True
+        )
+
+
+if __name__ == "__main__":
+    # Authenticate with the service-account key file; the relative file name
+    # is resolved against the current working directory.
+    google_client = gspread.service_account(filename="service_account.json")
+    main(google_client)