From 76e1ade2e0dfcfd0634c059edeef032dc9032e60 Mon Sep 17 00:00:00 2001
From: Rike
Date: Mon, 29 Apr 2024 10:45:14 +0200
Subject: [PATCH 1/2] Feature/db ci (#139)

chore: init pipelines for db data updates
---
Review notes (ignored by git am, not part of the commit message):
- The line breaks and indentation of this series had been lost. They are
  reconstructed from the hunk line counts and from YAML/Python/shell syntax.
  Whitespace-only differences (e.g. the removed and re-added
  "run: pip install pandas" line) could not be recovered - verify against
  the repository before applying.
- db/convert-csv-to-json.py and db/import_json_to_mongo.sh were hardened
  (see the comments in those files); the blob ids on their "index" lines
  are therefore stale.
- NOTE(review): release-new-data.yml also triggers on pull_request and the
  mongoimport job shows no condition, so "mongoimport --drop" appears to
  run before merge - confirm this is intended.
- NOTE(review): the tools step adds the MongoDB 5.0 apt repository via
  apt-key while the matrix declares mongodb-version '6.0', which no visible
  step reads - confirm.

 .github/workflows/data-upgrade.yml     | 23 ++++--------
 .github/workflows/release-new-data.yml | 50 ++++++++++++++++++--------
 db/convert-csv-to-json.py              | 46 ++++++++++++++++++++++++
 db/csv/quotes.csv                      |  2 +-
 db/import_json_to_mongo.sh             | 18 +++++++++
 5 files changed, 108 insertions(+), 31 deletions(-)
 create mode 100644 db/convert-csv-to-json.py
 create mode 100644 db/import_json_to_mongo.sh

diff --git a/.github/workflows/data-upgrade.yml b/.github/workflows/data-upgrade.yml
index bf20a9f..e32994f 100644
--- a/.github/workflows/data-upgrade.yml
+++ b/.github/workflows/data-upgrade.yml
@@ -25,23 +25,14 @@ jobs:
 
       - name: Install pandas
         run: pip install pandas
+      - name: Install regex
+        run: pip install regex
 
       - name: Convert CSV to JSON
-        run: |
-          import os
-          import pandas as pd
-
-          os.makedirs('db/json', exist_ok=True)
-          csv_files = [f for f in os.listdir('db/csv') if f.endswith('.csv')]
-
-          for file in csv_files:
-              df = pd.read_csv(f'db/csv/{file}')
-              json_path = f'db/json/{file.replace(".csv", ".json")}'
-              df.to_json(json_path, orient='records', lines=True)
-          print("Conversion complete.")
+        run: python db/convert-csv-to-json.py
 
       - name: Upload JSON files as artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: json-files
-          path: db/json/
\ No newline at end of file
+        uses: actions/upload-artifact@v4
+        with:
+          name: json-files
+          path: db/json/
\ No newline at end of file
diff --git a/.github/workflows/release-new-data.yml b/.github/workflows/release-new-data.yml
index 26a5f3d..0a35c9a 100644
--- a/.github/workflows/release-new-data.yml
+++ b/.github/workflows/release-new-data.yml
@@ -7,6 +7,8 @@ on:
     branches:
       - main
   pull_request:
+    branches:
+      - main
     paths:
       - 'db/**'
 
@@ -25,25 +27,45 @@ jobs:
           python-version: '3.x'
 
 
       - name: Install pandas
-        run: pip install pandas
+        run: pip install pandas
 
       - name: Convert CSV to JSON
-        run: |
-          import os
-          import pandas as pd
-
-          os.makedirs('db/json', exist_ok=True)
-          csv_files = [f for f in os.listdir('db/csv') if f.endswith('.csv')]
-
-          for file in csv_files:
-              df = pd.read_csv(f'db/csv/{file}')
-              json_path = f'db/json/{file.replace(".csv", ".json")}'
-              df.to_json(json_path, orient='records', lines=True)
-          print("Conversion complete.")
+        run: python db/convert-csv-to-json.py
 
       - name: Upload JSON files as artifacts
         uses: actions/upload-artifact@v4
         with:
           name: json-files
-          path: db/json/
\ No newline at end of file
+          path: db/json/
+
+  mongoimport:
+    needs: convert
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        mongodb-version: ['6.0']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download JSON files as artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: json-files
+          path: db/json/
+
+      - name: Install MongoDB Tools
+        run: |
+          wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | sudo apt-key add -
+          echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu $(lsb_release -sc)/mongodb-org/5.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-5.0.list
+          sudo apt-get update
+          sudo apt-get install -y mongodb-database-tools
+
+      - name: Import to MongoDB
+        run: |
+          chmod +x ./db/import_json_to_mongo.sh
+          ./db/import_json_to_mongo.sh
+        shell: bash
+        env:
+          MONGODB_URI: ${{ secrets.MONGODB_URI }}
\ No newline at end of file
diff --git a/db/convert-csv-to-json.py b/db/convert-csv-to-json.py
new file mode 100644
index 0000000..327e6dd
--- /dev/null
+++ b/db/convert-csv-to-json.py
@@ -0,0 +1,46 @@
+import os
+import pandas as pd
+import json
+import regex as re
+
+# Matches a MongoDB shell-style reference such as ObjectId(5cd95395de30eff6ebccde5d).
+OBJECTID_PATTERN = re.compile(r'ObjectId\(([^)]+)\)')
+
+
+def transform_objectid(text):
+    """Replace MongoDB ObjectId references with Extended JSON {"$oid": ...} objects."""
+    # One substitution pass; the captured id is re-emitted via the \1 backreference.
+    return OBJECTID_PATTERN.sub(r'{"$oid": "\1"}', text)
+
+
+def parse_cell(value):
+    """Convert one CSV cell into the value that is written to the JSON file."""
+    if pd.isna(value):
+        return value
+    text = transform_objectid(str(value))
+    if not text.startswith('{'):
+        return text
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        # Free text that merely starts with "{" must not abort the whole conversion.
+        return text
+
+
+def main():
+    """Convert every db/csv/*.csv file into a JSON array file under db/json/."""
+    os.makedirs('db/json', exist_ok=True)  # Ensure the directory for JSON files exists
+    csv_files = sorted(f for f in os.listdir('db/csv') if f.endswith('.csv'))
+
+    for file in csv_files:
+        df = pd.read_csv(os.path.join('db/csv', file))
+        # Transform all string columns that may contain ObjectId references
+        for column in df.select_dtypes(include=['object']):
+            df[column] = df[column].apply(parse_cell)
+        # Save each dataframe as a JSON file with all objects in a single array
+        json_path = os.path.join('db/json', os.path.splitext(file)[0] + '.json')
+        df.to_json(json_path, orient='records', indent=4)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/db/csv/quotes.csv b/db/csv/quotes.csv
index 22db7e7..5295bd6 100644
--- a/db/csv/quotes.csv
+++ b/db/csv/quotes.csv
@@ -1,5 +1,5 @@
 dialog,movie,character,_id
-Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7e9)
+Deagol!!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7e9)
 Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7ea)
 Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7eb)
 Give us that! Deagol my love,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7ec)
diff --git a/db/import_json_to_mongo.sh b/db/import_json_to_mongo.sh
new file mode 100644
index 0000000..204bc51
--- /dev/null
+++ b/db/import_json_to_mongo.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -euo pipefail
+
+# Script to import JSON files to MongoDB collections
+
+# Abort with a clear message rather than passing an empty --uri to mongoimport
+: "${MONGODB_URI:?MONGODB_URI must be set}"
+
+# Skip the loop when no JSON file exists instead of importing the literal pattern
+shopt -s nullglob
+
+for file in db/json/*.json; do
+  # Extract the collection name from the filename
+  collection=$(basename "$file" .json)
+  echo "Importing $file to collection $collection"
+  # Run mongoimport command
+  mongoimport --type json --uri "$MONGODB_URI" --collection "$collection" --file "$file" --drop --maintainInsertionOrder --jsonArray
+done

From bb1bc34cff20bfa68679aec7ca7785372384b563 Mon Sep 17 00:00:00 2001
From: Rike
Date: Mon, 29 Apr 2024 14:44:04 +0200
Subject: [PATCH 2/2] fix: fix release job

---
 .github/workflows/release-new-data.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/release-new-data.yml b/.github/workflows/release-new-data.yml
index 0a35c9a..5bb2fb9 100644
--- a/.github/workflows/release-new-data.yml
+++ b/.github/workflows/release-new-data.yml
@@ -29,6 +29,8 @@ jobs:
 
       - name: Install pandas
         run: pip install pandas
+      - name: Install regex
+        run: pip install regex
 
       - name: Convert CSV to JSON
         run: python db/convert-csv-to-json.py