Merge pull request #140 from gitfrosh/beta

v2.5.0
gitfrosh · Apr 29, 2024 · 35e064d · 35e064d
2 parents 617e1eb + bb1bc34
commit 35e064d
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 31 deletions.
diff --git a/.github/workflows/data-upgrade.yml b/.github/workflows/data-upgrade.yml
@@ -25,23 +25,14 @@ jobs:
     - name: Install pandas
       run: pip install pandas
 
+    - name: Install regex
+      run: pip install regex
 
     - name: Convert CSV to JSON
-      run: |
-          import os
-          import pandas as pd
-
-          os.makedirs('db/json', exist_ok=True)
-          csv_files = [f for f in os.listdir('db/csv') if f.endswith('.csv')]
-
-          for file in csv_files:
-              df = pd.read_csv(f'db/csv/{file}')
-              json_path = f'db/json/{file.replace(".csv", ".json")}'
-              df.to_json(json_path, orient='records', lines=True)
-          print("Conversion complete.")
+      run: python db/convert-csv-to-json.py
 
     - name: Upload JSON files as artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: json-files
-          path: db/json/
+      uses: actions/upload-artifact@v4
+      with:
+        name: json-files
+        path: db/json/
diff --git a/.github/workflows/release-new-data.yml b/.github/workflows/release-new-data.yml
@@ -7,6 +7,8 @@ on:
     branches:
       - main
   pull_request:
+    branches:
+      - main
     paths:
       - 'db/**'
 
@@ -25,25 +27,47 @@ jobs:
         python-version: '3.x'
 
     - name: Install pandas
-        run: pip install pandas
+      run: pip install pandas
 
+    - name: Install regex
+      run: pip install regex
 
     - name: Convert CSV to JSON
-        run: |
-          import os
-          import pandas as pd
-
-          os.makedirs('db/json', exist_ok=True)
-          csv_files = [f for f in os.listdir('db/csv') if f.endswith('.csv')]
-
-          for file in csv_files:
-              df = pd.read_csv(f'db/csv/{file}')
-              json_path = f'db/json/{file.replace(".csv", ".json")}'
-              df.to_json(json_path, orient='records', lines=True)
-          print("Conversion complete.")
+      run: python db/convert-csv-to-json.py
 
     - name: Upload JSON files as artifacts
       uses: actions/upload-artifact@v4
       with:
         name: json-files
-        path: db/json/
+        path: db/json/
+
+  # Imports the JSON artifact produced by the `convert` job into MongoDB.
+  # NOTE(review): this job has no `if:` guard, so it runs for every event that
+  # triggers the workflow (including pull requests). The import script drops
+  # each collection before importing - confirm that this is intended.
+  mongoimport:
+    needs: convert
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so the version stays a string ('6.0', not the float 6).
+        mongodb-version: ['6.0']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download JSON files as artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: json-files
+          path: db/json/
+
+      # The signing key and apt repository follow the matrix version instead
+      # of a hard-coded release, and the key is stored in a dedicated keyring
+      # referenced via `signed-by` rather than added with deprecated `apt-key`.
+      # NOTE(review): confirm the MongoDB repo publishes this version for the
+      # Ubuntu codename that `ubuntu-latest` currently resolves to.
+      - name: Install MongoDB Tools
+        env:
+          MONGODB_VERSION: ${{ matrix.mongodb-version }}
+        run: |
+          keyring="/usr/share/keyrings/mongodb-server-${MONGODB_VERSION}.gpg"
+          wget -qO - "https://www.mongodb.org/static/pgp/server-${MONGODB_VERSION}.asc" | sudo gpg --dearmor -o "$keyring"
+          echo "deb [ arch=amd64,arm64 signed-by=${keyring} ] https://repo.mongodb.org/apt/ubuntu $(lsb_release -sc)/mongodb-org/${MONGODB_VERSION} multiverse" | sudo tee "/etc/apt/sources.list.d/mongodb-org-${MONGODB_VERSION}.list"
+          sudo apt-get update
+          sudo apt-get install -y mongodb-database-tools
+
+      - name: Import to MongoDB
+        run: |
+          chmod +x ./db/import_json_to_mongo.sh
+          ./db/import_json_to_mongo.sh
+        shell: bash
+        env:
+          MONGODB_URI: ${{ secrets.MONGODB_URI }}
diff --git a/db/convert-csv-to-json.py b/db/convert-csv-to-json.py
@@ -0,0 +1,32 @@
+import os
+import pandas as pd
+import json
+import regex as re
+
+def transform_objectid(text):
+    """Rewrite ``ObjectId(<id>)`` references into ``{"$oid": "<id>"}`` form.
+
+    Args:
+        text: String that may contain one or more ``ObjectId(...)`` references.
+
+    Returns:
+        ``text`` with every ``ObjectId(<id>)`` occurrence replaced by
+        ``{"$oid": "<id>"}``; any other content is returned unchanged.
+    """
+    # The capturing group holds the id between the parentheses. A single
+    # substitution pass rewrites every match, instead of collecting the ids
+    # first and rescanning the whole text once per id.
+    pattern = r'ObjectId\(([^)]+)\)'
+    return re.sub(pattern, lambda m: f'{{"$oid": "{m.group(1)}"}}', text)
+
+def _parse_json_cell(value):
+    """Return the parsed JSON for a cell that holds a JSON object, else the cell unchanged."""
+    if isinstance(value, str) and value.startswith('{'):
+        try:
+            return json.loads(value)
+        except ValueError:
+            # Free text that merely starts with '{' is kept as a plain string
+            # instead of aborting the whole conversion.
+            return value
+    return value
+
+def main():
+    """Convert every CSV file in db/csv into a JSON array file in db/json.
+
+    String cells have their ObjectId references rewritten by
+    ``transform_objectid``; cells that then hold a JSON object are stored as
+    nested objects rather than strings. Missing values are left untouched.
+    """
+    os.makedirs('db/json', exist_ok=True)  # Ensure the directory for JSON files exists
+    csv_files = [f for f in os.listdir('db/csv') if f.endswith('.csv')]
+
+    for file in csv_files:
+        df = pd.read_csv(f'db/csv/{file}')
+        # Rewrite ObjectId references and parse the resulting JSON objects in
+        # one pass over each string (object-dtype) column.
+        for column in df.select_dtypes(include=['object']):
+            df[column] = df[column].apply(
+                lambda x: _parse_json_cell(transform_objectid(str(x))) if pd.notna(x) else x
+            )
+        # Swap only the trailing extension; str.replace would also rewrite a
+        # '.csv' that appears in the middle of the file name.
+        json_path = f'db/json/{os.path.splitext(file)[0]}.json'
+        # Save each dataframe as a JSON file with all objects in a single array
+        df.to_json(json_path, orient='records', indent=4)
+
+if __name__ == "__main__":
+    main()
diff --git a/db/csv/quotes.csv b/db/csv/quotes.csv
@@ -1,5 +1,5 @@
 dialog,movie,character,_id
-Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7e9)
+Deagol!!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7e9)
 Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7ea)
 Deagol!,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7eb)
 Give us that! Deagol my love,ObjectId(5cd95395de30eff6ebccde5d),ObjectId(5cd99d4bde30eff6ebccfe9e),ObjectId(5cd96e05de30eff6ebcce7ec)

diff --git a/db/import_json_to_mongo.sh b/db/import_json_to_mongo.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Script to import JSON files to MongoDB collections.
+# Every db/json/*.json file (a JSON array) is imported into the deployment at
+# $MONGODB_URI, into the collection named after the file without its .json
+# extension. Each collection is dropped before its import.
+
+# Stop at the first failing command so a failed import fails the whole run
+# instead of being masked by a later successful one.
+set -euo pipefail
+
+# Fail early with a clear message when the connection string is missing.
+: "${MONGODB_URI:?MONGODB_URI must be set}"
+
+# Without nullglob an unmatched pattern would reach mongoimport as a literal
+# file name.
+shopt -s nullglob
+files=(db/json/*.json)
+if [ "${#files[@]}" -eq 0 ]; then
+    echo "No JSON files found in db/json" >&2
+    exit 1
+fi
+
+for file in "${files[@]}"; do
+    # Extract the collection name from the filename
+    collection=$(basename "$file" .json)
+    echo "Importing $file to collection $collection"
+    # Run mongoimport command (collection name quoted against word splitting)
+    mongoimport --type json --uri "$MONGODB_URI" --collection "$collection" --file "$file" --drop --maintainInsertionOrder --jsonArray
+done