cpdat installations weren't working, so I updated the build stage and pushed
biobricks-ai · Sep 11, 2024 · f10d8d6 · f10d8d6
1 parent fe7ac5a
commit f10d8d6
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 58 deletions.
diff --git a/dvc.lock b/dvc.lock
@@ -1,47 +1,47 @@
----
 schema: '2.0'
 stages:
   build:
-    cmd: stages/03_build.sh
+    cmd: python stages/03_build.py
     deps:
-    - md5: 75ab6120bac0a1065986912e5e5eb79b.dir
-      nfiles: 11
-      path: raw
+    - path: raw
+      md5: 75ab6120bac0a1065986912e5e5eb79b.dir
       size: 572654300
-    - md5: ad705d3aff32f182707d7aa88a87fd18
-      path: stages/03_build.sh
-      size: 410
-    - md5: 9fe6f058abb56acd3f330ec3d1e34e03
-      path: stages/csv2parquet.py
-      size: 660
+      nfiles: 11
+    - path: stages/03_build.py
+      hash: md5
+      md5: d24fc4f447d1dee7a672ee735efef1f7
+      size: 777
     outs:
-    - md5: 92fec7f055de167dfabdea546b4fa407.dir
+    - path: brick
+      hash: md5
+      md5: fbc4fa7b30c3b0fa89954d6d14dfccd4.dir
+      size: 153212127
       nfiles: 10
-      path: brick
-      size: 155500050
   download:
     cmd: stages/01_download.sh
     deps:
-    - md5: 9ad7abc4441bb03db746763323943f66
-      path: stages/01_download.sh
+    - path: stages/01_download.sh
+      md5: 9ad7abc4441bb03db746763323943f66
       size: 558
+      isexec: true
     outs:
-    - md5: 4f1c0c762afa84306d347f4b366b11d2.dir
-      nfiles: 1
-      path: download
+    - path: download
+      md5: 4f1c0c762afa84306d347f4b366b11d2.dir
       size: 93725827
+      nfiles: 1
   unzip:
     cmd: stages/02_unzip.sh
     deps:
-    - md5: 4f1c0c762afa84306d347f4b366b11d2.dir
-      nfiles: 1
-      path: download
+    - path: download
+      md5: 4f1c0c762afa84306d347f4b366b11d2.dir
       size: 93725827
-    - md5: d0e1b99ce0d7915c3becc3fa210348d2
-      path: stages/02_unzip.sh
+      nfiles: 1
+    - path: stages/02_unzip.sh
+      md5: d0e1b99ce0d7915c3becc3fa210348d2
       size: 466
+      isexec: true
     outs:
-    - md5: 75ab6120bac0a1065986912e5e5eb79b.dir
-      nfiles: 11
-      path: raw
+    - path: raw
+      md5: 75ab6120bac0a1065986912e5e5eb79b.dir
       size: 572654300
+      nfiles: 11
diff --git a/dvc.yaml b/dvc.yaml
@@ -1,13 +1,3 @@
-# Brick DVC stages
-# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml
-
-# The complete process can be executed using:
-# dvc repro
-# If you want to force redoing the process use 
-# dvc repro -f
-# Individual stage can be executed using: 
-# dvc repro <stage>
-
 stages:
   download:
     cmd: stages/01_download.sh
@@ -23,10 +13,9 @@ stages:
     outs:
       - raw
   build: 
-    cmd: stages/03_build.sh
+    cmd: python stages/03_build.py
     deps:
-      - stages/03_build.sh
-      - stages/csv2parquet.py
+      - stages/03_build.py
       - raw
     outs:
       - brick
diff --git a/stages/03_build.py b/stages/03_build.py
@@ -0,0 +1,31 @@
+"""Build stage: convert every CSV in ./raw into a Parquet file in ./brick."""
+import os
+import pandas as pd
+from pathlib import Path
+
+raw_dir = Path("./raw")
+brick_dir = Path("./brick")
+brick_dir.mkdir(exist_ok=True)
+
+csv_files = sorted(raw_dir.glob("*.csv"))  # sorted: same processing order on every run
+
+for csv_file in csv_files:
+    try:
+        df = pd.read_csv(csv_file, encoding='utf-8', low_memory=False)
+    except UnicodeDecodeError:
+        # Fall back to latin-1 when the file is not valid UTF-8
+        df = pd.read_csv(csv_file, encoding='latin-1', low_memory=False)
+
+    # Stringify object columns but keep missing values null (plain astype(str) writes "nan")
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            df[col] = df[col].where(df[col].isna(), df[col].astype(str))
+
+    output_file = brick_dir / f"{csv_file.stem}.parquet"
+    df.to_parquet(output_file, index=False)
+    print(f"Converted {csv_file.name} to {output_file.name}")
+
+# Sanity check: every Parquet file in the brick must contain more than 5 rows
+for parquet_file in brick_dir.glob("*.parquet"):
+    df = pd.read_parquet(parquet_file)
+    assert len(df) > 5, f"File {parquet_file.name} has {len(df)} rows; expected more than 5"
diff --git a/stages/03_build.sh b/stages/03_build.sh