Skip to content

Commit

Permalink
cpdat installations weren't working, so I updated the build stage and pushed
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Sep 11, 2024
1 parent fe7ac5a commit f10d8d6
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 58 deletions.
54 changes: 27 additions & 27 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -1,47 +1,47 @@
---
schema: '2.0'
stages:
build:
cmd: stages/03_build.sh
cmd: python stages/03_build.py
deps:
- md5: 75ab6120bac0a1065986912e5e5eb79b.dir
nfiles: 11
path: raw
- path: raw
md5: 75ab6120bac0a1065986912e5e5eb79b.dir
size: 572654300
- md5: ad705d3aff32f182707d7aa88a87fd18
path: stages/03_build.sh
size: 410
- md5: 9fe6f058abb56acd3f330ec3d1e34e03
path: stages/csv2parquet.py
size: 660
nfiles: 11
- path: stages/03_build.py
hash: md5
md5: d24fc4f447d1dee7a672ee735efef1f7
size: 777
outs:
- md5: 92fec7f055de167dfabdea546b4fa407.dir
- path: brick
hash: md5
md5: fbc4fa7b30c3b0fa89954d6d14dfccd4.dir
size: 153212127
nfiles: 10
path: brick
size: 155500050
download:
cmd: stages/01_download.sh
deps:
- md5: 9ad7abc4441bb03db746763323943f66
path: stages/01_download.sh
- path: stages/01_download.sh
md5: 9ad7abc4441bb03db746763323943f66
size: 558
isexec: true
outs:
- md5: 4f1c0c762afa84306d347f4b366b11d2.dir
nfiles: 1
path: download
- path: download
md5: 4f1c0c762afa84306d347f4b366b11d2.dir
size: 93725827
nfiles: 1
unzip:
cmd: stages/02_unzip.sh
deps:
- md5: 4f1c0c762afa84306d347f4b366b11d2.dir
nfiles: 1
path: download
- path: download
md5: 4f1c0c762afa84306d347f4b366b11d2.dir
size: 93725827
- md5: d0e1b99ce0d7915c3becc3fa210348d2
path: stages/02_unzip.sh
nfiles: 1
- path: stages/02_unzip.sh
md5: d0e1b99ce0d7915c3becc3fa210348d2
size: 466
isexec: true
outs:
- md5: 75ab6120bac0a1065986912e5e5eb79b.dir
nfiles: 11
path: raw
- path: raw
md5: 75ab6120bac0a1065986912e5e5eb79b.dir
size: 572654300
nfiles: 11
15 changes: 2 additions & 13 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
# dvc repro
# If you want to force redoing the process use
# dvc repro -f
# Individual stage can be executed using:
# dvc repro <stage>

stages:
download:
cmd: stages/01_download.sh
Expand All @@ -23,10 +13,9 @@ stages:
outs:
- raw
build:
cmd: stages/03_build.sh
cmd: python stages/03_build.py
deps:
- stages/03_build.sh
- stages/csv2parquet.py
- stages/03_build.py
- raw
outs:
- brick
31 changes: 31 additions & 0 deletions stages/03_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import pandas as pd
from pathlib import Path

# Build stage: convert every CSV in ./raw into a parquet file in ./brick,
# then verify that each parquet file in ./brick has a plausible row count.
# Paths are relative, so the script must run from the repository root
# (which is how the `build` stage in dvc.yaml invokes it).
raw_dir = Path("./raw")
brick_dir = Path("./brick")

# Every parquet file in the brick must contain MORE than this many rows.
MIN_ROWS = 5


def read_csv_with_fallback(csv_file):
    """Read ``csv_file`` into a DataFrame, trying UTF-8 first.

    If the file is not valid UTF-8, it is re-read as latin-1.
    """
    try:
        return pd.read_csv(csv_file, encoding='utf-8', low_memory=False)
    except UnicodeDecodeError:
        # If UTF-8 fails, try with 'latin-1' encoding
        return pd.read_csv(csv_file, encoding='latin-1', low_memory=False)


def stringify_object_columns(df):
    """Convert the non-missing values of every object column to ``str``.

    Missing values are left untouched so they are still missing after the
    conversion. A plain ``astype(str)`` would turn them into the literal
    strings "nan" / "None". Non-object columns are not modified.

    Returns the same DataFrame, modified in place, for convenient chaining.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            values = df[col]
            # where() keeps the original entry where the condition is True
            # (the missing ones) and takes the stringified value elsewhere.
            df[col] = values.where(values.isna(), values.astype(str))
    return df


def check_min_rows(file_name, n_rows, min_rows=MIN_ROWS):
    """Raise ``AssertionError`` unless ``n_rows`` is greater than ``min_rows``.

    Raised explicitly rather than via ``assert`` so the check still runs
    under ``python -O``.
    """
    if n_rows <= min_rows:
        raise AssertionError(
            f"File {file_name} has {n_rows} rows; expected more than {min_rows}"
        )


def main():
    """Convert all raw CSV files to parquet and validate the results."""
    brick_dir.mkdir(exist_ok=True)

    # sorted() makes the processing order (and the log output) deterministic.
    for csv_file in sorted(raw_dir.glob("*.csv")):
        df = stringify_object_columns(read_csv_with_fallback(csv_file))

        output_file = brick_dir / f"{csv_file.stem}.parquet"
        df.to_parquet(output_file, index=False)
        print(f"Converted {csv_file.name} to {output_file.name}")

    # Sanity check: read back every parquet file in the brick directory and
    # make sure each one has more than MIN_ROWS rows.
    for parquet_file in sorted(brick_dir.glob("*.parquet")):
        check_min_rows(parquet_file.name, len(pd.read_parquet(parquet_file)))


if __name__ == "__main__":
    main()
18 changes: 0 additions & 18 deletions stages/03_build.sh

This file was deleted.

0 comments on commit f10d8d6

Please sign in to comment.