diff --git a/stages/01_download.sh b/stages/01_download.sh deleted file mode 100755 index e901153..0000000 --- a/stages/01_download.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Script to download files - -# Get local path -localpath=$(pwd) -echo "Local path: $localpath" - -# Create the list directory to save list of remote files and directories -listpath="$localpath/list" -echo "List path: $listpath" -mkdir -p $listpath -cd $listpath; - -# Define the FTP base address -export ftpbase="" - -# Retrieve the list of files to download from FTP base address -wget --no-remove-listing $ftpbase -cat index.html | grep -Po '(?<=href=")[^"]*' | sort | cut -d "/" -f 10 > files.txt -rm .listing -rm index.html - -# Create the download directory -export downloadpath="$localpath/download" -echo "Download path: $downloadpath" -mkdir -p "$downloadpath" -cd $downloadpath; - -# Download files in parallel -cat $listpath/files.txt | xargs -P14 -n1 bash -c ' - echo $0 - wget -nH -q -nc -P $downloadpath $ftpbase$0 -' - -echo "Download done." diff --git a/stages/02_unzip.sh b/stages/02_unzip.sh deleted file mode 100755 index 286198f..0000000 --- a/stages/02_unzip.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Script to unzip files - -# Get local path -localpath=$(pwd) -echo "Local path: $localpath" - -# Set download path -export downloadpath="$localpath/download" -echo "Download path: $downloadpath" - -# Set list path -listpath="$localpath/list" -echo "List path: $listpath" - -# Create raw path -export rawpath="$localpath/raw" -mkdir -p $rawpath -echo "Raw path: $rawpath" - -# Unzip files in parallel -cat $listpath/files.txt | tail -n +2 | xargs -P14 -n1 bash -c ' - filename="${0%.*}" - echo $downloadpath/$0 - echo $rawpath/$filename - unzip $downloadpath/$0 -d $rawpath/$filename -' diff --git a/stages/03_build.sh b/stages/03_build.sh deleted file mode 100755 index 60b2ab7..0000000 --- a/stages/03_build.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Script to process unzipped files and build parquet files - -# Get local path -localpath=$(pwd) -echo "Local path: $localpath" - -# Set list path -listpath="$localpath/list" -mkdir -p $listpath -echo "List path: $listpath" - -# Set raw path -export rawpath="$localpath/raw" -echo "Raw path: $rawpath" - -# Create brick directory -export brickpath="$localpath/brick" -mkdir -p $brickpath -echo "Brick path: $brickpath" - -# Process raw files and create parquet files in parallel -# calling a Python function with arguments input and output filenames -cat $listpath/files.txt | tail -n +4 | xargs -P14 -n1 bash -c ' - filename="${0%.*}" - echo $rawpath/$filename/$filename.txt - echo $brickpath/$filename.parquet - python stages/csv2parquet.py $rawpath/$filename.txt $brickpath/$filename.parquet -' diff --git a/stages/csv2parquet.R b/stages/csv2parquet.R deleted file mode 100644 index 6576d98..0000000 --- a/stages/csv2parquet.R +++ /dev/null @@ -1,2 +0,0 @@ -# edit this stage to create new resources in the data directory -mtcars |> arrow::write_parquet("brick/mtcars.parquet") diff --git a/stages/csv2parquet.py b/stages/csv2parquet.py deleted file mode 100644 index f821fa8..0000000 --- a/stages/csv2parquet.py +++ /dev/null @@ -1,11 +0,0 @@ -import pandas as pd -import sys -import pyarrow as pyarrow -import fastparquet as fastparquet - -InFileName = sys.argv[1] -OutFileName = sys.argv[2] - -print(f"csv2parquet: Converting file {InFileName}") -DF = pd.read_csv(InFileName, sep=',') -DF.to_parquet(OutFileName)