diff --git a/README.md b/README.md index 1aafff4..af00476 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ pip-compile Initializing a dbt project: ```bash -dbt init healthcare_data +dbt init data_processing ``` ## Building the datasets @@ -96,7 +96,7 @@ dbt init healthcare_data 1. Generate the synthetic healthcare data schemas using the data dictionary: ```bash -cd healthcare_data +cd data_processing python scripts/generate_syh_dr_data_models.py ~/data/syh_dr https://www.ahrq.gov/sites/default/files/wysiwyg/data/SyH-DR-Codebook.pdf ``` @@ -109,7 +109,7 @@ dbt run --threads 8 3. Verify that you can query the data on the command line: ```bash -duckdb -c "SELECT * FROM '/Users/me/data/syh_dr/syhdr_commercial_inpatient_2016.parquet'" +duckdb -c "SELECT * FROM '~/data/syh_dr/syhdr_commercial_inpatient_2016.parquet'" ``` This should show the data: diff --git a/data_processing/scripts/generate_syh_dr_data_models.py b/data_processing/scripts/generate_syh_dr_data_models.py index eacef23..1c42d51 100644 --- a/data_processing/scripts/generate_syh_dr_data_models.py +++ b/data_processing/scripts/generate_syh_dr_data_models.py @@ -137,7 +137,9 @@ def process_csv_files(pdf_url, csv_folder): print(csv_str) username = os.environ.get("USER") path_without_user = "~/" + csv_path.split(username + '/')[1] - select_statement = f"SELECT\n {',\n '.join(column_list)}\nFROM read_csv('{path_without_user}', header=True, null_padding=true{csv_str if csv_types else ''})" + select_statement = f"""SELECT + {',\n '.join(column_list)} +FROM read_csv('{path_without_user}', header=True, null_padding=true{csv_str if csv_types else ''})""" f.write(select_statement) print(f"Generated SQL model: {sql_file}")