#!/usr/bin/env python
"""Remove duplicate BIDS files (``*__dup-*``) and scrub every reference to them.

Usage:
    python remove_bids_files.py <glob_pattern> <bids_dir>

For each duplicate file matched by ``<glob_pattern>`` under ``<bids_dir>``:
  1) drop the filename from the ``IntendedFor`` list of every fieldmap
     sidecar in the sibling ``fmap/`` directory (re-saving the sidecar),
  2) delete the duplicate file itself,
  3) remove the matching row from the session-level ``*_scans.tsv``.
Flag messages are collected and written to ``<repo_root>/log/dup_flaglist.txt``.

Example duplicate path:
  .../sub-0085/ses-01/func/sub-0085_ses-01_task-alignvideo_acq-mb8_run-01_bold__dup-01.nii.gz
"""
import glob
import json
import os
import sys
from os.path import join
from pathlib import Path

import pandas as pd

# NOTE(review): load_json / save_json are called below but are not defined in
# this visible portion of the file — presumably defined/imported in the lines
# above this hunk. TODO confirm.


def _scrub_intended_for(sidecar, dup_fname):
    """Drop every ``IntendedFor`` entry that mentions *dup_fname*.

    Parameters
    ----------
    sidecar : dict
        Parsed fieldmap JSON sidecar; mutated in place.
    dup_fname : str
        Basename of the duplicate file to remove from the list.

    Returns
    -------
    bool
        True when the sidecar was modified (caller should re-save it).
    """
    intended = sidecar.get('IntendedFor', [])
    kept = [entry for entry in intended if dup_fname not in entry]
    if len(kept) != len(intended):
        sidecar['IntendedFor'] = kept
        return True
    return False


def main():
    # CLI: pattern is relative to the BIDS root. The BIDS repo lives in a
    # totally different directory from this preprocessing repo, hence both args.
    fpattern = sys.argv[1]
    bids_dir = sys.argv[2]

    # Log directory lives two levels above the current working directory
    # (script is expected to be run from inside the preprocessing repo).
    repo_root = Path(os.getcwd()).parents[1]
    save_dir = join(repo_root, 'log')

    # 1. collect duplicate files and the running flag list ____________________
    flaglist = [
        "This file keeps track of dup file and fieldmap mismatches.\n"
        "Two erroneous cases:\n"
        " 1) IntendedFor field does not exist from the get go [IntendedFor X] \n"
        " 2) IntendedFor field exists. However, Duplicate file does not exist "
        "within this key [IntendedFor O; Dup X]"
    ]

    for dup_fpath in glob.glob(join(bids_dir, fpattern)):
        dup_fname = os.path.basename(dup_fpath)
        # .../sub-*/ses-*  (parent of the func/ dir that holds the duplicate)
        ses_dir = Path(os.path.dirname(dup_fpath)).parents[0]
        fmap_dir = join(ses_dir, 'fmap')

        # 2. scrub IntendedFor in the fieldmap sidecars _______________________
        # BUG FIX: the original iterated enumerate(join(fmap_dir, '*')), i.e.
        # over the *characters* of a path string — glob the sidecars instead.
        # TODO: make it flexible to handle DWI and T1 .jsons.
        # Currently we only feed in epi images and update epi.jsons:
        #   - DWI:  acq-96dirX6b0Xmb
        #   - BOLD: acq-mb8
        for fmap_fname in sorted(glob.glob(join(fmap_dir, '*_epi.json'))):
            sidecar = load_json(fmap_fname)
            if 'IntendedFor' in sidecar:
                if _scrub_intended_for(sidecar, dup_fname):
                    print(f"removed {dup_fname} from list")
                    save_json(fmap_fname, sidecar)

        # 3. remove the duplicate file itself _________________________________
        # TODO: remove all 4 types of files — see if you can remove and
        # datalad-save within this script:
        #   1) bold__dup-01.json   2) bold__dup-01.nii.gz
        #   3) sbref__dup-01.json  4) sbref__dup-01.nii.gz
        os.remove(dup_fpath)

        # 4. drop the duplicate's row from the session-level scans.tsv ________
        # BUG FIX: the original called .parents on the str returned by join()
        # and passed two args to glob.glob; also preserve the tab separator
        # and drop the index when writing back (BIDS .tsv files are
        # tab-separated and index-free). regex=False: dup_fname contains '.'.
        tsv_matches = glob.glob(join(ses_dir, '*_scans.tsv'))
        if tsv_matches:
            tsv_fname = tsv_matches[0]
            df = pd.read_csv(tsv_fname, sep='\t')
            df = df[~df['filename'].str.contains(dup_fname, regex=False)]
            df.to_csv(tsv_fname, sep='\t', index=False)
        else:
            flaglist.append(f"no *_scans.tsv found next to {dup_fname}")

    # 5. save filenames with missing IntendedFor fields or missing dup names __
    os.makedirs(save_dir, exist_ok=True)  # log dir may not exist yet
    with open(join(save_dir, 'dup_flaglist.txt'), 'w') as f:
        f.write(json.dumps(flaglist))


if __name__ == '__main__':
    main()