Skip to content

Commit

Permalink
include environment arg; use json keys; update scans.tsv
Browse files Browse the repository at this point in the history
  • Loading branch information
jungheejung committed Mar 25, 2022
1 parent e53be56 commit f035a3c
Showing 1 changed file with 34 additions and 49 deletions.
83 changes: 34 additions & 49 deletions bids/remove_bids_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,74 +12,59 @@
# Remove duplicate BIDS files ("__dup-*") and scrub every reference to them:
#   1) drop the dup filename from fieldmap sidecars' "IntendedFor" lists,
#   2) delete the dup file itself,
#   3) drop the dup row from the session's *_scans.tsv,
#   4) log fieldmap/dup mismatches to <main_dir>/log/dup_flaglist.txt.
#
# Usage: python remove_bids_files.py <glob-pattern-under-bids_dir> <bids_dir>
#
# example file dir
# /dartfs-hpc/rc/lab/C/CANlab/labdata/data/spacetop/dartmouth/sub-0085/ses-01/func/sub-0085_ses-01_task-alignvideo_acq-mb8_run-01_bold__dup-01.nii.gz
# {bids_dir}/{sub}/{ses}/func
import os, time, sys, glob
from os.path import join
import json
from pathlib import Path
import pandas as pd


def load_json(fpath):
    """Read a JSON sidecar and return its contents as a dict."""
    with open(fpath) as fp:
        return json.load(fp)


def save_json(fpath, contents):
    """Overwrite the JSON sidecar at *fpath* with *contents*."""
    with open(fpath, 'w') as fp:
        json.dump(contents, fp, indent=4)


fpattern = sys.argv[1]  # glob pattern relative to bids_dir, e.g. 'sub-*/ses-*/func/*_bold__dup*.nii.gz'
bids_dir = sys.argv[2]  # reason for this: script lives in the preprocessing repo; the BIDS repo is a totally different directory, not parent or child.

# log directory sits next to the repo: two levels above the current working directory
current_dir = os.getcwd()
main_dir = Path(current_dir).parents[1]
save_dir = join(main_dir, 'log')

# 1. get list of dup names _____________________________________________________
# TODO: remove all 4 types of files - see if we can remove and datalad save here:
# 1) bold__dup-01.json  2) bold__dup-01.nii.gz
# 3) sbref__dup-01.json 4) sbref__dup-01.nii.gz
dup_glob = glob.glob(join(bids_dir, fpattern))
flaglist = []
flaglist.append(
    "This file keeps track of dup file and fieldmap mismatches.\nTwo erroneous cases:\n"
    " 1) IntendedFor field does not exist from the get go [IntendedFor X] \n"
    " 2) IntendedFor field exists. However, Duplicate file does not exist within this key [IntendedFor O; Dup X]")

for ind, dup_fpath in enumerate(dup_glob):
    dup_fname = os.path.basename(dup_fpath)
    # session dir is the parent of the func dir holding the dup; fmap is its sibling
    ses_dir = Path(os.path.dirname(dup_fpath)).parents[0]
    fmap_dir = os.path.join(ses_dir, 'fmap')
    # TODO: make it flexible to handle DWI and T1 .jsons.
    # currently we only feed in epi images and update epi .jsons.
    # dictionary:  - DWI: acq-96dirX6b0Xmb   - BOLD: acq-mb8

    # 2. open fieldmap .json sidecars with corresponding dup files _____________
    # NOTE: the string must be globbed — iterating it directly would walk characters.
    for fmap_ind, fmap_fname in enumerate(glob.glob(join(fmap_dir, '*.json'))):
        sidecar = load_json(fmap_fname)
        # 2-1. check if "IntendedFor" field exists within json
        if 'IntendedFor' in sidecar:
            copy_list = sidecar['IntendedFor']
            print(copy_list)
            # 2-2. if dup_fname appears in "IntendedFor", pop it and save
            dup_index = [i for i, s in enumerate(copy_list) if dup_fname in s]
            if dup_index:
                copy_list.pop(dup_index[0])
                sidecar['IntendedFor'] = copy_list
                print(f"removed {dup_fname} from list")
                save_json(fmap_fname, sidecar)
            else:
                flag_msg1 = f"Intended For field exists - dup filename does not exist : {dup_fname}"
                flaglist.append(flag_msg1)
        else:
            flag_msg2 = f"IntendedFor field does not exist - {fmap_fname}"
            print(flag_msg2)
            flaglist.append(flag_msg2)

    # 3. remove the duplicate file _____________________________________________
    os.remove(dup_fpath)

    # 4. update scans.tsv by dropping the dup_fname row ________________________
    tsv_fname = glob.glob(join(ses_dir, '*_scans.tsv'))[0]
    df = pd.read_csv(tsv_fname, sep='\t')
    # regex=False: dup_fname contains '.' which would otherwise match any char
    drop_df = df[~df['filename'].str.contains(dup_fname, regex=False)]
    # keep tab separator and no index column so the file stays valid BIDS TSV
    drop_df.to_csv(tsv_fname, sep='\t', index=False)

# 5. save filenames with missing IntendedFor fields or missing dup filenames ___
os.makedirs(save_dir, exist_ok=True)
txt_filename = os.path.join(save_dir, 'dup_flaglist.txt')
with open(txt_filename, 'w') as f:
    f.write(json.dumps(flaglist))

0 comments on commit f035a3c

Please sign in to comment.