From 55a933b756afccfd087bc03778fd55b53268b585 Mon Sep 17 00:00:00 2001
From: jungheejung <heejung.jung@colorado.edu>
Date: Thu, 8 Aug 2024 18:14:58 -0400
Subject: [PATCH] DEV: identify files to delete

---
 .../datalad/remove_scannotes_nodata.py        | 383 +++++++++++-------
 1 file changed, 242 insertions(+), 141 deletions(-)

diff --git a/spacetop_prep/datalad/remove_scannotes_nodata.py b/spacetop_prep/datalad/remove_scannotes_nodata.py
index b8a8bfd..9f81ca5 100644
--- a/spacetop_prep/datalad/remove_scannotes_nodata.py
+++ b/spacetop_prep/datalad/remove_scannotes_nodata.py
@@ -1,23 +1,14 @@
-################################################################################
-
-"""
-This script deletes files in the datalad directory based on scan notes.
-
-If a "no-data" entry is found in the scan notes, the corresponding directory in
-datalad is queried. The script checks if the TR count is less than expected or
-if the _bold.nii.gz file exists. It then cross-compares with empty events.tsv 
-files to identify failed fMRI acquisitions. If conditions are met, related files 
-are deleted to prevent the use of incomplete data.
-"""
-
 import re
 import pandas as pd
 from pathlib import Path
 import json
 import subprocess
 
-datalad_dir = '/Users/h/Documents/projects_local/1076_spacetop'
+# Constants
+DATALAD_DIR = Path('/Users/h/Documents/projects_local/1076_spacetop')
+SCANNOTES_FNAME = Path('/Users/h/Documents/projects_local/cue_expectancy/resources/spacetop_scannotes/ST_Participants - scan_info_08-10-2022.csv')
 
+# Task mapping dictionary
 task_mapping = {
     'T1': 'task-t1',
     'DWI': 'task-dwi',
@@ -30,6 +21,15 @@
 }
 
 def extract_task_run(column_name):
+    """
+    Extracts task and run information from a given column name.
+    
+    Parameters:
+        column_name (str): The column name containing task and run information.
+    
+    Returns:
+        dict: A dictionary containing the task and run extracted from the column name.
+    """
     parts = column_name.split('\n')
     if len(parts) > 2:
         task_name = parts[1].strip()
@@ -38,153 +38,254 @@ def extract_task_run(column_name):
         return {"task": task, "run": run_name}
     return {"task": "unknown-task", "run": "unknown-run"}
 
-# load st_participants
-
-scannotes_fname = '/Users/h/Documents/projects_local/cue_expectancy/resources/spacetop_scannotes/ST_Participants - scan_info_08-10-2022.csv'
-df = pd.read_csv(scannotes_fname)# load st_participants
-results = []
-
-# find the cell that has "no_data". From that, extract bids metadata (sub, ses, task, run)
-for row in range(df.shape[0]):
-    for col in range(df.shape[1]):
-        if df.iat[row, col] == "no_data":
-            # Store the coordinates
-            bids_sub = df.at[row, "BIDS_id"]
-            col_value = df.columns[col]
-            bids_ses = df.at[row, "Session #"]
-            #first_bids_sub = df.iloc[0, col]
-
-            result = extract_task_run(col_value)
-            # print(result)
-            bids_task = result['task']
-            bids_run = result['run']
-
-            results.append({
-                "BIDS_sub": bids_sub,
-                "Column": col_value,
-                "BIDS_task": bids_task,
-                "BIDS_run": bids_run,
-                "BIDS_ses": bids_ses
-                #"First Row Value": first_bids_sub
-            })
-
-sorted_results = sorted(results, key=lambda x: x["BIDS_sub"])
-
-TRdict = {
-    "task-narratives_acq-mb8_run-01": 967,
-    "task-narratives_acq-mb8_run-02": 1098,
-    "task-narratives_acq-mb8_run-03": 1298,
-    "task-narratives_acq-mb8_run-04": 1156,
-    "task-social": 872,
-    "task-fractional_acq-mb8_run-01": 1323,
-    "task-fractional_acq-mb8_run-02": 1322,
-    "task-shortvideo": 1616,
-    "task-faces": 914,
-    "ses-01_task-alignvideo_acq-mb8_run-01": 1073,
-    "ses-01_task-alignvideo_acq-mb8_run-02": 1376,
-    "ses-01_task-alignvideo_acq-mb8_run-03": 1016,
-    "ses-01_task-alignvideo_acq-mb8_run-04": 1209,
-    "ses-02_task-alignvideo_acq-mb8_run-01": 839,
-    "ses-02_task-alignvideo_acq-mb8_run-02": 1859,
-    "ses-02_task-alignvideo_acq-mb8_run-03": 1158,
-    "ses-02_task-alignvideo_acq-mb8_run-04": 914,
-    "ses-03_task-alignvideo_acq-mb8_run-01": 1157,
-    "ses-03_task-alignvideo_acq-mb8_run-02": 1335,
-    "ses-03_task-alignvideo_acq-mb8_run-03": 1065,
-    "ses-04_task-alignvideo_acq-mb8_run-01": 1268,
-    "ses-04_task-alignvideo_acq-mb8_run-02": 926
-}
-
 def TRmapper(result):
+    """
+    Look up the expected TR count for a flagged run.
+
+    Parameters:
+        result (dict): Needs the 'BIDS_ses', 'BIDS_task' and 'BIDS_run' entries.
+
+    Returns:
+        int: Value of the first TRdict key contained in the run's BIDS string.
+
+    Raises:
+        KeyError: If no TRdict key is contained in the BIDS string.
+    """
+    TRdict = {
+        "task-narratives_acq-mb8_run-01": 967,
+        "task-narratives_acq-mb8_run-02": 1098,
+        "task-narratives_acq-mb8_run-03": 1298,
+        "task-narratives_acq-mb8_run-04": 1156,
+        "task-social": 872,
+        "task-fractional_acq-mb8_run-01": 1323,
+        "task-fractional_acq-mb8_run-02": 1322,
+        "task-shortvideo": 1616,
+        "task-faces": 914,
+        "ses-01_task-alignvideo_acq-mb8_run-01": 1073,
+        "ses-01_task-alignvideo_acq-mb8_run-02": 1376,
+        "ses-01_task-alignvideo_acq-mb8_run-03": 1016,
+        "ses-01_task-alignvideo_acq-mb8_run-04": 1209,
+        "ses-02_task-alignvideo_acq-mb8_run-01": 839,
+        "ses-02_task-alignvideo_acq-mb8_run-02": 1859,
+        "ses-02_task-alignvideo_acq-mb8_run-03": 1158,
+        "ses-02_task-alignvideo_acq-mb8_run-04": 914,
+        "ses-03_task-alignvideo_acq-mb8_run-01": 1157,
+        "ses-03_task-alignvideo_acq-mb8_run-02": 1335,
+        "ses-03_task-alignvideo_acq-mb8_run-03": 1065,
+        "ses-04_task-alignvideo_acq-mb8_run-01": 1268,
+        "ses-04_task-alignvideo_acq-mb8_run-02": 926
+    }
     bids_string = f"{result['BIDS_ses']}_{result['BIDS_task']}_acq-mb8_{result['BIDS_run']}"
-    TR = TRdict[bids_string]
-    return TR
-
+    expected_tr = next((count for key, count in TRdict.items() if key in bids_string), None)
+    if expected_tr is None:
+        raise KeyError(f"{bids_string} not found in TRdict")
+    return expected_tr
 
+def load_scan_notes(filename):
+    """
+    Read the scan notes spreadsheet (CSV) into a pandas DataFrame.
+
+    Parameters:
+        filename (str or Path): Location of the scan notes CSV file.
+    Returns:
+        DataFrame: The table as parsed by pandas.read_csv with its defaults.
+    """
+    scan_notes = pd.read_csv(filename)
+    return scan_notes
 
-# Run the git grep command to find empty events.tsv files. 
-# This is potentially indicative of a failed MR run 
-process = subprocess.Popen(['git', 'grep', '-l', 'TODO'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-stdout, stderr = process.communicate()
+def find_no_data_cells(df):
+    """
+    Collect BIDS metadata for every scan-notes cell flagged as unusable.
+
+    A cell is flagged when its value is exactly "no_data",
+    "complete_dontuse", or "repeat_dontuse".
+
+    Parameters:
+        df (DataFrame): Scan notes with "BIDS_id", "Session #" and
+            "Scan comments?" columns. The same integer is used as row
+            position (iat) and row label (at).
+
+    Returns:
+        list: One dict per flagged cell, sorted by "BIDS_sub".
+    """
+    flagged_values = ("no_data", "complete_dontuse", "repeat_dontuse")
+    flagged = []
+    for row in range(df.shape[0]):
+        for col, column_name in enumerate(df.columns):
+            if df.iat[row, col] not in flagged_values:
+                continue
+            # The column header is what identifies the task and run.
+            task_run = extract_task_run(column_name)
+            flagged.append({
+                "BIDS_sub": df.at[row, "BIDS_id"],
+                "Column": column_name,
+                "BIDS_task": task_run['task'],
+                "BIDS_run": task_run['run'],
+                "BIDS_ses": df.at[row, "Session #"],
+                "Scan_comments": df.at[row, "Scan comments?"],
+            })
+    flagged.sort(key=lambda entry: entry["BIDS_sub"])
+    return flagged
 
-if process.returncode != 0:
-    print(f"Error: {stderr.decode()}")
-else:
-    # Decode the output and filter for files ending with _event.tsv
-    files = stdout.decode().splitlines()
-    todo_events_list = [file for file in files if file.endswith('_event.tsv')]
+def run_git_grep():
+    """
+    Lists files containing 'TODO' by running `git grep -l TODO` in the current directory.
+
+    Returns:
+        list: File paths as printed by git; empty when nothing matches.
+
+    Raises:
+        RuntimeError: If git grep fails (anything but a clean "no match" exit).
+    """
+    process = subprocess.run(['git', 'grep', '-l', 'TODO'], capture_output=True)
+    # Like grep, git grep exits with 1 when nothing matched; that is a result, not a failure.
+    if process.returncode > 1 or (process.returncode == 1 and process.stderr.strip()):
+        raise RuntimeError(f"Error: {process.stderr.decode()}")
+    return process.stdout.decode().splitlines()
 
-    # Print the filtered file names
-    for file in todo_events_list:
-        print(file)
+def filter_todo_events(files):
+    """
+    Keep only the paths whose file name ends with '_events.tsv'.
+
+    Parameters:
+        files (list): File paths to screen.
+    Returns:
+        list: The matching entries, unchanged and in their original order.
+    """
+    suffix = '_events.tsv'
+    return [entry for entry in files if Path(entry).name.endswith(suffix)]
 
-
-# Based on no_data basenames
-for result in sorted_results:
+def check_files_to_delete(result, todo_events_list):
+    """
+    Classify one flagged scan as deletable or needing manual investigation.
+
+    Parameters:
+        result (dict): BIDS metadata for the scan (BIDS_sub/ses/task/run, Scan_comments).
+        todo_events_list (list): Paths (str) of events.tsv files containing 'TODO'.
+
+    Returns:
+        tuple: (action, bids_string, scan_comments); action is 'delete',
+        'investigate_intactevents', 'investigate_intactTR', 'investigate_nojson',
+        or 'other' (no bold file matched, or its TR count is missing or zero).
+    """
     sub = result['BIDS_sub']
     ses = result['BIDS_ses']
     task = result['BIDS_task']
     run = result['BIDS_run']
-    print(f"BIDS_sub: {result['BIDS_sub']},\n 
-          Column: {result['Column']},\n, 
-          BIDS_task: {result['BIDS_task']},
-          BIDS_run:{result['BIDS_run']}, 
-          BIDS_ses: {result['BIDS_ses']} \n\n")
-
-    # Glob every file that matches the sorted_results basename
-    bids_string = f"{sub}_{ses}_{task}_acq-mb8_{run}*"
-    bids_subdir = Path(datalad_dir, sub, ses, 'func')
-    matching_files = list(bids_subdir.glob(bids_string))
-
-    #### if _bold.nii.gz does exist
-    # -> check json
-    # if acquisition is less than XX TR,
-    # be ready to delete list of globed files first check passed
-
-    #### check the TO and events.tsv lists that need resolving
-    # if they intersect, DELETE
-    # if they don't keep a tally of the non intersecting file lists. 
-    # Check if _bold.nii.gz exists and perform further checks
-    # For every no_data file
+    scan_comments = result['Scan_comments']
+    bids_string = f"{sub}_{ses}_{task}_acq-mb8_{run}"
+    bids_subdir = DATALAD_DIR / sub / ses / 'func'
+    matching_files = sorted(bids_subdir.glob(bids_string + "*"))
+
+    print(f"Matching files for {bids_string}: {matching_files}")
+
+    found_relevant_file = False
+
     for file_path in matching_files:
-        # If the no_data_bold.nii.gz does exist,
-        if file_path.suffix == '.nii.gz' and '_bold' in file_path.stem:
-            # Check the corresponding JSON file to check the number of TRs
-            json_file_path = file_path.with_suffix('.json')
+        print(f"Processing file: {file_path}")
+
+        if file_path.suffix == '.gz' and '_bold' in file_path.stem:
+            found_relevant_file = True
+            json_file_path = file_path.with_name(file_path.stem.split('.nii')[0] + '.json')
+            print(f"Expected JSON file path: {json_file_path}")
+
             if json_file_path.exists():
+                print(f"JSON file exists: {json_file_path}")
                 with open(json_file_path, 'r') as json_file:
                     json_data = json.load(json_file)
-                    TRlength = json_data.get('dcmmeta_shape', None)[-1]
-                    # if TRlength is less than expected, be ready to delete
-                    expectedTR = TRmapper(results)
-                    if TRlength and TRlength < expectedTR: 
-                        print(f"Acquisition time is less than {expectedTR}. Ready to delete: {file_path}")
+                    TRlength = (json_data.get('dcmmeta_shape') or [None])[-1]  # None, not a TypeError, when the shape is absent or empty
+                    expectedTR = TRmapper(result)
+                    print(f"TR length: {TRlength}, Expected TR: {expectedTR}")
 
-                        # Check TO and events.tsv lists
-                        todo_intersect = set(todo_events_list) & set(matching_files)
-                        # events_intersect = set(events_tsv_list) & set(matching_files)
+                    if TRlength and TRlength < expectedTR:
+                        matching_filenames = tuple(file.name for file in matching_files)  # str.endswith accepts a tuple of suffixes
+                        todo_intersect = [file for file in todo_events_list if file.endswith(matching_filenames)]
+                        print(f"todo_intersect: {todo_intersect}")
 
                         if todo_intersect:
-                            print(f"Deleting files: {matching_files}")
-                            for file_to_delete in matching_files:
-                                print("DELETE")
-                                # file_to_delete.unlink()  # This deletes the file
+                            return "delete", bids_string, scan_comments
                         else:
-                            print(f"No intersection found. Keeping files: {matching_files}")
+                            return "investigate_intactevents", bids_string, scan_comments
                     else:
-                        print(f"Acquisition time is acceptable for file: {file_path}")
+                        return ("investigate_intactTR" if TRlength else "other"), bids_string, scan_comments  # a missing/zero count is not "intact"
             else:
-                print(f"JSON file does not exist for: {file_path}")
-                print(f"delete all others")
-                for file_to_delete in matching_files:
-                    print("DELETE")
-                    # file_to_delete.unlink()  # This deletes the file
-
-
-# ### SCANNOTES ###
-# check json
-# if aquisition is less than XX TR, 
-# delete all files altogether (all files with name)
-# EVENTS.TSV
-# cross compare this delete list with the grep TODO
-# if scannote no_data and events.tsv TODO coincides, finaly delete
\ No newline at end of file
+                print(f"Missing JSON file for: {file_path}")
+                return "investigate_nojson", bids_string, scan_comments
+
+    if not found_relevant_file:
+        print(f"No relevant files found for: {bids_string}")
+
+    return "other", bids_string, scan_comments
+
+def main():
+    """
+    Classify every flagged scan and write the summary to delete_bold.tsv.
+
+    Loads the scan notes, collects the flagged cells, and lists the events.tsv
+    files that still contain 'TODO'. Each flagged scan is then classified with
+    check_files_to_delete; one table per category and the merged table are
+    printed, and the merged table is saved as a tab-separated file under
+    DATALAD_DIR. If git grep fails, the error is printed and nothing is written.
+    """
+    df = load_scan_notes(SCANNOTES_FNAME)
+    sorted_results = find_no_data_cells(df)
+    try:
+        files = run_git_grep()
+        todo_events_list = filter_todo_events(files)
+    except RuntimeError as e:
+        print(e)
+        return
+
+    # action -> (heading printed above its table, Deprecate_category label),
+    # listed in the order the tables are printed and merged.
+    categories = {
+        "delete": (
+            "Files to delete:",
+            'delete',
+        ),
+        "investigate_intactevents": (
+            "\nFiles to investigate (intact events.tsv):",
+            'investigate: non empty events file',
+        ),
+        "investigate_intactTR": (
+            "\nFiles to investigate (intact TR):",
+            'investigate: TR length correct',
+        ),
+        "investigate_nojson": (
+            "\nFiles to investigate (missing JSON):",
+            'investigate: no json',
+        ),
+        "other": (
+            "\nOther files:",
+            'investigate: other',
+        ),
+    }
+    rows_by_action = {action: [] for action in categories}
+
+    for result in sorted_results:
+        action, bids_string, scan_comments = check_files_to_delete(result, todo_events_list)
+        # An action outside the known categories is skipped rather than guessed at.
+        if action in rows_by_action:
+            rows_by_action[action].append({"BIDS_string": bids_string, "Scan_comments": scan_comments})
+
+    # Explicit columns keep the schema intact when a category has no rows;
+    # without them an all-empty run fails on the column selection below.
+    columns = ['BIDS_string', 'Scan_comments']
+    frames = []
+    for action, (heading, category_label) in categories.items():
+        frame = pd.DataFrame(rows_by_action[action], columns=columns)
+        print(heading)
+        print(frame)
+        frame['Deprecate_category'] = category_label
+        frames.append(frame)
+
+    # Combine all categories and put the category label first
+    merged_df = pd.concat(frames, ignore_index=True)
+    merged_df = merged_df[['Deprecate_category', 'BIDS_string', 'Scan_comments']]
+    print(merged_df)
+
+    output_path = Path(DATALAD_DIR,'code','spacetop-prep','spacetop_prep','datalad','delete_bold.tsv')
+    merged_df.to_csv(output_path, index=False, sep='\t')
+
+if __name__ == "__main__":
+    main()