Skip to content

Commit

Permalink
Merge pull request #25 from nolanwelch/nolanwelch-sync-thrive-scripts
Browse files Browse the repository at this point in the history
Update thrive-dataset scripts
  • Loading branch information
lillylaplace authored Dec 5, 2024
2 parents da07116 + 85046d6 commit 353b684
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 43 deletions.
2 changes: 0 additions & 2 deletions code/MADE_pipeline.m
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,8 @@
else
[subj, task, sess, desc, ext] = filename_re{1}{:};
if length(desc) == 0
%output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '.csv'];
output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess];
else
%output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '_' desc '.csv'];
output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '_' desc];
desc = ['_' desc];
end
Expand Down
23 changes: 16 additions & 7 deletions data-monitoring/check-id.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
import math
import sys

if __name__ == "__main__":
id = sys.argv[1]
file = sys.argv[2]

class c:
    """ANSI SGR escape sequences for styling terminal output.

    Used to colorize error/status messages printed by this script
    (e.g. ``c.RED + "Error: ..." + c.ENDC``). ``ENDC`` resets all
    attributes back to the terminal default.
    """
    RED = '\033[31m'        # red foreground (errors)
    GREEN = '\033[32m'      # green foreground (success)
    ENDC = '\033[0m'        # reset all styling
    BOLD = '\033[1m'        # bold text
    UNDERLINE = '\033[4m'   # underlined text

def check_id(id, file):
# extract id col
if pd.__version__ >= "1.4.0":
file_df = pd.read_csv(file, on_bad_lines="skip")
Expand All @@ -17,11 +21,16 @@
elif "participant" in file_df:
id_col = file_df["participant"]
else:
sys.exit("Error: cannot find id or participant column in"+ file)
sys.exit(c.RED + "Error: cannot find id or participant column in"+ file + c.ENDC)

# check if first ids match vals listed
if isinstance(id_col[0], float) and math.isnan(id_col[0]):
print("Error: nan value seen in ID for", file, "file")
print(c.RED + "Error: nan value seen in ID for", file, "file" + c.ENDC)
else:
if not int(id_col[0]) == int(id):
print("Error: ID value in", file, "does not match", id)
print(c.RED + "Error: ID value in", file, str(id_col[0]), "does not match", id + c.ENDC)

# Command-line entry point: check-id.py <expected-id> <csv-file>.
# Validates that the id/participant column of the given CSV matches
# the expected ID, printing colored errors on mismatch.
if __name__ == "__main__":
    id = sys.argv[1]   # expected participant/subject ID (kept as string; compared as int downstream)
    file = sys.argv[2]  # path to the CSV file whose ID column is checked
    check_id(id, file)
53 changes: 22 additions & 31 deletions data-monitoring/check_existence_datatype_folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,57 +67,48 @@
sys.exit("Can't find redcap with name " + vals[0] +", exiting.")
rc_df = pd.read_csv(redcap, index_col = vals[2])
rc_var = vals[1]
#subs_w_data = list(rc_df[rc_df[rc_var+"_"+session+"_e1_complete"] != 0].index) #? always be a _complete column?
subs_w_data = list(rc_df[rc_df[rc_var+"_"+session+"_e1_complete"] == 2].index) #? always be a _complete column?
tracker_df.loc[subs_w_data, visit+'_status_'+session+'_e1'] = 1
for sub in subs_w_data:
ignore_no_data = False
no_data_tasks = []
for task, dtype in task_datatype.items():
if dtype == 'combination':
comb_vars = df_dd.loc['arrow-alert_psychopy','provenance'].split(":")[1].split(",")
comb_vars = [x.strip("\" ") for x in comb_vars]
#comb_vars = [x.strip("\" ") for x in df_dd.loc['arrow-alert_psychopy','provenance'].split(":")[1].split(",")
folders = []
for var in comb_vars:
folders.append(df_dd.loc[var,'dataType'])
for folder in folders:
if isdir(join(checked, 'sub-'+str(int(sub)), session, folder)):
for dfile in listdir(join(checked, 'sub-'+str(int(sub)), session, folder)):
if dfile == "no-data.txt":
if task not in no_data_tasks:
no_data_tasks.append(task)
continue
if isdir(join(checked, 'sub-'+str(int(sub)), session, dtype)):
for dfile in listdir(join(checked, 'sub-'+str(int(sub)), session, dtype)):
if dfile == "no-data.txt":
ignore_no_data = True
no_data_tasks.append(task)
break
if ignore_no_data:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
# don't print error if no data
continue
allpresent = True
#corrected = False
checked = join(dataset, 'sourcedata', 'checked')
checked = join(dataset, 'sourcedata', 'checked') #?
missing_tasks = []
ignore_no_data = False
for task in vals[3]:
if not tracker_df.loc[sub, task + '_' + session + '_e1'] == 1:
if not tracker_df.loc[sub, task + '_' + session + '_e1'] == 1 and task not in no_data_tasks:
allpresent = False
missing_tasks.append(task)
elif task in no_data_tasks:
ignore_no_data = True
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
if allpresent and ignore_no_data:
continue # no error
if allpresent:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 1
else:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
print("\033[31mError: Expected tasks " + ", ".join(missing_tasks) + " not seen in subject " + str(sub) + ", session " + session + ".\033[0m")
# check raw
#raw = join(dataset, 'sourcedata', 'raw', session)
#for task in vals[3]:
# if not isdir(join(raw, task)):
# #error
# else:
# for sub in subs_w_data:
# if not isdir(join(raw, task, 'sub-'+str(int(sub)))):
# #error
## check checked
#checked = join(dataset, 'sourcedata', 'checked')
#for task in vals[3]:
# allpresent = True
# for sub in subs_w_data:
# if not isdir(join(checked, 'sub-'+str(int(sub)), session, task)):
# #error
# allpresent = False
#if allpresent:
# tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 1
#else:
# tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
tracker_df.to_csv(tracker)


Expand Down
3 changes: 2 additions & 1 deletion data-monitoring/hallMonitor.sub
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#module load miniconda3-4.5.11-gcc-8.2.0-oqs2mbg
#./hallMonitor.sh
module load singularity-3.8.2
singularity exec -e /home/data/NDClab/tools/containers/python-3.8/python-3.8.simg ./hallMonitor.sh
singularity exec -e /home/data/NDClab/tools/containers/python-3.9/python-3.9.simg ./hallMonitor.sh

source /home/data/NDClab/tools/lab-devOps/scripts/monitor/tools.sh
logfile="data-monitoring-log.md"
Expand All @@ -18,6 +18,7 @@ if [[ $NUMERRORS -gt 0 ]]; then
error_detected="true"
else
error_detected="false"
touch slurm-${SLURM_JOB_ID}_errorlog.out
fi
if [ $error_detected = true ]; then
update_log "error; $NUMERRORS errors seen, check slurm-${SLURM_JOB_ID}_errorlog.out for more info" $logfile
Expand Down
7 changes: 5 additions & 2 deletions data-monitoring/preprocess_wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,12 @@ fi
if [[ -z "$score_only" ]]
then
#mem_needed=$(( $totalsubs * 10 )) # ~10gb / sub
if [[ $totalsubs -lt 4 ]]
then
cpus=$totalsubs
fi
mem_needed=$(( $cpus * 10 )) # ~10gb / sub
#walltime_needed=$(( (totalsubs+cpus-1) / 4 * 10 ))
walltime_needed=$(( (totalsubs+cpus-1) / 4 * 16 ))
walltime_needed=$(( (totalsubs+cpus-1) / 4 * 10 ))
sbatch --mem=${mem_needed}G --time=${walltime_needed}:00:00 --cpus-per-task=$cpus --account=iacc_gbuzzell --partition=highmem1 --qos=highmem1 --export=ALL,sstr=${sstr},nstr=${nstr} preprocess.sub
else
sbatch --mem=1G --time=00:30:00 --export=All,score=${score_only},dummy=${dummy} preprocess.sub
Expand Down
Empty file modified data-monitoring/update-tracker-postMADE.py
100644 → 100755
Empty file.

0 comments on commit 353b684

Please sign in to comment.