Skip to content

Commit

Permalink
Merge pull request #25 from nolanwelch/nolanwelch-sync-thrive-scripts
Browse files Browse the repository at this point in the history
Update thrive-dataset scripts
  • Loading branch information
lillylaplace authored Dec 5, 2024
2 parents da07116 + 85046d6 commit 353b684
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 43 deletions.
2 changes: 0 additions & 2 deletions code/MADE_pipeline.m
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,8 @@
else
[subj, task, sess, desc, ext] = filename_re{1}{:};
if length(desc) == 0
%output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '.csv'];
output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess];
else
%output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '_' desc '.csv'];
output_report_path = [output_location filesep 'MADE_preprocessing_report_' task '_' sess '_' desc];
desc = ['_' desc];
end
Expand Down
23 changes: 16 additions & 7 deletions data-monitoring/check-id.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
import math
import sys

if __name__ == "__main__":
id = sys.argv[1]
file = sys.argv[2]

class c:
    """ANSI SGR escape sequences for styling terminal output.

    Used to colorize error/status messages printed by this script
    (e.g. ``c.RED + "Error: ..." + c.ENDC``). ``ENDC`` resets all
    attributes back to the terminal default.
    """
    RED = '\033[31m'        # red foreground (errors)
    GREEN = '\033[32m'      # green foreground (success)
    ENDC = '\033[0m'        # reset all styling
    BOLD = '\033[1m'        # bold text
    UNDERLINE = '\033[4m'   # underlined text

def check_id(id, file):
# extract id col
if pd.__version__ >= "1.4.0":
file_df = pd.read_csv(file, on_bad_lines="skip")
Expand All @@ -17,11 +21,16 @@
elif "participant" in file_df:
id_col = file_df["participant"]
else:
sys.exit("Error: cannot find id or participant column in"+ file)
sys.exit(c.RED + "Error: cannot find id or participant column in"+ file + c.ENDC)

# check if first ids match vals listed
if isinstance(id_col[0], float) and math.isnan(id_col[0]):
print("Error: nan value seen in ID for", file, "file")
print(c.RED + "Error: nan value seen in ID for", file, "file" + c.ENDC)
else:
if not int(id_col[0]) == int(id):
print("Error: ID value in", file, "does not match", id)
print(c.RED + "Error: ID value in", file, str(id_col[0]), "does not match", id + c.ENDC)

# Command-line entry point: check-id.py <expected-id> <csv-file>.
# Validates that the id/participant column of the given CSV matches
# the expected ID, printing colored errors on mismatch.
if __name__ == "__main__":
    id = sys.argv[1]   # expected participant/subject ID (kept as string; compared as int downstream)
    file = sys.argv[2]  # path to the CSV file whose ID column is checked
    check_id(id, file)
53 changes: 22 additions & 31 deletions data-monitoring/check_existence_datatype_folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,57 +67,48 @@
sys.exit("Can't find redcap with name " + vals[0] +", exiting.")
rc_df = pd.read_csv(redcap, index_col = vals[2])
rc_var = vals[1]
#subs_w_data = list(rc_df[rc_df[rc_var+"_"+session+"_e1_complete"] != 0].index) #? always be a _complete column?
subs_w_data = list(rc_df[rc_df[rc_var+"_"+session+"_e1_complete"] == 2].index) #? always be a _complete column?
tracker_df.loc[subs_w_data, visit+'_status_'+session+'_e1'] = 1
for sub in subs_w_data:
ignore_no_data = False
no_data_tasks = []
for task, dtype in task_datatype.items():
if dtype == 'combination':
comb_vars = df_dd.loc['arrow-alert_psychopy','provenance'].split(":")[1].split(",")
comb_vars = [x.strip("\" ") for x in comb_vars]
#comb_vars = [x.strip("\" ") for x in df_dd.loc['arrow-alert_psychopy','provenance'].split(":")[1].split(",")
folders = []
for var in comb_vars:
folders.append(df_dd.loc[var,'dataType'])
for folder in folders:
if isdir(join(checked, 'sub-'+str(int(sub)), session, folder)):
for dfile in listdir(join(checked, 'sub-'+str(int(sub)), session, folder)):
if dfile == "no-data.txt":
if task not in no_data_tasks:
no_data_tasks.append(task)
continue
if isdir(join(checked, 'sub-'+str(int(sub)), session, dtype)):
for dfile in listdir(join(checked, 'sub-'+str(int(sub)), session, dtype)):
if dfile == "no-data.txt":
ignore_no_data = True
no_data_tasks.append(task)
break
if ignore_no_data:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
# don't print error if no data
continue
allpresent = True
#corrected = False
checked = join(dataset, 'sourcedata', 'checked')
checked = join(dataset, 'sourcedata', 'checked') #?
missing_tasks = []
ignore_no_data = False
for task in vals[3]:
if not tracker_df.loc[sub, task + '_' + session + '_e1'] == 1:
if not tracker_df.loc[sub, task + '_' + session + '_e1'] == 1 and task not in no_data_tasks:
allpresent = False
missing_tasks.append(task)
elif task in no_data_tasks:
ignore_no_data = True
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
if allpresent and ignore_no_data:
continue # no error
if allpresent:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 1
else:
tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
print("\033[31mError: Expected tasks " + ", ".join(missing_tasks) + " not seen in subject " + str(sub) + ", session " + session + ".\033[0m")
# check raw
#raw = join(dataset, 'sourcedata', 'raw', session)
#for task in vals[3]:
# if not isdir(join(raw, task)):
# #error
# else:
# for sub in subs_w_data:
# if not isdir(join(raw, task, 'sub-'+str(int(sub)))):
# #error
## check checked
#checked = join(dataset, 'sourcedata', 'checked')
#for task in vals[3]:
# allpresent = True
# for sub in subs_w_data:
# if not isdir(join(checked, 'sub-'+str(int(sub)), session, task)):
# #error
# allpresent = False
#if allpresent:
# tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 1
#else:
# tracker_df.loc[sub, visit+'_data_'+session+'_e1'] = 0
tracker_df.to_csv(tracker)


Expand Down
3 changes: 2 additions & 1 deletion data-monitoring/hallMonitor.sub
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#module load miniconda3-4.5.11-gcc-8.2.0-oqs2mbg
#./hallMonitor.sh
module load singularity-3.8.2
singularity exec -e /home/data/NDClab/tools/containers/python-3.8/python-3.8.simg ./hallMonitor.sh
singularity exec -e /home/data/NDClab/tools/containers/python-3.9/python-3.9.simg ./hallMonitor.sh

source /home/data/NDClab/tools/lab-devOps/scripts/monitor/tools.sh
logfile="data-monitoring-log.md"
Expand All @@ -18,6 +18,7 @@ if [[ $NUMERRORS -gt 0 ]]; then
error_detected="true"
else
error_detected="false"
touch slurm-${SLURM_JOB_ID}_errorlog.out
fi
if [ $error_detected = true ]; then
update_log "error; $NUMERRORS errors seen, check slurm-${SLURM_JOB_ID}_errorlog.out for more info" $logfile
Expand Down
7 changes: 5 additions & 2 deletions data-monitoring/preprocess_wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,12 @@ fi
if [[ -z "$score_only" ]]
then
#mem_needed=$(( $totalsubs * 10 )) # ~10gb / sub
if [[ $totalsubs -lt 4 ]]
then
cpus=$totalsubs
fi
mem_needed=$(( $cpus * 10 )) # ~10gb / sub
#walltime_needed=$(( (totalsubs+cpus-1) / 4 * 10 ))
walltime_needed=$(( (totalsubs+cpus-1) / 4 * 16 ))
walltime_needed=$(( (totalsubs+cpus-1) / 4 * 10 ))
sbatch --mem=${mem_needed}G --time=${walltime_needed}:00:00 --cpus-per-task=$cpus --account=iacc_gbuzzell --partition=highmem1 --qos=highmem1 --export=ALL,sstr=${sstr},nstr=${nstr} preprocess.sub
else
sbatch --mem=1G --time=00:30:00 --export=All,score=${score_only},dummy=${dummy} preprocess.sub
Expand Down
Empty file modified data-monitoring/update-tracker-postMADE.py
100644 → 100755
Empty file.

0 comments on commit 353b684

Please sign in to comment.