Merge pull request #1 from hharveygit/main

Initial commit of HH preprocessing code and scalar broadband analysis, from discussion this morning
MultimodalNeuroimagingLab · Mar 19, 2024 · 3f31f5e · 3f31f5e
2 parents f4611ea + a2e75f9
commit 3f31f5e
Show file tree

Hide file tree

Showing 14 changed files with 1,013 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+output/
+setPathsNSD.m
+.DS_Store
+*/.DS_Store
diff --git a/README_preprocessing_notes_HH.txt b/README_preprocessing_notes_HH.txt
@@ -0,0 +1,30 @@
+Follow order of code in main.m
+
+1. preprocNSDMef
+	- subjectConfig: which tasks and runs (copies) to load? Which out of these are bad to skip?
+	- metadata/sub_sessions.txt -- loaded and added as column in events to use as possible predictor
+	- load mef data
+	- create events structure that only has stim and test trials. Rest are "pre_duration"
+		- Keep only trials that are good DURING trial and in rest before trial
+		- Also remove trials with high interictals during or in rest before trial
+		- random effects predictors: task number run, session
+	- ieeg_highpass data
+	- convert data to match events, on interval [-2, 2)s. Remove any trials missing this full range
+	- re-referencing by CARVariance, with 'var' option and bottom 25%ile. badChs are SOZ and "bad" chs
+	- concatenate all trials across all runs
+	- badChs unique to each run stored in taskruns variable
+	- save data as .mat file
+	- save eventsST as text file
+	- currently, EKG channel is not saved
+
+2. analyzePreprocScalarBB
+	- Remove all trials corresponding to bad tasks/runs
+	- bipolar-reference electrodes, using manually written lead segment data in metadata/.
+	- for each run, find which bipolar electrodes are bad. Create masks (chs x trs) that equal 1 when the channel is bad in the run in which the trial is located
+	- for electrode localization, I use getXyzsInf to inflate electrodes. Checks 4mm of white, and also 2mm of pial, which is assigned if none found in proximity to white.
+	- aCAR electrode analysis
+		- PSDs: hann(srate/4), pwelch on 0-0.5s, normalized by -0.5-0s.
+		- scalar broadband power = geometric mean of PSD from 70 to 170 Hz in 1 Hz bins, ignoring 115-125 range
+		- rsqs are calculated using power. Keep only channels with at least 5 good runs. Saved as elecsBB1.tsv -> do this in log power as well
+		- Noise ceiling SNR and permutation p-value calculated with estimateNCSNR on log-transformed scalar broadband power. Saved as elecsBB2.tsv
+	- The same thing as above is done for the bipolar-referenced electrodes
diff --git a/data/metadata/sub-X_segmentedLeads.txt b/data/metadata/sub-X_segmentedLeads.txt
@@ -0,0 +1,2 @@
+seg5	seg6
+LB	LA, LC
diff --git a/data/metadata/sub-X_sessions.txt b/data/metadata/sub-X_sessions.txt
@@ -0,0 +1,11 @@
+task	run	session	notes
+1	1	1	
+2	1	1	
+3	1	2	
+4	1	2	
+5	1	3	
+6	1	3	notes about something important in this run
+7	1	3	
+8	1	4	
+9	1	4	
+10	1	4	
diff --git a/data/stims/special100Names.txt b/data/stims/special100Names.txt
@@ -0,0 +1,100 @@
+shared0004_nsd03078
+shared0008_nsd03172
+shared0022_nsd03914
+shared0030_nsd04424
+shared0033_nsd04668
+shared0052_nsd05584
+shared0064_nsd06522
+shared0069_nsd06802
+shared0073_nsd07208
+shared0137_nsd11636
+shared0139_nsd11726
+shared0140_nsd11797
+shared0145_nsd11943
+shared0157_nsd12923
+shared0159_nsd13139
+shared0163_nsd13614
+shared0186_nsd15365
+shared0194_nsd16345
+shared0197_nsd16617
+shared0211_nsd17375
+shared0234_nsd19437
+shared0267_nsd21951
+shared0287_nsd22880
+shared0300_nsd23994
+shared0307_nsd24531
+shared0310_nsd24641
+shared0318_nsd25112
+shared0326_nsd25372
+shared0334_nsd25703
+shared0350_nsd26459
+shared0358_nsd27243
+shared0362_nsd27436
+shared0369_nsd28069
+shared0378_nsd28350
+shared0382_nsd28596
+shared0404_nsd30396
+shared0405_nsd30408
+shared0425_nsd31660
+shared0463_nsd34830
+shared0474_nsd35987
+shared0487_nsd36975
+shared0488_nsd36979
+shared0491_nsd37225
+shared0498_nsd38023
+shared0507_nsd38487
+shared0520_nsd39370
+shared0530_nsd40549
+shared0535_nsd40847
+shared0568_nsd42698
+shared0570_nsd42852
+shared0579_nsd43225
+shared0588_nsd43676
+shared0589_nsd43690
+shared0591_nsd43819
+shared0610_nsd44845
+shared0614_nsd45214
+shared0616_nsd45596
+shared0623_nsd45982
+shared0634_nsd46373
+shared0646_nsd47071
+shared0650_nsd47294
+shared0689_nsd50115
+shared0694_nsd50756
+shared0695_nsd50812
+shared0700_nsd51078
+shared0727_nsd53053
+shared0730_nsd53156
+shared0733_nsd53371
+shared0745_nsd54258
+shared0746_nsd54362
+shared0754_nsd54914
+shared0764_nsd55679
+shared0768_nsd55969
+shared0786_nsd56785
+shared0789_nsd56912
+shared0790_nsd56949
+shared0797_nsd57554
+shared0811_nsd59024
+shared0825_nsd59995
+shared0853_nsd61874
+shared0857_nsd62210
+shared0869_nsd63183
+shared0876_nsd63932
+shared0882_nsd64499
+shared0896_nsd65254
+shared0905_nsd65800
+shared0910_nsd66005
+shared0925_nsd66977
+shared0936_nsd67830
+shared0941_nsd68340
+shared0944_nsd68742
+shared0948_nsd68898
+shared0960_nsd69814
+shared0962_nsd69840
+shared0968_nsd70194
+shared0969_nsd70233
+shared0974_nsd70506
+shared0986_nsd71411
+shared0991_nsd72016
+shared0999_nsd72720
diff --git a/functions/estimateNCSNR.m b/functions/estimateNCSNR.m
@@ -0,0 +1,90 @@
+%% This function estimates the NCSNR of input signal strengths, as the SD of signal divided by the SD of noise
+%   Derived from estimateNoiseCeiling.m
+%
+%   Variance due to noise is estimated from trials of the same stimulus, averaged across all stimuli. The remaining
+%   variance is explained by signal, at the single trial level. If numAvg is given, will return the noise ceiling
+%   corresponding to averaging numAvg trials together (noise variance is reduced numAvg-fold, assuming Gaussian).
+%
+%   NCSNR = estimateNoiseCeiling(strength);
+%   [NCSNR, p] = estimateNoiseCeiling(strength, nperm);
+%       strength =          nx1 cell, measure of scalar signal strength for each trial with n unique stimuli and variable trials per stimulus.
+%                               Examples include average bandpower or a coefficient fit for the trial on some template (e.g. fmri beta)
+%       nPerm =            (optional) positive integer, number of permutations to use to construct permutation null model. Default = 1000
+%
+%   Returns:
+%       NCSNR =             double, SNR as (SD of Signal)/(SD of noise).
+%
+%   Based on:
+%   Allen, E.J., St-Yves, G., Wu, Y. et al. A massive 7T fMRI dataset to bridge cognitive neuroscience and artificial intelligence.
+%       Nat Neurosci 25, 116?126 (2022). https://doi.org/10.1038/s41593-021-00962-x
+%
+%   For derivations and detailed explanation, please see noise ceiling estimation in Methods section of article.
+%       - Some more info is in "Functional data (NSD)/Noise ceiling" in NSD data manual, https://cvnlab.slite.com/p/channel/CPyFRAyDYpxdkPK6YbB5R1
+%
+%   Demo:
+%       NCSNRtest = zeros(10000, 1);
+%       for ii = 1:10000
+%           xx = mat2cell(rand*10*randn(600, 1) + rand*10, 6*ones(100, 1));
+%           NCSNRtest(ii) = estimateNCSNR(xx);
+%       end
+%       figure; histogram(NCSNRtest, 30);
+%
+% HH 2022/05
+%
+function [NCSNR, p, NCSNRNull] = estimateNCSNR(strength, nPerm)
+
+
+    % Formatting: vertical cell array and vertical array in each cell, remove nan entries
+    strength = strength(:);
+    strength = cellfun(@(x) x(:), strength, 'UniformOutput', false);
+    strength = cellfun(@(x) x(~isnan(x)), strength, 'UniformOutput', false);
+
+    % number of entries in each cell
+    cellLengths = cellfun(@length, strength);
+    assert(sum(cellLengths >= 3) > 0.5*length(cellLengths), 'Error: More than half of conditions have <3 trials. Noise SD is poorly estimated');
+
+    % Zscore strength so SD = var = 1
+    meanStrength = mean(cell2mat(strength));
+    SDStrength = std(cell2mat(strength), 0); % using N-1 unbiased estimator
+    Zstr = cellfun(@(x) (x - meanStrength)/SDStrength, strength, 'UniformOutput', false);
+
+    % Calculate NCSNR
+    NCSNR = calcNCSNR(Zstr);
+
+    if nargout < 2, return; end % no p-value requested
+
+    % Calculate null distribution and p-value
+    if nargin < 2 || isempty(nPerm), nPerm = 1000; end
+    assert(nPerm > 0, 'Error: nPerm must be a positive integer');
+    NCSNRNull = nan(nPerm, 1);
+    ZstrAll = cell2mat(Zstr);
+    n = length(ZstrAll); % total number of strength entries
+    for ii = 1:nPerm
+        Perm = randperm(n);
+        ZstrNull = mat2cell(ZstrAll(Perm), cellLengths);
+        NCSNRNull(ii) = calcNCSNR(ZstrNull);
+    end
+    p = sum(NCSNRNull >= NCSNR) / nPerm;
+
+end
+
+% Helper function so as to call it directly with normalized ZstrNull
+function NCSNR = calcNCSNR(Zstr)
+
+    cellLengths = cellfun(@length, Zstr);
+
+    % residual variance across trials of each stimuli, averaged across stimuli.
+    % Reason to avg stimuli with >= 3 trials bc Bessel correction is still biased for SD (underestimates noise SD here). 3 matches sample size in NSD fMRI
+    varNoise = mean(cellfun(@(x) var(x, 0), Zstr(cellLengths >= 3)));
+    sdNoise = sqrt(varNoise);
+
+    varSig = 1 - varNoise; % variability explained by signal. Check this distribution using demo. White noise is expected to have varSig ~0 on avg
+
+    if varSig <= 0
+        sdSig = 0; % half-rectification of signal SD (signal variance can be negative like R^2)
+    else
+        sdSig = sqrt(varSig); % sqrt of variance explained by signal
+    end
+    NCSNR = sdSig/sdNoise; % noise ceiling signal-to-noise ratio
+
+end
diff --git a/functions/getSpecial100Idxes.m b/functions/getSpecial100Idxes.m
@@ -0,0 +1,27 @@
+%% Finds trial indices corresponding to each special 100 image, by path name
+% Returns a 100 x 2 cell array. First column = name of special100 file, second column = trial indices
+%
+function special100Idxes = getSpecial100Idxes(stimFiles)
+
+    if exist('data/stims/special100Names.txt', 'file') % read stim names directly if they've been written
+        stimNames = readcell('data/stims/special100Names.txt', 'Delimiter', ',');
+        assert(length(stimNames) == 100, 'Error: special100Names did not contain 100 entries');
+    else % get names from image set and save as output
+        % Get names of stimuli
+        stimPaths = glob('data/stims/special100/*.png');
+        assert(length(stimPaths) == 100, 'Error: Did not find 100 special100 files');
+        stimNames = extractBetween(stimPaths, 'special100/', '.png');
+        writecell(stimNames, 'data/stims/special100Names.txt');
+    end
+
+    % find indices of each special100 image
+    idxes = cell(100, 1);
+    for ii = 1:100
+        idxes{ii} = find(contains(stimFiles, stimNames{ii}));
+        assert(~isempty(idxes{ii}), 'Error: No trial index found for special100 image %d\n', ii);
+    end
+
+    % Add name and indices
+    special100Idxes = [stimNames, idxes];
+
+end
diff --git a/functions/getXyzsInf.m b/functions/getXyzsInf.m
@@ -0,0 +1,65 @@
+%% This function returns inflated gifti XYZ positions from an electrodes.tsv table
+% Adapted from DH ieeg_snap2inflate.m to perform separately for each hemisphere
+%
+%   xyzsInf = getXyzsInf(electrodes, hemi, gii, giiInf);
+%   xyzsInf = getXyzsInf(electrodes, hemi, gii, giiInf, distThresh);
+%       electrodes =        nx_ table, read from an electrodes.tsv file. Must contain column "hemisphere" (str, 'R' or 'L').
+%                               Must also contain columns "Destrieux_label" (numerical) and "Destrieux_label_text" if <checks> is given as true.
+%       hemi =              char, 'r' or 'l', corresponding to the hemisphere of the pial and inflated giftis
+%       gii =               gifti object, pial surface of one hemisphere
+%       giiInf =            gifti object, inflated surface of one hemisphere
+%       distThresh =        double (optional), Only electrodes within <distThresh> mm of a pial vertex are kept in output.
+%                               Default = 5.
+%       checks =            logical (optional), whether to more strictly check that each electrode is cortical and not WM/thalamic/outside brain. Default == false
+%
+%   Returns:
+%       xyzsInf =           nx3 double, XYZ positions of gray matter electrodes in inflated gifti space. Electrodes in
+%                               white matter, contralateral hemisphere, or beyond <distThresh> are returned as [NaN, NaN, NaN] rows.
+%
+%   HH 2021
+%
+function xyzsInf = getXyzsInf(electrodes, hemi, gii, giiInf, distThresh, checks)
+
+    if nargin < 6, checks = false; end % to perform checks based on Destrieux atlas names
+    if nargin < 5 || isempty(distThresh), distThresh = 5; end % maximum distance allowed between valid electrode and gifti vertex
+
+    hemi = lower(hemi);
+    assert(ismember(hemi, {'r', 'l'}), "hemi must be given as 'r' or 'l'");
+    assert(size(gii.vertices, 1) == size(giiInf.vertices, 1), 'gii and giiInf do not have the same number of vertices');
+
+    xyzs = [electrodes.x, electrodes.y, electrodes.z];
+    hemiLabs = lower(electrodes.hemisphere);
+
+    if checks
+        anatLabs = electrodes.Destrieux_label; % numerical labels
+        anatText = electrodes.Destrieux_label_text; % char labels
+    else
+        anatLabs = zeros(height(electrodes), 1);
+        anatText = cell(height(electrodes), 1);
+    end
+
+    xyzsInf = nan(size(xyzs));
+    for ii = 1:size(xyzs, 1)
+
+        if ~strcmp(hemiLabs(ii), hemi) % wrong hemisphere, continue
+            continue
+        end
+
+        if checks && (isnan(anatLabs(ii)) || ismember(anatLabs(ii), [0, 41, 49, 10])) % non existent, outside brain, WM, or L/R thalamic
+            continue
+        end
+
+        if checks && (~startsWith(anatText{ii}, sprintf('%sh', hemi))) % must start with lh or rh to be a cortical site
+            continue
+        end
+
+        dists = vecnorm(gii.vertices - xyzs(ii, :), 2, 2); % distance from each vertex in gifti to current electrode
+        [minDist, idx] = min(dists);
+
+        if minDist <= distThresh
+            xyzsInf(ii, :) = giiInf.vertices(idx, :);
+        end
+
+    end
+
+end
diff --git a/functions/goodChsNRuns.m b/functions/goodChsNRuns.m
@@ -0,0 +1,33 @@
+% This function returns which channels are good across at least n runs, where the input is bad chs for each run
+%
+%   goodChs = goodChsNRuns(badChs, numChs, n);
+%   [goodChs, badChs] = goodChsNRuns(badChs, numChs, n);
+%
+%   INPUTS
+%       badChsRuns =        1xn numerical cell array. bad channels in each run
+%       numChs =        num. Total number of channels
+%       n      =        num. Minimum number of runs for a channel to be good in
+%
+%   OUTPUTS
+%       goodChsOnN =       1xp num. Indices of good channels across >= n runs
+%       badChsOnN =        1xq num. Indices of bad channels (complement of goodChs on 1:numChs)
+%
+function [goodChsOnN, badChsOnN] = goodChsNRuns(badChsRuns, numChs, n)
+
+    numRuns = length(badChsRuns);
+    assert(n >= 1 && n <= numRuns, 'Error: n must be specified between 1 and %d runs', numRuns);
+
+    % chs x runs. 1 indicates a bad channel, 0 indicates a good channel
+    badChsLogical = zeros(numChs, numRuns, 'logical');
+    for ii = 1:numRuns
+        badChsLogical(badChsRuns{ii}, ii) = true;
+    end
+
+    % find which rows have at least n 0s
+    goodChsLogical = sum(~badChsLogical, 2) >= n;
+
+    % convert to numerical indices, and also return complement as bad chs
+    goodChsOnN = find(goodChsLogical);
+    badChsOnN = find(~goodChsLogical);
+
+end