-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscand.m
273 lines (259 loc) · 8.84 KB
/
scand.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
function model = scand(X, Y, Z, G, varargin)
% SCAND Approximately fit a linear mixed effects model to the data.
%
% SCAND (Speedy Cluster Analysis for Neuroimaging Data) is a high-level
% function for approximating a linear mixed effects model to large data
% for which optimal fitting with REML (restricted maximum likelihood)
% algorithms would be prohibitively slow. The function syntax approximates
% that of Matlab's FITLMEMATRIX:
%
% model = scand(X, Y, Z, G, 'method', method_name ...)
%
% And approximately fits a subset of the random effects models supported by
% fitlmematrix, specifically:
%
% Y = X*B + sum(sigma2_i * Z_i * Z_i') + e
%
% The required parameters are:
%
% X: The n observations x p predictors fixed-effects design matrix.
% Y: The n x m observed data, with m independent columns.
% Z: The n x q predictors random-effects design matrix, see notes below.
% G: An n x 1 categorical grouping variable.
%
% Arguments X and Y have the same meaning as in ordinary least squares
% regression. The random-effects design matrix Z follows a Matlab
% convention and is different from the "raw" fixed effects design matrix.
% Consider the simple case of modeling a random interecept for three
% clusters (groups). A typical random-effects design matrix strcmpwould have
% three columns encoding the three groups with ones and zeros:
%
% Z =
% 1 0 0
% 1 0 0
% 0 1 0
% 0 1 0
% 0 0 1
% 0 0 1
%
% Matlab (and scand) use an alternative convention in which Z is a single
% column and groups are encoded in a separate, categorical variable G:
%
% Z =
% 1
% 1
% 1
% 1
% 1
% 1
% G =
% 1
% 1
% 2
% 2
% 3
% 3
%
% If there is more than one grouping variable, then Z and G can each be a
% cell array of the same length, where each cell contains a matrix Z and
% vector G.
%
% SCAND accepts an optional argument, method, which can be one of:
%
% swe
% Fits random effects as fixed effects and adjusts the standard error
% of the marginal model using a sandwich estimator. (default)
%
% mom
% Method of moments, slower than swe but also generates estimates of
% the random effect variances. May also generate more efficient
% estimates of the marginal standard errors under some circumstances.
%
% reml
% Calls Matlab's FITLMEMATRIX. Slowest but most accurate. Useful for
% comparing models fit with one of the approximate methods above
% against the "gold standard" results. Any additional arguments
% passed to SCAND will be forwarded to FITLMEMATRIX.
%
% The output of SCAND is a structure with the following elements:
%
% B
% The (marginal) fixed effects estimates,
% in the same order as the columns in X.
%
% covB
% The variance-covariance matrix of the fixed effects, B.
% Note: to compute t-values, take B ./ sqrt(diag(covB))
%
% SE
% If SE_only is true (see below), returns only the standard error
% instead of covB. SE = sqrt(diag(covB))
%
% u
% The random effects estimates in the same order returned by Matlab's
% randomEffects() function. The random effects are ordered first by
% G, then by categories within G, and lastly by columns in Z.
%
% For the method swe, SCAND accepts another optional argument, SE_only
% (default = false). When SE_only is true then a more performant algorithm
% is used to compute only the standard errors of covB: SE =
% sqrt(diag(covB)). This is useful for performing t-tests vs Wald tests.
%
% See also: covB_to_SE()
% Parse arguments.
p = inputParser;
p.KeepUnmatched = true;
addRequired(p, 'X');
addRequired(p, 'Y');
addRequired(p, 'Z');
addRequired(p, 'G');
addParameter(p, 'method', 'swe');
addParameter(p, 'SE_only', false);
parse(p, X, Y, Z, G, varargin{:});
X = p.Results.X;
assert(size(X,2) < size(X,1));
Y = p.Results.Y;
assert(size(Y,1) == size(X,1));
Z = p.Results.Z;
G = p.Results.G;
if ~iscell(Z) && ~iscell(G)
Z = {Z};
G = {G};
end
if iscell(Z) && iscell(G)
assert(length(Z) == length(G))
for i=1:length(Z)
assert(size(Z{i},1) == size(X,1));
assert(size(Z{i},2) < size(Z{i},1));
assert(size(G{i},1) == size(X,1));
assert(iscategorical(G{i}));
assert(size(G{i},2) == 1);
end
else
error('If one of Z or G is a cell array then both must be cell arrays.');
end
method = p.Results.method;
assert( strcmp(method, 'swe') | strcmp(method, 'mom') | strcmp(method, 'reml') );
reml_args = p.Unmatched;
if ~strcmp(method, 'reml')
assert(isempty(fieldnames(reml_args)));
end
SE_only = p.Results.SE_only;
assert(islogical(SE_only));
if SE_only
assert(strcmp(method, 'swe'));
end
reml_args = struct2opt(reml_args);
clear p
% Invoke the selected method.
if strcmp(method, 'reml')
% Easy case, just delegate to fitlmematrix.
if ndims(Y) == 1 || size(Y,2) == 1
% Just one REML model to fit.
model = fitlmematrix(X, Y, Z, G, reml_args{:});
% Fixed effects.
B = fixedEffects(model);
% Covariance of fixed effects.
covB = model.CoefficientCovariance;
% Random effects.
u = randomEffects(model);
% Variance of each random effect in Z.
% The mse is the remaining homoskedastic variance.
[psi, mse] = covarianceParameters(model);
sigmas = cell2mat(cellfun(@(x)diag(x), psi, 'UniformOutput', false));
else
% Preallocate memory for results.
B = zeros(size(X,2), size(Y,2));
covB = zeros(size(X,2), size(X,2), size(Y,2));
u_len = 0; % number of random effects to allocate memory for
z_len = 0; % number of random effect variances (sigmas)
for i=1:length(G)
u_len = u_len + size(Z{i},2)*length(categories(G{i}));
z_len = z_len + size(Z{i},2);
end
u = zeros(u_len, size(Y,2));
sigmas = zeros(z_len, size(Y,2));
mse = zeros(1, size(Y,2));
% Fit a separate model independently for each column in Y.
parfor i=1:size(Y,2)
% Invoke fitlmematrix() to solve the model for this column.
model = fitlmematrix(X, Y(:,i), Z, G, reml_args{:});
% Unpack and store the results.
B(:,i) = fixedEffects(model);
covB(:,:,i) = model.CoefficientCovariance;
u(:,i) = randomEffects(model);
[psi, mse_i] = covarianceParameters(model);
mse(1,i) = mse_i;
sigmas(:,i) = cell2mat(cellfun(@(x)diag(x), psi, 'UniformOutput', false));
if mod(i, 1000) == 0
fprintf(1, '%0.2f%%\n', i/size(Y,2)*100);
end
end
clear mse_i
end
% Compose the results.
model = struct( ...
'B', B, ...
'covB', covB, ...
'u', u, ...
'sigmas', sigmas, ...
'mse', mse ...
);
elseif strcmp(method, 'swe')
% Autodetect which columns in X correspond are also columns in Z.
% Generates a cell array of X times Z matrices, one for each cell entry
% in Z. The matrix rows are columns in X. The matrix columns are
% columns in Z. A matrix element of 1 (true) means the column in X is
% also a column in Z. The matrix is zero everywhere else.
%
% TODO current version of scand() does not need this functionality.
% Consider removing completely in future version.
%{
XZ_maps = cell(1, length(Z));
for i=1:length(XZ_maps)
r = corr(X,Z{i});
r(diag(isnan(diag(r)))) = 1;
r(isnan(r)) = 0;
XZ_maps{i} = r > 0.99;
end
clear r
%}
% Augment the matrix X with a matrix of additional fixed effect
% covariates to account for the random effects.
Xaug = random_to_fixed(Z, G);
% Make sure we have enough degrees of freedom to account for all the
% random effects.
assert((size(X,2) + size(Xaug,2)) < size(X,1));
% Solve the model.
B = qr_pinv([X, Xaug]) * Y;
[B, Baug] = deal(B(1:size(X,2), :), B(size(X,2)+1:end, :));
% Approximate the random effect parameters by recombining their
% augmented columns from B.
u = fixed_to_random(Z, G, Baug);
% Find the standard error of the fixed effects using the sandwich
% estimator. We need the pseudoinverse and residuals using the fixed
% effects only.
covB = swe_block(qr_pinv(X), Y-X*B, G, SE_only);
% Compose the results.
model = struct( ...
'B', B, ...
'u', u ...
);
if SE_only
model.SE = covB;
else
model.covB = covB;
end
elseif strcmp(method, 'mom')
error('Not implemented.');
else
error(['Invalid method: ' method]);
end
end
% Unpack struct (i.e. unmatched arguments) into variadic arguments.
function c = struct2opt(s)
fname = fieldnames(s);
fval = struct2cell(s);
c = [fname, fval]';
c = c(:);
end