-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDBscanDynamicEpi.m
281 lines (242 loc) · 10.1 KB
/
DBscanDynamicEpi.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
function [idxVals] = DBscanDynamicEpi(varargin)
%This function is a wrapper for Matlab's builtin DBscan function
%There must be at least one input in the first input position:
%1) corMat : a symmetric matrix of correlations between points to be
% clustered (also works with any symmetric similarity matrix)
%There are four optional inputs (which are assumed to be inputs 2-5):
%2) k : The k parameter for input to DBscan [default = 3]
%3) transform: how the input corMat should be transformed before clustering
% [default = 'raw']
%4) d : the number of extra loops of distancing to perform; only
% works with 'mapDistx' as the transformation; [default = 1]
%5) plotIt : 0 = do not make summary plots; 1 = make summary plots
% [default = 0]
%four main computations are performed:
%1) transformation of the corMat. The goal is to increase the contrast of
%the input matrix to make for better clustering output. The possible
%transformations are specified as string (or numeric) inputs to the transform varargin
%as follows:
% 1 OR 'raw'[default]: pdist2(corMat, corMat, 'correlation')
% 2 OR 'map' : pdist2(corrcoef(corMat), corrcoef(corMat), 'correlation')
% 3 OR 'mapDist' : pdist2(corrcoef(corMat), corrcoef(corMat))
% 4 OR 'mapDist2' : pdist2(corrcoef(corMat), corrcoef(corMat)) .^2
% 5 OR 'mapDistX' : loops d additional times on the output from the
% mapDist2 option
%2) finding of appropriate epsilon value. Using the k-distance values in
%the transformed matrix, an epsilon value is selected that maximizes the
%steepness of the sorted k-distance curve. See discussion of Figure 4 in
%Ester et al. (1986) for more detail.
%3) application of DBscan
%4) plotting if requested
%output is a 1-d vector of cluster assignments where -1 means unclustered and
%positive integers indicate different clusters
%the general idea of transforming the space to be clustered from one of
%point-wise connections/similarities/correlations to one of row by column
%connections using corrcoef(corMat) was inspired by Liu et al. (2012).
%NOTE: the way the threshold for step 2 (finding epsilon) is decided effectively
%assumes that there are 30-100 items to be clustered. This is because it
%disregards the points that are +-5 positions around the maximum in the
%derivative of the k-distance curve. For clustering of larger data sets
%this value should be increased. Key edit at lines 154-160
%Adam Dede, [email protected], Fall 2021
%Refs:
%Liu, Zhu, Chen, 2012. J neurosci Methods.
%Ester, Kriegel, Sander, Xu, 1996
%% get out the inputs
switch nargin
case 1
corMat = varargin{1};
k = 3;
transform = 'raw';
d = 1;
plotIt = 0;
case 2
corMat = varargin{1};
k = varargin{2};
transform = 'raw';
d = 1;
plotIt = 0;
case 3
corMat = varargin{1};
k = varargin{2};
transform = varargin{3};
d = 1;
plotIt = 0;
case 4
corMat = varargin{1};
k = varargin{2};
transform = varargin{3};
d = varargin{4};
plotIt = 0;
case 5
corMat = varargin{1};
k = varargin{2};
transform = varargin{3};
d = varargin{4};
plotIt = varargin{5};
otherwise
warning('Error: at least one input is needed')
return
end
if isnumeric(transform)
if transform==1
transform = 'raw';
elseif transform==2
transform = 'map';
elseif transform==3
transform = 'mapDist';
elseif transform ==4
transform = 'mapDist2';
elseif transform > 4
transform = 'mapDistX';
end
end
if isnumeric(transform)
warning('transform argument must be a positive integer or a string')
return
end
if isempty(k) || ~isnumeric(k)
k = 3;
end
%% transformation step
% various ways of increasing contrast on the input correlationo matrix
% convert into a correlation matrix of relationships between connectivity
% maps rather than between individual channels is inspired by Liu et al., 2012
switch transform
case 'raw'
distMat = pdist2(corMat, corMat, 'correlation');
case 'map'
distMat = pdist2(corrcoef(corMat), corrcoef(corMat), 'correlation');
case 'mapDist'
distMat = pdist2(corrcoef(corMat), corrcoef(corMat));
case 'mapDist2'
distMat = pdist2(corrcoef(corMat), corrcoef(corMat)) .^2;
case 'mapDistX'
distMat = pdist2(corrcoef(corMat), corrcoef(corMat)) .^2;
for ii = 1:d
distMat = pdist2(distMat, distMat).^2;
end
end
%% find appropriate epsilon value:
%f
%% find the k-distance for each point
kdist = zeros(length(distMat),1);
for ii = 1:length(kdist)
curRow = distMat(ii,:);
curRow = sort(curRow);
kdist(ii) = curRow(k);
end
kdist = sort(kdist);
%use the derivative (running diff) of k-distances to find discontinuity
%smoothing is important to identify where the initial sustained rise occurs
kdist_diff = smoothdata(diff(kdist), 'gaussian', length(kdist)/10);
%% create a threshold for what steepness counts as real
% Larger sets won't work so well because of how this threshold is set
% Attempted fix: use of trim variable makes cutting off
% around the max dynamic
[~, max_diff_loc] = max(kdist_diff);
temp = kdist_diff;
trim = round(length(temp)/20);
%trim out the area around the max value (usually going to be the end)
if max_diff_loc<trim
temp(1:trim) = [];
elseif max_diff_loc>length(kdist_diff)-trim
temp(length(kdist_diff)-trim:end) = [];
else
temp(max_diff_loc-ceil(trim/2):max_diff_loc+floor(trim/2)) = [];
end
mean_diff = mean(temp);
std_diff = std(temp);
thresh = mean_diff + 2*std_diff;
%% find candidate inflection points where steepness is maximal
%where does the second derivative cross 0 in the negative direction?
kdist_diff2 = diff(kdist_diff);
kdist_diff2(kdist_diff2<0) = -1;
kdist_diff2(kdist_diff2>0) = 1;
candidates = find(diff(kdist_diff2)==-2) + 1;
%down select to above threshold peaks in steepness
candidates(kdist_diff(candidates)<thresh) = [];
%don't use a peak too near the end or beginning
candidates(candidates>(length(kdist)-trim/2) | candidates<(trim*4)) = [];
%% figure out which steepness peak to use
if isempty(candidates)
%check if the threshold was ever crossed
if max(kdist_diff) > thresh
%it is crossed, so just grab the point where it crosses and use it
peakToUse = find(kdist_diff(trim*4:end)>thresh,1)-1+trim*4;
if peakToUse == 0 && max(kdist_diff(2:end)) > thresh
peakToUse = find(kdist_diff(2:end)>thresh,1);
elseif peakToUse == 0
peakToUse = 1;
end
else
%try again without the smoothing to see if it can be salvaged
kdist_diff = diff(kdist);
kdist_diff2 = diff(kdist_diff);
kdist_diff2(kdist_diff2<0) = -1;
kdist_diff2(kdist_diff2>0) = 1;
candidates = find(diff(kdist_diff2)==-2) + 1;
%down select to above threshold peaks in steepness
candidates(kdist_diff(candidates)<thresh) = [];
%don't use a peak too near the end
candidates(candidates>length(kdist)-5) = [];
if isempty(candidates)
%if there is still no possible peak above threshold, then it
%suggests that the input matrix is just a bunch of noise.
%it might be possible to salvage by trying adding one or more
%rounds of extra distancing in the initial step above:
%distMat = pdist2(distMat, distMat) .^2;
peakToUse = [];
else
peakToUse = candidates(1);
end
end
else
%take the first peak as this will yield the most granular
%clustering
peakToUse = candidates(1);
end
%% now do the clustering!
if ~isempty(peakToUse)
% epsis(epsiidx)
% idxVals = dbscan(distMat, epsis(epsiidx), k, 'Distance','precomputed');
% kdist(peakToUse)
idxVals = dbscan(distMat, kdist(peakToUse), k, 'Distance','precomputed');
%% plot summary of how epsilon was chosen?
if plotIt ==1
figure
subplot(231)
plot(kdist)
hold on
yline(kdist(peakToUse))
xline(peakToUse)
subplot(232)
plot(kdist_diff)
hold on
yline(thresh, '--', 'LineWidth', 3, 'alpha', .25)
xline(peakToUse)
subplot(233)
imagesc(corMat)
subplot(234)
imagesc(distMat)
subplot(235)
[~, order] = sort(idxVals);
imagesc(corMat(order, order))
locs = find(diff(sort(idxVals)));
for val=1:length(locs)
yline(locs(val)+.5, '--', 'LineWidth', 3, 'alpha', .5, 'color', 'k')
xline(locs(val)+.5, '--', 'LineWidth', 3, 'alpha', .5, 'color', 'k')
end
subplot(236)
imagesc(distMat(order, order))
for val=1:length(locs)
yline(locs(val)+.5, '--', 'LineWidth', 3, 'alpha', .5, 'color', 'k')
xline(locs(val)+.5, '--', 'LineWidth', 3, 'alpha', .5, 'color', 'k')
end
end
else
idxVals = ones(length(distMat), 1) * -1;
% "warning: no good epsilon value found, returning -1"
% d
end
end