-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgreedyNMIalignNeighborhood.m
108 lines (94 loc) · 4.27 KB
/
greedyNMIalignNeighborhood.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
function [outClust] = greedyNMIalignNeighborhood(sets, W, maxIter)
%this function takes in a matrix where the rows indicate variables and
%the columns indicate different clusterings of those variables. All
%values should be positive integers except for the value -1, which is
%reserved to indicate unclustered/noise. The input sets are all made to
%fit a standardized format. Then the average normalized mutual
%information between each set and all of the other sets is calculated.
%The set with the best starting average normalized mutual information
%is used as the seed for the algorithm. A greedy search algorithm loops
%through the variables changing each one, one at a time, to all of the
%possible clusters and recalculating normalized mutual information at
%each step. Changes that increase the normalized mutual information are
%accepted. This loop continues until no changes are made. The resulting
%cluster scheme is returned.
%input:
% sets: variables X cluster schemes
% W : weight vector of how much to weight the input sets by
%maxIter: maximum iterations
%output:
% outClust: a vector with a single cluster scheme that has maximal
% average mutual information with all of the input sets
%Adam Dede, [email protected], Fall 2021
%Ref: Strehl, A., Ghosh, J. (2002). Cluster Ensembles--A knowledge reuse
%framework for combining multiple partitions. J Machine Learning Research
%% first make the input sets fit the form
%1) label of item 1 = 1 (or -1, and first clustered electrode is
%labeled as 1)
%2) label of item i+1 is <= max( of all labels used in items 1:i) + 1
%3) keep -1 label for "noise" electrodes
for ii = 1:size(sets,2) %loop over sets
curID = 1;
curS = sets(:,ii);
outSet = zeros(length(curS),1);
for tt = 1:size(sets,1) %loop over electrodes
if curS(tt)==-1 %don't change noise
outSet(tt) = -1;
else
idx = find(curS(1:tt-1)==curS(tt));
if ~isempty(idx) %this cluster has been seen before, so grab whatever label was used before
outSet(tt) = outSet(idx(1));
else %this is the first instance of this cluster
outSet(tt) = curID;
curID = curID + 1;
end
end
end
sets(:,ii) = outSet;
end
W = W ./ sum(W);
W = eye(length(W)) .* W;
%% get the starting best average nmi and use it as a starting seed
test = mean(W*(reshape(arrayfun(@(x,y) nmi(sets(:,x), sets(:,y)), ...
reshape(repmat([1:size(sets,2)],[size(sets,2),1]), [size(sets,2)^2,1]), ...
reshape(repmat([1:size(sets,2)],[size(sets,2),1])', [size(sets,2)^2,1])), ...
[size(sets,2), size(sets,2)])-eye(size(W))), 'omitnan');
[~, seedI] = max(test);
outClust = sets(:,seedI);
valToBeat = mean(diag(W)'.*arrayfun(@(x) nmi(outClust, sets(:,x)), [1:size(sets,2)]), 'omitnan');
IDs = unique(sets); %possible cluster IDs
check = true;
loopi = 1;
%loop through all trodes and try changing each to each possible ID,
%recheck aNMI, if better accept change, if not, don't, stop looping
%when no changes are made for a whole run through the trodes
while check
noChange = true;
changei = 0;
% tic
for ii = 1:length(outClust)
curID = outClust(ii);
center = find(IDs==curID);
if center<4
center = 4;
end
if center+3>length(IDs)
center = length(IDs) - 3;
end
for kk = center-3:center+3
testSet = outClust;
testSet(ii) = IDs(kk);
test = mean(diag(W)'.*arrayfun(@(x) nmi(testSet, sets(:,x)), [1:size(sets,2)]), 'omitnan');
if test > valToBeat
noChange = false;
outClust = testSet;
valToBeat = test;
end
end
end
loopi = loopi + 1;
if noChange || loopi > maxIter
check = false;
end
end
end