-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
253 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
#------------------------------------------------------------- | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# | ||
#------------------------------------------------------------- | ||
# | ||
# Builtin function that implements the k-Means clustering algorithm | ||
# | ||
# INPUT: | ||
# --------------------------------------------------------------------------------------- | ||
# X The input Matrix to do KMeans on. | ||
# k Number of centroids | ||
# runs Number of runs (with different initial centroids) | ||
# max_iter Maximum number of iterations per run | ||
# eps Tolerance (epsilon) for WCSS change ratio | ||
# is_verbose do not print per-iteration stats | ||
# avg_sample_size_per_centroid Average number of records per centroid in data samples | ||
# seed The seed used for initial sampling. If set to -1 | ||
# random seeds are selected. | ||
# --------------------------------------------------------------------------------------- | ||
# | ||
# OUTPUT: | ||
# --------------------------------------------------------------- | ||
# Y The mapping of records to centroids | ||
# C The output matrix with the centroids | ||
# --------------------------------------------------------------- | ||
|
||
def get_sample_maps(num_records:si64, num_samples:si64, approx_sample_size:si64, seed:si64) -> matrix<f64>, matrix<f64>, si64 { | ||
sample_col_map = [0.0]; | ||
if (approx_sample_size < num_records) { | ||
sample_block_size = as.si64(approx_sample_size + round(10 * sqrt(approx_sample_size))); | ||
num_rows = sample_block_size * num_samples; | ||
sample_rec_ids = rand(sample_block_size, num_samples, 0.0, 1.0, 0.5, seed); | ||
sample_rec_ids = round(log(sample_rec_ids, 2) / log(1.0 - as.f64(approx_sample_size) / as.f64(num_records), 2) + 0.5); | ||
sample_rec_ids = cumSum(sample_rec_ids); | ||
is_sample_rec_id_within_range = (sample_rec_ids <= num_records); | ||
sample_rec_ids = as.f64(sample_rec_ids * is_sample_rec_id_within_range + (num_records + 1) * (1 - is_sample_rec_id_within_range)); | ||
sample_rec_ids = reshape(as.f64(sample_rec_ids), num_rows, 1); | ||
is_row_in_samples = reshape(as.f64(is_sample_rec_id_within_range), num_rows, 1); | ||
sample_maps = ctable(seq(as.f64(1), num_rows, 1 <= num_rows ? 1 : -1), sample_rec_ids, num_rows, num_records); | ||
sample_positions = (num_rows + 1) - as.f64(is_row_in_samples) * seq(as.f64(num_rows), 1, (-1.0)); | ||
col_positions = round(0.5 + seq(as.f64(0), num_rows - 1, 1) / sample_block_size); | ||
sample_col_map = ctable(sample_positions, col_positions - 1); | ||
sample_col_map = as.matrix<f64>(sample_col_map[0:(num_rows), ]); | ||
return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size; | ||
} | ||
one_per_record = fill(as.f64(1), num_records, 1); | ||
sample_block_size = as.si64(num_records); | ||
sample_maps = fill(as.f64(0), (num_records * num_samples), num_records); | ||
sample_col_map = fill(as.f64(0), (num_records * num_samples), num_samples); | ||
for(i in 1:num_samples) { | ||
sample_maps[(num_records*(i-1)+1) - 1:(num_records*i), ] = diagMatrix(one_per_record); | ||
sample_col_map[(num_records*(i-1)+1) - 1:(num_records*i), i - 1] = one_per_record; | ||
} | ||
return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size; | ||
} | ||
|
||
def m_kmeans(X:matrix<f64>, k:si64 /*= 10*/, runs:si64 /*= 10*/, max_iter:si64 /*= 1000*/, eps:f64 /*= 0.000001*/, is_verbose:bool /*= false*/, avg_sample_size_per_centroid:si64 /*= 50*/, seed:si64 /*= (-1.0)*/) -> matrix<f64>, matrix<f64> { | ||
if (is_verbose) { | ||
print("BEGIN K-MEANS SCRIPT"); | ||
} | ||
|
||
num_records = as.si64(nrow(X)); | ||
num_features = as.si64(ncol(X)); | ||
num_centroids = k; | ||
num_runs = runs; | ||
|
||
if (is_verbose) { | ||
print("dim X="); print(as.si64(nrow(X))); print("x"); print(as.si64(ncol(X))); | ||
} | ||
|
||
sumXsq = sum(X ^ 2); | ||
|
||
if (is_verbose) { | ||
print("Taking data samples for initialization..."); | ||
} | ||
|
||
sample_maps, samples_vs_runs_map, sample_block_size = get_sample_maps(num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed); | ||
|
||
is_row_in_samples = sum(sample_maps, 0); | ||
X_samples = sample_maps @ X; | ||
X_samples_sq_norms = sum(X_samples ^ 2, 0); | ||
|
||
if (is_verbose) { | ||
print("Initializing the centroids for all runs..."); | ||
} | ||
|
||
All_Centroids = fill(as.f64(0), num_runs * num_centroids, num_features); | ||
|
||
min_distances = is_row_in_samples; | ||
|
||
for(i in 1:num_centroids) { | ||
min_distances_matrix_form = reshape(min_distances, sample_block_size, num_runs); | ||
cdf_min_distances = cumSum(min_distances_matrix_form); | ||
|
||
lseed = seed == (-1.0) ? (-1.0) : as.f64(seed + i); | ||
random_row = rand(1, num_runs, 0.0, 1.0, 1.0, lseed); # this is not equal to: rand(rows=1, cols=num_runs, seed=lseed) | ||
threshold_matrix = random_row * cdf_min_distances[sample_block_size - 1, ]; | ||
centroid_ids = t(sum(cdf_min_distances < threshold_matrix, 1)) + 1; | ||
|
||
centroid_placer = ctable(seq(as.f64(1), num_runs, 1 <= num_runs ? 1 : -1), sample_block_size * seq(as.f64(0), num_runs - 1, 0 <= num_runs - 1 ? 1 : -1) + centroid_ids, num_runs, sample_block_size * num_runs); | ||
centroids = centroid_placer @ X_samples; | ||
|
||
centroid_placer = ctable(seq(as.f64(i), num_centroids * (num_runs - 1) + i, num_centroids), seq(as.f64(1), num_runs, 1), as.si64(nrow(All_Centroids)), num_runs); | ||
All_Centroids = as.matrix<f64>(All_Centroids + centroid_placer @ centroids); | ||
|
||
distances = X_samples_sq_norms + samples_vs_runs_map @ sum(centroids ^ 2, 0) - 2 * sum(X_samples * (samples_vs_runs_map @ centroids), 0); | ||
min_distances = i == 1 ? as.matrix<f64>(is_row_in_samples * distances) : as.f64(min(min_distances, distances)); | ||
} | ||
|
||
# Perform K-Means iterations for all runs: | ||
|
||
termination_code = fill(as.f64(0), num_runs, 1); | ||
final_wcss = fill(as.f64(0), num_runs, 1); | ||
num_iterations = fill(as.f64(0), num_runs, 1); | ||
|
||
if (is_verbose) { | ||
print("Performing k-means iterations for all runs..."); | ||
} | ||
|
||
C = [1.0]; | ||
|
||
for(run_index in 1:num_runs) { | ||
C = All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ]; | ||
C_old = C; | ||
iter_count = 0; | ||
term_code = 0; | ||
wcss = inf; | ||
|
||
while (term_code == 0) { | ||
D = (-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0)); | ||
minD = aggMin(D, 0); | ||
wcss_old = wcss; | ||
wcss = as.f64(sumXsq + sum(minD)); | ||
|
||
if (is_verbose) { | ||
if (iter_count == 0) { | ||
print("Run ", 0); print(run_index, 0); print(", At Start-Up: Centroid WCSS = ", 0); print(wcss); | ||
} else { | ||
print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Centroid WCSS = "); print(wcss); print("; Centroid change (avg.sq.dist.) = "); print((sum((C - C_old) ^ 2) / num_centroids)); | ||
} | ||
|
||
} | ||
|
||
P = D <= minD; | ||
P = as.matrix<f64>(P / sum(P, 0)); | ||
P_denom = sum(P, 1); | ||
C_new = (t(P) @ X) / t(P_denom); | ||
|
||
|
||
iter_count = as.si64(iter_count + 1); | ||
if (wcss_old - wcss < eps * wcss) { | ||
term_code = as.si64(1); | ||
} else { | ||
if (iter_count >= max_iter) { | ||
term_code = as.si64(2); | ||
} else { | ||
if (sum(P_denom <= 0) > 0) { | ||
term_code = as.si64(3); | ||
} else { | ||
C_old = as.matrix<f64>(C); | ||
C = as.matrix<f64>(C_new); | ||
} | ||
} | ||
} | ||
} | ||
|
||
if (is_verbose) { | ||
print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Terminated with code = "); print(term_code); print(", Centroid WCSS = "); print(wcss); | ||
} | ||
|
||
All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ] = C; | ||
final_wcss[run_index - 1, 0] = [wcss]; | ||
termination_code[run_index - 1, 0] = as.matrix<f64>(term_code); | ||
num_iterations[run_index - 1, 0] = as.matrix<f64>(iter_count); | ||
} | ||
|
||
# Select the run with the best Centroid-WCSS and output its centroids | ||
|
||
termination_bitmap = fill(as.f64(0), num_runs, 3); | ||
termination_bitmap_raw = ctable(seq(as.f64(1), num_runs, 1), termination_code); | ||
termination_bitmap[, 0:ncol(termination_bitmap_raw)] = termination_bitmap_raw; | ||
termination_stats = sum(termination_bitmap, 1); | ||
|
||
if (is_verbose) { | ||
print("Number of successful runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 0])))); | ||
print("Number of incomplete runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 1])))); | ||
print("Number of failed runs (with lost centroids) = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 2])))); | ||
} | ||
|
||
num_successful_runs = as.scalar(as.f64(termination_stats[0, 0])); | ||
|
||
Y = [1.0]; | ||
|
||
if (num_successful_runs > 0) { | ||
final_wcss_successful = replace(final_wcss, inf, 0) * termination_bitmap[, 0]; | ||
worst_wcss = aggMax(final_wcss_successful); | ||
best_wcss = aggMin(final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap[, 0])); | ||
avg_wcss = sum(final_wcss_successful) / num_successful_runs; | ||
best_index_vector = (final_wcss_successful == best_wcss); | ||
aggr_best_index_vector = cumSum(best_index_vector); | ||
best_index = as.si64(sum(aggr_best_index_vector == 0) + 1); | ||
|
||
if (is_verbose) { | ||
print("Successful runs: Best run is "); print(best_index); print(" with Centroid WCSS = "); print(best_wcss); print("; Avg WCSS = "); print(avg_wcss); print("; Worst WCSS = "); print(worst_wcss); | ||
} | ||
|
||
C = as.matrix<f64>(All_Centroids[(num_centroids*(best_index-1)+1) - 1:(num_centroids*best_index), ]); | ||
D = as.matrix<f64>((-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0))); | ||
P = as.matrix<f64>((D <= aggMin(D, 0))); | ||
aggr_P = t(cumSum(t(P))); | ||
Y = sum(aggr_P == 0, 0) + 1; | ||
|
||
if (is_verbose) { | ||
print("dim C="); print(as.si64(nrow(C))); print("x"); print(as.si64(ncol(C))); print(", dim Y="); print(as.si64(nrow(Y))); print("x"); print(as.si64(ncol(Y))); | ||
} | ||
|
||
} else { | ||
print("K-means: No output is produced. Try increasing the number of iterations and/or lower eps."); | ||
C = fill(as.f64(0), num_centroids, num_records); | ||
Y = fill(as.f64((-1.0)), 1, num_records); | ||
} | ||
|
||
return as.matrix<f64>(C), as.matrix<f64>(Y); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import "kmeans_final.daph"; | ||
|
||
X = rand(5, 100, -10.0, 10.0, 0.5, -1); | ||
k = 2; | ||
runs = 5; | ||
max_iter = 5; | ||
eps = 0.0001; | ||
is_verbose = true; | ||
avg_sample_size_per_centroid = 2; | ||
seed = -1; | ||
|
||
kmeans_final.m_kmeans(as.matrix<f64>(X), k, runs, max_iter, eps, is_verbose, avg_sample_size_per_centroid, seed); |