diff --git a/scripts/algorithms/kmeans_final.daph b/scripts/algorithms/kmeans_final.daph new file mode 100644 index 000000000..9f1b2bc48 --- /dev/null +++ b/scripts/algorithms/kmeans_final.daph @@ -0,0 +1,241 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# +# Builtin function that implements the k-Means clustering algorithm +# +# INPUT: +# --------------------------------------------------------------------------------------- +# X The input Matrix to do KMeans on. +# k Number of centroids +# runs Number of runs (with different initial centroids) +# max_iter Maximum number of iterations per run +# eps Tolerance (epsilon) for WCSS change ratio +# is_verbose do not print per-iteration stats +# avg_sample_size_per_centroid Average number of records per centroid in data samples +# seed The seed used for initial sampling. If set to -1 +# random seeds are selected. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------- +# Y The mapping of records to centroids +# C The output matrix with the centroids +# --------------------------------------------------------------- + +def get_sample_maps(num_records:si64, num_samples:si64, approx_sample_size:si64, seed:si64) -> matrix<f64>, matrix<f64>, si64 { + sample_col_map = [0.0]; + if (approx_sample_size < num_records) { + sample_block_size = as.si64(approx_sample_size + round(10 * sqrt(approx_sample_size))); + num_rows = sample_block_size * num_samples; + sample_rec_ids = rand(sample_block_size, num_samples, 0.0, 1.0, 0.5, seed); + sample_rec_ids = round(log(sample_rec_ids, 2) / log(1.0 - as.f64(approx_sample_size) / as.f64(num_records), 2) + 0.5); + sample_rec_ids = cumSum(sample_rec_ids); + is_sample_rec_id_within_range = (sample_rec_ids <= num_records); + sample_rec_ids = as.f64(sample_rec_ids * is_sample_rec_id_within_range + (num_records + 1) * (1 - is_sample_rec_id_within_range)); + sample_rec_ids = reshape(as.f64(sample_rec_ids), num_rows, 1); + is_row_in_samples = reshape(as.f64(is_sample_rec_id_within_range), num_rows, 1); + sample_maps = ctable(seq(as.f64(1), num_rows, 1 <= num_rows ? 1 : -1), sample_rec_ids, num_rows, num_records); + sample_positions = (num_rows + 1) - as.f64(is_row_in_samples) * seq(as.f64(num_rows), 1, (-1.0)); + col_positions = round(0.5 + seq(as.f64(0), num_rows - 1, 1) / sample_block_size); + sample_col_map = ctable(sample_positions, col_positions - 1); + sample_col_map = as.matrix<f64>(sample_col_map[0:(num_rows), ]); + return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size; + } + one_per_record = fill(as.f64(1), num_records, 1); + sample_block_size = as.si64(num_records); + sample_maps = fill(as.f64(0), (num_records * num_samples), num_records); + sample_col_map = fill(as.f64(0), (num_records * num_samples), num_samples); + for(i in 1:num_samples) { + sample_maps[(num_records*(i-1)+1) - 1:(num_records*i), ] = diagMatrix(one_per_record); + sample_col_map[(num_records*(i-1)+1) - 1:(num_records*i), i - 1] = one_per_record; + } + return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size; +} + +def m_kmeans(X:matrix<f64>, k:si64 /*= 10*/, runs:si64 /*= 10*/, max_iter:si64 /*= 1000*/, eps:f64 /*= 0.000001*/, is_verbose:bool /*= false*/, avg_sample_size_per_centroid:si64 /*= 50*/, seed:si64 /*= (-1.0)*/) -> matrix<f64>, matrix<f64> { + if (is_verbose) { + print("BEGIN K-MEANS SCRIPT"); + } + + num_records = as.si64(nrow(X)); + num_features = as.si64(ncol(X)); + num_centroids = k; + num_runs = runs; + + if (is_verbose) { + print("dim X="); print(as.si64(nrow(X))); print("x"); print(as.si64(ncol(X))); + } + + sumXsq = sum(X ^ 2); + + if (is_verbose) { + print("Taking data samples for initialization..."); + } + + sample_maps, samples_vs_runs_map, sample_block_size = get_sample_maps(num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed); + + is_row_in_samples = sum(sample_maps, 0); + X_samples = sample_maps @ X; + X_samples_sq_norms = sum(X_samples ^ 2, 0); + + if (is_verbose) { + print("Initializing the centroids for all runs..."); + } + + All_Centroids = fill(as.f64(0), num_runs * num_centroids, num_features); + + min_distances = is_row_in_samples; + + for(i in 1:num_centroids) { + min_distances_matrix_form = reshape(min_distances, sample_block_size, num_runs); + cdf_min_distances = cumSum(min_distances_matrix_form); + + lseed = seed == (-1.0) ? (-1.0) : as.f64(seed + i); + random_row = rand(1, num_runs, 0.0, 1.0, 1.0, lseed); # this is not equal to: rand(rows=1, cols=num_runs, seed=lseed) + threshold_matrix = random_row * cdf_min_distances[sample_block_size - 1, ]; + centroid_ids = t(sum(cdf_min_distances < threshold_matrix, 1)) + 1; + + centroid_placer = ctable(seq(as.f64(1), num_runs, 1 <= num_runs ? 1 : -1), sample_block_size * seq(as.f64(0), num_runs - 1, 0 <= num_runs - 1 ? 1 : -1) + centroid_ids, num_runs, sample_block_size * num_runs); + centroids = centroid_placer @ X_samples; + + centroid_placer = ctable(seq(as.f64(i), num_centroids * (num_runs - 1) + i, num_centroids), seq(as.f64(1), num_runs, 1), as.si64(nrow(All_Centroids)), num_runs); + All_Centroids = as.matrix<f64>(All_Centroids + centroid_placer @ centroids); + + distances = X_samples_sq_norms + samples_vs_runs_map @ sum(centroids ^ 2, 0) - 2 * sum(X_samples * (samples_vs_runs_map @ centroids), 0); + min_distances = i == 1 ? as.matrix<f64>(is_row_in_samples * distances) : as.f64(min(min_distances, distances)); + } + + # Perform K-Means iterations for all runs: + + termination_code = fill(as.f64(0), num_runs, 1); + final_wcss = fill(as.f64(0), num_runs, 1); + num_iterations = fill(as.f64(0), num_runs, 1); + + if (is_verbose) { + print("Performing k-means iterations for all runs..."); + } + + C = [1.0]; + + for(run_index in 1:num_runs) { + C = All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = inf; + + while (term_code == 0) { + D = (-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0)); + minD = aggMin(D, 0); + wcss_old = wcss; + wcss = as.f64(sumXsq + sum(minD)); + + if (is_verbose) { + if (iter_count == 0) { + print("Run ", 0); print(run_index, 0); print(", At Start-Up: Centroid WCSS = ", 0); print(wcss); + } else { + print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Centroid WCSS = "); print(wcss); print("; Centroid change (avg.sq.dist.) = "); print((sum((C - C_old) ^ 2) / num_centroids)); + } + + } + + P = D <= minD; + P = as.matrix<f64>(P / sum(P, 0)); + P_denom = sum(P, 1); + C_new = (t(P) @ X) / t(P_denom); + + + iter_count = as.si64(iter_count + 1); + if (wcss_old - wcss < eps * wcss) { + term_code = as.si64(1); + } else { + if (iter_count >= max_iter) { + term_code = as.si64(2); + } else { + if (sum(P_denom <= 0) > 0) { + term_code = as.si64(3); + } else { + C_old = as.matrix<f64>(C); + C = as.matrix<f64>(C_new); + } + } + } + } + + if (is_verbose) { + print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Terminated with code = "); print(term_code); print(", Centroid WCSS = "); print(wcss); + } + + All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ] = C; + final_wcss[run_index - 1, 0] = [wcss]; + termination_code[run_index - 1, 0] = as.matrix<f64>(term_code); + num_iterations[run_index - 1, 0] = as.matrix<f64>(iter_count); + } + + # Select the run with the best Centroid-WCSS and output its centroids + + termination_bitmap = fill(as.f64(0), num_runs, 3); + termination_bitmap_raw = ctable(seq(as.f64(1), num_runs, 1), termination_code); + termination_bitmap[, 0:ncol(termination_bitmap_raw)] = termination_bitmap_raw; + termination_stats = sum(termination_bitmap, 1); + + if (is_verbose) { + print("Number of successful runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 0])))); + print("Number of incomplete runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 1])))); + print("Number of failed runs (with lost centroids) = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 2])))); + } + + num_successful_runs = as.scalar(as.f64(termination_stats[0, 0])); + + Y = [1.0]; + + if (num_successful_runs > 0) { + final_wcss_successful = replace(final_wcss, inf, 0) * termination_bitmap[, 0]; + worst_wcss = aggMax(final_wcss_successful); + best_wcss = aggMin(final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap[, 0])); + avg_wcss = sum(final_wcss_successful) / num_successful_runs; + best_index_vector = (final_wcss_successful == best_wcss); + aggr_best_index_vector = cumSum(best_index_vector); + best_index = as.si64(sum(aggr_best_index_vector == 0) + 1); + + if (is_verbose) { + print("Successful runs: Best run is "); print(best_index); print(" with Centroid WCSS = "); print(best_wcss); print("; Avg WCSS = "); print(avg_wcss); print("; Worst WCSS = "); print(worst_wcss); + } + + C = as.matrix<f64>(All_Centroids[(num_centroids*(best_index-1)+1) - 1:(num_centroids*best_index), ]); + D = as.matrix<f64>((-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0))); + P = as.matrix<f64>((D <= aggMin(D, 0))); + aggr_P = t(cumSum(t(P))); + Y = sum(aggr_P == 0, 0) + 1; + + if (is_verbose) { + print("dim C="); print(as.si64(nrow(C))); print("x"); print(as.si64(ncol(C))); print(", dim Y="); print(as.si64(nrow(Y))); print("x"); print(as.si64(ncol(Y))); + } + + } else { + print("K-means: No output is produced. Try increasing the number of iterations and/or lower eps."); + C = fill(as.f64(0), num_centroids, num_records); + Y = fill(as.f64((-1.0)), 1, num_records); + } + + return as.matrix<f64>(C), as.matrix<f64>(Y); +} + diff --git a/scripts/algorithms/test_kmeans.daph b/scripts/algorithms/test_kmeans.daph new file mode 100644 index 000000000..adfbcb585 --- /dev/null +++ b/scripts/algorithms/test_kmeans.daph @@ -0,0 +1,12 @@ +import "kmeans_final.daph"; + +X = rand(5, 100, -10.0, 10.0, 0.5, -1); +k = 2; +runs = 5; +max_iter = 5; +eps = 0.0001; +is_verbose = true; +avg_sample_size_per_centroid = 2; +seed = -1; + +kmeans_final.m_kmeans(as.matrix<f64>(X), k, runs, max_iter, eps, is_verbose, avg_sample_size_per_centroid, seed);