Skip to content

Commit

Permalink
added kmeans.daph and testfile
Browse files Browse the repository at this point in the history
  • Loading branch information
Garic152 committed Oct 24, 2024
1 parent 649b2da commit 6991e6c
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 0 deletions.
241 changes: 241 additions & 0 deletions scripts/algorithms/kmeans_final.daph
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Builtin function that implements the k-Means clustering algorithm
#
# INPUT:
# ---------------------------------------------------------------------------------------
# X The input Matrix to do KMeans on.
# k Number of centroids
# runs Number of runs (with different initial centroids)
# max_iter Maximum number of iterations per run
# eps Tolerance (epsilon) for WCSS change ratio
# is_verbose do not print per-iteration stats
# avg_sample_size_per_centroid Average number of records per centroid in data samples
# seed The seed used for initial sampling. If set to -1
# random seeds are selected.
# ---------------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------
# Y The mapping of records to centroids
# C The output matrix with the centroids
# ---------------------------------------------------------------

def get_sample_maps(num_records:si64, num_samples:si64, approx_sample_size:si64, seed:si64) -> matrix<f64>, matrix<f64>, si64 {
sample_col_map = [0.0];
if (approx_sample_size < num_records) {
sample_block_size = as.si64(approx_sample_size + round(10 * sqrt(approx_sample_size)));
num_rows = sample_block_size * num_samples;
sample_rec_ids = rand(sample_block_size, num_samples, 0.0, 1.0, 0.5, seed);
sample_rec_ids = round(log(sample_rec_ids, 2) / log(1.0 - as.f64(approx_sample_size) / as.f64(num_records), 2) + 0.5);
sample_rec_ids = cumSum(sample_rec_ids);
is_sample_rec_id_within_range = (sample_rec_ids <= num_records);
sample_rec_ids = as.f64(sample_rec_ids * is_sample_rec_id_within_range + (num_records + 1) * (1 - is_sample_rec_id_within_range));
sample_rec_ids = reshape(as.f64(sample_rec_ids), num_rows, 1);
is_row_in_samples = reshape(as.f64(is_sample_rec_id_within_range), num_rows, 1);
sample_maps = ctable(seq(as.f64(1), num_rows, 1 <= num_rows ? 1 : -1), sample_rec_ids, num_rows, num_records);
sample_positions = (num_rows + 1) - as.f64(is_row_in_samples) * seq(as.f64(num_rows), 1, (-1.0));
col_positions = round(0.5 + seq(as.f64(0), num_rows - 1, 1) / sample_block_size);
sample_col_map = ctable(sample_positions, col_positions - 1);
sample_col_map = as.matrix<f64>(sample_col_map[0:(num_rows), ]);
return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size;
}
one_per_record = fill(as.f64(1), num_records, 1);
sample_block_size = as.si64(num_records);
sample_maps = fill(as.f64(0), (num_records * num_samples), num_records);
sample_col_map = fill(as.f64(0), (num_records * num_samples), num_samples);
for(i in 1:num_samples) {
sample_maps[(num_records*(i-1)+1) - 1:(num_records*i), ] = diagMatrix(one_per_record);
sample_col_map[(num_records*(i-1)+1) - 1:(num_records*i), i - 1] = one_per_record;
}
return as.matrix<f64>(sample_maps), as.matrix<f64>(sample_col_map), sample_block_size;
}

def m_kmeans(X:matrix<f64>, k:si64 /*= 10*/, runs:si64 /*= 10*/, max_iter:si64 /*= 1000*/, eps:f64 /*= 0.000001*/, is_verbose:bool /*= false*/, avg_sample_size_per_centroid:si64 /*= 50*/, seed:si64 /*= (-1.0)*/) -> matrix<f64>, matrix<f64> {
if (is_verbose) {
print("BEGIN K-MEANS SCRIPT");
}

num_records = as.si64(nrow(X));
num_features = as.si64(ncol(X));
num_centroids = k;
num_runs = runs;

if (is_verbose) {
print("dim X="); print(as.si64(nrow(X))); print("x"); print(as.si64(ncol(X)));
}

sumXsq = sum(X ^ 2);

if (is_verbose) {
print("Taking data samples for initialization...");
}

sample_maps, samples_vs_runs_map, sample_block_size = get_sample_maps(num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed);

is_row_in_samples = sum(sample_maps, 0);
X_samples = sample_maps @ X;
X_samples_sq_norms = sum(X_samples ^ 2, 0);

if (is_verbose) {
print("Initializing the centroids for all runs...");
}

All_Centroids = fill(as.f64(0), num_runs * num_centroids, num_features);

min_distances = is_row_in_samples;

for(i in 1:num_centroids) {
min_distances_matrix_form = reshape(min_distances, sample_block_size, num_runs);
cdf_min_distances = cumSum(min_distances_matrix_form);

lseed = seed == (-1.0) ? (-1.0) : as.f64(seed + i);
random_row = rand(1, num_runs, 0.0, 1.0, 1.0, lseed); # this is not equal to: rand(rows=1, cols=num_runs, seed=lseed)
threshold_matrix = random_row * cdf_min_distances[sample_block_size - 1, ];
centroid_ids = t(sum(cdf_min_distances < threshold_matrix, 1)) + 1;

centroid_placer = ctable(seq(as.f64(1), num_runs, 1 <= num_runs ? 1 : -1), sample_block_size * seq(as.f64(0), num_runs - 1, 0 <= num_runs - 1 ? 1 : -1) + centroid_ids, num_runs, sample_block_size * num_runs);
centroids = centroid_placer @ X_samples;

centroid_placer = ctable(seq(as.f64(i), num_centroids * (num_runs - 1) + i, num_centroids), seq(as.f64(1), num_runs, 1), as.si64(nrow(All_Centroids)), num_runs);
All_Centroids = as.matrix<f64>(All_Centroids + centroid_placer @ centroids);

distances = X_samples_sq_norms + samples_vs_runs_map @ sum(centroids ^ 2, 0) - 2 * sum(X_samples * (samples_vs_runs_map @ centroids), 0);
min_distances = i == 1 ? as.matrix<f64>(is_row_in_samples * distances) : as.f64(min(min_distances, distances));
}

# Perform K-Means iterations for all runs:

termination_code = fill(as.f64(0), num_runs, 1);
final_wcss = fill(as.f64(0), num_runs, 1);
num_iterations = fill(as.f64(0), num_runs, 1);

if (is_verbose) {
print("Performing k-means iterations for all runs...");
}

C = [1.0];

for(run_index in 1:num_runs) {
C = All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ];
C_old = C;
iter_count = 0;
term_code = 0;
wcss = inf;

while (term_code == 0) {
D = (-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0));
minD = aggMin(D, 0);
wcss_old = wcss;
wcss = as.f64(sumXsq + sum(minD));

if (is_verbose) {
if (iter_count == 0) {
print("Run ", 0); print(run_index, 0); print(", At Start-Up: Centroid WCSS = ", 0); print(wcss);
} else {
print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Centroid WCSS = "); print(wcss); print("; Centroid change (avg.sq.dist.) = "); print((sum((C - C_old) ^ 2) / num_centroids));
}

}

P = D <= minD;
P = as.matrix<f64>(P / sum(P, 0));
P_denom = sum(P, 1);
C_new = (t(P) @ X) / t(P_denom);


iter_count = as.si64(iter_count + 1);
if (wcss_old - wcss < eps * wcss) {
term_code = as.si64(1);
} else {
if (iter_count >= max_iter) {
term_code = as.si64(2);
} else {
if (sum(P_denom <= 0) > 0) {
term_code = as.si64(3);
} else {
C_old = as.matrix<f64>(C);
C = as.matrix<f64>(C_new);
}
}
}
}

if (is_verbose) {
print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Terminated with code = "); print(term_code); print(", Centroid WCSS = "); print(wcss);
}

All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ] = C;
final_wcss[run_index - 1, 0] = [wcss];
termination_code[run_index - 1, 0] = as.matrix<f64>(term_code);
num_iterations[run_index - 1, 0] = as.matrix<f64>(iter_count);
}

# Select the run with the best Centroid-WCSS and output its centroids

termination_bitmap = fill(as.f64(0), num_runs, 3);
termination_bitmap_raw = ctable(seq(as.f64(1), num_runs, 1), termination_code);
termination_bitmap[, 0:ncol(termination_bitmap_raw)] = termination_bitmap_raw;
termination_stats = sum(termination_bitmap, 1);

if (is_verbose) {
print("Number of successful runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 0]))));
print("Number of incomplete runs = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 1]))));
print("Number of failed runs (with lost centroids) = "); print(as.si64(as.scalar(as.f64(termination_stats[0, 2]))));
}

num_successful_runs = as.scalar(as.f64(termination_stats[0, 0]));

Y = [1.0];

if (num_successful_runs > 0) {
final_wcss_successful = replace(final_wcss, inf, 0) * termination_bitmap[, 0];
worst_wcss = aggMax(final_wcss_successful);
best_wcss = aggMin(final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap[, 0]));
avg_wcss = sum(final_wcss_successful) / num_successful_runs;
best_index_vector = (final_wcss_successful == best_wcss);
aggr_best_index_vector = cumSum(best_index_vector);
best_index = as.si64(sum(aggr_best_index_vector == 0) + 1);

if (is_verbose) {
print("Successful runs: Best run is "); print(best_index); print(" with Centroid WCSS = "); print(best_wcss); print("; Avg WCSS = "); print(avg_wcss); print("; Worst WCSS = "); print(worst_wcss);
}

C = as.matrix<f64>(All_Centroids[(num_centroids*(best_index-1)+1) - 1:(num_centroids*best_index), ]);
D = as.matrix<f64>((-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0)));
P = as.matrix<f64>((D <= aggMin(D, 0)));
aggr_P = t(cumSum(t(P)));
Y = sum(aggr_P == 0, 0) + 1;

if (is_verbose) {
print("dim C="); print(as.si64(nrow(C))); print("x"); print(as.si64(ncol(C))); print(", dim Y="); print(as.si64(nrow(Y))); print("x"); print(as.si64(ncol(Y)));
}

} else {
print("K-means: No output is produced. Try increasing the number of iterations and/or lower eps.");
C = fill(as.f64(0), num_centroids, num_records);
Y = fill(as.f64((-1.0)), 1, num_records);
}

return as.matrix<f64>(C), as.matrix<f64>(Y);
}

12 changes: 12 additions & 0 deletions scripts/algorithms/test_kmeans.daph
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import "kmeans_final.daph";

X = rand(5, 100, -10.0, 10.0, 0.5, -1);
k = 2;
runs = 5;
max_iter = 5;
eps = 0.0001;
is_verbose = true;
avg_sample_size_per_centroid = 2;
seed = -1;

kmeans_final.m_kmeans(as.matrix<f64>(X), k, runs, max_iter, eps, is_verbose, avg_sample_size_per_centroid, seed);

0 comments on commit 6991e6c

Please sign in to comment.