udpates

quadram-institute-bioscience · Sep 19, 2024 · 6d8cd87 · 6d8cd87
1 parent 294d2d1
commit 6d8cd87
Show file tree

Hide file tree

Showing 17 changed files with 129 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 test/local
 .vscode/
 bin/*
+test/sim/*.*
diff --git a/Makefile b/Makefile
@@ -5,35 +5,48 @@ LDFLAGS = -lz -lpthread
 SRC_DIR = src
 BIN_DIR = bin
 TEST_DIR = test
-TARGET  = $(SRC_DIR)/n50 
+TARGET  = $(BIN_DIR)/n50 
 SIMTARGET = $(BIN_DIR)/gen
-
+SIMDATA = test/sim/list.txt
 .PHONY: all clean test
 
 all: $(TARGET) $(SIMTARGET) $(TESTTARGET)
 
 #Make targets
 $(TARGET): $(SRC_DIR)/n50.c | $(BIN_DIR)
 	$(CC) $(CFLAGS) $< -o $@ $(LDFLAGS)
+
 $(TESTTARGET): $(SRC_DIR)/n50_opt.c | $(BIN_DIR)
 	$(CC)  $(CFLAGS) $< -o $@ $(LDFLAGS)
 
 $(SIMTARGET): $(SRC_DIR)/gen.c | $(BIN_DIR)
-	$(CC) $(CFLAGS) $< -o $@
+	$(CC) $(CFLAGS) $< -o $@  
 
 $(BIN_DIR):
 	mkdir -p $(BIN_DIR)
 
 clean:
 	rm -rf $(BIN_DIR)
-	if [ -d "test/sim" ]; then rm -f test/sim/*; fi
+	if [ -d "test/sim" ]; then echo "Removing sim"; rm -rf test/sim/; fi
 
+$(SIMDATA): $(SIMTARGET)
+	# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
+	mkdir -p test/sim; \
+	# ./program <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\
+	# small datasets \
+	$(SIMTARGET) 500        1000          5      2000000    5 fasta test/sim/; \
+	$(SIMTARGET) 500        1000          5        10000    5 fastq test/sim/; \
+	# large datasets
+	$(SIMTARGET) 5000     10000          1000      20000000    3 fasta test/sim/; \
+	$(SIMTARGET) 5000     10000          1000      20000000    3 fastq test/sim/; \
+	gzip -k test/sim/*.*; \
+	ls test/sim/*.* > $(SIMDATA)
 
 # Test rule
 test: $(TARGET) $(SIMTARGET)
 	@passed=0; failed=0; \
 	if [ -d "$(TEST_DIR)" ]; then \
-		echo "Running tests in $(TEST_DIR)"; \
+		echo "[1] Running tests in $(TEST_DIR)"; \
 		for file in $(TEST_DIR)/*.*; do \
 			filename=$$(basename "$$file"); \
 			expected_n50=$${filename%%.*}; \
@@ -49,8 +62,9 @@ test: $(TARGET) $(SIMTARGET)
 			fi; \
 		done; \
 	fi; \
+	mkdir -p test/sim; \
 	if [ -d "test/sim" ]; then \
-		echo "Generating simulated reads" \
+		echo "[2] Generating simulated reads" \
 		# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
 		$(SIMTARGET) 10 35 1 2000 10 fasta test/sim/; \
 		$(SIMTARGET) 5 15 1 1000 8 fastq test/sim/; \
@@ -94,6 +108,7 @@ test: $(TARGET) $(SIMTARGET)
 	else \
 		echo "test/sim directory does not exist. Skipping simulation tests."; \
 	fi; \
+	echo "Tested $(TARGET)"; \
 	echo "Tests completed. Passed: $$passed, Failed: $$failed"; \
 	if [ $$failed -ne 0 ]; then exit 1; fi
 
@@ -123,24 +138,23 @@ autotest: $(TARGET)
 	@echo "Simple test completed."
 
 
-benchmark: $(TARGET) $(SIMTARGET)
+benchmark: $(TARGET) $(SIMTARGET) $(SIMDATA)
+
 	if [ -d "test/sim" ]; then \
-		echo "Generating simulated reads" \
-		# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
-		# ./program <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\
-		$(SIMTARGET) 20        1000           1      2000000    2 fasta test/sim/; \
-		$(SIMTARGET) 10        1000         100        10000    1 fastq test/sim/; \
+		if [ ! -d "test/benchmark" ]; then mkdir -p test/benchmark; fi; \
+		if [ ! -d "test/local/" ]; then mkdir -p test/local/; fi; \
 		echo "Running simulation tests in test/sim"; \
 		for file in test/sim/*.*; do \
 			hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/$$(basename "$$file").csv" "$(TARGET) $$file" "seqfu stats $$file" "seqkit stats $$file"; \
 		done; \
 		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all.csv" "$(TARGET) test/sim/*.*" "seqfu stats test/sim/*.*" "seqkit stats test/sim/*.*"; \
 		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fasta.csv" "$(TARGET) test/sim/*.fasta" "seqfu stats test/sim/*.fasta" "seqkit stats test/sim/*.fasta"; \
 		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fastq.csv" "$(TARGET) test/sim/*.fastq" "seqfu stats test/sim/*.fastq" "seqkit stats test/sim/*.fastq"; \
-		gzip test/sim/*.*; \
-		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/sim/*.gz" "seqfu stats test/sim/*.gz" "seqkit stats test/sim/*.gz"; \
-		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/sim/*.fasta.gz" "seqfu stats test/sim/*.fasta.gz" "seqkit stats test/sim/*.fasta.gz"; \
-		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/sim/*.fastq.gz" "seqfu stats test/sim/*.fastq.gz" "seqkit stats test/sim/*.fastq.gz"; \
+		gzip -f test/sim/*.*; \
+		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/{sim,local}/*.gz" "seqfu stats test/{sim,local}/*.gz" "seqkit stats test/{sim,local}/*.gz"; \
+		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/{sim,local}/*.fasta.gz" "seqfu stats test/{sim,local}/*.fasta.gz" "seqkit stats test/{sim,local}/*.fasta.gz"; \
+		hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/{sim,local}/*.fastq.gz" "seqfu stats test/{sim,local}/*.fastq.gz" "seqkit stats test/{sim,local}/*.fastq.gz"; \
 	else \
 		echo "test/sim directory does not exist. Skipping simulation tests."; \
-	fi
+	fi
+
diff --git a/bin/n50 b/bin/n50
diff --git a/src/bench.sh b/src/bench.sh
@@ -1,5 +1,18 @@
 #!/bin/bash
 PARENT_DIR=$(dirname $(dirname $(readlink -f $0)))
+
+mkdir -p $PARENT_DIR/test/benchmark
+mkdir -p $PARENT_DIR/test/local
+mkdir -p $PARENT_DIR/test/sim
+
+for bin in seqfu seqkit ./bin/n50 ./bin/n50o; do
+    if ! command -v $bin &> /dev/null; then
+        echo "Error: $bin could not be found."
+        echo "Please make sure that the binary is compiled and in the bin/ directory."
+        exit 1
+    fi
+done
+
 set -euo pipefail
 # Let's if the subdirectories bin/  and test/ exist
 if [ ! -d "$PARENT_DIR/bin" ] || [ ! -d "$PARENT_DIR/test" ] || [ ! -d "$PARENT_DIR/test/local" ]; then

diff --git a/src/bench_comp.py b/src/bench_comp.py
@@ -29,6 +29,9 @@ def parse_csv(file_path: str) -> List[ToolData]:
         for row in reader:
             command, mean, stddev, median, user, system, min_time, max_time = row
             tool = command.split()[0]
+            # if tool is a path, extract the basename
+            if '/' in tool:
+                tool = tool.split('/')[-1]
             data.append(ToolData(
                 tool,
                 float(mean),

diff --git a/src/gent.c → src/gen-thread.c b/src/gent.c → src/gen-thread.c
@@ -8,16 +8,17 @@
 #define MAX_SEQ_NAME_LEN 256
 #define MAX_FILENAME_LEN 256
 #define MAX_THREADS 16
+#define CHUNK_SIZE 1000 // Number of sequences to process in each chunk
 
 // Thread-related structures
 typedef struct {
     char **sequences;
     int *lengths;
-    int start_index;
-    int end_index;
+    int num_seqs;
+    int *next_seq;
+    pthread_mutex_t *mutex;
 } ThreadData;
 
-
 // Function prototypes
 long long calculate_n50(const int *lengths, int num_seqs, long long *total_length);
 void *generate_sequences(void *arg);
@@ -27,8 +28,8 @@ void free_resources(char **sequences, int *lengths, int num_seqs);
 void write_sequences(char **sequences, int *lengths, int num_seqs, const char *outfile, const char *format);
 
 int main(int argc, char *argv[]) {
-    if (argc != 9) {
-        fprintf(stderr, "Usage: %s <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir> <num_threads>\n", argv[0]);
+    if (argc < 8) {
+        fprintf(stderr, "Usage: %s <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\n", argv[0]);
         return 1;
     }
 
@@ -39,7 +40,12 @@ int main(int argc, char *argv[]) {
     int tot_files = atoi(argv[5]);
     char *format = argv[6];
     char *outdir = argv[7];
-    int num_threads = atoi(argv[8]);
+    // if specified, the number of threads to use else 1
+    int num_threads = 1;
+    if (argc == 9) {
+        num_threads = atoi(argv[8]);
+    }
+
 
     // Input validation
     if (min_seqs <= 0 || max_seqs <= 0 || min_len <= 0 || max_len <= 0 || tot_files <= 0 || num_threads <= 0 || num_threads > MAX_THREADS) {
@@ -101,18 +107,20 @@ int main(int argc, char *argv[]) {
 
         // Set up threading for sequence generation
         pthread_t threads[num_threads];
-        ThreadData thread_data[num_threads];
-
-        int seqs_per_thread = num_seqs / num_threads;
-        int remaining_seqs = num_seqs % num_threads;
+        ThreadData thread_data;
+        thread_data.sequences = sequences;
+        thread_data.lengths = contig_lengths;
+        thread_data.num_seqs = num_seqs;
+
+        int next_seq = 0;
+        thread_data.next_seq = &next_seq;
+
+        pthread_mutex_t mutex;
+        pthread_mutex_init(&mutex, NULL);
+        thread_data.mutex = &mutex;
 
         for (int t = 0; t < num_threads; t++) {
-            thread_data[t].sequences = sequences;
-            thread_data[t].lengths = contig_lengths;
-            thread_data[t].start_index = t * seqs_per_thread + (t < remaining_seqs ? t : remaining_seqs);
-            thread_data[t].end_index = (t + 1) * seqs_per_thread + (t < remaining_seqs ? t + 1 : remaining_seqs);
-
-            if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data[t]) != 0) {
+            if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data) != 0) {
                 fprintf(stderr, "Failed to create thread %d\n", t);
                 free_resources(sequences, contig_lengths, num_seqs);
                 return 1;
@@ -124,6 +132,8 @@ int main(int argc, char *argv[]) {
             pthread_join(threads[t], NULL);
         }
 
+        pthread_mutex_destroy(&mutex);
+
         // Write sequences to file (single-threaded)
         write_sequences(sequences, contig_lengths, num_seqs, outfile, format);
 
@@ -134,7 +144,6 @@ int main(int argc, char *argv[]) {
     return 0;
 }
 
-
 void gen_ctg_len(int min_len, int max_len, int num_seqs, int *lengths) {
     // calculate the even distance between num_seqs sequences spanning from min_len to max_len
     long long STEP = (max_len - min_len) / num_seqs;
@@ -208,11 +217,26 @@ int *generate_contigs(int N50, int SUM_LEN, int TOT_SEQS) {
     return contig_list;
 }
 
+
 void *generate_sequences(void *arg) {
     ThreadData *data = (ThreadData *)arg;
-    for (int i = data->start_index; i < data->end_index; i++) {
-        generate_sequence(data->lengths[i], data->sequences[i]);
+    int start, end;
+
+    while (1) {
+        pthread_mutex_lock(data->mutex);
+        start = *data->next_seq;
+        end = start + CHUNK_SIZE;
+        if (end > data->num_seqs) end = data->num_seqs;
+        *data->next_seq = end;
+        pthread_mutex_unlock(data->mutex);
+
+        if (start >= data->num_seqs) break;
+
+        for (int i = start; i < end; i++) {
+            generate_sequence(data->lengths[i], data->sequences[i]);
+        }
     }
+
     return NULL;
 }
 

diff --git a/src/gen.c b/src/gen.c
@@ -1,8 +1,10 @@
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <ctype.h>
+
 /*
  * DNA Sequence Generator
  * Andrea Telatin 2023, (C) Quadram Institute Bioscience
@@ -22,6 +24,7 @@
  * This tool is useful for testing sequence analysis software, benchmarking
  * bioinformatics pipelines, and generating sample data for educational purposes.
  */
+#define BUFFER_SIZE 1048576 // 1 MB buffer
 #define MAX_SEQ_NAME_LEN 256
 #define MAX_FILENAME_LEN 256
 
@@ -81,9 +84,9 @@ int main(int argc, char *argv[]) {
             fprintf(stderr, "Memory allocation failed for contig_lengths\n");
             return 1;
         }
-        fprintf(stderr, "%d file [%d num seqs]\n", i, num_seqs);
+        fprintf(stderr, "%d/%d %s file [%d num seqs]: ", i, tot_files, format, num_seqs);
         gen_ctg_len(min_len, max_len, num_seqs, contig_lengths);
-
+        fprintf(stderr, "OK\n");
         long long total_length; // Change to long long
         int N50 = calculate_n50(contig_lengths, num_seqs, &total_length);
         fprintf(stderr, "\tTotal length: %lld\n", total_length); // Use %lld for long long
@@ -113,7 +116,7 @@ int main(int argc, char *argv[]) {
         } else {
             write_fastq(sequences, contig_lengths, num_seqs, outfile);
         }
-        fprintf(stderr, "  [Done: %s]\n", outfile);
+        fprintf(stderr, "  [Written to %s]\n", outfile);
         free_resources(sequences, contig_lengths, num_seqs);
     }
 
@@ -203,6 +206,15 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi
         return;
     }
 
+    char *buffer = malloc(BUFFER_SIZE);
+    if (!buffer) {
+        fprintf(stderr, "Unable to allocate buffer for writing\n");
+        fclose(f);
+        return;
+    }
+
+    setvbuf(f, buffer, _IOFBF, BUFFER_SIZE);
+
     for (int i = 0; i < num_seqs; i++) {
         fprintf(f, ">seq%d\n", i + 1);
         for (int j = 0; j < lengths[i]; j += 60) {
@@ -211,7 +223,9 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi
         }
     }
 
+    fflush(f);
     fclose(f);
+    free(buffer);
 }
 
 void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfile) {
@@ -221,6 +235,15 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi
         return;
     }
 
+    char *buffer = malloc(BUFFER_SIZE);
+    if (!buffer) {
+        fprintf(stderr, "Unable to allocate buffer for writing\n");
+        fclose(f);
+        return;
+    }
+
+    setvbuf(f, buffer, _IOFBF, BUFFER_SIZE);
+
     for (int i = 0; i < num_seqs; i++) {
         fprintf(f, "@seq%d\n%s\n+\n", i + 1, sequences[i]);
         for (int j = 0; j < lengths[i]; j++) {
@@ -229,7 +252,9 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi
         fputc('\n', f);
     }
 
+    fflush(f);
     fclose(f);
+    free(buffer);
 }
 
 void free_resources(char **sequences, int *lengths, int num_seqs) {

diff --git a/test/benchmark/*.*.csv b/test/benchmark/*.*.csv
diff --git a/test/benchmark/1396484_902_903600707.fasta.csv b/test/benchmark/1396484_902_903600707.fasta.csv
@@ -0,0 +1,2 @@
+command,mean,stddev,median,user,system,min,max
+bin/n50  test/sim/1396484_902_903600707.fasta,1.6391795357799999,0.020583813470225212,1.63523198198,1.5063611,0.11844614,1.61149171148,1.6722827934800002
diff --git a/test/benchmark/1454777_575_591143151.fasta.csv b/test/benchmark/1454777_575_591143151.fasta.csv
@@ -0,0 +1,4 @@
+command,mean,stddev,median,user,system,min,max
+src/n50  test/sim/1454777_575_591143151.fasta,1.06535981002,0.01887172614604079,1.0614336600199998,0.9829298200000001,0.07575556,1.04674813952,1.11139147252
+seqfu stats test/sim/1454777_575_591143151.fasta,0.3583592433200001,0.005983688320461325,0.35700688952000004,0.28345232,0.06740586,0.35192797252,0.37087238852000004
+seqkit stats test/sim/1454777_575_591143151.fasta,0.39314732252000006,0.006703037820939213,0.39226030602000006,0.32644592000000006,0.06484776,0.38318080552000006,0.40773626352000003
diff --git a/test/benchmark/6917_312_1496654.fastq.csv b/test/benchmark/6917_312_1496654.fastq.csv
@@ -0,0 +1,4 @@
+command,mean,stddev,median,user,system,min,max
+src/n50  test/sim/6917_312_1496654.fastq,0.008528284129999994,0.0006431936679455821,0.008399792479999994,0.003592820000000002,0.0029625599999999986,0.007366979979999995,0.009651687979999995
+seqfu stats test/sim/6917_312_1496654.fastq,0.013984221579999997,0.0010317730767312214,0.013940396479999995,0.004972670000000002,0.004072159999999999,0.011888604979999997,0.015608520979999997
+seqkit stats test/sim/6917_312_1496654.fastq,0.033020338329999996,0.0027281133626468773,0.03256454247999999,0.021750320000000004,0.011906559999999998,0.029904187979999997,0.041659812979999995
diff --git a/test/benchmark/all.csv b/test/benchmark/all.csv
diff --git a/test/benchmark/all_fasta.csv b/test/benchmark/all_fasta.csv
diff --git a/test/benchmark/all_fastq.csv b/test/benchmark/all_fastq.csv
diff --git a/test/benchmark/gz_all.csv b/test/benchmark/gz_all.csv
diff --git a/test/benchmark/gz_all_fasta.csv b/test/benchmark/gz_all_fasta.csv
diff --git a/test/benchmark/gz_all_fastq.csv b/test/benchmark/gz_all_fastq.csv
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		command,mean,stddev,median,user,system,min,max
		bin/n50 test/sim/1396484_902_903600707.fasta,1.6391795357799999,0.020583813470225212,1.63523198198,1.5063611,0.11844614,1.61149171148,1.6722827934800002