diff --git a/.gitignore b/.gitignore index 7be0234..5d5f075 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ test/local .vscode/ bin/* +test/sim/*.* diff --git a/Makefile b/Makefile index e475cba..04ba70e 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,9 @@ LDFLAGS = -lz -lpthread SRC_DIR = src BIN_DIR = bin TEST_DIR = test -TARGET = $(SRC_DIR)/n50 +TARGET = $(BIN_DIR)/n50 SIMTARGET = $(BIN_DIR)/gen - +SIMDATA = test/sim/list.txt .PHONY: all clean test all: $(TARGET) $(SIMTARGET) $(TESTTARGET) @@ -15,25 +15,38 @@ all: $(TARGET) $(SIMTARGET) $(TESTTARGET) #Make targets $(TARGET): $(SRC_DIR)/n50.c | $(BIN_DIR) $(CC) $(CFLAGS) $< -o $@ $(LDFLAGS) + $(TESTTARGET): $(SRC_DIR)/n50_opt.c | $(BIN_DIR) $(CC) $(CFLAGS) $< -o $@ $(LDFLAGS) $(SIMTARGET): $(SRC_DIR)/gen.c | $(BIN_DIR) - $(CC) $(CFLAGS) $< -o $@ + $(CC) $(CFLAGS) $< -o $@ $(BIN_DIR): mkdir -p $(BIN_DIR) clean: rm -rf $(BIN_DIR) - if [ -d "test/sim" ]; then rm -f test/sim/*; fi + if [ -d "test/sim" ]; then echo "Removing sim"; rm -rf test/sim/; fi +$(SIMDATA): $(SIMTARGET) + # Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \ + mkdir -p test/sim; \ + # ./program \ + # small datasets \ + $(SIMTARGET) 500 1000 5 2000000 5 fasta test/sim/; \ + $(SIMTARGET) 500 1000 5 10000 5 fastq test/sim/; \ + # large datasets + $(SIMTARGET) 5000 10000 1000 20000000 3 fasta test/sim/; \ + $(SIMTARGET) 5000 10000 1000 20000000 3 fastq test/sim/; \ + gzip -k test/sim/*.*; \ + ls test/sim/*.* > $(SIMDATA) # Test rule test: $(TARGET) $(SIMTARGET) @passed=0; failed=0; \ if [ -d "$(TEST_DIR)" ]; then \ - echo "Running tests in $(TEST_DIR)"; \ + echo "[1] Running tests in $(TEST_DIR)"; \ for file in $(TEST_DIR)/*.*; do \ filename=$$(basename "$$file"); \ expected_n50=$${filename%%.*}; \ @@ -49,8 +62,9 @@ test: $(TARGET) $(SIMTARGET) fi; \ done; \ fi; \ + mkdir -p test/sim; \ if [ -d "test/sim" ]; then \ - echo "Generating simulated reads" \ + echo "[2] Generating simulated reads" \ # Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \ $(SIMTARGET) 10 35 1 2000 10 fasta test/sim/; \ $(SIMTARGET) 5 15 1 1000 8 fastq test/sim/; \ @@ -94,6 +108,7 @@ test: $(TARGET) $(SIMTARGET) else \ echo "test/sim directory does not exist. Skipping simulation tests."; \ fi; \ + echo "Tested $(TARGET)"; \ echo "Tests completed. Passed: $$passed, Failed: $$failed"; \ if [ $$failed -ne 0 ]; then exit 1; fi @@ -123,13 +138,11 @@ autotest: $(TARGET) @echo "Simple test completed." -benchmark: $(TARGET) $(SIMTARGET) +benchmark: $(TARGET) $(SIMTARGET) $(SIMDATA) + if [ -d "test/sim" ]; then \ - echo "Generating simulated reads" \ - # Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \ - # ./program \ - $(SIMTARGET) 20 1000 1 2000000 2 fasta test/sim/; \ - $(SIMTARGET) 10 1000 100 10000 1 fastq test/sim/; \ + if [ ! -d "test/benchmark" ]; then mkdir -p test/benchmark; fi; \ + if [ ! -d "test/local/" ]; then mkdir -p test/local/; fi; \ echo "Running simulation tests in test/sim"; \ for file in test/sim/*.*; do \ hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/$$(basename "$$file").csv" "$(TARGET) $$file" "seqfu stats $$file" "seqkit stats $$file"; \ @@ -137,10 +150,11 @@ benchmark: $(TARGET) $(SIMTARGET) hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all.csv" "$(TARGET) test/sim/*.*" "seqfu stats test/sim/*.*" "seqkit stats test/sim/*.*"; \ hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fasta.csv" "$(TARGET) test/sim/*.fasta" "seqfu stats test/sim/*.fasta" "seqkit stats test/sim/*.fasta"; \ hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fastq.csv" "$(TARGET) test/sim/*.fastq" "seqfu stats test/sim/*.fastq" "seqkit stats test/sim/*.fastq"; \ - gzip test/sim/*.*; \ - hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/sim/*.gz" "seqfu stats test/sim/*.gz" "seqkit stats test/sim/*.gz"; \ - hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/sim/*.fasta.gz" "seqfu stats test/sim/*.fasta.gz" "seqkit stats test/sim/*.fasta.gz"; \ - hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/sim/*.fastq.gz" "seqfu stats test/sim/*.fastq.gz" "seqkit stats test/sim/*.fastq.gz"; \ + gzip -f test/sim/*.*; \ + hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/{sim,local}/*.gz" "seqfu stats test/{sim,local}/*.gz" "seqkit stats test/{sim,local}/*.gz"; \ + hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/{sim,local}/*.fasta.gz" "seqfu stats test/{sim,local}/*.fasta.gz" "seqkit stats test/{sim,local}/*.fasta.gz"; \ + hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/{sim,local}/*.fastq.gz" "seqfu stats test/{sim,local}/*.fastq.gz" "seqkit stats test/{sim,local}/*.fastq.gz"; \ else \ echo "test/sim directory does not exist. Skipping simulation tests."; \ - fi \ No newline at end of file + fi + \ No newline at end of file diff --git a/bin/n50 b/bin/n50 index 4354e55..4fdde53 100755 Binary files a/bin/n50 and b/bin/n50 differ diff --git a/src/bench.sh b/src/bench.sh index 5979e8f..368bc7e 100644 --- a/src/bench.sh +++ b/src/bench.sh @@ -1,5 +1,18 @@ #!/bin/bash PARENT_DIR=$(dirname $(dirname $(readlink -f $0))) + +mkdir -p $PARENT_DIR/test/benchmark +mkdir -p $PARENT_DIR/test/local +mkdir -p $PARENT_DIR/test/sim + +for bin in seqfu seqkit ./bin/n50 ./bin/n50o; do + if ! command -v $bin &> /dev/null; then + echo "Error: $bin could not be found." + echo "Please make sure that the binary is compiled and in the bin/ directory." + exit 1 + fi +done + set -euo pipefail # Let's if the subdirectories bin/ and test/ exist if [ ! -d "$PARENT_DIR/bin" ] || [ ! -d "$PARENT_DIR/test" ] || [ ! -d "$PARENT_DIR/test/local" ]; then diff --git a/src/bench_comp.py b/src/bench_comp.py index e3751ef..319b063 100755 --- a/src/bench_comp.py +++ b/src/bench_comp.py @@ -29,6 +29,9 @@ def parse_csv(file_path: str) -> List[ToolData]: for row in reader: command, mean, stddev, median, user, system, min_time, max_time = row tool = command.split()[0] + # if tool is a path, extract the basename + if '/' in tool: + tool = tool.split('/')[-1] data.append(ToolData( tool, float(mean), diff --git a/src/gent.c b/src/gen-thread.c similarity index 87% rename from src/gent.c rename to src/gen-thread.c index a0d71f1..acddc61 100644 --- a/src/gent.c +++ b/src/gen-thread.c @@ -8,16 +8,17 @@ #define MAX_SEQ_NAME_LEN 256 #define MAX_FILENAME_LEN 256 #define MAX_THREADS 16 +#define CHUNK_SIZE 1000 // Number of sequences to process in each chunk // Thread-related structures typedef struct { char **sequences; int *lengths; - int start_index; - int end_index; + int num_seqs; + int *next_seq; + pthread_mutex_t *mutex; } ThreadData; - // Function prototypes long long calculate_n50(const int *lengths, int num_seqs, long long *total_length); void *generate_sequences(void *arg); @@ -27,8 +28,8 @@ void free_resources(char **sequences, int *lengths, int num_seqs); void write_sequences(char **sequences, int *lengths, int num_seqs, const char *outfile, const char *format); int main(int argc, char *argv[]) { - if (argc != 9) { - fprintf(stderr, "Usage: %s \n", argv[0]); + if (argc < 8) { + fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } @@ -39,7 +40,12 @@ int main(int argc, char *argv[]) { int tot_files = atoi(argv[5]); char *format = argv[6]; char *outdir = argv[7]; - int num_threads = atoi(argv[8]); + // if specified, the number of threads to use else 1 + int num_threads = 1; + if (argc == 9) { + num_threads = atoi(argv[8]); + } + // Input validation if (min_seqs <= 0 || max_seqs <= 0 || min_len <= 0 || max_len <= 0 || tot_files <= 0 || num_threads <= 0 || num_threads > MAX_THREADS) { @@ -101,18 +107,20 @@ int main(int argc, char *argv[]) { // Set up threading for sequence generation pthread_t threads[num_threads]; - ThreadData thread_data[num_threads]; - - int seqs_per_thread = num_seqs / num_threads; - int remaining_seqs = num_seqs % num_threads; + ThreadData thread_data; + thread_data.sequences = sequences; + thread_data.lengths = contig_lengths; + thread_data.num_seqs = num_seqs; + + int next_seq = 0; + thread_data.next_seq = &next_seq; + + pthread_mutex_t mutex; + pthread_mutex_init(&mutex, NULL); + thread_data.mutex = &mutex; for (int t = 0; t < num_threads; t++) { - thread_data[t].sequences = sequences; - thread_data[t].lengths = contig_lengths; - thread_data[t].start_index = t * seqs_per_thread + (t < remaining_seqs ? t : remaining_seqs); - thread_data[t].end_index = (t + 1) * seqs_per_thread + (t < remaining_seqs ? t + 1 : remaining_seqs); - - if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data[t]) != 0) { + if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data) != 0) { fprintf(stderr, "Failed to create thread %d\n", t); free_resources(sequences, contig_lengths, num_seqs); return 1; @@ -124,6 +132,8 @@ int main(int argc, char *argv[]) { pthread_join(threads[t], NULL); } + pthread_mutex_destroy(&mutex); + // Write sequences to file (single-threaded) write_sequences(sequences, contig_lengths, num_seqs, outfile, format); @@ -134,7 +144,6 @@ int main(int argc, char *argv[]) { return 0; } - void gen_ctg_len(int min_len, int max_len, int num_seqs, int *lengths) { // calculate the even distance between num_seqs sequences spanning from min_len to max_len long long STEP = (max_len - min_len) / num_seqs; @@ -208,11 +217,26 @@ int *generate_contigs(int N50, int SUM_LEN, int TOT_SEQS) { return contig_list; } + void *generate_sequences(void *arg) { ThreadData *data = (ThreadData *)arg; - for (int i = data->start_index; i < data->end_index; i++) { - generate_sequence(data->lengths[i], data->sequences[i]); + int start, end; + + while (1) { + pthread_mutex_lock(data->mutex); + start = *data->next_seq; + end = start + CHUNK_SIZE; + if (end > data->num_seqs) end = data->num_seqs; + *data->next_seq = end; + pthread_mutex_unlock(data->mutex); + + if (start >= data->num_seqs) break; + + for (int i = start; i < end; i++) { + generate_sequence(data->lengths[i], data->sequences[i]); + } } + return NULL; } diff --git a/src/gen.c b/src/gen.c index e3b1f24..5aad5f2 100644 --- a/src/gen.c +++ b/src/gen.c @@ -1,8 +1,10 @@ + #include #include #include #include #include + /* * DNA Sequence Generator * Andrea Telatin 2023, (C) Quadram Institute Bioscience @@ -22,6 +24,7 @@ * This tool is useful for testing sequence analysis software, benchmarking * bioinformatics pipelines, and generating sample data for educational purposes. */ +#define BUFFER_SIZE 1048576 // 1 MB buffer #define MAX_SEQ_NAME_LEN 256 #define MAX_FILENAME_LEN 256 @@ -81,9 +84,9 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Memory allocation failed for contig_lengths\n"); return 1; } - fprintf(stderr, "%d file [%d num seqs]\n", i, num_seqs); + fprintf(stderr, "%d/%d %s file [%d num seqs]: ", i, tot_files, format, num_seqs); gen_ctg_len(min_len, max_len, num_seqs, contig_lengths); - + fprintf(stderr, "OK\n"); long long total_length; // Change to long long int N50 = calculate_n50(contig_lengths, num_seqs, &total_length); fprintf(stderr, "\tTotal length: %lld\n", total_length); // Use %lld for long long @@ -113,7 +116,7 @@ int main(int argc, char *argv[]) { } else { write_fastq(sequences, contig_lengths, num_seqs, outfile); } - fprintf(stderr, " [Done: %s]\n", outfile); + fprintf(stderr, " [Written to %s]\n", outfile); free_resources(sequences, contig_lengths, num_seqs); } @@ -203,6 +206,15 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi return; } + char *buffer = malloc(BUFFER_SIZE); + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer for writing\n"); + fclose(f); + return; + } + + setvbuf(f, buffer, _IOFBF, BUFFER_SIZE); + for (int i = 0; i < num_seqs; i++) { fprintf(f, ">seq%d\n", i + 1); for (int j = 0; j < lengths[i]; j += 60) { @@ -211,7 +223,9 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi } } + fflush(f); fclose(f); + free(buffer); } void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfile) { @@ -221,6 +235,15 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi return; } + char *buffer = malloc(BUFFER_SIZE); + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer for writing\n"); + fclose(f); + return; + } + + setvbuf(f, buffer, _IOFBF, BUFFER_SIZE); + for (int i = 0; i < num_seqs; i++) { fprintf(f, "@seq%d\n%s\n+\n", i + 1, sequences[i]); for (int j = 0; j < lengths[i]; j++) { @@ -229,7 +252,9 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi fputc('\n', f); } + fflush(f); fclose(f); + free(buffer); } void free_resources(char **sequences, int *lengths, int num_seqs) { diff --git a/test/benchmark/*.*.csv b/test/benchmark/*.*.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/1396484_902_903600707.fasta.csv b/test/benchmark/1396484_902_903600707.fasta.csv new file mode 100644 index 0000000..0a3764b --- /dev/null +++ b/test/benchmark/1396484_902_903600707.fasta.csv @@ -0,0 +1,2 @@ +command,mean,stddev,median,user,system,min,max +bin/n50 test/sim/1396484_902_903600707.fasta,1.6391795357799999,0.020583813470225212,1.63523198198,1.5063611,0.11844614,1.61149171148,1.6722827934800002 diff --git a/test/benchmark/1454777_575_591143151.fasta.csv b/test/benchmark/1454777_575_591143151.fasta.csv new file mode 100644 index 0000000..0c084bd --- /dev/null +++ b/test/benchmark/1454777_575_591143151.fasta.csv @@ -0,0 +1,4 @@ +command,mean,stddev,median,user,system,min,max +src/n50 test/sim/1454777_575_591143151.fasta,1.06535981002,0.01887172614604079,1.0614336600199998,0.9829298200000001,0.07575556,1.04674813952,1.11139147252 +seqfu stats test/sim/1454777_575_591143151.fasta,0.3583592433200001,0.005983688320461325,0.35700688952000004,0.28345232,0.06740586,0.35192797252,0.37087238852000004 +seqkit stats test/sim/1454777_575_591143151.fasta,0.39314732252000006,0.006703037820939213,0.39226030602000006,0.32644592000000006,0.06484776,0.38318080552000006,0.40773626352000003 diff --git a/test/benchmark/6917_312_1496654.fastq.csv b/test/benchmark/6917_312_1496654.fastq.csv new file mode 100644 index 0000000..ba26e24 --- /dev/null +++ b/test/benchmark/6917_312_1496654.fastq.csv @@ -0,0 +1,4 @@ +command,mean,stddev,median,user,system,min,max +src/n50 test/sim/6917_312_1496654.fastq,0.008528284129999994,0.0006431936679455821,0.008399792479999994,0.003592820000000002,0.0029625599999999986,0.007366979979999995,0.009651687979999995 +seqfu stats test/sim/6917_312_1496654.fastq,0.013984221579999997,0.0010317730767312214,0.013940396479999995,0.004972670000000002,0.004072159999999999,0.011888604979999997,0.015608520979999997 +seqkit stats test/sim/6917_312_1496654.fastq,0.033020338329999996,0.0027281133626468773,0.03256454247999999,0.021750320000000004,0.011906559999999998,0.029904187979999997,0.041659812979999995 diff --git a/test/benchmark/all.csv b/test/benchmark/all.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/all_fasta.csv b/test/benchmark/all_fasta.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/all_fastq.csv b/test/benchmark/all_fastq.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/gz_all.csv b/test/benchmark/gz_all.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/gz_all_fasta.csv b/test/benchmark/gz_all_fasta.csv new file mode 100644 index 0000000..e69de29 diff --git a/test/benchmark/gz_all_fastq.csv b/test/benchmark/gz_all_fastq.csv new file mode 100644 index 0000000..e69de29