Skip to content

Commit

Permalink
udpates
Browse files Browse the repository at this point in the history
  • Loading branch information
telatin committed Sep 19, 2024
1 parent 294d2d1 commit 6d8cd87
Show file tree
Hide file tree
Showing 17 changed files with 129 additions and 39 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
test/local
.vscode/
bin/*
test/sim/*.*
48 changes: 31 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,48 @@ LDFLAGS = -lz -lpthread
SRC_DIR = src
BIN_DIR = bin
TEST_DIR = test
TARGET = $(SRC_DIR)/n50
TARGET = $(BIN_DIR)/n50
SIMTARGET = $(BIN_DIR)/gen

SIMDATA = test/sim/list.txt
.PHONY: all clean test

all: $(TARGET) $(SIMTARGET) $(TESTTARGET)

#Make targets
$(TARGET): $(SRC_DIR)/n50.c | $(BIN_DIR)
$(CC) $(CFLAGS) $< -o $@ $(LDFLAGS)

$(TESTTARGET): $(SRC_DIR)/n50_opt.c | $(BIN_DIR)
$(CC) $(CFLAGS) $< -o $@ $(LDFLAGS)

$(SIMTARGET): $(SRC_DIR)/gen.c | $(BIN_DIR)
$(CC) $(CFLAGS) $< -o $@
$(CC) $(CFLAGS) $< -o $@

$(BIN_DIR):
mkdir -p $(BIN_DIR)

clean:
rm -rf $(BIN_DIR)
if [ -d "test/sim" ]; then rm -f test/sim/*; fi
if [ -d "test/sim" ]; then echo "Removing sim"; rm -rf test/sim/; fi

$(SIMDATA): $(SIMTARGET)
# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
mkdir -p test/sim; \
# ./program <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\
# small datasets \
$(SIMTARGET) 500 1000 5 2000000 5 fasta test/sim/; \
$(SIMTARGET) 500 1000 5 10000 5 fastq test/sim/; \
# large datasets
$(SIMTARGET) 5000 10000 1000 20000000 3 fasta test/sim/; \
$(SIMTARGET) 5000 10000 1000 20000000 3 fastq test/sim/; \
gzip -k test/sim/*.*; \
ls test/sim/*.* > $(SIMDATA)

# Test rule
test: $(TARGET) $(SIMTARGET)
@passed=0; failed=0; \
if [ -d "$(TEST_DIR)" ]; then \
echo "Running tests in $(TEST_DIR)"; \
echo "[1] Running tests in $(TEST_DIR)"; \
for file in $(TEST_DIR)/*.*; do \
filename=$$(basename "$$file"); \
expected_n50=$${filename%%.*}; \
Expand All @@ -49,8 +62,9 @@ test: $(TARGET) $(SIMTARGET)
fi; \
done; \
fi; \
mkdir -p test/sim; \
if [ -d "test/sim" ]; then \
echo "Generating simulated reads" \
echo "[2] Generating simulated reads" \
# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
$(SIMTARGET) 10 35 1 2000 10 fasta test/sim/; \
$(SIMTARGET) 5 15 1 1000 8 fastq test/sim/; \
Expand Down Expand Up @@ -94,6 +108,7 @@ test: $(TARGET) $(SIMTARGET)
else \
echo "test/sim directory does not exist. Skipping simulation tests."; \
fi; \
echo "Tested $(TARGET)"; \
echo "Tests completed. Passed: $$passed, Failed: $$failed"; \
if [ $$failed -ne 0 ]; then exit 1; fi

Expand Down Expand Up @@ -123,24 +138,23 @@ autotest: $(TARGET)
@echo "Simple test completed."


benchmark: $(TARGET) $(SIMTARGET)
benchmark: $(TARGET) $(SIMTARGET) $(SIMDATA)

if [ -d "test/sim" ]; then \
echo "Generating simulated reads" \
# Generate simulated files each with filename like {N50}_{num_seqs}_{total_length}.{format} \
# ./program <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\
$(SIMTARGET) 20 1000 1 2000000 2 fasta test/sim/; \
$(SIMTARGET) 10 1000 100 10000 1 fastq test/sim/; \
if [ ! -d "test/benchmark" ]; then mkdir -p test/benchmark; fi; \
if [ ! -d "test/local/" ]; then mkdir -p test/local/; fi; \
echo "Running simulation tests in test/sim"; \
for file in test/sim/*.*; do \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/$$(basename "$$file").csv" "$(TARGET) $$file" "seqfu stats $$file" "seqkit stats $$file"; \
done; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all.csv" "$(TARGET) test/sim/*.*" "seqfu stats test/sim/*.*" "seqkit stats test/sim/*.*"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fasta.csv" "$(TARGET) test/sim/*.fasta" "seqfu stats test/sim/*.fasta" "seqkit stats test/sim/*.fasta"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/all_fastq.csv" "$(TARGET) test/sim/*.fastq" "seqfu stats test/sim/*.fastq" "seqkit stats test/sim/*.fastq"; \
gzip test/sim/*.*; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/sim/*.gz" "seqfu stats test/sim/*.gz" "seqkit stats test/sim/*.gz"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/sim/*.fasta.gz" "seqfu stats test/sim/*.fasta.gz" "seqkit stats test/sim/*.fasta.gz"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/sim/*.fastq.gz" "seqfu stats test/sim/*.fastq.gz" "seqkit stats test/sim/*.fastq.gz"; \
gzip -f test/sim/*.*; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all.csv" "$(TARGET) test/{sim,local}/*.gz" "seqfu stats test/{sim,local}/*.gz" "seqkit stats test/{sim,local}/*.gz"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fasta.csv" "$(TARGET) test/{sim,local}/*.fasta.gz" "seqfu stats test/{sim,local}/*.fasta.gz" "seqkit stats test/{sim,local}/*.fasta.gz"; \
hyperfine --warmup 2 --max-runs 20 --export-csv "test/benchmark/gz_all_fastq.csv" "$(TARGET) test/{sim,local}/*.fastq.gz" "seqfu stats test/{sim,local}/*.fastq.gz" "seqkit stats test/{sim,local}/*.fastq.gz"; \
else \
echo "test/sim directory does not exist. Skipping simulation tests."; \
fi
fi

Binary file modified bin/n50
Binary file not shown.
13 changes: 13 additions & 0 deletions src/bench.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
#!/bin/bash
PARENT_DIR=$(dirname $(dirname $(readlink -f $0)))

mkdir -p $PARENT_DIR/test/benchmark
mkdir -p $PARENT_DIR/test/local
mkdir -p $PARENT_DIR/test/sim

for bin in seqfu seqkit ./bin/n50 ./bin/n50o; do
if ! command -v $bin &> /dev/null; then
echo "Error: $bin could not be found."
echo "Please make sure that the binary is compiled and in the bin/ directory."
exit 1
fi
done

set -euo pipefail
# Let's if the subdirectories bin/ and test/ exist
if [ ! -d "$PARENT_DIR/bin" ] || [ ! -d "$PARENT_DIR/test" ] || [ ! -d "$PARENT_DIR/test/local" ]; then
Expand Down
3 changes: 3 additions & 0 deletions src/bench_comp.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def parse_csv(file_path: str) -> List[ToolData]:
for row in reader:
command, mean, stddev, median, user, system, min_time, max_time = row
tool = command.split()[0]
# if tool is a path, extract the basename
if '/' in tool:
tool = tool.split('/')[-1]
data.append(ToolData(
tool,
float(mean),
Expand Down
62 changes: 43 additions & 19 deletions src/gent.c → src/gen-thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@
#define MAX_SEQ_NAME_LEN 256
#define MAX_FILENAME_LEN 256
#define MAX_THREADS 16
#define CHUNK_SIZE 1000 // Number of sequences to process in each chunk

// Thread-related structures
typedef struct {
char **sequences;
int *lengths;
int start_index;
int end_index;
int num_seqs;
int *next_seq;
pthread_mutex_t *mutex;
} ThreadData;


// Function prototypes
long long calculate_n50(const int *lengths, int num_seqs, long long *total_length);
void *generate_sequences(void *arg);
Expand All @@ -27,8 +28,8 @@ void free_resources(char **sequences, int *lengths, int num_seqs);
void write_sequences(char **sequences, int *lengths, int num_seqs, const char *outfile, const char *format);

int main(int argc, char *argv[]) {
if (argc != 9) {
fprintf(stderr, "Usage: %s <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir> <num_threads>\n", argv[0]);
if (argc < 8) {
fprintf(stderr, "Usage: %s <min_seqs> <max_seqs> <min_len> <max_len> <tot_files> <format> <outdir>\n", argv[0]);
return 1;
}

Expand All @@ -39,7 +40,12 @@ int main(int argc, char *argv[]) {
int tot_files = atoi(argv[5]);
char *format = argv[6];
char *outdir = argv[7];
int num_threads = atoi(argv[8]);
// if specified, the number of threads to use else 1
int num_threads = 1;
if (argc == 9) {
num_threads = atoi(argv[8]);
}


// Input validation
if (min_seqs <= 0 || max_seqs <= 0 || min_len <= 0 || max_len <= 0 || tot_files <= 0 || num_threads <= 0 || num_threads > MAX_THREADS) {
Expand Down Expand Up @@ -101,18 +107,20 @@ int main(int argc, char *argv[]) {

// Set up threading for sequence generation
pthread_t threads[num_threads];
ThreadData thread_data[num_threads];

int seqs_per_thread = num_seqs / num_threads;
int remaining_seqs = num_seqs % num_threads;
ThreadData thread_data;
thread_data.sequences = sequences;
thread_data.lengths = contig_lengths;
thread_data.num_seqs = num_seqs;

int next_seq = 0;
thread_data.next_seq = &next_seq;

pthread_mutex_t mutex;
pthread_mutex_init(&mutex, NULL);
thread_data.mutex = &mutex;

for (int t = 0; t < num_threads; t++) {
thread_data[t].sequences = sequences;
thread_data[t].lengths = contig_lengths;
thread_data[t].start_index = t * seqs_per_thread + (t < remaining_seqs ? t : remaining_seqs);
thread_data[t].end_index = (t + 1) * seqs_per_thread + (t < remaining_seqs ? t + 1 : remaining_seqs);

if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data[t]) != 0) {
if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data) != 0) {
fprintf(stderr, "Failed to create thread %d\n", t);
free_resources(sequences, contig_lengths, num_seqs);
return 1;
Expand All @@ -124,6 +132,8 @@ int main(int argc, char *argv[]) {
pthread_join(threads[t], NULL);
}

pthread_mutex_destroy(&mutex);

// Write sequences to file (single-threaded)
write_sequences(sequences, contig_lengths, num_seqs, outfile, format);

Expand All @@ -134,7 +144,6 @@ int main(int argc, char *argv[]) {
return 0;
}


void gen_ctg_len(int min_len, int max_len, int num_seqs, int *lengths) {
// calculate the even distance between num_seqs sequences spanning from min_len to max_len
long long STEP = (max_len - min_len) / num_seqs;
Expand Down Expand Up @@ -208,11 +217,26 @@ int *generate_contigs(int N50, int SUM_LEN, int TOT_SEQS) {
return contig_list;
}


void *generate_sequences(void *arg) {
ThreadData *data = (ThreadData *)arg;
for (int i = data->start_index; i < data->end_index; i++) {
generate_sequence(data->lengths[i], data->sequences[i]);
int start, end;

while (1) {
pthread_mutex_lock(data->mutex);
start = *data->next_seq;
end = start + CHUNK_SIZE;
if (end > data->num_seqs) end = data->num_seqs;
*data->next_seq = end;
pthread_mutex_unlock(data->mutex);

if (start >= data->num_seqs) break;

for (int i = start; i < end; i++) {
generate_sequence(data->lengths[i], data->sequences[i]);
}
}

return NULL;
}

Expand Down
31 changes: 28 additions & 3 deletions src/gen.c
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>

/*
* DNA Sequence Generator
* Andrea Telatin 2023, (C) Quadram Institute Bioscience
Expand All @@ -22,6 +24,7 @@
* This tool is useful for testing sequence analysis software, benchmarking
* bioinformatics pipelines, and generating sample data for educational purposes.
*/
#define BUFFER_SIZE 1048576 // 1 MB buffer
#define MAX_SEQ_NAME_LEN 256
#define MAX_FILENAME_LEN 256

Expand Down Expand Up @@ -81,9 +84,9 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "Memory allocation failed for contig_lengths\n");
return 1;
}
fprintf(stderr, "%d file [%d num seqs]\n", i, num_seqs);
fprintf(stderr, "%d/%d %s file [%d num seqs]: ", i, tot_files, format, num_seqs);
gen_ctg_len(min_len, max_len, num_seqs, contig_lengths);

fprintf(stderr, "OK\n");
long long total_length; // Change to long long
int N50 = calculate_n50(contig_lengths, num_seqs, &total_length);
fprintf(stderr, "\tTotal length: %lld\n", total_length); // Use %lld for long long
Expand Down Expand Up @@ -113,7 +116,7 @@ int main(int argc, char *argv[]) {
} else {
write_fastq(sequences, contig_lengths, num_seqs, outfile);
}
fprintf(stderr, " [Done: %s]\n", outfile);
fprintf(stderr, " [Written to %s]\n", outfile);
free_resources(sequences, contig_lengths, num_seqs);
}

Expand Down Expand Up @@ -203,6 +206,15 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi
return;
}

char *buffer = malloc(BUFFER_SIZE);
if (!buffer) {
fprintf(stderr, "Unable to allocate buffer for writing\n");
fclose(f);
return;
}

setvbuf(f, buffer, _IOFBF, BUFFER_SIZE);

for (int i = 0; i < num_seqs; i++) {
fprintf(f, ">seq%d\n", i + 1);
for (int j = 0; j < lengths[i]; j += 60) {
Expand All @@ -211,7 +223,9 @@ void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfi
}
}

fflush(f);
fclose(f);
free(buffer);
}

void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfile) {
Expand All @@ -221,6 +235,15 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi
return;
}

char *buffer = malloc(BUFFER_SIZE);
if (!buffer) {
fprintf(stderr, "Unable to allocate buffer for writing\n");
fclose(f);
return;
}

setvbuf(f, buffer, _IOFBF, BUFFER_SIZE);

for (int i = 0; i < num_seqs; i++) {
fprintf(f, "@seq%d\n%s\n+\n", i + 1, sequences[i]);
for (int j = 0; j < lengths[i]; j++) {
Expand All @@ -229,7 +252,9 @@ void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfi
fputc('\n', f);
}

fflush(f);
fclose(f);
free(buffer);
}

void free_resources(char **sequences, int *lengths, int num_seqs) {
Expand Down
Empty file added test/benchmark/*.*.csv
Empty file.
2 changes: 2 additions & 0 deletions test/benchmark/1396484_902_903600707.fasta.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
command,mean,stddev,median,user,system,min,max
bin/n50 test/sim/1396484_902_903600707.fasta,1.6391795357799999,0.020583813470225212,1.63523198198,1.5063611,0.11844614,1.61149171148,1.6722827934800002
4 changes: 4 additions & 0 deletions test/benchmark/1454777_575_591143151.fasta.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
command,mean,stddev,median,user,system,min,max
src/n50 test/sim/1454777_575_591143151.fasta,1.06535981002,0.01887172614604079,1.0614336600199998,0.9829298200000001,0.07575556,1.04674813952,1.11139147252
seqfu stats test/sim/1454777_575_591143151.fasta,0.3583592433200001,0.005983688320461325,0.35700688952000004,0.28345232,0.06740586,0.35192797252,0.37087238852000004
seqkit stats test/sim/1454777_575_591143151.fasta,0.39314732252000006,0.006703037820939213,0.39226030602000006,0.32644592000000006,0.06484776,0.38318080552000006,0.40773626352000003
4 changes: 4 additions & 0 deletions test/benchmark/6917_312_1496654.fastq.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
command,mean,stddev,median,user,system,min,max
src/n50 test/sim/6917_312_1496654.fastq,0.008528284129999994,0.0006431936679455821,0.008399792479999994,0.003592820000000002,0.0029625599999999986,0.007366979979999995,0.009651687979999995
seqfu stats test/sim/6917_312_1496654.fastq,0.013984221579999997,0.0010317730767312214,0.013940396479999995,0.004972670000000002,0.004072159999999999,0.011888604979999997,0.015608520979999997
seqkit stats test/sim/6917_312_1496654.fastq,0.033020338329999996,0.0027281133626468773,0.03256454247999999,0.021750320000000004,0.011906559999999998,0.029904187979999997,0.041659812979999995
Empty file added test/benchmark/all.csv
Empty file.
Empty file added test/benchmark/all_fasta.csv
Empty file.
Empty file added test/benchmark/all_fastq.csv
Empty file.
Empty file added test/benchmark/gz_all.csv
Empty file.
Empty file added test/benchmark/gz_all_fasta.csv
Empty file.
Empty file added test/benchmark/gz_all_fastq.csv
Empty file.

0 comments on commit 6d8cd87

Please sign in to comment.