From 186d21bf50e6ce6d78e4ef429bca5ffe5c016744 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Apr 2024 17:13:46 +0100 Subject: [PATCH] Increase the input block size for bgzip. Commit e495718 changed bgzip from unix raw POSIX read() calls to hread(). Unfortunately hread gets its buffer size from stat of the input file descriptor, which can be 4KB for a pipe. We're reading 0xff00 bytes, so this mostly ends up being split over two reads, with one or both involving additional memcpys. This makes the buffered I/O perform worse than non-buffered I/O. In the most extreme cases (cat data | bgzip -l0 > /dev/null) this is a two-fold slowdown. The easy solution is just to increase the buffer size to something sensible. Currently we play it cautiously and only do this on pipes and fifos. Fixes #1767 --- hfile.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hfile.c b/hfile.c index 552b71774..20245db43 100644 --- a/hfile.c +++ b/hfile.c @@ -107,9 +107,11 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) hFILE *fp = (hFILE *) malloc(struct_size); if (fp == NULL) goto error; - if (capacity == 0) capacity = 32768; + const int maxcap = 128*1024; + + if (capacity == 0) capacity = maxcap; // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory - if (strchr(mode, 'r') && capacity > 32768) capacity = 32768; + if (strchr(mode, 'r') && capacity > maxcap) capacity = maxcap; fp->buffer = (char *) malloc(capacity); if (fp->buffer == NULL) goto error; @@ -629,7 +631,12 @@ static size_t blksize(int fd) #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE struct stat sbuf; if (fstat(fd, &sbuf) != 0) return 0; - return sbuf.st_blksize; + + // Pipes/FIFOs on linux return 4Kb here often, but it's much too small + // for performant I/O. + return S_ISFIFO(sbuf.st_mode) + ? 128*1024 + : sbuf.st_blksize; #else return 0; #endif