From f29174d9f775e58a8505b5cdc4f35e16b7aa65f7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 23 Jan 2025 17:18:16 +0000 Subject: [PATCH] Remove UR:file:// and UR:ftp:// from ref search path, plus REF_PATH to EBI. While use of the EBI refget server was originally encouraged by the CRAM inventors, it became a self-imposed DDOS and it is now unreliable. This removes EBI as a fallback when REF_PATH has not been set. In doing this we discovered that we could still retrieve references (ironically also from EBI due to the test being a 1000genomes CRAM) via the SQ UR: tag supporting remote URIs. This behaviour is explicity listed as not being supported in the samtools manpage and we believe it was an accidental ability added when switching from fopen to bgzf_open for reading the UR reference file. Note this check must be in cram_populate_ref and not load_ref_portion or bgzf_open_ref as the user still has the ability to explicitly request an external reference, eg via "samtools view -T URI". open_path_mfile() now takes an extra 'int *local' argument which is filled out with non-zero if the find found in REF_PATH is local. Non-local files will be cached to REF_CACHE if set, but it no longer has a default value as we did when ebi refget was the default REF_PATH. This means it should operate much as before, except for the lack of EBI defaults. --- cram/cram_io.c | 78 ++++++++++++------------------------------ cram/open_trace_file.c | 17 ++++++--- cram/open_trace_file.h | 6 +++- 3 files changed, 40 insertions(+), 61 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 5e5891114..6cf639df4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2473,6 +2473,9 @@ static refs_t *refs_create(void) { static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { BGZF *fp; + if (strncmp(fn, "file://", 7) == 0) + fn += 7; + if (!is_md5 && !hisremote(fn)) { char fai_file[PATH_MAX]; @@ -2934,30 +2937,6 @@ static void mkdir_prefix(char *path, int mode) { *cp = '/'; } -/* - * Return the cache directory to use, based on the first of these - * environment variables to be set to a non-empty value. - */ -static const char *get_cache_basedir(const char **extra) { - char *base; - - *extra = ""; - - base = getenv("XDG_CACHE_HOME"); - if (base && *base) return base; - - base = getenv("HOME"); - if (base && *base) { *extra = "/.cache"; return base; } - - base = getenv("TMPDIR"); - if (base && *base) return base; - - base = getenv("TEMP"); - if (base && *base) return base; - - return "/tmp"; -} - /* * Queries the M5 string from the header and attempts to populate the * reference from this using the REF_PATH environment. @@ -2971,31 +2950,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { sam_hrec_tag_t *tag; char path[PATH_MAX]; kstring_t path_tmp = KS_INITIALIZE; - char cache[PATH_MAX], cache_root[PATH_MAX]; char *local_cache = getenv("REF_CACHE"); mFILE *mf; int local_path = 0; hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); - cache_root[0] = '\0'; - - if (!ref_path || *ref_path == '\0') { - /* - * If we have no ref path, we use the EBI server. - * However to avoid spamming it we require a local ref cache too. - */ - ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s"; - if (!local_cache || *local_cache == '\0') { - const char *extra; - const char *base = get_cache_basedir(&extra); - snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra); - snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); - local_cache = cache; - hts_log_info("Populating local cache: %s", local_cache); - } - } - if (!r->name) return -1; @@ -3009,7 +2969,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Use cache if available */ if (local_cache && *local_cache) { - if (expand_cache_path(path, local_cache, tag->str+3) == 0) + struct stat sb; + if (expand_cache_path(path, local_cache, tag->str+3) == 0 && + stat(path, &sb) == 0) + // Found it in the local cache local_path = 1; } @@ -3053,7 +3016,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Otherwise search full REF_PATH; slower as loads entire file */ - if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { + int is_local = 0; + if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) { size_t sz; r->seq = mfsteal(mf, &sz); if (r->seq) { @@ -3069,15 +3033,23 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } else { refs_t *refs; const char *fn; + sam_hrec_tag_t *UR_tag; no_M5: /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ - if (!(tag = sam_hrecs_find_key(ty, "UR", NULL))) + if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL))) + return -1; + + if (strstr(UR_tag->str+3, "://") && + strncmp(UR_tag->str+3, "file:", 5) != 0) { + // Documented as omitted, but accidentally supported until now + hts_log_error("UR tags pointing to remote files are not supported"); return -1; + } - fn = (strncmp(tag->str+3, "file:", 5) == 0) - ? tag->str+8 - : tag->str+3; + fn = (strncmp(UR_tag->str+3, "file:", 5) == 0) + ? UR_tag->str+8 + : UR_tag->str+3; if (fd->refs->fp) { if (bgzf_close(fd->refs->fp) != 0) @@ -3108,15 +3080,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } /* Populate the local disk cache if required */ - if (local_cache && *local_cache) { + if (!is_local && local_cache && *local_cache) { hFILE *fp; - if (*cache_root && !is_directory(cache_root)) { - hts_log_warning("Creating reference cache directory %s\n" - "This may become large; see the samtools(1) manual page REF_CACHE discussion", - cache_root); - } - if (expand_cache_path(path, local_cache, tag->str+3) < 0) { return 0; // Not fatal - we have the data already so keep going. } diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 4d617b736..31f450cd7 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -324,14 +324,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) { * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local) { char *newsearch; char *ele; mFILE *fp; + if (local) + *local = 1; + /* Use path first */ if (!path) path = getenv("RAWDATA"); @@ -361,14 +368,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { if (0 == strncmp(ele2, "URL=", 4)) { if ((fp = find_file_url(file, ele2+4))) { + if (local) + *local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0; free(newsearch); return fp; } - } else if (!strncmp(ele2, "http:", 5) || - !strncmp(ele2, "https:", 6) || - !strncmp(ele2, "ftp:", 4)) { + } else if (hisremote(ele2)) { if ((fp = find_file_url(file, ele2))) { free(newsearch); + if (local) + *local = 0; return fp; } } else if ((fp = find_file_dir(file, ele2))) { diff --git a/cram/open_trace_file.h b/cram/open_trace_file.h index 45860980d..b7b6a4a7c 100644 --- a/cram/open_trace_file.h +++ b/cram/open_trace_file.h @@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath); * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to); +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local); /* * Returns a mFILE containing the entire contents of the url;