diff --git a/cram/cram_io.c b/cram/cram_io.c index 5e5891114..ce549b728 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2473,6 +2473,9 @@ static refs_t *refs_create(void) { static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { BGZF *fp; + if (strncmp(fn, "file://", 7) == 0) + fn += 7; + if (!is_md5 && !hisremote(fn)) { char fai_file[PATH_MAX]; @@ -2934,30 +2937,6 @@ static void mkdir_prefix(char *path, int mode) { *cp = '/'; } -/* - * Return the cache directory to use, based on the first of these - * environment variables to be set to a non-empty value. - */ -static const char *get_cache_basedir(const char **extra) { - char *base; - - *extra = ""; - - base = getenv("XDG_CACHE_HOME"); - if (base && *base) return base; - - base = getenv("HOME"); - if (base && *base) { *extra = "/.cache"; return base; } - - base = getenv("TMPDIR"); - if (base && *base) return base; - - base = getenv("TEMP"); - if (base && *base) return base; - - return "/tmp"; -} - /* * Queries the M5 string from the header and attempts to populate the * reference from this using the REF_PATH environment. @@ -2971,31 +2950,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { sam_hrec_tag_t *tag; char path[PATH_MAX]; kstring_t path_tmp = KS_INITIALIZE; - char cache[PATH_MAX], cache_root[PATH_MAX]; char *local_cache = getenv("REF_CACHE"); mFILE *mf; int local_path = 0; hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); - cache_root[0] = '\0'; - - if (!ref_path || *ref_path == '\0') { - /* - * If we have no ref path, we use the EBI server. - * However to avoid spamming it we require a local ref cache too. - */ - ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s"; - if (!local_cache || *local_cache == '\0') { - const char *extra; - const char *base = get_cache_basedir(&extra); - snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra); - snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); - local_cache = cache; - hts_log_info("Populating local cache: %s", local_cache); - } - } - if (!r->name) return -1; @@ -3009,7 +2969,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Use cache if available */ if (local_cache && *local_cache) { - if (expand_cache_path(path, local_cache, tag->str+3) == 0) + struct stat sb; + if (expand_cache_path(path, local_cache, tag->str+3) == 0 && + stat(path, &sb) == 0) + // Found it in the local cache local_path = 1; } @@ -3053,7 +3016,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Otherwise search full REF_PATH; slower as loads entire file */ - if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { + int is_local = 0; + if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) { size_t sz; r->seq = mfsteal(mf, &sz); if (r->seq) { @@ -3069,15 +3033,22 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } else { refs_t *refs; const char *fn; + sam_hrec_tag_t *UR_tag; no_M5: /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ - if (!(tag = sam_hrecs_find_key(ty, "UR", NULL))) + if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL))) + return -1; + + if (strncmp(UR_tag->str+3, "file:", 5) != 0) { + // Documented as omitted, but accidentally supported until now + hts_log_error("UR tags pointing to remote files are not supported"); return -1; + } - fn = (strncmp(tag->str+3, "file:", 5) == 0) - ? tag->str+8 - : tag->str+3; + fn = (strncmp(UR_tag->str+3, "file:", 5) == 0) + ? UR_tag->str+8 + : UR_tag->str+3; if (fd->refs->fp) { if (bgzf_close(fd->refs->fp) != 0) @@ -3108,15 +3079,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } /* Populate the local disk cache if required */ - if (local_cache && *local_cache) { + if (!is_local && local_cache && *local_cache) { hFILE *fp; - if (*cache_root && !is_directory(cache_root)) { - hts_log_warning("Creating reference cache directory %s\n" - "This may become large; see the samtools(1) manual page REF_CACHE discussion", - cache_root); - } - if (expand_cache_path(path, local_cache, tag->str+3) < 0) { return 0; // Not fatal - we have the data already so keep going. } diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 4d617b736..31f450cd7 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -324,14 +324,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) { * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local) { char *newsearch; char *ele; mFILE *fp; + if (local) + *local = 1; + /* Use path first */ if (!path) path = getenv("RAWDATA"); @@ -361,14 +368,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { if (0 == strncmp(ele2, "URL=", 4)) { if ((fp = find_file_url(file, ele2+4))) { + if (local) + *local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0; free(newsearch); return fp; } - } else if (!strncmp(ele2, "http:", 5) || - !strncmp(ele2, "https:", 6) || - !strncmp(ele2, "ftp:", 4)) { + } else if (hisremote(ele2)) { if ((fp = find_file_url(file, ele2))) { free(newsearch); + if (local) + *local = 0; return fp; } } else if ((fp = find_file_dir(file, ele2))) { diff --git a/cram/open_trace_file.h b/cram/open_trace_file.h index 45860980d..b7b6a4a7c 100644 --- a/cram/open_trace_file.h +++ b/cram/open_trace_file.h @@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath); * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * + * If non-NULL *local is filled out to 1 for a local file and 0 for a remote + * URL. + * * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(const char *file, char *path, char *relative_to); +mFILE *open_path_mfile(const char *file, char *path, char *relative_to, + int *local); /* * Returns a mFILE containing the entire contents of the url;