Skip to content

Commit

Permalink
Remove UR:file:// and UR:ftp:// from ref search path, plus REF_PATH t…
Browse files Browse the repository at this point in the history
…o EBI.

While use of the EBI refget server was originally encouraged by the
CRAM inventors, it became a self-imposed DDOS and it is now
unreliable.  This removes EBI as a fallback when REF_PATH has not been
set.

In doing this we discovered that we could still retrieve references
(ironically also from EBI due to the test being a 1000genomes CRAM)
via the SQ UR: tag supporting remote URIs.  This behaviour is
explicity listed as not being supported in the samtools manpage and we
believe it was an accidental ability added when switching from fopen
to bgzf_open for reading the UR reference file.

Note this check must be in cram_populate_ref and not load_ref_portion
or bgzf_open_ref as the user still has the ability to explicitly
request an external reference, eg via "samtools view -T URI".

open_path_mfile() now takes an extra 'int *local' argument which is
filled out with non-zero if the find found in REF_PATH is local.
Non-local files will be cached to REF_CACHE if set, but it no longer
has a default value as we did when ebi refget was the default REF_PATH.
This means it should operate much as before, except for the lack of
EBI defaults.
  • Loading branch information
jkbonfield committed Jan 27, 2025
1 parent 7b65da3 commit e76d1ae
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 61 deletions.
77 changes: 21 additions & 56 deletions cram/cram_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2473,6 +2473,9 @@ static refs_t *refs_create(void) {
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
BGZF *fp;

if (strncmp(fn, "file://", 7) == 0)
fn += 7;

if (!is_md5 && !hisremote(fn)) {
char fai_file[PATH_MAX];

Expand Down Expand Up @@ -2934,30 +2937,6 @@ static void mkdir_prefix(char *path, int mode) {
*cp = '/';
}

/*
* Return the cache directory to use, based on the first of these
* environment variables to be set to a non-empty value.
*/
static const char *get_cache_basedir(const char **extra) {
char *base;

*extra = "";

base = getenv("XDG_CACHE_HOME");
if (base && *base) return base;

base = getenv("HOME");
if (base && *base) { *extra = "/.cache"; return base; }

base = getenv("TMPDIR");
if (base && *base) return base;

base = getenv("TEMP");
if (base && *base) return base;

return "/tmp";
}

/*
* Queries the M5 string from the header and attempts to populate the
* reference from this using the REF_PATH environment.
Expand All @@ -2971,31 +2950,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
sam_hrec_tag_t *tag;
char path[PATH_MAX];
kstring_t path_tmp = KS_INITIALIZE;
char cache[PATH_MAX], cache_root[PATH_MAX];
char *local_cache = getenv("REF_CACHE");
mFILE *mf;
int local_path = 0;

hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);

cache_root[0] = '\0';

if (!ref_path || *ref_path == '\0') {
/*
* If we have no ref path, we use the EBI server.
* However to avoid spamming it we require a local ref cache too.
*/
ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
if (!local_cache || *local_cache == '\0') {
const char *extra;
const char *base = get_cache_basedir(&extra);
snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
local_cache = cache;
hts_log_info("Populating local cache: %s", local_cache);
}
}

if (!r->name)
return -1;

Expand All @@ -3009,7 +2969,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {

/* Use cache if available */
if (local_cache && *local_cache) {
if (expand_cache_path(path, local_cache, tag->str+3) == 0)
struct stat sb;
if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
stat(path, &sb) == 0)
// Found it in the local cache
local_path = 1;
}

Expand Down Expand Up @@ -3053,7 +3016,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {


/* Otherwise search full REF_PATH; slower as loads entire file */
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
int is_local = 0;
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
size_t sz;
r->seq = mfsteal(mf, &sz);
if (r->seq) {
Expand All @@ -3069,15 +3033,22 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
} else {
refs_t *refs;
const char *fn;
sam_hrec_tag_t *UR_tag;

no_M5:
/* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
return -1;

if (strncmp(UR_tag->str+3, "file:", 5) != 0) {
// Documented as omitted, but accidentally supported until now
hts_log_error("UR tags pointing to remote files are not supported");
return -1;
}

fn = (strncmp(tag->str+3, "file:", 5) == 0)
? tag->str+8
: tag->str+3;
fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
? UR_tag->str+8
: UR_tag->str+3;

if (fd->refs->fp) {
if (bgzf_close(fd->refs->fp) != 0)
Expand Down Expand Up @@ -3108,15 +3079,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
}

/* Populate the local disk cache if required */
if (local_cache && *local_cache) {
if (!is_local && local_cache && *local_cache) {
hFILE *fp;

if (*cache_root && !is_directory(cache_root)) {
hts_log_warning("Creating reference cache directory %s\n"
"This may become large; see the samtools(1) manual page REF_CACHE discussion",
cache_root);
}

if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
return 0; // Not fatal - we have the data already so keep going.
}
Expand Down
17 changes: 13 additions & 4 deletions cram/open_trace_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) {
* all of the locations listed in 'path' (which is a colon separated list).
* If 'path' is NULL it uses the RAWDATA environment variable instead.
*
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
* URL.
*
* Returns a mFILE pointer when found.
* NULL otherwise.
*/
mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
int *local) {
char *newsearch;
char *ele;
mFILE *fp;

if (local)
*local = 1;

/* Use path first */
if (!path)
path = getenv("RAWDATA");
Expand Down Expand Up @@ -361,14 +368,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {

if (0 == strncmp(ele2, "URL=", 4)) {
if ((fp = find_file_url(file, ele2+4))) {
if (local)
*local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0;
free(newsearch);
return fp;
}
} else if (!strncmp(ele2, "http:", 5) ||
!strncmp(ele2, "https:", 6) ||
!strncmp(ele2, "ftp:", 4)) {
} else if (hisremote(ele2)) {
if ((fp = find_file_url(file, ele2))) {
free(newsearch);
if (local)
*local = 0;
return fp;
}
} else if ((fp = find_file_dir(file, ele2))) {
Expand Down
6 changes: 5 additions & 1 deletion cram/open_trace_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath);
* all of the locations listed in 'path' (which is a colon separated list).
* If 'path' is NULL it uses the RAWDATA environment variable instead.
*
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
* URL.
*
* Returns a mFILE pointer when found.
* NULL otherwise.
*/
mFILE *open_path_mfile(const char *file, char *path, char *relative_to);
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
int *local);

/*
* Returns a mFILE containing the entire contents of the url;
Expand Down

0 comments on commit e76d1ae

Please sign in to comment.