diff --git a/README.md b/README.md
index 11dc0b2..c50b95c 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,29 @@ end
 BGZIP
 -----
 
+http://samtools.github.io/hts-specs/SAMv1.pdf
+
+
+```
+The random access method to be described next limits the uncompressed contents of each BGZF block
+to a maximum of 2^16 bytes of data. Thus while ISIZE is stored as a uint32_t as per the gzip format, in
+BGZF it is limited to the range [0, 65536]. BSIZE can represent BGZF block sizes in the range [1, 65536],
+though typically BSIZE will be rather less than ISIZE due to compression.
+
+4.1.1 Random access
+BGZF files support random access through the BAM file index. To achieve this, the BAM file index uses
+virtual file offsets into the BGZF file. Each virtual file offset is an unsigned 64-bit integer, defined as:
+coffset<<16|uoffset, where coffset is an unsigned byte offset into the BGZF file to the beginning of a
+BGZF block, and uoffset is an unsigned byte offset into the uncompressed data stream represented by that
+BGZF block. Virtual file offsets can be compared, but subtraction between virtual file offsets and addition
+between a virtual offset and an integer are both disallowed.
+```
+
+TABIX
+-----
+
 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3042176/
+https://samtools.github.io/hts-specs/tabix.pdf
 
 ```
 2.1 Sorting and BGZF compression
@@ -149,9 +171,9 @@ and then test each record in the collected bins for overlaps. In principle,
 bins can be selected freely as long as each record can be assigned to a
 bin. In the Tabix binning index, we adopt a multilevel binning scheme where
 bins at same level are non-overlapping and of the same size. In
-Tabix, each bin k, 0<=k<=37 449, represents a half-close-half-open interval
+Tabix, each bin k, 0<=k<=37,449, represents a half-close-half-open interval
 [(k-ol)sl, (k-ol+1)sl), where l = [log2(7k+1)/3] is the level of the bin,
-sl = 229-3l is the size of the bin at level l and ol = (23l - 1)/7 is the
+sl = 2^(29-3l) is the size of the bin at level l and ol = (2^(3l) - 1)/7 is the
 offset at l. In this scheme, bin 0 spans 512 Mb, 1-8 span 64 Mb, 9-72 8 Mb,
 73-584 1 Mb, 585-4680 128 kb and 4681-37449 span 16 kb intervals.
 The scheme is very similar to the UCSC binning (Kent et al., 2002) except that in UCSC,
diff --git a/VERSION b/VERSION
index ea710ab..a58941b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2
\ No newline at end of file
+1.3
\ No newline at end of file
diff --git a/tabixpy/tabixpy.py b/tabixpy/tabixpy.py
index bbff5b4..a845b76 100644
--- a/tabixpy/tabixpy.py
+++ b/tabixpy/tabixpy.py
@@ -5,14 +5,22 @@
 import json
 import logging
 
-__format_ver__ = 3
+__format_ver__ = 4
 __format_name__ = "TBJ"
 
 COMPRESS = True
 
+BRICK_SIZE = 216
+BLOCK_SIZE = 2**16
+FILE_BYTES_MASK = 0xFFFFFFFFFFFF0000
+BLOCK_BYTES_MASK = 0x000000000000FFFF
+
 logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)23s - %(name)s - %(levelname)6s - %(message)s"
+    level = logging.DEBUG,
+    format = "%(asctime)8s - %(name)s - %(levelname)-6s - %(message)s",
+    datefmt = "%H:%M:%S",
+    # format="%(asctime)23s - %(name)s - %(levelname)6s - %(message)s",
+    # datefmt='%Y-%m-%d %H:%M:%S',
 )
 
 logger = logging.getLogger('tabixpy')
@@ -54,6 +62,89 @@ def save(data, ingz, compress=COMPRESS):
     with opener(outfileJ, "wt") as fhd:
         json.dump(data, fhd, indent=1)
 
+def reg2bin(begPos, endPos):
+    #- Given a zero-based, half-closed and half-open region [beg, end),
+    #  the bin number is calculated with the following C function:
+
+    endPos -= 1
+    if (begPos>>14 == endPos>>14): return ((1<<15)-1)//7 + (begPos>>14)
+    if (begPos>>17 == endPos>>17): return ((1<<12)-1)//7 + (begPos>>17)
+    if (begPos>>20 == endPos>>20): return ((1<< 9)-1)//7 + (begPos>>20)
+    if (begPos>>23 == endPos>>23): return ((1<< 6)-1)//7 + (begPos>>23)
+    if (begPos>>26 == endPos>>26): return ((1<< 3)-1)//7 + (begPos>>26)
+    return 0
+
+MAX_BIN = (((1<<18)-1)//7)
+def reg2bins(rbeg, rend):
+    # The list of bins that may overlap a region [beg, end) can be obtained
+    # with the following C function:
+    #define MAX_BIN (((1<<18)-1)/7)
+
+    res = [None] * MAX_BIN
+
+    i = 0
+    k = 0
+    rend -= 1
+    res[i] = 0
+    i += 1
+
+    ranges = [
+        [    1 + (rbeg>>26),    1 + (rend>>26)],
+        [    9 + (rbeg>>23),    9 + (rend>>23)],
+        [   73 + (rbeg>>20),   73 + (rend>>20)],
+        [  585 + (rbeg>>17),  585 + (rend>>17)],
+        [ 4681 + (rbeg>>14), 4681 + (rend>>14)]
+    ]
+
+    for b, e in ranges:
+        for k in range(b, e+1):
+            res[i] = k
+            i += 1
+
+    return res[:i]  # the i bins that may overlap [rbeg, rend)
+
+def getPos(filehandle, real_pos, bytes_pos):
+    # print(f"seek {real_pos} {bytes_pos}")
+    filehandle.seek(real_pos, 0)
+
+    bin_pos = -1
+    first_pos = -1
+    last_pos = -1
+    with gzip.open(filehandle, 'rb') as g:
+        block = g.read(BLOCK_SIZE).decode()
+        if len(block) > 0 and block[0] != "#":
+            rows = block.split("\n")
+            first_row = None
+            last_row = None
+            if len(rows) > 1:
+                if len(rows[0]) == len(rows[1]):
+                    first_row = rows[0]
+                else:
+                    first_row = rows[1]
+
+                if len(rows[-2]) == len(rows[-1]):
+                    last_row = rows[-1]
+                else:
+                    last_row = rows[-2]
+            else:
+                first_row = rows[0]
+                last_row = rows[-1]
+
+            first_cols = first_row.split("\t")
+            first_pos = first_cols[1]
+            first_pos = int(first_pos)
+
+            last_cols = last_row.split("\t")
+            last_pos = last_cols[1]
+            last_pos = int(last_pos)
+
+            bin_reg = block[bytes_pos:bytes_pos+1024]
+            bin_cols = bin_reg.split("\t")
+            bin_pos = bin_cols[1]
+            bin_pos = int(bin_pos)
+    return bin_pos, first_pos, last_pos
+
+
 def read_tabix(infile):
     logger.info(f"reading {infile}")
 
@@ -62,6 +153,7 @@ def read_tabix(infile):
     assert os.path.exists(inid), inid
 
     fhd = gzip.open(inid, "rb")
+    inf = open(ingz, "rb")
     get_values = gen_value_getter(fhd)
     data = {}
 
@@ -137,7 +229,7 @@ def read_tabix(infile):
             ref["ref_name"] = names[ref_n]
data["refs"][ref_n] = ref - (n_bin,) = get_values('> 16 for c in chunks] - block_bytes_offsets = [c & block_bytes_mask for c in chunks] + block_bytes_offsets = [c & BLOCK_BYTES_MASK for c in chunks] - assert all([i >= 0 for i in real_file_offsets]) - assert all([i < 2**48 for i in real_file_offsets]) + assert all([i >= 0 for i in real_file_offsets]) + assert all([i < 2**48 for i in real_file_offsets]) - assert all([i >= 0 for i in block_bytes_offsets]) - assert all([i < 2**16 for i in block_bytes_offsets]) + assert all([i >= 0 for i in block_bytes_offsets]) + assert all([i < BLOCK_SIZE for i in block_bytes_offsets]) - chunks_data = [None] * n_chunk + chunks_data = { + "chunk_begin" : [None] * n_chunk, + "chunk_end" : [None] * n_chunk, + } - for p, c in enumerate(range(0,len(chunks),2)): - chk_beg = chunks[c+0] - chk_end = chunks[c+1] + for chunk_n, chunk_i in enumerate(range(0,len(chunks),2)): + chk_beg = chunks[chunk_i+0] + chk_end = chunks[chunk_i+1] - chk_beg_real = real_file_offsets[c+0] - chk_end_real = real_file_offsets[c+1] + chk_real_beg = real_file_offsets[chunk_i+0] + chk_real_end = real_file_offsets[chunk_i+1] - chk_beg_bytes = block_bytes_offsets[c+0] - chk_end_bytes = block_bytes_offsets[c+1] + chk_bytes_beg = block_bytes_offsets[chunk_i+0] + chk_bytes_end = block_bytes_offsets[chunk_i+1] # logger.debug(f"begin") # logger.debug(f" block {chk_beg:064b} {chk_beg:15,d}") - # logger.debug(f" mask {block_bytes_mask:064b}") - # logger.debug(f" real offset {chk_beg_real:064b} {chk_beg_real:15,d}") - # logger.debug(f" block byte offset {chk_beg_bytes:064b} {chk_beg_bytes:15,d}") + # logger.debug(f" mask {BLOCK_BYTES_MASK:064b}") + # logger.debug(f" real offset {chk_real_beg:064b} {chk_real_beg:15,d}") + # logger.debug(f" block byte offset {chk_bytes_beg:064b} {chk_bytes_beg:15,d}") # logger.debug(f"end") # logger.debug(f" block {chk_end:064b} {chk_end:15,d}") - # logger.debug(f" mask {block_bytes_mask:064b}") - # logger.debug(f" real offset {chk_end_real:064b} {chk_end_real:15,d}") - # logger.debug(f" block byte offset {chk_end_bytes:064b} {chk_end_bytes:15,d}") + # logger.debug(f" mask {BLOCK_BYTES_MASK:064b}") + # logger.debug(f" real offset {chk_real_end:064b} {chk_real_end:15,d}") + # logger.debug(f" block byte offset {chk_bytes_end:064b} {chk_bytes_end:15,d}") # logger.debug("") - chunks_data[p] = [ - [chk_beg_real, chk_beg_bytes], - [chk_end_real, chk_end_bytes] - ] - - if getLogLevel() == "DEBUG": - logger.debug(f"======================== List of chunks (n=n_chunk[{n_chunk:15,d}]) ============================") - for chunk_n in range(n_chunk): - [ - [chk_beg_real, chk_beg_bytes], - [chk_end_real, chk_end_bytes] - ] = chunks_data[chunk_n] + assert chk_beg < chk_end + assert chk_real_beg < chk_real_end + # assert chk_bytes_beg < chk_bytes_end + + if last_chk_real_beg is not None: + assert last_chk_real_beg < chk_real_beg + if last_chk_real_end is not None: + assert last_chk_real_end == chk_real_beg + + if last_chk_real_end is not None: + assert last_chk_real_end < chk_real_end + assert last_chk_real_end == chk_real_beg + + last_chk_real_beg = chk_real_beg + last_chk_real_end = chk_real_end + + if chk_beg in position_memoize: + chk_bin_pos_beg, chk_first_pos_beg, chk_last_pos_beg = position_memoize[chk_beg] + else: + chk_bin_pos_beg, chk_first_pos_beg, chk_last_pos_beg = getPos(inf, chk_real_beg, chk_bytes_beg) + position_memoize[chk_beg] = (chk_bin_pos_beg, chk_first_pos_beg, chk_last_pos_beg) + + if chk_end in position_memoize: + chk_bin_pos_end, chk_first_pos_end, 
chk_last_pos_end = position_memoize[chk_end] + else: + chk_bin_pos_end, chk_first_pos_end, chk_last_pos_end = getPos(inf, chk_real_end, chk_bytes_end) + position_memoize[chk_end] = (chk_bin_pos_end, chk_first_pos_end, chk_last_pos_end) + + chunks_data["chunk_begin" ][chunk_n] = { + "real": chk_real_beg, + "bytes": chk_bytes_beg, + "bin_pos": chk_bin_pos_beg, + "first_pos": chk_first_pos_beg, + "last_pos": chk_last_pos_beg + } + + chunks_data["chunk_end" ][chunk_n] = { + "real": chk_real_end, + "bytes": chk_bytes_end, + "bin_pos": chk_bin_pos_end, + "first_pos": chk_first_pos_end, + "last_pos": chk_last_pos_end + } + + if getLogLevel() == "DEBUG": + logger.debug(f"======================== List of chunks (n=n_chunk[{chunk_n:15,d}]) ============================") logger.debug(f" chunk_n {chunk_n+1:15,d}/{n_chunk:15,d}") - logger.debug(f" cnk_beg Virtual file offset of the start of the chunk uint64_t {chk_beg_real:15,d} {chk_beg_bytes:15,d}") - logger.debug(f" cnk_end Virtual file offset of the end of the chunk uint64_t {chk_end_real:15,d} {chk_end_bytes:15,d}") - ref["bins"][bin_n]["chunks"] = chunks_data - - # chunks = [(b,e) for b,e in zip(chunks[0::2], chunks[1::2])] - # ref["bins"][bin_n]["chunks"] = chunks - # if getLogLevel() == "DEBUG": - # logger.debug(f"======================== List of chunks (n=n_chunk[{n_chunk:15,d}]) ============================") - # for chunk_n in range(n_chunk): - # cnk_beg, cnk_end = chunks[chunk_n] - # logger.debug(f" chunk_n {chunk_n+1:15,d}/{n_chunk:15,d}") - # logger.debug(f" cnk_beg Virtual file offset of the start of the chunk uint64_t {cnk_beg:15,d}") - # logger.debug(f" cnk_end Virtual file offset of the end of the chunk uint64_t {cnk_end:15,d}") + logger.debug(f" cnk_beg Virtual file offset of the start of the chunk uint64_t {chk_real_beg:15,d} {chk_bytes_beg:15,d}") + logger.debug(f" cnk_end Virtual file offset of the end of the chunk uint64_t {chk_real_end:15,d} {chk_bytes_end:15,d}") + logger.debug(f" cnk_beg_1st_pos uint64_t {chk_first_pos_beg:15,d}") + logger.debug(f" cnk_end_1st_pos uint64_t {chk_first_pos_end:15,d}") + logger.debug(f" cnk_beg_bin_pos uint64_t {chk_bin_pos_beg:15,d}") + logger.debug(f" cnk_end_bin_pos uint64_t {chk_bin_pos_end:15,d}") + logger.debug(f" cnk_beg_lst_pos uint64_t {chk_last_pos_beg:15,d}") + logger.debug(f" cnk_end_lst_pos uint64_t {chk_last_pos_end:15,d}") + + ref["bins" ][bin_n]["chunks"] = chunks_data + ref["bins_begin"] = chunks_data["chunk_begin"][ 0] + ref["bins_end" ] = chunks_data["chunk_begin"][-1] (n_intv,) = get_values(' 0 + assert ioff < 2**63 + + ioff_real = ioff >> 16 + ioff_bytes = ioff & BLOCK_BYTES_MASK + + assert ioff_real >= 0 + assert ioff_real < 2**48 + + assert ioff_bytes >= 0 + assert ioff_bytes < BLOCK_SIZE + + ioff_bin_pos, ioff_first_pos, ioff_last_pos = -1, -1 ,-1 + if ioff in position_memoize: + ioff_bin_pos, ioff_first_pos, ioff_last_pos = position_memoize[ioff] + else: + ioff_bin_pos, ioff_first_pos, ioff_last_pos = getPos(inf, ioff_real, ioff_bytes) + + ioffs_be[intv_n] = { + "real": ioff_real, + "bytes": ioff_bytes, + "bin_pos": ioff_bin_pos, + "first_pos": ioff_first_pos, + "last_pos": ioff_last_pos + } + + if getLogLevel() == "DEBUG": + logger.debug(f"======================== List of distinct intervals (n=n_intv [{n_intv:15,d}]) ================") + logger.debug(f" ioff File offset of the first record in the interval uint64_t [{intv_n+1:15,d}/{n_intv:15,d}]") + logger.debug(f" real File offset of the first record in the interval uint48_t {ioff_real:15,d}") + logger.debug(f" bytes 
File offset of the first record in the interval uint16_t {ioff_bytes:15,d}") + logger.debug(f" position 1st File offset of the first record in the interval uint64_t {ioff_first_pos:15,d}") + logger.debug(f" position bin File offset of the first record in the interval uint64_t {ioff_bin_pos:15,d}") + logger.debug(f" position lst File offset of the first record in the interval uint64_t {ioff_last_pos:15,d}") - assert all([i > 0 for i in ioffs]) - assert all([i < 2**63 for i in ioffs]) + ref["intvs"] = ioffs_be - ref["intvs"] = ioffs - if getLogLevel() == "DEBUG": - logger.debug(f"======================== List of distinct intervals (n=n_intv [{n_intv:15,d}]) ================") - for intv_n in range(n_intv): - # logger.debug(f" intv_n {intv_n+1:15,d}/{n_intv:15,d}") - ioff = ioffs[intv_n] - logger.debug(f" ioff File offset of the first record in the interval uint64_t {ioff:15,d} [{intv_n+1:15,d}/{n_intv:15,d}]") n_no_coor = None try: @@ -280,6 +445,7 @@ def read_tabix(infile): data["n_no_coor"] = n_no_coor fhd.close() + inf.close() logger.debug("finished reading") diff --git a/tests/annotated_tomato_150.100000.vcf.gz b/tests/annotated_tomato_150.100000.vcf.gz new file mode 100644 index 0000000..bc71a00 Binary files /dev/null and b/tests/annotated_tomato_150.100000.vcf.gz differ diff --git a/tests/run.sh b/tests/run.sh index 70025dd..d51a4e2 100644 --- a/tests/run.sh +++ b/tests/run.sh @@ -3,6 +3,6 @@ set -xeu rm -v *.tbj *.log || true rm -v __pycache__ || true -# pypy test.py annotated_tomato_150.100000.vcf.gz.tbi 2>&1 > annotated_tomato_150.100000.vcf.gz.tbj.log +pypy test.py annotated_tomato_150.100000.vcf.gz.tbi 2>&1 > annotated_tomato_150.100000.vcf.gz.tbj.log -pypy test.py annotated_tomato_150.vcf.bgz.tbi 2>&1 > annotated_tomato_150.vcf.bgz.tbj.log +# pypy test.py annotated_tomato_150.vcf.bgz.tbi 2>&1 > annotated_tomato_150.vcf.bgz.tbj.log diff --git a/tests/test.py b/tests/test.py index a6d673f..703e9e7 100644 --- a/tests/test.py +++ b/tests/test.py @@ -8,8 +8,74 @@ infile = sys.argv[1] ingz, inid = tabixpy.get_filenames(infile) - # tabixpy.logger.setLevel(tabixpy.logging.DEBUG) + tabixpy.logger.setLevel(tabixpy.logging.DEBUG) data = tabixpy.read_tabix(ingz) - tabixpy.save(data, ingz, compress=True) + tabixpy.save(data, ingz, compress=False) + + + # for p in range(999999999900,1000000000000): + # print(p, tabixpy.reg2bin(p, p+1), tabixpy.reg2bins(p, p+1)) + + # import gzip + # print("opening") + # with open(ingz, "rb") as inf: + # for cp, chrom in enumerate(data["names"]): + # print("chrom", chrom) + + # refs = data["refs"][cp] + + # n_intv = refs["n_intv"] + # print("n_intv", n_intv) + # intvs = refs["intvs"] + + # last_chunk = None + # for virtual_offset, chunk_real_begin, chunk_bytes_begin in intvs: + # print(virtual_offset, chunk_real_begin, chunk_bytes_begin) + # if last_chunk is not None: + # print(" block size", chunk_real_begin - last_chunk) + # last_chunk = chunk_real_begin + # # inf.seek(0, 0) + # inf.seek(chunk_real_begin, 0) + # with gzip.open(inf, 'rb') as g: + # b = g.read(tabixpy.BLOCK_SIZE) + # i = b[chunk_bytes_begin:chunk_bytes_begin+20] + # p = int(i.decode().split("\t")[1]) + # print(" i", i) + # print(" p", p) + + + # n_bin = refs["n_bin"] + # bins = refs["bins"] + # for bin_n, bind in enumerate(bins): + # print(" bin_n", bin_n) + # n_chunk = bind["n_chunk"] + # chunk_real_begins = bind["chunks"]["chunk_real_begin"] + # chunk_bytes_begins = bind["chunks"]["chunk_bytes_begin"] + # for chunk_n in range(n_chunk): + # print(" chunk_n", chunk_n) + # chunk_real_begin = 
chunk_real_begins[chunk_n] + # chunk_bytes_begin = chunk_bytes_begins[chunk_n] + # print(" chunk_real_begin ", chunk_real_begin) + # print(" chunk_bytes_begin", chunk_bytes_begin) + # inf.seek(0, 0) + # inf.seek(chunk_real_begin, 0) + # with gzip.open(inf, 'r') as g: + # b = g.read(tabixpy.BLOCK_SIZE) + # print(" b", b) + # i = b[chunk_bytes_begin:chunk_bytes_begin+20] + # print(" i", i) + + + # print("seeking") + # f.seek(7021611) + # # f.seek(4631) + # g=gzip.open(f) + # d=g.read(16 * 1024) + # # print("d", d) + # r=d[4631:4631+100] + # print(r) + # # for line in g: + # # print("line", line) + # # break \ No newline at end of file
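
For reference, the BGZF virtual-offset layout and the Tabix binning arithmetic quoted in the README hunks above can be exercised with a small standalone sketch. This is illustrative only and not part of the changeset: the helper names (make_virtual_offset, split_virtual_offset, bin_level, bin_size, bin_start) are invented for the example, the two constants mirror BLOCK_SIZE and BLOCK_BYTES_MASK added in tabixpy/tabixpy.py, and the sample offsets 7021611/4631 come from the commented-out seek in tests/test.py.

```
# Standalone sketch (assumed helpers, not tabixpy API) of the virtual-offset
# and binning formulas quoted from SAMv1.pdf and the Tabix paper above.

BLOCK_SIZE       = 2**16               # max uncompressed bytes per BGZF block
BLOCK_BYTES_MASK = 0x000000000000FFFF  # low 16 bits: offset within the block

def make_virtual_offset(coffset, uoffset):
    # A virtual file offset is coffset<<16 | uoffset (SAMv1.pdf, section 4.1.1).
    assert 0 <= uoffset < BLOCK_SIZE
    return coffset << 16 | uoffset

def split_virtual_offset(voffset):
    # Inverse: compressed (real) offset of the BGZF block and the byte offset
    # inside that block's uncompressed data.
    return voffset >> 16, voffset & BLOCK_BYTES_MASK

def bin_level(k):
    # l = floor(log2(7k+1)/3); bit_length() gives an exact integer floor(log2).
    return ((7 * k + 1).bit_length() - 1) // 3

def bin_size(l):
    # sl = 2^(29-3l) bp: 512 Mb at level 0 down to 16 kb at level 5.
    return 1 << (29 - 3 * l)

def bin_start(k):
    # Bin k covers [(k-ol)*sl, (k-ol+1)*sl) with ol = (2^(3l)-1)/7.
    l  = bin_level(k)
    ol = ((1 << (3 * l)) - 1) // 7
    return (k - ol) * bin_size(l)

if __name__ == "__main__":
    voffset = make_virtual_offset(7021611, 4631)
    print(split_virtual_offset(voffset))    # (7021611, 4631)
    print(bin_size(bin_level(0)))           # 536870912 -> bin 0 spans 512 Mb
    print(bin_level(4681), bin_start(4681)) # 5 0 -> first 16 kb bin
```

This is the same decomposition getPos() relies on: it seeks to the real (compressed) offset, decompresses one block, and slices at the in-block byte offset to recover genomic positions.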