Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
mihaitodor committed Oct 5, 2023
1 parent 9d1b253 commit e8b639a
Showing 1 changed file with 19 additions and 38 deletions.
57 changes: 19 additions & 38 deletions utilities/SPDI_Normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from math import ceil
from pathlib import Path
from threading import Lock
from memory_profiler import profile

MAX_REFSEQ_CACHE_SIZE = 300 * 1024 * 1024 # 450MB
MAX_REFSEQ_CACHE_SIZE = 200 * 1024 * 1024 # 450MB


def hex2Code(code):
def hex_to_code(code):
"""
Convert a 4 byte integer to a sequence code. See `code2Hex()` for details
"""
Expand All @@ -31,6 +32,11 @@ def hex2Code(code):
raise NotImplementedError(f"unsupported code '{code}'")


def refseq_length_to_size(length):
# Each byte contains at most 2 codes
return ceil(length / 2)


class RefSeq:
def __init__(self, refSeq, length):
"""
Expand All @@ -42,7 +48,7 @@ def __init__(self, refSeq, length):
self.__length = length

# Size in bytes
self.size = ceil(length / 2)
self.size = refseq_length_to_size(length)

def __getitem__(self, subscript):
if isinstance(subscript, slice):
Expand All @@ -54,41 +60,12 @@ def __getitem__(self, subscript):
byte = self.__packedRefSeq[subscript // 2]

if subscript % 2 == 0:
return hex2Code((byte >> 4) & 0xF)

return hex2Code(byte & 0xF)


class RefSeqCache:
def __init__(self):
self.cache = {}
self.size = 0

def __getKey(self, build, refSeqName):
return f'{build}/{refSeqName}'

def contains(self, build, refSeqName):
return self.__getKey(build, refSeqName) in self.cache
return hex_to_code((byte >> 4) & 0xF)

def get(self, build, refSeqName):
return self.cache[self.__getKey(build, refSeqName)]
return hex_to_code(byte & 0xF)

def add(self, build, refSeqName, refSeq):
"""
Evacuates enough items from the cache until we have room for the new one and then adds it to the cache.
"""

while self.size + refSeq.size > MAX_REFSEQ_CACHE_SIZE:
key, cachedRefSeq = next(iter(self.cache.items()))
self.size -= cachedRefSeq.size
del self.cache[key]

self.cache[self.__getKey(build, refSeqName)] = refSeq
self.size += refSeq.size


refSeqCache = RefSeqCache()
refSeqCacheLock = Lock()
refSeqLock = Lock()


def get_ref_seq(build, refSeqName):
Expand All @@ -100,9 +77,13 @@ def get_ref_seq(build, refSeqName):
raise ValueError(f'failed to find expected refseq file for build "{build}" and RefSeq "{refSeqName}" ')

refSeqFile = foundFiles[0]
refSeqLength = Path(refSeqFile).stem.rpartition('_')[-1]
refSeqLength = int(Path(refSeqFile).stem.rpartition('_')[-1])

# Make room for this refseq in the cache
refSeqCache.reserve_space(refSeqLength)

with open(refSeqFile, mode='rb') as file:
refSeq = RefSeq(file.read(), int(refSeqLength))
refSeq = RefSeq(file.read(), refSeqLength)
except Exception as err:
print(f"failed to read refseq file: {err=}, {type(err)=}")
raise
Expand All @@ -112,7 +93,7 @@ def get_ref_seq(build, refSeqName):

def get_normalized_spdi(ref_seq, pos, ref, alt, build):
# Need to serialise this if we can't keep all the ref seq data in memory
with refSeqCacheLock:
with refSeqLock:
return get_normalized_spdi_impl(ref_seq, pos, ref, alt, build)


Expand Down

0 comments on commit e8b639a

Please sign in to comment.