diff --git a/pyfastani/_fastani.pyx b/pyfastani/_fastani.pyx index a2781c9..11ce28a 100644 --- a/pyfastani/_fastani.pyx +++ b/pyfastani/_fastani.pyx @@ -54,6 +54,7 @@ cdef bool isupper(const unsigned char[::1] seq): return False return True + cdef void upper(unsigned char[::1] seq): """Make the letters in a buffer upper case, in place. """ @@ -62,6 +63,7 @@ cdef void upper(unsigned char[::1] seq): if seq[i] >= b'a' and seq[i] <= b'z': seq[i] -= 32 + # --- Cython classes --------------------------------------------------------- cdef class Mapper: @@ -265,14 +267,16 @@ cdef class Mapper: self._sk.sequencesByFileInfo.push_back(self._counter) def add_draft(self, str name, object contigs): - """Add a reference genome to the sketcher. + """add_draft(name, contigs)\n-- + + Add a reference draft genome to the sketcher. Using this method is fine even when the genome has a single contig, although `Mapper.add_genome` is easier to use in that case. Arguments: name (`str`): The name of the genome to add. - sequence (`str` or `bytes`): The sequence of the genome. + contigs (iterable of `str` or `bytes`): The contigs of the genome. Note: Contigs smaller than the window size and the k-mer size will @@ -283,7 +287,9 @@ cdef class Mapper: self._add_draft(name.encode("utf-8"), contigs) def add_genome(self, str name, object sequence): - """Add a reference genome to the sketcher. + """add_genome(name, sequence)\n-- + + Add a reference genome to the sketcher. This method is a shortcut for `Mapper.add_draft` when a genome is complete (i.e. only contains a single contig). @@ -337,22 +343,19 @@ cdef class Mapper: const unsigned char[::1] seq, int min_read_length, Map_t* map, - MappingResultsVector_t* l2_mappings + kseq_t* kseq, + ofstream* out, + MappingResultsVector_t* mappings ) nogil: - - cdef kseq_t kseq_buffer - cdef ofstream out = ofstream() - cdef QueryMetaData_t[kseq_ptr_t, Map_t.MinVec_Type] query - query.kseq = &kseq_buffer + query.kseq = kseq query.kseq.seq.s = &seq[i * min_read_length] query.kseq.seq.l = query.kseq.seq.m = min_read_length query.seqCounter = seq_counter + i + map.mapSingleQuerySeq[QueryMetaData_t[kseq_ptr_t, Map_t.MinVec_Type]](query, mappings[0], out[0]) - map.mapSingleQuerySeq[QueryMetaData_t[kseq_ptr_t, Map_t.MinVec_Type]](query, l2_mappings[0], out) - - cdef object _query_genome(self, object sequence): - """Query the sketcher for the given sequence. + cdef object _query_draft(self, object contigs): + """Query the sketcher for the given contigs. Adapted from the ``skch::Map::mapQuery`` method in ``computeMap.hpp``. @@ -364,67 +367,80 @@ cdef class Mapper: """ assert self._sk != nullptr - cdef int i - cdef int fragment_count + cdef int i # fragment counter + cdef kseq_t kseq # cdef Map_t* map - cdef MappingResultsVector_t l2_mappings + cdef MappingResultsVector_t final_mappings cdef vector[CGI_Results] results + cdef object contig cdef const unsigned char[::1] seq - cdef MappingResultsVector_t final_mappings cdef uint64_t seql + cdef int fragment_count = 0 + cdef int seq_counter = 0 cdef uint64_t total_fragments = 0 cdef Parameters_t p = self._param cdef list hits = [] + cdef ofstream out = ofstream() - # make sure the sequence is bytes - if isinstance(sequence, str): - sequence = sequence.encode("ascii") - # make sure the sequence is uppercase, otherwise a `makeUpperCase` - # is going to write in our read-only buffer in `addMinimizers` - if not isupper(sequence): - warnings.warn( - UserWarning, - "Sequence contains lowercase characters, reallocating." - ) - sequence = bytearray(sequence) - upper(sequence) - - # get a memory view of the sequence - seq = sequence - seql = seq.shape[0] - - # query if the sequence is large enough - if seql >= p.windowSize and seql >= p.kmerSize and seql >= p.minReadLength: - # create a new mapper with the given mapping result vector - map = new Map_t(p, self._sk[0], total_fragments, 0) - # compute the expected number of blocks - fragment_count = seql // p.minReadLength - total_fragments = fragment_count - # map the blocks - for i in range(fragment_count): - l2_mappings.clear() - self._query_fragment(i, 0, seq, p.minReadLength, map, &l2_mappings) - for m in l2_mappings: - final_mappings.push_back(m) - # compute core genomic identity after successful mapping - computeCGI( - p, - final_mappings, - map[0], - self._sk[0], - total_fragments, # total query fragments, here equal to fragment count - 0, # queryFileNo, only used for visualization, ignored - string(), # fileName, only used for reporting, ignored - results, - ) - # free the map - del map - else: - warnings.warn(UserWarning, ( - "Mapper received a short sequence relative to parameters, " - "mapping will not be computed." - )) + # create a new mapper with the given mapping result vector + map = new Map_t(p, self._sk[0], total_fragments, 0) + + # iterate over contigs + for contig in contigs: + # make sure the sequence is bytes + if isinstance(contig, str): + contig = contig.encode("ascii") + # make sure the sequence is uppercase, otherwise a `makeUpperCase` + # is going to write in our read-only buffer in `addMinimizers` + if not isupper(contig): + warnings.warn( + UserWarning, + "Sequence contains lowercase characters, reallocating." + ) + contig = bytearray(contig) + upper(contig) + # get a memory view of the sequence + seq = contig + seql = seq.shape[0] + + # query if the sequence is large enough + if seql >= p.windowSize and seql >= p.kmerSize and seql >= p.minReadLength: + # compute the expected number of blocks + fragment_count = seql // p.minReadLength + # map the blocks + for i in range(fragment_count): + self._query_fragment( + i, + total_fragments, + seq, + p.minReadLength, + map, + &kseq, + &out, + &final_mappings + ) + # record the number of fragments + total_fragments += fragment_count + else: + fragmentCount = 0 + warnings.warn(UserWarning, ( + "Mapper received a short sequence relative to parameters, " + "mapping will not be computed." + )) + # compute core genomic identity after successful mapping + computeCGI( + p, + final_mappings, + map[0], + self._sk[0], + total_fragments, # total query fragments + 0, # queryFileNo, only used for visualization, ignored + string(), # fileName, only used for reporting, ignored + results, + ) + # free the map + del map # build and return the list of hits for res in results: assert res.refGenomeId < self._lengths.size() @@ -440,6 +456,27 @@ cdef class Mapper: )) return hits + def query_draft(self, object contigs): + """query_draft(self, contigs)\n-- + + Query the mapper for a complete genome. + + Arguments: + contigs (iterable or `str` or `bytes`): The genome to query the mapper + with. + + Note: + Sequence must be larger than the window size, the k-mer size, + and the fragment length to be mapped, otherwise an empty list + of hits will be returned. + + Returns: + `list` of `~pyfastani.Hit`: The hits found for the query. + + """ + # delegate to C code + return self._query_draft(contigs) + def query_genome(self, object sequence): """query_genome(self, sequence)\n-- @@ -458,10 +495,8 @@ cdef class Mapper: `list` of `~pyfastani.Hit`: The hits found for the query. """ - # check if bytes - # delegate to C code - return self._query_genome(sequence) + return self._query_draft((sequence,)) cdef class Hit: diff --git a/pyfastani/tests/test_mapper.py b/pyfastani/tests/test_mapper.py index 2e08279..a57f93f 100644 --- a/pyfastani/tests/test_mapper.py +++ b/pyfastani/tests/test_mapper.py @@ -19,7 +19,20 @@ def test_fastani_example(self): # $ ./fastANI -q data/Shigella_flexneri_2a_01.fna -r data/Escherichia_coli_str_K12_MG1655.fna # data/Shigella_flexneri_2a_01.fna data/Escherichia_coli_str_K12_MG1655.fna 97.7507 1303 1608 - pass + mapper = pyfastani.Mapper() + + ref = self._load_fasta(os.path.join(FASTANI_PATH, "data", "Escherichia_coli_str_K12_MG1655.fna")) + mapper.add_genome("Escherichia_coli_str_K12_MG1655", self._get_sequence(ref[0])) + mapper.index() + + contigs = self._load_fasta(os.path.join(FASTANI_PATH, "data", "Shigella_flexneri_2a_01.fna")) + hits = mapper.query_draft(map(self._get_sequence, contigs)) + + self.assertEqual(len(hits), 1) + self.assertEqual(hits[0].name, "Escherichia_coli_str_K12_MG1655") + self.assertEqual(hits[0].matches, 1303) + self.assertEqual(hits[0].fragments, 1608) + self.assertAlmostEqual(hits[0].identity, 97.7507, places=4) def test_fastani_example_reversed(self): # Same as the FastANI README example, but swapping query and reference