Skip to content

Commit

Permalink
workaround problem which can happen when read starts are found off th…
Browse files Browse the repository at this point in the history
…e end of a sequence
  • Loading branch information
sbastkowski committed Jan 7, 2025
1 parent 6a73c0b commit f1feb78
Showing 1 changed file with 20 additions and 5 deletions.
25 changes: 20 additions & 5 deletions quatradis/tisp/generator/from_alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,38 @@ def number_of_reads(self, read_coord, direction):
return 0

# Prints out the current coordinate counts for both strands. Automatically applies any necessary padding to file
# prior to this read coord in necessary
# prior to this read coord if necessary
def print_coord(self, read_coord):
padding_string = self.create_padding_string(read_coord)
forward_reads = self.number_of_reads(read_coord, 1)
reverse_reads = self.number_of_reads(read_coord, -1)
self.output_file_handle.write(padding_string + str(forward_reads) + " " + str(reverse_reads) + "\n")

def process_read_starts(self):
"""
Actually creates the output plot file by processing all read start positions in order. Before writing the
count at the current position it pads any skipped positions with '0 0'.
There is a hack implemented here to avoid accidentally creating plot files that are too large.
Any reads that start off the end of the reference, which can happen from alignments to circular sequences,
are ignored. We just count how often we are ignoring them and return that value along with the number of
insertion sites found in this reference sequence.
"""
nb_unique_insertion_sites = 0
nb_skipped = 0
for read_coord in sorted(self.read_starts.keys()):
if read_coord > self.sequence_length:
nb_skipped += 1
continue
self.print_coord(read_coord)
nb_unique_insertion_sites += 1

# Finish up this sequence with padding then close
# Finish up this sequence with any required padding then close the file
padding_string = self.create_padding_string(self.sequence_length+1)
self.output_file_handle.write(padding_string)
self.output_file_handle.close()

return nb_unique_insertion_sites
return nb_unique_insertion_sites, nb_skipped


def op_is_aligned(op):
Expand Down Expand Up @@ -137,15 +150,17 @@ def create_plot_files(mapped_reads, plot_out_prefix="tradis.plot", cutoff_score=
uis_map = {}
is_plotters, nb_mapped, nb_mapped_and_good = count_read_starts(mapped_reads, cutoff_score, plot_out_prefix)
for sequence_name in sorted(is_plotters.keys()):
nb_unique_insertion_sites = is_plotters[sequence_name].process_read_starts()
nb_unique_insertion_sites, nb_skipped = is_plotters[sequence_name].process_read_starts()
uis_map[sequence_name] = nb_unique_insertion_sites
if nb_skipped > 0:
print("Skipped", nb_skipped, "insertion sites which were found off the end of the reference sequence:", sequence_name)

return nb_mapped, nb_mapped_and_good, uis_map


def get_number_reads(seq_file_in):

# If running on mac take form stdin
# If running on mac take from stdin
joiner = " < " if sys.platform == "darwin" else " "

# Make sure we can handle gzipped or non-gzipped fastq input
Expand Down

0 comments on commit f1feb78

Please sign in to comment.