Skip to content

Commit

Permalink
Fixing API incomplete functionality for read side detection (#251)
Browse files Browse the repository at this point in the history
* forgotten `push_pysam` fully replaced by `group_alignments_by_side` (read side shall be determined through flag and not through sam entry attributes)

* Return `--readid-transform` functionality broken after latest release.

* fix pair expand with parse2 in `--single-end` mode

* single-end and expand tests added. fixed issue with empty alignment at R2 in single-end mode that was emerging together with `--expand`.

* test files for `parse2` added
  • Loading branch information
agalitsyna authored Oct 22, 2024
1 parent 6303de6 commit ae544ff
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 103 deletions.
4 changes: 2 additions & 2 deletions pairtools/cli/parse2.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@
help="""Reported position of alignments in pairs of complex walks (pos columns).
Each alignment in .bam/.sam Hi-C-like data has two ends, and you can report one or another depending of the position of alignment on a read or in a pair.
"junction" - inner ends of sequential alignments in each pair, aka ligation junctions (complex walks default),
"junction" - inner ends of sequential alignments in each pair, aka ligation junctions,
"read" - 5'-end of alignments relative to R1 or R2 read coordinate system (as in traditional Hi-C),
"walk" - 5'-end of alignments relative to the whole walk coordinate system,
"outer" - outer ends of sequential alignments in each pair. """,
"outer" - outer ends of sequential alignments in each pair (parse2 default). """,
)
@click.option(
"--report-orientation",
Expand Down
203 changes: 102 additions & 101 deletions pairtools/lib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
each sam entry is in fact special AlignedSegmentPairtoolized Cython object
that has alignment attributes and can be easily accessed from Python.
Sam entries are gathered into reads by `push_pysam` function.
Sam entries are gathered into reads by `group_alignments_by_side` function.
2. **read** is a collection of sam entries corresponding to a single Hi-C molecule.
It is represented by three variables:
Expand Down Expand Up @@ -37,36 +37,6 @@
from . import pairsam_format
from .parse_pysam import get_mismatches_c


def group_alignments_by_side(sams):
return [sam for sam in sams if sam.is_read1], [sam for sam in sams if sam.is_read2]


def read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True):
sams = []

prev_readID = None
while True:
sam_entry = next(instream, None)
readID = sam_entry.query_name if sam_entry else None

# Read is fully populated, then parse and write:
if not (sam_entry) or ((readID != prev_readID) and prev_readID):
if sort:
sams = sorted(sams, key=lambda a: (a.is_read2, a.query_alignment_start))
out = sams if not group_by_side else group_alignments_by_side(sams)
out = out if not return_readID else (prev_readID, out)
yield out

sams.clear()

if sam_entry is None:
break
else:
sams.append(sam_entry)
prev_readID = readID


def streaming_classify(
instream, outstream, chromosomes, out_alignments_stream, out_stat, **kwargs
):
Expand Down Expand Up @@ -124,9 +94,7 @@ def streaming_classify(

### Iterate over input pysam:
instream = iter(instream)
for (readID, (sams1, sams2)) in read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True):
if readID_transform is not None:
readID = eval(readID_transform)
for (readID, (sams1, sams2)) in read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True, readID_transform=readID_transform):

### Parse
if not parse2: # regular parser:
Expand Down Expand Up @@ -216,17 +184,6 @@ def streaming_classify(
### Alignment utilities: ###
############################


def push_pysam(sam_entry, sams1, sams2):
"""Parse pysam AlignedSegment (sam) into pairtools sams entry"""
flag = sam_entry.flag
if (flag & 0x40) != 0:
sams1.append(sam_entry) # left read, or first read in a pair
else:
sams2.append(sam_entry) # right read, or mate pair
return


def empty_alignment():
return {
"chrom": pairsam_format.UNMAPPED_CHROM,
Expand All @@ -251,6 +208,45 @@ def empty_alignment():
"mismatches": "",
}

def group_alignments_by_side(sams):
"""Group pysam AlignedSegments (sams) into left-read (R1) and right-read (R2) sam entries"""

sams1 = []
sams2 = []
for sam_entry in sams:
flag = sam_entry.flag
if (flag & 0x40) != 0:
sams1.append(sam_entry) # left read, or first read in a pair
else:
sams2.append(sam_entry) # right read, or mate pair
return sams1, sams2


def read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True, readID_transform=None):
sams = []

prev_readID = None
while True:
sam_entry = next(instream, None)
readID = sam_entry.query_name if sam_entry else None
if readID_transform is not None and readID is not None:
readID = eval(readID_transform)

# Read is fully populated, then parse and write:
if not (sam_entry) or ((readID != prev_readID) and prev_readID):
if sort:
sams = sorted(sams, key=lambda a: (a.is_read2, a.query_alignment_start))
out = sams if not group_by_side else group_alignments_by_side(sams)
out = out if not return_readID else (prev_readID, out)
yield out

sams.clear()

if sam_entry is None:
break
else:
sams.append(sam_entry)
prev_readID = readID

def parse_pysam_entry(
sam,
Expand Down Expand Up @@ -672,7 +668,7 @@ def parse2_read(
]

algns1 = normalize_alignment_list(algns1, 1, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap)
algns2 = [empty_alignment()] # Empty alignment dummy
algns2 = [] # Empty alignment dummy

if len(algns1) > 1:
# Look for ligation pair, and report linear alignments after deduplication of complex walks:
Expand All @@ -684,6 +680,8 @@ def parse2_read(
report_position,
report_orientation,
dedup_max_mismatch,
expand,
max_expansion_depth,
)
output = [x for x in output if x[-1][-1] != "R1-2"]
return (output, algns1, algns2)
Expand Down Expand Up @@ -893,11 +891,11 @@ def parse_complex_walk(
**Intramolecular deduplication**
Forward read (left): right read (right):
Forward read (left): right read (right):
5'------------------------->3' 3'<--------------------------5'
algns1 algns2
algns1 algns2
<5---3><5---3><5---3><5---3> <3---5><3---5><3---5><3---5>
l0 l1 l2 l3 r3 r2 r1 r0
l0 l1 l2 l3 r3 r2 r1 r0
Alignment - bwa mem reported hit or alignment after gaps conversion.
Left and right alignments (algns1: [l0, l1, l2, l3], algns2: [r0, r1, r2, r3])
Expand Down Expand Up @@ -931,8 +929,8 @@ def parse_complex_walk(
If comparison is successful, go to 6.
6. Verify.
Check that downstream pairs on the left read overlap with the upstream pairs on the right read.
If yes, exit.
If not, we do not have an overlap, go to step 3.
If yes, exit.
If not, we do not have an overlap, go to step 3.
"""

AVAILABLE_REPORT_POSITION = ["outer", "junction", "read", "walk"]
Expand Down Expand Up @@ -1009,66 +1007,70 @@ def parse_complex_walk(
if not is_overlap:
current_right_pair = 1

# II. Search of partial overlap if there are less than 2 alignments at either sides, or no overlaps found
if current_right_pair == 1:
last_reported_alignment_left = last_reported_alignment_right = 1
if partial_overlap(
algns1[-1],
algns2[-1],
max_insert_size=max_insert_size,
dedup_max_mismatch=dedup_max_mismatch,
):
if (
n_algns1 >= 2
): # single alignment on right read and multiple alignments on left
pair_index = (len(algns1) - 1, "R1")
output_pairs.append(
format_pair(
algns1[-2],
algns1[-1],
pair_index=pair_index,
algn2_pos3=algns2[-1]["pos5"],
report_position=report_position,
report_orientation=report_orientation,
if (n_algns2 == 0):
last_reported_alignment_left = 1
last_reported_alignment_right = 0
else:
# II. Search of partial overlap if there are less than 2 alignments at either sides, or no overlaps found
if (current_right_pair == 1):
last_reported_alignment_left = last_reported_alignment_right = 1
if partial_overlap(
algns1[-1],
algns2[-1],
max_insert_size=max_insert_size,
dedup_max_mismatch=dedup_max_mismatch,
):
if (
n_algns1 >= 2
): # single alignment on right read and multiple alignments on left
pair_index = (len(algns1) - 1, "R1")
output_pairs.append(
format_pair(
algns1[-2],
algns1[-1],
pair_index=pair_index,
algn2_pos3=algns2[-1]["pos5"],
report_position=report_position,
report_orientation=report_orientation,
)
)
)
last_reported_alignment_left = 2 # set the pointer for reporting
last_reported_alignment_left = 2 # set the pointer for reporting

if (
n_algns2 >= 2
): # single alignment on left read and multiple alignments on right
pair_index = (len(algns1), "R2")
output_pairs.append(
format_pair(
algns2[-1],
algns2[-2],
pair_index=pair_index,
algn1_pos3=algns1[-1]["pos5"],
report_position=report_position,
report_orientation=report_orientation,
)
)
last_reported_alignment_right = 2 # set the pointer for reporting

# Note that if n_algns1==n_algns2==1 and alignments overlap, then we don't need to check,
# it's a non-ligated DNA fragment that we don't report.

if (
n_algns2 >= 2
): # single alignment on left read and multiple alignments on right
pair_index = (len(algns1), "R2")
else: # end alignments do not overlap, report regular pair:
pair_index = (len(algns1), "R1-2")
output_pairs.append(
format_pair(
algns1[-1],
algns2[-1],
algns2[-2],
pair_index=pair_index,
algn1_pos3=algns1[-1]["pos5"],
report_position=report_position,
report_orientation=report_orientation,
)
)
last_reported_alignment_right = 2 # set the pointer for reporting

# Note that if n_algns1==n_algns2==1 and alignments overlap, then we don't need to check,
# it's a non-ligated DNA fragment that we don't report.

else: # end alignments do not overlap, report regular pair:
pair_index = (len(algns1), "R1-2")
output_pairs.append(
format_pair(
algns1[-1],
algns2[-1],
pair_index=pair_index,
report_position=report_position,
report_orientation=report_orientation,
)
)

else: # there was an overlap, set some pointers:
last_reported_alignment_left = (
last_reported_alignment_right
) = current_right_pair
else: # there was an overlap, set some pointers:
last_reported_alignment_left = (
last_reported_alignment_right
) = current_right_pair

# III. Report all remaining alignments.
# Report all unique alignments on left read (sequential):
Expand Down Expand Up @@ -1148,7 +1150,6 @@ def expand_pairs(pairs_list, max_expansion_depth=None):
list of expanded pairs
"""

for algn1, _algn1, pair_index1 in pairs_list:
for _algn2, algn2, pair_index2 in pairs_list:
if pair_index1 > pair_index2:
Expand Down
11 changes: 11 additions & 0 deletions tests/data/mock.parse2-single-end.expand.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
@SQ SN:chr1 LN:10000
@SQ SN:chr2 LN:10000
@PG ID:mock PN:mock VN:0.0.0 CL:mock
readid01 0 chr1 10 60 50M chr1 200 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1
readid01 0 chr1 200 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1
readid02 0 chr1 10 60 50M chr1 200 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,200,chr1,500,+,+,UU,2,R1
readid02 0 chr1 200 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,200,chr1,500,+,+,UU,2,R1
readid02 16 chr1 500 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,200,chr1,500,+,+,UU,2,R1
readid03 0 chr1 10 60 50M chr1 200 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,200,+,+,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,249,chr1,500,-,+,UU,2,R1
readid03 16 chr1 200 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,200,+,+,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,249,chr1,500,-,+,UU,2,R1
readid03 16 chr1 500 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,200,+,+,UU,1,R1|chr1,10,chr1,500,+,+,UU,1,E1_R1|chr1,249,chr1,500,-,+,UU,2,R1
8 changes: 8 additions & 0 deletions tests/data/mock.parse2-single-end.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@SQ SN:chr1 LN:10000
@SQ SN:chr2 LN:10000
@PG ID:mock PN:mock VN:0.0.0 CL:mock
readid01 0 chr1 10 60 50M chr1 200 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1
readid01 0 chr1 200 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1
readid02 0 chr1 10 60 50M chr1 200 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,200,chr1,500,+,+,UU,2,R1
readid02 0 chr1 200 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,200,chr1,500,+,+,UU,2,R1
readid02 16 chr1 500 60 50M chr1 10 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 NM:i:0 CT:Z:SIMULATED:chr1,10,chr1,249,+,-,UU,1,R1|chr1,200,chr1,500,+,+,UU,2,R1
Loading

0 comments on commit ae544ff

Please sign in to comment.