Fix order of pairs when saving dups together with nodups, ensure correct headers
Phlya committed Mar 20, 2024
1 parent 5b45f29 commit 01efdf2
Showing 2 changed files with 41 additions and 20 deletions.
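The gist of the fix: when --output-dups points at the same file as the main output, duplicates and unique pairs used to be written in separate passes per chunk, so the combined file no longer followed the sort order of the input. The change writes all mapped pairs of a chunk in a single pass whenever the two streams coincide. A minimal, self-contained sketch of that idea (toy column names and masks, not the real .pairs schema):

import io
import pandas as pd

# Toy chunk of mapped pairs, already sorted by position;
# "duplicate" marks the rows flagged by dedup. Illustrative columns only.
df_chunk = pd.DataFrame(
    {
        "readID": ["r1", "r2", "r3", "r4"],
        "pos1": [100, 150, 150, 300],
        "duplicate": [False, True, False, False],
    }
)
mask_duplicates = df_chunk["duplicate"]

out = io.StringIO()
dups_out = out  # --output-dups points at the main output

if dups_out is out:
    # One pass keeps the rows in their original (sorted) order.
    df_chunk.to_csv(out, sep="\t", index=False, header=False)
else:
    # Separate passes are fine when dups go to their own file.
    df_chunk.loc[mask_duplicates].to_csv(dups_out, sep="\t", index=False, header=False)
    df_chunk.loc[~mask_duplicates].to_csv(out, sep="\t", index=False, header=False)

print(out.getvalue())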
16 changes: 9 additions & 7 deletions pairtools/cli/dedup.py
@@ -104,7 +104,7 @@
     " It is available for backwards compatibility and to allow specification of the"
     " column order."
     " Now the default scipy backend is generally the fastest, and with chunksize below"
-    " 1 mln has the lowest memory requirements. [dedup option]"
+    " 1 mln has the lowest memory requirements. [dedup option]",
     # " 'cython' is deprecated and provided for backwards compatibility",
 )

@@ -486,12 +486,14 @@ def dedup_py(
             "Pairs file appears not to be sorted, dedup might produce wrong results."
         )
     header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
+    dups_header = header.copy()
+    if keep_parent_id and len(dups_header) > 0:
+        dups_header = headerops.append_columns(dups_header, ["parent_readID"])
+    if outstream == outstream_dups:
+        header = dups_header
     if send_header_to_dedup:
         outstream.writelines((l + "\n" for l in header))
     if send_header_to_dup and outstream_dups and (outstream_dups != outstream):
-        dups_header = header
-        if keep_parent_id and len(dups_header) > 0:
-            dups_header = headerops.append_columns(dups_header, ["parent_readID"])
         outstream_dups.writelines((l + "\n" for l in dups_header))
     if (
         outstream_unmapped
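The header block above now builds dups_header up front and, when the dup stream is the same object as the main output, promotes it to the main header, so parent_readID is declared exactly once and only when it will actually appear in the data. A hedged sketch of that rule with plain lists standing in for pairtools' headerops helpers (the #columns line format is simplified):

# Plain-list stand-in for the header logic; not the real headerops API.
header = ["## pairs format v1.0.0", "#columns: readID chrom1 pos1 chrom2 pos2"]
keep_parent_id = True
same_stream = True  # dups are written into the main output

dups_header = list(header)
if keep_parent_id and dups_header:
    # The dup stream carries an extra column naming the retained "parent" read.
    dups_header[-1] = dups_header[-1] + " parent_readID"
if same_stream:
    # Everything lands in one file, so the main header must declare the column too.
    header = dups_header

print("\n".join(header))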
@@ -576,9 +578,9 @@ def dedup_py(
         out_stat.save(
             out_stats_stream,
             yaml=kwargs.get("yaml", False),  # format as yaml
-            filter=first_filter_name
-            if not kwargs.get("yaml", False)
-            else None,  # output only the first filter if non-YAML output
+            filter=(
+                first_filter_name if not kwargs.get("yaml", False) else None
+            ),  # output only the first filter if non-YAML output
         )

     if bytile_dups:
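The filter argument itself is unchanged here, only re-wrapped: plain-text stats can describe a single filter, so the first one is passed explicitly, while YAML output (filter=None) keeps them all. A rough stand-in for that rule, not the real out_stat.save API:

# Hypothetical helper illustrating the filter-selection rule above.
def select_filters(stats_by_filter, as_yaml):
    if as_yaml:
        return stats_by_filter              # YAML output can hold every filter
    first = next(iter(stats_by_filter))     # plain text keeps only the first one
    return {first: stats_by_filter[first]}

print(select_filters({"no_filter": {"total": 10}, "mapq_30": {"total": 7}}, as_yaml=False))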
45 changes: 32 additions & 13 deletions pairtools/lib/dedup.py
@@ -89,15 +89,32 @@ def streaming_dedup(
         # Clean up dataframe:
         df_chunk = df_chunk.drop(columns=["duplicate"])

-        # Stream the dups:
-        if outstream_dups:
-            df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv(
-                outstream_dups, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+        # Stream the pairs:
+        # If outstream_dups is the same as outstream, we save all mapped pairs to the same file
+
+        if outstream_dups == outstream:
+            df_chunk.loc[mask_mapped, :].to_csv(
+                outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
             )
+        else:
+            # Save the dups:
+            if outstream_dups:
+                df_chunk.loc[mask_duplicates, :].to_csv(
+                    outstream_dups,
+                    index=False,
+                    header=False,
+                    sep="\t",
+                    quoting=QUOTE_NONE,
+                )
+            # Drop readID if it was created (not needed for nodup and unmapped pairs):
+            if keep_parent_id:
+                df_chunk = df_chunk.drop(columns=["parent_readID"])

-        # Drop readID if it was created (not needed for nodup and unmapped pairs):
-        if keep_parent_id:
-            df_chunk = df_chunk.drop(columns=["parent_readID"])
+            # Save unique:
+            if outstream:
+                df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
+                    outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+                )

         # Stream unmapped:
         if outstream_unmapped:
@@ -109,11 +126,6 @@
                 quoting=QUOTE_NONE,
             )

-        # Stream unique pairs:
-        df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
-            outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
-        )
-
     t1 = time.time()
     t = t1 - t0
     logger.debug(f"total time: {t}")
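One side effect of the restructuring: parent_readID is only dropped in the separate-stream branch, so when duplicates and non-duplicates share one file every row keeps the column, which is why the main header gains parent_readID in the cli change above. A small illustrative sketch of the separate-stream case (toy columns, not the real schema):

import io
import pandas as pd

# Toy chunk: parent_readID names the retained copy for each flagged duplicate.
df_chunk = pd.DataFrame(
    {
        "readID": ["r1", "r2", "r3"],
        "duplicate": [False, True, False],
        "parent_readID": ["", "r1", ""],
    }
)
dups_out, nodups_out = io.StringIO(), io.StringIO()

# Dups keep parent_readID; unique pairs drop it before being written.
df_chunk.loc[df_chunk["duplicate"]].to_csv(
    dups_out, sep="\t", index=False, header=False
)
df_chunk.loc[~df_chunk["duplicate"]].drop(columns=["parent_readID"]).to_csv(
    nodups_out, sep="\t", index=False, header=False
)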
@@ -144,7 +156,14 @@ def _dedup_stream(
     unmapped_chrom,
 ):
     # Stream the input dataframe:
-    dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)
+    dfs = pd.read_table(
+        in_stream,
+        comment=None,
+        names=colnames,
+        chunksize=chunksize,
+        dtype=pairsam_format.DTYPES_PAIRSAM,
+        sep="\t",
+    )

     # Set up the carryover dataframe:
     df_prev_nodups = pd.DataFrame([])
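Passing an explicit dtype map to the chunked reader pins the column types up front instead of letting pandas re-infer them for every chunk (inference can, for instance, read a chromosome column as integers in one chunk and strings in another). A self-contained sketch of the same pattern; the dtype map here is made up, the real one is pairsam_format.DTYPES_PAIRSAM:

import io
import pandas as pd

toy_pairs = "r1\tchr1\t100\tchr2\t200\n" "r2\t17\t150\t17\t300\n"
colnames = ["readID", "chrom1", "pos1", "chrom2", "pos2"]
# Illustrative dtype map standing in for pairsam_format.DTYPES_PAIRSAM.
dtypes = {"readID": str, "chrom1": str, "pos1": "int64", "chrom2": str, "pos2": "int64"}

chunks = pd.read_table(
    io.StringIO(toy_pairs),
    comment=None,
    names=colnames,
    chunksize=1,
    dtype=dtypes,
    sep="\t",
)
for chunk in chunks:
    # chrom1/chrom2 stay strings even in a chunk that contains only numeric names.
    print(chunk.dtypes["chrom1"], chunk["chrom1"].tolist())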