From 01efdf20fe41135825f70189c1b0b3d2ff06c049 Mon Sep 17 00:00:00 2001
From: Phlya
Date: Wed, 20 Mar 2024 14:27:16 +0100
Subject: [PATCH] Fix order of pairs when saving dups together with nodups,
 ensure correct headers

---
 pairtools/cli/dedup.py | 16 +++++++++-------
 pairtools/lib/dedup.py | 45 ++++++++++++++++++++++++++++++-------------
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py
index 3a1fe50..5dbe6fd 100644
--- a/pairtools/cli/dedup.py
+++ b/pairtools/cli/dedup.py
@@ -104,7 +104,7 @@
     " It is available for backwards compatibility and to allow specification of the"
     " column order."
     " Now the default scipy backend is generally the fastest, and with chunksize below"
-    " 1 mln has the lowest memory requirements. [dedup option]"
+    " 1 mln has the lowest memory requirements. [dedup option]",
     #    " 'cython' is deprecated and provided for backwards compatibility",
 )
 
@@ -486,12 +486,14 @@ def dedup_py(
             "Pairs file appears not to be sorted, dedup might produce wrong results."
         )
     header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
+    dups_header = header.copy()
+    if keep_parent_id and len(dups_header) > 0:
+        dups_header = headerops.append_columns(dups_header, ["parent_readID"])
+    if outstream == outstream_dups:
+        header = dups_header
     if send_header_to_dedup:
         outstream.writelines((l + "\n" for l in header))
     if send_header_to_dup and outstream_dups and (outstream_dups != outstream):
-        dups_header = header
-        if keep_parent_id and len(dups_header) > 0:
-            dups_header = headerops.append_columns(dups_header, ["parent_readID"])
         outstream_dups.writelines((l + "\n" for l in dups_header))
     if (
         outstream_unmapped
@@ -576,9 +578,9 @@ def dedup_py(
                 out_stat.save(
                     out_stats_stream,
                     yaml=kwargs.get("yaml", False),  # format as yaml
-                    filter=first_filter_name
-                    if not kwargs.get("yaml", False)
-                    else None,  # output only the first filter if non-YAML output
+                    filter=(
+                        first_filter_name if not kwargs.get("yaml", False) else None
+                    ),  # output only the first filter if non-YAML output
                 )
 
     if bytile_dups:
diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py
index 81f1fb5..47f57ff 100644
--- a/pairtools/lib/dedup.py
+++ b/pairtools/lib/dedup.py
@@ -89,15 +89,32 @@ def streaming_dedup(
         # Clean up dataframe:
         df_chunk = df_chunk.drop(columns=["duplicate"])
 
-        # Stream the dups:
-        if outstream_dups:
-            df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv(
-                outstream_dups, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+        # Stream the pairs:
+        # If outstream_dups is the same as outstream, we save all mapped pairs to the same file
+
+        if outstream_dups == outstream:
+            df_chunk.loc[mask_mapped, :].to_csv(
+                outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
             )
+        else:
+            # Save the dups:
+            if outstream_dups:
+                df_chunk.loc[mask_duplicates, :].to_csv(
+                    outstream_dups,
+                    index=False,
+                    header=False,
+                    sep="\t",
+                    quoting=QUOTE_NONE,
+                )
+            # Drop readID if it was created (not needed for nodup and unmapped pairs):
+            if keep_parent_id:
+                df_chunk = df_chunk.drop(columns=["parent_readID"])
 
-        # Drop readID if it was created (not needed for nodup and unmapped pairs):
-        if keep_parent_id:
-            df_chunk = df_chunk.drop(columns=["parent_readID"])
+            # Save unique:
+            if outstream:
+                df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
+                    outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+                )
 
         # Stream unmapped:
         if outstream_unmapped:
@@ -109,11 +126,6 @@ def streaming_dedup(
                 quoting=QUOTE_NONE,
             )
 
-        # Stream unique pairs:
-        df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
-            outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
-        )
-
     t1 = time.time()
     t = t1 - t0
     logger.debug(f"total time: {t}")
@@ -144,7 +156,14 @@ def _dedup_stream(
     unmapped_chrom,
 ):
     # Stream the input dataframe:
-    dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)
+    dfs = pd.read_table(
+        in_stream,
+        comment=None,
+        names=colnames,
+        chunksize=chunksize,
+        dtype=pairsam_format.DTYPES_PAIRSAM,
+        sep="\t",
+    )
 
     # Set up the carryover dataframe:
     df_prev_nodups = pd.DataFrame([])
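
Why the reordering in streaming_dedup matters: before this patch, when dups
and nodups went to the same stream, each chunk wrote its duplicates first and
its unique pairs afterwards, so the combined output was no longer sorted. A
toy sketch (not pairtools code; made-up readIDs and positions) contrasting
the two write orders:

# Two to_csv calls emit dups before uniques and scramble the sorted order;
# a single call over all mapped rows preserves it.
import io

import pandas as pd

df = pd.DataFrame(
    {
        "readID": ["r1", "r2", "r3"],
        "pos1": [100, 100, 250],            # input comes position-sorted
        "duplicate": [False, True, False],  # r2 duplicates r1
    }
)
mask_dup = df["duplicate"]

old = io.StringIO()
df.loc[mask_dup].to_csv(old, sep="\t", index=False, header=False)   # dups first
df.loc[~mask_dup].to_csv(old, sep="\t", index=False, header=False)  # uniques after
print(old.getvalue())   # r2, r1, r3 -- sorted order broken

new = io.StringIO()
df.to_csv(new, sep="\t", index=False, header=False)  # one write, order kept
print(new.getvalue())   # r1, r2, r3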
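
The header fix covers the same shared-stream case: duplicates may carry an
extra parent_readID column, so when they share a stream with unique pairs,
the single emitted header must already list that column. A minimal sketch,
with a plain-list stand-in for headerops.append_columns (the real helper
extends the #columns: line of a .pairs header):

def append_columns(header, extra):
    # Simplified stand-in for pairtools.lib.headerops.append_columns.
    return [
        line + " " + " ".join(extra) if line.startswith("#columns:") else line
        for line in header
    ]

header = ["## pairs format v1.0", "#columns: readID chrom1 pos1 chrom2 pos2"]
dups_header = append_columns(header.copy(), ["parent_readID"])

outstream_is_shared = True  # dups and nodups written to the same file
if outstream_is_shared:
    header = dups_header  # shared header must already include parent_readID

print(header[1])  # #columns: readID chrom1 pos1 chrom2 pos2 parent_readID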
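
The explicit dtype in _dedup_stream guards against per-chunk type inference:
with chunksize set, pandas infers dtypes independently for every chunk, so a
column such as a chromosome name can come back numeric in one chunk and as a
string in another. A sketch with a hypothetical two-column dtype map standing
in for pairsam_format.DTYPES_PAIRSAM:

# Without dtype=, the first chunk below would infer chrom1 as int64 and the
# second as object; a fixed dtype map keeps every chunk consistent.
import io

import pandas as pd

DTYPES = {"chrom1": str, "pos1": int}  # hypothetical stand-in map

data = "1\t100\nX\t250\n"
chunks = pd.read_table(
    io.StringIO(data),
    names=["chrom1", "pos1"],
    dtype=DTYPES,
    sep="\t",
    chunksize=1,
)
for chunk in chunks:
    print(chunk["chrom1"].dtype)  # object in both chunks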