From 01efdf20fe41135825f70189c1b0b3d2ff06c049 Mon Sep 17 00:00:00 2001
From: Phlya
Date: Wed, 20 Mar 2024 14:27:16 +0100
Subject: [PATCH] Fix order of pairs when saving dups together with nodups,
 ensure correct headers

---
 pairtools/cli/dedup.py | 16 +++++++++-------
 pairtools/lib/dedup.py | 45 ++++++++++++++++++++++++++++++-------------
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py
index 3a1fe50..5dbe6fd 100644
--- a/pairtools/cli/dedup.py
+++ b/pairtools/cli/dedup.py
@@ -104,7 +104,7 @@
     " It is available for backwards compatibility and to allow specification of the"
     " column order."
     " Now the default scipy backend is generally the fastest, and with chunksize below"
-    " 1 mln has the lowest memory requirements. [dedup option]"
+    " 1 mln has the lowest memory requirements. [dedup option]",
     #    " 'cython' is deprecated and provided for backwards compatibility",
 )
 
@@ -486,12 +486,14 @@ def dedup_py(
             "Pairs file appears not to be sorted, dedup might produce wrong results."
         )
     header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
+    dups_header = header.copy()
+    if keep_parent_id and len(dups_header) > 0:
+        dups_header = headerops.append_columns(dups_header, ["parent_readID"])
+    if outstream == outstream_dups:
+        header = dups_header
     if send_header_to_dedup:
         outstream.writelines((l + "\n" for l in header))
     if send_header_to_dup and outstream_dups and (outstream_dups != outstream):
-        dups_header = header
-        if keep_parent_id and len(dups_header) > 0:
-            dups_header = headerops.append_columns(dups_header, ["parent_readID"])
         outstream_dups.writelines((l + "\n" for l in dups_header))
     if (
         outstream_unmapped
@@ -576,9 +578,9 @@ def dedup_py(
                 out_stat.save(
                     out_stats_stream,
                     yaml=kwargs.get("yaml", False),  # format as yaml
-                    filter=first_filter_name
-                    if not kwargs.get("yaml", False)
-                    else None,  # output only the first filter if non-YAML output
+                    filter=(
+                        first_filter_name if not kwargs.get("yaml", False) else None
+                    ),  # output only the first filter if non-YAML output
                 )
 
     if bytile_dups:
diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py
index 81f1fb5..47f57ff 100644
--- a/pairtools/lib/dedup.py
+++ b/pairtools/lib/dedup.py
@@ -89,15 +89,32 @@ def streaming_dedup(
         # Clean up dataframe:
         df_chunk = df_chunk.drop(columns=["duplicate"])
 
-        # Stream the dups:
-        if outstream_dups:
-            df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv(
-                outstream_dups, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+        # Stream the pairs:
+        # If outstream_dups is the same as outstream, we save all mapped pairs to the same file
+
+        if outstream_dups == outstream:
+            df_chunk.loc[mask_mapped, :].to_csv(
+                outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
             )
+        else:
+            # Save the dups:
+            if outstream_dups:
+                df_chunk.loc[mask_duplicates, :].to_csv(
+                    outstream_dups,
+                    index=False,
+                    header=False,
+                    sep="\t",
+                    quoting=QUOTE_NONE,
+                )
+            # Drop readID if it was created (not needed for nodup and unmapped pairs):
+            if keep_parent_id:
+                df_chunk = df_chunk.drop(columns=["parent_readID"])
 
-        # Drop readID if it was created (not needed for nodup and unmapped pairs):
-        if keep_parent_id:
-            df_chunk = df_chunk.drop(columns=["parent_readID"])
+            # Save unique:
+            if outstream:
+                df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
+                    outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
+                )
 
         # Stream unmapped:
         if outstream_unmapped:
@@ -109,11 +126,6 @@ def streaming_dedup(
                 quoting=QUOTE_NONE,
             )
 
-        # Stream unique pairs:
-        df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
-            outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
-        )
-
     t1 = time.time()
     t = t1 - t0
     logger.debug(f"total time: {t}")
@@ -144,7 +156,14 @@ def _dedup_stream(
     unmapped_chrom,
 ):
     # Stream the input dataframe:
-    dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)
+    dfs = pd.read_table(
+        in_stream,
+        comment=None,
+        names=colnames,
+        chunksize=chunksize,
+        dtype=pairsam_format.DTYPES_PAIRSAM,
+        sep="\t",
+    )
 
     # Set up the carryover dataframe:
     df_prev_nodups = pd.DataFrame([])
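
Why the reordering in streaming_dedup matters: before this patch, when dups
and nodups went to the same stream, each chunk wrote its duplicates first and
its unique pairs afterwards, so the combined output was no longer sorted. A
toy sketch (not pairtools code; made-up readIDs and positions) contrasting
the two write orders:

# Two to_csv calls emit dups before uniques and scramble the sorted order;
# a single call over all mapped rows preserves it.
import io

import pandas as pd

df = pd.DataFrame(
    {
        "readID": ["r1", "r2", "r3"],
        "pos1": [100, 100, 250],            # input comes position-sorted
        "duplicate": [False, True, False],  # r2 duplicates r1
    }
)
mask_dup = df["duplicate"]

old = io.StringIO()
df.loc[mask_dup].to_csv(old, sep="\t", index=False, header=False)   # dups first
df.loc[~mask_dup].to_csv(old, sep="\t", index=False, header=False)  # uniques after
print(old.getvalue())   # r2, r1, r3 -- sorted order broken

new = io.StringIO()
df.to_csv(new, sep="\t", index=False, header=False)  # one write, order kept
print(new.getvalue())   # r1, r2, r3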
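
The header fix covers the same shared-stream case: duplicates may carry an
extra parent_readID column, so when they share a stream with unique pairs,
the single emitted header must already list that column. A minimal sketch,
with a plain-list stand-in for headerops.append_columns (the real helper
extends the #columns: line of a .pairs header):

def append_columns(header, extra):
    # Simplified stand-in for pairtools.lib.headerops.append_columns.
    return [
        line + " " + " ".join(extra) if line.startswith("#columns:") else line
        for line in header
    ]

header = ["## pairs format v1.0", "#columns: readID chrom1 pos1 chrom2 pos2"]
dups_header = append_columns(header.copy(), ["parent_readID"])

outstream_is_shared = True  # dups and nodups written to the same file
if outstream_is_shared:
    header = dups_header  # shared header must already include parent_readID

print(header[1])  # #columns: readID chrom1 pos1 chrom2 pos2 parent_readID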
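
The explicit dtype in _dedup_stream guards against per-chunk type inference:
with chunksize set, pandas infers dtypes independently for every chunk, so a
column such as a chromosome name can come back numeric in one chunk and as a
string in another. A sketch with a hypothetical two-column dtype map standing
in for pairsam_format.DTYPES_PAIRSAM:

# Without dtype=, the first chunk below would infer chrom1 as int64 and the
# second as object; a fixed dtype map keeps every chunk consistent.
import io

import pandas as pd

DTYPES = {"chrom1": str, "pos1": int}  # hypothetical stand-in map

data = "1\t100\nX\t250\n"
chunks = pd.read_table(
    io.StringIO(data),
    names=["chrom1", "pos1"],
    dtype=DTYPES,
    sep="\t",
    chunksize=1,
)
for chunk in chunks:
    print(chunk["chrom1"].dtype)  # object in both chunks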