From 26b64c4a989b24b4aa130cc6c12b680b2a04d612 Mon Sep 17 00:00:00 2001
From: Deren <de2356@columbia.edu>
Date: Mon, 8 Apr 2024 17:23:22 -0400
Subject: [PATCH] input fastqs can be format _R1_., _R1., or _1.

---
 ipyrad/assemble/demultiplex.py | 107 ++++++++++++-------
 ipyrad/assemble/pair_fastqs.py | 182 +++++++++++++++++++++++++++++++++
 2 files changed, 251 insertions(+), 38 deletions(-)
 create mode 100644 ipyrad/assemble/pair_fastqs.py

diff --git a/ipyrad/assemble/demultiplex.py b/ipyrad/assemble/demultiplex.py
index 8c86c3f8..2f24c151 100644
--- a/ipyrad/assemble/demultiplex.py
+++ b/ipyrad/assemble/demultiplex.py
@@ -26,7 +26,10 @@
 # ipyrad imports
 from ipyrad.core.sample import Sample
 from ipyrad.assemble.utils import IPyradError, ambigcutters, BADCHARS
-
+from ipyrad.assemble.pair_fastqs import (
+    get_fastq_tuples_dict_from_paths_list,
+    get_paths_list_from_fastq_str,
+)
 
 
 class Step1:
@@ -166,58 +169,64 @@ def check_files(self):
                 NO_FILES_FOUND_PAIRS
                 .format(self.data.params.sorted_fastq_path))
 
-        # link pairs into tuples
-        if 'pair' in self.data.params.datatype:
-            # check that names fit the paired naming convention
-            r1s = [i for i in self.fastqs if "_R1_" in i]
-            r2s = [i for i in self.fastqs if "_R2_" in i]
-
-            # file checks
-            if not r1s:
-                raise IPyradError(
-                    "No fastqs files found. File names must contain '_R1_' "
-                    "(and '_R2_' for paired data). See Docs.")
-            if len(r1s) != len(r2s):
-                raise IPyradError(
-                    R1_R2_name_error.format(len(r1s), len(r2s)))
-
-            # store tuples                    
-            self.ftuples = []
-            for r1file in r1s:
-                r2file = r1file.replace("_R1_", "_R2_")
-                if not os.path.exists(r2file):
-                    raise IPyradError(
-                        "Expected R2 file {} to match R1 file {}"
-                        .format(r1file, r2file)
-                        )
-                self.ftuples.append((r1file, r2file))
-
-        # data are not paired, create empty tuple pair
-        else:
-            # print warning if _R2_ is in names when not paired
-            if any(["_R2_" in i for i in self.fastqs]):
-                print(NAMES_LOOK_PAIRED_WARNING)
-            self.ftuples = [(i, "") for i in self.fastqs]
+        # get list of expanded paths
+        paths = get_paths_list_from_fastq_str(self.fastqs)
+
+        # get dict of {basename: (Path-R1, Path-R2)} from filenames
+        self.ftuples = get_fastq_tuples_dict_from_paths_list(paths)
+
+        # # link pairs into tuples
+        # if 'pair' in self.data.params.datatype:
+        #     # check that names fit the paired naming convention
+        #     r1s = [i for i in self.fastqs if "_R1_" in i]
+        #     r2s = [i for i in self.fastqs if "_R2_" in i]
+
+        #     # file checks
+        #     if not r1s:
+        #         raise IPyradError(
+        #             "No fastqs files found. File names must contain '_R1_' "
+        #             "(and '_R2_' for paired data). See Docs.")
+        #     if len(r1s) != len(r2s):
+        #         raise IPyradError(
+        #             R1_R2_name_error.format(len(r1s), len(r2s)))
+
+        #     # store tuples                    
+        #     self.ftuples = []
+        #     for r1file in r1s:
+        #         r2file = r1file.replace("_R1_", "_R2_")
+        #         if not os.path.exists(r2file):
+        #             raise IPyradError(
+        #                 "Expected R2 file {} to match R1 file {}"
+        #                 .format(r1file, r2file)
+        #                 )
+        #         self.ftuples.append((r1file, r2file))
+
+        # # data are not paired, create empty tuple pair
+        # else:
+        #     # print warning if _R2_ is in names when not paired
+        #     if any(["_R2_" in i for i in self.fastqs]):
+        #         print(NAMES_LOOK_PAIRED_WARNING)
+        #     self.ftuples = [(i, "") for i in self.fastqs]
 
 
     def remote_run_linker(self):
         "read in fastq files and count nreads for stats and chunking in s2."
 
-        # local counters 
+        # local counters
         createdinc = 0
 
         # iterate over input files
-        for ftup in self.ftuples:
-
+        for sname, ftup in self.ftuples.items():
             # remove file extension from name
-            sname = get_name_from_file(ftup[0], None, None)
+            # sname = get_name_from_file(ftup[0], None, None)
+            print(sname, ftup)
 
             # Create new Sample Class objects with names from files
             if sname not in self.data.samples:
                 newsamp = Sample(sname)
                 newsamp.stats.state = 1
                 newsamp.barcode = None
-                newsamp.files.fastqs = [ftup]
+                newsamp.files.fastqs = [(str(ftup[0]), str(ftup[1]))]
                 self.data.samples[sname] = newsamp
                 createdinc += 1
 
@@ -1445,3 +1454,25 @@ def zcat_make_temps(data, ftup, num, tmpdir, optim, start):
     option (e.g., pairddrad or pairgbs) and re-run step 1, which will require
     using the force flag (-f) to overwrite existing data.
     """
+
+
+if __name__ == "__main__":
+
+    import ipyrad as ip
+
+    # test to load samples w/ names "*_R1_*", "*_R2_*"
+    # data = ip.Assembly("TEST")
+    # data.params.sorted_fastq_path = "../../sra-fastqs/*.fastq"
+    # data.params.project_dir = "/tmp/9test"
+    # data.run("1", force=True, auto=True)
+    # print(data.stats)
+
+    # test to load samples w/ names "*_R1.*", "*_R2.*"
+    data = ip.Assembly("TEST")
+    data.params.sorted_fastq_path = "../../pedtest/DEMUX_fastqs/integ*.gz"
+    data.params.project_dir = "/tmp/9test"
+    data.run("1", force=True, auto=True)
+    print(data.stats)
+
+    # test to load samples w/ names "*_1.*", "*_2.*"
+    # ...
diff --git a/ipyrad/assemble/pair_fastqs.py b/ipyrad/assemble/pair_fastqs.py
new file mode 100644
index 00000000..b53b7db8
--- /dev/null
+++ b/ipyrad/assemble/pair_fastqs.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+"""Group files into pairs based on file name matching patterns.
+
+The `get_filenames_to_paired_fastqs` function is used in both demux
+and step1 of assembly to return full Paths to fastq files as tuples
+of either (R1, None) or (R1, R2).
+"""
+
+from typing import List, Tuple, Dict, Union
+from pathlib import Path
+import itertools
+
+
+def get_paths_list_from_fastq_str(fastq_paths: Union[Path, List[Path]]) -> List[Path]:
+    """Expand fastq_paths str argument into a list of Path objects.
+
+    """
+    expanded = []
+    # ensure paths is a List[Path] but where the Path elements may be
+    # regex path names that have not yet been expanded.
+
+    # ensure it is a list
+    if isinstance(fastq_paths, (str, Path)):
+        fastq_paths = [fastq_paths]
+
+    # ensure each is a Path object
+    paths = []
+    for path in fastq_paths:
+        if isinstance(path, str):
+            path = Path(path)
+        paths.append(path)
+
+    # for each Path in paths list expand into a list of Paths
+    for path in paths:
+        # expand a regex operator to possibly match multiple files
+        # such as paired-end files.
+        try:
+            fastqs = list(path.parent.glob(path.name))
+            assert fastqs
+        except (ValueError, AssertionError):
+            msg = f"No fastq data match input: {path}"
+            raise ValueError(msg)
+        expanded.extend([Path(i).expanduser().resolve() for i in fastqs])
+    return expanded
+
+
+def drop_from_right(path: Path, delim: str = "_", idx: int = 0) -> str:
+    """Return a name with an underscore separated portion removed.
+
+    This is used to find matching paired files by iteratively removing
+    subsections of filenames to find pairs that match when the sections
+    are removed (e.g., usually _R1 _R2 or _1 _2).
+
+    Example
+    -------
+    >>> path = Path("name_prefix_001_R1_002.fastq.gz")
+    >>> drop_from_right(path, "_", 0)
+    >>> # "name_prefix_001_R1.fastq.gz"
+    >>> drop_from_right(path, "_", 1)
+    >>> # "name_prefix_001_002.fastq.gz"
+    """
+    # save and remove suffixes (it seems this method is needed with .x.y
+    # suffixes compared to using Path.stem or similar.)
+    suffixes = path.suffixes
+    while path.suffix in suffixes:
+        path = path.with_suffix('')
+
+    # break file name on delimiter and get chunks in reverse order
+    chunks = path.name.split(delim)[::-1]
+
+    # get chunks minus the index from the right
+    sublist = [j for i, j in enumerate(chunks) if i != idx][::-1]
+    path = path.parent / "_".join([i for i in sublist if i]).rstrip(delim)
+    path = path.with_suffix("".join(suffixes))
+    return path
+
+
+def get_fastq_tuples_dict_from_paths_list(fastqs: List[Path]) -> Dict[str, Tuple[Path, Path]]:
+    """Return {sample_name_str: tuple_of_paired_fastqs} dict.
+
+    If technical replicates are being merged then a sample can
+    be assigned multiple pairs of paired fastqs files. Paired
+    file names may differ by having the following diffs, which
+    may occur anywhere in the file name.
+    >>> '_1', '_2'
+    >>> '_R1', '_R2'
+    >>> '_R1_', '_R2_'
+
+    This func works by splitting file names by "_" and "." examining
+    each section in turn, starting from the end, to find one that
+    appears to match the paired naming conventions above.
+    """
+    # dict to return
+    snames_to_fastq_tuples = {}
+
+    # look for PE matching file names.
+    idx = 0
+    paired = False
+    while 1:
+
+        # group names with matching names when _ section is removed.
+        try:
+            groups = itertools.groupby(
+                sorted(fastqs),
+                key=lambda x: drop_from_right(x, "_", idx)
+            )
+            assert groups
+
+            # sort the tuples to (R1, R2) and remove suffixes from names
+            sorted_tuple_groups = {}
+            for (name, gtup) in groups:
+                gtup = sorted(gtup)
+                path = Path(name)
+                suffixes = path.suffixes
+                while path.suffix in suffixes:
+                    path = path.with_suffix('')
+                sorted_tuple_groups[path.name] = gtup
+            groups = sorted_tuple_groups
+
+            # groups = {i.with_suffix("").name: sorted(j) for (i, j) in groups}
+            assert len(groups) == len(fastqs) / 2
+            assert all(len(j) == 2 for i, j in groups.items())
+            paired = True
+            break
+
+        # if >5 tries and pairs not found then either data are not
+        # paired, or the file names format is strange. We will raise
+        # a warning later if the data seem to be PE but it was not
+        # detected.
+        except (AssertionError, ValueError):
+            idx += 1
+            if idx > 4:
+                print(
+                    "No PE fastq pairs detected based on filenames, "
+                    "assuming SE data."
+                )
+                break
+
+    # store paired tuples to each basename
+    if paired:
+        for name, paths in groups.items():
+            snames_to_fastq_tuples[name] = tuple(
+                [i.expanduser().resolve() for i in paths]
+            )
+
+    # store (R1, '') tuples for each basename, and report a warning
+    # if names are suspiciously paired looking.
+    else:
+        for path in fastqs:
+            subpath = path.with_suffix("")
+            while subpath.suffix:
+                subpath = subpath.with_suffix("")
+            snames_to_fastq_tuples[subpath.name] = (path.expanduser().resolve(), "")
+
+            # warning if the data appear to include R2s
+            if any(i in str(path.name) for i in ("_R2_", "_2.", "_R2.")):
+                print(
+                    f"fastq file name ({path.name}) has a filename "
+                    "that suggests it may be an R2 read, but its paired "
+                    "R1 file could not be found. "
+                    "Paired files should have matching names except "
+                    "for _1 _2, _R1 _R2, or any of these followed by a "
+                    "'_' or '.'."
+                )
+    return snames_to_fastq_tuples
+
+
+if __name__ == "__main__":
+
+    import ipyrad as ip
+
+    FASTQ_PATH = Path("../../pedtest/small_tmp_R*.gz")
+    FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH)
+    pairs = get_fastq_tuples_dict_from_paths_list(FASTQS)
+    print(FASTQS)
+    print(pairs)
+
+    FASTQ_PATH = Path("../../pedtest/NEW_fastqs/*fastq.gz")
+    FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH)
+    pairs = get_fastq_tuples_dict_from_paths_list(FASTQS)
+    # print(pairs)