From 26b64c4a989b24b4aa130cc6c12b680b2a04d612 Mon Sep 17 00:00:00 2001 From: Deren Date: Mon, 8 Apr 2024 17:23:22 -0400 Subject: [PATCH] input fastqs can be format _R1_., _R1., or _1. --- ipyrad/assemble/demultiplex.py | 107 ++++++++++++------- ipyrad/assemble/pair_fastqs.py | 182 +++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+), 38 deletions(-) create mode 100644 ipyrad/assemble/pair_fastqs.py diff --git a/ipyrad/assemble/demultiplex.py b/ipyrad/assemble/demultiplex.py index 8c86c3f8..2f24c151 100644 --- a/ipyrad/assemble/demultiplex.py +++ b/ipyrad/assemble/demultiplex.py @@ -26,7 +26,10 @@ # ipyrad imports from ipyrad.core.sample import Sample from ipyrad.assemble.utils import IPyradError, ambigcutters, BADCHARS - +from ipyrad.assemble.pair_fastqs import ( + get_fastq_tuples_dict_from_paths_list, + get_paths_list_from_fastq_str, +) class Step1: @@ -166,58 +169,64 @@ def check_files(self): NO_FILES_FOUND_PAIRS .format(self.data.params.sorted_fastq_path)) - # link pairs into tuples - if 'pair' in self.data.params.datatype: - # check that names fit the paired naming convention - r1s = [i for i in self.fastqs if "_R1_" in i] - r2s = [i for i in self.fastqs if "_R2_" in i] - - # file checks - if not r1s: - raise IPyradError( - "No fastqs files found. File names must contain '_R1_' " - "(and '_R2_' for paired data). See Docs.") - if len(r1s) != len(r2s): - raise IPyradError( - R1_R2_name_error.format(len(r1s), len(r2s))) - - # store tuples - self.ftuples = [] - for r1file in r1s: - r2file = r1file.replace("_R1_", "_R2_") - if not os.path.exists(r2file): - raise IPyradError( - "Expected R2 file {} to match R1 file {}" - .format(r1file, r2file) - ) - self.ftuples.append((r1file, r2file)) - - # data are not paired, create empty tuple pair - else: - # print warning if _R2_ is in names when not paired - if any(["_R2_" in i for i in self.fastqs]): - print(NAMES_LOOK_PAIRED_WARNING) - self.ftuples = [(i, "") for i in self.fastqs] + # get list of expanded paths + paths = get_paths_list_from_fastq_str(self.fastqs) + + # get dict of {basename: (Path-R1, Path-R2)} from filenames + self.ftuples = get_fastq_tuples_dict_from_paths_list(paths) + + # # link pairs into tuples + # if 'pair' in self.data.params.datatype: + # # check that names fit the paired naming convention + # r1s = [i for i in self.fastqs if "_R1_" in i] + # r2s = [i for i in self.fastqs if "_R2_" in i] + + # # file checks + # if not r1s: + # raise IPyradError( + # "No fastqs files found. File names must contain '_R1_' " + # "(and '_R2_' for paired data). See Docs.") + # if len(r1s) != len(r2s): + # raise IPyradError( + # R1_R2_name_error.format(len(r1s), len(r2s))) + + # # store tuples + # self.ftuples = [] + # for r1file in r1s: + # r2file = r1file.replace("_R1_", "_R2_") + # if not os.path.exists(r2file): + # raise IPyradError( + # "Expected R2 file {} to match R1 file {}" + # .format(r1file, r2file) + # ) + # self.ftuples.append((r1file, r2file)) + + # # data are not paired, create empty tuple pair + # else: + # # print warning if _R2_ is in names when not paired + # if any(["_R2_" in i for i in self.fastqs]): + # print(NAMES_LOOK_PAIRED_WARNING) + # self.ftuples = [(i, "") for i in self.fastqs] def remote_run_linker(self): "read in fastq files and count nreads for stats and chunking in s2." - # local counters + # local counters createdinc = 0 # iterate over input files - for ftup in self.ftuples: - + for sname, ftup in self.ftuples.items(): # remove file extension from name - sname = get_name_from_file(ftup[0], None, None) + # sname = get_name_from_file(ftup[0], None, None) + print(sname, ftup) # Create new Sample Class objects with names from files if sname not in self.data.samples: newsamp = Sample(sname) newsamp.stats.state = 1 newsamp.barcode = None - newsamp.files.fastqs = [ftup] + newsamp.files.fastqs = [(str(ftup[0]), str(ftup[1]))] self.data.samples[sname] = newsamp createdinc += 1 @@ -1445,3 +1454,25 @@ def zcat_make_temps(data, ftup, num, tmpdir, optim, start): option (e.g., pairddrad or pairgbs) and re-run step 1, which will require using the force flag (-f) to overwrite existing data. """ + + +if __name__ == "__main__": + + import ipyrad as ip + + # test to load samples w/ names "*_R1_*", "*_R2_*" + # data = ip.Assembly("TEST") + # data.params.sorted_fastq_path = "../../sra-fastqs/*.fastq" + # data.params.project_dir = "/tmp/9test" + # data.run("1", force=True, auto=True) + # print(data.stats) + + # test to load samples w/ names "*_R1.*", "*_R2.*" + data = ip.Assembly("TEST") + data.params.sorted_fastq_path = "../../pedtest/DEMUX_fastqs/integ*.gz" + data.params.project_dir = "/tmp/9test" + data.run("1", force=True, auto=True) + print(data.stats) + + # test to load samples w/ names "*_1.*", "*_2.*" + # ... diff --git a/ipyrad/assemble/pair_fastqs.py b/ipyrad/assemble/pair_fastqs.py new file mode 100644 index 00000000..b53b7db8 --- /dev/null +++ b/ipyrad/assemble/pair_fastqs.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python + +"""Group files into pairs based on file name matching patterns. + +The `get_filenames_to_paired_fastqs` function is used in both demux +and step1 of assembly to return full Paths to fastq files as tuples +of either (R1, None) or (R1, R2). +""" + +from typing import List, Tuple, Dict, Union +from pathlib import Path +import itertools + + +def get_paths_list_from_fastq_str(fastq_paths: Union[Path, List[Path]]) -> List[Path]: + """Expand fastq_paths str argument into a list of Path objects. + + """ + expanded = [] + # ensure paths is a List[Path] but where the Path elements may be + # regex path names that have not yet been expanded. + + # ensure it is a list + if isinstance(fastq_paths, (str, Path)): + fastq_paths = [fastq_paths] + + # ensure each is a Path object + paths = [] + for path in fastq_paths: + if isinstance(path, str): + path = Path(path) + paths.append(path) + + # for each Path in paths list expand into a list of Paths + for path in paths: + # expand a regex operator to possibly match multiple files + # such as paired-end files. + try: + fastqs = list(path.parent.glob(path.name)) + assert fastqs + except (ValueError, AssertionError): + msg = f"No fastq data match input: {path}" + raise ValueError(msg) + expanded.extend([Path(i).expanduser().resolve() for i in fastqs]) + return expanded + + +def drop_from_right(path: Path, delim: str = "_", idx: int = 0) -> str: + """Return a name with an underscore separated portion removed. + + This is used to find matching paired files by iteratively removing + subsections of filenames to find pairs that match when the sections + are removed (e.g., usually _R1 _R2 or _1 _2). + + Example + ------- + >>> path = Path("name_prefix_001_R1_002.fastq.gz") + >>> drop_from_right(path, "_", 0) + >>> # "name_prefix_001_R1.fastq.gz" + >>> drop_from_right(path, "_", 1) + >>> # "name_prefix_001_002.fastq.gz" + """ + # save and remove suffixes (it seems this method is needed with .x.y + # suffixes compared to using Path.stem or similar.) + suffixes = path.suffixes + while path.suffix in suffixes: + path = path.with_suffix('') + + # break file name on delimiter and get chunks in reverse order + chunks = path.name.split(delim)[::-1] + + # get chunks minus the index from the right + sublist = [j for i, j in enumerate(chunks) if i != idx][::-1] + path = path.parent / "_".join([i for i in sublist if i]).rstrip(delim) + path = path.with_suffix("".join(suffixes)) + return path + + +def get_fastq_tuples_dict_from_paths_list(fastqs: List[Path]) -> Dict[str, Tuple[Path, Path]]: + """Return {sample_name_str: tuple_of_paired_fastqs} dict. + + If technical replicates are being merged then a sample can + be assigned multiple pairs of paired fastqs files. Paired + file names may differ by having the following diffs, which + may occur anywhere in the file name. + >>> '_1', '_2' + >>> '_R1', '_R2' + >>> '_R1_', '_R2_' + + This func works by splitting file names by "_" and "." examining + each section in turn, starting from the end, to find one that + appears to match the paired naming conventions above. + """ + # dict to return + snames_to_fastq_tuples = {} + + # look for PE matching file names. + idx = 0 + paired = False + while 1: + + # group names with matching names when _ section is removed. + try: + groups = itertools.groupby( + sorted(fastqs), + key=lambda x: drop_from_right(x, "_", idx) + ) + assert groups + + # sort the tuples to (R1, R2) and remove suffixes from names + sorted_tuple_groups = {} + for (name, gtup) in groups: + gtup = sorted(gtup) + path = Path(name) + suffixes = path.suffixes + while path.suffix in suffixes: + path = path.with_suffix('') + sorted_tuple_groups[path.name] = gtup + groups = sorted_tuple_groups + + # groups = {i.with_suffix("").name: sorted(j) for (i, j) in groups} + assert len(groups) == len(fastqs) / 2 + assert all(len(j) == 2 for i, j in groups.items()) + paired = True + break + + # if >5 tries and pairs not found then either data are not + # paired, or the file names format is strange. We will raise + # a warning later if the data seem to be PE but it was not + # detected. + except (AssertionError, ValueError): + idx += 1 + if idx > 4: + print( + "No PE fastq pairs detected based on filenames, " + "assuming SE data." + ) + break + + # store paired tuples to each basename + if paired: + for name, paths in groups.items(): + snames_to_fastq_tuples[name] = tuple( + [i.expanduser().resolve() for i in paths] + ) + + # store (R1, '') tuples for each basename, and report a warning + # if names are suspiciously paired looking. + else: + for path in fastqs: + subpath = path.with_suffix("") + while subpath.suffix: + subpath = subpath.with_suffix("") + snames_to_fastq_tuples[subpath.name] = (path.expanduser().resolve(), "") + + # warning if the data appear to include R2s + if any(i in str(path.name) for i in ("_R2_", "_2.", "_R2.")): + print( + f"fastq file name ({path.name}) has a filename " + "that suggests it may be an R2 read, but its paired " + "R1 file could not be found. " + "Paired files should have matching names except " + "for _1 _2, _R1 _R2, or any of these followed by a " + "'_' or '.'." + ) + return snames_to_fastq_tuples + + +if __name__ == "__main__": + + import ipyrad as ip + + FASTQ_PATH = Path("../../pedtest/small_tmp_R*.gz") + FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH) + pairs = get_fastq_tuples_dict_from_paths_list(FASTQS) + print(FASTQS) + print(pairs) + + FASTQ_PATH = Path("../../pedtest/NEW_fastqs/*fastq.gz") + FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH) + pairs = get_fastq_tuples_dict_from_paths_list(FASTQS) + # print(pairs)