Skip to content

Commit

Permalink
input fastqs can be format _R1_., _R1., or _1.
Browse files Browse the repository at this point in the history
  • Loading branch information
eaton-lab committed Apr 8, 2024
1 parent 099df55 commit 26b64c4
Show file tree
Hide file tree
Showing 2 changed files with 251 additions and 38 deletions.
107 changes: 69 additions & 38 deletions ipyrad/assemble/demultiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
# ipyrad imports
from ipyrad.core.sample import Sample
from ipyrad.assemble.utils import IPyradError, ambigcutters, BADCHARS

from ipyrad.assemble.pair_fastqs import (
get_fastq_tuples_dict_from_paths_list,
get_paths_list_from_fastq_str,
)


class Step1:
Expand Down Expand Up @@ -166,58 +169,64 @@ def check_files(self):
NO_FILES_FOUND_PAIRS
.format(self.data.params.sorted_fastq_path))

# link pairs into tuples
if 'pair' in self.data.params.datatype:
# check that names fit the paired naming convention
r1s = [i for i in self.fastqs if "_R1_" in i]
r2s = [i for i in self.fastqs if "_R2_" in i]

# file checks
if not r1s:
raise IPyradError(
"No fastqs files found. File names must contain '_R1_' "
"(and '_R2_' for paired data). See Docs.")
if len(r1s) != len(r2s):
raise IPyradError(
R1_R2_name_error.format(len(r1s), len(r2s)))

# store tuples
self.ftuples = []
for r1file in r1s:
r2file = r1file.replace("_R1_", "_R2_")
if not os.path.exists(r2file):
raise IPyradError(
"Expected R2 file {} to match R1 file {}"
.format(r1file, r2file)
)
self.ftuples.append((r1file, r2file))

# data are not paired, create empty tuple pair
else:
# print warning if _R2_ is in names when not paired
if any(["_R2_" in i for i in self.fastqs]):
print(NAMES_LOOK_PAIRED_WARNING)
self.ftuples = [(i, "") for i in self.fastqs]
# get list of expanded paths
paths = get_paths_list_from_fastq_str(self.fastqs)

# get dict of {basename: (Path-R1, Path-R2)} from filenames
self.ftuples = get_fastq_tuples_dict_from_paths_list(paths)

# # link pairs into tuples
# if 'pair' in self.data.params.datatype:
# # check that names fit the paired naming convention
# r1s = [i for i in self.fastqs if "_R1_" in i]
# r2s = [i for i in self.fastqs if "_R2_" in i]

# # file checks
# if not r1s:
# raise IPyradError(
# "No fastqs files found. File names must contain '_R1_' "
# "(and '_R2_' for paired data). See Docs.")
# if len(r1s) != len(r2s):
# raise IPyradError(
# R1_R2_name_error.format(len(r1s), len(r2s)))

# # store tuples
# self.ftuples = []
# for r1file in r1s:
# r2file = r1file.replace("_R1_", "_R2_")
# if not os.path.exists(r2file):
# raise IPyradError(
# "Expected R2 file {} to match R1 file {}"
# .format(r1file, r2file)
# )
# self.ftuples.append((r1file, r2file))

# # data are not paired, create empty tuple pair
# else:
# # print warning if _R2_ is in names when not paired
# if any(["_R2_" in i for i in self.fastqs]):
# print(NAMES_LOOK_PAIRED_WARNING)
# self.ftuples = [(i, "") for i in self.fastqs]


def remote_run_linker(self):
"read in fastq files and count nreads for stats and chunking in s2."

# local counters
# local counters
createdinc = 0

# iterate over input files
for ftup in self.ftuples:

for sname, ftup in self.ftuples.items():
# remove file extension from name
sname = get_name_from_file(ftup[0], None, None)
# sname = get_name_from_file(ftup[0], None, None)
print(sname, ftup)

# Create new Sample Class objects with names from files
if sname not in self.data.samples:
newsamp = Sample(sname)
newsamp.stats.state = 1
newsamp.barcode = None
newsamp.files.fastqs = [ftup]
newsamp.files.fastqs = [(str(ftup[0]), str(ftup[1]))]
self.data.samples[sname] = newsamp
createdinc += 1

Expand Down Expand Up @@ -1445,3 +1454,25 @@ def zcat_make_temps(data, ftup, num, tmpdir, optim, start):
option (e.g., pairddrad or pairgbs) and re-run step 1, which will require
using the force flag (-f) to overwrite existing data.
"""


if __name__ == "__main__":

import ipyrad as ip

# test to load samples w/ names "*_R1_*", "*_R2_*"
# data = ip.Assembly("TEST")
# data.params.sorted_fastq_path = "../../sra-fastqs/*.fastq"
# data.params.project_dir = "/tmp/9test"
# data.run("1", force=True, auto=True)
# print(data.stats)

# test to load samples w/ names "*_R1.*", "*_R2.*"
data = ip.Assembly("TEST")
data.params.sorted_fastq_path = "../../pedtest/DEMUX_fastqs/integ*.gz"
data.params.project_dir = "/tmp/9test"
data.run("1", force=True, auto=True)
print(data.stats)

# test to load samples w/ names "*_1.*", "*_2.*"
# ...
182 changes: 182 additions & 0 deletions ipyrad/assemble/pair_fastqs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/usr/bin/env python

"""Group files into pairs based on file name matching patterns.
The `get_filenames_to_paired_fastqs` function is used in both demux
and step1 of assembly to return full Paths to fastq files as tuples
of either (R1, None) or (R1, R2).
"""

from typing import List, Tuple, Dict, Union
from pathlib import Path
import itertools


def get_paths_list_from_fastq_str(fastq_paths: Union[Path, List[Path]]) -> List[Path]:
"""Expand fastq_paths str argument into a list of Path objects.
"""
expanded = []
# ensure paths is a List[Path] but where the Path elements may be
# regex path names that have not yet been expanded.

# ensure it is a list
if isinstance(fastq_paths, (str, Path)):
fastq_paths = [fastq_paths]

# ensure each is a Path object
paths = []
for path in fastq_paths:
if isinstance(path, str):
path = Path(path)
paths.append(path)

# for each Path in paths list expand into a list of Paths
for path in paths:
# expand a regex operator to possibly match multiple files
# such as paired-end files.
try:
fastqs = list(path.parent.glob(path.name))
assert fastqs
except (ValueError, AssertionError):
msg = f"No fastq data match input: {path}"
raise ValueError(msg)
expanded.extend([Path(i).expanduser().resolve() for i in fastqs])
return expanded


def drop_from_right(path: Path, delim: str = "_", idx: int = 0) -> str:
"""Return a name with an underscore separated portion removed.
This is used to find matching paired files by iteratively removing
subsections of filenames to find pairs that match when the sections
are removed (e.g., usually _R1 _R2 or _1 _2).
Example
-------
>>> path = Path("name_prefix_001_R1_002.fastq.gz")
>>> drop_from_right(path, "_", 0)
>>> # "name_prefix_001_R1.fastq.gz"
>>> drop_from_right(path, "_", 1)
>>> # "name_prefix_001_002.fastq.gz"
"""
# save and remove suffixes (it seems this method is needed with .x.y
# suffixes compared to using Path.stem or similar.)
suffixes = path.suffixes
while path.suffix in suffixes:
path = path.with_suffix('')

# break file name on delimiter and get chunks in reverse order
chunks = path.name.split(delim)[::-1]

# get chunks minus the index from the right
sublist = [j for i, j in enumerate(chunks) if i != idx][::-1]
path = path.parent / "_".join([i for i in sublist if i]).rstrip(delim)
path = path.with_suffix("".join(suffixes))
return path


def get_fastq_tuples_dict_from_paths_list(fastqs: List[Path]) -> Dict[str, Tuple[Path, Path]]:
"""Return {sample_name_str: tuple_of_paired_fastqs} dict.
If technical replicates are being merged then a sample can
be assigned multiple pairs of paired fastqs files. Paired
file names may differ by having the following diffs, which
may occur anywhere in the file name.
>>> '_1', '_2'
>>> '_R1', '_R2'
>>> '_R1_', '_R2_'
This func works by splitting file names by "_" and "." examining
each section in turn, starting from the end, to find one that
appears to match the paired naming conventions above.
"""
# dict to return
snames_to_fastq_tuples = {}

# look for PE matching file names.
idx = 0
paired = False
while 1:

# group names with matching names when _ section is removed.
try:
groups = itertools.groupby(
sorted(fastqs),
key=lambda x: drop_from_right(x, "_", idx)
)
assert groups

# sort the tuples to (R1, R2) and remove suffixes from names
sorted_tuple_groups = {}
for (name, gtup) in groups:
gtup = sorted(gtup)
path = Path(name)
suffixes = path.suffixes
while path.suffix in suffixes:
path = path.with_suffix('')
sorted_tuple_groups[path.name] = gtup
groups = sorted_tuple_groups

# groups = {i.with_suffix("").name: sorted(j) for (i, j) in groups}
assert len(groups) == len(fastqs) / 2
assert all(len(j) == 2 for i, j in groups.items())
paired = True
break

# if >5 tries and pairs not found then either data are not
# paired, or the file names format is strange. We will raise
# a warning later if the data seem to be PE but it was not
# detected.
except (AssertionError, ValueError):
idx += 1
if idx > 4:
print(
"No PE fastq pairs detected based on filenames, "
"assuming SE data."
)
break

# store paired tuples to each basename
if paired:
for name, paths in groups.items():
snames_to_fastq_tuples[name] = tuple(
[i.expanduser().resolve() for i in paths]
)

# store (R1, '') tuples for each basename, and report a warning
# if names are suspiciously paired looking.
else:
for path in fastqs:
subpath = path.with_suffix("")
while subpath.suffix:
subpath = subpath.with_suffix("")
snames_to_fastq_tuples[subpath.name] = (path.expanduser().resolve(), "")

# warning if the data appear to include R2s
if any(i in str(path.name) for i in ("_R2_", "_2.", "_R2.")):
print(
f"fastq file name ({path.name}) has a filename "
"that suggests it may be an R2 read, but its paired "
"R1 file could not be found. "
"Paired files should have matching names except "
"for _1 _2, _R1 _R2, or any of these followed by a "
"'_' or '.'."
)
return snames_to_fastq_tuples


if __name__ == "__main__":

import ipyrad as ip

FASTQ_PATH = Path("../../pedtest/small_tmp_R*.gz")
FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH)
pairs = get_fastq_tuples_dict_from_paths_list(FASTQS)
print(FASTQS)
print(pairs)

FASTQ_PATH = Path("../../pedtest/NEW_fastqs/*fastq.gz")
FASTQS = get_paths_list_from_fastq_str(FASTQ_PATH)
pairs = get_fastq_tuples_dict_from_paths_list(FASTQS)
# print(pairs)

0 comments on commit 26b64c4

Please sign in to comment.