Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dereneaton/ipyrad
Browse files Browse the repository at this point in the history
  • Loading branch information
eaton-lab committed Jan 7, 2025
2 parents 5947011 + b9e7070 commit 6b6c329
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 17 deletions.
2 changes: 1 addition & 1 deletion ipyrad/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import subprocess as _sps

# Dunders
__version__ = "0.9.102"
__version__ = "0.9.104"
__author__ = "Deren Eaton & Isaac Overcast"

# CLI __main__ changes to 0
Expand Down
95 changes: 94 additions & 1 deletion ipyrad/assemble/rawedit.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,12 @@ def cutadaptit_single(data, sample):
adapter filters.
"""
sname = sample.name

if data.hackersonly.max_raw_reads_sample > 0:
_sample_max_raw_reads(data, sample)

finput_r1 = sample.files.concat[0][0]

# if (GBS, ddRAD) we look for the second cut site + adapter. For SE
# data we don't bother trying to remove the second barcode since it's not
# as critical as with PE data.
Expand Down Expand Up @@ -433,7 +439,7 @@ def cutadaptit_single(data, sample):
"--trim-n",
"--output", os.path.join(
data.dirs.edits, sname + ".trimmed_R1_.fastq.gz"),
sample.files.concat[0][0],
finput_r1,
]

if int(data.params.filter_adapters):
Expand Down Expand Up @@ -486,6 +492,9 @@ def cutadaptit_pairs(data, sample):
"""
sname = sample.name

if data.hackersonly.max_raw_reads_sample > 0:
_sample_max_raw_reads(data, sample)

## applied to read pairs
finput_r1 = sample.files.concat[0][0]
finput_r2 = sample.files.concat[0][1]
Expand Down Expand Up @@ -710,6 +719,90 @@ def concat_multiple_inputs(data, sample):
return sample.files.concat


## Called inside cutadaptit_single/cutadaptit_pairs prior to cutadapt
def _sample_max_raw_reads(data, sample):
"""
Subsample incoming raw reads to a given maximum value.
Only used if the hackersonly["max_raw_reads_sample"] > 0
"""

# Grab indices for R1/R2 from post-concatenated input files
r1_in = sample.files.concat[0][0]
r2_in = sample.files.concat[0][1]

# subsample R1/R2 to a maximum value. Multiply by 4 to account for
# 4 lines per read in the input fastq. Cast to str so sps is happy
nsubsample_reads = str(data.hackersonly.max_raw_reads_sample * 4)

# Pushing ungzipped files through `cat` simplifies the code to handle
# both gz and flat files (so that both have 2 calls to sps.Popen) and the
# additional cat (rather than simply head) of the flat files should have
# identical performance.
isgzip = ".gz"
if r1_in.endswith(".gz"):
cmd_cat1 = ["gunzip", "-c", r1_in]
cmd_cat2 = ["gunzip", "-c", r2_in]
else:
cmd_cat1 = ["cat", r1_in]
cmd_cat2 = ["cat", r2_in]
isgzip = ""

# Catch only the first n reads
cmd_samp1 = ["head", "-n", nsubsample_reads]
cmd_samp2 = ["head", "-n", nsubsample_reads]

# write to new subsampled file handle (gzip happens downstream, if
# the input file was gzipped)
subsamp1 = os.path.join(
data.dirs.edits, sample.name + ".subsample_R1_.fq")
with open(subsamp1, 'w') as cout1:
# gzip/cat the data to a pipe
proc1 = sps.Popen(
cmd_cat1, stderr=sps.STDOUT, stdout=sps.PIPE)
# catch the data and peel off the first n reads
proc2 = sps.Popen(
cmd_samp1, stderr=sps.STDOUT, stdout=cout1, stdin=proc1.stdout)
res2 = proc2.communicate()[0]
if proc2.returncode:
raise IPyradError("error in: {} | {}, {}".format(cmd_cat1, cmd_samp1, res2))

# Only set conc2 if R2 actually exists
subsamp2 = 0
if "pair" in data.params.datatype:
subsamp2 = os.path.join(
data.dirs.edits, sample.name + ".subsample_R2_.fq")
with open(subsamp2, 'w') as cout2:
# gzip/cat the data to a pipe
proc1 = sps.Popen(
cmd_cat2, stderr=sps.STDOUT, stdout=sps.PIPE)
# catch the data and peel off the first n reads
proc2 = sps.Popen(
cmd_samp2, stderr=sps.STDOUT, stdout=cout2, stdin=proc1.stdout)
res2 = proc2.communicate()[0]
if proc2.returncode:
raise IPyradError("error in: {} | {}, {}".format(cmd_cat2, cmd_samp2, res2))

# Re-gzip the subsampled files to match status of the input file
if isgzip:
for infile in [subsamp1, subsamp2]:
if not infile:
# Don't try to gzip a file that doesn't exist
continue
cmd_gz1 = ["gzip", "-f", infile]
proc3 = sps.Popen(
cmd_gz1, stderr=sps.STDOUT)
res3 = proc3.communicate()[0]
if proc3.returncode:
raise IPyradError("error in: {}, {}".format(cmd_gz1, res3))
# Fix the suffix to agree with the file format
subsamp1 += ".gz"
if subsamp2: subsamp2 += ".gz"

# store new file handles (replaces concat files with subsampled files)
sample.files.concat = [(subsamp1, subsamp2)]
return sample.files.concat


# GLOBALS
NO_BARS_GBS_WARNING = """\
This is a just a warning:
Expand Down
8 changes: 8 additions & 0 deletions ipyrad/core/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def __init__(self):
("exclude_reference", True),
("trim_loci_min_sites", 4),
("mask_restriction_sites", False),
("max_raw_reads_sample", 0),
])

# pretty printing of object
Expand Down Expand Up @@ -177,6 +178,13 @@ def mask_restriction_sites(self):
def mask_restriction_sites(self, value):
self._data["mask_restriction_sites"] = bool(value)

@property
def max_raw_reads_sample(self):
return self._data["max_raw_reads_sample"]
@max_raw_reads_sample.setter
def max_raw_reads_sample(self, value):
self._data["max_raw_reads_sample"] = int(value)


class Params(object):
def __init__(self, data):
Expand Down
30 changes: 15 additions & 15 deletions newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@
"A00n = b.copy(name=\"A00-nodata\")\n",
"\n",
"## set params on the 'nodata' object to not use seq data\n",
"A00n.params.usedata = 0"
"A00n.kwargs[\"usedata\"] = 0"
]
},
{
Expand Down Expand Up @@ -3060,15 +3060,15 @@
"A10n = A00.copy(\"A10-nodata\")\n",
"\n",
"## set new params\n",
"A10.params.infer_sptree = 1\n",
"A10.params.infer_delimit = 0\n",
"A10.params.tauprior = (2, 200, 1)\n",
"A10n.params.infer_sptree = 1\n",
"A10n.params.infer_delimit = 0\n",
"A10n.params.tauprior = (2, 200, 1)\n",
"A10.kwargs[\"infer_sptree\"] = 1\n",
"A10.kwargs[\"infer_delimit\" = 0\n",
"A10.kwargs[\"tauprior\" = (2, 200, 1)\n",
"A10n.kwargs[\"infer_sptree\" = 1\n",
"A10n.kwargs[\"infer_delimit\" = 0\n",
"A10n.kwargs[\"tauprior\" = (2, 200, 1)\n",
"\n",
"## also set no-data on the 'n' run\n",
"A10n.params.usedata = 0"
"A10n.kwargs[\"usedata\" = 0"
]
},
{
Expand Down Expand Up @@ -14209,15 +14209,15 @@
"A01n = A00.copy(\"A01-nodata\")\n",
"\n",
"## set new params\n",
"A01.params.infer_sptree = 0\n",
"A01.params.infer_delimit = 1\n",
"A01.params.tauprior = (2, 200, 1)\n",
"A01n.params.infer_sptree = 0\n",
"A01n.params.infer_delimit = 1\n",
"A01n.params.tauprior = (2, 200, 1)\n",
"A01.kwargs[\"infer_sptree\" = 0\n",
"A01.kwargs[\"infer_delimit\" = 1\n",
"A01.kwargs[\"tauprior\" = (2, 200, 1)\n",
"A01n.kwargs[\"infer_sptree\" = 0\n",
"A01n.kwargs[\"infer_delimit\" = 1\n",
"A01n.kwargs[\"tauprior\" = (2, 200, 1)\n",
"\n",
"## also set no-data on the 'n' run\n",
"A01n.params.usedata = 0"
"A01n.kwargs[\"usedata\" = 0"
]
},
{
Expand Down
29 changes: 29 additions & 0 deletions newdocs/releasenotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,35 @@
Release Notes
=============

0.9.104
-------
**January 06, 2025**

- fix max_raw_reads_sample setter
- analysis.bpp - update docs to remove refs to bpp.params

0.9.103
-------
**December 17, 2024**

- Step 2 added _sample_max_raw_reads() and hackersonly.max_raw_reads_sample
- bugfix wex again
- Merge branch 'master' of github.com:dereneaton/ipyrad
- bugfix for wex sample stats for one scaff or concat
- Update 6-params.rst
- debugging sample stats in wex
- add func to view sample coverages after wex
- digest genomes simplified a bit
- window extracter write to fasta option added
- change default impute method temporarily
- fix the default pca impute_method
- add analysis.utils.read_popsfile function to load imap from file
- pca: fix impute_method logic so None means all 0s
- snps_extracter allow to pass imap samples as pd.Index (makes groupby pop assignment easier)
- pca impute_method for no imputation should be None and not False
- pca: warn if numpy > 2.0
- add muscle_version util

0.9.102
-------
**September 06, 2024**
Expand Down

0 comments on commit 6b6c329

Please sign in to comment.