diff --git a/ipyrad/__init__.py b/ipyrad/__init__.py index cb65a5a1..3fd9f8c2 100644 --- a/ipyrad/__init__.py +++ b/ipyrad/__init__.py @@ -9,7 +9,7 @@ import subprocess as _sps # Dunders -__version__ = "0.9.102" +__version__ = "0.9.104" __author__ = "Deren Eaton & Isaac Overcast" # CLI __main__ changes to 0 diff --git a/ipyrad/assemble/rawedit.py b/ipyrad/assemble/rawedit.py index 9b6672e0..e199cb0c 100644 --- a/ipyrad/assemble/rawedit.py +++ b/ipyrad/assemble/rawedit.py @@ -366,6 +366,12 @@ def cutadaptit_single(data, sample): adapter filters. """ sname = sample.name + + if data.hackersonly.max_raw_reads_sample > 0: + _sample_max_raw_reads(data, sample) + + finput_r1 = sample.files.concat[0][0] + # if (GBS, ddRAD) we look for the second cut site + adapter. For SE # data we don't bother trying to remove the second barcode since it's not # as critical as with PE data. @@ -433,7 +439,7 @@ def cutadaptit_single(data, sample): "--trim-n", "--output", os.path.join( data.dirs.edits, sname + ".trimmed_R1_.fastq.gz"), - sample.files.concat[0][0], + finput_r1, ] if int(data.params.filter_adapters): @@ -486,6 +492,9 @@ def cutadaptit_pairs(data, sample): """ sname = sample.name + if data.hackersonly.max_raw_reads_sample > 0: + _sample_max_raw_reads(data, sample) + ## applied to read pairs finput_r1 = sample.files.concat[0][0] finput_r2 = sample.files.concat[0][1] @@ -710,6 +719,90 @@ def concat_multiple_inputs(data, sample): return sample.files.concat +## Called inside cutadaptit_single/cutadaptit_pairs prior to cutadapt +def _sample_max_raw_reads(data, sample): + """ + Subsample incoming raw reads to a given maximum value. + Only used if the hackersonly["max_raw_reads_sample"] > 0 + """ + + # Grab indices for R1/R2 from post-concatenated input files + r1_in = sample.files.concat[0][0] + r2_in = sample.files.concat[0][1] + + # subsample R1/R2 to a maximum value. Multiply by 4 to account for + # 4 lines per read in the input fastq. Cast to str so sps is happy + nsubsample_reads = str(data.hackersonly.max_raw_reads_sample * 4) + + # Pushing ungzipped files through `cat` simplifies the code to handle + # both gz and flat files (so that both have 2 calls to sps.Popen) and the + # additional cat (rather than simply head) of the flat files should have + # identical performance. + isgzip = ".gz" + if r1_in.endswith(".gz"): + cmd_cat1 = ["gunzip", "-c", r1_in] + cmd_cat2 = ["gunzip", "-c", r2_in] + else: + cmd_cat1 = ["cat", r1_in] + cmd_cat2 = ["cat", r2_in] + isgzip = "" + + # Catch only the first n reads + cmd_samp1 = ["head", "-n", nsubsample_reads] + cmd_samp2 = ["head", "-n", nsubsample_reads] + + # write to new subsampled file handle (gzip happens downstream, if + # the input file was gzipped) + subsamp1 = os.path.join( + data.dirs.edits, sample.name + ".subsample_R1_.fq") + with open(subsamp1, 'w') as cout1: + # gzip/cat the data to a pipe + proc1 = sps.Popen( + cmd_cat1, stderr=sps.STDOUT, stdout=sps.PIPE) + # catch the data and peel off the first n reads + proc2 = sps.Popen( + cmd_samp1, stderr=sps.STDOUT, stdout=cout1, stdin=proc1.stdout) + res2 = proc2.communicate()[0] + if proc2.returncode: + raise IPyradError("error in: {} | {}, {}".format(cmd_cat1, cmd_samp1, res2)) + + # Only set conc2 if R2 actually exists + subsamp2 = 0 + if "pair" in data.params.datatype: + subsamp2 = os.path.join( + data.dirs.edits, sample.name + ".subsample_R2_.fq") + with open(subsamp2, 'w') as cout2: + # gzip/cat the data to a pipe + proc1 = sps.Popen( + cmd_cat2, stderr=sps.STDOUT, stdout=sps.PIPE) + # catch the data and peel off the first n reads + proc2 = sps.Popen( + cmd_samp2, stderr=sps.STDOUT, stdout=cout2, stdin=proc1.stdout) + res2 = proc2.communicate()[0] + if proc2.returncode: + raise IPyradError("error in: {} | {}, {}".format(cmd_cat2, cmd_samp2, res2)) + + # Re-gzip the subsampled files to match status of the input file + if isgzip: + for infile in [subsamp1, subsamp2]: + if not infile: + # Don't try to gzip a file that doesn't exist + continue + cmd_gz1 = ["gzip", "-f", infile] + proc3 = sps.Popen( + cmd_gz1, stderr=sps.STDOUT) + res3 = proc3.communicate()[0] + if proc3.returncode: + raise IPyradError("error in: {}, {}".format(cmd_gz1, res3)) + # Fix the suffix to agree with the file format + subsamp1 += ".gz" + if subsamp2: subsamp2 += ".gz" + + # store new file handles (replaces concat files with subsampled files) + sample.files.concat = [(subsamp1, subsamp2)] + return sample.files.concat + + # GLOBALS NO_BARS_GBS_WARNING = """\ This is a just a warning: diff --git a/ipyrad/core/params.py b/ipyrad/core/params.py index e6ae2cf1..4becf5f0 100644 --- a/ipyrad/core/params.py +++ b/ipyrad/core/params.py @@ -44,6 +44,7 @@ def __init__(self): ("exclude_reference", True), ("trim_loci_min_sites", 4), ("mask_restriction_sites", False), + ("max_raw_reads_sample", 0), ]) # pretty printing of object @@ -177,6 +178,13 @@ def mask_restriction_sites(self): def mask_restriction_sites(self, value): self._data["mask_restriction_sites"] = bool(value) + @property + def max_raw_reads_sample(self): + return self._data["max_raw_reads_sample"] + @max_raw_reads_sample.setter + def max_raw_reads_sample(self, value): + self._data["max_raw_reads_sample"] = int(value) + class Params(object): def __init__(self, data): diff --git a/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb b/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb index 6e6773b3..72f3d124 100644 --- a/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb +++ b/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb @@ -981,7 +981,7 @@ "A00n = b.copy(name=\"A00-nodata\")\n", "\n", "## set params on the 'nodata' object to not use seq data\n", - "A00n.params.usedata = 0" + "A00n.kwargs[\"usedata\"] = 0" ] }, { @@ -3060,15 +3060,15 @@ "A10n = A00.copy(\"A10-nodata\")\n", "\n", "## set new params\n", - "A10.params.infer_sptree = 1\n", - "A10.params.infer_delimit = 0\n", - "A10.params.tauprior = (2, 200, 1)\n", - "A10n.params.infer_sptree = 1\n", - "A10n.params.infer_delimit = 0\n", - "A10n.params.tauprior = (2, 200, 1)\n", + "A10.kwargs[\"infer_sptree\"] = 1\n", + "A10.kwargs[\"infer_delimit\" = 0\n", + "A10.kwargs[\"tauprior\" = (2, 200, 1)\n", + "A10n.kwargs[\"infer_sptree\" = 1\n", + "A10n.kwargs[\"infer_delimit\" = 0\n", + "A10n.kwargs[\"tauprior\" = (2, 200, 1)\n", "\n", "## also set no-data on the 'n' run\n", - "A10n.params.usedata = 0" + "A10n.kwargs[\"usedata\" = 0" ] }, { @@ -14209,15 +14209,15 @@ "A01n = A00.copy(\"A01-nodata\")\n", "\n", "## set new params\n", - "A01.params.infer_sptree = 0\n", - "A01.params.infer_delimit = 1\n", - "A01.params.tauprior = (2, 200, 1)\n", - "A01n.params.infer_sptree = 0\n", - "A01n.params.infer_delimit = 1\n", - "A01n.params.tauprior = (2, 200, 1)\n", + "A01.kwargs[\"infer_sptree\" = 0\n", + "A01.kwargs[\"infer_delimit\" = 1\n", + "A01.kwargs[\"tauprior\" = (2, 200, 1)\n", + "A01n.kwargs[\"infer_sptree\" = 0\n", + "A01n.kwargs[\"infer_delimit\" = 1\n", + "A01n.kwargs[\"tauprior\" = (2, 200, 1)\n", "\n", "## also set no-data on the 'n' run\n", - "A01n.params.usedata = 0" + "A01n.kwargs[\"usedata\" = 0" ] }, { diff --git a/newdocs/releasenotes.rst b/newdocs/releasenotes.rst index 0e991e70..33726552 100644 --- a/newdocs/releasenotes.rst +++ b/newdocs/releasenotes.rst @@ -4,6 +4,35 @@ Release Notes ============= +0.9.104 +------- +**January 06, 2025** + +- fix max_raw_reads_sample setter +- analysis.bpp - update docs to remove refs to bpp.params + +0.9.103 +------- +**December 17, 2024** + +- Step 2 added _sample_max_raw_reads() and hackersonly.max_raw_reads_sample +- bugfix wex again +- Merge branch 'master' of github.com:dereneaton/ipyrad +- bugfix for wex sample stats for one scaff or concat +- Update 6-params.rst +- debugging sample stats in wex +- add func to view sample coverages after wex +- digest genomes simplified a bit +- window extracter write to fasta option added +- change default impute method temporarily +- fix the default pca impute_method +- add analysis.utils.read_popsfile function to load imap from file +- pca: fix impute_method logic so None means all 0s +- snps_extracter allow to pass imap samples as pd.Index (makes groupby pop assignment easier) +- pca impute_method for no imputation should be None and not False +- pca: warn if numpy > 2.0 +- add muscle_version util + 0.9.102 ------- **September 06, 2024**