Merge branch 'master' of github.com:dereneaton/ipyrad

dereneaton · Jan 7, 2025 · 6b6c329 · 6b6c329
2 parents 5947011 + b9e7070
commit 6b6c329
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 17 deletions.
diff --git a/ipyrad/__init__.py b/ipyrad/__init__.py
@@ -9,7 +9,7 @@
 import subprocess as _sps
 
 # Dunders
-__version__ = "0.9.102"
+__version__ = "0.9.104"
 __author__ = "Deren Eaton & Isaac Overcast"
 
 # CLI __main__ changes to 0

diff --git a/ipyrad/assemble/rawedit.py b/ipyrad/assemble/rawedit.py
@@ -366,6 +366,12 @@ def cutadaptit_single(data, sample):
     adapter filters. 
     """
     sname = sample.name
+
+    if data.hackersonly.max_raw_reads_sample > 0:
+        _sample_max_raw_reads(data, sample)
+
+    finput_r1 = sample.files.concat[0][0]
+
     # if (GBS, ddRAD) we look for the second cut site + adapter. For SE
     # data we don't bother trying to remove the second barcode since it's not
     # as critical as with PE data.
@@ -433,7 +439,7 @@ def cutadaptit_single(data, sample):
         "--trim-n", 
         "--output", os.path.join(
             data.dirs.edits, sname + ".trimmed_R1_.fastq.gz"),
-        sample.files.concat[0][0],
+        finput_r1,
         ]
 
     if int(data.params.filter_adapters):
@@ -486,6 +492,9 @@ def cutadaptit_pairs(data, sample):
     """
     sname = sample.name
 
+    if data.hackersonly.max_raw_reads_sample > 0:
+        _sample_max_raw_reads(data, sample)
+
     ## applied to read pairs
     finput_r1 = sample.files.concat[0][0]
     finput_r2 = sample.files.concat[0][1]
@@ -710,6 +719,90 @@ def concat_multiple_inputs(data, sample):
     return sample.files.concat
 
 
+## Called inside cutadaptit_single/cutadaptit_pairs prior to cutadapt
+def _sample_max_raw_reads(data, sample):
+    """
+    Subsample incoming raw reads to a given maximum value.
+    Only used if the hackersonly["max_raw_reads_sample"] > 0
+    """
+
+    # Grab indices for R1/R2 from post-concatenated input files
+    r1_in = sample.files.concat[0][0]
+    r2_in = sample.files.concat[0][1]
+
+    # subsample R1/R2 to a maximum value. Multiply by 4 to account for
+    # 4 lines per read in the input fastq. Cast to str so sps is happy
+    nsubsample_reads = str(data.hackersonly.max_raw_reads_sample * 4)
+
+    # Pushing ungzipped files through `cat` simplifies the code to handle
+    # both gz and flat files (so that both have 2 calls to sps.Popen) and the
+    # additional cat (rather than simply head) of the flat files should have
+    # identical performance.
+    isgzip = ".gz"
+    if r1_in.endswith(".gz"):
+        cmd_cat1 = ["gunzip", "-c", r1_in]
+        cmd_cat2 = ["gunzip", "-c", r2_in]
+    else:
+        cmd_cat1 = ["cat", r1_in]
+        cmd_cat2 = ["cat", r2_in]
+        isgzip = ""
+
+    # Catch only the first n reads
+    cmd_samp1 = ["head", "-n", nsubsample_reads]
+    cmd_samp2 = ["head", "-n", nsubsample_reads]
+
+    # write to new subsampled file handle (gzip happens downstream, if
+    # the input file was gzipped)
+    subsamp1 = os.path.join(
+        data.dirs.edits, sample.name + ".subsample_R1_.fq")
+    with open(subsamp1, 'w') as cout1:
+        # gzip/cat the data to a pipe
+        proc1 = sps.Popen(
+            cmd_cat1, stderr=sps.STDOUT, stdout=sps.PIPE)
+        # catch the data and peel off the first n reads
+        proc2 = sps.Popen(
+            cmd_samp1, stderr=sps.STDOUT, stdout=cout1, stdin=proc1.stdout)
+        res2 = proc2.communicate()[0]
+    if proc2.returncode:
+        raise IPyradError("error in: {} | {}, {}".format(cmd_cat1, cmd_samp1, res2))
+
+    # Only set conc2 if R2 actually exists
+    subsamp2 = 0
+    if "pair" in data.params.datatype:
+        subsamp2 = os.path.join(
+            data.dirs.edits, sample.name + ".subsample_R2_.fq")
+        with open(subsamp2, 'w') as cout2:
+            # gzip/cat the data to a pipe
+            proc1 = sps.Popen(
+                cmd_cat2, stderr=sps.STDOUT, stdout=sps.PIPE)
+            # catch the data and peel off the first n reads
+            proc2 = sps.Popen(
+                cmd_samp2, stderr=sps.STDOUT, stdout=cout2, stdin=proc1.stdout)
+            res2 = proc2.communicate()[0]
+        if proc2.returncode:
+            raise IPyradError("error in: {} | {}, {}".format(cmd_cat2, cmd_samp2, res2))
+
+    # Re-gzip the subsampled files to match status of the input file
+    if isgzip:
+        for infile in [subsamp1, subsamp2]:
+            if not infile:
+                # Don't try to gzip a file that doesn't exist
+                continue
+            cmd_gz1 = ["gzip", "-f", infile]
+            proc3 = sps.Popen(
+                cmd_gz1, stderr=sps.STDOUT)
+            res3 = proc3.communicate()[0]
+            if proc3.returncode:
+                raise IPyradError("error in: {}, {}".format(cmd_gz1, res3))
+        # Fix the suffix to agree with the file format
+        subsamp1 += ".gz"
+        if subsamp2: subsamp2 += ".gz"
+
+    # store new file handles (replaces concat files with subsampled files)
+    sample.files.concat = [(subsamp1, subsamp2)]
+    return sample.files.concat
+
+
 # GLOBALS
 NO_BARS_GBS_WARNING = """\
     This is a just a warning: 

diff --git a/ipyrad/core/params.py b/ipyrad/core/params.py
@@ -44,6 +44,7 @@ def __init__(self):
             ("exclude_reference", True),
             ("trim_loci_min_sites", 4),
             ("mask_restriction_sites", False),
+            ("max_raw_reads_sample", 0),
         ])
 
     # pretty printing of object
@@ -177,6 +178,13 @@ def mask_restriction_sites(self):
     def mask_restriction_sites(self, value):
         self._data["mask_restriction_sites"] = bool(value)
 
+    @property
+    def max_raw_reads_sample(self):
+        return self._data["max_raw_reads_sample"]
+    @max_raw_reads_sample.setter
+    def max_raw_reads_sample(self, value):
+        self._data["max_raw_reads_sample"] = int(value)
+
 
 class Params(object):
     def __init__(self, data):

diff --git a/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb b/newdocs/API-analysis/cookbook-bpp-species-delimitation.ipynb
@@ -981,7 +981,7 @@
     "A00n = b.copy(name=\"A00-nodata\")\n",
     "\n",
     "## set params on the 'nodata' object to not use seq data\n",
-    "A00n.params.usedata = 0"
+    "A00n.kwargs[\"usedata\"] = 0"
    ]
   },
   {
@@ -3060,15 +3060,15 @@
     "A10n = A00.copy(\"A10-nodata\")\n",
     "\n",
     "## set new params\n",
-    "A10.params.infer_sptree = 1\n",
-    "A10.params.infer_delimit = 0\n",
-    "A10.params.tauprior = (2, 200, 1)\n",
-    "A10n.params.infer_sptree = 1\n",
-    "A10n.params.infer_delimit = 0\n",
-    "A10n.params.tauprior = (2, 200, 1)\n",
+    "A10.kwargs[\"infer_sptree\"] = 1\n",
+    "A10.kwargs[\"infer_delimit\" = 0\n",
+    "A10.kwargs[\"tauprior\" = (2, 200, 1)\n",
+    "A10n.kwargs[\"infer_sptree\" = 1\n",
+    "A10n.kwargs[\"infer_delimit\" = 0\n",
+    "A10n.kwargs[\"tauprior\" = (2, 200, 1)\n",
     "\n",
     "## also set no-data on the 'n' run\n",
-    "A10n.params.usedata = 0"
+    "A10n.kwargs[\"usedata\" = 0"
    ]
   },
   {
@@ -14209,15 +14209,15 @@
     "A01n = A00.copy(\"A01-nodata\")\n",
     "\n",
     "## set new params\n",
-    "A01.params.infer_sptree = 0\n",
-    "A01.params.infer_delimit = 1\n",
-    "A01.params.tauprior = (2, 200, 1)\n",
-    "A01n.params.infer_sptree = 0\n",
-    "A01n.params.infer_delimit = 1\n",
-    "A01n.params.tauprior = (2, 200, 1)\n",
+    "A01.kwargs[\"infer_sptree\" = 0\n",
+    "A01.kwargs[\"infer_delimit\" = 1\n",
+    "A01.kwargs[\"tauprior\" = (2, 200, 1)\n",
+    "A01n.kwargs[\"infer_sptree\" = 0\n",
+    "A01n.kwargs[\"infer_delimit\" = 1\n",
+    "A01n.kwargs[\"tauprior\" = (2, 200, 1)\n",
     "\n",
     "## also set no-data on the 'n' run\n",
-    "A01n.params.usedata = 0"
+    "A01n.kwargs[\"usedata\" = 0"
    ]
   },
   {

diff --git a/newdocs/releasenotes.rst b/newdocs/releasenotes.rst
@@ -4,6 +4,35 @@
 Release Notes
 =============
 
+0.9.104
+-------
+**January 06, 2025**
+
+- fix max_raw_reads_sample setter
+- analysis.bpp - update docs to remove refs to bpp.params
+
+0.9.103
+-------
+**December 17, 2024**
+
+- Step 2 added _sample_max_raw_reads() and hackersonly.max_raw_reads_sample
+- bugfix wex again
+- Merge branch 'master' of github.com:dereneaton/ipyrad
+- bugfix for wex sample stats for one scaff or concat
+- Update 6-params.rst
+- debugging sample stats in wex
+- add func to view sample coverages after wex
+- digest genomes simplified a bit
+- window extracter write to fasta option added
+- change default impute method temporarily
+- fix the default pca impute_method
+- add analysis.utils.read_popsfile function to load imap from file
+- pca: fix impute_method logic so None means all 0s
+- snps_extracter allow to pass imap samples as pd.Index (makes groupby pop assignment easier)
+- pca impute_method for no imputation should be None and not False
+- pca: warn if numpy > 2.0
+- add muscle_version util
+
 0.9.102
 -------
 **September 06, 2024**