Add test #2

Open · wants to merge 20 commits into main

18 changes: 9 additions & 9 deletions .github/workflows/qiita-plugin-ci.yml
@@ -39,7 +39,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
python-version: 3.11
python-version: 3.9

- name: Basic dependencies install
env:
@@ -55,7 +55,7 @@ jobs:

# pull out the port so we can modify the configuration file easily
pgport=${{ job.services.postgres.ports[5432] }}
sed -i "s/PORT = 5432/PORT = $pgport/" qiita_core/support_files/config_test.cfg
sed -i "s/PORT = 5432/PORT = $pgport/" qiita-dev/qiita_core/support_files/config_test.cfg

# PGPASSWORD is read by pg_restore, which is called by the build_db process.
export PGPASSWORD=postgres
@@ -79,17 +79,17 @@ jobs:
- name: Install Qiita plugins
shell: bash -l {0}
run: |
conda create --quiet --yes -n qp-dbbact python=3.11 pip
conda activate qp-deblur
# conda install --quiet --yes -c bioconda -c biocore "VSEARCH=2.7.0" MAFFT=7.310 SortMeRNA=2.0 fragment-insertion numpy==1.13.0 cython
conda create --quiet --yes -n qp-dbbact python=3.9 pip
conda activate qp-dbbact
conda install --quiet --yes -c bioconda -c biocore scikit-bio biom-format pandas wordcloud nose

export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
pip --quiet install -U pip
pip --quiet install .
pip --quiet install coveralls

configure_dbbact --env-script "source /home/runner/.profile; conda activate qp-deblur" --server-cert $QIITA_SERVER_CERT
configure_dbbact --env-script "source /home/runner/.profile; conda activate qp-dbbact" --server-cert $QIITA_SERVER_CERT

echo "Available Qiita plugins"
ls ~/.qiita_plugins/
@@ -129,15 +129,15 @@ jobs:
env:
COVER_PACKAGE: ${{ matrix.cover_package }}
run: |
conda activate qp-deblur
conda activate qp-dbbact
export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg

export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"

nosetests --with-doctest --with-coverage --cover-package=qp_dbbact

- uses: codecov/codecov-action@v1
- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: codecov.yml
@@ -156,4 +156,4 @@ jobs:
- name: lint
run: |
pip install -q flake8
flake8 qp_dbbact setup.py scripts support_files/*.py
flake8 qp_dbbact setup.py scripts
9 changes: 6 additions & 3 deletions qp_dbbact/__init__.py
@@ -6,14 +6,13 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

import os
import urllib.parse

from qiita_client import QiitaPlugin, QiitaCommand

from .dbbact import wordcloud_from_ASVs

__all__ = ['dbbact']
__all__ = ['wordcloud_from_ASVs']

# Initialize the plugin
plugin = QiitaPlugin(
@@ -25,9 +24,11 @@

req_params = {'deblur BIOM table': ('artifact', ['BIOM'])}
URL = urllib.parse.quote_plus('http://dbbact.org')
APIURL = urllib.parse.quote_plus('http://api.dbbact.org')

opt_params = {
'dbBact server URL': ['choice:["%s"]' % URL, URL],
'dbBact api URL': ['choice:["%s"]' % APIURL, APIURL],
'Minimum ASV sample occurence in feature-table': ['float', '0.333'],
'Wordcloud width': ['integer', '400'],
'Wordcloud height': ['integer', '200'],
@@ -39,6 +40,7 @@
dflt_param_set = {
'Defaults': {
'dbBact server URL': URL,
'dbBact api URL': APIURL,
'Minimum ASV sample occurence in feature-table': 0.333,
'Wordcloud width': 400,
'Wordcloud height': 200,
@@ -48,7 +50,8 @@
}
dbbact_wordcloud_cmd = QiitaCommand(
'Wordcloud from ASV sequences', # The command name
'Query for enriched terms in dbBact for a set of ASV sequences', # The command description
# The command description
'Query for enriched terms in dbBact for a set of ASV sequences',
wordcloud_from_ASVs, # function : callable
req_params, opt_params, outputs, dflt_param_set)
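
Side note on the URL parameters above: both URLs are percent-encoded with urllib.parse.quote_plus so they can be embedded verbatim in Qiita's choice:["..."] parameter syntax, and dbbact.py reverses this with unquote_plus before issuing any request. A minimal sketch of that round trip:

```python
import urllib.parse

# Encoded form is safe to embed inside the choice:["..."] parameter string.
URL = urllib.parse.quote_plus('http://dbbact.org')   # 'http%3A%2F%2Fdbbact.org'

# dbbact.py decodes it again before talking to the server.
assert urllib.parse.unquote_plus(URL) == 'http://dbbact.org'
```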

133 changes: 85 additions & 48 deletions qp_dbbact/dbbact.py
@@ -15,32 +15,33 @@
import datetime
import pandas as pd

from biom import Table, load_table
from biom import load_table

from qiita_client import ArtifactInfo
from qiita_client.util import system_call

import qp_dbbact

# copied from https://github.com/amnona/dbbact-calour/blob/
# f463fb52a56231ef68a0fb1cb200dceadec5c97b/dbbact_calour/dbbact.py#L2472
# smj 2024-03-06: replaced deprecated mpl.cm.get_cmap with
# matplotlib.colormaps[name]
def _get_color(word, font_size, position, orientation, font_path, random_state,
fscore, recall, precision, term_count):
'''Get the color for a wordcloud term based on the term_count and
higher/lower.


# copied from https://github.com/amnona/dbbact-calour/blob/f463fb52a56231ef68a0fb1cb200dceadec5c97b/dbbact_calour/dbbact.py#L2472
# smj 2024-03-06: replaced deprecated mpl.cm.get_cmap with matplotlib.colormaps[name]
def _get_color(word, font_size, position, orientation, font_path, random_state, fscore, recall, precision, term_count):
'''Get the color for a wordcloud term based on the term_count and higher/lower

If term starts with "-", it is lower in and colored red. otherwise colored blue
If we have term_count, we use it to color from close to white(low count) to full color (>=10 experiments)
If term starts with "-", it is lower in and colored red. otherwise colored
blue
If we have term_count, we use it to color from close to white(low count) to
full color (>=10 experiments)

Parameters
----------
fscores: dict of {term(str): fscore(float)}
between 0 and 1
recall: dict of {term(str): recall score(float)}, optional
precision: dict of {term(str): precision score(float)}, optional
term_count: dict of {term(str): number of experiments with term(float)}, optional
used to determine the color intensity

term_count: dict of {term(str): number of experiments with term(float)},
optional used to determine the color intensity

Returns
-------
@@ -64,6 +65,20 @@ def _get_color(word, font_size, position, orientation, font_path, random_state,
return '#%s%s%s' % (red, green, blue)

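Note: the body of _get_color is collapsed in this diff; only its docstring and the final hex-assembly return are visible. As the docstring describes, terms prefixed with "-" are drawn in red and all others in blue, fading toward white for terms backed by few experiments and saturating at 10 or more. A standalone sketch of that rule (illustrative only, not the upstream implementation):

```python
def sketch_term_color(word, term_count=None):
    # Illustration of the rule in _get_color's docstring: '-term' means
    # "lower in" and is drawn red, anything else blue; the color fades toward
    # white for rare terms and saturates at >= 10 supporting experiments.
    count = (term_count or {}).get(word.lstrip('-'), 10)
    fade = int(255 * (1 - min(count, 10) / 10))  # 255 = white, 0 = full color
    if word.startswith('-'):
        return '#%02x%02x%02x' % (255, fade, fade)   # shades of red
    return '#%02x%02x%02x' % (fade, fade, 255)       # shades of blue

sketch_term_color('feces')               # '#0000ff' (full blue)
sketch_term_color('-soil', {'soil': 2})  # '#ffcccc' (pale red)
```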

def render_wordcloud_png(wordcloud, fp_png: str, width: int, height: int,
DPI: int = 100):
fig = plt.figure(figsize=(width/DPI, height/DPI), dpi=DPI)
plt.imshow(wordcloud)
plt.axis("off")
fig.tight_layout()
fig.savefig(fp_png)


def render_wordcloud_svg(wordcloud, fp_svg: str):
with open(fp_svg, 'w') as SVG:
SVG.write(wordcloud.to_svg(embed_font=True))

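The two helpers above expect an already-generated WordCloud object (and assume matplotlib.pyplot is imported as plt elsewhere in the module; that import hunk is collapsed here). A quick usage sketch with invented scores:

```python
from wordcloud import WordCloud

# Hypothetical F-scores; the real ones come from dbBact's sequences_fscores
# endpoint, keyed by ontology term.
fscores = {'feces': 0.9, 'skin': 0.4, 'saliva': 0.2}
cloud = WordCloud(width=400, height=200).generate_from_frequencies(fscores)

render_wordcloud_png(cloud, 'wordcloud.png', width=400, height=200)
render_wordcloud_svg(cloud, 'wordcloud.svg')
```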

def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
"""Query for enriched terms in dbBact for a set of ASV sequences.

@@ -85,17 +100,22 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):

Notes
-----
The code will check if the artifact has a preprocessed_demux element, if
not it will use the preprocessed_fastq. We prefer to work with the
preprocessed_demux as running time will be greatly improved
The code first checks if the provided biom artifact was produced by deblur
to ensure that the features are actual DNA sequences. To double check, we
next test if a biom filepath is given. If so, the index is tested to
contain only A C G T letters, i.e. are ASV sequences. Depending on the
'Minimum ASV sample occurence in feature-table' parameter, a subset of
features is used to query F-scores against the dbBact server. The result
will be saved as a *.tsv file and rendered into a *.png and *.svg image.
We also obtain some stats from the server about database size and query
date and save that as stats.tsv for reference.
"""
NUM_STEPS = 4

# Step 1 check if the provided BIOM table is
# a) a result of deblur (can't do term enrichment for e.g. OTU IDs)
# b) the reference-hit and not the all table (as we don't need both)
# c) the target gene is 16S, 18S or ITS but nothing else
qclient.update_job_step(job_id, "Step 1 of %i: Collecting information" % NUM_STEPS)
# Step 1 collect information about BIOM artifact to check if expectations
# hold
qclient.update_job_step(
job_id, "Step 1 of %i: Collecting information" % NUM_STEPS)
artifact_id = parameters['deblur BIOM table']

# removing input from parameters so it's not part of the final command
@@ -105,9 +125,12 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)

# check artifact properties
# only accept BIOM artifact that were produced via deblur (but not by DADA2 - which is not in Qiita yet, or pick_closed_OTUs)
if artifact_info['name'] not in ['deblur final table', 'deblur reference hit table']:
error_msg = 'Currently, dbBact queries within Qiita are only possible for artifacts that have been produced via "deblur".'
# only accept BIOM artifact that were produced via deblur (but not by
# DADA2 - which is not in Qiita yet, or pick_closed_OTUs)
if artifact_info['name'] not in ['deblur final table',
'deblur reference hit table']:
error_msg = ('Currently, dbBact queries within Qiita are only possible'
' for artifacts that have been produced via "deblur".')
return False, None, error_msg

fps = {k: [vv['filepath'] for vv in v]
@@ -121,49 +144,63 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
features = list(feature_table.ids(axis='observation'))
# check that all features are DNA sequences
if len(set(''.join(features)) - {'A', 'C', 'G', 'T'}) > 0:
error_msg = 'One or more ASV sequences contains at least one non-DNA character.'
error_msg = ('One or more ASV sequences contains at least one '
'non-DNA character.')
return False, None, error_msg
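
The check above validates every feature in one pass by joining all sequences into a single string and set-subtracting the DNA alphabet; anything left over is a non-DNA character. A toy illustration with made-up features:

```python
# All features are plain DNA -> nothing remains after subtracting the alphabet.
features = ['ACGT', 'TTGA']
assert set(''.join(features)) - {'A', 'C', 'G', 'T'} == set()

# An OTU ID slips in -> the leftover characters reveal the problem.
features = ['ACGT', 'OTU_17']
assert set(''.join(features)) - {'A', 'C', 'G', 'T'} == {'O', 'U', '_', '1', '7'}
```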

# only consider ASVs/features that at least occur in XXX of the samples (default: 1/3)
# only consider ASVs/features that at least occur in XXX of the
# samples (default: 1/3)
sel_features = [feature
for feature, occ in (feature_table.to_dataframe() > 0).sum(axis=1).items()
if occ >= feature_table.shape[1] * parameters['Minimum ASV sample occurence in feature-table']]
qclient.update_job_step(job_id, "Step 2 of %i: query %s with %i features (total was %i)" % (NUM_STEPS, parameters['dbBact server URL'], len(sel_features), len(features)))

dbbact = requests.get('%s/sequences_fscores' % urllib.parse.unquote_plus(parameters['dbBact server URL']), json={'sequences': sel_features})
for feature, occ
in (feature_table.to_dataframe() > 0).sum(axis=1).items()
if occ >= feature_table.shape[1] * parameters[
'Minimum ASV sample occurence in feature-table']]
qclient.update_job_step(
job_id,
"Step 2 of %i: query %s with %i features (total was %i)" % (
NUM_STEPS, parameters['dbBact server URL'],
len(sel_features), len(features)))

dbbact = requests.get('%s/sequences_fscores' % urllib.parse.unquote_plus(
parameters['dbBact server URL']), json={'sequences': sel_features})
if dbbact.status_code != 200:
return False, None, dbbact.content.decode('ascii')
return False, None, dbbact.content.decode("utf-8")
fscores = dbbact.json()
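
To make the selection rule a few lines up concrete: a feature is kept when the number of samples with a nonzero count for it reaches the configured fraction of all samples, and only the surviving sequences are sent to the sequences_fscores endpoint, which returns a term-to-F-score mapping. A toy sketch of the occurrence filter (table values invented):

```python
import pandas as pd

# Toy feature table: rows = ASVs, columns = samples.
df = pd.DataFrame({'s1': [5, 0, 1], 's2': [3, 2, 0], 's3': [4, 0, 0]},
                  index=['asv1', 'asv2', 'asv3'])
min_occ = 0.5                               # the plugin's default is 0.333
occ = (df > 0).sum(axis=1)                  # asv1: 3, asv2: 1, asv3: 1
kept = [f for f, o in occ.items() if o >= df.shape[1] * min_occ]
assert kept == ['asv1']                     # threshold = 3 samples * 0.5 = 1.5
```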

qclient.update_job_step(job_id, "Step 3 of %i: generate wordcloud" % (NUM_STEPS))
wc = WordCloud(width=parameters['Wordcloud width'], height=parameters['Wordcloud height'],
background_color=parameters['Wordcloud background color'], relative_scaling=parameters['Wordcloud relative scaling'],
qclient.update_job_step(
job_id, "Step 3 of %i: generate wordcloud" % (NUM_STEPS))
wc = WordCloud(width=parameters['Wordcloud width'],
height=parameters['Wordcloud height'],
background_color=parameters['Wordcloud background color'],
relative_scaling=parameters['Wordcloud relative scaling'],
stopwords=set(),
color_func=lambda *x, **y: _get_color(*x, **y, fscore=fscores, recall={}, precision={}, term_count={}))
color_func=lambda *x, **y: _get_color(
*x, **y, fscore=fscores, recall={},
precision={}, term_count={}))
cloud = wc.generate_from_frequencies(fscores)

qclient.update_job_step(job_id, "Step 4 of %i: render image" % (NUM_STEPS))
qclient.update_job_step(
job_id, "Step 4 of %i: render image" % (NUM_STEPS))
fp_png = join(out_dir, 'wordcloud.png')
render_wordcloud_png(cloud, fp_png, parameters['Wordcloud width'],
parameters['Wordcloud height'])
fp_svg = join(out_dir, 'wordcloud.svg')
fig = plt.figure()
plt.imshow(cloud)
plt.axis("off")
fig.tight_layout()
fig.savefig(fp_png)
fig.savefig(fp_svg)
render_wordcloud_svg(cloud, fp_svg)

# also save actual f-scores as table
fp_fscores = join(out_dir, "fscores.tsv")
pd.Series(fscores, name='F-score').to_csv(fp_fscores, sep="\t")
pd.Series(fscores, name='F-score').to_csv(
fp_fscores, sep="\t", index_label='term')

# obtain some stats from dbBact about database volume
dbbact_stats = requests.get('http://api.dbbact.org/stats/stats')
dbbact_stats = requests.get('%s/stats/stats' % urllib.parse.unquote_plus(
parameters['dbBact api URL']))
if dbbact_stats.status_code != 200:
return False, None, dbbact.content.decode('ascii')
return False, None, dbbact_stats.content.decode("utf-8")
dbbact_stats = dbbact_stats.json()['stats']
dbbact_stats['query_timestamp'] = str(datetime.datetime.now())
fp_stats = join(out_dir, "stats.tsv")
pd.Series(dbbact_stats).to_csv(fp_stats, sep="\t")
pd.Series(dbbact_stats).to_csv(fp_stats, sep="\t", header=None)
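
For reference, the stats handling boils down to the following (field names invented here, since the actual api.dbbact.org payload is not part of this diff):

```python
import datetime
import pandas as pd

# Hypothetical payload under the 'stats' key of the /stats/stats response.
dbbact_stats = {'NumSequences': 1000, 'NumAnnotations': 500}
dbbact_stats['query_timestamp'] = str(datetime.datetime.now())

# header=None yields bare key<TAB>value rows, one per statistic.
pd.Series(dbbact_stats).to_csv('stats.tsv', sep='\t', header=None)
```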

ainfo = [ArtifactInfo('dbBact wordcloud', 'BIOM',
[(fp_png, 'biom'),