Add test #2

Open · wants to merge 20 commits into main

18 changes: 9 additions & 9 deletions .github/workflows/qiita-plugin-ci.yml
@@ -39,7 +39,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
python-version: 3.11
python-version: 3.9

- name: Basic dependencies install
env:
@@ -55,7 +55,7 @@ jobs:

# pull out the port so we can modify the configuration file easily
pgport=${{ job.services.postgres.ports[5432] }}
sed -i "s/PORT = 5432/PORT = $pgport/" qiita_core/support_files/config_test.cfg
sed -i "s/PORT = 5432/PORT = $pgport/" qiita-dev/qiita_core/support_files/config_test.cfg

# PGPASSWORD is read by pg_restore, which is called by the build_db process.
export PGPASSWORD=postgres
@@ -79,17 +79,17 @@ jobs:
- name: Install Qiita plugins
shell: bash -l {0}
run: |
conda create --quiet --yes -n qp-dbbact python=3.11 pip
conda activate qp-deblur
# conda install --quiet --yes -c bioconda -c biocore "VSEARCH=2.7.0" MAFFT=7.310 SortMeRNA=2.0 fragment-insertion numpy==1.13.0 cython
conda create --quiet --yes -n qp-dbbact python=3.9 pip
conda activate qp-dbbact
conda install --quiet --yes -c bioconda -c biocore scikit-bio biom-format pandas wordcloud nose

export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
pip --quiet install -U pip
pip --quiet install .
pip --quiet install coveralls

configure_dbbact --env-script "source /home/runner/.profile; conda activate qp-deblur" --server-cert $QIITA_SERVER_CERT
configure_dbbact --env-script "source /home/runner/.profile; conda activate qp-dbbact" --server-cert $QIITA_SERVER_CERT

echo "Available Qiita plugins"
ls ~/.qiita_plugins/
@@ -129,15 +129,15 @@ jobs:
env:
COVER_PACKAGE: ${{ matrix.cover_package }}
run: |
conda activate qp-deblur
conda activate qp-dbbact
export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg

export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"

nosetests --with-doctest --with-coverage --cover-package=qp_dbbact

- uses: codecov/codecov-action@v1
- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: codecov.yml
@@ -156,4 +156,4 @@ jobs:
- name: lint
run: |
pip install -q flake8
flake8 qp_dbbact setup.py scripts support_files/*.py
flake8 qp_dbbact setup.py scripts
9 changes: 6 additions & 3 deletions qp_dbbact/__init__.py
@@ -6,14 +6,13 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

import os
import urllib.parse

from qiita_client import QiitaPlugin, QiitaCommand

from .dbbact import wordcloud_from_ASVs

__all__ = ['dbbact']
__all__ = ['wordcloud_from_ASVs']

# Initialize the plugin
plugin = QiitaPlugin(
@@ -25,9 +24,11 @@

req_params = {'deblur BIOM table': ('artifact', ['BIOM'])}
URL = urllib.parse.quote_plus('http://dbbact.org')
APIURL = urllib.parse.quote_plus('http://api.dbbact.org')

opt_params = {
'dbBact server URL': ['choice:["%s"]' % URL, URL],
'dbBact api URL': ['choice:["%s"]' % APIURL, APIURL],
'Minimum ASV sample occurence in feature-table': ['float', '0.333'],
'Wordcloud width': ['integer', '400'],
'Wordcloud height': ['integer', '200'],
@@ -39,6 +40,7 @@
dflt_param_set = {
'Defaults': {
'dbBact server URL': URL,
'dbBact api URL': APIURL,
'Minimum ASV sample occurence in feature-table': 0.333,
'Wordcloud width': 400,
'Wordcloud height': 200,
@@ -48,7 +50,8 @@
}
dbbact_wordcloud_cmd = QiitaCommand(
'Wordcloud from ASV sequences', # The command name
'Query for enriched terms in dbBact for a set of ASV sequences', # The command description
# The command description
'Query for enriched terms in dbBact for a set of ASV sequences',
wordcloud_from_ASVs, # function : callable
req_params, opt_params, outputs, dflt_param_set)
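
Side note on the URL parameters above: both URLs are percent-encoded with urllib.parse.quote_plus so they can be embedded verbatim in Qiita's choice:["..."] parameter syntax, and dbbact.py reverses this with unquote_plus before issuing any request. A minimal sketch of that round trip:

```python
import urllib.parse

# Encoded form is safe to embed inside the choice:["..."] parameter string.
URL = urllib.parse.quote_plus('http://dbbact.org')   # 'http%3A%2F%2Fdbbact.org'

# dbbact.py decodes it again before talking to the server.
assert urllib.parse.unquote_plus(URL) == 'http://dbbact.org'
```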

133 changes: 85 additions & 48 deletions qp_dbbact/dbbact.py
@@ -15,32 +15,33 @@
import datetime
import pandas as pd

from biom import Table, load_table
from biom import load_table

from qiita_client import ArtifactInfo
from qiita_client.util import system_call

import qp_dbbact

# copied from https://github.com/amnona/dbbact-calour/blob/
# f463fb52a56231ef68a0fb1cb200dceadec5c97b/dbbact_calour/dbbact.py#L2472
# smj 2024-03-06: replaced deprecated mpl.cm.get_cmap with
# matplotlib.colormaps[name]
def _get_color(word, font_size, position, orientation, font_path, random_state,
fscore, recall, precision, term_count):
'''Get the color for a wordcloud term based on the term_count and
higher/lower.


# copied from https://github.com/amnona/dbbact-calour/blob/f463fb52a56231ef68a0fb1cb200dceadec5c97b/dbbact_calour/dbbact.py#L2472
# smj 2024-03-06: replaced deprecated mpl.cm.get_cmap with matplotlib.colormaps[name]
def _get_color(word, font_size, position, orientation, font_path, random_state, fscore, recall, precision, term_count):
'''Get the color for a wordcloud term based on the term_count and higher/lower

If term starts with "-", it is lower in and colored red. otherwise colored blue
If we have term_count, we use it to color from close to white(low count) to full color (>=10 experiments)
If term starts with "-", it is lower in and colored red. otherwise colored
blue
If we have term_count, we use it to color from close to white(low count) to
full color (>=10 experiments)

Parameters
----------
fscores: dict of {term(str): fscore(float)}
between 0 and 1
recall: dict of {term(str): recall score(float)}, optional
precision: dict of {term(str): precision score(float)}, optional
term_count: dict of {term(str): number of experiments with term(float)}, optional
used to determine the color intensity

term_count: dict of {term(str): number of experiments with term(float)},
optional used to determine the color intensity

Returns
-------
@@ -64,6 +65,20 @@ def _get_color(word, font_size, position, orientation, font_path, random_state,
return '#%s%s%s' % (red, green, blue)

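Note: the body of _get_color is collapsed in this diff; only its docstring and the final hex-assembly return are visible. As the docstring describes, terms prefixed with "-" are drawn in red and all others in blue, fading toward white for terms backed by few experiments and saturating at 10 or more. A standalone sketch of that rule (illustrative only, not the upstream implementation):

```python
def sketch_term_color(word, term_count=None):
    # Illustration of the rule in _get_color's docstring: '-term' means
    # "lower in" and is drawn red, anything else blue; the color fades toward
    # white for rare terms and saturates at >= 10 supporting experiments.
    count = (term_count or {}).get(word.lstrip('-'), 10)
    fade = int(255 * (1 - min(count, 10) / 10))  # 255 = white, 0 = full color
    if word.startswith('-'):
        return '#%02x%02x%02x' % (255, fade, fade)   # shades of red
    return '#%02x%02x%02x' % (fade, fade, 255)       # shades of blue

sketch_term_color('feces')               # '#0000ff' (full blue)
sketch_term_color('-soil', {'soil': 2})  # '#ffcccc' (pale red)
```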

def render_wordcloud_png(wordcloud, fp_png: str, width: int, height: int,
DPI: int = 100):
fig = plt.figure(figsize=(width/DPI, height/DPI), dpi=DPI)
plt.imshow(wordcloud)
plt.axis("off")
fig.tight_layout()
fig.savefig(fp_png)


def render_wordcloud_svg(wordcloud, fp_svg: str):
with open(fp_svg, 'w') as SVG:
SVG.write(wordcloud.to_svg(embed_font=True))

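The two helpers above expect an already-generated WordCloud object (and assume matplotlib.pyplot is imported as plt elsewhere in the module; that import hunk is collapsed here). A quick usage sketch with invented scores:

```python
from wordcloud import WordCloud

# Hypothetical F-scores; the real ones come from dbBact's sequences_fscores
# endpoint, keyed by ontology term.
fscores = {'feces': 0.9, 'skin': 0.4, 'saliva': 0.2}
cloud = WordCloud(width=400, height=200).generate_from_frequencies(fscores)

render_wordcloud_png(cloud, 'wordcloud.png', width=400, height=200)
render_wordcloud_svg(cloud, 'wordcloud.svg')
```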

def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
"""Query for enriched terms in dbBact for a set of ASV sequences.

@@ -85,17 +100,22 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):

Notes
-----
The code will check if the artifact has a preprocessed_demux element, if
not it will use the preprocessed_fastq. We prefer to work with the
preprocessed_demux as running time will be greatly improved
The code first checks if the provided biom artifact was produced by deblur
to ensure that the features are actual DNA sequences. To double check, we
next test if a biom filepath is given. If so, the index is tested to
contain only A C G T letters, i.e. are ASV sequences. Depending on the
'Minimum ASV sample occurence in feature-table' parameter, a subset of
features is used to query F-scores against the dbBact server. The result
will be saved as a *.tsv file and rendered into a *.png and *.svg image.
We also obtain some stats from the server about database size and query
date and save that as stats.tsv for reference.
"""
NUM_STEPS = 4

# Step 1 check if the provided BIOM table is
# a) a result of deblur (can't do term enrichment for e.g. OTU IDs)
# b) the reference-hit and not the all table (as we don't need both)
# c) the target gene is 16S, 18S or ITS but nothing else
qclient.update_job_step(job_id, "Step 1 of %i: Collecting information" % NUM_STEPS)
# Step 1 collect information about BIOM artifact to check if expectations
# hold
qclient.update_job_step(
job_id, "Step 1 of %i: Collecting information" % NUM_STEPS)
artifact_id = parameters['deblur BIOM table']

# removing input from parameters so it's not part of the final command
@@ -105,9 +125,12 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)

# check artifact properties
# only accept BIOM artifact that were produced via deblur (but not by DADA2 - which is not in Qiita yet, or pick_closed_OTUs)
if artifact_info['name'] not in ['deblur final table', 'deblur reference hit table']:
error_msg = 'Currently, dbBact queries within Qiita are only possible for artifacts that have been produced via "deblur".'
# only accept BIOM artifact that were produced via deblur (but not by
# DADA2 - which is not in Qiita yet, or pick_closed_OTUs)
if artifact_info['name'] not in ['deblur final table',
'deblur reference hit table']:
error_msg = ('Currently, dbBact queries within Qiita are only possible'
' for artifacts that have been produced via "deblur".')
return False, None, error_msg

fps = {k: [vv['filepath'] for vv in v]
@@ -121,49 +144,63 @@ def wordcloud_from_ASVs(qclient, job_id, parameters, out_dir):
features = list(feature_table.ids(axis='observation'))
# check that all features are DNA sequences
if len(set(''.join(features)) - {'A', 'C', 'G', 'T'}) > 0:
error_msg = 'One or more ASV sequences contains at least one non-DNA character.'
error_msg = ('One or more ASV sequences contains at least one '
'non-DNA character.')
return False, None, error_msg
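
The check above validates every feature in one pass by joining all sequences into a single string and set-subtracting the DNA alphabet; anything left over is a non-DNA character. A toy illustration with made-up features:

```python
# All features are plain DNA -> nothing remains after subtracting the alphabet.
features = ['ACGT', 'TTGA']
assert set(''.join(features)) - {'A', 'C', 'G', 'T'} == set()

# An OTU ID slips in -> the leftover characters reveal the problem.
features = ['ACGT', 'OTU_17']
assert set(''.join(features)) - {'A', 'C', 'G', 'T'} == {'O', 'U', '_', '1', '7'}
```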

# only consider ASVs/features that at least occur in XXX of the samples (default: 1/3)
# only consider ASVs/features that at least occur in XXX of the
# samples (default: 1/3)
sel_features = [feature
for feature, occ in (feature_table.to_dataframe() > 0).sum(axis=1).items()
if occ >= feature_table.shape[1] * parameters['Minimum ASV sample occurence in feature-table']]
qclient.update_job_step(job_id, "Step 2 of %i: query %s with %i features (total was %i)" % (NUM_STEPS, parameters['dbBact server URL'], len(sel_features), len(features)))

dbbact = requests.get('%s/sequences_fscores' % urllib.parse.unquote_plus(parameters['dbBact server URL']), json={'sequences': sel_features})
for feature, occ
in (feature_table.to_dataframe() > 0).sum(axis=1).items()
if occ >= feature_table.shape[1] * parameters[
'Minimum ASV sample occurence in feature-table']]
qclient.update_job_step(
job_id,
"Step 2 of %i: query %s with %i features (total was %i)" % (
NUM_STEPS, parameters['dbBact server URL'],
len(sel_features), len(features)))

dbbact = requests.get('%s/sequences_fscores' % urllib.parse.unquote_plus(
parameters['dbBact server URL']), json={'sequences': sel_features})
if dbbact.status_code != 200:
return False, None, dbbact.content.decode('ascii')
return False, None, dbbact.content.decode("utf-8")
fscores = dbbact.json()
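
To make the selection rule a few lines up concrete: a feature is kept when the number of samples with a nonzero count for it reaches the configured fraction of all samples, and only the surviving sequences are sent to the sequences_fscores endpoint, which returns a term-to-F-score mapping. A toy sketch of the occurrence filter (table values invented):

```python
import pandas as pd

# Toy feature table: rows = ASVs, columns = samples.
df = pd.DataFrame({'s1': [5, 0, 1], 's2': [3, 2, 0], 's3': [4, 0, 0]},
                  index=['asv1', 'asv2', 'asv3'])
min_occ = 0.5                               # the plugin's default is 0.333
occ = (df > 0).sum(axis=1)                  # asv1: 3, asv2: 1, asv3: 1
kept = [f for f, o in occ.items() if o >= df.shape[1] * min_occ]
assert kept == ['asv1']                     # threshold = 3 samples * 0.5 = 1.5
```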

qclient.update_job_step(job_id, "Step 3 of %i: generate wordcloud" % (NUM_STEPS))
wc = WordCloud(width=parameters['Wordcloud width'], height=parameters['Wordcloud height'],
background_color=parameters['Wordcloud background color'], relative_scaling=parameters['Wordcloud relative scaling'],
qclient.update_job_step(
job_id, "Step 3 of %i: generate wordcloud" % (NUM_STEPS))
wc = WordCloud(width=parameters['Wordcloud width'],
height=parameters['Wordcloud height'],
background_color=parameters['Wordcloud background color'],
relative_scaling=parameters['Wordcloud relative scaling'],
stopwords=set(),
color_func=lambda *x, **y: _get_color(*x, **y, fscore=fscores, recall={}, precision={}, term_count={}))
color_func=lambda *x, **y: _get_color(
*x, **y, fscore=fscores, recall={},
precision={}, term_count={}))
cloud = wc.generate_from_frequencies(fscores)

qclient.update_job_step(job_id, "Step 4 of %i: render image" % (NUM_STEPS))
qclient.update_job_step(
job_id, "Step 4 of %i: render image" % (NUM_STEPS))
fp_png = join(out_dir, 'wordcloud.png')
render_wordcloud_png(cloud, fp_png, parameters['Wordcloud width'],
parameters['Wordcloud height'])
fp_svg = join(out_dir, 'wordcloud.svg')
fig = plt.figure()
plt.imshow(cloud)
plt.axis("off")
fig.tight_layout()
fig.savefig(fp_png)
fig.savefig(fp_svg)
render_wordcloud_svg(cloud, fp_svg)

# also save actual f-scores as table
fp_fscores = join(out_dir, "fscores.tsv")
pd.Series(fscores, name='F-score').to_csv(fp_fscores, sep="\t")
pd.Series(fscores, name='F-score').to_csv(
fp_fscores, sep="\t", index_label='term')

# obtain some stats from dbBact about database volume
dbbact_stats = requests.get('http://api.dbbact.org/stats/stats')
dbbact_stats = requests.get('%s/stats/stats' % urllib.parse.unquote_plus(
parameters['dbBact api URL']))
if dbbact_stats.status_code != 200:
return False, None, dbbact.content.decode('ascii')
return False, None, dbbact_stats.content.decode("utf-8")
dbbact_stats = dbbact_stats.json()['stats']
dbbact_stats['query_timestamp'] = str(datetime.datetime.now())
fp_stats = join(out_dir, "stats.tsv")
pd.Series(dbbact_stats).to_csv(fp_stats, sep="\t")
pd.Series(dbbact_stats).to_csv(fp_stats, sep="\t", header=None)
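
For reference, the stats handling boils down to the following (field names invented here, since the actual api.dbbact.org payload is not part of this diff):

```python
import datetime
import pandas as pd

# Hypothetical payload under the 'stats' key of the /stats/stats response.
dbbact_stats = {'NumSequences': 1000, 'NumAnnotations': 500}
dbbact_stats['query_timestamp'] = str(datetime.datetime.now())

# header=None yields bare key<TAB>value rows, one per statistic.
pd.Series(dbbact_stats).to_csv('stats.tsv', sep='\t', header=None)
```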

ainfo = [ArtifactInfo('dbBact wordcloud', 'BIOM',
[(fp_png, 'biom'),