diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 387f461..c1bd12b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ********* +1.12.0 (2022-02-11) +------------------- + +* :issue:`33`: Update the :command:`make-manifest` command to ignore undetermined FASTQ files. +* :issue:`34`: Update the :meth:`alpha_rarefaction_plot` method to keep 'N/A' values as string instead of NaN. +* :issue:`35`: Update the methods :meth:`alpha_diversity_plot`, :meth:`beta_2d_plot`, and :meth:`beta_3d_plot` to accept :class:`pandas.DataFrame` in case the input data was not generated from QIIME 2 (e.g. shotgun sequencing). +* Update the methods :meth:`beta_2d_plot` and :meth:`beta_3d_plot` to print out the proportions explained instead of embedding them in the PCoA plot. + 1.11.0 (2021-08-04) ------------------- diff --git a/README.rst b/README.rst index 1eab893..e149240 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,25 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome. | Email: sbstevenlee@gmail.com | License: MIT License +Citation +======== + +If you use Dokdo in a published analysis, please report the program version +and the GitHub repository (https://github.com/sbslee/dokdo). If applicable, +please also consider citing the following preprint in which Dokdo was first +described: + +JY Park, H Yoon, S Lee, et al. Do Different Samples From Pregnant Women and +Their Neonates Share the Common Microbiome: A Prospective Cohort Study. +November 29th, 2021. Research Square. +doi: https://doi.org/10.21203/rs.3.rs-1062191/v1. + +Below is an incomplete list of publications which have used Dokdo: + +- `Silva de Carvalho et al., 2022 `__ in *Brain, Behavior, and Immunity*: Post-ischemic protein restriction induces sustained neuroprotection, neurological recovery, brain remodeling, and gut microbiota rebalancing. +- `Park et al., 2021 `__ in *Scientific Reports*: Effect of black ginseng and silkworm supplementation on obesity, the transcriptome, and the gut microbiome of diet-induced overweight dogs. +- `Reinold et al., 2021 `__ in *Frontiers in Cellular and Infection Microbiology*: A Pro-Inflammatory Gut Microbiome Characterizes SARS-CoV-2 Infected Patients and a Reduction in the Connectivity of an Anti-Inflammatory Bacterial Network Associates With Severe COVID-19. + CLI Examples ============ diff --git a/docs/images/beta_2d_plot-1.png b/docs/images/beta_2d_plot-1.png index 906be8a..e451fdd 100644 Binary files a/docs/images/beta_2d_plot-1.png and b/docs/images/beta_2d_plot-1.png differ diff --git a/docs/images/beta_2d_plot-2.png b/docs/images/beta_2d_plot-2.png index 386204e..4ed02a8 100644 Binary files a/docs/images/beta_2d_plot-2.png and b/docs/images/beta_2d_plot-2.png differ diff --git a/docs/images/beta_3d_plot-1.png b/docs/images/beta_3d_plot-1.png index e328f42..ec0425f 100644 Binary files a/docs/images/beta_3d_plot-1.png and b/docs/images/beta_3d_plot-1.png differ diff --git a/docs/images/beta_3d_plot-2.png b/docs/images/beta_3d_plot-2.png index a837401..dd2e493 100644 Binary files a/docs/images/beta_3d_plot-2.png and b/docs/images/beta_3d_plot-2.png differ diff --git a/dokdo/__main__.py b/dokdo/__main__.py index 84f9a82..dc42dd8 100644 --- a/dokdo/__main__.py +++ b/dokdo/__main__.py @@ -82,14 +82,13 @@ def main(): add_help=False, help=("Create a manifest file (.tsv) from a directory containing " "FASTQ files."), - description=("Create a manifest file (.tsv) from a directory " - "containing FASTQ files. The file names must include " - "either '_R1_001.fastq' or '_R1_002.fastq'. The word " - "before the third-to-last underscore will be set " - "as the sample ID. For example, a file named " - "'EXAMPLE_S1_R1_001.fastq.gz' will produce 'EXAMPLE' " - "as sample ID and 'EXAM_PLE_S1_R1_001.fastq.gz', " - "'EXAM_PLE'."), + description=("Create a manifest file (.tsv) from a directory containing FASTQ files. " + "This command assumes that FASTQ filenames end with a suffix such as " + "'_S0_R1_001.fastq' or '_S14_R2_001.fastq'. The word before the " + "third-to-last underscore ('_') will be used as sample ID. For example, a " + "file named 'EXAMPLE_S1_R1_001.fastq.gz' will set 'EXAMPLE' as sample ID. " + "Undertermined reads (e.g. 'Undetermined_S0_R1_001.fastq') will not be " + "included in the output file."), ) make_manifest_parser._optionals.title = "Arguments" diff --git a/dokdo/api/alpha_diversity_plot.py b/dokdo/api/alpha_diversity_plot.py index 1b3fd28..a36cb19 100644 --- a/dokdo/api/alpha_diversity_plot.py +++ b/dokdo/api/alpha_diversity_plot.py @@ -22,9 +22,13 @@ def alpha_diversity_plot( Parameters ---------- - artifact : str or qiime2.Artifact + artifact : str, qiime2.Artifact, or pandas.DataFrame Artifact file or object from the q2-diversity plugin with the - semantic type ``SampleData[AlphaDiversity]``. + semantic type ``SampleData[AlphaDiversity]``. If you are importing + data from a software tool other than QIIME 2, then you can provide a + :class:`pandas.DataFrame` object in which the row index is sample + names and the only column is diversity values with its header being + the name of metrics used (e.g. 'faith_pd'). metadata : str or qiime2.Metadata Metadata file or object. where : str @@ -65,10 +69,12 @@ def alpha_diversity_plot( """ if isinstance(artifact, str): _alpha_diversity = Artifact.load(artifact) + df = _alpha_diversity.view(pd.Series).to_frame() + elif isinstance(artifact, pd.DataFrame): + df = artifact else: _alpha_diversity = artifact - - df = _alpha_diversity.view(pd.Series).to_frame() + df = _alpha_diversity.view(pd.Series).to_frame() mf = common.get_mf(metadata) df = pd.concat([df, mf], axis=1, join='inner') diff --git a/dokdo/api/alpha_rarefaction_plot.py b/dokdo/api/alpha_rarefaction_plot.py index ef01eec..18938aa 100644 --- a/dokdo/api/alpha_rarefaction_plot.py +++ b/dokdo/api/alpha_rarefaction_plot.py @@ -103,7 +103,9 @@ def alpha_rarefaction_plot( with tempfile.TemporaryDirectory() as t: common.export(visualization, t) - df = pd.read_csv(f'{t}/{metric}.csv', index_col=0) + df = pd.read_csv(f'{t}/{metric}.csv', index_col=0, + keep_default_na=False, na_values=['']) + df.to_csv('test.csv') metadata_columns = [x for x in df.columns if 'iter' not in x] diff --git a/dokdo/api/beta_2d_plot.py b/dokdo/api/beta_2d_plot.py index 611dcc7..aef9f9b 100644 --- a/dokdo/api/beta_2d_plot.py +++ b/dokdo/api/beta_2d_plot.py @@ -24,10 +24,14 @@ def beta_2d_plot( Parameters ---------- - artifact : str or qiime2.Artifact + artifact : str, qiime2.Artifact, or pandas.DataFrame Artifact file or object from the q2-diversity plugin with the semantic type ``PCoAResults`` or - ``PCoAResults % Properties('biplot')``. + ``PCoAResults % Properties('biplot')``. If you are importing + data from a software tool other than QIIME 2, then you can provide a + :class:`pandas.DataFrame` object in which the row index is sample + names and the first and second columns indicate the first and second + PCoA axes, respectively. metadata : str or qiime2.Metadata, optional Metadata file or object. hue : str, optional @@ -81,6 +85,12 @@ def beta_2d_plot( figsize=(5, 5)) plt.tight_layout() + .. code-block:: text + + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + .. image:: images/beta_2d_plot-1.png We can color the datapoints with ``hue``. We can also change the @@ -120,43 +130,52 @@ def beta_2d_plot( ax.legend(loc='upper left') plt.tight_layout() + .. code-block:: text + + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + .. image:: images/beta_2d_plot-2.png """ - if isinstance(artifact, str): - _pcoa_results = Artifact.load(artifact) + if isinstance(artifact, pd.DataFrame): + df = artifact else: - _pcoa_results = artifact - - ordination_results = _pcoa_results.view(OrdinationResults) - - df1 = ordination_results.samples.iloc[:, :2] - df1.columns = ['A1', 'A2'] + if isinstance(artifact, str): + _pcoa_results = Artifact.load(artifact) + else: + _pcoa_results = artifact + ordination_results = _pcoa_results.view(OrdinationResults) + df = ordination_results.samples.iloc[:, :2] + props = ordination_results.proportion_explained + print('# Explained proportions computed by QIIME 2:') + print(f'# {props[0]*100:.2f}% for Axis 1') + print(f'# {props[1]*100:.2f}% for Axis 2') + + df.columns = ['Axis 1', 'Axis 2'] if metadata is None: - df2 = df1 + pass else: mf = common.get_mf(metadata) - df2 = pd.concat([df1, mf], axis=1, join='inner') - - props = ordination_results.proportion_explained + df = pd.concat([df, mf], axis=1, join='inner') if ax is None: fig, ax = plt.subplots(figsize=figsize) - sns.scatterplot(data=df2, - x='A1', - y='A2', - hue=hue, - hue_order=hue_order, - style=style, - style_order=style_order, - size=size, - ax=ax, - s=s, - alpha=alpha, - legend=legend) - - ax.set_xlabel(f'Axis 1 ({props[0]*100:.2f} %)') - ax.set_ylabel(f'Axis 2 ({props[1]*100:.2f} %)') + sns.scatterplot( + x='Axis 1', y='Axis 2', data=df, hue=hue, hue_order=hue_order, + style=style, style_order=style_order, size=size, ax=ax, + s=s, alpha=alpha, legend=legend + ) return ax diff --git a/dokdo/api/beta_3d_plot.py b/dokdo/api/beta_3d_plot.py index ec3ed91..7412642 100644 --- a/dokdo/api/beta_3d_plot.py +++ b/dokdo/api/beta_3d_plot.py @@ -25,7 +25,11 @@ def beta_3d_plot( artifact : str or qiime2.Artifact Artifact file or object from the q2-diversity plugin with the semantic type ``PCoAResults`` or - ``PCoAResults % Properties('biplot')``. + ``PCoAResults % Properties('biplot')``. If you are importing + data from a software tool other than QIIME 2, then you can provide a + :class:`pandas.DataFrame` object in which the row index is sample + names and the first, second, and third columns indicate the first, + second, and third PCoA axes, respectively. metadata : str or qiime2.Metadata, optional Metadata file or object. hue : str, optional @@ -75,6 +79,13 @@ def beta_3d_plot( figsize=(8, 8)) plt.tight_layout() + .. code-block:: text + + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # 6.63% for Axis 3 + .. image:: images/beta_3d_plot-1.png We can control the camera angle with ``elev`` and ``azim``: @@ -96,19 +107,35 @@ def beta_3d_plot( azim=70) plt.tight_layout() + .. code-block:: text + + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # 6.63% for Axis 3 + # Explained proportions computed by QIIME 2: + # 33.94% for Axis 1 + # 25.90% for Axis 2 + # 6.63% for Axis 3 + .. image:: images/beta_3d_plot-2.png """ - if isinstance(artifact, str): - _pcoa_results = Artifact.load(artifact) + if isinstance(artifact, pd.DataFrame): + df = artifact else: - _pcoa_results = artifact - - ordination_results = _pcoa_results.view(OrdinationResults) - - df = ordination_results.samples.iloc[:, :3] - df.columns = ['A1', 'A2', 'A3'] + if isinstance(artifact, str): + _pcoa_results = Artifact.load(artifact) + else: + _pcoa_results = artifact + ordination_results = _pcoa_results.view(OrdinationResults) + df = ordination_results.samples.iloc[:, :3] + props = ordination_results.proportion_explained + print('# Explained proportions computed by QIIME 2:') + print(f'# {props[0]*100:.2f}% for Axis 1') + print(f'# {props[1]*100:.2f}% for Axis 2') + print(f'# {props[2]*100:.2f}% for Axis 3') - props = ordination_results.proportion_explained + df.columns = ['Axis 1', 'Axis 2', 'Axis 3'] if metadata is None: df = df @@ -123,7 +150,7 @@ def beta_3d_plot( ax.view_init(azim=azim, elev=elev) if hue is None: - ax.scatter(df['A1'], df['A2'], df['A3'], s=s) + ax.scatter(df['Axis 1'], df['Axis 2'], df['Axis 3'], s=s) else: if hue_order is None: _hue_order = df[hue].unique() @@ -131,12 +158,11 @@ def beta_3d_plot( _hue_order = hue_order for label in _hue_order: a = df[df[hue] == label] - ax.scatter(a['A1'], a['A2'], a['A3'], label=label, s=s) - - ax.set_xlabel(f'Axis 1 ({props[0]*100:.2f} %)') - ax.set_ylabel(f'Axis 2 ({props[1]*100:.2f} %)') - ax.set_zlabel(f'Axis 3 ({props[2]*100:.2f} %)') + ax.scatter(a['Axis 1'], a['Axis 2'], a['Axis 3'], label=label, s=s) + ax.legend() - ax.legend() + ax.set_xlabel('Axis 1') + ax.set_ylabel('Axis 2') + ax.set_zlabel('Axis 3') return ax diff --git a/dokdo/api/taxa_abundance.py b/dokdo/api/taxa_abundance.py index 43267b9..fcf9241 100644 --- a/dokdo/api/taxa_abundance.py +++ b/dokdo/api/taxa_abundance.py @@ -59,10 +59,10 @@ def _sort_by_mean(df): def _get_others_col(df, count, taxa_names, show_others): """Returns DataFrame object after selecting taxa.""" - if count is not 0 and taxa_names is not None: + if count != 0 and taxa_names is not None: m = "Cannot use 'count' and 'taxa_names' arguments together" raise ValueError(m) - elif count is not 0: + elif count != 0: if count < df.shape[1]: others = df.iloc[:, count-1:].sum(axis=1) df = df.iloc[:, :count-1] @@ -109,7 +109,7 @@ def taxa_abundance_bar_plot( Parameters ---------- - visualization : str, qiime2.Visualization, pandas.DataFrame + visualization : str, qiime2.Visualization, or pandas.DataFrame Visualization file or object from the q2-taxa plugin. Alternatively, a :class:`pandas.DataFrame` object. metadata : str or qiime2.Metadata, optional diff --git a/dokdo/cli/make_manifest.py b/dokdo/cli/make_manifest.py index ad913e5..666635b 100644 --- a/dokdo/cli/make_manifest.py +++ b/dokdo/cli/make_manifest.py @@ -2,33 +2,39 @@ from pathlib import Path def make_manifest(fastq_dir, output_file): - """Create a manifest file (.tsv) from a directory containing FASTQ files. + """ + Create a manifest file (.tsv) from a directory containing FASTQ files. - The file names must include either '_R1_001.fastq' or '_R1_002.fastq'. - The word before the third-to-last underscore will be set as the sample - ID. For example, a file named 'EXAMPLE_S1_R1_001.fastq.gz' will produce - 'EXAMPLE' as sample ID and 'EXAM_PLE_S1_R1_001.fastq.gz', 'EXAM_PLE'. + This command assumes that FASTQ filenames end with a suffix such as + '_S0_R1_001.fastq' or '_S14_R2_001.fastq'. The word before the + third-to-last underscore ('_') will be used as sample ID. For example, a + file named 'EXAMPLE_S1_R1_001.fastq.gz' will set 'EXAMPLE' as sample ID. + Undertermined reads (e.g. 'Undetermined_S0_R1_001.fastq') will not be + included in the output file. Parameters ---------- fastq_dir : str - Path to the directory containing input FASTQ files. + Directory containing input FASTQ files. output_file : str - Path to the output file. + Manifest file. """ - _fastq_dir = Path(fastq_dir).resolve() + fastq_dir = Path(fastq_dir).resolve() files = {} - for r, d, f in os.walk(_fastq_dir): + for r, d, f in os.walk(fastq_dir): for x in f: name = '_'.join(x.split('_')[:-3]) - if "_R1_001.fastq" in x: + if 'Undetermined' in x: + continue + + if '_R1_001.fastq' in x: if name not in files: files[name] = ['', ''] files[name][0] = f"{r}/{x}" - elif "_R2_001.fastq" in x: + elif '_R2_001.fastq' in x: if name not in files: files[name] = ['', ''] files[name][1] = f"{r}/{x}" @@ -36,8 +42,8 @@ def make_manifest(fastq_dir, output_file): pass with open(output_file, 'w') as f: - headers = ["sample-id", "forward-absolute-filepath", - "reverse-absolute-filepath"] + headers = ['sample-id', 'forward-absolute-filepath', + 'reverse-absolute-filepath'] f.write('\t'.join(headers) + '\n') for name in sorted(files): diff --git a/dokdo/version.py b/dokdo/version.py index da77e85..666b2f7 100644 --- a/dokdo/version.py +++ b/dokdo/version.py @@ -1 +1 @@ -__version__ = '1.11.0' +__version__ = '1.12.0'