From 64d07e35bce79c022257486ab0b532819a529dd4 Mon Sep 17 00:00:00 2001 From: Josh Goodman Date: Sun, 19 May 2024 22:46:00 -0400 Subject: [PATCH] Add initial Dmel and GH actions --- .github/workflows/generate-blast-conf.yml | 65 +++++++++++++ .pre-commit-config.yaml | 27 ++++++ conf/.gitkeep | 0 poetry.lock | 16 +++- pyproject.toml | 1 + src/blast_db_configuration/__main__.py | 18 ++-- src/blast_db_configuration/db_metadata.py | 109 ++++++++++++++-------- 7 files changed, 191 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/generate-blast-conf.yml create mode 100644 .pre-commit-config.yaml create mode 100644 conf/.gitkeep diff --git a/.github/workflows/generate-blast-conf.yml b/.github/workflows/generate-blast-conf.yml new file mode 100644 index 0000000..4d5a4c5 --- /dev/null +++ b/.github/workflows/generate-blast-conf.yml @@ -0,0 +1,65 @@ +name: Generate FlyBase BLAST Configuration +on: + workflow_dispatch: + inputs: + FB-release: + required: true + dmel-annot-release: + required: true + +jobs: + generate: + runs-on: ubuntu-latest + steps: + - name: Checkout repository code + uses: actions/checkout@v4 + + - name: Set up python + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + #---------------------------------------------- + # load cached venv if cache exists + #---------------------------------------------- + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + #---------------------------------------------- + # install dependencies if cache does not exist + #---------------------------------------------- + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + #---------------------------------------------- + # install your root project, if required + #---------------------------------------------- + - name: Install project + run: poetry install --no-interaction + + - name: Generate configuration + run: poetry run python -m blast_db_configuration --release ${{ github.event.inputs.FB-release }} --dmel-annot-release ${{ github.event.inputs.dmel-annot-release }} + + - name: Create PR + uses: peter-evans/create-pull-request@v6 + with: + commit-message: ${{ github.event.inputs.FB-release }} BLAST config + branch: ${{ github.event.inputs.FB-release }}-blast-config + delete-branch: true + title: '[Update] ${{ github.event.inputs.FB-release }} BLAST config' + body: | + ${{ github.event.inputs.FB-release }} BLAST config + draft: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4295f95 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# .pre-commit-config.yaml +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-case-conflict + - id: check-merge-conflict + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-json + - id: check-added-large-files + - repo: local + hooks: + - id: black + name: black + entry: poetry run black + language: system + types: [file, python] + - id: isort + name: isort + entry: poetry run isort + language: system + types: [file, python] diff --git a/conf/.gitkeep b/conf/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/poetry.lock b/poetry.lock index b5c95e2..531c628 100644 --- a/poetry.lock +++ b/poetry.lock @@ -161,6 +161,20 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isort" +version = "5.13.2" +description = "A Python utility / library to sort Python imports." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, + {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, +] + +[package.extras] +colors = ["colorama (>=0.4.6)"] + [[package]] name = "jsonschema" version = "4.22.0" @@ -698,4 +712,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "7814e399799259bbc4a89f05b298bc76924a9cb9e0e50f4bbf30b7214c4e685c" +content-hash = "d7ae5f8fdc52b11033a9f90c3e3817492aee67bf39467ddda1c2efafffcc1dc8" diff --git a/pyproject.toml b/pyproject.toml index a52290d..0cc1a31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ tqdm = "^4.66.4" [tool.poetry.group.dev.dependencies] black = "^24.4.2" +isort = "^5.13.2" [build-system] requires = ["poetry-core"] diff --git a/src/blast_db_configuration/__main__.py b/src/blast_db_configuration/__main__.py index 6be0ae7..8362af8 100644 --- a/src/blast_db_configuration/__main__.py +++ b/src/blast_db_configuration/__main__.py @@ -1,15 +1,16 @@ +import json +import logging from dataclasses import dataclass -from typing_extensions import Annotated from datetime import datetime from pathlib import Path -import json -import logging + +import agr_blast_service_configuration.schemas.metadata as agrdb import typer -from tqdm import tqdm from Bio import Entrez -import agr_blast_service_configuration.schemas.metadata as agrdb +from tqdm import tqdm +from typing_extensions import Annotated -from .db_metadata import create_metadata_from_ncbi +from .db_metadata import create_dmel_metadata, create_metadata_from_ncbi app = typer.Typer() @@ -44,6 +45,9 @@ class DefaultBlastDbConfiguration: @app.command() def generate_config( release: Annotated[str, typer.Option(help="The FlyBase release version")], + dmel_annot_release: Annotated[ + str, typer.Option(help="The Dmel annotation release version e.g. r6.57") + ], contact: Annotated[ type(DEFAULT_CONFIG.contact), typer.Option(help="Email of the FlyBase technical contact."), @@ -99,7 +103,7 @@ def generate_config( unit="organism", ): if genus == "Drosophila" and species == "melanogaster": - pass + all_dbs.extend(create_dmel_metadata(dmel_annot_release)) else: all_dbs.extend(create_metadata_from_ncbi(genus, species, ncbi_email)) diff --git a/src/blast_db_configuration/db_metadata.py b/src/blast_db_configuration/db_metadata.py index e203cec..9a2953b 100755 --- a/src/blast_db_configuration/db_metadata.py +++ b/src/blast_db_configuration/db_metadata.py @@ -1,9 +1,11 @@ import logging +import urllib.request +from typing import Optional import agr_blast_service_configuration.schemas.metadata as blast_metadata_schema -from .ncbi import taxonomy as tax from .ncbi import genomes as genomes +from .ncbi import taxonomy as tax logger = logging.getLogger(__name__) @@ -72,39 +74,72 @@ def create_metadata_from_ncbi( return dbs -def create_dmel_metadata(): - pass - # dbs.extend( - # [ - # blast_metadata.BlastDBMetaData( - # version=options.dmel_annot, - # URI=f"ftp://ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r{options.dmel_annot}_{options.release}/fasta/dmel-all-chromosome-r{options.dmel_annot}.fasta.gz", - # md5sum="b7bc17acfd655914c68326df8599a9ca", # TODO - Hard coded for now, need to fetch this from the MD5SUM file - # genus="Drosophila", - # species="melanogaster", - # blast_title=f"D. melanogaster Genome Assembly ({options.dmel_annot})", - # description="Drosophila melanogaster genome assembly", - # taxon_id="NCBITaxon:7227", - # seqtype="nucl", - # ), - # blast_metadata.BlastDBMetaData( - # version=options.dmel_annot, - # URI=f"ftp://ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r{options.dmel_annot}_{options.release}/fasta/dmel-all-translation-r{options.dmel_annot}.fasta.gz", - # # TODO - Hard coded for now, need to fetch this from the MD5SUM file - # md5sum="e3f959ab0e1026de56e1bd00490450e5", - # genus="Drosophila", - # species="melanogaster", - # blast_title=f"D. melanogaster Protein Sequences ({options.dmel_annot})", - # description="Drosophila melanogaster protein sequences", - # taxon_id="NCBITaxon:7227", - # seqtype="prot", - # ), - # ] - # ) - # flybase_blast_metadata = blast_metadata.AGRBlastDatabases( - # metaData=blast_metadata.AGRBlastMetadata( - # contact=options.email, dataProvider="FlyBase", release=options.release - # ), - # data=dbs, - # ) - # print(flybase_blast_metadata.json()) +def create_dmel_metadata( + dmel_annot_release: str, +) -> list[blast_metadata_schema.SequenceMetadata]: + """ + Generate a list of BLAST DB metadata schemas based on Dmel annot release. + + :param dmel_annot_release: The Dmel annot release + :return: List of BLAST DB metadata schemas + """ + dmel_dbs = [ + { + "uri": "https://ftp.flybase.org/blast/dmel-assembly.fasta.gz", + "description": f"D. melanogaster Genome Assembly {dmel_annot_release}", + "seqtype": blast_metadata_schema.BlastDBType.NUCL, + "md5_sum": None, + }, + { + "uri": "https://ftp.flybase.org/blast/dmel-intergenic.fasta.gz", + "description": f"D. melanogaster Intergenic Regions {dmel_annot_release}", + "seqtype": blast_metadata_schema.BlastDBType.NUCL, + "md5_sum": None, + }, + { + "uri": "https://ftp.flybase.org/blast/dmel-transcript.fasta.gz", + "description": f"D. melanogaster Transcripts {dmel_annot_release}", + "seqtype": blast_metadata_schema.BlastDBType.NUCL, + "md5_sum": None, + }, + { + "uri": "https://ftp.flybase.org/blast/dmel-translation.fasta.gz", + "description": f"D. melanogaster Proteins {dmel_annot_release}", + "seqtype": blast_metadata_schema.BlastDBType.PROT, + "md5_sum": None, + }, + { + "uri": "https://ftp.flybase.org/blast/dmel-transposon.fasta.gz", + "description": f"D. melanogaster Transposons {dmel_annot_release}", + "seqtype": blast_metadata_schema.BlastDBType.NUCL, + "md5_sum": None, + }, + ] + # TODO: read in checksums and assign them to the appropriate DB. + return [ + blast_metadata_schema.SequenceMetadata( + version=dmel_annot_release, + uri=db.get("uri"), + md5_sum="MD5", + genus="Drosophila", + species="melanogaster", + blast_title=db.get("description"), + description=db.get("description"), + taxon_id="7227", + seqtype=db.get("seqtype"), + ) + for db in dmel_dbs + ] + + +def fetch_dmel_checksums(uri: str) -> Optional[str]: + """ + Get the current Dmel FASTA checksums. + + :param uri: The URI of the checksum file + :return: The text content of the checksum file + """ + with urllib.request.urlopen(uri) as response: + md5_checksums = response.read().decode("utf-8") + return md5_checksums + return None