Skip to content

Commit

Permalink
Add in pre-commit (#517)
Browse files Browse the repository at this point in the history
* Add in pre-commit and run on all files

* Add in bandit code scanning

* Lint

* Run linter
  • Loading branch information
thomasyu888 authored May 17, 2023
1 parent 9535d0a commit 35ddc2e
Show file tree
Hide file tree
Showing 25 changed files with 366 additions and 301 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
"mounts": [
"source=${localEnv:HOME}/.synapseConfig,target=/root/.synapseConfig,type=bind,consistency=cached"
]
}
}
2 changes: 1 addition & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ pip-delete-this-directory.txt
coverage.xml
*,cover
*.log
.git
.git
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @Sage-Bionetworks/genie-reviewers
* @Sage-Bionetworks/genie-reviewers
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,3 @@ jobs:
run: |
python setup.py sdist bdist_wheel
twine upload dist/*
6 changes: 3 additions & 3 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ jobs:
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.

# Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality


# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
Expand All @@ -61,7 +61,7 @@ jobs:
# ℹ️ Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun

# If the Autobuild fails above, remove it and uncomment the following three lines.
# If the Autobuild fails above, remove it and uncomment the following three lines.
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.

# - run: |
Expand Down
63 changes: 63 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
exclude: '^docs/conf.py'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
- id: check-ast
- id: check-json
- id: check-merge-conflict
- id: check-xml
- id: check-yaml
- id: debug-statements
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: mixed-line-ending
args: ['--fix=auto'] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows

- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: 'v0.0.262'
hooks:
- id: ruff

- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/PyCQA/bandit
rev: 1.7.5
hooks:
- id: bandit
args: ["-c", "pyproject.toml"]
additional_dependencies: ["bandit[toml]"]

# - repo: https://github.com/asottile/blacken-docs
# rev: v1.12.0
# hooks:
# - id: blacken-docs
# additional_dependencies: [black]

# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: 'v1.0.1'
# hooks:
# - id: mypy
# additional_dependencies: [pydantic~=1.10]

# Checks for missing docstrings
# - repo: https://github.com/econchick/interrogate
# rev: 1.5.0
# hooks:
# - id: interrogate
# exclude: ^(docs/conf.py|setup.py|tests)
# args: [--config=pyproject.toml]

# finds dead python code
# - repo: https://github.com/jendrikseipp/vulture
# rev: 'v2.7'
# hooks:
# - id: vulture
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# This file currently doesn't do anything because
# R/ and templates/ isn't within the genie/ directory
graft R/
graft templates/
graft templates/
2 changes: 1 addition & 1 deletion R/mergeCheck.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ for (center in centers) {
mafSynId, querySamples),
includeRowIdAndRowVersion = F)
#genieMutTable = synTableQuery(sprintf("SELECT Center,Tumor_Sample_Barcode,Hugo_Symbol,HGVSp_Short,Variant_Classification,Chromosome,Start_Position,Reference_Allele,Tumor_Seq_Allele2,t_depth,t_alt_count,End_Position,Protein_position FROM %s where Tumor_Sample_Barcode in ('%s')", mafSynId, querySamples),includeRowIdAndRowVersion=F)

genieMutData = synapser::as.data.frame(genieMutTable)
flag_variants_to_merge(genieMutData, genieClinData, samplesToRun, upload = TRUE)
#write.csv(rbind(annotated_df[is.na(annotated_df$Flag),],new_rows), "Missing_variant_annotation.csv", row.names=F)
Expand Down
27 changes: 13 additions & 14 deletions R/mergecheck_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ uploadToTable <- function(tbl, databaseSynId, subSetSamples, centerMappingDf) {
if (any(annotated_df$Flag[!samples %in% new_samples] == "TOSS")) {
annotated_df$Flag[!samples %in% new_samples][
annotated_df$Flag[!samples %in% new_samples] == "TOSS"] = "FIXED"
}
}
if (any(annotated_df$Center %in% keepCenters)) {
annotated_df$Flag[annotated_df$Center %in% keepCenters] = "KEEP"
}
synapser::synStore(Table(databaseSynId, annotated_df))
}

#Append any new data
new_rows = nodup_tbl[!new_samples %in% samples,]
#Even when nodup_tbl is empty, it can be subsetted, causing one NA row to be uploaded.
Expand All @@ -61,28 +61,28 @@ uploadToTable <- function(tbl, databaseSynId, subSetSamples, centerMappingDf) {

# Flag variants to merge
flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, upload = TRUE) {

genieMutData <- data.frame( lapply( genieMutData , factor ))
# Create factors for clinical data

SAMPLE_ID = genieClinData$SAMPLE_ID[genieClinData$SAMPLE_ID %in% samplesToRun]
genieClinData = data.frame(SAMPLE_ID)
genieClinData <- data.frame( lapply( genieClinData , factor ))
# all inclusive list of samples should be from the clinical data file
# therefore factor levels for Tumor_Sample_Barcode in the MAF should be set to that of SAMPLE_ID of clinical data table
# therefore factor levels for Tumor_Sample_Barcode in the MAF should be set to that of SAMPLE_ID of clinical data table
# check that no samples are listed in the MAF that are not listed in the clinical data file
# reversing the order of the inputs would tell you which samples are submitted that have no entries (no mutations) in the MAF
if (length(setdiff(levels(genieMutData$Tumor_Sample_Barcode),
levels(genieClinData$SAMPLE_ID))) == 0) {
genieMutData$Tumor_Sample_Barcode = factor(genieMutData$Tumor_Sample_Barcode,
levels = levels(genieClinData$SAMPLE_ID))
}

# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
# genieMutData$t_depth <- as.numeric(genieMutData$t_depth) DONT DO THIS LINE...
genieMutData$t_depth <- as.numeric(levels(genieMutData$t_depth))[genieMutData$t_depth]
noVAF.idx = which((genieMutData$t_depth == 0) | is.na(genieMutData$t_depth))
genieMutData$t_alt_count_num =
genieMutData$t_alt_count_num =
as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
#if (length(noVAF.idx) > 0) {
genieMutData$t_alt_count_num[noVAF.idx] = 1
Expand All @@ -93,7 +93,7 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
genieMutData$Tumor_Seq_Allele2 <- as.character(genieMutData$Tumor_Seq_Allele2)
#invalid class “VRanges” object: if 'alt' is 'NA', then 'altDepth' should be 'NA'
genieMutData$t_alt_count_num[is.na(genieMutData$Tumor_Seq_Allele2)] <- NA

# get VRanges for all variants called in the MAF
mafVR = VRanges(seqnames = Rle(paste0("chr",genieMutData$Chromosome)),
ranges = IRanges(start = genieMutData$Start_Position,
Expand All @@ -104,31 +104,31 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
totalDepth = genieMutData$t_depth,
sampleNames = genieMutData$Tumor_Sample_Barcode)
seqlevels(mafVR) = sort(seqlevels(mafVR))

# precompute
vaf = altDepth(mafVR)/totalDepth(mafVR)
ord = order(mafVR)

# start with empty table
tbl = genieMutData[1, c("Center","Tumor_Sample_Barcode","Hugo_Symbol",
"HGVSp_Short","Variant_Classification","Chromosome",
"Start_Position","Reference_Allele","Tumor_Seq_Allele2",
"t_alt_count_num","t_depth")]
tbl = tbl[-1,]

# check for potential variants that may need to be evaluated for merge (cis/trans)
genieMutData$Tumor_Sample_Barcode <- as.character(genieMutData$Tumor_Sample_Barcode)
#genieClinData$SAMPLE_ID <- as.character(genieClinData$SAMPLE_ID)
t = Sys.time()
samplesRun = c()
for (i in 1:length(samplesToRun)) {

# get sample indices (in order from pre sort above)
idx = ord[which(genieMutData$Tumor_Sample_Barcode[ord] == samplesToRun[i])]
samplesRun = c(samplesRun, samplesToRun[i])
# get length of idx
l = length(idx)

# if sample has more than one variant
if (l > 1) {
# get differences in BPs of variant sites
Expand Down Expand Up @@ -173,4 +173,3 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
}
}
}

5 changes: 2 additions & 3 deletions R/test_flag_variants.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ library(testthat)
library(VariantAnnotation)
genieMutData = matrix(nrow = 2, ncol = 13)
colnames(genieMutData) = c("Chromosome", "Hugo_Symbol", "Start_Position", "End_Position", "Reference_Allele",
"Tumor_Seq_Allele2", "t_depth", 't_alt_count', "Tumor_Sample_Barcode",
"Tumor_Seq_Allele2", "t_depth", 't_alt_count', "Tumor_Sample_Barcode",
"Protein_position", "HGVSp_Short", "Variant_Classification", "Center")


Expand Down Expand Up @@ -58,7 +58,7 @@ test_that("Mutations are not flagged", {
expect_equal(colnames(tbl), c("Center", "Tumor_Sample_Barcode", "Hugo_Symbol", "HGVSp_Short",
"Variant_Classification", "Chromosome", "Start_Position",
"Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count_num", "t_depth"))

})

genieMutData$Start_Position = c("1", "10")
Expand Down Expand Up @@ -93,4 +93,3 @@ test_that("Mutations not flagged, different starts and ends", {
tbl <- data.frame( lapply( tbl , factor ))
expect_equal(tbl, expected[,colnames(tbl)])
})

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

## Introduction

This repository documents code used to gather, QC, standardize, and analyze data uploaded by institutes participating in AACR's Project GENIE (Genomics, Evidence, Neoplasia, Information, Exchange).
This repository documents code used to gather, QC, standardize, and analyze data uploaded by institutes participating in AACR's Project GENIE (Genomics, Evidence, Neoplasia, Information, Exchange).

## Dependencies

Expand Down
1 change: 0 additions & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Import logging last to not take in synapseclient logging
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down
3 changes: 0 additions & 3 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,9 +1488,6 @@ def stagingToCbio(
variant_filtering_synId = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "mutationsInCis"
][0]
fusionSynId = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "fusions"
][0]
sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][
0
]
Expand Down
2 changes: 1 addition & 1 deletion genie/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
package"""

import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype
from pandas.api.types import is_float_dtype


def _col_name_to_titlecase(string: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion genie_registry/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _validate(self, segDF):
"Seg: Only integars allowed in these column(s): %s.\n"
% ", ".join(sorted(nonInts))
)
if not segDF["SEG.MEAN"].dtype in [float, int]:
if segDF["SEG.MEAN"].dtype not in [float, int]:
total_error += "Seg: Only numerical values allowed in SEG.MEAN.\n"

error, warn = validate._validate_chromosome(
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,10 @@
line-length = 88
target-version = ['py37']
include = '\.pyi?$'

[tool.ruff]
extend-ignore = ["E501"]

[tool.bandit]
exclude_dirs = ["tests"]
skips = ["B101", "B608", "B404", "B603", "B602", "B607"]
Loading

0 comments on commit 35ddc2e

Please sign in to comment.