Add in pre-commit (#517)

* Add in pre-commit and run on all files
* Add in bandit code scanning
* Lint
* Run linter
Sage-Bionetworks · May 17, 2023 · 35ddc2e · 35ddc2e
1 parent 9535d0a
commit 35ddc2e
Show file tree

Hide file tree

Showing 25 changed files with 366 additions and 301 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -3,4 +3,4 @@
     "mounts": [
         "source=${localEnv:HOME}/.synapseConfig,target=/root/.synapseConfig,type=bind,consistency=cached"
     ]
-}
+}
diff --git a/.dockerignore b/.dockerignore
@@ -20,4 +20,4 @@ pip-delete-this-directory.txt
 coverage.xml
 *,cover
 *.log
-.git
+.git
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -1 +1 @@
-* @Sage-Bionetworks/genie-reviewers
+* @Sage-Bionetworks/genie-reviewers
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -78,4 +78,3 @@ jobs:
       run: |
         python setup.py sdist bdist_wheel
         twine upload dist/*
-
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -48,11 +48,11 @@ jobs:
         # If you wish to specify custom queries, you can do so here or in a config file.
         # By default, queries listed here will override any specified in a config file.
         # Prefix the list here with "+" to use these queries and those in the config file.
-        
+
         # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
         # queries: security-extended,security-and-quality
 
-        
+
     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
@@ -61,7 +61,7 @@ jobs:
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
 
-    #   If the Autobuild fails above, remove it and uncomment the following three lines. 
+    #   If the Autobuild fails above, remove it and uncomment the following three lines.
     #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
 
     # - run: |

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,63 @@
+exclude: '^docs/conf.py'
+
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  - id: trailing-whitespace
+  - id: check-added-large-files
+  - id: check-ast
+  - id: check-json
+  - id: check-merge-conflict
+  - id: check-xml
+  - id: check-yaml
+  - id: debug-statements
+  - id: end-of-file-fixer
+  - id: requirements-txt-fixer
+  - id: mixed-line-ending
+    args: ['--fix=auto']  # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows
+
+- repo: https://github.com/charliermarsh/ruff-pre-commit
+  # Ruff version.
+  rev: 'v0.0.262'
+  hooks:
+    - id: ruff
+
+- repo: https://github.com/psf/black
+  rev: 23.3.0
+  hooks:
+  - id: black
+    language_version: python3
+
+- repo: https://github.com/PyCQA/bandit
+  rev: 1.7.5
+  hooks:
+  - id: bandit
+    args: ["-c", "pyproject.toml"]
+    additional_dependencies: ["bandit[toml]"]
+
+# - repo: https://github.com/asottile/blacken-docs
+#   rev: v1.12.0
+#   hooks:
+#   - id: blacken-docs
+#     additional_dependencies: [black]
+
+# - repo: https://github.com/pre-commit/mirrors-mypy
+#   rev: 'v1.0.1'
+#   hooks:
+#   - id: mypy
+#     additional_dependencies: [pydantic~=1.10]
+
+# Checks for missing docstrings
+# - repo: https://github.com/econchick/interrogate
+#   rev: 1.5.0
+#   hooks:
+#   - id: interrogate
+#     exclude: ^(docs/conf.py|setup.py|tests)
+#     args: [--config=pyproject.toml]
+
+# finds dead python code
+# - repo: https://github.com/jendrikseipp/vulture
+#   rev: 'v2.7'
+#   hooks:
+#     - id: vulture
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,4 @@
 # This file currently doesn't do anything because
 # R/ and templates/ isn't within the genie/ directory
 graft R/
-graft templates/
+graft templates/
diff --git a/R/mergeCheck.R b/R/mergeCheck.R
@@ -99,7 +99,7 @@ for (center in centers) {
                                           mafSynId, querySamples),
                                   includeRowIdAndRowVersion = F)
     #genieMutTable = synTableQuery(sprintf("SELECT Center,Tumor_Sample_Barcode,Hugo_Symbol,HGVSp_Short,Variant_Classification,Chromosome,Start_Position,Reference_Allele,Tumor_Seq_Allele2,t_depth,t_alt_count,End_Position,Protein_position FROM %s where Tumor_Sample_Barcode in ('%s')", mafSynId, querySamples),includeRowIdAndRowVersion=F)
-    
+
     genieMutData = synapser::as.data.frame(genieMutTable)
     flag_variants_to_merge(genieMutData, genieClinData, samplesToRun, upload = TRUE)
     #write.csv(rbind(annotated_df[is.na(annotated_df$Flag),],new_rows), "Missing_variant_annotation.csv", row.names=F)

diff --git a/R/mergecheck_functions.R b/R/mergecheck_functions.R
@@ -35,13 +35,13 @@ uploadToTable <- function(tbl, databaseSynId, subSetSamples, centerMappingDf) {
     if (any(annotated_df$Flag[!samples %in% new_samples] == "TOSS")) {
       annotated_df$Flag[!samples %in% new_samples][
         annotated_df$Flag[!samples %in% new_samples] == "TOSS"] = "FIXED"
-    } 
+    }
     if (any(annotated_df$Center %in% keepCenters)) {
       annotated_df$Flag[annotated_df$Center %in% keepCenters] = "KEEP"
     }
     synapser::synStore(Table(databaseSynId, annotated_df))
   }
-  
+
   #Append any new data
   new_rows = nodup_tbl[!new_samples %in% samples,]
   #Even when nodup_tbl is empty, it can be subsetted, causing one NA row to be uploaded.
@@ -61,28 +61,28 @@ uploadToTable <- function(tbl, databaseSynId, subSetSamples, centerMappingDf) {
 
 # Flag variants to merge
 flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, upload = TRUE) {
-  
+
   genieMutData <- data.frame( lapply( genieMutData , factor ))
   # Create factors for clinical data
-  
+
   SAMPLE_ID = genieClinData$SAMPLE_ID[genieClinData$SAMPLE_ID %in% samplesToRun]
   genieClinData = data.frame(SAMPLE_ID)
   genieClinData <- data.frame( lapply( genieClinData , factor ))
   # all inclusive list of samples should be from the clinical data file
-  # therefore factor levels for Tumor_Sample_Barcode in the MAF should be set to that of SAMPLE_ID of clinical data table 
+  # therefore factor levels for Tumor_Sample_Barcode in the MAF should be set to that of SAMPLE_ID of clinical data table
   # check that no samples are listed in the MAF that are not listed in the clinical data file
   # reversing the order of the inputs would tell you which samples are submitted that have no entries (no mutations) in the MAF
   if (length(setdiff(levels(genieMutData$Tumor_Sample_Barcode),
                      levels(genieClinData$SAMPLE_ID))) == 0) {
     genieMutData$Tumor_Sample_Barcode = factor(genieMutData$Tumor_Sample_Barcode,
                                                levels = levels(genieClinData$SAMPLE_ID))
   }
-  
+
   # records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
   # genieMutData$t_depth <-  as.numeric(genieMutData$t_depth) DONT DO THIS LINE...
   genieMutData$t_depth <-  as.numeric(levels(genieMutData$t_depth))[genieMutData$t_depth]
   noVAF.idx = which((genieMutData$t_depth == 0) | is.na(genieMutData$t_depth))
-  genieMutData$t_alt_count_num = 
+  genieMutData$t_alt_count_num =
     as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
   #if (length(noVAF.idx) > 0) {
   genieMutData$t_alt_count_num[noVAF.idx] = 1
@@ -93,7 +93,7 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
   genieMutData$Tumor_Seq_Allele2 <- as.character(genieMutData$Tumor_Seq_Allele2)
   #invalid class “VRanges” object: if 'alt' is 'NA', then 'altDepth' should be 'NA'
   genieMutData$t_alt_count_num[is.na(genieMutData$Tumor_Seq_Allele2)] <- NA
-  
+
   # get VRanges for all variants called in the MAF
   mafVR = VRanges(seqnames = Rle(paste0("chr",genieMutData$Chromosome)),
                   ranges = IRanges(start = genieMutData$Start_Position,
@@ -104,31 +104,31 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
                   totalDepth = genieMutData$t_depth,
                   sampleNames = genieMutData$Tumor_Sample_Barcode)
   seqlevels(mafVR) = sort(seqlevels(mafVR))
-  
+
   # precompute
   vaf = altDepth(mafVR)/totalDepth(mafVR)
   ord = order(mafVR)
-  
+
   # start with empty table
   tbl = genieMutData[1, c("Center","Tumor_Sample_Barcode","Hugo_Symbol",
                           "HGVSp_Short","Variant_Classification","Chromosome",
                           "Start_Position","Reference_Allele","Tumor_Seq_Allele2",
                           "t_alt_count_num","t_depth")]
   tbl = tbl[-1,]
-  
+
   # check for potential variants that may need to be evaluated for merge (cis/trans)
   genieMutData$Tumor_Sample_Barcode <- as.character(genieMutData$Tumor_Sample_Barcode)
   #genieClinData$SAMPLE_ID <- as.character(genieClinData$SAMPLE_ID)
   t = Sys.time()
   samplesRun = c()
   for (i in 1:length(samplesToRun)) {
-    
+
     # get sample indices (in order from pre sort above)
     idx = ord[which(genieMutData$Tumor_Sample_Barcode[ord] == samplesToRun[i])]
     samplesRun = c(samplesRun, samplesToRun[i])
     # get length of idx
     l = length(idx)
-    
+
     # if sample has more than one variant
     if (l > 1) {
       # get differences in BPs of variant sites
@@ -173,4 +173,3 @@ flag_variants_to_merge <- function(genieMutData, genieClinData, samplesToRun, up
     }
   }
 }
-
diff --git a/R/test_flag_variants.R b/R/test_flag_variants.R
@@ -16,7 +16,7 @@ library(testthat)
 library(VariantAnnotation)
 genieMutData = matrix(nrow = 2, ncol = 13)
 colnames(genieMutData) = c("Chromosome", "Hugo_Symbol", "Start_Position", "End_Position", "Reference_Allele",
-                           "Tumor_Seq_Allele2", "t_depth", 't_alt_count', "Tumor_Sample_Barcode", 
+                           "Tumor_Seq_Allele2", "t_depth", 't_alt_count', "Tumor_Sample_Barcode",
                            "Protein_position", "HGVSp_Short", "Variant_Classification", "Center")
 
 
@@ -58,7 +58,7 @@ test_that("Mutations are not flagged", {
   expect_equal(colnames(tbl), c("Center", "Tumor_Sample_Barcode", "Hugo_Symbol", "HGVSp_Short",
                                 "Variant_Classification", "Chromosome", "Start_Position",
                                 "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count_num", "t_depth"))
-  
+
 })
 
 genieMutData$Start_Position = c("1", "10")
@@ -93,4 +93,3 @@ test_that("Mutations not flagged, different starts and ends", {
   tbl <- data.frame( lapply( tbl , factor ))
   expect_equal(tbl, expected[,colnames(tbl)])
 })
-
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 
 ## Introduction
 
-This repository documents code used to gather, QC, standardize, and analyze data uploaded by institutes participating in AACR's Project GENIE (Genomics, Evidence, Neoplasia, Information, Exchange). 
+This repository documents code used to gather, QC, standardize, and analyze data uploaded by institutes participating in AACR's Project GENIE (Genomics, Evidence, Neoplasia, Information, Exchange).
 
 ## Dependencies
 

diff --git a/genie/__init__.py b/genie/__init__.py
@@ -1,6 +1,5 @@
 # Import logging last to not take in synapseclient logging
 import logging
-import os
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -1488,9 +1488,6 @@ def stagingToCbio(
     variant_filtering_synId = databaseSynIdMappingDf["Id"][
         databaseSynIdMappingDf["Database"] == "mutationsInCis"
     ][0]
-    fusionSynId = databaseSynIdMappingDf["Id"][
-        databaseSynIdMappingDf["Database"] == "fusions"
-    ][0]
     sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][
         0
     ]

diff --git a/genie/transform.py b/genie/transform.py
@@ -2,7 +2,7 @@
 package"""
 
 import pandas as pd
-from pandas.api.types import is_integer_dtype, is_float_dtype
+from pandas.api.types import is_float_dtype
 
 
 def _col_name_to_titlecase(string: str) -> str:

diff --git a/genie_registry/seg.py b/genie_registry/seg.py
@@ -71,7 +71,7 @@ def _validate(self, segDF):
                     "Seg: Only integars allowed in these column(s): %s.\n"
                     % ", ".join(sorted(nonInts))
                 )
-            if not segDF["SEG.MEAN"].dtype in [float, int]:
+            if segDF["SEG.MEAN"].dtype not in [float, int]:
                 total_error += "Seg: Only numerical values allowed in SEG.MEAN.\n"
 
             error, warn = validate._validate_chromosome(

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,3 +2,10 @@
 line-length = 88
 target-version = ['py37']
 include = '\.pyi?$'
+
+[tool.ruff]
+extend-ignore = ["E501"]
+
+[tool.bandit]
+exclude_dirs = ["tests"]
+skips = ["B101", "B608", "B404", "B603", "B602", "B607"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,4 +3,4 @@ @@
         "mounts": [
             "source=${localEnv:HOME}/.synapseConfig,target=/root/.synapseConfig,type=bind,consistency=cached"
         ]
-    }
+    }
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		* @Sage-Bionetworks/genie-reviewers
		* @Sage-Bionetworks/genie-reviewers
Original file line number	Diff line number	Diff line change
Expand Up		@@ -78,4 +78,3 @@ jobs:
		run: |
		python setup.py sdist bdist_wheel
		twine upload dist/*