diff --git a/README.md b/README.md index 9066f90..71433f7 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ extra_config= snakemake -s ${snakefile} --configfile ${config} ${extra_config} --cores 5 ``` -# Outputs +# :hatching_chick: Outputs ## Output folder structure @@ -260,7 +260,7 @@ out/ └── reference2 ``` -## Output files + # :gear: Installation and Dependencies @@ -364,9 +364,9 @@ pip install numpy pandas scHPL sklearn anndata matplotlib scanpy datetime tensor - Tangram ``` -# :floppy_disk: Resources + -Add table with resource usage for different sice references and queries + # :woman_mechanic: Adding new tools @@ -433,6 +433,7 @@ snakemake -s ${snakefile} --configfile ${config} --report ${report} ## scClassify Documentation written by: Bhavyaa Chandarana + Date written: 2023-07 scClassify workflow was generated using the tutorial below: @@ -451,6 +452,7 @@ https://www.bioconductor.org/packages/release/bioc/vignettes/scClassify/inst/doc ## scPred Documentation written by: Alva Annett + Date written: 2023-07 Normalization and parameters based on this tutorial: @@ -476,6 +478,7 @@ http://www.bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/Singl ## singleCellNet Documentation written by: Rodrigo Lopez Gutierrez + Date written: 2023-08-01 singleCellNet workflow was generated following the tutorial below: @@ -490,6 +493,7 @@ Normal parameters were used in both the training and prediction functions, with ## Correlation Documentation written by: Rodrigo Lopez Gutierrez + Date written: 2023-08-02 The Correlation tool runs a correlation-based cell type prediction on a sample of interest, given the mean gene expression per label for a reference. @@ -507,6 +511,7 @@ Currently only outputting a table with each cell, the most highly correlated lab ## scLearn Documentation written by: Bhavyaa Chandarana, updated by Tomas Vega Waichman + Date written: 2023-08-04 scLearn workflow was generated using the following tutorial: https://github.com/bm2-lab/scLearn#single-label-single-cell-assignment @@ -524,6 +529,7 @@ scLearn workflow was generated using the following tutorial: https://github.com/ ## singleCellNet Documentation written by: Rodrigo Lopez Gutierrez + Date written: 2023-08-01 singleCellNet workflow was generated following the tutorial below: @@ -538,6 +544,7 @@ Normal parameters were used in both the training and prediction functions, with ## ACTINN Documentation written by: Alva Annett + Date written: 2023-08-08 ACTINN code is based on `actinn_format.py` and `actinn_predict.py` originally found here: https://github.com/mafeiyang/ACTINN @@ -549,6 +556,7 @@ ACTINN code is based on `actinn_format.py` and `actinn_predict.py` originally fo ## Tangram Documentation written by: Tomas Vega Waichman + Date written: 2023-08-08 The Tangram workflow was generated following the tutorial provided below: @@ -567,6 +575,7 @@ It is necessary to explore whether parallelization is possible. ## scAnnotate Documentation written by: Tomas Vega Waichman + Date written: 2023-08-11 The scAnnotate workflow was generated following the tutorial provided below: @@ -581,6 +590,7 @@ https://cran.r-project.org/web/packages/scAnnotate/vignettes/Introduction.html ## scID Documentation written by: Tomas Vega Waichman + Date written: 2023-08-12 The scID workflow was generated following the tutorials provided below: @@ -603,6 +613,7 @@ R CMD INSTALL MAST_1.26.0.tar.gz ## scNym Documentation written by: Tomas Vega Waichman + Date written: 2023-08-14 The scNym workflow was generated following the tutorial provided below: @@ -622,12 +633,15 @@ confidence scores." ## CellTypist Documentation written by: Tomas Vega Waichman + Date written: 2023-08-16 The CellTypist workflow was generated following the tutorials provided below: + Training: * https://celltypist.readthedocs.io/en/latest/celltypist.train.html * https://github.com/Teichlab/celltypist#supplemental-guidance-generate-a-custom-model + Predicting: * https://celltypist.readthedocs.io/en/latest/notebook/celltypist_tutorial_ml.html diff --git a/Scripts/preprocess.R b/Scripts/preprocess.R index 8adbe43..00b7fb5 100644 --- a/Scripts/preprocess.R +++ b/Scripts/preprocess.R @@ -16,13 +16,17 @@ l[['ref']] = data.table::fread(ref_path, header = T) %>% column_to_rownames('V1' # read query for(p in query_paths){ + print(p) tmp = data.table::fread(p, header = T) %>% column_to_rownames('V1') query = basename(dirname(p)) + print(query) l[[query]] = tmp } # if specified by user, convert reference gene names from mouse to human if(convert_genes){ + + message('@ CONVERTING GENE NAMES') # include functions and libraries for conversion library(Orthology.eg.db) @@ -70,18 +74,20 @@ genes = lapply(l, function(x){(colnames(x))}) # reduce set of genes to the intersect common_genes = Reduce(intersect,genes) +print(paste0('@Found ', length(common_genes), ' in common')) # throw error if number of common genes below % threshold of genes in any of provided datasets (ref or query) threshold = 0.5 -if(any(length(common_genes) < threshold*length(genes))){ - frac = lapply(genes, function(x){length(common_genes)/length(x)}) +frac = lapply(genes, function(x){length(common_genes)/length(x)}) + +if(any(frac < threshold)){ names(frac) = names(l) print(frac) stop(paste0("@ In at least one provided dataset (ref or query), less than ",threshold*100,"% of genes appear in common gene set. See above for the fraction of genes from each dataset appearing in common gene set (note: samples with few genes will have higher fractions)")) } # save common genes -data.table::fwrite(data.frame('common_genes' = genes), paste0(out, '/model/', reference_name, '/common_genes.csv')) +data.table::fwrite(data.frame('common_genes' = common_genes), file = paste0(out, '/model/', reference_name, '/common_genes.csv')) # filter each data set for common genes l = lapply(l, function(x){x[,common_genes]}) @@ -105,5 +111,3 @@ lab = data.table::fread(lab_path, header = T) %>% column_to_rownames('V1') lab = data.frame(label = unique(lab$label)) data.table::fwrite(lab, file = paste0(out, '/model/', reference_name, '/labels.csv'), sep = ',') - -