Merge pull request #65 from fungenomics/dev

Dev
fungenomics · Sep 6, 2023 · c03e1ce
2 parents e03cd3a + 5e20b5e
commit c03e1ce
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -239,7 +239,7 @@ extra_config=<path to your new default config file>
 snakemake -s ${snakefile} --configfile ${config} ${extra_config} --cores 5
 ```
 
-#  Outputs 
+# :hatching_chick: Outputs 
 
 ## Output folder structure 
 
@@ -260,7 +260,7 @@ out/
     └── reference2
 ```
 
-## Output files 
+<!--- ## Output files --->
 
 
 # :gear: Installation and Dependencies
@@ -364,9 +364,9 @@ pip install numpy pandas scHPL sklearn anndata matplotlib scanpy datetime tensor
 - Tangram
 ```
 
-# :floppy_disk: Resources  
+<!--- # :floppy_disk: Resources  --->
 
-Add table with resource usage for different sice references and queries 
+<!--- Add table with resource usage for different size references and queries --->
 
 # :woman_mechanic: Adding new tools
 
@@ -433,6 +433,7 @@ snakemake -s ${snakefile} --configfile ${config} --report ${report}
 ## scClassify
 
 Documentation written by: Bhavyaa Chandarana
+
 Date written: 2023-07
 
 scClassify workflow was generated using the tutorial below:
@@ -451,6 +452,7 @@ https://www.bioconductor.org/packages/release/bioc/vignettes/scClassify/inst/doc
 ## scPred
 
 Documentation written by: Alva Annett    
+
 Date written: 2023-07   
 
 Normalization and parameters based on this tutorial:   
@@ -476,6 +478,7 @@ http://www.bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/Singl
 ## singleCellNet
 
 Documentation written by: Rodrigo Lopez Gutierrez
+
 Date written: 2023-08-01
 
 singleCellNet workflow was generated following the tutorial below:
@@ -490,6 +493,7 @@ Normal parameters were used in both the training and prediction functions, with
 ## Correlation
 
 Documentation written by: Rodrigo Lopez Gutierrez   
+
 Date written: 2023-08-02   
 
 The Correlation tool runs a correlation-based cell type prediction on a sample of interest, given the mean gene expression per label for a reference.
@@ -507,6 +511,7 @@ Currently only outputting a table with each cell, the most highly correlated lab
 ## scLearn
 
 Documentation written by: Bhavyaa Chandarana, updated by Tomas Vega Waichman
+
 Date written: 2023-08-04 
 
 scLearn workflow was generated using the following tutorial: https://github.com/bm2-lab/scLearn#single-label-single-cell-assignment
@@ -524,6 +529,7 @@ scLearn workflow was generated using the following tutorial: https://github.com/
 ## singleCellNet
 
 Documentation written by: Rodrigo Lopez Gutierrez   
+
 Date written: 2023-08-01  
 
 singleCellNet workflow was generated following the tutorial below:
@@ -538,6 +544,7 @@ Normal parameters were used in both the training and prediction functions, with
 ## ACTINN
 
 Documentation written by: Alva Annett    
+
 Date written: 2023-08-08    
 
 ACTINN code is based on `actinn_format.py` and `actinn_predict.py` originally found here: https://github.com/mafeiyang/ACTINN
@@ -549,6 +556,7 @@ ACTINN code is based on `actinn_format.py` and `actinn_predict.py` originally fo
 ## Tangram
 
 Documentation written by: Tomas Vega Waichman    
+
 Date written: 2023-08-08     
 
 The Tangram workflow was generated following the tutorial provided below:
@@ -567,6 +575,7 @@ It is necessary to explore whether parallelization is possible.
 ## scAnnotate
 
 Documentation written by: Tomas Vega Waichman    
+
 Date written: 2023-08-11   
 
 The scAnnotate workflow was generated following the tutorial provided below:
@@ -581,6 +590,7 @@ https://cran.r-project.org/web/packages/scAnnotate/vignettes/Introduction.html
 ## scID
 
 Documentation written by: Tomas Vega Waichman    
+
 Date written: 2023-08-12    
 
 The scID workflow was generated following the tutorials provided below:
@@ -603,6 +613,7 @@ R CMD INSTALL MAST_1.26.0.tar.gz
 ## scNym
 
 Documentation written by: Tomas Vega Waichman    
+
 Date written: 2023-08-14 
 
 The scNym workflow was generated following the tutorial provided below:
@@ -622,12 +633,15 @@ confidence scores."
 ## CellTypist
 
 Documentation written by: Tomas Vega Waichman    
+
 Date written: 2023-08-16
 
 The CellTypist workflow was generated following the tutorials provided below:
+
 Training:
 * https://celltypist.readthedocs.io/en/latest/celltypist.train.html
 * https://github.com/Teichlab/celltypist#supplemental-guidance-generate-a-custom-model
+
 Predicting:
 * https://celltypist.readthedocs.io/en/latest/notebook/celltypist_tutorial_ml.html
 

diff --git a/Scripts/preprocess.R b/Scripts/preprocess.R
@@ -16,13 +16,17 @@ l[['ref']] = data.table::fread(ref_path, header = T) %>% column_to_rownames('V1'
 
 # read query 
 for(p in query_paths){
+  print(p)
   tmp = data.table::fread(p, header = T) %>% column_to_rownames('V1')
   query = basename(dirname(p))
+  print(query)
   l[[query]] = tmp
 }
 
 # if specified by user, convert reference gene names from mouse to human
 if(convert_genes){
+
+  message('@ CONVERTING GENE NAMES')
 
   # include functions and libraries for conversion
   library(Orthology.eg.db)
@@ -70,18 +74,20 @@ genes = lapply(l, function(x){(colnames(x))})
 
 # reduce set of genes to the intersection
 common_genes = Reduce(intersect,genes)
+print(paste0('@Found ', length(common_genes), ' in common'))
 
 # throw error if the common genes make up less than the threshold fraction of the genes in any provided dataset (ref or query)
 threshold = 0.5
-if(any(length(common_genes) < threshold*length(genes))){
-  frac = lapply(genes, function(x){length(common_genes)/length(x)})
+frac = lapply(genes, function(x){length(common_genes)/length(x)})
+
+if(any(frac < threshold)){
   names(frac) = names(l)
   print(frac)
   stop(paste0("@ In at least one provided dataset (ref or query), less than ",threshold*100,"% of genes appear in common gene set. See above for the fraction of genes from each dataset appearing in common gene set (note: samples with few genes will have higher fractions)"))
 }
 
 # save common genes 
-data.table::fwrite(data.frame('common_genes' = genes), paste0(out, '/model/', reference_name, '/common_genes.csv'))
+data.table::fwrite(data.frame('common_genes' = common_genes), file = paste0(out, '/model/', reference_name, '/common_genes.csv'))
 
 # filter each data set for common genes
 l = lapply(l, function(x){x[,common_genes]})
@@ -105,5 +111,3 @@ lab = data.table::fread(lab_path, header = T) %>% column_to_rownames('V1')
 lab = data.frame(label = unique(lab$label))
 data.table::fwrite(lab, file = paste0(out, '/model/', reference_name, '/labels.csv'), sep = ',')
 
-
-