diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cf48696 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.Rhistory +.RData +.snakemake/ +.ipynb_checkpoints +config.yml +test/ +Notebooks/annotate_report_cache/ +Notebooks/benchmark_report_cache/ diff --git a/Benchmarking/.snakemake/log/2022-09-08T133151.890209.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T133151.890209.snakemake.log deleted file mode 100644 index 341f44d..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T133151.890209.snakemake.log +++ /dev/null @@ -1,8 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T133151.890209.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). diff --git a/Benchmarking/.snakemake/log/2022-09-08T133238.758946.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T133238.758946.snakemake.log deleted file mode 100644 index 9503b4a..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T133238.758946.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T133238.758946.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141158.777553.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141158.777553.snakemake.log deleted file mode 100644 index 33c32ab..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141158.777553.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141158.777553.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141421.312624.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141421.312624.snakemake.log deleted file mode 100644 index 179ec5f..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141421.312624.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141421.312624.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141503.422813.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141503.422813.snakemake.log deleted file mode 100644 index 8c12be4..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141503.422813.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. 
-Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141503.422813.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141620.098535.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141620.098535.snakemake.log deleted file mode 100644 index a4cc840..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141620.098535.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141620.098535.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141718.523238.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141718.523238.snakemake.log deleted file mode 100644 index 018e55f..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141718.523238.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141718.523238.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T141852.782530.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T141852.782530.snakemake.log deleted file mode 100644 index f3efa1a..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T141852.782530.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T141852.782530.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T142121.740084.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142121.740084.snakemake.log deleted file mode 100644 index 082e9bd..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142121.740084.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142121.740084.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. 
diff --git a/Benchmarking/.snakemake/log/2022-09-08T142322.175430.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142322.175430.snakemake.log deleted file mode 100644 index 10aa868..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142322.175430.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142322.175430.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T142344.721772.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142344.721772.snakemake.log deleted file mode 100644 index 9e2e818..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142344.721772.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142344.721772.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T142356.762010.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142356.762010.snakemake.log deleted file mode 100644 index a4e210d..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142356.762010.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142356.762010.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T142404.703381.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142404.703381.snakemake.log deleted file mode 100644 index feaadf1..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142404.703381.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142404.703381.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T142436.949686.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142436.949686.snakemake.log deleted file mode 100644 index e762d0d..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142436.949686.snakemake.log +++ /dev/null @@ -1,7 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142436.949686.snakemake.log -Executing main workflow. -Creating report... -Downloading resources and rendering HTML. -Report created: report.html. 
diff --git a/Benchmarking/.snakemake/log/2022-09-08T142557.810546.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T142557.810546.snakemake.log deleted file mode 100644 index 337bbf2..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T142557.810546.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T142557.810546.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T143045.014696.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T143045.014696.snakemake.log deleted file mode 100644 index 9d4439a..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T143045.014696.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T143045.014696.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T143846.878355.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T143846.878355.snakemake.log deleted file mode 100644 index 659a1e8..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T143846.878355.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T143846.878355.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T143915.339453.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T143915.339453.snakemake.log deleted file mode 100644 index bf531d7..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T143915.339453.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T143915.339453.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144001.049537.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144001.049537.snakemake.log deleted file mode 100644 index 736f709..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144001.049537.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144001.049537.snakemake.log -Executing main workflow. 
-Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144242.060011.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144242.060011.snakemake.log deleted file mode 100644 index ec8079d..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144242.060011.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144242.060011.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144432.308860.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144432.308860.snakemake.log deleted file mode 100644 index 41a75b5..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144432.308860.snakemake.log +++ /dev/null @@ -1,16 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144432.308860.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144609.966021.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144609.966021.snakemake.log deleted file mode 100644 index bfa5547..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144609.966021.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144609.966021.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144635.458546.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144635.458546.snakemake.log deleted file mode 100644 index fb5b709..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144635.458546.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144635.458546.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). 
-Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144749.295035.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144749.295035.snakemake.log deleted file mode 100644 index 97dba95..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144749.295035.snakemake.log +++ /dev/null @@ -1,10 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144749.295035.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144812.039084.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144812.039084.snakemake.log deleted file mode 100644 index ada97f7..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144812.039084.snakemake.log +++ /dev/null @@ -1,16 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144812.039084.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T144829.502825.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T144829.502825.snakemake.log deleted file mode 100644 index c97d324..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T144829.502825.snakemake.log +++ /dev/null @@ -1,16 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T144829.502825.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding UMAP_embeddings.csv (0.18 MB). -Adding UMAP_embeddings.csv (0.27 MB). -Adding UMAP_embeddings.csv (0.53 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T145000.697165.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T145000.697165.snakemake.log deleted file mode 100644 index a96b824..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T145000.697165.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T145000.697165.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. 
diff --git a/Benchmarking/.snakemake/log/2022-09-08T145117.724678.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T145117.724678.snakemake.log deleted file mode 100644 index 38d74e7..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T145117.724678.snakemake.log +++ /dev/null @@ -1,14 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T145117.724678.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -WorkflowError in line 97 of /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/snakefile: -Failed to resolve wildcards. -AttributeError: 'Wildcards' object has no attribute 'sample' - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 651, in auto_report - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 632, in register_file - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 213, in __init__ diff --git a/Benchmarking/.snakemake/log/2022-09-08T145147.458529.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T145147.458529.snakemake.log deleted file mode 100644 index ee28b62..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T145147.458529.snakemake.log +++ /dev/null @@ -1,14 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T145147.458529.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -WorkflowError in line 97 of /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/snakefile: -Failed to resolve wildcards. -AttributeError: 'Wildcards' object has no attribute 'samples' - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 651, in auto_report - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 632, in register_file - File "/project/kleinman/modules/software/miniconda/3.8/envs/snakemake/lib/python3.9/site-packages/snakemake/report/__init__.py", line 213, in __init__ diff --git a/Benchmarking/.snakemake/log/2022-09-08T145333.876334.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T145333.876334.snakemake.log deleted file mode 100644 index 9d50714..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T145333.876334.snakemake.log +++ /dev/null @@ -1,8 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T145333.876334.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). 
diff --git a/Benchmarking/.snakemake/log/2022-09-08T145358.279667.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T145358.279667.snakemake.log deleted file mode 100644 index 4fe8b70..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T145358.279667.snakemake.log +++ /dev/null @@ -1,8 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T145358.279667.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). diff --git a/Benchmarking/.snakemake/log/2022-09-08T150009.566381.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T150009.566381.snakemake.log deleted file mode 100644 index 79b39ab..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T150009.566381.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T150009.566381.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T151241.745673.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T151241.745673.snakemake.log deleted file mode 100644 index 47165f7..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T151241.745673.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T151241.745673.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T151446.425062.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T151446.425062.snakemake.log deleted file mode 100644 index 73a8384..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T151446.425062.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T151446.425062.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T151659.197199.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T151659.197199.snakemake.log deleted file mode 100644 index dfb79fd..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T151659.197199.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. 
-Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T151659.197199.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T152841.441632.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T152841.441632.snakemake.log deleted file mode 100644 index cb217f9..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T152841.441632.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T152841.441632.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T153008.691643.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T153008.691643.snakemake.log deleted file mode 100644 index 6d428d3..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T153008.691643.snakemake.log +++ /dev/null @@ -1,13 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T153008.691643.snakemake.log -Executing main workflow. -Creating report... -Adding Prediction_Summary.tsv (0.33 MB). -Adding Prediction_Summary.tsv (0.49 MB). -Adding Prediction_Summary.tsv (0.95 MB). -Adding Consensus.png (0.031 MB). -Adding Consensus.png (0.023 MB). -Adding Consensus.png (0.052 MB). -Downloading resources and rendering HTML. -Report created: report.html. diff --git a/Benchmarking/.snakemake/log/2022-09-08T153018.042895.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T153018.042895.snakemake.log deleted file mode 100644 index f934821..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T153018.042895.snakemake.log +++ /dev/null @@ -1,24 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T153018.042895.snakemake.log -Executing main workflow. -Using shell: /usr/bin/bash -Provided cores: 2 -Rules claiming more threads will be scaled down. -Job counts: - count jobs - 1 all - 1 concat - 1 plot - 3 -Select jobs to execute... 
- -[Thu Sep 8 15:30:18 2022] -rule concat: - input: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome - output: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062/Prediction_Summary.tsv, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome/Prediction_Summary.tsv, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome/Prediction_Summary.tsv - log: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062/Gatherpreds.log, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome/Gatherpreds.log, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome/Gatherpreds.log - jobid: 1 - -Terminating processes on user request, this might take some time. -Cancelling snakemake on user request. diff --git a/Benchmarking/.snakemake/log/2022-09-08T153714.016757.snakemake.log b/Benchmarking/.snakemake/log/2022-09-08T153714.016757.snakemake.log deleted file mode 100644 index 9921f22..0000000 --- a/Benchmarking/.snakemake/log/2022-09-08T153714.016757.snakemake.log +++ /dev/null @@ -1,22 +0,0 @@ -Building DAG of jobs... -Nothing to be done. -Complete log: /project/kleinman/hussein.lakkis/from_hydra/scCoAnnotate/Benchmarking/.snakemake/log/2022-09-08T153714.016757.snakemake.log -Executing main workflow. -Using shell: /usr/bin/bash -Provided cores: 2 -Rules claiming more threads will be scaled down. -Job counts: - count jobs - 1 all - 1 concat - 1 plot - 3 -Select jobs to execute... - -[Thu Sep 8 15:37:14 2022] -rule concat: - input: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome - output: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062/Prediction_Summary.tsv, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome/Prediction_Summary.tsv, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome/Prediction_Summary.tsv - log: /project/kleinman/hussein.lakkis/from_hydra/test/BT2016062/Gatherpreds.log, /project/kleinman/hussein.lakkis/from_hydra/test/P-1694_S-1694_multiome/Gatherpreds.log, /project/kleinman/hussein.lakkis/from_hydra/test/P-1701_S-1701_multiome/Gatherpreds.log - jobid: 1 - diff --git a/Config/config.default.yml b/Config/config.default.yml new file mode 100644 index 0000000..788747d --- /dev/null +++ b/Config/config.default.yml @@ -0,0 +1,58 @@ + +SingleR: + threads: 1 + +Correlation: + threads: 1 + +scPred: + threads: 1 + classifier: 'svmRadial' + +scClassify: + threads: 1 + +SciBet: + threads: 1 + +singleCellNet: + threads: 1 + +scHPL: + threads: 1 + classifier: 'svm' + dimred: 'False' + threshold: 0.5 + +SVMlinear: + threads: 1 + threshold: 0 + classifier: 'SVMlinear' + +SVC: + threads: 1 + classifier: 'rbf' + threshold: 0.5 + +ACTINN: + threads: 1 + +scLearn: + threads: 1 + +scID: + threads: 1 + +scAnnotate: + threads: 1 + threshold: 0.5 + +scNym: + threads: 1 + threshold: 0.5 + +CellTypist: + threads: 1 + feature_selection: 'True' + majority_voting: 'True' + threshold: 0.5 diff --git a/Notebooks/annotate_report.Rmd b/Notebooks/annotate_report.Rmd new file mode 100644 index 0000000..d44e134 --- /dev/null +++ b/Notebooks/annotate_report.Rmd @@ -0,0 +1,394 @@ +--- +title: "scCoAnnotate - `r params$sample`" +output: + html_document: + df_print: paged + theme: flatly + toc: yes 
+ toc_float: yes + toc_depth: 1 + code_folding: hide +params: + refs: '' + tools: '' + consensus: '' + output_dir: '' + sample: '' + threads: '' + marker_genes: '' + query: '' +--- + +```{r setup, knitr_options, echo=F} +knitr::opts_chunk$set(message = FALSE, warning=FALSE) +``` + +```{r fig.show='hide', include=F} +library(tidyverse) +library(ComplexHeatmap) +library(Seurat) +library(MetBrewer) +library(plotly) +library(kableExtra) + +#empty plotly plot to make sure the other plotly plots get printed later +plotly_empty() + +# format notebook parameters +threads = as.numeric(params$threads) +refs = strsplit(params$refs, split = ' ')[[1]] +tools = c('Consensus',strsplit(params$tools, split = ' ')[[1]]) +marker_genes = strsplit(params$marker_genes, split = ' ')[[1]] +``` + +```{r} +plot_tool_correlation_heatmap = function(seurat, tools){ + + mat = query@meta.data %>% + select(all_of(tools)) %>% + rownames_to_column('cell') %>% + pivot_longer(!cell) %>% + mutate(value = factor(value)) %>% + mutate(value = as.numeric(value)) %>% + pivot_wider(names_from = name, values_from = value) %>% + column_to_rownames('cell') %>% + cor() + + mat[is.na(mat)] = 0 + + col_fun = circlize::colorRamp2(c(-1, 0, 1), c("#2274A5", "beige", "#F75C03")) + + count = query@meta.data %>% + select(all_of(tools)) %>% + rownames_to_column('cell') %>% + pivot_longer(!cell) %>% + filter(value %in% c('Unresolved', 'No Consensus')) %>% + dplyr::count(name, .drop = F) %>% + mutate(freq = round(n/nrow(query@meta.data)*100)) %>% + select(!n) %>% + column_to_rownames('name') + + count[setdiff(names(seurat@meta.data %>% select(tools)), rownames(count)),] = 0 + count = count[order(match(rownames(count), colnames(mat))), , drop = FALSE] + + ha = columnAnnotation('% Unresolved/No Consensus' = anno_barplot(count, border = F, gp = gpar(fill = '#596475', col = '#596475'))) + + h = ComplexHeatmap::Heatmap(mat, + name = 'Correlation', + col = col_fun, + width = ncol(mat)*unit(7, "mm"), + height = nrow(mat)*unit(7, "mm"), + rect_gp = gpar(col = "white", lwd = 2), + top_annotation = ha, + show_column_dend = F) + return(h) +} + +create_color_pal = function(class, mb = 'Juarez'){ + pal = sample(met.brewer(mb, length(class))) + names(pal) = class + pal['Unresolved'] = 'lightgrey' + pal['No Consensus'] = 'grey' + return(pal) +} + +plot_bar_largest_group = function(seurat, meta_column = '', pal = pal, fr = 0.1){ + +df = seurat@meta.data %>% + count(seurat_clusters, .data[[meta_column]]) %>% + group_by(seurat_clusters) %>% + mutate(`%` = (n / sum(n))) %>% + mutate(meta = ifelse(`%` < fr, NA, .data[[meta_column]])) + +pal = pal[unique(na.omit(df$meta))] + +df = df %>% + mutate(meta = factor(meta, levels = c(NA, names(pal)), exclude = NULL)) + +p1 = df %>% + ggplot(aes(x = seurat_clusters, y = `%`, fill = meta, text = sprintf(" %s
<br> %s ",
                                                     meta,
                                                     scales::percent(`%`, scale = 100, accuracy = .1)))) +
  geom_bar(stat = 'identity', position="fill") +
  scale_fill_manual(values = pal, na.value = 'white', name = '') +
  scale_y_continuous(expand = c(0,0), labels = scales::percent_format(scale = 100, accuracy = 1)) +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.title = element_blank(),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        strip.background = element_blank(),
        aspect.ratio = 0.5)

  p2 = ggplotly(p1, tooltip = c('text')) %>% toWebGL()

  return(p2)
}

plot_percentage_predicted_consensus_class = function(seurat, tools){
  col_fun = circlize::colorRamp2(c(0, 50, 100), c("#2274A5", "beige", "#F75C03"))

x = seurat@meta.data %>%
  select(c('Consensus', tools)) %>%
  group_by(Consensus) %>%
  pivot_longer(!c('Consensus')) %>%
  group_by(Consensus, name) %>%
  count(value, .drop = F) %>%
  mutate(freq = n/sum(n)*100) %>%
  filter(Consensus == value) %>%
  select(name, freq, Consensus) %>%
  pivot_wider(values_from = 'freq', names_from = 'Consensus', values_fill = 0) %>%
  column_to_rownames('name')

  h = ComplexHeatmap::Heatmap(x,
                              name = '%',
                              col = col_fun,
                              width = ncol(x)*unit(4, "mm"),
                              height = nrow(x)*unit(4, "mm"),
                              rect_gp = gpar(col = "white", lwd = 2), row_names_side = 'left',
                              show_row_dend = F, column_names_gp = gpar(size = 7))

  return(h)
}

color_class_seurat = function(seurat, meta_column, pal){
  list = list()
  pal['Unsure'] = 'red'
  pal['No Consensus'] = 'red'
  Idents(seurat) = meta_column
  # use the seurat object passed to the function (not the global query object)
  class = (table(seurat@meta.data[[meta_column]]) %>% as.data.frame() %>% filter(Freq > 20))$Var1

  for(c in class){
    lab = names(Idents(seurat)[Idents(seurat) == c])
    p = DimPlot(seurat, cells.highlight = lab, cols = 'lightgrey', cols.highlight = pal[c], pt.size = 1) + umap_theme + ggtitle(c)
    list[[c]] = p
  }

  return(list)
}

feature_plot_seurat = function(seurat, genes){
  list = list()

  genes = genes[genes %in% rownames(seurat@assays$RNA)]
  for(g in genes){
    p = FeaturePlot(seurat, features = g, cols = c("#F2EFC7", "#BC412B"), order = T) + umap_theme + ggtitle(g)
    list[[g]] = p
  }
  return(list)
}

umap_plotly = function(seurat, meta_column, pal){

  p1 = cbind(seurat@reductions$umap@cell.embeddings, seurat@meta.data) %>%
    slice(sample(1:n())) %>%
    ggplot(aes(UMAP_1, UMAP_2, color = .data[[meta_column]], text = .data[[meta_column]])) +
    geom_point(alpha = 0.8) +
    scale_color_manual(values = pal) +
    theme_bw() + umap_theme + theme(legend.position = 'right')

  p2 = ggplotly(p1, tooltip = c('text')) %>% layout(autosize = F, width = 550, height = 450) %>% toWebGL()

  return(p2)
}
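# NOTE: harmonize_unsure() is called when the prediction summaries are read in
# below, but its definition is not included in this diff. The following is only
# a hedged sketch of what such a helper might look like (an assumption, not the
# pipeline's actual implementation): it maps any prediction that is not a known
# reference label to the shared 'Unresolved' category used throughout the report.
harmonize_unsure = function(pred, lab){
  pred %>%
    mutate(across(!any_of(c('V1', 'cellname')), ~ifelse(.x %in% lab$label, .x, 'Unresolved')))
}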
calculate_percentage_unsure = function(pred, order){
  warn = pred %>%
    select(all_of(order)) %>%
    pivot_longer(all_of(order)) %>%
    mutate(value = factor(value)) %>%
    group_by(name) %>%
    count(value, .drop = F) %>%
    mutate(frac = n/sum(n)*100) %>%
    filter(!(!name == 'Consensus' & value == 'No Consensus')) %>%
    filter(value %in% c('No Consensus', 'Unsure')) %>%
    mutate(warn = case_when(frac >= 70 ~ 'HIGH',
                            frac < 70 & frac > 30 ~ 'MEDIUM',
                            frac <= 30 ~ 'LOW'))

  warn = data.frame(TOOL = warn$name,
                    LABEL = warn$value,
                    PERCENTAGE = warn$frac,
                    FLAG = warn$warn) %>%
    mutate(TOOL = factor(TOOL, levels = order))

  warn = warn[order(warn$TOOL),]

  warn$FLAG = cell_spec(warn$FLAG,
                        bold = T,
                        background = case_when(warn$FLAG == 'HIGH' ~ "red",
                                               warn$FLAG == 'MEDIUM' ~ "yellow",
                                               warn$FLAG == 'LOW' ~ "green"))

  warn$TOOL = cell_spec(warn$TOOL,
                        bold = ifelse(warn$TOOL == 'Consensus', T, F),
                        background = ifelse(warn$TOOL == 'Consensus', 'black', 'white'),
                        color = ifelse(warn$TOOL == 'Consensus', 'white', 'black'))
  return(warn)
}

umap_theme = theme(aspect.ratio = 1,
                   text = element_text(size = 10),
                   axis.title = element_blank(),
                   axis.text = element_blank(),
                   axis.ticks = element_blank(),
                   axis.line = element_blank(),
                   panel.border = element_rect(colour = "grey", fill=NA, size=0.5),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.background = element_blank(),
                   legend.position = "none")
```


```{r}
# read prediction summary for each reference
list = list()

for(r in refs){
  list[[r]]$lab = data.table::fread(paste0(params$output_dir, '/model/', r, '/labels.csv'), header = T)

  list[[r]]$pred = data.table::fread(paste0(params$output_dir, '/', params$sample, '/', r, '/Prediction_Summary.tsv')) %>%
    harmonize_unsure(., list[[r]]$lab)

  # create reference pal
  list[[r]]$pal = create_color_pal(list[[r]]$lab$label)

  #save(list[[r]]$pal, file = paste0(params$output_dir, '/model/', r, '/class_pal.Rda'))
}

# read expression matrix for sample
query = data.table::fread(paste0(params$query),
                          nThread=threads,
                          header=T,
                          data.table=F) %>%
  column_to_rownames('V1')
```

```{r, results='hide'}
# create seurat object from expression matrix
set.seed(12345)
query = t(query)
query = CreateSeuratObject(query, row.names = colnames(query))

query = query %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  FindNeighbors(dims = 1:30) %>%
  FindClusters(resolution = 0.5) %>%
  RunUMAP(dims = 1:30)
```

```{r fig.width=8,fig.height=8,echo=FALSE,message=FALSE,results="asis"}
set.seed(12345)
cat("\n")

for(r in refs){

  query = AddMetaData(query, list[[r]]$pred)

  cat(" \n#", r, "{.tabset} \n")

  cat(" \n## Sample \n")

  cat("

<h3>Clusters</h3>

") + + p = umap_plotly(query, 'seurat_clusters', unname(list[[r]]$pal)) + print(htmltools::tagList(p)) + + cat("\n") + + cat("

<h3>Expression selected genes</h3>

") + + if(!length(marker_genes) == 0){ + l = feature_plot_seurat(query, marker_genes) + if(length(marker_genes) < 9){ + cowplot::plot_grid(plotlist = l[1:9], ncol = 3) %>% print() + }else{ + for(i in seq(from = 1, by = 9, length.out = round(length(marker_genes)/9))){ + cowplot::plot_grid(plotlist = l[i:(i+8)], ncol = 3) %>% print() + } + } + } + + cat("\n") + + cat(" \n## Prediction QC \n") + + cat("

<h3>Percentage Unsure</h3>

") + + calculate_percentage_unsure(list[[r]]$pred, order = tools) %>% + kbl(escape = FALSE, row.names = F) %>% + kable_styling(position = "center") %>% + print() + + cat("\n") + + cat("

<h3>Correlation between tools</h3>

") + + h = plot_tool_correlation_heatmap(query, tools = tools) + draw(h) + + cat("\n") + + cat("

<h3>Percentage overlap between tools and consensus</h3>

") + + h = plot_percentage_predicted_consensus_class(query, tools = tools) + draw(h) + + cat("\n") + + cat(" \n## Prediction {.tabset} \n") + + for(t in tools){ + cat(" \n### ", t , " \n") + + cat("

<h3>Top class per cluster</h3>

") + + p = plot_bar_largest_group(query, t, fr = 0.1, pal = list[[r]]$pal) + print(htmltools::tagList(p)) + + cat("

<h3>UMAP</h3>

") + + cat("\n") + + p = umap_plotly(query, t, list[[r]]$pal) + print(htmltools::tagList(p)) + + cat("\n") + + cat("

<h3>UMAP per class</h3>

") + + l = color_class_seurat(query, t, list[[r]]$pal) + if(length(l)< 9){ + cowplot::plot_grid(plotlist = l[1:9], ncol = 3) %>% print() + }else{ + for(i in seq(from = 1, by = 9, length.out = round(length(l)/9))){ + cowplot::plot_grid(plotlist = l[i:(i+8)], ncol = 3) %>% print() + } + } + + cat("\n") + } +} +``` + +# Report Info + +## Parameters + +```{r echo=FALSE,message=FALSE,results="asis"} +for(p in names(params)){ + cat(" \n -",p,": ", params[[p]], " \n") +} +``` + +## Session + +```{r} +sessionInfo() +``` diff --git a/Notebooks/benchmark_report.Rmd b/Notebooks/benchmark_report.Rmd new file mode 100644 index 0000000..c27b1b5 --- /dev/null +++ b/Notebooks/benchmark_report.Rmd @@ -0,0 +1,244 @@ +--- +title: "scCoAnnotate - Benchmarking" +output: + html_document: + df_print: paged + theme: flatly + toc: yes + toc_float: yes + toc_depth: 1 + code_folding: hide +params: + tools: '' + ref_name: '' + pred_path: '' + fold: '' +--- + +```{r, echo=FALSE} +knitr::opts_chunk$set(message = FALSE, warning=FALSE) +``` + +```{r} +set.seed(1234) +library(tidyverse) +library(caret) +library(ComplexHeatmap) +``` + +```{r} +get_pred = function(pred, tool, true){ + pred %>% + select(tool) %>% + mutate(label = .data[[tool]], + label = ifelse(!label %in% true$label, NA, label), + label = factor(label, ordered = TRUE)) %>% + return() +} + +# Plot confusion matrix as a heatmap +plot_cm = function(cm_table){ + col_fun = circlize::colorRamp2(c(range(cm_table)[1], + range(cm_table)[2]/2, + range(cm_table)[2]), + c("#5C80BC", "#F2EFC7", "#FF595E")) + + h = Heatmap(cm_table, + name = 'Counts', + col = col_fun, + width = ncol(cm_table)*unit(2, "mm"), + height = nrow(cm_table)*unit(2, "mm"), + cluster_rows = F, + cluster_columns = F, + row_names_gp = gpar(fontsize = 7), + column_names_gp = gpar(fontsize = 7), + column_title = 'True Class', + row_title = 'Predicted Class') + + return(h) +} + +# Plot class stat per fold (F1 etc) as barplot +plot_stat = function(cm_byclass, stat){ +p = cm_byclass %>% + as.data.frame() %>% + rownames_to_column('class') %>% + separate(class, into = c(NA, 'class'), sep = ': ') %>% + ggplot(aes(reorder(class, -.data[[stat]]), .data[[stat]])) + + geom_bar(stat = 'identity', col = 'white', fill = 'lightgrey') + + theme_bw() + + theme(text = element_text(size = 10), + axis.title.x = element_blank(), + axis.line = element_line(size = 0.5), + panel.border = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.x = element_text(angle = 45, + vjust = 1, + hjust=1), + aspect.ratio = 0.5) + + scale_y_continuous(expand = c(0, 0)) + + geom_hline(yintercept = c(1, 0.5), linetype = 'dotted', color = 'red') + +return(p) +} + +# plot F1 accross folds for each class as a boxplot +plot_stat_boxplot = function(list, tool, stat){ + +df = lapply(list[[tool]], get_stat, stat = stat) + +bind_rows(df) %>% + ggplot(aes(reorder(class, -.data[[stat]], mean), .data[[stat]])) + + geom_boxplot() + + theme_bw() + + theme(text = element_text(size = 10), + axis.title.x = element_blank(), + axis.line = element_line(size = 0.5), + panel.border = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.x = element_text(angle = 45, + vjust = 1, + hjust=1), + aspect.ratio = 0.5) + + scale_y_continuous(limits = c(0, 1), + expand = c(0, 0)) + + geom_hline(yintercept = c(1, 0.5), linetype = 'dotted', color = 'red') +} + +# plot average stat for all tools +plot_mean_tool = function(list, stat, tools){ + +df = lapply(list, 
diff --git a/Notebooks/benchmark_report.Rmd b/Notebooks/benchmark_report.Rmd
new file mode 100644
index 0000000..c27b1b5
--- /dev/null
+++ b/Notebooks/benchmark_report.Rmd
@@ -0,0 +1,244 @@
---
title: "scCoAnnotate - Benchmarking"
output:
  html_document:
    df_print: paged
    theme: flatly
    toc: yes
    toc_float: yes
    toc_depth: 1
    code_folding: hide
params:
  tools: ''
  ref_name: ''
  pred_path: ''
  fold: ''
---

```{r, echo=FALSE}
knitr::opts_chunk$set(message = FALSE, warning=FALSE)
```

```{r}
set.seed(1234)
library(tidyverse)
library(caret)
library(ComplexHeatmap)
```

```{r}
get_pred = function(pred, tool, true){
  pred %>%
    select(tool) %>%
    mutate(label = .data[[tool]],
           label = ifelse(!label %in% true$label, NA, label),
           # use the reference levels so confusionMatrix() sees identical factor levels
           label = factor(label, levels = levels(true$label), ordered = TRUE)) %>%
    return()
}

# Plot confusion matrix as a heatmap
plot_cm = function(cm_table){
  col_fun = circlize::colorRamp2(c(range(cm_table)[1],
                                   range(cm_table)[2]/2,
                                   range(cm_table)[2]),
                                 c("#5C80BC", "#F2EFC7", "#FF595E"))

  h = Heatmap(cm_table,
              name = 'Counts',
              col = col_fun,
              width = ncol(cm_table)*unit(2, "mm"),
              height = nrow(cm_table)*unit(2, "mm"),
              cluster_rows = F,
              cluster_columns = F,
              row_names_gp = gpar(fontsize = 7),
              column_names_gp = gpar(fontsize = 7),
              column_title = 'True Class',
              row_title = 'Predicted Class')

  return(h)
}

# Plot class stat per fold (F1 etc) as barplot
plot_stat = function(cm_byclass, stat){
p = cm_byclass %>%
  as.data.frame() %>%
  rownames_to_column('class') %>%
  separate(class, into = c(NA, 'class'), sep = ': ') %>%
  ggplot(aes(reorder(class, -.data[[stat]]), .data[[stat]])) +
  geom_bar(stat = 'identity', col = 'white', fill = 'lightgrey') +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.title.x = element_blank(),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   vjust = 1,
                                   hjust=1),
        aspect.ratio = 0.5) +
  scale_y_continuous(expand = c(0, 0)) +
  geom_hline(yintercept = c(1, 0.5), linetype = 'dotted', color = 'red')

return(p)
}

# plot F1 across folds for each class as a boxplot
plot_stat_boxplot = function(list, tool, stat){

df = lapply(list[[tool]], get_stat, stat = stat)

bind_rows(df) %>%
  ggplot(aes(reorder(class, -.data[[stat]], mean), .data[[stat]])) +
  geom_boxplot() +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.title.x = element_blank(),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   vjust = 1,
                                   hjust=1),
        aspect.ratio = 0.5) +
  scale_y_continuous(limits = c(0, 1),
                     expand = c(0, 0)) +
  geom_hline(yintercept = c(1, 0.5), linetype = 'dotted', color = 'red')
}

# plot average stat for all tools
plot_mean_tool = function(list, stat, tools){

df = lapply(list, function(x){lapply(x, get_stat, stat = stat) %>% bind_rows()})

df = bind_rows(df) %>%
  group_by(class, tool) %>%
  mutate(mean = mean(.data[[stat]])) %>%
  distinct(class, tool, mean) %>%
  pivot_wider(names_from = 'class', values_from = mean) %>%
  column_to_rownames('tool')

df[is.na(df)] = 0

col_fun = circlize::colorRamp2(c(0,
                                 range(df)[2]/2,
                                 range(df)[2]),
                               c("#3B5B91", "#F2EFC7", "#CC0007"))

split = c('Consensus', rep('tools', length(tools)-1))

h = Heatmap(df,
            name = paste('Mean ', stat),
            col = col_fun,
            width = ncol(df)*unit(4, "mm"),
            height = nrow(df)*unit(6, "mm"),
            row_names_side = 'left',
            row_names_gp = gpar(fontsize = 12),
            show_column_dend = F,
            show_row_dend = F,
            row_split = split,
            cluster_row_slices = F,
            row_title = NULL)

return(h)
}

#--------- HELPER FUNCTIONS ----------------

# gets stat for each fold and returns data frame
get_stat = function(x, stat){
  x$byClass %>%
    as.data.frame() %>%
    rownames_to_column('class') %>%
    separate(class, into = c(NA, 'class'), sep = ': ') %>%
    select(class, .data[[stat]]) %>%
    mutate(fold = x$fold,
           tool = x$tool)
}
#-------------------------------------------
```

```{r}
tools = c('Consensus', strsplit(params$tools, split = ' ')[[1]])
fold = as.numeric(params$fold)
```

```{r}
# Read prediction and true labels for each tool and each fold and calculate confusion matrix and stats
# Save everything in a list object with hierarchy TOOL > FOLD > STATS
list = list()
for(n in 1:fold){

  # read true labels
  true = data.table::fread(paste0(params$pred_path, '/fold', n, '/test_labels.csv'), header = T) %>%
    column_to_rownames('V1') %>%
    mutate(label = factor(label, ordered = TRUE))

  # read prediction summary for fold
  pred = data.table::fread(paste0(params$pred_path, '/fold', n, '/Prediction_Summary.tsv'), header = T) %>%
    column_to_rownames('cellname')

  for(t in tools){

    tmp = get_pred(pred, t, true)

    list[[t]][[n]] = confusionMatrix(data = tmp$label, reference = true$label, mode = 'everything')
    list[[t]][[n]]$fold = paste0('fold', n)
    list[[t]][[n]]$tool = t
  }
}
```

```{r fig.width=10,echo=FALSE,message=FALSE,results="asis"}
cat(" \n#", params$ref_name , "{.tabset} \n")

cat(" \n## Summary \n")
cat("

<h3>Average F1 score per tool and class</h3>

") + +plot_mean_tool(list, 'F1', tools) + +cat("\n") + +for(t in tools) { + cat(" \n##", t, "{.tabset} \n") + + print(plot_stat_boxplot(list, t, 'F1')) + + cat("\n") + + for(n in 1:fold){ + cat(" \n###", paste0('Fold ', n), " \n") + + cat("

<h3>Confusion Matrix</h3>

") + + draw(plot_cm(list[[t]][[n]]$table)) + + cat("

<h3>F1</h3>

") + + print(plot_stat(list[[t]][[n]]$byClass, 'F1')) + + cat("\n") + } +} +``` + +# Report Info + +## Parameters +```{r echo=FALSE,message=FALSE,results="asis"} +for(p in names(params)){ + cat(" \n -",p,": ", params[[p]], " \n") +} +``` + + +## Session + +```{r} +sessionInfo() +``` + + + + diff --git a/README.md b/README.md index 6fd72e8..9066f90 100644 --- a/README.md +++ b/README.md @@ -1,237 +1,653 @@ # scCoAnnotate -# Summary +Snakemake pipeline for consensus prediction of cell types in single-cell RNA sequencing (scRNA-seq) data. The pipeline allows users to run up to 15 different reference-based annotation tools (statistical models and machine learning approaches) to predict cell type labels of multiple scRNA-seq samples. It then outputs a consensus of the predictions, which has been found to have increased accuracy in benchmarking experiments compared to the individual predictions alone, by combining the strengths of the different approaches. -scRNA-seq based prediction of cell-types using a fast and efficient Snakemake pipeline to increase automation and reduce the need to run several scripts and experiments. The pipeline allows the user to select what single-cell annotation tools they want to run on a selected reference to annotate a list of query datasets. It then outputs a consensus of the predictions across tools selected. This pipeline trains classifiers on genes common to the reference and all query datasets. +The pipeline is automated and running it does not require prior knowledge of machine learning. It also features parallelization options to exploit available computational resources for maximal efficiency. This pipeline trains classifiers on genes common to the reference and all query datasets. -The pipeline also features parallelization options to exploit the computational resources available. +Two different workflows can be run as part of scCoAnnotate. The annotation workflow takes both a references data set and query samples with the aim of annotating the query samples. The benchmarking workflow takes only the reference and preforms a M fold cross validation. -# Installation and Dependencies + -Install [Snakemake](https://snakemake.readthedocs.io/en/stable/) in your linux environment. +See the snakemake rule graph for a more detailed description of the annotation workflow: +[Annotation Workflow](rulegraph.annotation.pdf) -You need to have have [R](https://www.r-project.org/) Version 4.0.5 and Python 3.6.5. +# :running_woman: Quickstart tutorial -```bash -$ conda activate base -$ mamba create -c conda-forge -c bioconda -n snakemake snakemake -``` +1. [Clone repository and install dependencies](#1-clone-repository-and-install-dependencies) +2. [Prepare reference](#2-prepare-reference) +3. [Prepare query samples](#3-prepare-query-samples) +4. [Prepare config file](#4-prepare-config-file) +5. [Prepare HPC submission script](#5-prepare-hpc-submission-script) +### 1. Clone repository and install dependencies +This step is not nessesary if you are part of the Kleinman group! -You need to also install all the dependancies for the tools you plan on using. You have to copy everything present in this repository and not break paths because it would disrupt the dependancies. One note is to change the paths in run_ACTINN.py to match your own directories when you clone the repository. The paths are in lines 44,45,49. 
+Clone git repository in appropriate location: +```bash +git clone https://github.com/fungenomics/scCoAnnotate.git +``` +Install R packages and python modules as specified in [Installation and Dependencies](#gear-installation-and-dependencies) +If you are part of the Kleinman group you only need to load the module on Narval or Hydra: -Current version of snakemake is snakemake/5.32.0 +```bash +module load scCoAnnotate/2.0 +``` -# Quickstart +### 2. Prepare reference -Using snakemake is straight forward and simple. The rules and processes are arranged as per this rule graph: +The input format for the references is a **cell x gene matrix** (.csv) of raw counts and a **cell x label matrix** (.csv). -Rule preprocess gets the common genes and creates temporary reference and query datasets based ob the common genes. Rule concat appends all predictions into one tab seperate file (prediction_summary.tsv) and gets the consensus prediction +Both the **cell x gene matrix** and **cell x label matrix** need the first column to be the cell names in matching order with an empty column name. +**cell x gene matrix** +```bash +'',gene1,gene2,gene3 +cell1,1,24,30 +cell2,54,20,61 +cell3,0,12,0 +cell4,1,13,17 +``` -![dag](https://user-images.githubusercontent.com/59002771/191146873-5c680bbd-d11c-418c-ae96-7662ee7f99ed.png) + **cell x label matrix** +```bash +'',label +cell1,label1 +cell2,label1 +cell3,label3 +cell4,label2 +``` +### 3. Prepare query samples +The input format for the query samples is a **cell x gene matrix** (.csv) of raw counts. -You need to set everything up in a config file and then run the following command: +The first column needs to be the cell names with an empty column name. +**cell x gene matrix** ```bash -snakemake --use-conda --configfile config.yml --cores 3 +'',gene1,gene2,gene3 +cell1,27,1,34 +cell2,0,12,56 +cell3,0,17,12 +cell4,54,20,61 ``` -## Config File: +### 4. Prepare config file + +For each set of query samples a config file needs to be prepared with information about the samples, the reference, the tools you want to run and how to calculate the consensus. + +Multiple references can be specified with an unique **reference name**. + +Full list of available tools can be found here: [Available tools](#hammer-and-wrench-available-tools) +Make sure that the names of the selected tools have the same capitalization and format as this list. + +The consensus method selected in **consensus_tools** can either be 'all' (which uses all the tools in **tools_to_run**) or a list of tools to include. + +See: [Example Config](example.config.yml) + ```yaml -# target directory -output_dir: +# target directory +output_dir: +output_dir_benchmark: + # path to reference to train classifiers on (cell x gene raw counts) -training_reference: -# path to annotations for the reference (csv file with cellname and label headers) -reference_annotations: +references: + : + expression: + labels: + : + expression: + labels: + # path to query datasets (cell x gene raw counts) query_datasets: - - - - - . - . -# step to check if required genes are kept between query and reference -check_genes: False -# path for the genes required -genes_required: Null -# rejection option for SVM -rejection: True + : + : + +# convert gene symbols in reference from mouse to human +convert_ref_mm_to_hg: False + # classifiers to run tools_to_run: - - - - - . - . -# benchmark tools on reference -benchmark: False -plots: True -consensus: - - - - - . 
+ - tool1 + - tool2 + +# consensus method +consensus_tools: + - all + +# benchmark parameters +benchmark: + n_folds: +``` +See: [Changing Default Parameters](#changing-default-parameters) + +### 5. Prepare HPC submission script +To run the snakemake pipeline on an HPC, a submission script needs to be prepared. + +See: [Example Bash Script](example.submit.sh) + +```bash +#!/bin/sh +#SBATCH --job-name=scCoAnnotate +#SBATCH --account=rrg-kleinman +#SBATCH --output=logs/%x.out +#SBATCH --error=logs/%x.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=5 +#SBATCH --time=24:00:00 +#SBATCH --mem-per-cpu=60GB + +module load scCoAnnotate/2.0 + +# path to snakefile and config +snakefile= +config= + +# unlock directory in case of previous errors +snakemake -s ${snakefile} --configfile ${config} --unlock + +# run workflow +snakemake -s ${snakefile} --configfile ${config} --cores 5 ``` +Depending on whether you want to run the annotation workflow or the benchmarking workflow, the snakefile path needs to point to either [snakefile.annotate](snakefile.annotate) or [snakefile.benchmark](snakefile.benchmark) + +**OBS!!** Make sure that the number of cores requested matches the number of cores in the snakemake command for optimal use of resources + +## Changing Default Parameters + +The pipeline uses a default config file in addition to the user-defined one to specify tool parameters as well as cluster options. For the full list of parameters you can change, see: [Default Config](Config/config.default.yml) + +To override these values you can either add a corresponding section to your config file or copy the whole default config to your run folder, change the values and add it as an extra config in the submission script. The second option may be preferable if you are changing many of the default parameters. + +The order in which parameters are overwritten is as follows: +1. Config specified in the snakefile (in this case the default config) +2. Config specified as snakemake argument with `--configfile` (in the order they are added) +3. 
Parameters specified directly in snakemake argument with `--config` +### Option 1: Add corresponding section to your own config file -### An Example Config is attached +**Case:** You want to change the probability cutoff threshold from 0.5 to 0.25 for **scHPL** + +This section is found in the default config: + +```yaml +scHPL: + threads: 1 + classifier: 'svm' + dimred: 'False' + threshold: 0.5 +``` + +Create a corresponding section in your config and change the threshold value to 0.25: ```yaml -# target directory -output_dir: /project/kleinman/hussein.lakkis/from_hydra/test +# target directory +output_dir: +output_dir_benchmark: + # path to reference to train classifiers on (cell x gene raw counts) -training_reference: /project/kleinman/hussein.lakkis/from_hydra/2022_01_10-Datasets/Cortex/p0/expression.csv -# path to annotations for the reference (csv file with cellname and label headers) -reference_annotations: /project/kleinman/hussein.lakkis/from_hydra/2022_01_10-Datasets/Cortex/p0/labels.csv +references: + : + expression: + labels: + : + expression: + labels: + # path to query datasets (cell x gene raw counts) query_datasets: - - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/BT2016062/expression.csv - - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/P-1694_S-1694_multiome/expression.csv - - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/P-1701_S-1701_multiome/expression.csv -# step to check if required genes are kept between query and reference -check_genes: False -# path for the genes required -genes_required: Null -# rejection option for SVM -rejection: True + : + : + +# convert gene symbols in reference from mouse to human +convert_ref_mm_to_hg: False + # classifiers to run tools_to_run: - - ACTINN - - scHPL - - scClassify - - correlation - - scmapcluster - - scPred - - SingleCellNet - - SVM_reject - - SingleR - - CHETAH - - scmapcell - - SciBet -# benchmark tools on reference -benchmark: False -plots: True -consensus: - - all -``` - -## Submission File: - -An example of the submission file is also available in this repository and is called submit.sh. This is for TORQUE schedulers. - - -``` bash -#!/usr/bin/bash -#PBS -N scCoAnnotate -#PBS -o logs/err.txt -#PBS -e logs/out.txt -#PBS -l walltime=20:00:00 -#PBS -l nodes=1:ppn=3 -#PBS -l mem=125G -#PBS -l vmem=125G - -# you need to do this step - -cd {root directory of the pipeline} -mkdir -p logs - -# set up the environment -#conda init bash -module load miniconda/3.8 -source ~/.conda_init.sh -module load snakemake/5.32.0 -module load hydra/R/4.0.5 -module load python/3.6.5 - -# Run the snakemake -snakemake --use-conda --configfile config.yml --cores 3 -``` - -# Tools Available - -1. [ACTINN](https://github.com/mafeiyang/ACTINN) -2. [SciBet](https://github.com/PaulingLiu/scibet) -4. [Spearman Correlation](https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php) -5. [SVM](https://scikit-learn.org/stable/modules/svm.html) -6. SVM Rejection -7. [SingleR](https://bioconductor.org/packages/release/bioc/html/SingleR.html) -8. [SingleCellNet](https://github.com/pcahan1/singleCellNet) -9. [CHETAH](https://www.bioconductor.org/packages/release/bioc/html/CHETAH.html) -10. [scHPL](https://github.com/lcmmichielsen/scHPL) -11. [scPred](https://github.com/powellgenomicslab/scPred) -12. 
[scmap (cell and cluster)](https://bioconductor.org/packages/release/bioc/html/scmap.html) - - -# Packages Required: - -## Python Specific Libraries: - -``` -tensorboard==1.7.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.0 -tensorflow==1.7.0 -tensorflow-estimator==2.5.0 -sklearn==0.0 -scikit-learn==0.24.1 -pandas==1.1.5 -numpy==1.19.5 -numpy-groupies==0.9.13 -numpydoc==1.1.0 -scHPL==0.0.2 -``` - -## R Libraries: - -``` -scPred_1.9.2 -SingleCellExperiment_1.12.0 -SummarizedExperiment_1.20.0 -CHETAH_1.6.0 -scmap_1.12.0 -singleCellNet == 0.1.0 -scibet == 1.0 -SingleR == 1.4.1 -Seurat == 4.0.3 -dplyr == 1.0.7 -tidyr == 1.1.3 -viridis == 0.6.1 -ggsci == 2.9 -tidyverse == 1.3.1 -``` -# Adding New Tools: - -to add new tools, you have to add this template to the the snakefile as such: - -``` R -rule {rulename}: - input: - reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), - labfile = config['reference_annotations'], - query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), - output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) + - tool1 + - tool2 + +# consensus method +consensus_tools: + - all + +# benchmark parameters +benchmark: + n_folds: + +# additional parameters +scHPL: + threshold: 0.25 +``` + +### Option 2: Copy the whole default config and add it as an extra config file in the snakemake command + +In this case your submission script would look like this: + +```bash +# path to snakefile and config +snakefile= +config= +extra_config= + +# run workflow +snakemake -s ${snakefile} --configfile ${config} ${extra_config} --cores 5 +``` + +# Outputs + +## Output folder structure + +In each output folder there will be one folder per sample as well as a folder for the models. In each sample folder and in the model folder there will be subfolders for each reference specified in the config file. Each sample folder will also contain a report folder with an `.html` report of the prediction results. Each reference folder contains subfolders with each individual tool's model and prediction results. + +``` +out/ +├── sample1 +│   ├── reference1 +│   ├── reference2 +│   └── report +├── sample2 +│   ├── reference1 +│   ├── reference2 +│   └── report +└── model + ├── reference1 + └── reference2 +``` + +## Output files + + +# :gear: Installation and Dependencies + +This tool has been designed and tested for use on a high-performance computing cluster (HPC) with a SLURM workload manager. + +It has been tested with [R](https://www.r-project.org/) version 4.2.2 and Python version 3.11.2. 
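+ +A quick way to verify the Python side of the installation is a short import check (a minimal sketch, not part of the repository; the module import names correspond to the pip list in [Python modules](#python-modules) below): + +```python +# hypothetical sanity check: confirm the main Python dependencies can be found +import importlib.util + +mods = ["numpy", "pandas", "scHPL", "sklearn", "anndata", + "scanpy", "tensorflow", "celltypist", "snakemake"] +missing = [m for m in mods if importlib.util.find_spec(m) is None] +if missing: + raise SystemExit("Missing modules: " + ", ".join(missing)) +``` 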
+ +## R packages - CRAN + +```R +pkg = c("Seurat", + "tidyverse", + "MetBrewer", + "plotly", + "caret", + "Matrix", + "scAnnotate") + +install.packages(pkg) +``` + +An older version of the Matrix package needs to be installed for Seurat to work: https://github.com/satijalab/seurat/issues/6746 + +```R +if (!require("devtools", quietly = TRUE)) + install.packages("devtools") + +devtools::install_version("Matrix", version = "1.5.3", repos = "http://cran.us.r-project.org") +``` + +## R packages - Bioconductor + +```R +if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +pkg = c("SingleCellExperiment", + "SummarizedExperiment", + "ComplexHeatmap", + "WGCNA", + "SingleR", + "scClassify", + "scuttle", + "scran", + "M3Drop", + "scAnnotate", + "Orthology.eg.db", + "org.Mm.eg.db", + "org.Hs.eg.db") + +BiocManager::install(pkg) +``` + +## R packages - Github + +```R +if (!require("devtools", quietly = TRUE)) + install.packages("devtools") + +pkg = c("pcahan1/singleCellNet", + "powellgenomicslab/scPred", + "PaulingLiu/scibet", + "bm2-lab/scLearn", + "BatadaLab/scID") + +devtools::install_github(pkg) +``` + +## Python modules + +```bash +pip install numpy pandas scHPL scikit-learn anndata matplotlib scanpy datetime tensorflow tables celltypist snakemake +``` + +# :hammer_and_wrench: Available tools + +## Single cell RNA reference + single cell RNA query + +```yaml +- scPred +- SingleR +- scClassify +- SciBet +- singleCellNet +- scHPL +- SVMlinear +- SVC +- scLearn +- ACTINN +- Correlation +- scID +- scAnnotate +- scNym +- CellTypist +``` + +## Single cell RNA reference + spatial RNA query + +```yaml +- Tangram +``` + +# :floppy_disk: Resources + +Add table with resource usage for references and queries of different sizes + +# :woman_mechanic: Adding new tools + +**1. Identify new tool** - output: - pred = expand("{output_dir}/{sample}/{rulename}/{rulename}_pred.csv", sample = samples,output_dir=config["output_dir"]), - query_time = expand("{output_dir}/{sample}/{rulename}/{rulename}_query_time.csv",sample = samples,output_dir=config["output_dir"]), - training_time = expand("{output_dir}/{sample}/{rulename}/{rulename}_training_time.csv",sample = samples,output_dir=config["output_dir"]) - log: expand("{output_dir}/{sample}/{rulename}/{rulename}.log", sample = samples,output_dir=config["output_dir"]) - shell: - "Rscript Scripts/run_{rulename}.R " - "--ref {input.reference} " - "--labs {input.labfile} " - "--query {input.query} " - "--output_dir {input.output_dir} " - "&> {log}" - - ``` - The tool script you add must generate outputs that match the output of the rule.. +**2. Read documentation. Try to find this information:** +- Can the tool be split into training and prediction steps? +- What normalization does the tool expect for the reference and query? +- Can the tool be parallelized? How? +- Does the tool expect the exact same set of features in the reference and the query? +- Are there extra outputs from the training and prediction that may be useful to save (QC plots, probability/p-value associated with each prediction, etc.)? +- Are there parameters in the training and prediction that can be changed? What are the default values? +**3. Open issue and create a branch from dev with the tool name + your name** +**4. 
Write scripts (check Templats folder for how to structure scripts: [Templats](Templats))** +- The scripts should take the reference expression/labels and query expression .csv files as specified in [Prepare reference](#2-prepare-reference) and [Prepare query samples](#3-prepare-query-samples) +- The scripts should take any additional parameters you want to be able to change as input + +**5. Update the snakefiles with rules for the new tool (ask Alva if you need help)** + +**6. Update README** +- Write detailed documentation about the tool in the section: [Detailed documentation on tool wrapper scripts](#female-detective-detailed-documentation-on-tool-wrapper-scripts) +- Detailed documentation should include information from step 2 and any changes to default parameters/normalization etc. Links to papers and tutorials used to create the scripts can be put here. +- Update other sections of the README such as [Installation and Dependencies](#gear-installation-and-dependencies) and [Available tools](#hammer-and-wrench-available-tools) + +**7. Create pull request from your branch to dev and request reviewer** + +**8. When pull request is approved merge with dev** +- Rebase with dev +- Squash + merge + +**9. Make sure module on Narval/Hydra gets updated with necessary packages** + +# 🐍 Snakemake Tips and Tricks + +- Dry-run the snakemake pipeline before submitting a job +```bash +snakemake -s ${snakefile} --configfile ${config} -n +``` + +- Unlock the working directory before running (in case a previous run crashed) by adding this to your script +```bash +snakemake -s ${snakefile} --configfile ${config} --unlock +``` + +- Add `--rerun-incomplete` if snakemake finds incomplete files from a previous run that were not successfully removed +```bash +snakemake -s ${snakefile} --configfile ${config} --rerun-incomplete +``` + +- Update the time stamp on files to avoid rerunning rules if code has changed +```bash +snakemake -s ${snakefile} --configfile ${config} -c1 -R $(snakemake -s ${snakefile} --configfile ${config} -c1 --list-code-changes) --touch +``` + +- Generate a report with information about the snakemake workflow +```bash +snakemake -s ${snakefile} --configfile ${config} --report ${report} +``` + +# :female_detective: Detailed documentation on tool wrapper scripts + +## scClassify + +Documentation written by: Bhavyaa Chandarana +Date written: 2023-07 + +scClassify workflow was generated using the tutorial below: +https://www.bioconductor.org/packages/release/bioc/vignettes/scClassify/inst/doc/scClassify.html + +* scCoAnnotate input reference and query have cells as the rows, genes as columns. scClassify (and the Seurat function used for normalization, see below) requires genes on the rows and cells on the columns. Therefore, I used `WGCNA::transposeBigData()` (a function optimized for large sparse matrices) to transpose the inputs before normalization and training/prediction. + +* scClassify documentation defines "log-transformed" data as "size-factor normalized" data ([source](https://www.bioconductor.org/packages/devel/bioc/vignettes/scClassify/inst/doc/scClassify.html#2_Setting_up_the_data)). 
Function documentation for both `train_scClassify()` and `predict_scClassify()` specifies that reference and query data must be "log-transformed" ([source](https://www.bioconductor.org/packages/release/bioc/manuals/scClassify/man/scClassify.pdf)). Therefore, I am normalizing both query and reference with `Seurat::NormalizeData()` (default parameters), which performs log normalization with a scale factor of 10000 ([source](https://satijalab.org/seurat/reference/normalizedata)) + +* scClassify train and predict functions `train_scClassify()` and `predict_scClassify()` both allow parallelization with package `BiocParallel`. If more than one thread was requested by the user, I turn parallelization on with `parallel = TRUE` and set the `BiocParallel` parameter to `BiocParallel::MulticoreParam()` with workers equal to the number of requested threads (based on code in [this issue](https://github.com/SydneyBioX/scClassify/issues/14)). Otherwise, I have set parallelization to FALSE and the `BiocParallel` parameter to `BiocParallel::SerialParam()` (which is the default value of the parameter in the functions - [source](https://www.bioconductor.org/packages/release/bioc/manuals/scClassify/man/scClassify.pdf)). + +* scClassify train function `train_scClassify()` can either return a list output for the model, or an R object of class `scClassifyTrainModel`, based on boolean argument `returnList`. Both types work as input for prediction with `predict_scClassify()`. However, in order to use `scClassify::cellTypeTree()` to extract and output the tree produced by scClassify during training, the input must be the R object of class `scClassifyTrainModel`. Therefore, I have chosen to set `returnList` in `train_scClassify()` to FALSE (default: TRUE), and use the resulting object for `cellTypeTree()`. (Function documentation [source](https://www.bioconductor.org/packages/release/bioc/manuals/scClassify/man/scClassify.pdf)) + +* `scClassify::plotCellTypeTree()` produces a ggplot object. Therefore, I am using `ggplot2::ggsave()` to save it as a png file. (Function documentation [source](https://www.bioconductor.org/packages/release/bioc/manuals/scClassify/man/scClassify.pdf)) + +## scPred + +Documentation written by: Alva Annett +Date written: 2023-07 + +Normalization and parameters based on this tutorial: +https://powellgenomicslab.github.io/scPred/articles/introduction.html + +* Both reference and query are normalized using `Seurat::NormalizeData()`. +A computed PCA space is needed; dims are set to 1:30 according to the tutorial. + +* The default model is `SVMradial`. An option to switch models should be set up in snakemake. + +## SingleR + +Documentation written by: Alva Annett +Date written: 2023-07 + +Normalization and parameters based on this tutorial: +http://www.bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/SingleR.html#3_Using_single-cell_references + +* Both reference and query are converted to SingleCellExperiment objects and then normalized using `scuttle::logNormCounts()`. + +* Deviation from default parameters: `de.method = "wilcox"`, the method for generating marker genes for each class in the reference. Wilcox is recommended when single-cell data is used as the reference. + +## singleCellNet + +Documentation written by: Rodrigo Lopez Gutierrez +Date written: 2023-08-01 + +singleCellNet workflow was generated following the tutorial below: +https://pcahan1.github.io/singleCellNet/ + +Input for `singleCellNet` is raw counts for both reference and query. 
The reference is normalized within the `scn_train()` function. The query is currently not normalized. In the tutorial example they used raw query data. Furthermore, according to the tutorial, the classification step is robust to the normalization and transformation steps of the query data sets. They claim that one can even directly use raw data to query and still obtain accurate classification. This could be tested in the future with our data to see if normalized queries perform better. + +Normal parameters were used in both the training and prediction functions, with the exception of the following parameters: +* In `scn_train()`, we used parameter `nTrees = 500` compared to the default `nTrees = 1000`. This parameter changes the number of trees for the random forest classifier. The value selected is based on Hussein's thesis and is changed to improve the speed of `singleCellNet`. It is mentioned that additional training parameters may need to be adjusted depending on the quality of the reference data. Additionally, the tutorial mentions that classifier performance may increase if the values for `nTopGenes` and `nTopGenePairs` are increased. +* In `scn_predict()`, we used parameter `nrand = 0` compared to the default `nrand = 50`. This parameter changes the number of randomized single cell RNA-seq profiles which serve as positive controls that should be mostly classified into the `rand` (unknown) category. If left at the default value, this would generate extra cells that might complicate downstream consolidation of the consensus predictions for each cell. Therefore, the selected value is used to avoid this complication. + +## Correlation + +Documentation written by: Rodrigo Lopez Gutierrez +Date written: 2023-08-02 + +The Correlation tool runs a correlation-based cell type prediction on a sample of interest, given the mean gene expression per label for a reference. +The function to label by Spearman correlation was originally generated by Selin Jessa and Marie Coutelier. +Path to original file on Narval compute cluster: `/lustre06/project/6004736/sjessa/from_narval/HGG-oncohistones/stable/code/scripts/predict_celltype_cor.R` + +Input for `Correlation` is raw counts for both reference and query. Both the reference and the query are normalized using `Seurat::NormalizeData()`. + +The training script generates a matrix with the mean gene expression for each label in the reference. +The prediction script calculates a correlation between each cell in the query and each label in the mean gene expression matrix generated by the training script. Then we assign each cell the most highly correlated label. +* The `label_correlation()` function has a parameter `threshold_common_genes` which sets the percentage of query dataset genes required to be in the reference dataset in order to proceed. This parameter is currently not utilized, as the preprocessing done at the beginning of the snakefile already extracts only the common genes between the reference and the queries. + +The script currently only outputs a table with each cell, the most highly correlated label, and the corresponding correlation score for that label. In the future we could export the full correlation matrix, if necessary. + +## scLearn + +Documentation written by: Bhavyaa Chandarana, updated by Tomas Vega Waichman +Date written: 2023-08-04 + +scLearn workflow was generated using the following tutorial: https://github.com/bm2-lab/scLearn#single-label-single-cell-assignment + +* scCoAnnotate input reference and query have cells as the rows, genes as columns. 
scLearn requires genes on the rows and cells on the columns. Therefore, I used `WGCNA::transposeBigData()` (a function optimized for large sparse matrices) to transpose the inputs before normalization and training/prediction. + +* In order to avoid cell filtering, the reference and query matrices were normalized using `Seurat::NormalizeData()`. The authors originally log-normalized in the same way but with a custom function (using a scale.factor of 10000 and then log(ref + 1)). Because of this, the scLearn function argument `species` is not used. This allows us to use this method with species other than human or mouse (the only two accepted arguments). + +* Used default value `10` for argument `bootstrap_times` in the training function. According to tool documentation, this can be increased to improve accuracy for unassigned cells(?) but increases training time. + +* Default parameters were used for the prediction step. + +* Added some outputs: for training, added a table with the genes selected for the model. For prediction, added an output with the whole data frame containing the probabilities for each cell. + +## ACTINN + +Documentation written by: Alva Annett +Date written: 2023-08-08 + +ACTINN code is based on `actinn_format.py` and `actinn_predict.py` originally found here: https://github.com/mafeiyang/ACTINN + +* ACTINN has been split into training and predicting. To do this, filtering of outlier genes based on expression across all query samples and reference had to be removed. 
The rest of the code has not been changed from the original ACTINN implementation, other than rearrangements and removal of some parts related to processing multiple samples at the same time. + +* ACTINN is run with the default parameters from the original implementation. Normalization is based on the original implementation and paper (each cell is scaled to its total expression value, multiplied by 10 000, then log2(x+1)-transformed). + +## Tangram + +Documentation written by: Tomas Vega Waichman +Date written: 2023-08-08 + +The Tangram workflow was generated following the tutorial provided below: +https://tangram-sc.readthedocs.io/en/latest/tutorial_sq_link.html + +Tangram maps cells of a single cell reference to a spatial dataset. It cannot be separated into training and test steps. +It is necessary to explore whether parallelization is possible. + +* The spatial dataset needs to be in a `.h5ad` format with the `.X` matrix normalized and log-transformed. +* The mode can be set to `cells` if you want to map cells to spots, and the output matrix will be cell x spot probabilities. Alternatively, set it to `clusters` if the goal is to map whole clusters to the spatial data. +* The output is the highest scoring cell type for each spot, determined by the cell type projection (using the `tg.project_cell_annotations` function from the Tangram package). +* Other outputs include: a score matrix for spot vs label, a cell x spot probability matrix, and the Tangram output map object in `.h5ad` format containing all the relevant information. +* It runs using the whole transcriptome; no gene markers are selected. +* All parameters are left at their defaults. + +## scAnnotate + +Documentation written by: Tomas Vega Waichman +Date written: 2023-08-11 + +The scAnnotate workflow was generated following the tutorial provided below: +https://cran.r-project.org/web/packages/scAnnotate/vignettes/Introduction.html + +* Training and test steps of scAnnotate cannot be separated. +* Genes in references and query should match. +* The tool allows normalization inside its function using the parameter `lognormalized = F`. I normalized in the same way as they do in their script, but using the NormalizeData function from the Seurat package, via the “LogNormalize” method and a scale factor of 10,000. This is to allow the script to be easier to modify in the future (e.g. in case we allow an option for pre-normalized data). Since the data is already normalized by Seurat, I set `lognormalized = T`. +* scAnnotate has two separate workflows with different batch effect removal steps based on the size of the training data. Setting `correction = "auto"` automatically detects which method the dataset needs. They suggest using Seurat for datasets with at most one rare cell population (at most one cell population with fewer than 100 cells) and Harmony for datasets with at least two rare cell populations (at least two cell populations with fewer than 100 cells). +* The `threshold` value ranges between 0 and 1; cells with a probability lower than the threshold are set to "unassigned" + +## scID + +Documentation written by: Tomas Vega Waichman +Date written: 2023-08-12 + +The scID workflow was generated following the tutorials provided below: +* https://github.com/BatadaLab/scID/blob/master/vignettes/Mapping_example.md +* https://github.com/BatadaLab/scID + +scID has some installation issues: + * Needs module `gdal/3.5.1` + * MAST is needed. 
If you are not able to install it, use this approach: +``` +wget https://bioconductor.org/packages/release/bioc/src/contrib/MAST_1.26.0.tar.gz +R CMD INSTALL MAST_1.26.0.tar.gz +``` +* Training and test steps of scID cannot be separated. +* I used the `scID:::counts_to_cpm(counts_gem = query)` function that they provide (hidden; the code is in their GitHub). It could be replaced with any normalization without log-transformation (from their tutorial: any library-depth normalization (e.g. TPM, CPM) is compatible with scID, but not log-transformed data). +* All parameters are left at their defaults except the normalization, which is set to FALSE since I normalize outside the function. There are some parameters that would be nice to explore, such as `estimate_weights_from_target`. +* It is very slow (~2 h for a 5k-cell query and a 5k-cell reference), but we have to test whether this is related to the number of labels (number of comparisons) or the size of the dataset. + +## scNym + +Documentation written by: Tomas Vega Waichman +Date written: 2023-08-14 + +The scNym workflow was generated following the tutorial provided below: +https://github.com/calico/scnym/tree/master + +scNym takes advantage of the query to train the model, so the training and test steps should not be separated. + +* Query and training are concatenated into the same object. Any cell with the annotation "Unlabeled" will be treated as part of the target dataset and used for semi-supervised and adversarial training. It uses part of the query dataset to train the model. +* Data inputs for scNym should be log(CPM + 1) normalized counts, where CPM is Counts Per Million and log is the natural logarithm. +* They added the step of filtering genes that are not expressed, so I added it, but I ignored the step of filtering cells. +* This tool uses a threshold to assign labels to cells, and cells not passing this threshold have the value “Unknown”. +* Multi-domain settings need more research. +* Additional output: `whole_df_output.csv` has the entire dataframe output with the score for the query test (marked as label == “Unlabeled”). +* I used the `new_identity_discovery` configuration since: "This configuration is useful for experiments where new cell type discoveries may occur. It uses pseudolabel thresholding to avoid the assumption above. If new cell types are present in the target data, they correctly receive low +confidence scores." + +## CellTypist + +Documentation written by: Tomas Vega Waichman +Date written: 2023-08-16 + +The CellTypist workflow was generated following the tutorials provided below: +Training: +* https://celltypist.readthedocs.io/en/latest/celltypist.train.html +* https://github.com/Teichlab/celltypist#supplemental-guidance-generate-a-custom-model +Predicting: +* https://celltypist.readthedocs.io/en/latest/notebook/celltypist_tutorial_ml.html + +CellTypist allows separation between training and prediction, and allows parallelization. +They provide their own pre-trained models. +CellTypist requires a logarithmised and normalised expression matrix stored in the `AnnData` (log1p normalised expression to 10,000 counts per cell) [link](https://github.com/Teichlab/celltypist#supplemental-guidance-generate-a-custom-model) + +Training: +* I use `check_expression = True` to check that the expression is okay. +* `celltypist.train` has the option `feature_selection = True` in order to do feature selection, but it is not implemented. 
+* The output is the model, and from the model we get the top markers for each cell type using the function `model.extract_top_markers()`. A table with the top 10 genes per cell type is also returned (top10_model_markers_per_celltype.csv). + +Predicting: +* From tutorial: "By default, CellTypist will only do the prediction jobs to infer the identities of input cells, which renders the prediction of each cell independent. To combine the cell type predictions with the cell-cell transcriptomic relationships, CellTypist offers a majority voting approach based on the idea that similar cell subtypes are more likely to form a (sub)cluster regardless of their individual prediction outcomes. To turn on the majority voting classifier in addition to the CellTypist predictions, pass in `majority_voting = True`. If `majority_voting = True`, the prediction column will contain the majority_voting results; otherwise it uses the predicted_labels, where each query cell gets its inferred label by choosing the most probable cell type among all possible cell types in the given model." [link](https://celltypist.readthedocs.io/en/latest/notebook/celltypist_tutorial_ml.html) +* The `majority_voting` parameter should be specified in the config file. +* I use the multilabel prediction, since we want to know if a cell cannot be classified very clearly… Description: "For the built-in models, we have collected a large number of cell types; yet, the presence of unexpected (e.g., low-quality or novel cell types) and ambiguous cell states (e.g., doublets) in the query data is beyond the prediction that CellTypist can achieve with a 'find-a-best-match' mode. To overcome this, CellTypist provides the option of multi-label cell type classification, which assigns 0 (i.e., unassigned), 1, or >=2 cell type labels to each query cell. It allows the use of a `threshold` to label cells that are below that probability as "Unassigned". It also allows intermediate labels as combinations in the format `celltype1|celltype2`." + +* Output: 4 `.csv` files with the prediction for each cell (the majority_voting result if majority voting was selected, otherwise the predicted labels): + * `decision_matrix.csv`: Decision matrix with the decision score of each cell belonging to a given cell type. + * `probability_matrix.csv`: Probability matrix representing the probability each cell belongs to a given cell type (transformed from the decision matrix by the sigmoid function). + * `predicted_labels.csv`: The prediction for each cell; if majority_voting was true, it contains both the majority_voting labels AND the predicted_labels. + * Generates some embedding plots. + * An `.h5ad` object that contains all the previous information (including the embeddings). 
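+ +To make the multi-label thresholding concrete, here is a small illustrative sketch (not part of the pipeline; the file name matches the output listed above, and the threshold value is just an example) of how the probabilities in `probability_matrix.csv` relate to the final labels. CellTypist performs this internally in 'prob match' mode: + +```python +import pandas as pd + +# cell x cell-type probability matrix written by the prediction script +prob = pd.read_csv("probability_matrix.csv", index_col=0) +threshold = 0.5 # example value; in the pipeline this comes from the config + +def multilabel(row): + # no cell type above the threshold -> Unassigned; two or more -> joined with '|' + hits = row.index[row >= threshold] + return "Unassigned" if len(hits) == 0 else "|".join(hits) + +labels = prob.apply(multilabel, axis=1) +``` 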
diff --git a/Report/workflow.rst b/Report/workflow.rst new file mode 100644 index 0000000..b28b04f --- /dev/null +++ b/Report/workflow.rst @@ -0,0 +1,3 @@ + + + diff --git a/Scripts/ACTINN/predict_ACTINN.py b/Scripts/ACTINN/predict_ACTINN.py new file mode 100644 index 0000000..864fb3d --- /dev/null +++ b/Scripts/ACTINN/predict_ACTINN.py @@ -0,0 +1,303 @@ +import numpy as np +import pandas as pd +import sys +import math +import collections +import tensorflow as tf +tf.compat.v1.disable_eager_execution() +import argparse +import timeit +run_time = timeit.default_timer() +from tensorflow.python.framework import ops +import pickle +import os + + +def get_parser(parser=None): + if parser == None: + parser = argparse.ArgumentParser() + parser.add_argument("-ts", "--test_set", type=str, help="Test set file path.") + parser.add_argument("-mp", "--model_path", type=str, help="Model file path.") + parser.add_argument("-pp", "--pred_path", type=str, help="Output prediction file path.") + return parser + + +# Normalize and scale the set +def scale_sets(sets): + # input -- set to be scaled + # output -- scaled set + sets = sets.to_numpy() + sets = np.divide(sets, np.sum(sets, axis=0, keepdims=True)) * 10000 + sets = np.log2(sets+1) + return sets + + + +# Turn labels into matrix +def one_hot_matrix(labels, C): + # input -- labels (true labels of the sets), C (# types) + # output -- one hot matrix with shape (# types, # samples) + C = tf.constant(C, name = "C") + one_hot_matrix = tf.one_hot(labels, C, axis = 0) + sess = tf.compat.v1.Session() + one_hot = sess.run(one_hot_matrix) + sess.close() + return one_hot + + + +# Make types to labels dictionary +def type_to_label_dict(types): + # input -- types + # output -- type_to_label dictionary + type_to_label_dict = {} + all_type = list(set(types)) + for i in range(len(all_type)): + type_to_label_dict[all_type[i]] = i + return type_to_label_dict + + + +# Convert types to labels +def convert_type_to_label(types, type_to_label_dict): + # input -- list of types, and type_to_label dictionary + # output -- list of labels + types = list(types) + labels = list() + for type in types: + labels.append(type_to_label_dict[type]) + return labels + + + +# Function to create placeholders +def create_placeholders(n_x, n_y): + X = tf.compat.v1.placeholder(tf.float32, shape = (n_x, None)) + Y = tf.compat.v1.placeholder(tf.float32, shape = (n_y, None)) + return X, Y + + + +# Initialize parameters +def initialize_parameters(nf, ln1, ln2, ln3, nt): + # input -- nf (# of features), ln1 (# nodes in layer1), ln2 (# nodes in layer2), ln3 (# nodes in layer3), nt (# types) + # output -- a dictionary of tensors containing W1, b1, W2, b2, W3, b3, W4, b4 + tf.compat.v1.set_random_seed(3) # set seed to make the results consistent + W1 = tf.compat.v1.get_variable("W1", [ln1, nf], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform", seed = 3)) + b1 = tf.compat.v1.get_variable("b1", [ln1, 1], initializer = tf.compat.v1.zeros_initializer()) + W2 = tf.compat.v1.get_variable("W2", [ln2, ln1], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform", seed = 3)) + b2 = tf.compat.v1.get_variable("b2", [ln2, 1], initializer = tf.compat.v1.zeros_initializer()) + W3 = tf.compat.v1.get_variable("W3", [ln3, ln2], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform", seed = 3)) + b3 = tf.compat.v1.get_variable("b3", [ln3, 1], initializer = 
tf.compat.v1.zeros_initializer()) + W4 = tf.compat.v1.get_variable("W4", [nt, ln3], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform", seed = 3)) + b4 = tf.compat.v1.get_variable("b4", [nt, 1], initializer = tf.compat.v1.zeros_initializer()) + parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3, "W4": W4, "b4": b4} + return parameters + + + +# Forward propagation function +def forward_propagation(X, parameters): + # function model: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX + # input -- dataset with shape (# features, # samples), parameters "W1", "b1", "W2", "b2", "W3", "b3", "W4", "b4" + # output -- the output of the last linear unit + W1 = parameters['W1'] + b1 = parameters['b1'] + W2 = parameters['W2'] + b2 = parameters['b2'] + W3 = parameters['W3'] + b3 = parameters['b3'] + W4 = parameters['W4'] + b4 = parameters['b4'] + # forward calculations + Z1 = tf.add(tf.matmul(W1, X), b1) + A1 = tf.nn.relu(Z1) + Z2 = tf.add(tf.matmul(W2, A1), b2) + A2 = tf.nn.relu(Z2) + Z3 = tf.add(tf.matmul(W3, A2), b3) + A3 = tf.nn.relu(Z3) + Z4 = tf.add(tf.matmul(W4, A3), b4) + return Z4 + + + +# Compute cost +def compute_cost(Z4, Y, parameters, lambd=0.01): + # input -- Z4 (output of forward propagation with shape (# types, # samples)), Y (true labels, same shape as Z4) + # output -- tensor of the cost function + logits = tf.transpose(a=Z4) + labels = tf.transpose(a=Y) + cost = tf.reduce_mean(input_tensor=tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)) + \ + (tf.nn.l2_loss(parameters["W1"]) + tf.nn.l2_loss(parameters["W2"]) + tf.nn.l2_loss(parameters["W3"]) + tf.nn.l2_loss(parameters["W4"])) * lambd + return cost + + + +# Get the mini batches +def random_mini_batches(X, Y, mini_batch_size=32, seed=1): + # input -- X (training set), Y (true labels) + # output -- mini batches + ns = X.shape[1] + mini_batches = [] + np.random.seed(seed) + # shuffle (X, Y) + permutation = list(np.random.permutation(ns)) + shuffled_X = X[:, permutation] + shuffled_Y = Y[:, permutation] + # partition (shuffled_X, shuffled_Y), minus the end case. 
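+ # each complete minibatch holds exactly mini_batch_size cells (columns); any + # remainder shorter than mini_batch_size is emitted below as one final, smaller batch. 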
+ num_complete_minibatches = int(math.floor(ns/mini_batch_size)) # number of mini batches of size mini_batch_size in your partitioning + for k in range(0, num_complete_minibatches): + mini_batch_X = shuffled_X[:, k * mini_batch_size : k * mini_batch_size + mini_batch_size] + mini_batch_Y = shuffled_Y[:, k * mini_batch_size : k * mini_batch_size + mini_batch_size] + mini_batch = (mini_batch_X, mini_batch_Y) + mini_batches.append(mini_batch) + # handling the end case (last mini-batch < mini_batch_size) + if ns % mini_batch_size != 0: + mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size : ns] + mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size : ns] + mini_batch = (mini_batch_X, mini_batch_Y) + mini_batches.append(mini_batch) + return mini_batches + + + +# Forward propagation for prediction +def forward_propagation_for_predict(X, parameters): + # input -- X (dataset used to make prediction), parameters after training + # output -- the output of the last linear unit + W1 = parameters['W1'] + b1 = parameters['b1'] + W2 = parameters['W2'] + b2 = parameters['b2'] + W3 = parameters['W3'] + b3 = parameters['b3'] + W4 = parameters['W4'] + b4 = parameters['b4'] + Z1 = tf.add(tf.matmul(W1, X), b1) + A1 = tf.nn.relu(Z1) + Z2 = tf.add(tf.matmul(W2, A1), b2) + A2 = tf.nn.relu(Z2) + Z3 = tf.add(tf.matmul(W3, A2), b3) + A3 = tf.nn.relu(Z3) + Z4 = tf.add(tf.matmul(W4, A3), b4) + return Z4 + + + +# Predict function +def predict(X, parameters): + # input -- X (dataset used to make prediction), parameters after training + # output -- prediction + W1 = tf.convert_to_tensor(value=parameters["W1"]) + b1 = tf.convert_to_tensor(value=parameters["b1"]) + W2 = tf.convert_to_tensor(value=parameters["W2"]) + b2 = tf.convert_to_tensor(value=parameters["b2"]) + W3 = tf.convert_to_tensor(value=parameters["W3"]) + b3 = tf.convert_to_tensor(value=parameters["b3"]) + W4 = tf.convert_to_tensor(value=parameters["W4"]) + b4 = tf.convert_to_tensor(value=parameters["b4"]) + params = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3, "W4": W4, "b4": b4} + x = tf.compat.v1.placeholder("float") + z4 = forward_propagation_for_predict(x, params) + p = tf.argmax(input=z4) + sess = tf.compat.v1.Session() + prediction = sess.run(p, feed_dict = {x: X}) + return prediction + + + +# Build the model +def model(X_train, Y_train, X_test, starting_learning_rate = 0.0001, num_epochs = 1500, minibatch_size = 128, print_cost = True): + # input -- X_train (training set), Y_train (training labels), X_test (test set) + # output -- trained parameters + ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables + tf.compat.v1.set_random_seed(3) + seed = 3 + (nf, ns) = X_train.shape + nt = Y_train.shape[0] + costs = [] + # create placeholders of shape (nf, nt) + X, Y = create_placeholders(nf, nt) + # initialize parameters + parameters = initialize_parameters(nf=nf, ln1=100, ln2=50, ln3=25, nt=nt) + # forward propagation: build the forward propagation in the tensorflow graph + Z4 = forward_propagation(X, parameters) + # cost function: add cost function to tensorflow graph + cost = compute_cost(Z4, Y, parameters, 0.005) + # Use learning rate decay + global_step = tf.Variable(0, trainable=False) + learning_rate = tf.compat.v1.train.exponential_decay(starting_learning_rate, global_step, 1000, 0.95, staircase=True) + # backpropagation: define the tensorflow optimizer, AdamOptimizer is used. 
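+ # Adam minimizes the L2-regularized cross-entropy from compute_cost; passing + # global_step to minimize() advances it each update, so the exponential decay schedule above takes effect. 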
+ optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) + trainer = optimizer.minimize(cost, global_step=global_step) + # initialize all the variables + init = tf.compat.v1.global_variables_initializer() + # start the session to compute the tensorflow graph + with tf.compat.v1.Session() as sess: + # run the initialization + sess.run(init) + # do the training loop + for epoch in range(num_epochs): + epoch_cost = 0. + num_minibatches = int(ns / minibatch_size) + seed = seed + 1 + minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed) + for minibatch in minibatches: + # select a minibatch + (minibatch_X, minibatch_Y) = minibatch + # run the session to execute the "optimizer" and the "cost", the feed dict contains a minibatch for (X,Y). + _ , minibatch_cost = sess.run([trainer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y}) + epoch_cost += minibatch_cost / num_minibatches + # print the cost every 5 epochs + if print_cost == True and (epoch+1) % 5 == 0: + print ("Cost after epoch %i: %f" % (epoch+1, epoch_cost)) + costs.append(epoch_cost) + parameters = sess.run(parameters) + print ("Parameters have been trained!") + # calculate the correct predictions + correct_prediction = tf.equal(tf.argmax(input=Z4), tf.argmax(input=Y)) + # calculate accuracy on the training set + accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_prediction, "float")) + print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train})) + return parameters + + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + + # load model + print('@ LOAD MODEL') + model = pickle.load(open(args.model_path, 'rb')) + + dict_path = os.path.dirname(os.path.abspath(args.model_path)) + label_to_type_dict = pickle.load(open(dict_path + '/label_to_type_dict.pkl', 'rb')) + + print('@ DONE') + + # Read query + query = pd.read_csv(args.test_set, index_col=0) + + # transpose query + query = query.transpose() + + # save barcodes + barcode = list(query.columns) + + print("Dimension of query:", query.shape) + + # normalize query + query = scale_sets(query) + + # predict + test_predict = predict(query, model) + predicted_label = [] + + for i in range(len(test_predict)): + predicted_label.append(label_to_type_dict[test_predict[i]]) + + predicted_label = pd.DataFrame({"cell":barcode, "ACTINN":predicted_label}) + predicted_label.to_csv(args.pred_path, sep=",", index=False) + +print("Run time:", timeit.default_timer() - run_time) diff --git a/Scripts/ACTINN_scripts/actinn_predict.py b/Scripts/ACTINN/train_ACTINN.py similarity index 81% rename from Scripts/ACTINN_scripts/actinn_predict.py rename to Scripts/ACTINN/train_ACTINN.py index f838792..9270d89 100644 --- a/Scripts/ACTINN_scripts/actinn_predict.py +++ b/Scripts/ACTINN/train_ACTINN.py @@ -9,14 +9,15 @@ import timeit run_time = timeit.default_timer() from tensorflow.python.framework import ops - +import pickle +import os def get_parser(parser=None): if parser == None: - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser() parser.add_argument("-trs", "--train_set", type=str, help="Training set file path.") parser.add_argument("-trl", "--train_label", type=str, help="Training label file path.") - parser.add_argument("-ts", "--test_set", type=str, help="Training set file path.") + parser.add_argument("-mp", "--model_path", type=str, help="Output model path.") parser.add_argument("-lr", "--learning_rate", type=float, help="Learning rate (default: 0.0001)", default=0.0001) 
parser.add_argument("-ne", "--num_epochs", type=int, help="Number of epochs (default: 50)", default=50) parser.add_argument("-ms", "--minibatch_size", type=int, help="Minibatch size (default: 128)", default=128) @@ -28,29 +29,14 @@ def get_parser(parser=None): # Get common genes, normalize and scale the sets def scale_sets(sets): - # input -- a list of all the sets to be scaled - # output -- scaled sets - common_genes = set(sets[0].index) - for i in range(1, len(sets)): - common_genes = set.intersection(set(sets[i].index),common_genes) - common_genes = sorted(list(common_genes)) - sep_point = [0] - for i in range(len(sets)): - sets[i] = sets[i].loc[common_genes,] - sep_point.append(sets[i].shape[1]) - total_set = np.array(pd.concat(sets, axis=1, sort=False), dtype=np.float32) - total_set = np.divide(total_set, np.sum(total_set, axis=0, keepdims=True)) * 10000 - total_set = np.log2(total_set+1) - expr = np.sum(total_set, axis=1) - total_set = total_set[np.logical_and(expr >= np.percentile(expr, 1), expr <= np.percentile(expr, 99)),] - cv = np.std(total_set, axis=1) / np.mean(total_set, axis=1) - total_set = total_set[np.logical_and(cv >= np.percentile(cv, 1), cv <= np.percentile(cv, 99)),] - for i in range(len(sets)): - sets[i] = total_set[:, sum(sep_point[:(i+1)]):sum(sep_point[:(i+2)])] + # input -- set to be scaled + # output -- scaled set + sets = sets.to_numpy() + sets = np.divide(sets, np.sum(sets, axis=0, keepdims=True)) * 10000 + sets = np.log2(sets+1) return sets - # Turn labels into matrix def one_hot_matrix(labels, C): # input -- labels (true labels of the sets), C (# types) @@ -225,7 +211,7 @@ def predict(X, parameters): # Build the model -def model(X_train, Y_train, X_test, starting_learning_rate = 0.0001, num_epochs = 1500, minibatch_size = 128, print_cost = True): +def model(X_train, Y_train, starting_learning_rate = 0.0001, num_epochs = 1500, minibatch_size = 128, print_cost = True): # input -- X_train (training set), Y_train(training labels), X_test (test set), Y_test (test labels), # output -- trained parameters ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables @@ -280,34 +266,54 @@ def model(X_train, Y_train, X_test, starting_learning_rate = 0.0001, num_epochs return parameters - if __name__ == '__main__': parser = get_parser() args = parser.parse_args() - # Read in the files - train_set = pd.read_hdf(args.train_set, key="dge") - train_set.index = [s.upper() for s in train_set.index] - train_set = train_set.loc[~train_set.index.duplicated(keep='first')] - train_label = pd.read_csv(args.train_label, header=None, sep="\t") - test_set = pd.read_hdf(args.test_set, key="dge") - test_set.index = [s.upper() for s in test_set.index] - test_set = test_set.loc[~test_set.index.duplicated(keep='first')] - barcode = list(test_set.columns) - nt = len(set(train_label.iloc[:,1])) - train_set, test_set = scale_sets([train_set, test_set]) - type_to_label_dict = type_to_label_dict(train_label.iloc[:,1]) + + # read reference + ref = pd.read_csv(args.train_set, index_col=0, header=0) + print("Dimension of reference:", ref.shape) + + # read reference labels + labels = pd.read_csv(args.train_label, index_col=0, header=0, sep=',') + print("Dimension of labels:", labels.shape) + nt = len(set(labels.iloc[:,0])) + + print(ref.head()) + print(labels.head()) + + # check if cell names are in the same order in labels and ref + order = all(labels.index == ref.index) + + # throw error if order is not the same + if not order: + sys.exit("@ Order of cells in 
reference and labels do not match") + + # transpose reference + ref = ref.transpose() + + # normalize reference + ref = scale_sets(ref) + + # prepare dictionary from reference labels + type_to_label_dict = type_to_label_dict(labels.iloc[:,0]) label_to_type_dict = {v: k for k, v in type_to_label_dict.items()} print("Cell Types in training set:", type_to_label_dict) - print("# Trainng cells:", train_label.shape[0]) - train_label = convert_type_to_label(train_label.iloc[:,1], type_to_label_dict) - train_label = one_hot_matrix(train_label, nt) - parameters = model(train_set, train_label, test_set, \ - args.learning_rate, args.num_epochs, args.minibatch_size, args.print_cost) - test_predict = predict(test_set, parameters) - predicted_label = [] + print("# Training cells:", labels.shape[0]) + + labels = convert_type_to_label(labels.iloc[:,0], type_to_label_dict) + labels = one_hot_matrix(labels, nt) + + # train model + model = model(ref, labels, args.learning_rate, args.num_epochs, args.minibatch_size, args.print_cost) + + print('@ SAVE MODEL') + pickle.dump(model, open(args.model_path, 'wb')) + print('@ DONE') + + # save label_to_type_dict + out_path = os.path.dirname(os.path.abspath(args.model_path)) + pickle.dump(label_to_type_dict, open(out_path + '/label_to_type_dict.pkl', 'wb')) + - for i in range(len(test_predict)): - predicted_label.append(label_to_type_dict[test_predict[i]]) - predicted_label = pd.DataFrame({"cellname":barcode, "celltype":predicted_label}) - predicted_label.to_csv("predicted_label.txt", sep="\t", index=False) print("Run time:", timeit.default_timer() - run_time) diff --git a/Scripts/ACTINN_scripts/actinn_format.py b/Scripts/ACTINN_scripts/actinn_format.py deleted file mode 100644 index f0dac91..0000000 --- a/Scripts/ACTINN_scripts/actinn_format.py +++ /dev/null @@ -1,62 +0,0 @@ -import numpy as np -import pandas as pd -import scipy.io -import os -import argparse - -def get_parser(parser=None): - if parser == None: - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", type=str, help="Path to the input file or the 10X directory.") - parser.add_argument("-o", "--output", type=str, help="Prefix of the output file.") - parser.add_argument("-f", "--format", type=str, help="Format of the input file (10X_V2, 10X_V3, txt, csv).") - return parser - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - if args.format == "10X_V2": - path = args.input - if path[-1] != "/": - path += "/" - new = scipy.io.mmread(os.path.join(path, "matrix.mtx")) - genes = list(pd.read_csv(path+"genes.tsv", header=None, sep='\t')[1]) - barcodes = list(pd.read_csv(path+"barcodes.tsv", header=None)[0]) - new = pd.DataFrame(np.array(new.todense()), index=genes, columns=barcodes) - new.fillna(0, inplace=True) - uniq_index = np.unique(new.index, return_index=True)[1] - new = new.iloc[uniq_index,] - new = new.loc[new.sum(axis=1)>0, :] - print("Dimension of the matrix after removing all-zero rows:", new.shape) - new.to_hdf(args.output+".h5", key="dge", mode="w", complevel=3) - - if args.format == "10X_V3": - path = args.input - if path[-1] != "/": - path += "/" - new = scipy.io.mmread(os.path.join(path, "matrix.mtx.gz")) - genes = list(pd.read_csv(path+"features.tsv.gz", header=None, sep='\t')[1]) - barcodes = list(pd.read_csv(path+"barcodes.tsv.gz", header=None)[0]) - new = pd.DataFrame(np.array(new.todense()), index=genes, columns=barcodes) - new.fillna(0, inplace=True) - uniq_index = np.unique(new.index, return_index=True)[1] - new = new.iloc[uniq_index,] - 
new = new.loc[new.sum(axis=1)>0, :] - print("Dimension of the matrix after removing all-zero rows:", new.shape) - new.to_hdf(args.output+".h5", key="dge", mode="w", complevel=3) - - if args.format == "csv": - new = pd.read_csv(args.input, index_col=0) - uniq_index = np.unique(new.index, return_index=True)[1] - new = new.iloc[uniq_index,] - new = new.loc[new.sum(axis=1)>0, :] - print("Dimension of the matrix after removing all-zero rows:", new.shape) - new.to_hdf(args.output+".h5", key="dge", mode="w", complevel=3) - - if args.format == "txt": - new = pd.read_csv(args.input, index_col=0, sep="\t") - uniq_index = np.unique(new.index, return_index=True)[1] - new = new.iloc[uniq_index,] - new = new.loc[new.sum(axis=1)>0, :] - print("Dimension of the matrix after removing all-zero rows:", new.shape) - new.to_hdf(args.output+".h5", key="dge", mode="w", complevel=3) diff --git a/Scripts/CellTypist/predict_CellTypist.py b/Scripts/CellTypist/predict_CellTypist.py new file mode 100644 index 0000000..07c3b78 --- /dev/null +++ b/Scripts/CellTypist/predict_CellTypist.py @@ -0,0 +1,100 @@ +#--------------- Libraries ------------------- +import numpy as np +import pandas as pd +import celltypist +import anndata as ad +import sys +import scanpy as sc +import pickle +import os +import random + +### Set seed +random.seed(123456) + +#--------------- Parameters ------------------- + +sample_path = str(sys.argv[1]) +model_path = str(sys.argv[2]) +out_path = str(sys.argv[3]) +out_other_path = os.path.dirname(str(sys.argv[3])) +threshold = float(sys.argv[4]) +threads = int(sys.argv[5]) +majority_voting = sys.argv[6].lower() == 'true' ## bool() of a non-empty string is always True, so parse the flag explicitly + +#--------------- Data ------------------------- + +print('@ READ QUERY') +query = pd.read_csv(sample_path, + index_col=0, + sep=',', + engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many dependency problems + +print('@ DONE') + +query = ad.AnnData(X = query, + obs = dict(obs_names=query.index.astype(str)), + var = dict(var_names=query.columns.astype(str)) +) + +# Now I normalize the matrix with scanpy: +# Normalize each cell by total counts over all genes, +# so that every cell has the same total count after normalization. 
+# If choosing `target_sum=1e6`, this is CPM normalization +# 1e4 is similar to Seurat +sc.pp.normalize_total(query, target_sum=1e4) + +# Logarithmize the data: +sc.pp.log1p(query) + + +# load model +print('@ LOAD MODEL') +model = pickle.load(open(model_path, 'rb')) +print('@ DONE') + +### If the trained model was generated with a different number of threads +### than requested for prediction, update n_jobs accordingly +model["Model"].n_jobs = threads + +#------------- Predict CellTypist ------------- + +predictions = celltypist.annotate(query, + model = model_path, + majority_voting = majority_voting, + mode = 'prob match', + p_thres = threshold) + +## If the majority voting is true, return that result +if majority_voting: + pred_df = pd.DataFrame({'cell': predictions.predicted_labels.index, + 'CellTypist': predictions.predicted_labels.majority_voting}) +else: + pred_df = pd.DataFrame({'cell': predictions.predicted_labels.index, + "CellTypist": predictions.predicted_labels.predicted_labels}) + +print('@ WRITING PREDICTIONS') +pred_df.to_csv(out_path, index = False) +print('@ DONE') + +#------------- Other outputs -------------- + +## Save the outputs as .csv +print('@ WRITING CSV OUTPUTS') +predictions.to_table(out_other_path) +print('@ DONE') + +## Save the output plots +print('@ GENERATING OUTPUT PLOTS') +predictions.to_plots(out_other_path) +print('@ DONE') + +### Save the prob matrix +print('@ WRITING OUTPUT OBJECT') +filename = out_other_path + '/CellTypist_output_object.h5ad' +adata = predictions.to_adata(insert_prob = True) +adata.write_h5ad(filename= filename, + compression='gzip') +print('@ DONE ') + + diff --git a/Scripts/CellTypist/train_CellTypist.py b/Scripts/CellTypist/train_CellTypist.py new file mode 100644 index 0000000..2fa355c --- /dev/null +++ b/Scripts/CellTypist/train_CellTypist.py @@ -0,0 +1,96 @@ +## Python 3.11.2 +#--------------- Libraries ------------------- +import numpy as np +import pandas as pd +import celltypist +import anndata as ad +import sys +import scanpy as sc +import pickle +import os +import random + +### Set seed +random.seed(123456) + +#--------------- Parameters ------------------- + +ref_path = str(sys.argv[1]) +lab_path = str(sys.argv[2]) +out_path = str(sys.argv[3]) +out_other_path = os.path.dirname(str(sys.argv[3])) +threads = int(sys.argv[4]) +feature_selection = sys.argv[5].lower() == 'true' ## bool() of a non-empty string is always True, so parse the flag explicitly + +#--------------- Data ------------------------- + +# read the data +ref = pd.read_csv(ref_path, + index_col=0, + sep=',', + engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many dependency problems + +labels = pd.read_csv(lab_path, + index_col = 0, + sep=',') + +# check if cell names are in the same order in labels and ref +order = all(labels.index == ref.index) + +# throw error if order is not the same +if not order: + sys.exit("@ Order of cells in reference and labels do not match") + +# create AnnData object +adata = ad.AnnData(X = ref, + obs = dict(obs_names=ref.index.astype(str), + label = labels['label']), + var = dict(var_names=ref.columns.astype(str)) + ) + +# normalize the matrix with scanpy: +# normalize each cell by total counts over all genes, +# so that every cell has the same total count after normalization. 
+# If choosing `target_sum=1e6`, this is CPM normalization +# 1e4 similar as Seurat +sc.pp.normalize_total(adata, target_sum=1e4) + +#Logarithmize the data: +sc.pp.log1p(adata) + +#------------- Train CellTypist ------------- + +model = celltypist.train(adata, + labels = "label", + transpose_input = False, + check_expression = True, + feature_selection = feature_selection, + n_jobs = threads) + + +#Save the model to disk +print('@ SAVE MODEL') +model.write(out_path) +print('@ DONE') + +#------------- Other outputs -------------- + +# We can extract the top markers, I get the top 10 for each cell-type applying the +# function extract_top_markers +dataframes = [] +for cell_type in model.cell_types: + top_markers = model.extract_top_markers(cell_type, 10) + + # Create a DataFrame for the current cell type's top markers + df = pd.DataFrame(top_markers, columns=['Marker']) + df['CellType'] = cell_type # Add a column to store the cell type + + # Append the DataFrame to the list + dataframes.append(df) + +# Concatenate all DataFrames into a single DataFrame +markers_df = pd.concat(dataframes, ignore_index=True) + +filename = out_other_path + "/top10_model_markers_per_celltype.csv" +markers_df.to_csv(filename, + index=False) diff --git a/Scripts/Correlation/predict_Correlation.R b/Scripts/Correlation/predict_Correlation.R new file mode 100644 index 0000000..a046126 --- /dev/null +++ b/Scripts/Correlation/predict_Correlation.R @@ -0,0 +1,98 @@ +# loads needed libraries +library(tidyverse) +library(data.table) +library(Seurat) +library(WGCNA) + +set.seed(1234) + +args = commandArgs(trailingOnly = TRUE) +sample_path = args[1] +model_path = args[2] +out_path = args[3] +threads = as.numeric(args[4]) + +#--------------- Data ------------------- + +# read query matrix and transpose +message('@ READ QUERY') +query = data.table::fread(sample_path, nThread=threads, header=T, data.table=F) %>% + column_to_rownames('V1') +message('@ DONE') + +# load model +message('@ LOAD MODEL') +load(model_path) +message('@ DONE') + +# Transpose query +query = transposeBigData(query, blocksize = 10000) +seurat_query = CreateSeuratObject(counts = query) + +# Normalize seurat using default "LogNormalize" method +seurat_query = NormalizeData(seurat_query) + +# Get the normalized expression matrix from the query +query_mat = GetAssayData(seurat_query) + +# ------------ Load Correlation Function -------------- +# Function to label by Spearman correlation generated by Selin Jessa and Marie Coutlier +# Path to original file: /lustre06/project/6004736/sjessa/from_narval/HGG-oncohistones/stable/code/scripts/predict_celltype_cor.R + + +label_correlation <- function(test_expr_mat, + ref_expr_mat, + threshold_common_genes = 0.5) { + + rownames(test_expr_mat) <- toupper(rownames(test_expr_mat)) + rownames(ref_expr_mat) <- toupper(rownames(ref_expr_mat)) + + # Testing how many genes are in common and stopping if not enough + common_genes <- intersect(rownames(test_expr_mat), rownames(ref_expr_mat)) + prop_common <- length(common_genes) / length(rownames(test_expr_mat)) + message("@@ ", round(prop_common*100, digits = 2), "% of test dataset genes are in the reference dataset") + + if (prop_common < threshold_common_genes) stop("Proportion of common genes below threshold.") + + # Reducing matrices to common subset + mat1 <- as.matrix(test_expr_mat[common_genes, ]) + mat2 <- as.matrix(ref_expr_mat[common_genes, ]) + + # sanity check + nrow(mat1) == nrow(mat2) + + # Computing correlations + cor_matrix <- cor(mat1, mat2, method = "spearman", 
use = "complete.obs") + + # Getting the best one + cor_label <- as.data.frame(cor_matrix) %>% + mutate("cell" = rownames(cor_matrix)) %>% + gather("celltype", "correlation", -cell) %>% + group_by(cell) %>% + top_n(1, correlation) %>% + dplyr::select(cell, celltype, correlation) %>% + arrange(cell) + + # Returning the results + return(cor_label) + +} + +#----------- Predict Correlation ------------ + +# predict labels +message('@ PREDICT LABELS') +# Hussein set the threshold_common_genes = 0.3 +# This does not affect current prediction as preprocessing is already subsetting common genes between reference and query +# Therefore, 100% of query dataset genes should be in the reference dataset +# However, this is something that could be explored in the future +predicted = as.data.frame(label_correlation(query_mat,ref_mean_mat,0.3)) +message('@ DONE') + +# Rename columns +colnames(predicted) <- c("cell", "Correlation","Correlation_score") + +# write prediction +data.table::fwrite(predicted, file = out_path) + +#---------------------------------------- diff --git a/Scripts/Correlation/train_Correlation.R b/Scripts/Correlation/train_Correlation.R new file mode 100644 index 0000000..8cc0b17 --- /dev/null +++ b/Scripts/Correlation/train_Correlation.R @@ -0,0 +1,61 @@ +# loads needed libraries +library(tidyverse) +library(data.table) +library(Seurat) +library(WGCNA) + +set.seed(1234) + +args = commandArgs(trailingOnly = TRUE) +ref_path = args[1] +lab_path = args[2] +out_path = args[3] +threads = as.numeric(args[4]) + +# Script modified from Hussein Lakkis's run_correlation.R script + +#--------------- Data ------------------- +# read reference matrix and transpose +message('@ READ REF') +ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>% + column_to_rownames('V1') +message('@ DONE') + +# read reference labels +labels = data.table::fread(lab_path, header=T, data.table=F) %>% + column_to_rownames('V1') + + +# check if cell names are in the same order in labels and ref +order = all(as.character(rownames(labels)) == as.character(rownames(ref))) + +# throw error if order is not the same +if(!order){ + stop("@ Order of cells in reference and labels do not match") +} + +# make Seurat object (transpose ref first) +ref = transposeBigData(ref, blocksize = 10000) +seurat = CreateSeuratObject(counts = ref, meta.data = labels) + +# Normalize seurat using default "LogNormalize" method +seurat = NormalizeData(seurat) + +#------------- Generate Mean Expression Matrix ------------- + +# Set labels as identity classes +Idents(seurat) = "label" +# Returns averaged expression values for each identity class +# Function uses slot = "data" (normalized data) as default +seurat = AverageExpression(seurat, return.seurat = TRUE) + +# get mean expression matrix +ref_mean_mat = GetAssayData(seurat) + +# save mean expression matrix +message('@ SAVE MODEL') +save(ref_mean_mat, file = out_path) +message('@ DONE') + +#---------------------------------------- + diff --git a/Scripts/SVC/predict_SVM.py b/Scripts/SVC/predict_SVM.py new file mode 100644 index 0000000..5996bc1 --- /dev/null +++ b/Scripts/SVC/predict_SVM.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 24 18:37:59 2023 + +@author: tomas.vega +""" +## Python 3.11.2 +#--------------- Libraries ------------------- +import numpy as np +import pandas as pd +from sklearn.svm import SVC +from sklearn.calibration import CalibratedClassifierCV +import anndata as ad +import sys +import scanpy as sc +import 
pickle
+import os
+import random
+
+# Set seed
+random.seed(123456)
+
+# Function to get the column name and maximum value for each row
+def get_max_column_and_value(row):
+    pred_label = row.idxmax()
+    proba_label = row.max()
+    return pred_label, proba_label
+
+#--------------- Parameters -------------------
+
+sample_path = str(sys.argv[1])
+model_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+threshold = float(sys.argv[4])
+threads = int(sys.argv[5])
+tool_name = str(sys.argv[6])
+
+#--------------- Data -------------------------
+
+print('@ READ QUERY')
+query = pd.read_csv(sample_path,
+                    index_col=0,
+                    sep=',',
+                    engine='c') ## pyarrow could be faster, but it would need to be installed as a module and has many dependency problems
+
+print('@ DONE')
+
+query = ad.AnnData(X = query,
+                   obs = dict(obs_names=query.index.astype(str)),
+                   var = dict(var_names=query.columns.astype(str))
+)
+
+# Normalize the matrix with scanpy:
+# normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(query, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(query)
+
+# load model
+print('@ LOAD MODEL')
+SVM_model = pickle.load(open(model_path, 'rb'))
+print('@ DONE')
+
+# Allow the prediction step to use a different number of threads
+# than the model was trained with
+SVM_model.n_jobs = threads
+
+# Calculate predicted probability
+pred_proba = SVM_model.predict_proba(query.X)
+df = pd.DataFrame(pred_proba, index = query.obs_names, columns = SVM_model.classes_)
+
+# Create 'pred_label'/'proba_label' columns holding the name and value of the
+# highest-probability class for each row
+df['pred_label'], df['proba_label'] = zip(*df.apply(get_max_column_and_value, axis=1))
+
+# Store the label as 'Unknown' when the top probability falls below the threshold
+df['pred_label_reject'] = df.apply(lambda row: 'Unknown' if row['proba_label'] < threshold else row['pred_label'], axis=1)
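+# For reference, the same two columns could be computed without the row-wise
+# apply (a hedged alternative, to be evaluated before extra columns are added):
+#   df['pred_label'] = df.idxmax(axis=1)
+#   df['proba_label'] = df.max(axis=1)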
+
+print('@ WRITING PREDICTIONS')
+pred_df = pd.DataFrame({'cell': df.index, tool_name: df.pred_label_reject})
+pred_df.to_csv(out_path, index = False)
+print('@ DONE')
+
+#------------- Other outputs --------------
+
+# Save the prob matrix
+print('@ WRITING PROB MATRIX')
+filename = out_other_path + '/prob_matrix.csv'
+df.to_csv(filename,
+          index=True) # True because we want to keep the rownames (cells)
+print('@ DONE')
diff --git a/Scripts/SVC/train_SVM.py b/Scripts/SVC/train_SVM.py
new file mode 100644
index 0000000..7d62e02
--- /dev/null
+++ b/Scripts/SVC/train_SVM.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 24 18:37:59 2023
+
+@author: tomas.vega
+"""
+# Python 3.11.2
+
+#--------------- Libraries -------------------
+import numpy as np
+import pandas as pd
+from sklearn.svm import SVC
+from sklearn.calibration import CalibratedClassifierCV
+import anndata as ad
+import sys
+import scanpy as sc
+import pickle
+import os
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+ref_path = str(sys.argv[1])
+lab_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+classifier = str(sys.argv[4])
+threads = int(sys.argv[5])
+
+#--------------- Data -------------------------
+# read the data
+ref = pd.read_csv(ref_path,
+                  index_col=0,
+                  sep=',',
+                  engine='c') ## pyarrow could be faster, but it would need to be installed as a module and has many dependency problems
+
+labels = pd.read_csv(lab_path,
+                     index_col = 0,
+                     sep=',')
+
+# check if cell names are in the same order in labels and ref
+order = all(labels.index == ref.index)
+
+# throw error if order is not the same
+if not order:
+    sys.exit("@ Order of cells in reference and labels do not match")
+
+adata = ad.AnnData(X = ref,
+                   obs = dict(obs_names=ref.index.astype(str)),
+                   var = dict(var_names=ref.columns.astype(str))
+                   )
+
+# Normalize the matrix with scanpy:
+# normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(adata)
+
+# Transform labels to array
+label = np.array(labels['label'])
+
+#------------- Train SVC -------------
+# The kernel can be 'linear', 'poly', 'rbf', 'sigmoid' or 'precomputed'.
+# When the constructor option probability is set to True, class membership
+# probability estimates (from the methods predict_proba and predict_log_proba)
+# are enabled.
+svm_model = SVC(kernel=classifier,
+                probability=True)
+
+calibrated_model = CalibratedClassifierCV(svm_model,
+                                          n_jobs = threads)
+calibrated_model.fit(adata.X, label)
+
+# Save the model to disk
+print('@ SAVE MODEL')
+pickle.dump(calibrated_model, open(out_path, 'wb'))
+print('@ DONE')
+
+#------------- Other outputs --------------
+# Write the training parameters
+filepath = out_other_path + '/train_parameters.csv'
+print('@ WRITE TRAIN PARAMETERS')
+params_table = calibrated_model.get_params()
+params_table = pd.DataFrame.from_dict(params_table,
+                                      orient= 'index')
+params_table.to_csv(filepath)
+print('@ DONE')
+
+# Feature weights per class are not extracted here: coef_ exists for an SVM
+# classifier only with a linear kernel. For other kernels the data are
+# transformed by the kernel method into another space, which is not related to
+# the input space
+# (https://stackoverflow.com/questions/21260691/how-to-obtain-features-weights)
+# Scores are also unavailable because CalibratedClassifierCV is used,
+# and it doesn't have a score attribute.
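+# If a linear kernel is requested, a hedged sketch (not executed here) for
+# inspecting raw feature weights on an uncalibrated fit could be:
+#
+#   if classifier == 'linear':
+#       weights = pd.DataFrame(SVC(kernel='linear').fit(adata.X, label).coef_)
+#       weights.to_csv(out_other_path + '/linear_kernel_weights.csv')  # one row per one-vs-one class pair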
+
diff --git a/Scripts/SVC/train_linearSVM.py b/Scripts/SVC/train_linearSVM.py
new file mode 100644
index 0000000..73e2231
--- /dev/null
+++ b/Scripts/SVC/train_linearSVM.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 24 18:37:59 2023
+
+@author: tomas.vega
+"""
+## Python 3.11.2
+
+#--------------- Libraries -------------------
+import numpy as np
+import pandas as pd
+from sklearn.svm import LinearSVC
+from sklearn.calibration import CalibratedClassifierCV
+import anndata as ad
+import sys
+import scanpy as sc
+import pickle
+import os
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+ref_path = str(sys.argv[1])
+lab_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+threads = int(sys.argv[4])
+
+#--------------- Data -------------------------
+# read the data
+ref = pd.read_csv(ref_path,
+                  index_col=0,
+                  sep=',',
+                  engine='c') ## pyarrow could be faster, but it would need to be installed as a module and has many dependency problems
+
+labels = pd.read_csv(lab_path,
+                     index_col = 0,
+                     sep=',')
+
+# check if cell names are in the same order in labels and ref
+order = all(labels.index == ref.index)
+
+# throw error if order is not the same
+if not order:
+    sys.exit("@ Order of cells in reference and labels do not match")
+
+adata = ad.AnnData(X = ref,
+                   obs = dict(obs_names=ref.index.astype(str)),
+                   var = dict(var_names=ref.columns.astype(str))
+)
+
+# Normalize the matrix with scanpy:
+# normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(adata)
+
+## Transform labels to array
+label = np.array(labels['label'])
+
+#------------- Train linear SVM -------------
+# LinearSVC is always linear (it has no kernel option) and provides no class
+# membership probability estimates of its own, so the model is wrapped in
+# CalibratedClassifierCV below to obtain predict_proba.
+svm_model = LinearSVC()
+
+# Calibrate the model using 5-fold cross validation (the default)
+calibrated_model = CalibratedClassifierCV(svm_model,
+                                          n_jobs = threads)
+calibrated_model.fit(adata.X, label)
+
+# Save the model to disk
+print('@ SAVE MODEL')
+pickle.dump(calibrated_model, open(out_path, 'wb'))
+print('@ DONE')
+
+#------------- Other outputs --------------
+# Write the training parameters
+filepath = out_other_path + '/train_parameters.csv'
+print('@ WRITE TRAIN PARAMETERS')
+params_table = calibrated_model.get_params()
+params_table = pd.DataFrame.from_dict(params_table,
+                                      orient= 'index')
+params_table.to_csv(filepath)
+print('@ DONE')
+
+# Feature weights are not extracted here: the fitted LinearSVC instances are
+# wrapped inside CalibratedClassifierCV, which exposes neither coef_ nor the
+# decision scores directly
+# (https://stackoverflow.com/questions/21260691/how-to-obtain-features-weights)
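+# A hedged sketch (not executed here) of how per-class weights could be
+# recovered from the calibrated ensemble if needed; each CV fold keeps its
+# fitted LinearSVC (the attribute is `estimator` in recent scikit-learn
+# versions, `base_estimator` in older ones):
+#
+#   fold_coefs = [cc.estimator.coef_ for cc in calibrated_model.calibrated_classifiers_]
+#   mean_coefs = pd.DataFrame(np.mean(fold_coefs, axis=0), index=calibrated_model.classes_)
+#   mean_coefs.to_csv(out_other_path + '/mean_linear_weights.csv')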
diff --git a/Scripts/SciBet/predict_SciBet.R b/Scripts/SciBet/predict_SciBet.R
new file mode 100644
index 0000000..54b8d4d
--- /dev/null
+++ b/Scripts/SciBet/predict_SciBet.R
@@ -0,0 +1,79 @@
+# load libraries and arguments
+library(data.table)
+library(scibet)
+library(WGCNA)
+library(tidyverse)
+library(Seurat)
+library(glue)
+
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+query_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+# path for other outputs (depends on the tool)
+out_path = dirname(pred_path)
+
+#--------------- Data -------------------
+
+# read query matrix and transpose
+message('@ READ QUERY')
+
+# The query matrix is transposed since it needs to be normalized with
+# Seurat, which expects a genes x cells matrix.
+query <- data.table::fread(query_path,
+                           data.table=F,
+                           header=T,
+                           nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  WGCNA::transposeBigData()
+
+message('@ DONE')
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# The matrix is transposed back since SciBet expects a cells x genes matrix.
+query <- Seurat::NormalizeData(query) %>% as.data.frame() %>% WGCNA::transposeBigData()
+
+#----------- Predict SciBet --------
+
+pred <- Scibet_model(query,
+                     result = 'list')
+
+pred_labels <- data.frame(cell = rownames(query),
+                          SciBet = pred)
+
+message('@ WRITING PREDICTIONS')
+data.table::fwrite(pred_labels,
+                   file = pred_path,
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
+
+#------------- Other outputs --------------
+
+# Running the same model twice gives identical results (tested),
+# so it is run again here to obtain the probability matrix.
+pred_matrix <- Scibet_model(query,
+                            result = 'table')
+rownames(pred_matrix) <- rownames(query)
+pred_matrix <- pred_matrix %>% as.data.frame() %>% tibble::rownames_to_column(" ")
+
+message('@ SAVE PRED MATRIX')
+data.table::fwrite(pred_matrix,
+                   file = glue('{out_path}/Query_probabilities_matrix.csv'),
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads
+)
+message('@ DONE')
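+# For reference, a single call could serve both the labels and the
+# probabilities, since the predicted label is the argmax over the probability
+# table (a hedged sketch, assuming the table columns are the reference labels):
+#
+#   prob <- Scibet_model(query, result = 'table')
+#   pred <- colnames(prob)[max.col(as.matrix(prob))]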
+#----------------------------------------
diff --git a/Scripts/SciBet/train_SciBet.R b/Scripts/SciBet/train_SciBet.R
new file mode 100644
index 0000000..c004d04
--- /dev/null
+++ b/Scripts/SciBet/train_SciBet.R
@@ -0,0 +1,109 @@
+# load libraries and arguments
+library(data.table)
+library(scibet)
+library(tidyverse)
+library(WGCNA)
+library(Seurat)
+library(glue)
+
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read reference matrix and transpose
+message('@ READ REF')
+
+# The reference matrix is transposed since it needs to be normalized with
+# Seurat, which expects a genes x cells matrix.
+ref <- data.table::fread(ref_path,
+                         data.table=F,
+                         header=T,
+                         nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  transposeBigData()
+
+message('@ DONE')
+
+# read reference labels
+labels <- data.table::fread(lab_path,
+                            data.table=F,
+                            header=T,
+                            nThread=threads) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(colnames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# The matrix is transposed back since SciBet expects a cells x genes matrix,
+# and converted to a data.frame since the labels need to be added.
+ref <- NormalizeData(ref) %>% as.data.frame() %>% transposeBigData()
+ref$label <- labels$label
+
+#------------- Train SciBet -------------
+
+Scibet_model <- scibet::Learn(expr = ref)
+
+# save trained model
+message('@ SAVE MODEL')
+
+save(Scibet_model,
+     file = model_path)
+
+message('@ DONE')
+
+#------------- Other outputs --------------
+
+# Transforming the model into a matrix
+Scibet_matrix <- scibet::ExportModel(Scibet_model) %>% as.data.frame() %>% tibble::rownames_to_column(" ")
+
+message('@ SAVE MODEL MATRIX')
+data.table::fwrite(Scibet_matrix,
+                   file = glue('{out_path}/model_matrix.csv'),
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads
+)
+message('@ DONE')
+
+# Taking the genes selected by the model
+selected_genes <- Scibet_matrix[,1,drop=T]
+
+# Plotting a marker heatmap
+message('@ PLOTTING MARKER HEATMAP')
+# width and height are arguments of pdf(), not of the glue() file name
+pdf(glue('{out_path}/selected_markers_per_label.pdf'),
+    width = 12,
+    height = 12)
+marker_plot <- scibet::Marker_heatmap(expr = ref,
+                                      gene = selected_genes)
+plot(marker_plot)
+dev.off()
+message('@ DONE')
+
+df_markers <- marker_plot %>% .$data %>% filter(group == cell_type) %>% select(c('gene','cell_type','zscore'))
+
+# write df_markers
+message('@ SAVE MARKERS')
+data.table::fwrite(df_markers,
+                   file = glue('{out_path}/selected_markers_per_label.csv'),
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads
+)
+message('@ DONE')
+#----------------------------------------
diff --git a/Scripts/SingleR/predict_SingleR.R b/Scripts/SingleR/predict_SingleR.R
new file mode 100644
index 0000000..58d4765
--- /dev/null
+++ b/Scripts/SingleR/predict_SingleR.R
@@ -0,0 +1,51 @@
+# load libraries
+library(tidyverse)
+library(SingleR)
+library(SingleCellExperiment)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+sample_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+
+# read query matrix and transpose
+message('@ READ QUERY')
+query = data.table::fread(sample_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# Make SingleCellExperiment object from query (transpose query first)
+query = transposeBigData(query)
+query = SingleCellExperiment(assays = list(counts = query))
+
+# Log normalize query counts
+message('@ NORMALIZE QUERY')
+query = scuttle::logNormCounts(query)
+message('@ DONE')
+
+#----------- Predict SingleR ------------
+
+# predict labels
+message('@ PREDICT LABELS')
+pred = classifySingleR(query, singler, assay.type = "logcounts")
+message('@ DONE')
+
+pred_labs = data.frame(cell = rownames(pred),
+                       SingleR = pred$labels)
+
+# write prediction
+data.table::fwrite(pred_labs, file = pred_path)
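+# The SingleR prediction object also carries per-label assignment scores; a
+# hedged sketch (not executed here) for saving them alongside the labels:
+#
+#   scores = as.data.frame(pred$scores)
+#   scores$cell = rownames(pred)
+#   data.table::fwrite(scores, file = file.path(dirname(pred_path), 'SingleR_scores.csv'))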
+
+#----------------------------------------
+
diff --git a/Scripts/SingleR/train_SingleR.R b/Scripts/SingleR/train_SingleR.R
new file mode 100644
index 0000000..74b46b5
--- /dev/null
+++ b/Scripts/SingleR/train_SingleR.R
@@ -0,0 +1,59 @@
+# load libraries
+library(tidyverse)
+library(SingleR)
+library(SingleCellExperiment)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+# read reference matrix and transpose
+message('@ READ REF')
+ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# read reference labels
+labels = data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# make SingleCellExperiment object (transpose ref first)
+ref = transposeBigData(ref)
+ref = SingleCellExperiment(assays = list(counts = ref))
+
+# log normalize reference
+message('@ NORMALIZE REF')
+ref = scuttle::logNormCounts(ref)
+message('@ DONE')
+
+#------------- Train SingleR -------------
+
+# train SingleR
+message('@ TRAINING MODEL')
+singler = trainSingleR(ref,
+                       labels=labels$label,
+                       assay.type = "logcounts",
+                       de.method = "wilcox",
+                       de.n = 10)
+message('@ DONE')
+
+# save trained model
+message('@ SAVE MODEL')
+save(singler, file = model_path)
+message('@ DONE')
+
+#----------------------------------------
diff --git a/Scripts/Tangram/run_Tangram.py b/Scripts/Tangram/run_Tangram.py
new file mode 100644
index 0000000..9b04f1f
--- /dev/null
+++ b/Scripts/Tangram/run_Tangram.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 8 17:11:20 2023

+@author: tomas.vega
+"""
+## Python 3.11.2
+#--------------- Libraries -------------------
+import tangram as tg
+import anndata as ad
+import scanpy as sc
+import numpy as np
+import pandas as pd
+import sys
+import os   # needed below for os.path.dirname
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+ref_path = str(sys.argv[1])
+lab_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+sp_dataset = str(sys.argv[4])
+mode = str(sys.argv[5])
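+# Expected invocation, as a hedged example (paths are placeholders):
+#   python run_Tangram.py ref.csv labels.csv out/Tangram_pred.csv spatial.h5ad clusters
+# `mode` is forwarded to tangram's map_cells_to_space below ('clusters' maps at
+# the cluster level using the reference labels; other tangram modes map cells).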
+
+#--------------- Data -------------------------
+
+# read the data
+ref = pd.read_csv(ref_path,
+                  index_col=0,
+                  sep=',',
+                  engine='c') ## pyarrow could be faster, but it would need to be installed as a module and has many dependency problems
+
+labels = pd.read_csv(lab_path,
+                     index_col = 0,
+                     sep=',')
+
+# check if cell names are in the same order in labels and ref
+order = all(labels.index == ref.index)
+
+# throw error if order is not the same
+if not order:
+    sys.exit("@ Order of cells in reference and labels do not match")
+
+ad_sc = ad.AnnData(X = ref,
+                   obs = dict(obs_names=ref.index.astype(str),
+                              label = labels['label']),
+                   var = dict(var_names=ref.columns.astype(str))
+                   )
+
+# Normalize the matrix with scanpy:
+# normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(ad_sc, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(ad_sc)
+
+# read spatial query
+ad_sp = sc.read_h5ad(sp_dataset)
+
+# Use only the genes shared by the spatial and single-cell datasets
+tg.pp_adatas(ad_sc, ad_sp, genes=None)
+
+if mode == 'clusters':
+    ad_map = tg.map_cells_to_space(
+        ad_sc,
+        ad_sp,
+        mode=mode,
+        cluster_label='label')
+else:
+    ad_map = tg.map_cells_to_space(
+        ad_sc,
+        ad_sp,
+        mode=mode)
+
+tg.project_cell_annotations(ad_map, ad_sp, annotation='label')
+df = ad_sp.obsm['tangram_ct_pred'].copy()
+
+# Function to get the column name and maximum value for each row
+def get_max_column_and_value(row):
+    pred_label = row.idxmax()
+    proba_label = row.max()
+    return pred_label, proba_label
+
+# Create 'pred_label'/'score_label' columns holding the name and value of the
+# highest-scoring label for each row
+df['pred_label'], df['score_label'] = zip(*df.apply(get_max_column_and_value, axis=1))
+
+print('@ WRITING PREDICTIONS')
+pred_df = pd.DataFrame({'spot': df.index, 'Tangram': df.pred_label})
+pred_df.to_csv(out_path, index = False)
+print('@ DONE')
+
+#------------- Other outputs --------------
+
+# Save transferred score matrix
+print('@ WRITING TRANSFERRED SCORE MATRIX')
+
+filename = out_other_path + '/label_score_matrix.csv'
+df.to_csv(filename,
+          index=True) # True because we want to keep the rownames (cells)
+
+print('@ DONE')
+
+# Save map object that contains the cell x spot probabilities
+print('@ WRITING OUTPUT OBJECT')
+
+filename = out_other_path + '/Tangram_mapped_object.h5ad'
+ad_map.write_h5ad(filename= filename,
+                  compression='gzip')
+
+print('@ DONE')
+
+# Save cell x spot probability matrix (written as CSV, so use a .csv extension)
+print('@ WRITING PROB MATRIX')
+
+filename = out_other_path + '/prob_matrix.csv'
+prob_matrix = pd.DataFrame(ad_map.X, index = ad_map.obs_names, columns = ad_map.var_names)
+prob_matrix.to_csv(filename,
+                   index=True) # True because we want to keep the rownames (cells)
+
+print('@ DONE')
diff --git a/Benchmarking/LICENSE b/Scripts/archive/Benchmarking/LICENSE
similarity index 100%
rename from Benchmarking/LICENSE
rename to Scripts/archive/Benchmarking/LICENSE
diff --git a/Benchmarking/Scripts/Cross_Validation.R b/Scripts/archive/Benchmarking/Scripts/Cross_Validation.R
similarity index 100%
rename from Benchmarking/Scripts/Cross_Validation.R
rename to Scripts/archive/Benchmarking/Scripts/Cross_Validation.R
diff --git a/Benchmarking/Scripts/check_preds.R b/Scripts/archive/Benchmarking/Scripts/check_preds.R
similarity index 100%
rename from Benchmarking/Scripts/check_preds.R
rename to Scripts/archive/Benchmarking/Scripts/check_preds.R
diff --git a/Benchmarking/Scripts/complexity/F2.py b/Scripts/archive/Benchmarking/Scripts/complexity/F2.py
similarity index 100%
rename from Benchmarking/Scripts/complexity/F2.py
rename to Scripts/archive/Benchmarking/Scripts/complexity/F2.py
diff --git a/Benchmarking/Scripts/complexity/N1.py b/Scripts/archive/Benchmarking/Scripts/complexity/N1.py
similarity index 100%
rename from Benchmarking/Scripts/complexity/N1.py
rename to Scripts/archive/Benchmarking/Scripts/complexity/N1.py
diff --git a/Benchmarking/Scripts/complexity/prep.R b/Scripts/archive/Benchmarking/Scripts/complexity/prep.R
similarity index 100%
rename from Benchmarking/Scripts/complexity/prep.R
rename to Scripts/archive/Benchmarking/Scripts/complexity/prep.R
diff --git a/Benchmarking/Scripts/evaluate.R b/Scripts/archive/Benchmarking/Scripts/evaluate.R
similarity index 100%
rename from
Benchmarking/Scripts/evaluate.R rename to Scripts/archive/Benchmarking/Scripts/evaluate.R diff --git a/Benchmarking/Scripts/run_CHETAH.R b/Scripts/archive/Benchmarking/Scripts/run_CHETAH.R similarity index 100% rename from Benchmarking/Scripts/run_CHETAH.R rename to Scripts/archive/Benchmarking/Scripts/run_CHETAH.R diff --git a/Benchmarking/Scripts/run_Correlation.R b/Scripts/archive/Benchmarking/Scripts/run_Correlation.R similarity index 100% rename from Benchmarking/Scripts/run_Correlation.R rename to Scripts/archive/Benchmarking/Scripts/run_Correlation.R diff --git a/Benchmarking/Scripts/run_SVM.py b/Scripts/archive/Benchmarking/Scripts/run_SVM.py similarity index 100% rename from Benchmarking/Scripts/run_SVM.py rename to Scripts/archive/Benchmarking/Scripts/run_SVM.py diff --git a/Benchmarking/Scripts/run_SVM_rejection.py b/Scripts/archive/Benchmarking/Scripts/run_SVM_rejection.py similarity index 100% rename from Benchmarking/Scripts/run_SVM_rejection.py rename to Scripts/archive/Benchmarking/Scripts/run_SVM_rejection.py diff --git a/Benchmarking/Scripts/run_SciBet.R b/Scripts/archive/Benchmarking/Scripts/run_SciBet.R similarity index 100% rename from Benchmarking/Scripts/run_SciBet.R rename to Scripts/archive/Benchmarking/Scripts/run_SciBet.R diff --git a/Benchmarking/Scripts/run_SingleR.R b/Scripts/archive/Benchmarking/Scripts/run_SingleR.R similarity index 100% rename from Benchmarking/Scripts/run_SingleR.R rename to Scripts/archive/Benchmarking/Scripts/run_SingleR.R diff --git a/Benchmarking/Scripts/run_scClassify.R b/Scripts/archive/Benchmarking/Scripts/run_scClassify.R similarity index 100% rename from Benchmarking/Scripts/run_scClassify.R rename to Scripts/archive/Benchmarking/Scripts/run_scClassify.R diff --git a/Benchmarking/Scripts/run_scHPL.py b/Scripts/archive/Benchmarking/Scripts/run_scHPL.py similarity index 100% rename from Benchmarking/Scripts/run_scHPL.py rename to Scripts/archive/Benchmarking/Scripts/run_scHPL.py diff --git a/Benchmarking/Scripts/run_scPred.R b/Scripts/archive/Benchmarking/Scripts/run_scPred.R similarity index 100% rename from Benchmarking/Scripts/run_scPred.R rename to Scripts/archive/Benchmarking/Scripts/run_scPred.R diff --git a/Benchmarking/Scripts/run_scmap.R b/Scripts/archive/Benchmarking/Scripts/run_scmap.R similarity index 100% rename from Benchmarking/Scripts/run_scmap.R rename to Scripts/archive/Benchmarking/Scripts/run_scmap.R diff --git a/Benchmarking/Scripts/run_scmapcell.R b/Scripts/archive/Benchmarking/Scripts/run_scmapcell.R similarity index 100% rename from Benchmarking/Scripts/run_scmapcell.R rename to Scripts/archive/Benchmarking/Scripts/run_scmapcell.R diff --git a/Benchmarking/Scripts/run_scmapcluster.R b/Scripts/archive/Benchmarking/Scripts/run_scmapcluster.R similarity index 100% rename from Benchmarking/Scripts/run_scmapcluster.R rename to Scripts/archive/Benchmarking/Scripts/run_scmapcluster.R diff --git a/Benchmarking/Scripts/run_scmaptotal.R b/Scripts/archive/Benchmarking/Scripts/run_scmaptotal.R similarity index 100% rename from Benchmarking/Scripts/run_scmaptotal.R rename to Scripts/archive/Benchmarking/Scripts/run_scmaptotal.R diff --git a/Benchmarking/Scripts/run_singleCellNet.R b/Scripts/archive/Benchmarking/Scripts/run_singleCellNet.R similarity index 100% rename from Benchmarking/Scripts/run_singleCellNet.R rename to Scripts/archive/Benchmarking/Scripts/run_singleCellNet.R diff --git a/Benchmarking/Snakefile b/Scripts/archive/Benchmarking/Snakefile similarity index 100% rename from Benchmarking/Snakefile rename to 
Scripts/archive/Benchmarking/Snakefile diff --git a/Benchmarking/dag.pdf b/Scripts/archive/Benchmarking/dag.pdf similarity index 100% rename from Benchmarking/dag.pdf rename to Scripts/archive/Benchmarking/dag.pdf diff --git a/Benchmarking/requirements.txt b/Scripts/archive/Benchmarking/requirements.txt similarity index 100% rename from Benchmarking/requirements.txt rename to Scripts/archive/Benchmarking/requirements.txt diff --git a/Scripts/Gather_Preds.py b/Scripts/archive/Gather_Preds.py similarity index 99% rename from Scripts/Gather_Preds.py rename to Scripts/archive/Gather_Preds.py index e56f423..8e47698 100644 --- a/Scripts/Gather_Preds.py +++ b/Scripts/archive/Gather_Preds.py @@ -61,7 +61,7 @@ def run_concat(Results_Folder_Path, tools_for_consensus): for path in paths[1:]: current = pd.read_csv(path, index_col = 0) result = pd.concat([result, current], axis = 1) - + # case 1: get consensus of all tools if tools_for_consensus[0] == 'all': mode = result.select_dtypes(exclude=['float64']).mode(axis = 1) diff --git a/Scripts/plot_preds.R b/Scripts/archive/plot_preds.R similarity index 100% rename from Scripts/plot_preds.R rename to Scripts/archive/plot_preds.R diff --git a/Scripts/archive/preprocess.R b/Scripts/archive/preprocess.R new file mode 100644 index 0000000..7215be4 --- /dev/null +++ b/Scripts/archive/preprocess.R @@ -0,0 +1,99 @@ +library(glue) +library(data.table) + +preprocess_data<-function(RefPath, QueryPaths, OutputDir, check_genes = FALSE, genes_required = NULL){ + " + run preproccessing, normalization, and writing down of matrices with common genes + Wrapper script to run preproccessing, normalization, and writing down of matrices with common genes between query and reference datasets + outputs lists of true and predicted cell labels as csv files, as well as computation time. + + Parameters + ---------- + RefPath : Data file path (.csv), cells-genes matrix with cell unique barcodes + as row names and gene names as column names. + LabelsPath : Cell population annotations (per cell) file path (.csv). + QueryPaths : List of Query datasets paths + OutputDir : Output directory defining the path of the exported files. 
+ genes_required: path to file containing required genes in common feature space with column name genes_required + " + + if(check_genes){ + genes_required <- as.vector(read.csv(genes_required$genes_required))} + + message("@Reading reference") + # read data and labels + # Read the reference expression matrix + Data <- fread(RefPath,data.table=FALSE) + message("@Done reading reference") + + row.names(Data) <- Data$V1 + Data <- Data[, 2:ncol(Data)] + colnames(Data) <- toupper(colnames(Data)) + + # set the genes of the reference to common_genes + common_genes <- colnames(Data) + + # loop over all query datasets and get common genes with all + for(query in QueryPaths){ + message("reading Test") + # Read Query + Test <- fread(query,data.table=FALSE) + row.names(Test) <- Test$V1 + Test <- Test[, 2:ncol(Test)] + # Map gene names to upper + colnames(Test) <- toupper(colnames(Test)) + # subset based on common genes + common_genes<- as.vector(intersect(common_genes, colnames(Test))) + } + + # check if the common genes + if(check_genes){ + if(!(genes_required %in% common_genes)){ + warning("Not all genes required in Gene list are found in the common genes between reference and query") + }} + + # loop over query datasets and subset them to common genes and write it down in each query output dir + for(query in QueryPaths){ + # Read Query + Test <- fread(query,data.table=FALSE) + row.names(Test) <- Test$V1 + Test <- Test[, 2:ncol(Test)] + colnames(Test) <- toupper(colnames(Test)) + Test <- Test[,common_genes] + + # write down query expression matrices + dir.create(file.path(OutputDir, basename(dirname(query))), showWarnings = FALSE) + setwd(paste(OutputDir, basename(dirname(query)),sep= '/')) + fwrite(Test, "expression.csv",row.names=T) + } + Data <- Data[,common_genes ] + fwrite(Data, glue("{OutputDir}/expression.csv"),row.names=T) + fwrite(as.data.frame(common_genes), glue("{OutputDir}/common_genes.csv")) +} + +# Get Command line arguments +args <- commandArgs(trailingOnly = TRUE) +# Split the arguments to form lists +arguments <- paste(unlist(args),collapse=' ') +listoptions <- unlist(strsplit(arguments,'--'))[-1] +# Get individual argument names +options.args <- sapply(listoptions,function(x){ + unlist(strsplit(x, ' '))[-1] + }) +options.names <- sapply(listoptions,function(x){ + option <- unlist(strsplit(x, ' '))[1] +}) + +# Set variables containing command line argument values +names(options.args) <- unlist(options.names) +ref <- unlist(options.args['ref']) +print(ref) +test <- unlist(options.args['query']) +output_dir <- unlist(options.args['output_dir' ]) +check_genes <- unlist(options.args['check_genes' ]) +genes_required <- unlist(options.args['genes_required' ]) + + +preprocess_data(ref, test, output_dir, check_genes, genes_required) + + diff --git a/Scripts/run_ACTINN.py b/Scripts/archive/run_ACTINN.py similarity index 82% rename from Scripts/run_ACTINN.py rename to Scripts/archive/run_ACTINN.py index 774ae73..dc1f976 100644 --- a/Scripts/run_ACTINN.py +++ b/Scripts/archive/run_ACTINN.py @@ -52,7 +52,7 @@ def run_ACTINN(RefPath, LabelsPath, QueryPaths, OutputDirs): tm.sleep(60) # RUN ACTINN formatting - os.system("python ../Scripts/ACTINN_scripts/actinn_format.py -i train.csv -o train -f csv") + os.system("python /lustre06/project/6004736/alvann/from_narval/DEV/scCoAnnotate-dev/Scripts/ACTINN/actinn_format.py -i train.csv -o train -f csv") i = 0 for Query in QueryPaths: @@ -66,24 +66,24 @@ def run_ACTINN(RefPath, LabelsPath, QueryPaths, OutputDirs): Query = np.log1p(Query) Query = 
Query.transpose() Query.to_csv("Query.csv") - os.system("python ../Scripts/ACTINN_scripts/actinn_format.py -i Query.csv -o Query -f csv") + os.system("python /lustre06/project/6004736/alvann/from_narval/DEV/scCoAnnotate-dev/Scripts/ACTINN/actinn_format.py -i Query.csv -o Query -f csv") # measure total time start = tm.time() # execute the actinn prediction file - os.system("python ../Scripts/ACTINN_scripts/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts Query.h5 -op False ") + os.system("python /lustre06/project/6004736/alvann/from_narval/DEV/scCoAnnotate-dev/Scripts/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts Query.h5 -op False ") tot.append(tm.time()-start) tm.sleep(60) # read predictions and probabilities - predlabels = pd.read_csv('{tmp}/predicted_label.txt'.format(tmp = tmp_path),header=0,index_col=0, sep='\t') + predlabels = pd.read_csv('{tmp}/predicted_label.txt'.format(tmp = tmp_path),sep='\t') - pred = pd.DataFrame({"ACTINN":predlabels['celltype']}, index = predlabels.index) + pred = pd.DataFrame({"cell":predlabels["cellname"], "ACTINN":predlabels["celltype"]}) tot_time = pd.DataFrame(tot) # output the files os.chdir(path) - pred.to_csv("ACTINN_pred.csv", index = True) + pred.to_csv("ACTINN_pred.csv", index = False) tot_time.to_csv("ACTINN_training_time.csv", index = False) tot_time.to_csv("ACTINN_query_time.csv", index = False) i = i+1 diff --git a/Scripts/run_CHETAH.R b/Scripts/archive/run_CHETAH.R similarity index 100% rename from Scripts/run_CHETAH.R rename to Scripts/archive/run_CHETAH.R diff --git a/Scripts/run_SVM_reject.py b/Scripts/archive/run_SVM_reject.py similarity index 100% rename from Scripts/run_SVM_reject.py rename to Scripts/archive/run_SVM_reject.py diff --git a/Scripts/run_SciBet.R b/Scripts/archive/run_SciBet.R similarity index 100% rename from Scripts/run_SciBet.R rename to Scripts/archive/run_SciBet.R diff --git a/Scripts/run_SingleCellNet.R b/Scripts/archive/run_SingleCellNet.R similarity index 100% rename from Scripts/run_SingleCellNet.R rename to Scripts/archive/run_SingleCellNet.R diff --git a/Scripts/run_SingleR.R b/Scripts/archive/run_SingleR.R similarity index 97% rename from Scripts/run_SingleR.R rename to Scripts/archive/run_SingleR.R index 640fc10..d273c09 100644 --- a/Scripts/run_SingleR.R +++ b/Scripts/archive/run_SingleR.R @@ -71,7 +71,6 @@ run_SingleR <- function(RefPath, LabelsPath, QueryPaths, OutputDirs){ # tidy up prediction dataframe Pred_Labels_SingleR <- as.vector(pred$labels) Pred_Labels_SingleR <- data.frame(SingleR =Pred_Labels_SingleR,row.names = cellnames) - Test_Time_SingleR <- as.numeric(difftime(end_time,start_time,units = 'secs')) # Create SingleR subdir in target dir dir.create(file.path(OutputDir, "SingleR"), showWarnings = FALSE) diff --git a/Scripts/run_correlation.R b/Scripts/archive/run_correlation.R similarity index 100% rename from Scripts/run_correlation.R rename to Scripts/archive/run_correlation.R diff --git a/Scripts/run_scClassify.R b/Scripts/archive/run_scClassify.R similarity index 100% rename from Scripts/run_scClassify.R rename to Scripts/archive/run_scClassify.R diff --git a/Scripts/run_scHPL.py b/Scripts/archive/run_scHPL.py similarity index 100% rename from Scripts/run_scHPL.py rename to Scripts/archive/run_scHPL.py diff --git a/Scripts/run_scPred.R b/Scripts/archive/run_scPred.R similarity index 100% rename from Scripts/run_scPred.R rename to Scripts/archive/run_scPred.R diff --git a/Scripts/run_scmapcell.R b/Scripts/archive/run_scmapcell.R similarity index 100% rename from 
Scripts/run_scmapcell.R rename to Scripts/archive/run_scmapcell.R diff --git a/Scripts/run_scmapcluster.R b/Scripts/archive/run_scmapcluster.R similarity index 100% rename from Scripts/run_scmapcluster.R rename to Scripts/archive/run_scmapcluster.R diff --git a/snakefile b/Scripts/archive/snakefile similarity index 66% rename from snakefile rename to Scripts/archive/snakefile index 6c2d8e0..728d8bb 100644 --- a/snakefile +++ b/Scripts/archive/snakefile @@ -1,192 +1,192 @@ # import libraries -report: "report/workflow.rst" import os # Get the names of query samples from the paths given in the query section of the config samples = [os.path.basename(os.path.dirname(query_path)) for query_path in config['query_datasets']] -# Get the names of tools to run from the list provided in the config -tools = config['tools_to_run'] - # Create subdirectories for each query sample for sample in samples: path = "{output_dir}/{sample}".format(output_dir = config["output_dir"], sample = sample) if not os.path.exists(path): os.mkdir(path) -# create the prediction output list as an expand() output -output = expand( - "{output_dir}/{sample}/{tool}/{tool}_pred.csv", - tool = config['tools_to_run'], - output_dir = config["output_dir"], - sample = samples) - -# loop to only mark tools that havent been run on all samples to avoid rerunning -to_run = [] -# loop over all tools -for tool in tools: - alreadyrun = True - for sample in samples: - # check if the prediction output of the tool is present for all query datasets - path = "{output_dir}/{sample}/{tool}/{tool}_pred.csv".format(output_dir = config["output_dir"], sample = sample, tool = tool) - # if file is not present in all, mark it for running - if not os.path.isfile(path): - alreadyrun = False - if alreadyrun == False: - to_run.append(tool) - -# config file to check if certain genes required by the user are present inb the list of common genes between reference and queries -genes_required = config['genes_required'] - -# create benchmarking directory is user chooses to perform benchmarking on the reference -benchmarking_dir = "{output_dir}/benchmarking".format(output_dir = config["output_dir"]) -if not os.path.exists(benchmarking_dir) and bool(config['benchmark']): - os.mkdir(benchmarking_dir) - -# create the benchmarking config file and save to benchmarking subdirectory -if bool(config['benchmark']): - outF = open("{dir}/benchmark.yml".format(dir = benchmarking_dir), "w") - outF.write("output_dir: {output_dir}\n".format(output_dir = benchmarking_dir )) - outF.write("datafile: {reference} \n".format(reference = config['training_reference'])) - outF.write("labfile: {labels}\n".format(labels = config['reference_annotations'])) - outF.write("rejection: \"True\"\n") - outF.write("column: 2\n") - outF.write("metrics: \n") - outF.write(" - N1\n") - outF.write(" - F2\n") - outF.write("tools_to_run:\n") - outF.write(" - Correlation\n") - outF.write(" - SciBet\n") - outF.close() - -# invoke the benchmarking subworkflow (saved in the benchmarking folder in scCoAnnotate dir) -subworkflow Benchmark: - workdir: - "Benchmarking/" - snakefile: - "Benchmarking/Snakefile" - configfile: - "{benchmarking_dir}/benchmark.yml".format(benchmarking_dir = benchmarking_dir) - - - -# use rule all to dictate the final output """ One rule that directs the DAG which is represented in the rulegraph """ rule all: input: - output = Benchmark("{benchmarking_dir}/evaluation/finished.csv".format(benchmarking_dir = benchmarking_dir)) - if bool(config['benchmark']) else [], - summary = expand( - 
"{output_dir}/{sample}/Prediction_Summary.tsv", - output_dir=config["output_dir"], - sample = samples), - predictions = report(expand( - "{output_dir}/{sample}/{tool}/{tool}_pred.csv", - tool=to_run, - output_dir=config["output_dir"], - sample = samples)), - plot_and_embeddings = expand("{output_dir}/{sample}/figures/{tools}.png", - sample = samples, - output_dir = config['output_dir'], tools = config['tools_to_run']) if bool(config['plots']) else [] - + expand(config["output_dir"] + "/{sample}/Prediction_Summary.tsv", + sample = samples), + expand(config["output_dir"] + "/{sample}/{tool}/{tool}_pred.csv", + tool = config['tools_to_run'], + sample = samples) """ - rule that gets the Consensus and concat all predictions +rule that gets the gets the interesction in genes between samples and reference +It outputs temporary reference and query datasets based on the common genes """ -rule plot: +rule preprocess: input: - summary = expand( - "{output_dir}/{sample}/Prediction_Summary.tsv", - output_dir = config["output_dir"], - sample = samples), - query = expand("{query}", query=config['query_datasets']), - output_dir = expand("{output_dir}/{sample}", sample = samples, output_dir=config['output_dir']) + reference = config['training_reference'], + query = config['query_datasets'] output: - expand("{output_dir}/{sample}/figures/{tools}.png", - sample = samples, - output_dir = config['output_dir'], tools = config['tools_to_run']) - - log: expand("{output_dir}/{sample}/plots.log", output_dir=config["output_dir"], sample = samples) + reference = config['output_dir'] + "/expression.csv", + query = expand(config['output_dir'] + "/{sample}/expression.csv", sample = samples) + params: + check_genes = bool(config['check_genes']), + genes_required = config['genes_required'] + log: config['output_dir'] + "/preprocess.log" + priority: 50 shell: - "Rscript Scripts/plot_preds.R " + "Rscript {workflow.basedir}/Scripts/preprocess.R " + "--ref {input.reference} " "--query {input.query} " - "--output_dir {input.output_dir} " - "&> {log}" - + "--output_dir {config[output_dir]} " + "--check_genes {params.check_genes} " + "--genes_required {params.genes_required} " + "&> {log} " """ - rule that gets the Consensus +rule that gets the Consensus """ rule concat: input: - results = expand("{output_dir}/{sample}/{tool}/{tool}_pred.csv", - tool=to_run, - output_dir=config["output_dir"], - sample = samples), - sample = expand("{output_dir}/{sample}", - output_dir=config["output_dir"], - sample = samples) - params: - consensus = config.get("consensus") + results = expand(config["output_dir"] + "/{sample}/{tool}/{tool}_pred.csv", + tool=config['tools_to_run'], + sample = samples), + sample = expand(config["output_dir"] + "/{sample}", + sample = samples) output: - report(expand("{output_dir}/{sample}/Prediction_Summary.tsv", - sample = samples, - output_dir = config['output_dir']), caption = "report/summaries.rst", category = "Predictions") - log: expand("{output_dir}/{sample}/Gatherpreds.log", output_dir=config["output_dir"], sample = samples) + expand(config["output_dir"] + "/{sample}/Prediction_Summary.tsv", + sample = samples) + log: + expand(config["output_dir"] + "/{sample}/Gatherpreds.log", sample = samples) shell: - "python3 Scripts/Gather_Preds.py " + "python3 {workflow.basedir}/Scripts/Gather_Preds.py " "-i {input.sample} " - "-c {params.consensus} " + "-c {config[consensus]} " "-k {input.sample} " "&> {log}" """ - rule that gets the gets the interesction in genes between samples and reference - It outputs temporary 
reference and query datasets based on the common genes -""" -rule preprocess: +Rules for R based tools. + + +#--------------------------------------------------------------------------- + +rule train_SingleR: input: - reference = config['training_reference'], - query = expand("{query}", query=config['query_datasets']), - output_dir = config['output_dir'], + reference = config['output_dir'] + "/expression.csv", + labfile = config['reference_annotations'] output: - reference_result = temp("{output_dir}/expression.csv".format(output_dir = config['output_dir'])), - query = temp(expand("{output_dir}/{sample}/expression.csv", output_dir=config["output_dir"], - sample = samples)) + model = config['output_dir'] + "/SingleR/SingleR_model.Rda" params: - check_genes = bool(config['check_genes']), - genes_required = genes_required - log: "{output_dir}/preprocess.log".format(output_dir=config['output_dir']) - priority: 50 + basedir = {workflow.basedir} + log: + config['output_dir'] + "/SingleR/SingleR.log" + benchmark: + config['output_dir'] + "/SingleR/SingleR_train_benchmark.txt" + threads: 1 + resources: shell: - "Rscript Scripts/preprocess.R " - "--ref {input.reference} " - "--query {input.query} " - "--output_dir {input.output_dir} " - "--check_genes {params.check_genes} " - "--genes_required {params.genes_required} " - "&> {log}" + """ + Rscript {params.basedir}/Scripts/SingleR/train_SingleR.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SingleR: + input: + query = config['output_dir'] + "/{sample}/expression.csv", + model = config['output_dir'] + "/SingleR/SingleR_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/SingleR/SingleR_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/SingleR/SingleR.log" + benchmark: + config['output_dir'] + "/{sample}/SingleR/SingleR_predict_benchmark.txt" + threads: 1 + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SingleR/predict_SingleR.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +rule train_scPred: + input: + reference = config['output_dir'] + "/expression.csv", + labfile = config['reference_annotations'] + output: + model_type = config['output_dir'] + "/scPred/scPred_model.Rda" + params: + basedir = {workflow.basedir}, + model = "svmRadial" + log: + config['output_dir'] + "/scPred/scPred.log" + benchmark: + config['output_dir'] + "/scPred/scPred_train_benchmark.txt" + threads: 1 + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/train_scPred.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + {params.model_type} \ + &> {log} + """ + +rule predict_scPred: + input: + query = config['output_dir'] + "/{sample}/expression.csv", + model = config['output_dir'] + "/scPred/scPred_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/scPred/scPred_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/scPred/scPred.log" + benchmark: + config['output_dir'] + "/{sample}/scPred/scPred_predict_benchmark.txt" + threads: 1 + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/predict_scPred.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ +#--------------------------------------------------------------------------- -""" -Rules for R based tools. 
-""" rule correlation: input: reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), labfile = config['reference_annotations'], query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) - output: pred = expand("{output_dir}/{sample}/correlation/correlation_pred.csv", sample = samples,output_dir=config["output_dir"]), query_time = expand("{output_dir}/{sample}/correlation/correlation_query_time.csv",sample = samples,output_dir=config["output_dir"]), training_time = expand("{output_dir}/{sample}/correlation/correlation_training_time.csv",sample = samples,output_dir=config["output_dir"]) log: expand("{output_dir}/{sample}/correlation/correlation.log", sample = samples,output_dir=config["output_dir"]) shell: - "Rscript Scripts/run_correlation.R " + "Rscript {workflow.basedir}/Scripts/run_correlation.R " "--ref {input.reference} " "--labs {input.labfile} " "--query {input.query} " @@ -212,26 +212,6 @@ rule SciBet: "--query {input.query} " "--output_dir {input.output_dir} " "&> {log}" - -rule scClassify: - input: - reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), - labfile = config['reference_annotations'], - query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), - output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) - - output: - pred = expand("{output_dir}/{sample}/scClassify/scClassify_pred.csv", sample = samples,output_dir=config["output_dir"]), - query_time = expand("{output_dir}/{sample}/scClassify/scClassify_query_time.csv",sample = samples,output_dir=config["output_dir"]), - training_time = expand("{output_dir}/{sample}/scClassify/scClassify_training_time.csv",sample = samples,output_dir=config["output_dir"]) - log: expand("{output_dir}/{sample}/scClassify/scClassify.log", sample = samples,output_dir=config["output_dir"]) - shell: - "Rscript Scripts/run_scClassify.R " - "--ref {input.reference} " - "--labs {input.labfile} " - "--query {input.query} " - "--output_dir {input.output_dir} " - "&> {log}" rule SingleCellNet: input: @@ -251,47 +231,6 @@ rule SingleCellNet: "--query {input.query} " "--output_dir {input.output_dir} " "&> {log}" - -rule scPred: - input: - reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), - labfile = config['reference_annotations'], - query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), - output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) - - output: - pred = expand("{output_dir}/{sample}/scPred/scPred_pred.csv", sample = samples,output_dir=config["output_dir"]), - query_time = expand("{output_dir}/{sample}/scPred/scPred_query_time.csv",sample = samples,output_dir=config["output_dir"]), - training_time = expand("{output_dir}/{sample}/scPred/scPred_training_time.csv",sample = samples,output_dir=config["output_dir"]) - log: expand("{output_dir}/{sample}/scPred/scPred.log", sample = samples,output_dir=config["output_dir"]) - shell: - "Rscript Scripts/run_scPred.R " - "--ref {input.reference} " - "--labs {input.labfile} " - "--query {input.query} " - "--output_dir {input.output_dir} " - "&> {log}" - -rule SingleR: - input: - reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), - labfile = config['reference_annotations'], 
- query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), - output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) - - output: - pred = expand("{output_dir}/{sample}/SingleR/SingleR_pred.csv", sample = samples,output_dir=config["output_dir"]), - query_time = expand("{output_dir}/{sample}/SingleR/SingleR_query_time.csv",sample = samples,output_dir=config["output_dir"]), - training_time = expand("{output_dir}/{sample}/SingleR/SingleR_training_time.csv",sample = samples,output_dir=config["output_dir"]) - - log: expand("{output_dir}/{sample}/SingleR/SingleR.log", sample = samples,output_dir=config["output_dir"]) - shell: - "Rscript Scripts/run_SingleR.R " - "--ref {input.reference} " - "--labs {input.labfile} " - "--query {input.query} " - "--output_dir {input.output_dir} " - "&> {log}" rule CHETAH: input: @@ -418,3 +357,44 @@ rule scHPL: "--query {input.query} " "--output_dir {input.output_dir} " "&> {log}" + +rule scPred: + input: + reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), + labfile = config['reference_annotations'], + query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), + output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) + + output: + pred = expand("{output_dir}/{sample}/scPred/scPred_pred.csv", sample = samples,output_dir=config["output_dir"]), + query_time = expand("{output_dir}/{sample}/scPred/scPred_query_time.csv",sample = samples,output_dir=config["output_dir"]), + training_time = expand("{output_dir}/{sample}/scPred/scPred_training_time.csv",sample = samples,output_dir=config["output_dir"]) + log: expand("{output_dir}/{sample}/scPred/scPred.log", sample = samples,output_dir=config["output_dir"]) + shell: + "Rscript Scripts/run_scPred.R " + "--ref {input.reference} " + "--labs {input.labfile} " + "--query {input.query} " + "--output_dir {input.output_dir} " + "&> {log}" + +rule SingleR: + input: + reference = "{output_dir}/expression.csv".format(output_dir =config['output_dir']), + labfile = config['reference_annotations'], + query = expand("{output_dir}/{sample}/expression.csv",sample = samples,output_dir=config['output_dir']), + output_dir = expand("{output_dir}/{sample}",sample = samples,output_dir=config['output_dir']) + + output: + pred = expand("{output_dir}/{sample}/SingleR/SingleR_pred.csv", sample = samples,output_dir=config["output_dir"]), + query_time = expand("{output_dir}/{sample}/SingleR/SingleR_query_time.csv",sample = samples,output_dir=config["output_dir"]), + training_time = expand("{output_dir}/{sample}/SingleR/SingleR_training_time.csv",sample = samples,output_dir=config["output_dir"]) + + log: expand("{output_dir}/{sample}/SingleR/SingleR.log", sample = samples,output_dir=config["output_dir"]) + shell: + "Rscript Scripts/run_SingleR.R " + "--ref {input.reference} " + "--labs {input.labfile} " + "--query {input.query} " + "--output_dir {input.output_dir} " + "&> {log}" diff --git a/Scripts/style/theme_min_SelinJessa.R b/Scripts/archive/style/theme_min_SelinJessa.R similarity index 100% rename from Scripts/style/theme_min_SelinJessa.R rename to Scripts/archive/style/theme_min_SelinJessa.R diff --git a/Scripts/benchmark/subset_folds.R b/Scripts/benchmark/subset_folds.R new file mode 100644 index 0000000..aa442c3 --- /dev/null +++ b/Scripts/benchmark/subset_folds.R @@ -0,0 +1,81 @@ +# load libraries and arguments +library(rBayesianOptimization) 
diff --git a/Scripts/calculate_consensus.R b/Scripts/calculate_consensus.R
new file mode 100644
index 0000000..31e5133
--- /dev/null
+++ b/Scripts/calculate_consensus.R
@@ -0,0 +1,63 @@
+
+library(tidyverse)
+
+args = commandArgs(trailingOnly = TRUE)
+pred_path = args[1]
+summary_path = args[2]
+tools = strsplit(args[3], split = ' ')[[1]]
+consensus_tools = strsplit(args[4], split = ' ')[[1]]
+ref_lab = args[5]
+
+print(tools)
+print(consensus_tools)
+
+if(consensus_tools[1] == 'all'){
+  consensus_tools = tools
+}
+
+print(consensus_tools)
+
+harmonize_unresolved = function(pred, ref_labels){
+  pred %>%
+    column_to_rownames('cellname') %>%
+    mutate(across(where(is.character), ~ifelse(. %in% c(ref_labels$label), ., 'Unresolved'))) %>%
+    rownames_to_column('cellname') %>%
+    return()
+}
+
+getmode = function(v) {
+  uniqv = unique(v)
+  matches = tabulate(match(v, uniqv))
+  max_match = max(matches)
+
+  ties = ifelse(length(which(matches == max_match)) > 1, T, F)
+
+  if (max_match == 1) {
+    return("No Consensus")
+  } else if (ties) {
+    return("No Consensus")
+  } else {
+    return(uniqv[which.max(matches)])
+  }
+}
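Editor's note: a few worked calls (with hypothetical label values) make the consensus rule implemented by getmode() concrete: a label wins only with a strict, unique plurality.

getmode(c("Astro", "Astro", "Oligo"))           # "Astro": unique plurality
getmode(c("Astro", "Astro", "Oligo", "Oligo"))  # "No Consensus": tied counts
getmode(c("Astro", "Oligo", "Neuron"))          # "No Consensus": every label occurs once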
+
+ref_labels = data.table::fread(ref_lab, header = T) %>% column_to_rownames('V1')
+
+files = list.files(pred_path, pattern = 'pred.csv', recursive = T, full.names = T)
+
+l = list()
+for(f in files){
+  l[[basename(dirname(f))]] = data.table::fread(f)
+}
+
+consensus = l %>% reduce(left_join, by = "cell") %>% rename('cellname' = 'cell')
+rm(l)
+
+tmp = harmonize_unresolved(consensus, ref_labels)
+tmp = tmp %>% select(all_of(consensus_tools))
+consensus$Consensus = apply(tmp, 1, getmode)
+rm(tmp)
+
+data.table::fwrite(consensus, summary_path, sep = '\t')
+
+
diff --git a/Scripts/preprocess.R b/Scripts/preprocess.R
index 46b87f9..8adbe43 100644
--- a/Scripts/preprocess.R
+++ b/Scripts/preprocess.R
@@ -1,98 +1,109 @@
-library(glue)
-library(data.table)
-
-preprocess_data<-function(RefPath, QueryPaths, OutputDir, check_genes = FALSE, genes_required = NULL){
-  "
-  run preproccessing, normalization, and writing down of matrices with common genes
-  Wrapper script to run preproccessing, normalization, and writing down of matrices with common genes between query and reference datasets
-  outputs lists of true and predicted cell labels as csv files, as well as computation time.
+
+library(tidyverse)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+query_paths = strsplit(args[2], split = ' ')[[1]]
+out = args[3]
+convert_genes = as.logical(args[4])
+lab_path = args[5]
+reference_name = args[6]
+
+l = list()
+
+# read reference
+l[['ref']] = data.table::fread(ref_path, header = T) %>% column_to_rownames('V1')
+
+# read query
+for(p in query_paths){
+  tmp = data.table::fread(p, header = T) %>% column_to_rownames('V1')
+  query = basename(dirname(p))
+  l[[query]] = tmp
+}
+
+# if specified by user, convert reference gene names from mouse to human
+if(convert_genes){
+
+  # include functions and libraries for conversion
+  library(Orthology.eg.db)
+  library(org.Mm.eg.db)
+  library(org.Hs.eg.db)
+  library(WGCNA)
+
+  mapfun = function(mousegenes){
+    gns = mapIds(org.Mm.eg.db, mousegenes, "ENTREZID", "SYMBOL")
+    mapped = AnnotationDbi::select(Orthology.eg.db, gns, "Homo_sapiens","Mus_musculus")
+    naind = is.na(mapped$Homo_sapiens)
+    hsymb = mapIds(org.Hs.eg.db, as.character(mapped$Homo_sapiens[!naind]), "SYMBOL", "ENTREZID")
+    out = data.frame(Mouse_symbol = mousegenes, mapped, Human_symbol = NA)
+    out$Human_symbol[!naind] = hsymb
+    return(out)
+  }
+
+  # convert
+  hg = mapfun(colnames(l[['ref']])) %>% dplyr::select(Mouse_symbol, Human_symbol)
-
-  Parameters
-  ----------
-  RefPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
-  as row names and gene names as column names.
-  LabelsPath : Cell population annotations (per cell) file path (.csv).
-  QueryPaths : List of Query datasets paths
-  OutputDir : Output directory defining the path of the exported files.
-  genes_required: path to file containing required genes in common feature space with column name genes_required
-  "
-
-  if(check_genes){
-    genes_required <- as.vector(read.csv(genes_required$genes_required))}
-
-  message("@Reading reference")
-  # read data and labels
-  # Read the reference expression matrix
-  Data <- fread(RefPath,data.table=FALSE)
-  message("@Done reading reference")
-
-  row.names(Data) <- Data$V1
-  Data <- Data[, 2:ncol(Data)]
-  colnames(Data) <- toupper(colnames(Data))
-
-  # set the genes of the reference to common_genes
-  common_genes <- colnames(Data)
-
-  # loop over all query datasets and get common genes with all
-  for(query in QueryPaths){
-    message("reading Test")
-    # Read Query
-    Test <- fread(query,data.table=FALSE)
-    row.names(Test) <- Test$V1
-    Test <- Test[, 2:ncol(Test)]
-    # Map gene names to upper
-    colnames(Test) <- toupper(colnames(Test))
-    # subset based on common genes
-    common_genes<- as.vector(intersect(common_genes, colnames(Test)))
+  # output list of mouse genes that were not converted
+  not_converted = hg %>% filter(is.na(Human_symbol)) %>% .$Mouse_symbol
+  data.table::fwrite(as.list(not_converted), file = paste0(out, '/model/', reference_name, '/genes_not_converted.csv'), sep = ',')
+
+  # throw error if more than threshold % genes not converted
+  threshold = 0.5
+  if(length(not_converted) > threshold*length(colnames(l[['ref']]))){
+    stop(paste0("@ More than ",threshold*100,"% of mouse genes in reference could not be converted to human"))
+  }
-  # check if the common genes
-  if(check_genes){
-    if(!(genes_required %in% common_genes)){
-      warning("Not all genes required in Gene list are found in the common genes between reference and query")
-    }}
-
-  # loop over query datasets and subset them to common genes and write it down in each query output dir
-  for(query in QueryPaths){
-    # Read Query
-    Test <- fread(query,data.table=FALSE)
-    row.names(Test) <- Test$V1
-    Test <- Test[, 2:ncol(Test)]
-    colnames(Test) <- toupper(colnames(Test))
-    Test <- Test[,common_genes]
-
-    # write down query expression matrices
-    dir.create(file.path(OutputDir, basename(dirname(query))), showWarnings = FALSE)
-    setwd(paste(OutputDir, basename(dirname(query)),sep= '/'))
-    fwrite(Test, "expression.csv",row.names=T)
-  }
-  Data <- Data[,common_genes ]
-  fwrite(Data, glue("{OutputDir}/expression.csv"),row.names=T)
-  fwrite(as.data.frame(common_genes), glue("{OutputDir}/common_genes.csv"))
+  # modify reference matrix to contain converted genes
+  l[['ref']] = l[['ref']] %>%
+    transposeBigData() %>%
+    rownames_to_column('Mouse_symbol') %>%
+    inner_join(hg %>% filter(!is.na(Human_symbol)),
+               by = 'Mouse_symbol') %>%
+    dplyr::select(-Mouse_symbol) %>%
+    column_to_rownames('Human_symbol') %>%
+    transposeBigData()
+
+}
+
+# get genes for each data frame (colnames)
+genes = lapply(l, function(x){(colnames(x))})
+
+# reduce set of genes to the intersect
+common_genes = Reduce(intersect,genes)
+
+# throw error if the number of common genes falls below the % threshold of genes in any provided dataset (ref or query)
+threshold = 0.5
+if(any(sapply(genes, function(x){length(common_genes) < threshold*length(x)}))){
+  frac = lapply(genes, function(x){length(common_genes)/length(x)})
+  names(frac) = names(l)
+  print(frac)
+  stop(paste0("@ In at least one provided dataset (ref or query), less than ",threshold*100,"% of genes appear in common gene set. See above for the fraction of genes from each dataset appearing in common gene set (note: samples with few genes will have higher fractions)"))
+}
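Editor's note: a small worked example of the check above, with hypothetical gene sets; the error triggers only if some dataset keeps less than half of its own genes in the common set.

genes = list(ref = paste0("g", 1:20000),
             sample1 = c(paste0("g", 1:12000), paste0("x", 1:3000)))
common_genes = Reduce(intersect, genes)
length(common_genes)                                         # 12000
sapply(genes, function(x){length(common_genes)/length(x)})   # ref: 0.60, sample1: 0.80
# both fractions are >= 0.5, so preprocessing would proceed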
-# Get Command line arguments
-args <- commandArgs(trailingOnly = TRUE)
-# Split the arguments to form lists
-arguments <- paste(unlist(args),collapse=' ')
-listoptions <- unlist(strsplit(arguments,'--'))[-1]
-# Get individual argument names
-options.args <- sapply(listoptions,function(x){
-         unlist(strsplit(x, ' '))[-1]
-        })
-options.names <- sapply(listoptions,function(x){
-  option <-  unlist(strsplit(x, ' '))[1]
-})
-
-# Set variables containing command line argument values
-names(options.args) <- unlist(options.names)
-ref <- unlist(options.args['ref'])
-test <- unlist(options.args['query'])
-output_dir <- unlist(options.args['output_dir' ])
-check_genes <- unlist(options.args['check_genes' ])
-genes_required <- unlist(options.args['genes_required' ])
-
-
-preprocess_data(ref, test, output_dir, check_genes, genes_required)
+# save common genes
+data.table::fwrite(data.frame('common_genes' = common_genes), paste0(out, '/model/', reference_name, '/common_genes.csv'))
+
+# filter each data set for common genes
+l = lapply(l, function(x){x[,common_genes]})
+
+# save reference
+tmp = l[['ref']] %>% rownames_to_column()
+colnames(tmp)[1] = " "
+data.table::fwrite(tmp, file = paste0(out, '/model/', reference_name, '/expression.csv'), sep = ',')
+
+# save query
+query_names = names(l)[!names(l) == 'ref']
+for(q in query_names){
+  print(q)
+  tmp = l[[q]] %>% rownames_to_column()
+  colnames(tmp)[1] = " "
+  data.table::fwrite(tmp, file = paste0(out, '/', q, '/', reference_name, '/expression.csv'), sep = ',')
+}
+
+# save unique labels (for downstream report color pal)
+lab = data.table::fread(lab_path, header = T) %>% column_to_rownames('V1')
+lab = data.frame(label = unique(lab$label))
+data.table::fwrite(lab, file = paste0(out, '/model/', reference_name, '/labels.csv'), sep = ',')
+
diff --git a/Scripts/scAnnotate/run_scAnnotate.R b/Scripts/scAnnotate/run_scAnnotate.R
new file mode 100644
index 0000000..377ab88
--- /dev/null
+++ b/Scripts/scAnnotate/run_scAnnotate.R
@@ -0,0 +1,103 @@
+# load libraries and arguments
+library(data.table)
+library(WGCNA)
+library(tidyverse)
+library(Seurat)
+library(glue)
+library(scAnnotate)
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+query_path = args[3]
+pred_path = args[4]
+threads = as.numeric(args[5])
+threshold = as.numeric(args[6])
+
+# path for other outputs (depends on tools)
+out_path = dirname(pred_path)
+
+#--------------- Data -------------------
+
+# read reference matrix and transpose
+message('@ READ REF')
+
+# The reference matrix is transposed since it needs to be normalized
+# with Seurat, which expects genes x cells.
+ref <- data.table::fread(ref_path,
+                         data.table=F,
+                         header=T,
+                         nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  transposeBigData()
+
+message('@ DONE')
+
+# read reference labels
+labels <- data.table::fread(lab_path,
+                            data.table=F,
+                            header=T,
+                            nThread=threads) %>%
+  column_to_rownames('V1')
+
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(colnames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# read query matrix and transpose
+message('@ READ QUERY')
+
+# The query matrix is transposed since it needs to be normalized
+# with Seurat, which expects genes x cells.
+query <- data.table::fread(query_path,
+                           data.table=F,
+                           header=T,
+                           nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  WGCNA::transposeBigData()
+
+message('@ DONE')
+
+# Query and References should have the same gene features
+common.genes <- intersect(rownames(query),rownames(ref))
+query <- query[common.genes,]
+ref <- ref[common.genes,]
+
+# Prepare the reference: Normalization
+ref <- NormalizeData(ref) %>% as.data.frame() %>% transposeBigData()
+
+# The label should be in the first column
+ref <- cbind(labels,ref)
+
+# Prepare the query: Normalization
+query <- NormalizeData(query) %>% as.data.frame() %>% transposeBigData()
+
+#------------- Train + Predict scAnnotate -------------
+
+# "auto" automatically selects the batch-effect correction method needed.
+pred <- scAnnotate(train=ref,
+                   test=query,
+                   distribution="normal", #Default
+                   correction ="auto",
+                   screening = "wilcox",
+                   threshold=threshold,
+                   lognormalized=TRUE)
+
+pred_labels <- data.frame(cell = rownames(query),
+                          scAnnotate = pred)
+
+message('@ WRITING PREDICTIONS')
+data.table::fwrite(pred_labels,
+                   file = pred_path,
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
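Editor's note: since the script above is driven purely by positional arguments, a usage sketch may help; all paths below are hypothetical placeholders, matching the six arguments parsed at the top of the script.

# Hypothetical invocation (placeholder paths):
#   Rscript Scripts/scAnnotate/run_scAnnotate.R \
#       out/model/ref1/expression.csv \
#       ref1_labels.csv \
#       out/sample1/ref1/expression.csv \
#       out/sample1/ref1/scAnnotate/scAnnotate_pred.csv \
#       4 \
#       0.5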
diff --git a/Scripts/scClassify/predict_scClassify.R b/Scripts/scClassify/predict_scClassify.R
new file mode 100644
index 0000000..51d8a49
--- /dev/null
+++ b/Scripts/scClassify/predict_scClassify.R
@@ -0,0 +1,78 @@
+# scClassify prediction script
+# Bhavyaa Chandarana, July 2023
+
+# load libraries and arguments
+library(data.table)
+library(Seurat)
+library(scClassify)
+library(tidyverse)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+query_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read query matrix
+message('@ READ QUERY')
+query <- data.table::fread(query_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# load model
+# object name "scClassify"
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# transpose (cells in columns) for Seurat normalization and scClassify, then normalize
+query <- query %>% transposeBigData() %>% Seurat::NormalizeData()
+
+#----------- Predict scClassify --------
+
+# specify parallelization configuration depending on number of threads
+if(threads > 1){
+
+  bpparam <- BiocParallel::MulticoreParam(workers = threads)
+  parallel <- TRUE
+
+} else {
+
+  bpparam <- BiocParallel::SerialParam()
+  parallel <- FALSE
+
+}
+
+# run prediction
+message('@ PREDICTING QUERY')
+pred <- predict_scClassify(
+  exprsMat_test = as.matrix(query),
+  trainRes = scClassify,
+  parallel = parallel,
+  BPPARAM = bpparam
+)
+
+# extract predictions table and format
+message('@ FORMATTING PREDICTIONS')
+pred_table <- pred$pearson_WKNN_limma$predRes %>% as.data.frame() %>% rownames_to_column()
+colnames(pred_table)[1] <- "cell"
+colnames(pred_table)[2] <- "scClassify"
+
+# write prediction
+message('@ SAVE PREDICTIONS')
+data.table::fwrite(pred_table, file = pred_path,
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
+
+#----------------------------------------
diff --git a/Scripts/scClassify/train_scClassify.R b/Scripts/scClassify/train_scClassify.R
new file mode 100644
index 0000000..9c5b6c3
--- /dev/null
+++ b/Scripts/scClassify/train_scClassify.R
@@ -0,0 +1,92 @@
+# scClassify training script
+# Bhavyaa Chandarana, July 2023
+
+# load libraries and arguments
+library(data.table)
+library(Seurat)
+library(scClassify)
+library(tidyverse)
+library(ggplot2)
+library(glue)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read reference matrix
+message('@ READ REF')
+ref <- data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames("V1")
+message('@ DONE')
+
+# read reference labels
+labels <- data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# check if cell names are in the same order in labels and ref
+order <- all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# transpose (cells in columns) for Seurat normalization and scClassify, then normalize
+ref <- ref %>% transposeBigData() %>% Seurat::NormalizeData()
+
+#------------- Train scClassify -------------
+
+# specify parallelization configuration depending on number of threads
+if(threads > 1){
+
+  bpparam <- BiocParallel::MulticoreParam(workers = threads)
+  parallel <- TRUE
+
+} else {
+
+  bpparam <- BiocParallel::SerialParam()
+  parallel <- FALSE
+
+}
+
+# train scClassify
+message('@ TRAINING MODEL')
+scClassify <- train_scClassify(
+  exprsMat_train = as.matrix(ref),
+  cellTypes_train = labels$label,
+  parallel = parallel,
+  BPPARAM = bpparam,
+  returnList = FALSE # so that cell type tree can be extracted
+)
+
+# save trained model
+message('@ SAVE MODEL')
+save(scClassify, file = model_path)
+message('@ DONE')
+
+#------------- Other outputs ----------------
+
+# save cell type tree produced in training
+message('@ SAVE TREE')
+tree <- cellTypeTree(scClassify)
+save(tree, file = glue("{out_path}/scClassify_tree.Rda"))
+message('@ DONE')
+
+# save plot of same cell type tree
+message('@ SAVE TREE PLOT')
+tree_plot <- plotCellTypeTree(tree)
+ggsave(tree_plot, file = glue("{out_path}/scClassify_tree.png"))
+message('@ DONE')
+
+#---------------------------------------------
diff --git a/Scripts/scHPL/predict_scHPL.py b/Scripts/scHPL/predict_scHPL.py
new file mode 100644
index 0000000..c01e972
--- /dev/null
+++ b/Scripts/scHPL/predict_scHPL.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 25 17:52:32 2023
+
+@author: tomas.vega
+"""
+## Python 3.11.2
+#--------------- Libraries -------------------
+import pandas as pd
+from scHPL import predict
+import anndata as ad
+import sys
+import os
+import scanpy as sc
+import pickle
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+sample_path = str(sys.argv[1])
+model_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+threshold = float(sys.argv[4])
+#threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+
+# read query matrix
+print('@ READ QUERY')
+query = pd.read_csv(sample_path,
+                    index_col=0,
+                    sep=',',
+                    engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many problems with dependencies
+
+print('@ DONE')
+
+# load model
+print('@ LOAD MODEL')
+scHPL_model = pickle.load(open(model_path, 'rb'))
+print('@ DONE')
+
+# Query preprocessing
+query = ad.AnnData(X = query,
+                   obs = dict(obs_names=query.index.astype(str)),
+                   var = dict(var_names=query.columns.astype(str))
+                   )
+
+# Now I normalize the matrix with scanpy:
+# Normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(query, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(query)
+
+#----------- Predict scHPL --------
+
+pred = predict.predict_labels(testdata= query.X,
+                              tree = scHPL_model,
+                              threshold = threshold)
+
+print('@ WRITING PREDICTIONS')
+
+pred_labels = pd.DataFrame({'cell': query.obs_names, 'scHPL': pred})
+pred_labels.to_csv(out_path, index = False)
+print('@ DONE')
+#----------------------------------------
diff --git a/Scripts/scHPL/train_scHPL.py b/Scripts/scHPL/train_scHPL.py
new file mode 100644
index 0000000..4d053d8
--- /dev/null
+++ b/Scripts/scHPL/train_scHPL.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 24 18:37:59 2023
+
+@author: tomas.vega
+"""
+## Python 3.11.2
+#--------------- Libraries -------------------
+import numpy as np
+import pandas as pd
+from scHPL import learn, train, utils
+from scHPL.utils import TreeNode, _print_node, _count_nodes
+from matplotlib import pyplot as plt
+import matplotlib.lines as mlines
+import anndata as ad
+import sys
+import scanpy as sc
+import pickle
+import os
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+ref_path = str(sys.argv[1])
+lab_path = str(sys.argv[2])
+out_path = str(sys.argv[3])
+out_other_path = os.path.dirname(str(sys.argv[3]))
+classifier = str(sys.argv[4])
+dimred = sys.argv[5].lower() == 'true' # bool() of any non-empty string is always True, so parse the flag explicitly
+
+#--------------- Data -------------------------
+# read the data
+ref = pd.read_csv(ref_path,
+                  index_col=0,
+                  sep=',',
+                  engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many problems with dependencies
+
+labels = pd.read_csv(lab_path,
+                     index_col = 0,
+                     sep=',')
+
+# check if cell names are in the same order in labels and ref
+order = all(labels.index == ref.index)
+
+# throw error if order is not the same
+if not order:
+    sys.exit("@ Order of cells in reference and labels do not match")
+
+adata = ad.AnnData(X = ref,
+                   obs = dict(obs_names=ref.index.astype(str)),
+                   var = dict(var_names=ref.columns.astype(str))
+                   )
+
+# Now I normalize the matrix with scanpy:
+# Normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+# 1e4 is similar to Seurat
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Logarithmize the data:
+sc.pp.log1p(adata)
+
+# A tree could be used as input; in that case, read_tree can be used.
+# The tree is a Newick-formatted file.
+# tree = utils.read_tree(treePath)
+
+# They provided this solution here: https://github.com/lcmmichielsen/scHPL/issues/7
+tree = utils.create_tree('root')
+tree = learn._construct_tree(tree, labels)
+
+#------------- Train scHPL -------------
+# classifier could be svm, svm_occ, knn
+# - the linear SVM when your integrated data still has a lot of
+# dimensions (e.g. when you have used Seurat to integrate the datasets)
+# - the kNN when your integrated data has fewer, 10-50, dimensions
+# (e.g. when you have used scVI or Harmony to integrate the datasets)
+# - the one-class SVM when your main focus is to find unseen cell
+# populations. A downside of the one-class SVM, however, is that the
+# classification performance drops.
+tree = train.train_tree(adata.X,
+                        labels.label,
+                        tree,
+                        classifier = classifier,
+                        dimred = dimred,
+                        useRE = True,
+                        FN = 0.5)
+
+# Save the model to disk
+print('@ SAVE MODEL')
+pickle.dump(tree, open(out_path, 'wb'))
+print('@ DONE')
+
+
+#------------- Other outputs --------------
+# Plot the tree
+# I'm using this method since they provided it here:
+# https://github.com/lcmmichielsen/scHPL/issues/5
+def _print_node(node, hor, ver_steps, fig, new_nodes):
+    global ver
+    # Add horizontal line
+    x, y = ([np.max([0.05, hor-0.045]), hor], [ver, ver])
+    line = mlines.Line2D(x,y, lw=1)
+    fig.add_artist(line)
+
+    # Add textbox
+    if np.isin(node.name[0], new_nodes):
+        txt = r"$\bf{" + node.name[0] + "}$"
+    else:
+        txt = node.name[0]
+
+    for n in node.name:
+        if(n != node.name[0]):
+            if np.isin(n, new_nodes):
+                txt = txt + ' & ' + r"$\bf{" + n + "}$"
+            else:
+                txt = txt + ' & ' + n
+
+    fig.text(hor,ver, txt, size=10,
+             ha = 'left', va='center',
+             bbox = dict(boxstyle='round', fc='w', ec='k'))
+
+    # Continue with child nodes
+    hor = hor+0.05
+    ver_line_start = ver
+    ver_line_end = ver
+
+    for i in node.descendants:
+        ver = ver-ver_steps
+        ver_line_end = ver
+        _print_node(i, hor, ver_steps, fig, new_nodes)
+
+    # Add vertical line
+    x, y = ([np.max([0.05, hor-0.045]), np.max([0.05, hor-0.045])],
+            [ver_line_start, ver_line_end])
+    line = mlines.Line2D(x,y, lw=1)
+    fig.add_artist(line)
+
+def print_tree(filepath,
+               tree: TreeNode,
+               new_nodes: list = []):
+    '''Print the tree
+    Parameters
+    ----------
+    tree : TreeNode
+        Tree to print
+    new_nodes : List = []
+        Nodes recently added to the tree, these are printed in bold
+
+    Returns
+    -------
+    None.
+    '''
+
+    global ver
+    ver = 0.93
+
+    count = _count_nodes(tree)
+    ver_steps = 0.9/count
+    plot_height = count*0.3
+    fig = plt.figure(figsize=(6,plot_height)) # This size is hard coded
+    ax = plt.subplot(111)
+
+    _print_node(tree[0], hor=0.05, ver_steps=ver_steps, fig=fig,
+                new_nodes = new_nodes)
+
+    plt.axis('off')
+    plt.savefig(filepath, dpi=1000)
+    plt.close()
+
+print('@ PLOTTING')
+filepath = out_other_path + '/tree.png'
+print_tree(filepath=filepath,
+           tree = tree)
+print('@ DONE')
diff --git a/Scripts/scID/run_scID.R b/Scripts/scID/run_scID.R
new file mode 100644
index 0000000..4bc63d5
--- /dev/null
+++ b/Scripts/scID/run_scID.R
@@ -0,0 +1,108 @@
+# load libraries and arguments
+library(data.table)
+library(WGCNA)
+library(tidyverse)
+library(glue)
+library(Seurat)
+library(scID)
+library(MAST)
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+query_path = args[3]
+pred_path = args[4]
+threads = as.numeric(args[5])
+
+# path for other outputs (depends on tools)
+out_path = dirname(pred_path)
+
+#--------------- Data -------------------
+
+# read reference matrix and transpose
+message('@ READ REF')
+
+# The reference matrix is transposed since it needs to be normalized
+# with Seurat, which expects genes x cells.
+ref <- data.table::fread(ref_path,
+                         data.table=F,
+                         header=T,
+                         nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  transposeBigData()
+
+message('@ DONE')
+
+# read reference labels
+labels <- data.table::fread(lab_path,
+                            data.table=F,
+                            header=T,
+                            nThread=threads) %>%
+  column_to_rownames('V1')
+
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(colnames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# read query matrix and transpose
+message('@ READ QUERY')
+
+# The query matrix is transposed since it needs to be normalized
+# with Seurat, which expects genes x cells.
+query <- data.table::fread(query_path,
+                           data.table=F,
+                           header=T,
+                           nThread=threads) %>%
+  column_to_rownames('V1') %>%
+  WGCNA::transposeBigData()
+
+message('@ DONE')
+
+# Prepare the reference: Normalization
+ref <- scID:::counts_to_cpm(counts_gem = ref)
+
+# Labels as named vector
+label <- setNames(object = labels$label,nm = rownames(labels))
+
+# Prepare the query: Normalization
+query <- scID:::counts_to_cpm(counts_gem = query)
+
+
+#----------- Train + Predict scID --------
+pred <- scid_multiclass(target_gem = query,
+                        reference_gem = ref,
+                        reference_clusters = label,
+                        logFC = 0.5, #Default
+                        only_pos = FALSE, #Default
+                        normalize_reference = FALSE, #I already normalized the reference
+                        estimate_weights_from_target = FALSE #Default
+                        )
+
+pred_labels <- data.frame(cell = names(pred$labels),
+                          scID = as.character(pred$labels)
+                          )
+
+message('@ WRITING PREDICTIONS')
+data.table::fwrite(pred_labels,
+                   file = pred_path,
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
+
+#------------- Other outputs --------------
+# Save the entire output
+message('@ SAVE scID OUTPUT')
+save(pred,
+     file = glue('{out_path}/scID_output.Rdata')
+     )
+
+message('@ DONE')
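Editor's note: a tiny sketch (toy values, not real data) of the named-vector format that scid_multiclass() expects for reference_clusters, as built above with setNames(): names are cell IDs, values are labels.

toy = data.frame(label = c("Astro", "Oligo"), row.names = c("cell1", "cell2"))
setNames(object = toy$label, nm = rownames(toy))
#   cell1   cell2
# "Astro" "Oligo"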
diff --git a/Scripts/scLearn/predict_scLearn.R b/Scripts/scLearn/predict_scLearn.R
new file mode 100644
index 0000000..2c839af
--- /dev/null
+++ b/Scripts/scLearn/predict_scLearn.R
@@ -0,0 +1,72 @@
+# load libraries and arguments
+library(tidyverse)
+library(M3Drop)
+library(scLearn)
+library(glue)
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+query_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(pred_path)
+
+#--------------- Data -------------------
+
+# read query matrix
+message('@ READ QUERY')
+query = data.table::fread(query_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# transpose query data so cells are on columns
+# Normalizing with Seurat, in order to not filter out any cells.
+query <- query %>% WGCNA::transposeBigData() %>% Seurat::NormalizeData()
+
+#----------- Predict scLearn --------
+
+message('@ PREDICT LABELS')
+
+# From documentation: "Assignment with trained model above. To get a less strict result for "unassigned" cells, you can decrease "diff" and "vote_rate". If you are sure that the cell type of query cells must be in the reference dataset, you can set "threshold_use" as FALSE. It means you don't want to use the thresholds learned by scLearn."
+scLearn_predict_result <- scLearn_cell_assignment(scLearn_model_learning_result = scLearn,
+                                                  expression_profile_query = query,
+                                                  diff = 0.05,
+                                                  threshold_use = TRUE,
+                                                  vote_rate = 0.6)
+
+message('@ DONE')
+
+
+pred_labs = data.frame(cell = scLearn_predict_result$Query_cell_id,
+                       scLearn = scLearn_predict_result$Predict_cell_type)
+
+# write prediction
+message('@ WRITE PREDICTIONS')
+data.table::fwrite(pred_labs,
+                   file = pred_path,
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
+#----------------------------------------
+
+# Write the entire output, with the scores, as additional information
+message('@ WRITE TABLE OUTPUT WITH ALL PROBABILITIES')
+data.table::fwrite(scLearn_predict_result,
+                   file = glue('{out_path}/table_with_probabilities.csv'),
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
+message('@ DONE')
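Editor's note: following the documentation quoted in the script above, a hypothetical looser configuration would lower `diff` and `vote_rate`, or disable the learned thresholds when the query is known to contain only reference cell types; the values below are illustrative, not recommendations.

scLearn_predict_loose <- scLearn_cell_assignment(scLearn_model_learning_result = scLearn,
                                                 expression_profile_query = query,
                                                 diff = 0.01,            # lower than the 0.05 used above: fewer "unassigned" cells
                                                 threshold_use = FALSE,  # skip the thresholds learned by scLearn
                                                 vote_rate = 0.3)        # lower than the 0.6 used above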
diff --git a/Scripts/scLearn/train_scLearn.R b/Scripts/scLearn/train_scLearn.R
new file mode 100644
index 0000000..2d19c93
--- /dev/null
+++ b/Scripts/scLearn/train_scLearn.R
@@ -0,0 +1,76 @@
+# load libraries and arguments
+library(tidyverse)
+library(M3Drop)
+library(scLearn)
+library(Seurat)
+library(glue)
+set.seed(1234)
+
+#---------- Parameters -------------------
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read reference matrix
+message('@ READ REF')
+ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# read reference labels
+labels = data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# change labels format to named vector (names = cell IDs, values = labels)
+labels <- setNames(nm=rownames(labels),
+                   object = labels$label)
+
+
+
+# Preprocessing
+# transpose reference so cells are on columns
+# Because we don't want to filter out any cells, we only run the normalization step, as the authors do.
+ref <- ref %>% WGCNA::transposeBigData() %>% Seurat::NormalizeData()
+
+# Select genes
+high_varGene_names <- Feature_selection_M3Drop(expression_profile = ref,
+                                               log_normalized = T #True because it was previously normalized
+                                               )
+#------------- Train scLearn -------------
+
+# From documentation: "training the model. To improve the accuracy for "unassigned" cell, you can increase "bootstrap_times", but it will takes longer time. The default value of "bootstrap_times" is 10."
+scLearn <- scLearn_model_learning(high_varGene_names = high_varGene_names,
+                                  expression_profile = as.matrix(ref), #It needs a matrix, not a sparse matrix.
+                                  sample_information_cellType = labels,
+                                  bootstrap_times = 10 #Default
+                                  )
+
+# save trained model
+message('@ SAVE MODEL')
+save(scLearn,
+     file = model_path)
+message('@ DONE')
+
+#---------------------------------------------
+
+data.table::fwrite(data.frame(Selected_genes = scLearn$high_varGene_names),
+                   file = glue('{out_path}/selected_genes.csv'),
+                   row.names = F,
+                   col.names = T,
+                   sep = ",",
+                   nThread = threads)
diff --git a/Scripts/scNym/run_scNym.py b/Scripts/scNym/run_scNym.py
new file mode 100644
index 0000000..d34690f
--- /dev/null
+++ b/Scripts/scNym/run_scNym.py
@@ -0,0 +1,137 @@
+#--------------- Libraries -------------------
+import pandas as pd
+from scnym.api import scnym_api
+import anndata as ad
+import sys
+import scanpy as sc
+import os
+import random
+
+# Set seed
+random.seed(123456)
+
+#--------------- Parameters -------------------
+ref_path = str(sys.argv[1])
+lab_path = str(sys.argv[2])
+sample_path = str(sys.argv[3])
+out_path = str(sys.argv[4])
+threshold = float(sys.argv[5])
+out_other_path = os.path.dirname(str(sys.argv[4]))
+
+#--------------- Data -------------------------
+# read the data
+ref = pd.read_csv(ref_path,
+                  index_col=0,
+                  sep=',',
+                  engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many problems with dependencies
+
+labels = pd.read_csv(lab_path,
+                     index_col = 0,
+                     sep=',')
+
+# check if cell names are in the same order in labels and ref
+order = all(labels.index == ref.index)
+
+# throw error if order is not the same
+if not order:
+    sys.exit("@ Order of cells in reference and labels do not match")
+
+print('@ READ QUERY')
+query = pd.read_csv(sample_path,
+                    index_col=0,
+                    sep=',',
+                    engine='c') ## could be pyarrow to make it faster, but it needs to be installed in the module and has many problems with dependencies
+
+print('@ DONE')
+
+#------------- Preparing input -------------
+
+# Normalizing reference
+ad_ref = ad.AnnData(X = ref,
+                    obs = dict(obs_names=ref.index.astype(str),
+                               label = labels['label']),
+                    var = dict(var_names=ref.columns.astype(str))
+                    )
+
+# Now I normalize the matrix with scanpy:
+# Normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization.
+sc.pp.normalize_total(ad_ref, target_sum=1e6)
+
+# Logarithmize the data:
+sc.pp.log1p(ad_ref)
+sc.pp.filter_genes(ad_ref, min_cells=10)
+
+# Normalizing query
+ad_query = ad.AnnData(X = query,
+                      obs = dict(obs_names=query.index.astype(str)),
+                      var = dict(var_names=query.columns.astype(str))
+)
+
+# Now I normalize the matrix with scanpy:
+# Normalize each cell by total counts over all genes,
+# so that every cell has the same total count after normalization.
+# If choosing `target_sum=1e6`, this is CPM normalization
+sc.pp.normalize_total(ad_query, target_sum=1e6)
+
+# Logarithmize the data:
+sc.pp.log1p(ad_query)
+sc.pp.filter_genes(ad_query, min_cells=10)
+
+# Any cell with the annotation "Unlabeled" will be treated as part of the target
+# dataset and used for semi-supervised and adversarial training.
+ad_query.obs["label"] = "Unlabeled"
+
+
+#------------- Train scNym -------------
+
+# Concatenate
+adata = ad_ref.concatenate(ad_query)
+file_model = out_other_path + '/model'
+scnym_api(
+    adata=adata,
+    task='train',
+    groupby='label',
+    out_path=file_model,
+    config='new_identity_discovery',
+)
+
+file_pred = out_other_path + '/predict'
+scnym_api(
+    adata=adata,
+    task='predict',
+    key_added='scNym',
+    trained_model=file_model,
+    out_path=file_pred,
+    config='new_identity_discovery',
+)
+
+adata.obs['pred_label_reject'] = adata.obs.apply(lambda row: 'Unknown' if row['scNym_confidence'] < threshold else row['scNym'], axis=1)
+def remove_batch(cell_id, batch):
+    return cell_id.replace(f'-{batch}', '')
+
+adata.obs['cell_id'] = adata.obs.index.map(lambda cell_id: remove_batch(cell_id, adata.obs.loc[cell_id, 'batch']))
+
+# Get only the query
+df = adata.obs[adata.obs["label"] == "Unlabeled"]
+print('@ WRITING PREDICTIONS')
+pred_df = pd.DataFrame({'cell': df.cell_id, "scNym": df.pred_label_reject})
+pred_df.to_csv(out_path, index = False)
+print('@ DONE')
+
+#------------- Other outputs --------------
+
+# Save data.frame output
+print('@ WRITING DATA FRAME OUTPUT ')
+filename = out_other_path + '/whole_df_output.csv'
+df.to_csv(filename,
+          index=True) #True because we want to conserve the rownames (cells)
+print('@ DONE ')
+
+# Save the output object that contains the per-cell prediction probabilities
+print('@ WRITING OUTPUT OBJECT ')
+filename = out_other_path + '/scNym_output_object.h5ad'
+adata.write_h5ad(filename= filename,
+                 compression='gzip')
+print('@ DONE ')
diff --git a/Scripts/scPred/predict_scPred.R b/Scripts/scPred/predict_scPred.R
new file mode 100644
index 0000000..a0f7b43
--- /dev/null
+++ b/Scripts/scPred/predict_scPred.R
@@ -0,0 +1,52 @@
+# load libraries and arguments
+library(scPred)
+library(Seurat)
+library(tidyverse)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+query_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+
+# read query matrix
+message('@ READ QUERY')
+query = data.table::fread(query_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# transpose and create seurat object
+query = transposeBigData(query)
+query = CreateSeuratObject(query)
+
+# normalize query
+query = query %>%
+  NormalizeData()
+
+#----------- Predict scPred -------------
+
+# predict cells
+message('@ PREDICT LABELS')
+query = scPredict(query, scpred)
+message('@ DONE')
+
+head(colnames(query))
+head(query$scpred_prediction)
+
+pred_labs = data.frame(cell = colnames(query),
+                       scPred = query$scpred_prediction)
+
+# write prediction
+data.table::fwrite(pred_labs, file = pred_path)
+
+#----------------------------------------
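Editor's note: in the training script that follows, `model_type` (argument 5) is forwarded untouched to scPred::trainModel(), which takes caret model names; to our knowledge "svmRadial" is scPred's default, and the scPred documentation shows alternatives such as "mda". A hypothetical sketch:

# scpred <- trainModel(ref, model = "mda", allowParallel = TRUE)  # assumed alternative classifier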
diff --git a/Scripts/scPred/train_scPred.R b/Scripts/scPred/train_scPred.R
new file mode 100644
index 0000000..f5cae9c
--- /dev/null
+++ b/Scripts/scPred/train_scPred.R
@@ -0,0 +1,76 @@
+# load libraries and arguments
+library(scPred)
+library(Seurat)
+library(tidyverse)
+library(doParallel)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+model_type = args[5]
+
+# path for other outputs (depends on tools)
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read reference matrix
+message('@ READ REF')
+ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# read reference labels
+labels = data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# transpose and create seurat object with the labels as meta data
+ref = transposeBigData(ref)
+ref = CreateSeuratObject(ref, meta.data = labels)
+
+# normalize reference
+ref = ref %>%
+  NormalizeData() %>%
+  FindVariableFeatures() %>%
+  ScaleData() %>%
+  RunPCA() %>%
+  RunUMAP(dims = 1:30)
+
+# create scPred object stored in the @misc slot of reference
+ref = getFeatureSpace(ref, "label")
+
+#------------- Train scPred -------------
+
+# train model (parallelized)
+cl = makePSOCKcluster(threads)
+registerDoParallel(cl)
+scpred = trainModel(ref, model = model_type, allowParallel = T)
+stopCluster(cl)
+
+# print model info
+get_scpred(scpred)
+
+# save trained model
+message('@ SAVE MODEL')
+save(scpred, file = model_path)
+message('@ DONE')
+
+# Plot probabilities
+pdf(paste0(out_path, '/qc_plots.pdf'), width=10, height=10)
+plot_probabilities(scpred)
+dev.off()
+
+#---------------------------------------------
diff --git a/Scripts/singleCellNet/predict_singleCellNet.R b/Scripts/singleCellNet/predict_singleCellNet.R
new file mode 100644
index 0000000..8e5a694
--- /dev/null
+++ b/Scripts/singleCellNet/predict_singleCellNet.R
@@ -0,0 +1,52 @@
+library(tidyverse)
+library(Seurat)
+library(singleCellNet)
+library(data.table)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+sample_path = args[1]
+model_path = args[2]
+out_path = args[3]
+threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+
+# read query matrix and transpose
+message('@ READ QUERY')
+query = data.table::fread(sample_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# Get cell names
+cellnames = row.names(query)
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# Transpose query
+query = transposeBigData(query, blocksize = 10000)
+
+#----------- Predict singleCellNet ------------
+
+# predict labels
+message('@ PREDICT LABELS')
+# Default nrand = 50, Hussein had used nrand = 0
+crPBMC = scn_predict(class_info[['cnProc']], query, nrand = 0)
+message('@ DONE')
+
+# classify cells
+stQuery = assign_cate(classRes = crPBMC, sampTab = data.frame(row.names = cellnames), cThresh = 0.5)
+
+pred_labs = data.frame(cell = rownames(stQuery),
+                       singleCellNet = stQuery$category)
+
+# write prediction
+data.table::fwrite(pred_labs, file = out_path)
+
+#----------------------------------------
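Editor's note: a usage sketch for the prediction script above; paths are hypothetical placeholders matching the four positional arguments it parses (query expression, trained model, output prediction file, threads).

# Hypothetical invocation (placeholder paths):
#   Rscript Scripts/singleCellNet/predict_singleCellNet.R \
#       out/sample1/ref1/expression.csv \
#       out/model/ref1/singleCellNet/singleCellNet_model.Rda \
#       out/sample1/ref1/singleCellNet/singleCellNet_pred.csv \
#       4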
diff --git a/Scripts/singleCellNet/train_singleCellNet.R b/Scripts/singleCellNet/train_singleCellNet.R
new file mode 100644
index 0000000..b95129c
--- /dev/null
+++ b/Scripts/singleCellNet/train_singleCellNet.R
@@ -0,0 +1,71 @@
+# load libraries
+library(tidyverse)
+library(data.table)
+library(Seurat)
+library(singleCellNet)
+library(WGCNA)
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+out_path = args[3]
+threads = as.numeric(args[4])
+
+#--------------- Data -------------------
+# read reference matrix and transpose
+message('@ READ REF')
+ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# read reference labels
+labels = data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# make Seurat object (transpose ref first)
+ref = transposeBigData(ref, blocksize = 10000)
+seurat = CreateSeuratObject(counts = ref, meta.data = labels)
+
+#------------- Train singleCellNet -------------
+
+# Split reference data for training and assessment
+# Default ncells = 50, Hussein had used ncells = 80 in scCoAnnotate V1
+stList = splitCommon(sampTab = seurat@meta.data, ncells = 50, dLevel = "label")
+# Get the downsampled list
+stTrain = stList[[1]]
+# Get the corresponding expression matrix for the training cells
+expTrain = as.matrix(GetAssayData(seurat))[,row.names(stTrain)]
+
+# Train singleCellNet
+# Default uses nTopGenes = 10, nTrees = 1000
+# Hussein had used nTopGenes = 12, nTrees = 350 in scCoAnnotate V1 and nTrees = 500 in his thesis
+message('@ TRAINING MODEL')
+class_info = scn_train(stTrain = stTrain,
+                       expTrain = expTrain,
+                       nTopGenes = 10,
+                       nRand = 70,
+                       nTrees = 500,
+                       nTopGenePairs = 25,
+                       dLevel = "label")
+message('@ DONE')
+
+# save trained model
+message('@ SAVE MODEL')
+save(class_info, file = out_path)
+message('@ DONE')
+
+#----------------------------------------
+
+
+
+
diff --git a/Templates/predict_template.R b/Templates/predict_template.R
new file mode 100644
index 0000000..1eb3e27
--- /dev/null
+++ b/Templates/predict_template.R
@@ -0,0 +1,40 @@
+# load libraries and arguments
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+query_path = args[1]
+model_path = args[2]
+pred_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(pred_path)
+
+#--------------- Data -------------------
+
+# read query matrix
+message('@ READ QUERY')
+query = data.table::fread(query_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# load model
+message('@ LOAD MODEL')
+load(model_path)
+message('@ DONE')
+
+# CODE FOR ANY DATA MANIPULATION NEEDED BEFORE PREDICTION HERE
+
+#----------- Predict --------
+
+# CODE FOR PREDICTING HERE
+message('@ PREDICT LABELS')
+
+
+message('@ DONE')
+
+# write prediction
+data.table::fwrite(, file = pred_path)
+
+#----------------------------------------
diff --git a/Templates/train_template.R b/Templates/train_template.R
new file mode 100644
index 0000000..fb44e8c
--- /dev/null
+++ b/Templates/train_template.R
@@ -0,0 +1,46 @@
+# load libraries and arguments
+
+set.seed(1234)
+
+args = commandArgs(trailingOnly = TRUE)
+ref_path = args[1]
+lab_path = args[2]
+model_path = args[3]
+threads = as.numeric(args[4])
+
+# path for other outputs (depends on tools)
+out_path = dirname(model_path)
+
+#--------------- Data -------------------
+
+# read reference matrix
+message('@ READ REF')
+ref = data.table::fread(ref_path, nThread=threads, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+message('@ DONE')
+
+# read reference labels
+labels = data.table::fread(lab_path, header=T, data.table=F) %>%
+  column_to_rownames('V1')
+
+# check if cell names are in the same order in labels and ref
+order = all(as.character(rownames(labels)) == as.character(rownames(ref)))
+
+# throw error if order is not the same
+if(!order){
+  stop("@ Order of cells in reference and labels do not match")
+}
+
+# CODE FOR ANY DATA MANIPULATION NEEDED BEFORE TRAINING HERE
+
+
+#------------- Train -------------
+
+# CODE FOR TRAINING HERE
+
+# save trained model
+message('@ SAVE MODEL')
+save(, file = model_path)
+message('@ DONE')
+
+#---------------------------------------------
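Editor's note: to make the two template slots concrete, here is a hypothetical filled-in training body for an imaginary tool; myTool_train() is an assumed API, not a real package, and the manipulation step simply mirrors what the R scripts above do.

# CODE FOR ANY DATA MANIPULATION NEEDED BEFORE TRAINING HERE
# (hypothetical: transpose so cells are on columns, then log-normalize)
ref = ref %>% WGCNA::transposeBigData() %>% Seurat::NormalizeData()

#------------- Train -------------

# CODE FOR TRAINING HERE (myTool_train() is a hypothetical stand-in)
message('@ TRAIN MODEL')
myTool = myTool_train(expr = as.matrix(ref), labels = labels$label)
message('@ DONE')

# save trained model
message('@ SAVE MODEL')
save(myTool, file = model_path)
message('@ DONE')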
diff --git a/config.yml b/config.yml
deleted file mode 100644
index 523acea..0000000
--- a/config.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-# target directory
-output_dir: /project/kleinman/hussein.lakkis/from_hydra/test
-# path to reference to train classifiers on (cell x gene raw counts)
-training_reference: /project/kleinman/hussein.lakkis/from_hydra/2022_01_10-Datasets/Cortex/p0/expression.csv
-# path to annotations for the reference (csv file with cellname and label headers)
-reference_annotations: /project/kleinman/hussein.lakkis/from_hydra/2022_01_10-Datasets/Cortex/p0/labels.csv
-# path to query datasets (cell x gene raw counts)
-query_datasets:
-    - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/BT2016062/expression.csv
-    - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/P-1694_S-1694_multiome/expression.csv
-    - /project/kleinman/hussein.lakkis/from_hydra/Collab/HGG_Selin_Revision/test/P-1701_S-1701_multiome/expression.csv
-# step to check if some genes are kept in the common gene space between ref and query
-check_genes: False
-# path for the genes required
-genes_required: Null
-# rejection option for SVM
-rejection: True
-# classifiers to run
-tools_to_run:
-    - ACTINN
-    - scHPL
-    - scClassify
-    - correlation
-    - scmapcluster
-    - scPred
-    - SingleCellNet
-    - SVM_reject
-    - SingleR
-    - CHETAH
-    - scmapcell
-    - SciBet
-# benchmark tools on reference
-benchmark: False
-plots: True
-consensus:
-    - all
-
-
-
-
-
-
diff --git a/example.config.yml b/example.config.yml
new file mode 100644
index 0000000..379fa7c
--- /dev/null
+++ b/example.config.yml
@@ -0,0 +1,47 @@
+# Example config for scCoAnnotate annotation and benchmarking workflow
+
+# target directory
+output_dir: /lustre06/project/6004736/alvann/from_narval/DEV/test-scCoAnnotate-dev
+output_dir_benchmark: /lustre06/project/6004736/alvann/from_narval/DEV/test-scCoAnnotate-dev/benchmark
+
+# path to reference to train classifiers on (cell x gene raw counts)
+references:
+  braindex1:
+    expression: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/ref/expression.csv
+    labels: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/ref/labels.csv
+  braindex2:
+    expression: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/ref/expression.csv
+    labels: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/ref/labels.csv
+
+# path to query datasets (cell x gene raw counts)
+query_datasets:
+  ct_p3: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/samples/ct_p3/expression.csv
+  ct_p0: /lustre06/project/6004736/alvann/from_narval/DEV/reference-dev/Braindex_20210710/samples/ct_p0/expression.csv
+
+# convert gene symbols in reference from mouse to human
+convert_ref_mm_to_hg: False
+
+# classifiers to run
+tools_to_run:
+  - scPred
+  - SingleR
+  - scClassify
+  - SciBet
+  - singleCellNet
+  - scHPL
+  - SVMlinear
+  - Correlation
+  - scLearn
+  - ACTINN
+  - scID
+  - scAnnotate
+  - scNym
+  - CellTypist
+
+consensus_tools:
+  - all
+
+# Benchmark parameters
+benchmark:
+  n_folds: 10
+
diff --git a/example.submit.sh b/example.submit.sh
new file mode 100644
index 0000000..d9e14ac
--- /dev/null
+++ b/example.submit.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#SBATCH --job-name=scCoAnnotate
+#SBATCH --account=rrg-kleinman
+#SBATCH --output=logs/%x.out
+#SBATCH --error=logs/%x.err
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=5
+#SBATCH --time=24:00:00
+#SBATCH --mem-per-cpu=60GB
+
+module load scCoAnnotate/2.0
+
+# path to snakefile and config
+snakefile=
+config=
+
+# unlock directory in case of previous errors
+snakemake -s ${snakefile} --configfile ${config} --unlock
+
+# run workflow
+snakemake -s ${snakefile} --configfile ${config} --cores 5
+
diff --git a/rulegraph.annotation.pdf b/rulegraph.annotation.pdf
new file mode 100644
index 0000000..dd47b88
Binary files /dev/null and b/rulegraph.annotation.pdf differ
diff --git a/rulegraph.pdf b/rulegraph.pdf
deleted file mode 100644
index d17bc7e..0000000
Binary files a/rulegraph.pdf and /dev/null differ
diff --git a/scCoAnnotate_workflow.drawio.png b/scCoAnnotate_workflow.drawio.png
new file mode 100644
index 0000000..1770ed4
Binary files /dev/null and b/scCoAnnotate_workflow.drawio.png differ
diff --git a/setup.R b/setup.R
deleted file mode 100644
index c56102d..0000000
--- a/setup.R
+++ /dev/null
@@ -1,3 +0,0 @@
-list.of.packages <- c("ggplot2", "scibet", "Seurat", "singleCellNet", "viridis", "ggsci", "SingleR", "dplyr")
-new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
-if(length(new.packages)) install.packages(new.packages)
diff --git a/snakefile.annotate b/snakefile.annotate
new file mode 100644
index 0000000..705c33c
--- /dev/null
+++ b/snakefile.annotate
@@ -0,0 +1,886 @@
+
+#----------------------------------------------------
+# Setup
+#----------------------------------------------------
+
+configfile: workflow.basedir + "/Config/config.default.yml"
+
+# import libraries
+import os
+from datetime import datetime
+
+# Get the names of query samples from the paths given in the query section of the config
+samples = config['query_datasets']
+
+now = datetime.now()
+dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
+
+#----------------------------------------------------
+# Final rule all
+#----------------------------------------------------
+
+rule all:
+    input:
+        expand(config['output_dir'] + '/{sample}/report/{sample}.prediction_report.' + dt_string + '.html',
+               sample = samples)
+
+#----------------------------------------------------
+# Preprocess
+#----------------------------------------------------
+
+rule preprocess:
+    input:
+        reference = lambda wildcards:config['references'][wildcards.reference]['expression'],
+        labfile = lambda wildcards:config['references'][wildcards.reference]['labels'],
+        query = config['query_datasets'].values()
+    output:
+        reference = config['output_dir'] + "/model/{reference}/expression.csv",
+        query = expand(config['output_dir'] + "/{sample}/{{reference}}/expression.csv", sample = samples)
+    log:
+        config['output_dir'] + "/model/{reference}/preprocess.log"
+    params:
+        basedir = {workflow.basedir},
+        out = config['output_dir'],
+        convert_genes = config['convert_ref_mm_to_hg'],
+        reference_name = "{reference}"
+    shell:
+        """
+        Rscript {params.basedir}/Scripts/preprocess.R \
+        {input.reference} \
+        "{input.query}" \
+        {params.out} \
+        {params.convert_genes} \
+        {input.labfile} \
+        {params.reference_name} \
+        &> {log}
+        """
+
+#----------------------------------------------------
+# Consensus
+#----------------------------------------------------
+
+rule consensus:
+    input:
+        results = expand(config["output_dir"] + "/{{sample}}/{{reference}}/{tool}/{tool}_pred.csv",
+                         tool=config['tools_to_run'])
+    output:
+        prediction_summary = config["output_dir"] + "/{sample}/{reference}/Prediction_Summary.tsv"
+    log:
+        config["output_dir"] + "/{sample}/{reference}/Gatherpreds.log"
+    params:
+        basedir = {workflow.basedir},
+        tools = config['tools_to_run'],
+        consensus_tools = config['consensus_tools'],
+        labfile = lambda wildcards:config['references'][wildcards.reference]['labels'],
+        sample = config["output_dir"] + "/{sample}/{reference}/"
+    shell:
+        """
+        Rscript {params.basedir}/Scripts/calculate_consensus.R \
+        {params.sample} \
+        {output.prediction_summary} \
+        "{params.tools}" \
+        "{params.consensus_tools}" \
+        {params.labfile} \
+        &> {log}
+        """
+
+#----------------------------------------------------
+# Knit Report
+#----------------------------------------------------
+
+rule knit_report:
+    input:
+        pred = expand(config['output_dir'] + "/{{sample}}/{reference}/Prediction_Summary.tsv", reference = config['references']),
+        query = lambda wildcards:config['query_datasets'][wildcards.sample]
+    output:
+        report_path = config['output_dir'] + '/{sample}/report/{sample}.prediction_report.' + dt_string + '.html'
+ dt_string + '.html' + log: + config['output_dir'] + "/{sample}/report/report.log" + params: + basedir = {workflow.basedir}, + output_dir = config['output_dir'], + sample = "{sample}", + tools = config['tools_to_run'], + consensus = config['consensus_tools'], + refs = list(config['references'].keys()), + marker_genes = config['marker_genes'] + threads: 1 + resources: + shell: + """ + Rscript -e "rmarkdown::render( + '{params.basedir}/Notebooks/annotate_report.Rmd', + params = list(tools = '{params.tools}', + consensus = '{params.consensus}', + refs = '{params.refs}', + output_dir = '{params.output_dir}', + sample = '{params.sample}', + marker_genes = '{params.marker_genes}', + threads = '{threads}', + query = '{input.query}'), + output_file = '{output.report_path}')" \ + &> {log} + """ + +#---------------------------------------------------- +# SingleR +#---------------------------------------------------- + +rule train_SingleR: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/SingleR/SingleR_model.Rda" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/SingleR/SingleR.log" + benchmark: + config['output_dir'] + "/model/{reference}/SingleR/SingleR_train_benchmark.txt" + threads: + config['SingleR']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SingleR/train_SingleR.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SingleR: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/SingleR/SingleR_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/SingleR/SingleR_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/SingleR/SingleR.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/SingleR/SingleR_predict_benchmark.txt" + threads: + config['SingleR']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SingleR/predict_SingleR.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scPred +#---------------------------------------------------- + +rule train_scPred: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/scPred/scPred_model.Rda" + params: + basedir = {workflow.basedir}, + classifier = config['scPred']['classifier'], + log: + config['output_dir'] + "/model/{reference}/scPred/scPred.log" + benchmark: + config['output_dir'] + "/model/{reference}/scPred/scPred_train_benchmark.txt" + threads: + config['scPred']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/train_scPred.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + {params.classifier} \ + &> {log} + """ + +rule predict_scPred: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/scPred/scPred_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scPred/scPred_pred.csv" + params: + basedir = {workflow.basedir} + 
log: + config['output_dir'] + "/{sample}/{reference}/scPred/scPred.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scPred/scPred_predict_benchmark.txt" + threads: + config['scPred']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/predict_scPred.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ +#---------------------------------------------------- +# scClassify +#---------------------------------------------------- + +rule train_scClassify: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/scClassify/scClassify_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir'] + "/model/{reference}/scClassify/scClassify.log" + benchmark: + config['output_dir'] + "/model/{reference}/scClassify/scClassify_train_benchmark.txt" + threads: + config['scClassify']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scClassify/train_scClassify.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_scClassify: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/scClassify/scClassify_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scClassify/scClassify_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/scClassify/scClassify.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scClassify/scClassify_predict_benchmark.txt" + threads: + config['scClassify']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scClassify/predict_scClassify.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# SciBet +#---------------------------------------------------- + +rule train_SciBet: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/SciBet/SciBet_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir'] + "/model/{reference}/SciBet/SciBet.log" + benchmark: + config['output_dir'] + "/model/{reference}/SciBet/SciBet_train_benchmark.txt" + threads: + config['SciBet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SciBet/train_SciBet.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SciBet: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/SciBet/SciBet_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/SciBet/SciBet_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/SciBet/SciBet.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/SciBet/SciBet_predict_benchmark.txt" + threads: + config['SciBet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SciBet/predict_SciBet.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + 
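+# NOTE: every tool section below follows the same two-rule pattern: a train_<tool> rule +# fits a model once per reference under <output_dir>/model/{reference}/<tool>/, and a +# predict_<tool> rule applies that saved model to each query sample's preprocessed +# expression.csv. Supporting an additional tool should only require a matching rule pair +# here, train/predict scripts under Scripts/<tool>/, and a <tool> entry in the config. +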
+#---------------------------------------------------- +# scHPL +#---------------------------------------------------- + +rule train_scHPL: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/scHPL/scHPL_model.pkl" + params: + basedir = {workflow.basedir}, + classifier = config['scHPL']['classifier'], + dimred = config['scHPL']['dimred'] + log: + config['output_dir'] + "/model/{reference}/scHPL/scHPL.log" + benchmark: + config['output_dir'] + "/model/{reference}/scHPL/scHPL_train_benchmark.txt" + threads: + config['scHPL']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/scHPL/train_scHPL.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {params.classifier} \ + {params.dimred} \ + &> {log} + """ + +rule predict_scHPL: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/scHPL/scHPL_model.pkl" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scHPL/scHPL_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['scHPL']['threshold'] + log: + config['output_dir'] + "/{sample}/{reference}/scHPL/scHPL.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scHPL/scHPL_predict_benchmark.txt" + threads: + config['scHPL']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/scHPL/predict_scHPL.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + &> {log} + """ + +#---------------------------------------------------- +# SVM Linear +#---------------------------------------------------- + +rule train_SVMlinear: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/SVMlinear/SVMlinear_model.pkl" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/SVMlinear/SVMlinear.log" + benchmark: + config['output_dir'] + "/model/{reference}/SVMlinear/SVMlinear_train_benchmark.txt" + threads: + config['SVMlinear']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/train_linearSVM.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SVMlinear: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/SVMlinear/SVMlinear_model.pkl" + output: + pred = config['output_dir'] + "/{sample}/{reference}/SVMlinear/SVMlinear_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['SVMlinear']['threshold'], + tool_name = config['SVMlinear']['classifier'] + log: + config['output_dir'] + "/{sample}/{reference}/SVMlinear/SVMlinear.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/SVMlinear/SVMlinear_predict_benchmark.txt" + threads: + config['SVMlinear']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/predict_SVM.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.tool_name} \ + &> {log} + """ + +#---------------------------------------------------- +# SVC +#---------------------------------------------------- + +rule train_SVC: + input: + reference = 
config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + "_model.pkl" + params: + basedir = {workflow.basedir}, + classifier = config['SVC']['classifier'] + log: + config['output_dir'] + "/model/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + ".log" + benchmark: + config['output_dir'] + "/model/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + "_train_benchmark.txt" + threads: + config['SVC']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/train_SVM.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {params.classifier} \ + {threads} \ + &> {log} + """ + +rule predict_SVC: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + "_model.pkl" + output: + pred = config['output_dir'] + "/{sample}/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + "_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['SVC']['threshold'], + tool_name = config['SVC']['classifier'] + log: + config['output_dir'] + "/{sample}/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + ".log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/SVC" + config['SVC']['classifier'] + "/SVC" + config['SVC']['classifier'] + "_predict_benchmark.txt" + threads: + config['SVC']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/predict_SVM.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.tool_name} \ + &> {log} + """ + +#---------------------------------------------------- +# singleCellNet +#---------------------------------------------------- + +rule train_singleCellNet: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/singleCellNet/singleCellNet_model.Rda" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/singleCellNet/singleCellNet.log" + benchmark: + config['output_dir'] + "/model/{reference}/singleCellNet/singleCellNet_train_benchmark.txt" + threads: + config['singleCellNet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/singleCellNet/train_singleCellNet.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_singleCellNet: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/singleCellNet/singleCellNet_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/singleCellNet/singleCellNet_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/singleCellNet/singleCellNet.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/singleCellNet/singleCellNet_predict_benchmark.txt" + threads: + config['singleCellNet']['threads'] + resources: + shell: + """ + Rscript 
{params.basedir}/Scripts/singleCellNet/predict_singleCellNet.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# Correlation +#---------------------------------------------------- + +rule train_Correlation: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/Correlation/Correlation_model.Rda" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/Correlation/Correlation.log" + benchmark: + config['output_dir'] + "/model/{reference}/Correlation/Correlation_train_benchmark.txt" + threads: + config['Correlation']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/Correlation/train_Correlation.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_Correlation: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/Correlation/Correlation_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/Correlation/Correlation_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/Correlation/Correlation.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/Correlation/Correlation_predict_benchmark.txt" + threads: + config['Correlation']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/Correlation/predict_Correlation.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scLearn +#---------------------------------------------------- + +rule train_scLearn: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/scLearn/scLearn_model.Rda" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/scLearn/scLearn.log" + benchmark: + config['output_dir'] + "/model/{reference}/scLearn/scLearn_train_benchmark.txt" + threads: + config['scLearn']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scLearn/train_scLearn.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_scLearn: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/scLearn/scLearn_model.Rda" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scLearn/scLearn_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/scLearn/scLearn.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scLearn/scLearn_predict_benchmark.txt" + threads: + config['scLearn']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scLearn/predict_scLearn.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# ACTINN +#---------------------------------------------------- + +rule train_ACTINN: + input: + reference = config['output_dir'] +
"/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/ACTINN/ACTINN_model.pkl" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/model/{reference}/ACTINN/ACTINN.log" + benchmark: + config['output_dir'] + "/model/{reference}/ACTINN/ACTINN_predict_benchmark.txt" + threads: + config['ACTINN']['threads'] + shell: + """ + python {params.basedir}/Scripts/ACTINN/train_ACTINN.py \ + -trs {input.reference} \ + -trl {input.labfile} \ + -mp {output.model} \ + &> {log} + """ + +rule predict_ACTINN: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/ACTINN/ACTINN_model.pkl" + output: + pred = config['output_dir'] + "/{sample}/{reference}/ACTINN/ACTINN_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/ACTINN/ACTINN.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/ACTINN/ACTINN_predict_benchmark.txt" + threads: + config['ACTINN']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/ACTINN/predict_ACTINN.py \ + -ts {input.query} \ + -mp {input.model} \ + -pp {output.pred} \ + &> {log} + """ + +#---------------------------------------------------- +# scID +#---------------------------------------------------- + +rule run_scID: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'], + query = config['output_dir'] + "/{sample}/{reference}/expression.csv" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scID/scID_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir'] + "/{sample}/{reference}/scID/scID.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scID/scID_predict_benchmark.txt" + threads: + config['scID']['threads'] + shell: + """ + Rscript {params.basedir}/Scripts/scID/run_scID.R \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scAnnotate +#---------------------------------------------------- + +rule run_scAnnotate: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'], + query = config['output_dir'] + "/{sample}/{reference}/expression.csv" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scAnnotate/scAnnotate_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['scAnnotate']['threshold'] + log: + config['output_dir'] + "/{sample}/{reference}/scAnnotate/scAnnotate.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scAnnotate/scAnnotate_predict_benchmark.txt" + threads: + config['scAnnotate']['threads'] + shell: + """ + Rscript {params.basedir}/Scripts/scAnnotate/run_scAnnotate.R \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {threads} \ + {params.threshold} \ + &> {log} + """ + +#---------------------------------------------------- +# scNym +#---------------------------------------------------- + +rule run_scNym: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'], + query = config['output_dir'] 
+ "/{sample}/{reference}/expression.csv" + output: + pred = config['output_dir'] + "/{sample}/{reference}/scNym/scNym_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['scNym']['threshold'] + log: + config['output_dir'] + "/{sample}/{reference}/scNym/scNym.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/scNym/scNym_predict_benchmark.txt" + threads: + config['scNym']['threads'] + shell: + """ + python {params.basedir}/Scripts/scNym/run_scNym.py \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {params.threshold} \ + &> {log} + """ + +#---------------------------------------------------- +# CellTypist +#---------------------------------------------------- + +rule train_CellTypist: + input: + reference = config['output_dir'] + "/model/{reference}/expression.csv", + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + model = config['output_dir'] + "/model/{reference}/CellTypist/CellTypist_model.pkl" + params: + basedir = {workflow.basedir}, + feature_selection = config['CellTypist']['feature_selection'] + log: + config['output_dir'] + "/model/{reference}/CellTypist/CellTypist.log" + benchmark: + config['output_dir'] + "/model/{reference}/CellTypist/CellTypist_train_benchmark.txt" + threads: + config['CellTypist']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/CellTypist/train_CellTypist.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + {params.feature_selection} \ + &> {log} + """ + +rule predict_CellTypist: + input: + query = config['output_dir'] + "/{sample}/{reference}/expression.csv", + model = config['output_dir'] + "/model/{reference}/CellTypist/CellTypist_model.pkl" + output: + pred = config['output_dir'] + "/{sample}/{reference}/CellTypist/CellTypist_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['CellTypist']['threshold'], + majority_voting = config['CellTypist']['majority_voting'] + log: + config['output_dir'] + "/{sample}/{reference}/CellTypist/CellTypist.log" + benchmark: + config['output_dir'] + "/{sample}/{reference}/CellTypist/CellTypist_predict_benchmark.txt" + threads: + config['CellTypist']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/CellTypist/predict_CellTypist.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.majority_voting} \ + &> {log} + """ + +#---------------------------------------------------- +# The End +#---------------------------------------------------- diff --git a/snakefile.benchmark b/snakefile.benchmark new file mode 100644 index 0000000..8144e24 --- /dev/null +++ b/snakefile.benchmark @@ -0,0 +1,881 @@ + +#---------------------------------------------------- +# Setup +#---------------------------------------------------- + +configfile: workflow.basedir + "/Config/config.default.yml" + +# import libraries +import os +from datetime import datetime + +n_folds = list(range(1, config['benchmark']['n_folds']+1)) + +now = datetime.now() +dt_string = now.strftime("%Y-%m-%d_%H-%M-%S") + +#---------------------------------------------------- +# Final rule all +#---------------------------------------------------- + +rule all: + input: + expand(config['output_dir_benchmark'] + "/{reference}/report/" + dt_string + '.benchmark_report.html', + reference = config['references'].keys()) + +#---------------------------------------------------- +# Subset Folds 
+#---------------------------------------------------- + +rule subset_folds: + input: + reference = lambda wildcards:config['references'][wildcards.reference]['expression'], + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'] + output: + training_data = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{folds_index}/train.csv", folds_index = n_folds), + labfile = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{folds_index}/train_labels.csv", folds_index = n_folds), + test_data = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{folds_index}/test.csv", folds_index = n_folds) + log: + config['output_dir_benchmark'] + "/{reference}/subset_folds.log" + params: + basedir = {workflow.basedir}, + outdir = config['output_dir_benchmark'] + "/{reference}", + n_folds = config['benchmark']['n_folds'] + threads: 1 + resources: + shell: + """ + Rscript {params.basedir}/Scripts/benchmark/subset_folds.R \ + {input.reference} \ + {input.labfile} \ + {params.outdir} \ + {threads} \ + {params.n_folds} \ + &> {log} + """ + +#---------------------------------------------------- +# Knit Report +#---------------------------------------------------- + +rule knit_report: + input: + pred = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{folds_index}/{tool}/{tool}_pred.csv", + folds_index = n_folds, + tool = config['tools_to_run']), + consensus = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{folds_index}/Prediction_Summary.tsv", + folds_index = n_folds) + output: + report_path = config['output_dir_benchmark'] + "/{reference}/report/" + dt_string + '.benchmark_report.html' + log: + config['output_dir_benchmark'] + "/{reference}/report/report.log" + params: + basedir = {workflow.basedir}, + pred_path = config['output_dir_benchmark'] + "/{reference}", + n_folds = config['benchmark']['n_folds'], + tools = config['tools_to_run'], + ref_name = "{reference}" + resources: + shell: + """ + Rscript -e "rmarkdown::render( + '{params.basedir}/Notebooks/benchmark_report.Rmd', + params = list(tools = '{params.tools}', + ref_name = '{params.ref_name}', + pred_path = '{params.pred_path}', + fold = '{params.n_folds}'), + output_file = '{output.report_path}')" \ + &> {log} + """ + +#---------------------------------------------------- +# Consensus +#---------------------------------------------------- + +rule consensus: + input: + results = expand(config['output_dir_benchmark'] + "/{{reference}}/fold{{folds_index}}/{tool}/{tool}_pred.csv", + tool=config['tools_to_run']) + output: + prediction_summary = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Prediction_Summary.tsv" + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Gatherpreds.log" + params: + basedir = {workflow.basedir}, + tools = config['tools_to_run'], + consensus_tools = config['consensus_tools'], + labfile = lambda wildcards:config['references'][wildcards.reference]['labels'], + sample = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}" + shell: + """ + Rscript {params.basedir}/Scripts/calculate_consensus.R \ + {params.sample} \ + {output.prediction_summary} \ + "{params.tools}" \ + "{params.consensus_tools}" \ + {params.labfile} \ + &> {log} + """ + +#---------------------------------------------------- +# scPred +#---------------------------------------------------- + +rule train_scPred: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] +
"/{reference}/fold{fold}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{fold}/scPred/scPred_model.Rda" + params: + basedir = {workflow.basedir}, + classifier = config['scPred']['classifier'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{fold}/scPred/scPred.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{fold}/scPred/scPred_train_benchmark.txt" + threads: + config['scPred']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/train_scPred.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + {params.classifier} \ + &> {log} + """ + +rule predict_scPred: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scPred/scPred_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scPred/scPred_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scPred/scPred.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scPred/scPred_predict_benchmark.txt" + threads: + config['scPred']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scPred/predict_scPred.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# SingleR +#---------------------------------------------------- + +rule train_SingleR: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR_train_benchmark.txt" + threads: + config['SingleR']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SingleR/train_SingleR.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SingleR: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SingleR/SingleR_predict_benchmark.txt" + threads: + config['SingleR']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SingleR/predict_SingleR.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scClassify +#---------------------------------------------------- + +rule train_scClassify: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + 
output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify_train_benchmark.txt" + threads: + config['scClassify']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scClassify/train_scClassify.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_scClassify: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scClassify/scClassify_predict_benchmark.txt" + threads: + config['scClassify']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scClassify/predict_scClassify.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# SciBet +#---------------------------------------------------- + +rule train_SciBet: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet_train_benchmark.txt" + threads: + config['SciBet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SciBet/train_SciBet.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SciBet: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SciBet/SciBet_predict_benchmark.txt" + threads: + config['SciBet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/SciBet/predict_SciBet.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# SVM Linear +#---------------------------------------------------- + +rule train_SVMlinear: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = 
config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear_model.pkl" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear_train_benchmark.txt" + threads: + config['SVMlinear']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/train_linearSVM.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_SVMlinear: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear_model.pkl" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['SVMlinear']['threshold'], + tool_name = config['SVMlinear']['classifier'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVMlinear/SVMlinear_predict_benchmark.txt" + threads: + config['SVMlinear']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/predict_SVM.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.tool_name} \ + &> {log} + """ + +#---------------------------------------------------- +# SVC +#---------------------------------------------------- + +rule train_SVC: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC_model.pkl" + params: + basedir = {workflow.basedir}, + classifier = config['SVC']['classifier'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC_train_benchmark.txt" + threads: + config['SVC']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/train_SVM.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {params.classifier} \ + {threads} \ + &> {log} + """ + +rule predict_SVC: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC_model.pkl" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['SVC']['threshold'], + tool_name = config['SVC']['classifier'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/SVC/SVC_predict_benchmark.txt" + threads: + config['SVC']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/SVC/predict_SVM.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.tool_name} \ + &> {log} + """ + +#---------------------------------------------------- +# singleCellNet +#---------------------------------------------------- + +rule train_singleCellNet: + input: + reference = config['output_dir_benchmark'] +
"/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet_train_benchmark.txt" + threads: + config['singleCellNet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/singleCellNet/train_singleCellNet.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_singleCellNet: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/singleCellNet/singleCellNet_predict_benchmark.txt" + threads: + config['singleCellNet']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/singleCellNet/predict_singleCellNet.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# Correlation +#---------------------------------------------------- + +rule train_Correlation: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation_model.Rda" + params: + basedir = {workflow.basedir}, + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation_train_benchmark.txt" + threads: + config['Correlation']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/Correlation/train_Correlation.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ +rule predict_Correlation: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/Correlation/Correlation_predict_benchmark.txt" + threads: + config['Correlation']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/Correlation/predict_Correlation.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scLearn 
+#---------------------------------------------------- + +rule train_scLearn: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn_model.Rda" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn_train_benchmark.txt" + threads: + config['scLearn']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scLearn/train_scLearn.R \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + &> {log} + """ + +rule predict_scLearn: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn_model.Rda" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scLearn/scLearn_predict_benchmark.txt" + threads: + config['scLearn']['threads'] + resources: + shell: + """ + Rscript {params.basedir}/Scripts/scLearn/predict_scLearn.R \ + {input.query} \ + {input.model} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scHPL +#---------------------------------------------------- + +rule train_scHPL: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL_model.pkl" + params: + basedir = {workflow.basedir}, + classifier = config['scHPL']['classifier'], + dimred = config['scHPL']['dimred'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL_train_benchmark.txt" + threads: + config['scHPL']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/scHPL/train_scHPL.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {params.classifier} \ + {params.dimred} \ + &> {log} + """ + +rule predict_scHPL: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL_model.pkl" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['scHPL']['threshold'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scHPL/scHPL_predict_benchmark.txt" + threads: + config['scHPL']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/scHPL/predict_scHPL.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + &> {log} + """ +
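+# NOTE: these benchmarking rules mirror the annotation workflow above, but each tool +# trains on fold{folds_index}/train.csv and predicts fold{folds_index}/test.csv, so the +# reference itself is evaluated by n_folds-fold cross-validation and the per-fold +# predictions are gathered into the benchmark report. +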
+#---------------------------------------------------- +# ACTINN +#---------------------------------------------------- + +rule train_ACTINN: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN_model.pkl" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN_train_benchmark.txt" + threads: + config['ACTINN']['threads'] + shell: + """ + python {params.basedir}/Scripts/ACTINN/train_ACTINN.py \ + -trs {input.reference} \ + -trl {input.labfile} \ + -mp {output.model} \ + &> {log} + """ + +rule predict_ACTINN: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN_model.pkl" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/ACTINN/ACTINN_predict_benchmark.txt" + threads: + config['ACTINN']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/ACTINN/predict_ACTINN.py \ + -ts {input.query} \ + -mp {input.model} \ + -pp {output.pred} \ + &> {log} + """ + +#---------------------------------------------------- +# scID +#---------------------------------------------------- + +rule run_scID: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv", + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scID/scID_pred.csv" + params: + basedir = {workflow.basedir} + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scID/scID.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scID/scID_benchmark.txt" + threads: + config['scID']['threads'] + shell: + """ + Rscript {params.basedir}/Scripts/scID/run_scID.R \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {threads} \ + &> {log} + """ + +#---------------------------------------------------- +# scAnnotate +#---------------------------------------------------- + +rule run_scAnnotate: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv", + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scAnnotate/scAnnotate_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = 0.5 + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scAnnotate/scAnnotate.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scAnnotate/scAnnotate_benchmark.txt" + threads: + config['scAnnotate']['threads'] + shell: +
""" + Rscript {params.basedir}/Scripts/scAnnotate/run_scAnnotate.R \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {threads} \ + {params.threshold} \ + &> {log} + """ + +#---------------------------------------------------- +# scNym +#---------------------------------------------------- + +rule run_scNym: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv", + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scNym/scNym_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = 0.5 + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scNym/scNym.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/scNym/scNym_benchmark.txt" + threads: + config['scNym']['threads'] + shell: + """ + python {params.basedir}/Scripts/scNym/run_scNym.py \ + {input.reference} \ + {input.labfile} \ + {input.query} \ + {output.pred} \ + {params.threshold} \ + &> {log} + """ + +#---------------------------------------------------- +# CellTypist +#---------------------------------------------------- + +rule train_CellTypist: + input: + reference = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train.csv", + labfile = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/train_labels.csv" + output: + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist_model.pkl" + params: + basedir = {workflow.basedir}, + feature_selection = config['CellTypist']['feature_selection'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist_train_benchmark.txt" + threads: + config['CellTypist']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/CellTypist/train_CellTypist.py \ + {input.reference} \ + {input.labfile} \ + {output.model} \ + {threads} \ + {params.feature_selection} \ + &> {log} + """ + +rule predict_CellTypist: + input: + query = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/test.csv", + model = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist_model.pkl" + output: + pred = config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist_pred.csv" + params: + basedir = {workflow.basedir}, + threshold = config['CellTypist']['threshold'], + majority_voting = config['CellTypist']['majority_voting'] + log: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist.log" + benchmark: + config['output_dir_benchmark'] + "/{reference}/fold{folds_index}/CellTypist/CellTypist_predict_benchmark.txt" + threads: + config['CellTypist']['threads'] + resources: + shell: + """ + python {params.basedir}/Scripts/CellTypist/predict_CellTypist.py \ + {input.query} \ + {input.model} \ + {output.pred} \ + {params.threshold} \ + {threads} \ + {params.majority_voting} \ + &> {log} + """ + +#---------------------------------------------------- +# The End +#---------------------------------------------------- diff --git a/submit.sh b/submit.sh deleted file mode 100644 index b1f5940..0000000 --- a/submit.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/bash -#PBS -N 
Test_Submission -#PBS -o logs/err.txt -#PBS -e logs/out.txt -#PBS -l walltime=20:00:00 -#PBS -l nodes=1:ppn=3 -#PBS -l mem=125G -#PBS -l vmem=125G - - -cd /scCoAnnotate/ -mkdir -p logs - -#conda init bash -module load miniconda/3.8 -source ~/.conda_init.sh -module load snakemake/5.32.0 -module load hydra/R/4.0.5 -module load python/3.6.5_new - -snakemake --use-conda --configfile config.yml --cores 2 -