Merge branch 'counts-dev'

goknurginer · May 24, 2024 · 0d4c639 · 0d4c639
2 parents 9ff3b75 + 9977844
commit 0d4c639
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 14 deletions.
diff --git a/R/count_wrapper.R b/R/count_wrapper.R
@@ -4,22 +4,32 @@ count <- function(fq1_trimmed,
                   guides_fasta,
                   output_dir,
                   sample_name,
-                  guide_len,
+                  guide_len = 20,
                   primer = "",
-                  mismatches = 1) {
+                  mismatches = 1,
+                  infer_primer = TRUE,
+                  num_infer = 20,
+                  primer_len = 30) {
 
   output_dir <- file.path(output_dir, paste0(sample_name, "_count"))
   dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
 
   counts_out <- file.path(output_dir, paste0(sample_name, ".tsv"))
   counts_stderr <- file.path(output_dir, paste0(sample_name, "_count.out"))
 
+  primer_args <- paste0(" -p ", primer)
+  if (infer_primer) {
+    primer_args <- paste0(" -I ",
+                          " -n ", num_infer,
+                          " -P ", primer_len)
+  }
+
   cmd <- paste0("python ",
                 count_path,
                 " -r1 ", fq1_trimmed,
                 " -l ", guides_fasta,
                 " -i ", guide_len,
-                " -p ", primer,
+                primer_args,
                 " -m ", mismatches,
                 " > ", counts_out, " 2> ", counts_stderr)
 

diff --git a/R/test_count.R b/R/test_count.R
@@ -1,16 +1,33 @@
-library(tools)
+library(dplyr)
+library(reticulate)
 
-source("count_wrapper.R")
+source("R/count_wrapper.R")
 
-input1 <- "../data/Base1.fastq"
-guides_fasta <- "../data/small_sgRNA_library.fasta"
+# Define Conda environment and packages for counting script
+conda_dependencies <- c("pandas", "pyfastx", "python-edlib")
+conda_env_name <- "count-env"
+
+conda_create(conda_env_name,
+             packages = conda_dependencies,
+             channel = c("bioconda", "conda-forge"))
+use_condaenv(conda_env_name, required = TRUE)
+
+# Set the environment variables to ensure the correct python is used
+python_path <- conda_python(conda_env_name)
+Sys.setenv(RETICULATE_PYTHON = python_path)
+Sys.setenv(PATH = paste(dirname(python_path), Sys.getenv("PATH"), sep = ":"))
+
+# input parameter setup
+input1 <- "datasets/base/fastqfiles/Base1.fastq"
+guides_fasta <- "datasets/base/guidelibrary/small_sgRNA_library.fasta"
 outdir <-  "test"
-sample_name <- tools::file_path_sans_ext(basename(input1))
+sample_name <- basename(input1) %>%
+  strsplit(., "(.fastq|.fq)") %>%
+  first() %>%
+  first()
 
+# call the count wrapper function
 count_output <- count(input1,
                       guides_fasta,
                       outdir,
-                      sample_name,
-                      guide_len = 20,
-                      primer = "TATTTATTTTGCTACTTAATAATTGGGACT",
-                      mismatches = 1)
+                      sample_name)
diff --git a/python/count_guides.py b/python/count_guides.py
@@ -72,7 +72,7 @@ def parse_args():
                         guide sequence. Default is no primer (guide is at read
                         start.''')
     parser.add_argument('-m',
-                        '--primer_mismatches',
+                        '--primer-mismatches',
                         metavar='M',
                         type=int,
                         default=1,
@@ -84,10 +84,59 @@ def parse_args():
                         help='''Whether to expect Genomics barcode format
                         for file name (e.g., Fwd_01-Rev_01_R1.fastq).
                         Default is False.''')
+    parser.add_argument('-I',
+                        '--infer-primer',
+                        action='store_true',
+                        help='Infer primer sequence.')
+    parser.add_argument('-n',
+                        '--num-infer',
+                        metavar='N',
+                        type=int,
+                        default=20,
+                        help='Number of reads to use for primer inference.')
+    parser.add_argument('-P',
+                        '--primer-len',
+                        metavar='PL',
+                        type=int,
+                        default=30,
+                        help='Length of primer sequence for inference.')
 
     return parser.parse_args()
 
 
+def infer_primer(read1, num_reads, primer_len):
+    '''
+    Infer primer sequence from first num_reads reads
+    '''
+    reads_processed = 0
+    primer_dict = {}
+    primer = ""
+
+    for r1 in read1:
+        reads_processed += 1
+        if reads_processed > num_reads:
+            break
+
+        try:
+            name, seq, qual = r1
+        except Exception as e:
+            logging.info('Error reading read(s) %s (%s)' % (r1.name, type(e)))
+            continue
+
+        tmp_primer = seq[:primer_len]
+        if tmp_primer in primer_dict:
+            primer_dict[tmp_primer] += 1
+        else:
+            primer_dict[tmp_primer] = 1
+
+    if len(primer_dict) == 0:
+        logging.error('Failed to infer primer sequence')
+    else:
+        primer = max(primer_dict, key=primer_dict.get)
+
+    return primer
+
+
 def read_crispr_library(library_file, guide_len):
     if library_file.endswith(('csv', 'csv.gz')):
         crispr_library = pd.read_csv(library_file, sep=',', header=None)
@@ -241,13 +290,22 @@ def main():
         print('\t'.join(output_colnames), file=sys.stdout)
         return
 
+    # infer primer sequence if infer_primer > 0 is specified
+    if args.infer_primer:
+        logging.info('Infering primer sequence')
+        read1 = fx.Fastq(args.read1, build_index=False)
+        primer = infer_primer(read1, args.num_infer, args.primer_len)
+        logging.info('Inferred primer sequence: %s' % primer)
+    else:
+        primer = args.primer
+
     # read in CRISPR library
     crispr_library = read_crispr_library(args.library, args.guide_len)
 
     # read in fastq
     read1 = fx.Fastq(args.read1, build_index=False)
     counts = count_amplicons(read1, args.guide_len,
-                             args.primer, args.primer_mismatches)
+                             primer, args.primer_mismatches)
 
     # write counts to file
     counts = match_counts(basename, counts, crispr_library,