Skip to content

Commit

Permalink
Merge branch 'counts-dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
mcmero committed May 24, 2024
2 parents 9ff3b75 + 9977844 commit 0d4c639
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 14 deletions.
16 changes: 13 additions & 3 deletions R/count_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,32 @@ count <- function(fq1_trimmed,
guides_fasta,
output_dir,
sample_name,
guide_len,
guide_len = 20,
primer = "",
mismatches = 1) {
mismatches = 1,
infer_primer = TRUE,
num_infer = 20,
primer_len = 30) {

output_dir <- file.path(output_dir, paste0(sample_name, "_count"))
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

counts_out <- file.path(output_dir, paste0(sample_name, ".tsv"))
counts_stderr <- file.path(output_dir, paste0(sample_name, "_count.out"))

primer_args <- paste0(" -p ", primer)
if (infer_primer) {
primer_args <- paste0(" -I ",
" -n ", num_infer,
" -P ", primer_len)
}

cmd <- paste0("python ",
count_path,
" -r1 ", fq1_trimmed,
" -l ", guides_fasta,
" -i ", guide_len,
" -p ", primer,
primer_args,
" -m ", mismatches,
" > ", counts_out, " 2> ", counts_stderr)

Expand Down
35 changes: 26 additions & 9 deletions R/test_count.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,33 @@
library(tools)
library(dplyr)
library(reticulate)

source("count_wrapper.R")
source("R/count_wrapper.R")

input1 <- "../data/Base1.fastq"
guides_fasta <- "../data/small_sgRNA_library.fasta"
# Define Conda environment and packages for counting script
conda_dependencies <- c("pandas", "pyfastx", "python-edlib")
conda_env_name <- "count-env"

conda_create(conda_env_name,
packages = conda_dependencies,
channel = c("bioconda", "conda-forge"))
use_condaenv(conda_env_name, required = TRUE)

# Set the environment variables to ensure the correct python is used
python_path <- conda_python(conda_env_name)
Sys.setenv(RETICULATE_PYTHON = python_path)
Sys.setenv(PATH = paste(dirname(python_path), Sys.getenv("PATH"), sep = ":"))

# input parameter setup
input1 <- "datasets/base/fastqfiles/Base1.fastq"
guides_fasta <- "datasets/base/guidelibrary/small_sgRNA_library.fasta"
outdir <- "test"
sample_name <- tools::file_path_sans_ext(basename(input1))
sample_name <- basename(input1) %>%
strsplit(., "(.fastq|.fq)") %>%
first() %>%
first()

# call the count wrapper function
count_output <- count(input1,
guides_fasta,
outdir,
sample_name,
guide_len = 20,
primer = "TATTTATTTTGCTACTTAATAATTGGGACT",
mismatches = 1)
sample_name)
62 changes: 60 additions & 2 deletions python/count_guides.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def parse_args():
guide sequence. Default is no primer (guide is at read
start.''')
parser.add_argument('-m',
'--primer_mismatches',
'--primer-mismatches',
metavar='M',
type=int,
default=1,
Expand All @@ -84,10 +84,59 @@ def parse_args():
help='''Whether to expect Genomics barcode format
for file name (e.g., Fwd_01-Rev_01_R1.fastq).
Default is False.''')
parser.add_argument('-I',
'--infer-primer',
action='store_true',
help='Infer primer sequence.')
parser.add_argument('-n',
'--num-infer',
metavar='N',
type=int,
default=20,
help='Number of reads to use for primer inference.')
parser.add_argument('-P',
'--primer-len',
metavar='PL',
type=int,
default=30,
help='Length of primer sequence for inference.')

return parser.parse_args()


def infer_primer(read1, num_reads, primer_len):
'''
Infer primer sequence from first num_reads reads
'''
reads_processed = 0
primer_dict = {}
primer = ""

for r1 in read1:
reads_processed += 1
if reads_processed > num_reads:
break

try:
name, seq, qual = r1
except Exception as e:
logging.info('Error reading read(s) %s (%s)' % (r1.name, type(e)))
continue

tmp_primer = seq[:primer_len]
if tmp_primer in primer_dict:
primer_dict[tmp_primer] += 1
else:
primer_dict[tmp_primer] = 1

if len(primer_dict) == 0:
logging.error('Failed to infer primer sequence')
else:
primer = max(primer_dict, key=primer_dict.get)

return primer


def read_crispr_library(library_file, guide_len):
if library_file.endswith(('csv', 'csv.gz')):
crispr_library = pd.read_csv(library_file, sep=',', header=None)
Expand Down Expand Up @@ -241,13 +290,22 @@ def main():
print('\t'.join(output_colnames), file=sys.stdout)
return

# infer primer sequence if infer_primer > 0 is specified
if args.infer_primer:
logging.info('Infering primer sequence')
read1 = fx.Fastq(args.read1, build_index=False)
primer = infer_primer(read1, args.num_infer, args.primer_len)
logging.info('Inferred primer sequence: %s' % primer)
else:
primer = args.primer

# read in CRISPR library
crispr_library = read_crispr_library(args.library, args.guide_len)

# read in fastq
read1 = fx.Fastq(args.read1, build_index=False)
counts = count_amplicons(read1, args.guide_len,
args.primer, args.primer_mismatches)
primer, args.primer_mismatches)

# write counts to file
counts = match_counts(basename, counts, crispr_library,
Expand Down

0 comments on commit 0d4c639

Please sign in to comment.