Skip to content

Commit

Permalink
Merge branch 'counts-dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
mcmero committed May 21, 2024
2 parents e99ad06 + 8013374 commit 22b2308
Show file tree
Hide file tree
Showing 3 changed files with 314 additions and 0 deletions.
34 changes: 34 additions & 0 deletions R/count_wrapper.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
count_path <- "../python/count_guides.py" # assume that cutadapt is in our path

count <- function(fq1_trimmed,
guides_fasta,
output_dir,
sample_name,
guide_len,
primer = "",
mismatches = 1) {

output_dir <- file.path(output_dir, paste0(sample_name, "_count"))
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

counts_out <- file.path(output_dir, paste0(sample_name, ".tsv"))
counts_stderr <- file.path(output_dir, paste0(sample_name, "_count.out"))

cmd <- paste0("python ",
count_path,
" -r1 ", fq1_trimmed,
" -l ", guides_fasta,
" -i ", guide_len,
" -p ", primer,
" -m ", mismatches,
" > ", counts_out, " 2> ", counts_stderr)

message("[[", Sys.time(), "]] Counts reads ---- ")
print(cmd)

system(cmd)
cat(cmd, file = file.path(output_dir, "run_count.log"),
sep = "\n", append = FALSE)

counts_out
}
16 changes: 16 additions & 0 deletions R/test_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
library(tools)

source("count_wrapper.R")

input1 <- "../data/Base1.fastq"
guides_fasta <- "../data/small_sgRNA_library.fasta"
outdir <- "test"
sample_name <- tools::file_path_sans_ext(basename(input1))

count_output <- count(input1,
guides_fasta,
outdir,
sample_name,
guide_len = 20,
primer = "TATTTATTTTGCTACTTAATAATTGGGACT",
mismatches = 1)
264 changes: 264 additions & 0 deletions python/count_guides.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#!/usr/bin/env python

'''
Module : count_guides
Description : Count CRISPR guide sequences
Copyright : (c) MAREK CMERO, 2023
License : TBD
Maintainer : Marek Cmero
Portability : POSIX
Takes a fastq file and tab-delimited text file containing CRISPR library guide
sequences and counts the number of times each sequence appears in the fastq.
'''

import os
import sys
import gzip
import logging
import pandas as pd
import pyfastx as fx
import re
import edlib
from io import StringIO
from datetime import datetime
from argparse import ArgumentParser

LIBRARY_FILE_COLNAMES = ['library', 'sequence', 'gene', 'other']


def init_log(filename):
'''
Initialise logging
'''
logging.basicConfig(filename=filename,
level=logging.DEBUG,
filemode='w',
format='%(asctime)s %(levelname)s - %(message)s',
datefmt="%Y-%m-%dT%H:%M:%S%z")


def parse_args():
'''Parse arguments'''
description = '''
Takes a fastq file and tab-delimited text file or
fasta file containing CRISPR library guide sequences
'''
parser = ArgumentParser(description=description)
parser.add_argument('-r1',
'--read1',
metavar='R1',
type=str,
help='Read 1 file in fastq format.')
parser.add_argument('-l',
'--library',
metavar='LIB',
type=str,
required=True,
help='''
Fasta file containing guide sequences, or
comma-separated gzipped file containing
CRISPR library guide sequences.''')
parser.add_argument('-i',
'--guide-len',
metavar='LEN',
type=int,
help='Length of guide sequence.')
parser.add_argument('-p',
'--primer',
metavar='PRIMER',
type=str,
default="",
help='''Primer sequence to trim from the 5' end of the
guide sequence. Default is no primer (guide is at read
start.''')
parser.add_argument('-m',
'--primer_mismatches',
metavar='M',
type=int,
default=1,
help='''Number of mismatches allowed when matching
primer sequences. Default is 1.''')
parser.add_argument('-g',
'--genomics-barcodes',
action='store_true',
help='''Whether to expect Genomics barcode format
for file name (e.g., Fwd_01-Rev_01_R1.fastq).
Default is False.''')

return parser.parse_args()


def read_crispr_library(library_file, guide_len):
if library_file.endswith(('csv', 'csv.gz')):
crispr_library = pd.read_csv(library_file, sep=',', header=None)
if len(crispr_library.columns) != 4:
raise ValueError('CRISPR library file must have 4 columns')

# reload with header if necessary
if crispr_library.iloc[0, 0].lower() == LIBRARY_FILE_COLNAMES[0]:
crispr_library = pd.read_csv(library_file, sep=',', header=0)
if not all(crispr_library.columns == LIBRARY_FILE_COLNAMES):
raise ValueError('Invalid column names in library file')
else:
crispr_library.columns = LIBRARY_FILE_COLNAMES

# check that guide length is correct
cur_guide_len = crispr_library.sequence.map(len).unique()[0]
if cur_guide_len != guide_len:
raise ValueError('Insert does not match library insert length')
elif library_file.endswith(('fasta', 'fa')):
crispr_library = pd.DataFrame()
fasta = fx.Fasta(library_file, build_index=False, full_name=True)
for name, seq in fasta:
if len(seq) != guide_len:
raise ValueError('Insert does not match library insert length')
record_to_add = pd.DataFrame(
{'library': os.path.basename(library_file).split('.')[0],
'sequence': seq,
'gene': name.split(' ')[0],
'other': ':'.join(name.split(' ')[1:])},
index=[0])
crispr_library = pd.concat([crispr_library, record_to_add],
ignore_index=True)
else:
raise ValueError('Invalid library file: expected csv.gz or fasta.')

return crispr_library


def count_amplicons(read1, guide_len, primer="", primer_mismatches=1):
reads_processed = 0
failed_reads = 0
amplicon_dict = {}
primer_len = len(primer)

for r1 in read1:
reads_processed += 1
if reads_processed % 10000 == 0:
logging.info('Processed %d reads' % reads_processed)

try:
name, seq, qual = r1
except Exception as e:
logging.info('Error reading read(s) %s (%s)' % (r1.name, type(e)))
failed_reads += 1
continue

if primer_len > 0:
# check if primer is present
read_primer = seq[:primer_len]
primer_result = edlib.align(primer, read_primer, mode="HW",
task="path", k=primer_mismatches)
if primer_result['editDistance'] > primer_mismatches:
logging.info('Failed to match primer in read %s' % name)
failed_reads += 1
continue

guide = seq[primer_len:(guide_len+primer_len)]
if guide in amplicon_dict:
amplicon_dict[guide] += 1
else:
amplicon_dict[guide] = 1

logging.info('Processed %d reads' % reads_processed)
logging.info('Failed to process %d reads' % failed_reads)
return amplicon_dict


def match_counts(basename, counts, crispr_library,
output_colnames, genomics_barcodes=False):
'''
Match counts to CRISPR library guide sequences
'''
results = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
results['guide'] = results.index
results = pd.merge(results, crispr_library,
left_on='guide', right_on='sequence')

if genomics_barcodes:
# expected file name example: Fwd_01-Rev_01_R1.fastq
# add index information (obtained from read file name)
tmp = basename.split('-')
fwd_idx_name = tmp[0]
rev_idx_name = '_'.join(tmp[1].split('_')[:2])
results['FwdBarcode'] = fwd_idx_name
results['RevBarcode'] = rev_idx_name
else:
results['sample'] = basename

# sort and reorder columns
results = results.sort_values(by='count', ascending=False)
results = results[output_colnames]

logging.info('Assigned %d reads.' % results['count'].sum())
return results


def is_file_empty(file):
'''
Check if file is empty
'''
if file.endswith('.gz'):
with gzip.open(file, 'rt') as f:
char = f.read(1)
return not char

return os.stat(file).st_size == 0


def main():
'''
Main function
'''
args = parse_args()

# initialise logging
basename = os.path.splitext(os.path.basename(args.read1))[0]
basename = os.path.splitext(basename)[0] # split again for .fastq.gz files
logfile = f'{basename}_' + datetime.now().strftime('%H-%M-%d-%m-%Y.log')
init_log(logfile)

# write arguments to log
logging.info('Arguments:')
for arg in vars(args):
logging.info('%s: %s' % (arg, getattr(args, arg)))

# if genomics barcodes are specified, check that the input format is right
output_colnames = ['sample', 'guide', 'library', 'gene', 'other', 'count']
if args.genomics_barcodes:
regex = r'Fwd_[0-9]*\-Rev_[0-9]*_R[1,2]'
if not re.search(regex, basename):
logging.error('''
File names do not match expected Genomic
sbarcode format (e.g., Fwd_01-Rev_01_R1).''')
sys.exit(1)
output_colnames = ['FwdBarcode', 'RevBarcode', 'guide',
'library', 'gene', 'other', 'count']

# handle empty input fastq files
if is_file_empty(args.read1):
logging.info('Read 1 file is empty. Writing empty output file.')
print('\t'.join(output_colnames), file=sys.stdout)
return

# read in CRISPR library
crispr_library = read_crispr_library(args.library, args.guide_len)

# read in fastq
read1 = fx.Fastq(args.read1, build_index=False)
counts = count_amplicons(read1, args.guide_len,
args.primer, args.primer_mismatches)

# write counts to file
counts = match_counts(basename, counts, crispr_library,
output_colnames, args.genomics_barcodes)

# write to stdout
output = StringIO()
counts.to_csv(output, sep='\t', index=False)
output.seek(0)
print(output.read(), file=sys.stdout)


if __name__ == '__main__':
main()

0 comments on commit 22b2308

Please sign in to comment.