-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
314 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
count_path <- "../python/count_guides.py" # assume that cutadapt is in our path | ||
|
||
count <- function(fq1_trimmed, | ||
guides_fasta, | ||
output_dir, | ||
sample_name, | ||
guide_len, | ||
primer = "", | ||
mismatches = 1) { | ||
|
||
output_dir <- file.path(output_dir, paste0(sample_name, "_count")) | ||
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) | ||
|
||
counts_out <- file.path(output_dir, paste0(sample_name, ".tsv")) | ||
counts_stderr <- file.path(output_dir, paste0(sample_name, "_count.out")) | ||
|
||
cmd <- paste0("python ", | ||
count_path, | ||
" -r1 ", fq1_trimmed, | ||
" -l ", guides_fasta, | ||
" -i ", guide_len, | ||
" -p ", primer, | ||
" -m ", mismatches, | ||
" > ", counts_out, " 2> ", counts_stderr) | ||
|
||
message("[[", Sys.time(), "]] Counts reads ---- ") | ||
print(cmd) | ||
|
||
system(cmd) | ||
cat(cmd, file = file.path(output_dir, "run_count.log"), | ||
sep = "\n", append = FALSE) | ||
|
||
counts_out | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
library(tools) | ||
|
||
source("count_wrapper.R") | ||
|
||
input1 <- "../data/Base1.fastq" | ||
guides_fasta <- "../data/small_sgRNA_library.fasta" | ||
outdir <- "test" | ||
sample_name <- tools::file_path_sans_ext(basename(input1)) | ||
|
||
count_output <- count(input1, | ||
guides_fasta, | ||
outdir, | ||
sample_name, | ||
guide_len = 20, | ||
primer = "TATTTATTTTGCTACTTAATAATTGGGACT", | ||
mismatches = 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,264 @@ | ||
#!/usr/bin/env python | ||
|
||
''' | ||
Module : count_guides | ||
Description : Count CRISPR guide sequences | ||
Copyright : (c) MAREK CMERO, 2023 | ||
License : TBD | ||
Maintainer : Marek Cmero | ||
Portability : POSIX | ||
Takes a fastq file and tab-delimited text file containing CRISPR library guide | ||
sequences and counts the number of times each sequence appears in the fastq. | ||
''' | ||
|
||
import os | ||
import sys | ||
import gzip | ||
import logging | ||
import pandas as pd | ||
import pyfastx as fx | ||
import re | ||
import edlib | ||
from io import StringIO | ||
from datetime import datetime | ||
from argparse import ArgumentParser | ||
|
||
LIBRARY_FILE_COLNAMES = ['library', 'sequence', 'gene', 'other'] | ||
|
||
|
||
def init_log(filename): | ||
''' | ||
Initialise logging | ||
''' | ||
logging.basicConfig(filename=filename, | ||
level=logging.DEBUG, | ||
filemode='w', | ||
format='%(asctime)s %(levelname)s - %(message)s', | ||
datefmt="%Y-%m-%dT%H:%M:%S%z") | ||
|
||
|
||
def parse_args(): | ||
'''Parse arguments''' | ||
description = ''' | ||
Takes a fastq file and tab-delimited text file or | ||
fasta file containing CRISPR library guide sequences | ||
''' | ||
parser = ArgumentParser(description=description) | ||
parser.add_argument('-r1', | ||
'--read1', | ||
metavar='R1', | ||
type=str, | ||
help='Read 1 file in fastq format.') | ||
parser.add_argument('-l', | ||
'--library', | ||
metavar='LIB', | ||
type=str, | ||
required=True, | ||
help=''' | ||
Fasta file containing guide sequences, or | ||
comma-separated gzipped file containing | ||
CRISPR library guide sequences.''') | ||
parser.add_argument('-i', | ||
'--guide-len', | ||
metavar='LEN', | ||
type=int, | ||
help='Length of guide sequence.') | ||
parser.add_argument('-p', | ||
'--primer', | ||
metavar='PRIMER', | ||
type=str, | ||
default="", | ||
help='''Primer sequence to trim from the 5' end of the | ||
guide sequence. Default is no primer (guide is at read | ||
start.''') | ||
parser.add_argument('-m', | ||
'--primer_mismatches', | ||
metavar='M', | ||
type=int, | ||
default=1, | ||
help='''Number of mismatches allowed when matching | ||
primer sequences. Default is 1.''') | ||
parser.add_argument('-g', | ||
'--genomics-barcodes', | ||
action='store_true', | ||
help='''Whether to expect Genomics barcode format | ||
for file name (e.g., Fwd_01-Rev_01_R1.fastq). | ||
Default is False.''') | ||
|
||
return parser.parse_args() | ||
|
||
|
||
def read_crispr_library(library_file, guide_len): | ||
if library_file.endswith(('csv', 'csv.gz')): | ||
crispr_library = pd.read_csv(library_file, sep=',', header=None) | ||
if len(crispr_library.columns) != 4: | ||
raise ValueError('CRISPR library file must have 4 columns') | ||
|
||
# reload with header if necessary | ||
if crispr_library.iloc[0, 0].lower() == LIBRARY_FILE_COLNAMES[0]: | ||
crispr_library = pd.read_csv(library_file, sep=',', header=0) | ||
if not all(crispr_library.columns == LIBRARY_FILE_COLNAMES): | ||
raise ValueError('Invalid column names in library file') | ||
else: | ||
crispr_library.columns = LIBRARY_FILE_COLNAMES | ||
|
||
# check that guide length is correct | ||
cur_guide_len = crispr_library.sequence.map(len).unique()[0] | ||
if cur_guide_len != guide_len: | ||
raise ValueError('Insert does not match library insert length') | ||
elif library_file.endswith(('fasta', 'fa')): | ||
crispr_library = pd.DataFrame() | ||
fasta = fx.Fasta(library_file, build_index=False, full_name=True) | ||
for name, seq in fasta: | ||
if len(seq) != guide_len: | ||
raise ValueError('Insert does not match library insert length') | ||
record_to_add = pd.DataFrame( | ||
{'library': os.path.basename(library_file).split('.')[0], | ||
'sequence': seq, | ||
'gene': name.split(' ')[0], | ||
'other': ':'.join(name.split(' ')[1:])}, | ||
index=[0]) | ||
crispr_library = pd.concat([crispr_library, record_to_add], | ||
ignore_index=True) | ||
else: | ||
raise ValueError('Invalid library file: expected csv.gz or fasta.') | ||
|
||
return crispr_library | ||
|
||
|
||
def count_amplicons(read1, guide_len, primer="", primer_mismatches=1): | ||
reads_processed = 0 | ||
failed_reads = 0 | ||
amplicon_dict = {} | ||
primer_len = len(primer) | ||
|
||
for r1 in read1: | ||
reads_processed += 1 | ||
if reads_processed % 10000 == 0: | ||
logging.info('Processed %d reads' % reads_processed) | ||
|
||
try: | ||
name, seq, qual = r1 | ||
except Exception as e: | ||
logging.info('Error reading read(s) %s (%s)' % (r1.name, type(e))) | ||
failed_reads += 1 | ||
continue | ||
|
||
if primer_len > 0: | ||
# check if primer is present | ||
read_primer = seq[:primer_len] | ||
primer_result = edlib.align(primer, read_primer, mode="HW", | ||
task="path", k=primer_mismatches) | ||
if primer_result['editDistance'] > primer_mismatches: | ||
logging.info('Failed to match primer in read %s' % name) | ||
failed_reads += 1 | ||
continue | ||
|
||
guide = seq[primer_len:(guide_len+primer_len)] | ||
if guide in amplicon_dict: | ||
amplicon_dict[guide] += 1 | ||
else: | ||
amplicon_dict[guide] = 1 | ||
|
||
logging.info('Processed %d reads' % reads_processed) | ||
logging.info('Failed to process %d reads' % failed_reads) | ||
return amplicon_dict | ||
|
||
|
||
def match_counts(basename, counts, crispr_library, | ||
output_colnames, genomics_barcodes=False): | ||
''' | ||
Match counts to CRISPR library guide sequences | ||
''' | ||
results = pd.DataFrame.from_dict(counts, orient='index', columns=['count']) | ||
results['guide'] = results.index | ||
results = pd.merge(results, crispr_library, | ||
left_on='guide', right_on='sequence') | ||
|
||
if genomics_barcodes: | ||
# expected file name example: Fwd_01-Rev_01_R1.fastq | ||
# add index information (obtained from read file name) | ||
tmp = basename.split('-') | ||
fwd_idx_name = tmp[0] | ||
rev_idx_name = '_'.join(tmp[1].split('_')[:2]) | ||
results['FwdBarcode'] = fwd_idx_name | ||
results['RevBarcode'] = rev_idx_name | ||
else: | ||
results['sample'] = basename | ||
|
||
# sort and reorder columns | ||
results = results.sort_values(by='count', ascending=False) | ||
results = results[output_colnames] | ||
|
||
logging.info('Assigned %d reads.' % results['count'].sum()) | ||
return results | ||
|
||
|
||
def is_file_empty(file): | ||
''' | ||
Check if file is empty | ||
''' | ||
if file.endswith('.gz'): | ||
with gzip.open(file, 'rt') as f: | ||
char = f.read(1) | ||
return not char | ||
|
||
return os.stat(file).st_size == 0 | ||
|
||
|
||
def main(): | ||
''' | ||
Main function | ||
''' | ||
args = parse_args() | ||
|
||
# initialise logging | ||
basename = os.path.splitext(os.path.basename(args.read1))[0] | ||
basename = os.path.splitext(basename)[0] # split again for .fastq.gz files | ||
logfile = f'{basename}_' + datetime.now().strftime('%H-%M-%d-%m-%Y.log') | ||
init_log(logfile) | ||
|
||
# write arguments to log | ||
logging.info('Arguments:') | ||
for arg in vars(args): | ||
logging.info('%s: %s' % (arg, getattr(args, arg))) | ||
|
||
# if genomics barcodes are specified, check that the input format is right | ||
output_colnames = ['sample', 'guide', 'library', 'gene', 'other', 'count'] | ||
if args.genomics_barcodes: | ||
regex = r'Fwd_[0-9]*\-Rev_[0-9]*_R[1,2]' | ||
if not re.search(regex, basename): | ||
logging.error(''' | ||
File names do not match expected Genomic | ||
sbarcode format (e.g., Fwd_01-Rev_01_R1).''') | ||
sys.exit(1) | ||
output_colnames = ['FwdBarcode', 'RevBarcode', 'guide', | ||
'library', 'gene', 'other', 'count'] | ||
|
||
# handle empty input fastq files | ||
if is_file_empty(args.read1): | ||
logging.info('Read 1 file is empty. Writing empty output file.') | ||
print('\t'.join(output_colnames), file=sys.stdout) | ||
return | ||
|
||
# read in CRISPR library | ||
crispr_library = read_crispr_library(args.library, args.guide_len) | ||
|
||
# read in fastq | ||
read1 = fx.Fastq(args.read1, build_index=False) | ||
counts = count_amplicons(read1, args.guide_len, | ||
args.primer, args.primer_mismatches) | ||
|
||
# write counts to file | ||
counts = match_counts(basename, counts, crispr_library, | ||
output_colnames, args.genomics_barcodes) | ||
|
||
# write to stdout | ||
output = StringIO() | ||
counts.to_csv(output, sep='\t', index=False) | ||
output.seek(0) | ||
print(output.read(), file=sys.stdout) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |