Skip to content

Commit

Permalink
fix bug regarding --add-snp-info; add --min-mapq option to filter out reads with low mapping quality
Browse files Browse the repository at this point in the history
  • Loading branch information
yupenghe committed Nov 25, 2017
1 parent 1437f35 commit 9d3ae4e
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 35 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ methylpy bam-quality-filter \
--input-file mESC_processed_reads_no_clonal.bam \
--output-file mESC_processed_reads_no_clonal.filtered.bam \
--ref-fasta mm10_bt2/mm10.fa \
--quality-cutoff 30 \
--min-mapq 30 \
--min-num-ch 3 \
--max-mch-level 0.7 \
--buffer-line-number 100
Expand Down
8 changes: 7 additions & 1 deletion docs/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,12 @@ with ``--add-snp-info`` option when using ``single-end-pipeline``,
| | | | icant |
| | | | methyl |
| | | | ation |
| | | | (1 if |
| | | | no |
| | | | test |
| | | | is |
| | | | perfor |
| | | | med) |
+---------+----------+----------+--------+
| 8 | (optiona | 3,2,3 | number |
| | l) | | of |
Expand Down Expand Up @@ -448,7 +454,7 @@ than 30 MAPQ score (poor alignment) and with mCH level greater than 0.7
--input-file mESC_processed_reads_no_clonal.bam \
--output-file mESC_processed_reads_no_clonal.filtered.bam \
--ref-fasta mm10_bt2/mm10.fa \
--quality-cutoff 30 \
--min-mapq 30 \
--min-num-ch 3 \
--max-mch-level 0.7 \
--buffer-line-number 100
Expand Down
37 changes: 25 additions & 12 deletions methylpy/call_mc_pe.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def run_methylation_pipeline_pe(read1_files, read2_files, sample,
binom_test=True, min_cov=2,
trim_reads=True, path_to_cutadapt="",
bowtie2=True, path_to_aligner="", aligner_options=None,
merge_by_max_mapq=False,
merge_by_max_mapq=False,min_mapq=2,
pbat=False,check_dependency=True,
remove_clonal=False, keep_clonal_stats=True,
path_to_picard="",java_options="-Xmx20g",
Expand Down Expand Up @@ -240,6 +240,7 @@ def run_methylation_pipeline_pe(read1_files, read2_files, sample,
path_to_samtools=path_to_samtools, path_to_aligner=path_to_aligner,
aligner_options=aligner_options,
merge_by_max_mapq=merge_by_max_mapq,
min_mapq=min_mapq,
num_procs=num_procs,
trim_reads=trim_reads, path_to_cutadapt=path_to_cutadapt,
adapter_seq_read1=adapter_seq_read1, adapter_seq_read2=adapter_seq_read2,
Expand Down Expand Up @@ -337,6 +338,7 @@ def run_mapping_pe(current_library, library_read1_files, library_read2_files,
path_to_samtools="", path_to_aligner="",
aligner_options=None,
merge_by_max_mapq=False,
min_mapq=2,
num_procs=1, trim_reads=True, path_to_cutadapt="",
adapter_seq_read1="AGATCGGAAGAGCACACGTCTGAAC",
adapter_seq_read2="AGATCGGAAGAGCGTCGTGTAGGGA",
Expand Down Expand Up @@ -550,6 +552,7 @@ def run_mapping_pe(current_library, library_read1_files, library_read2_files,
path_to_samtools=path_to_samtools,
aligner_options=aligner_options,
merge_by_max_mapq=merge_by_max_mapq,
min_mapq=min_mapq,
path_to_aligner=path_to_aligner,num_procs=num_procs,
keep_temp_files=keep_temp_files,
bowtie2=bowtie2, sort_mem=sort_mem)
Expand All @@ -562,7 +565,7 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
path_to_output="",
path_to_samtools="",
aligner_options=None,path_to_aligner="",
merge_by_max_mapq=False,
merge_by_max_mapq=False,min_mapq=2,
num_procs=1,keep_temp_files=False, bowtie2=True, sort_mem="500M"):
"""
This function runs bowtie on the forward and reverse converted bisulfite references
Expand Down Expand Up @@ -645,7 +648,10 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
subprocess.check_call(shlex.split(" ".join(args)))
print_checkpoint("Processing forward strand hits")
find_multi_mappers_pe(prefix+"_forward_strand_hits.sam",prefix,
num_procs=num_procs,keep_temp_files=keep_temp_files)
num_procs=num_procs,
min_mapq=min_mapq,
keep_temp_files=keep_temp_files,
append=False)

## Reverse
if bowtie2:
Expand All @@ -667,7 +673,9 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
subprocess.check_call(shlex.split(" ".join(args)))
print_checkpoint("Processing reverse strand hits")
find_multi_mappers_pe(prefix+"_reverse_strand_hits.sam",prefix,
num_procs=num_procs,append=True,keep_temp_files=keep_temp_files)
num_procs=num_procs,
min_mapq=min_mapq,
append=True,keep_temp_files=keep_temp_files)

## Clear temporary files
if keep_temp_files==False:
Expand Down Expand Up @@ -726,7 +734,9 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
return total_unique


def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append=False):
def find_multi_mappers_pe(inputf,output,num_procs=1,
min_mapq=2,
keep_temp_files=False,append=False):
"""
This function takes a sam file generated by bowtie/bowtie2 and pulls out any mapped reads.
It splits these mapped reads into num_procs number of files.
Expand All @@ -746,6 +756,7 @@ def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append
mapped reads) and True for the second. This option is mainly for safety. It ensures that files from
previous runs are erased.
"""
min_mapq = max(2,min_mapq)
sam_header = []
file_handles = {}
f = open(inputf,'r')
Expand All @@ -762,7 +773,7 @@ def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append
fields = line.split("\t")

## Check if it is proper pair
if int(fields[1]) & 2 == 0 or int(fields[4]) < 2:
if int(fields[1]) & 2 == 0 or int(fields[4]) < min_mapq:
continue;

header = fields[0].split("!")
Expand Down Expand Up @@ -919,27 +930,27 @@ def merge_sorted_multimap_pe_max_mapq(current_library,files,prefix,reference_fas
count_1, count_2 = 0, 0
current_line_1, current_line_2 = "", ""
count= 0
max_maq, min_maq = -1000,1000
max_mapq, min_mapq = -1000,1000
for key in fields:
while fields[key] == min_field:
count += 1
if int(lines[key].split("\t")[4]) >= max_maq:
max_maq = int(lines[key].split("\t")[4])
if int(lines[key].split("\t")[4]) >= max_mapq:
max_mapq = int(lines[key].split("\t")[4])
if(int(lines[key].split("\t")[1]) & 64 == 64): #First in pair
#count_1 += 1
current_line_1 = lines[key]
else:
#count_2 += 1
current_line_2 = lines[key]

if int(lines[key].split("\t")[4]) < min_maq:
min_maq = int(lines[key].split("\t")[4])
if int(lines[key].split("\t")[4]) < min_mapq:
min_mapq = int(lines[key].split("\t")[4])

lines[key]=file_handles[key].readline()
fields[key]=lines[key].split("\t")[0]
#Check if there is only one valid alignment
#if count_1 == 1:
if count == 2 or max_maq > min_maq:
if count == 2 or max_mapq > min_mapq:
output_handle.write(current_line_1)
output_handle.write(current_line_2)
total_unique += 1
Expand Down Expand Up @@ -1200,6 +1211,7 @@ def call_methylated_sites_pe(inputf, sample, reference_fasta,
num_upstr_bases=0,num_downstr_bases=2,
generate_mpileup_file=True,compress_output=True,
buffer_line_number = 100000,
min_mapq=2,
min_cov=1,binom_test=True,
path_to_samtools="",
remove_chr_prefix=True,
Expand Down Expand Up @@ -1255,6 +1267,7 @@ def call_methylated_sites_pe(inputf, sample, reference_fasta,
generate_mpileup_file=generate_mpileup_file,
compress_output=compress_output,
buffer_line_number = buffer_line_number,
min_mapq=min_mapq,
min_cov = min_cov,
binom_test = binom_test,
path_to_samtools = path_to_samtools,
Expand Down
Loading

0 comments on commit 9d3ae4e

Please sign in to comment.