diff --git a/README.md b/README.md
index bd31068..51b5960 100644
--- a/README.md
+++ b/README.md
@@ -294,7 +294,7 @@ methylpy bam-quality-filter \
     --input-file mESC_processed_reads_no_clonal.bam \
     --output-file mESC_processed_reads_no_clonal.filtered.bam \
     --ref-fasta mm10_bt2/mm10.fa \
-    --quality-cutoff 30 \
+    --min-mapq 30 \
     --min-num-ch 3 \
     --max-mch-level 0.7 \
     --buffer-line-number 100
diff --git a/docs/README.rst b/docs/README.rst
index 32236c2..7f031d1 100644
--- a/docs/README.rst
+++ b/docs/README.rst
@@ -273,6 +273,12 @@ with ``--add-snp-info`` option when using ``single-end-pipeline``,
 |         |          |          | icant  |
 |         |          |          | methyl |
 |         |          |          | ation  |
+|         |          |          | (1 if  |
+|         |          |          | no     |
+|         |          |          | test   |
+|         |          |          | is     |
+|         |          |          | perfor |
+|         |          |          | med)   |
 +---------+----------+----------+--------+
 | 8       | (optiona | 3,2,3    | number |
 |         | l)       |          | of     |
@@ -448,7 +454,7 @@ than 30 MAPQ score (poor alignment) and with mCH level greater than 0.7
     --input-file mESC_processed_reads_no_clonal.bam \
     --output-file mESC_processed_reads_no_clonal.filtered.bam \
     --ref-fasta mm10_bt2/mm10.fa \
-    --quality-cutoff 30 \
+    --min-mapq 30 \
     --min-num-ch 3 \
     --max-mch-level 0.7 \
     --buffer-line-number 100
diff --git a/methylpy/call_mc_pe.py b/methylpy/call_mc_pe.py
index cd71a7e..3ddc0ad 100644
--- a/methylpy/call_mc_pe.py
+++ b/methylpy/call_mc_pe.py
@@ -29,7 +29,7 @@ def run_methylation_pipeline_pe(read1_files, read2_files, sample,
                                 binom_test=True, min_cov=2,
                                 trim_reads=True, path_to_cutadapt="",
                                 bowtie2=True, path_to_aligner="", aligner_options=None,
-                                merge_by_max_mapq=False,
+                                merge_by_max_mapq=False,min_mapq=2,
                                 pbat=False,check_dependency=True,
                                 remove_clonal=False, keep_clonal_stats=True,
                                 path_to_picard="",java_options="-Xmx20g",
@@ -240,6 +240,7 @@ def run_methylation_pipeline_pe(read1_files, read2_files, sample,
                                    path_to_samtools=path_to_samtools, path_to_aligner=path_to_aligner,
                                    aligner_options=aligner_options,
                                    merge_by_max_mapq=merge_by_max_mapq,
+                                   min_mapq=min_mapq,
                                    num_procs=num_procs, trim_reads=trim_reads,
                                    path_to_cutadapt=path_to_cutadapt,
                                    adapter_seq_read1=adapter_seq_read1, adapter_seq_read2=adapter_seq_read2,
@@ -337,6 +338,7 @@ def run_mapping_pe(current_library, library_read1_files, library_read2_files,
                    path_to_samtools="", path_to_aligner="",
                    aligner_options=None,
                    merge_by_max_mapq=False,
+                   min_mapq=2,
                    num_procs=1, trim_reads=True, path_to_cutadapt="",
                    adapter_seq_read1="AGATCGGAAGAGCACACGTCTGAAC",
                    adapter_seq_read2="AGATCGGAAGAGCGTCGTGTAGGGA",
@@ -550,6 +552,7 @@ def run_mapping_pe(current_library, library_read1_files, library_read2_files,
                                        path_to_samtools=path_to_samtools,
                                        aligner_options=aligner_options,
                                        merge_by_max_mapq=merge_by_max_mapq,
+                                       min_mapq=min_mapq,
                                        path_to_aligner=path_to_aligner,num_procs=num_procs,
                                        keep_temp_files=keep_temp_files,
                                        bowtie2=bowtie2, sort_mem=sort_mem)
@@ -562,7 +565,7 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
                   path_to_output="",
                   path_to_samtools="",
                   aligner_options=None,path_to_aligner="",
-                  merge_by_max_mapq=False,
+                  merge_by_max_mapq=False,min_mapq=2,
                   num_procs=1,keep_temp_files=False, bowtie2=True, sort_mem="500M"):
     """
     This function runs bowtie on the forward and reverse converted bisulfite references
@@ -645,7 +648,10 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
     subprocess.check_call(shlex.split(" ".join(args)))
     print_checkpoint("Processing forward strand hits")
     find_multi_mappers_pe(prefix+"_forward_strand_hits.sam",prefix,
-                          num_procs=num_procs,keep_temp_files=keep_temp_files)
+                          num_procs=num_procs,
+                          min_mapq=min_mapq,
+                          keep_temp_files=keep_temp_files,
+                          append=False)
 
     ## Reverse
     if bowtie2:
@@ -667,7 +673,9 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
     subprocess.check_call(shlex.split(" ".join(args)))
     print_checkpoint("Processing reverse strand hits")
     find_multi_mappers_pe(prefix+"_reverse_strand_hits.sam",prefix,
-                          num_procs=num_procs,append=True,keep_temp_files=keep_temp_files)
+                          num_procs=num_procs,
+                          min_mapq=min_mapq,
+                          append=True,keep_temp_files=keep_temp_files)
 
     ## Clear temporary files
     if keep_temp_files==False:
@@ -726,7 +734,9 @@ def run_bowtie_pe(current_library,library_read1_files,library_read2_files,
     return total_unique
 
 
-def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append=False):
+def find_multi_mappers_pe(inputf,output,num_procs=1,
+                          min_mapq=2,
+                          keep_temp_files=False,append=False):
     """
     This function takes a sam file generated by bowtie/bowtie2 and pulls out any mapped reads.
     It splits these mapped reads into num_procs number of files.
@@ -746,6 +756,7 @@ def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append
     mapped reads) and True for the second. This option is mainly for safety. It
     ensures that files from previous runs are erased.
     """
+    min_mapq = max(2,min_mapq)
     sam_header = []
     file_handles = {}
     f = open(inputf,'r')
@@ -762,7 +773,7 @@ def find_multi_mappers_pe(inputf,output,num_procs=1,keep_temp_files=False,append
         fields = line.split("\t")
 
         ## Check if it is proper pair
-        if int(fields[1]) & 2 == 0 or int(fields[4]) < 2:
+        if int(fields[1]) & 2 == 0 or int(fields[4]) < min_mapq:
             continue;
 
         header = fields[0].split("!")
@@ -919,12 +930,12 @@ def merge_sorted_multimap_pe_max_mapq(current_library,files,prefix,reference_fas
         count_1, count_2 = 0, 0
         current_line_1, current_line_2 = "", ""
         count= 0
-        max_maq, min_maq = -1000,1000
+        max_mapq, min_mapq = -1000,1000
         for key in fields:
             while fields[key] == min_field:
                 count += 1
-                if int(lines[key].split("\t")[4]) >= max_maq:
-                    max_maq = int(lines[key].split("\t")[4])
+                if int(lines[key].split("\t")[4]) >= max_mapq:
+                    max_mapq = int(lines[key].split("\t")[4])
                 if(int(lines[key].split("\t")[1]) & 64 == 64): #First in pair
                     #count_1 += 1
                     current_line_1 = lines[key]
@@ -932,14 +943,14 @@ def merge_sorted_multimap_pe_max_mapq(current_library,files,prefix,reference_fas
                     #count_2 += 1
                     current_line_2 = lines[key]
 
-                if int(lines[key].split("\t")[4]) < min_maq:
-                    min_maq = int(lines[key].split("\t")[4])
+                if int(lines[key].split("\t")[4]) < min_mapq:
+                    min_mapq = int(lines[key].split("\t")[4])
 
                 lines[key]=file_handles[key].readline()
                 fields[key]=lines[key].split("\t")[0]
         #Check if there is only one valid alignment
         #if count_1 == 1:
-        if count == 2 or max_maq > min_maq:
+        if count == 2 or max_mapq > min_mapq:
             output_handle.write(current_line_1)
             output_handle.write(current_line_2)
             total_unique += 1
@@ -1200,6 +1211,7 @@ def call_methylated_sites_pe(inputf, sample, reference_fasta,
                              num_upstr_bases=0,num_downstr_bases=2,
                              generate_mpileup_file=True,compress_output=True,
                              buffer_line_number = 100000,
+                             min_mapq=2,
                              min_cov=1,binom_test=True,
                              path_to_samtools="",
                              remove_chr_prefix=True,
@@ -1255,6 +1267,7 @@ def call_methylated_sites_pe(inputf, sample, reference_fasta,
                                          generate_mpileup_file=generate_mpileup_file,
                                          compress_output=compress_output,
                                          buffer_line_number = buffer_line_number,
+                                         min_mapq=min_mapq,
                                          min_cov = min_cov,
                                          binom_test = binom_test,
                                          path_to_samtools = path_to_samtools,
diff --git a/methylpy/call_mc_se.py b/methylpy/call_mc_se.py
index a6118df..53f765c 100644
--- a/methylpy/call_mc_se.py
+++ b/methylpy/call_mc_se.py
@@ -33,7 +33,7 @@ def run_methylation_pipeline(read_files, sample,
                              trim_reads=True, path_to_cutadapt="",
                              pbat=False,check_dependency=True,
                              bowtie2=True, path_to_aligner="", aligner_options=None,
-                             merge_by_max_mapq=False,
+                             merge_by_max_mapq=False,min_mapq=2,
                              remove_clonal=False,keep_clonal_stats=True,
                              path_to_picard="",java_options="-Xmx20g",
                              path_to_samtools="",
@@ -219,6 +219,7 @@ def run_methylation_pipeline(read_files, sample,
                                 aligner_options=aligner_options,
                                 merge_by_max_mapq=merge_by_max_mapq,
                                 pbat=pbat,
+                                min_mapq=min_mapq,
                                 num_procs=num_procs,
                                 trim_reads=trim_reads,
                                 path_to_cutadapt=path_to_cutadapt,
@@ -299,6 +300,7 @@ def run_methylation_pipeline(read_files, sample,
                           num_downstr_bases=num_downstr_bases,
                           generate_mpileup_file=generate_mpileup_file,
                           compress_output=compress_output,
+                          min_mapq=min_mapq,
                           min_cov=min_cov,
                           binom_test=binom_test,
                           remove_chr_prefix=remove_chr_prefix,
@@ -314,7 +316,7 @@ def run_mapping(current_library, library_files, sample,
                 path_to_output="", path_to_samtools="",
                 path_to_aligner="",
                 aligner_options=None,merge_by_max_mapq=False,
-                pbat=False,
+                min_mapq=2,pbat=False,
                 num_procs=1, trim_reads=True, path_to_cutadapt="",
                 adapter_seq="AGATCGGAAGAGCACACGTCTG",
                 max_adapter_removal=None, overlap_length=None, zero_cap=None,
@@ -490,6 +492,7 @@ def run_mapping(current_library, library_files, sample,
                                     path_to_output=path_to_output,
                                     aligner_options=aligner_options,
                                     merge_by_max_mapq=merge_by_max_mapq,
+                                    min_mapq=min_mapq,
                                     path_to_aligner=path_to_aligner, num_procs=num_procs,
                                     keep_temp_files=keep_temp_files,
                                     bowtie2=bowtie2, sort_mem=sort_mem)
@@ -746,7 +749,8 @@ def run_bowtie(current_library,library_read_files,
               forward_reference,reverse_reference,reference_fasta,
               path_to_output="",
               path_to_samtools="",
-              path_to_aligner="",aligner_options="",merge_by_max_mapq=False,
+              path_to_aligner="",aligner_options="",
+              merge_by_max_mapq=False,min_mapq=2,
               num_procs=1,keep_temp_files=False, bowtie2=False, sort_mem="500M"):
     """
     This function runs bowtie on the forward and reverse converted bisulfite references
@@ -817,7 +821,12 @@ def run_bowtie(current_library,library_read_files,
     args.append(prefix+"_forward_strand_hits.sam")
     subprocess.check_call(shlex.split(" ".join(args)))
     print_checkpoint("Processing forward strand hits")
-    find_multi_mappers(prefix+"_forward_strand_hits.sam",prefix,num_procs=num_procs,keep_temp_files=keep_temp_files)
+    find_multi_mappers(prefix+"_forward_strand_hits.sam",
+                       prefix,
+                       num_procs=num_procs,
+                       min_mapq=min_mapq,
+                       append=False,
+                       keep_temp_files=keep_temp_files)
 
     if bowtie2:
         args = [path_to_aligner+"bowtie2"]
@@ -835,7 +844,12 @@ def run_bowtie(current_library,library_read_files,
     args.append(prefix+"_reverse_strand_hits.sam")
     subprocess.check_call(shlex.split(" ".join(args)))
     print_checkpoint("Processing reverse strand hits")
-    sam_header = find_multi_mappers(prefix+"_reverse_strand_hits.sam",prefix,num_procs=num_procs,append=True,keep_temp_files=keep_temp_files)
+    sam_header = find_multi_mappers(prefix+"_reverse_strand_hits.sam",
+                                    prefix,
+                                    num_procs=num_procs,
+                                    min_mapq=min_mapq,
+                                    append=True,
+                                    keep_temp_files=keep_temp_files)
     ## Clear temporary files
     if num_procs > 1:
         pool = multiprocessing.Pool(num_procs)
@@ -885,7 +899,8 @@ def run_bowtie(current_library,library_read_files,
                          output_bam_file
                          ))
     return total_unique
-def find_multi_mappers(inputf,output,num_procs=1,keep_temp_files=False,append=False):
+def find_multi_mappers(inputf,output,num_procs=1,min_mapq=2,
+                       keep_temp_files=False,append=False):
     """
     This function takes a sam file generated by bowtie and pulls out any mapped reads.
     It splits these mapped reads into num_procs number of files.
@@ -905,6 +920,7 @@ def find_multi_mappers(inputf,output,num_procs=1,keep_temp_files=False,append=Fa
     mapped reads) and True for the second. This option is mainly for safety. It
     ensures that files from previous runs are erased.
     """
+    min_mapq = max(2,min_mapq)
     sam_header = []
     file_handles = {}
     f = open(inputf,'r')
@@ -921,7 +937,7 @@ def find_multi_mappers(inputf,output,num_procs=1,keep_temp_files=False,append=Fa
         fields = line.split("\t")
 
         # minimum QC
-        if fields[2] == "*" or int(fields[4]) < 2:
+        if fields[2] == "*" or int(fields[4]) < min_mapq:
             continue
 
         header = fields[0].split("!") #BIG ASSUMPTION!! NO TABS IN FASTQ HEADER LINES EXCEPT THE ONES I ADD!
@@ -1291,6 +1307,7 @@ def call_methylated_sites(inputf, sample, reference_fasta,
                           generate_mpileup_file=True,
                           compress_output=True,
                           buffer_line_number = 100000,
+                          min_mapq=2,
                           min_cov=1,binom_test=True,
                           path_to_samtools="",
                           remove_chr_prefix=True,
@@ -1334,6 +1351,7 @@ def call_methylated_sites(inputf, sample, reference_fasta,
                                          generate_mpileup_file=generate_mpileup_file,
                                          compress_output=compress_output,
                                          buffer_line_number=buffer_line_number,
+                                         min_mapq=min_mapq,
                                          min_cov=min_cov,
                                          binom_test=binom_test,
                                          path_to_samtools=path_to_samtools,
@@ -1380,7 +1398,8 @@ def call_methylated_sites(inputf, sample, reference_fasta,
 
     ## Input
     if not generate_mpileup_file:
-        cmd = path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)+" -B -f "+reference_fasta+" "+inputf
+        cmd = path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)+\
+              " -q "+str(min_mapq)+" -B -f "+reference_fasta+" "+inputf
         pipes = subprocess.Popen(shlex.split(cmd),
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
@@ -1390,8 +1409,9 @@ def call_methylated_sites(inputf, sample, reference_fasta,
         with open(path_to_files+sample+"_mpileup_output.tsv",'w') as f:
             subprocess.check_call(
                 shlex.split(
-                    path_to_samtools+"samtools mpileup -Q "+
-                    str(min_base_quality)+" -B -f "+reference_fasta+" "+inputf),
+                    path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)
+                    +" -q "+str(min_mapq)
+                    +" -B -f "+reference_fasta+" "+inputf),
                 stdout=f)
 
         fhandle = open(path_to_files+sample+"_mpileup_output.tsv" ,'r')
@@ -1594,7 +1614,7 @@ def analyze_read_basecalls(ref,read_bases):
                            read_bases.count('t')+ \
                            read_bases.count('c')+ \
                            read_bases.count('g')
-        cons_basecalls = read_bases.count(',')
+        cons_basecalls = read_bases.count('.')
         return (str(cons_basecalls),str(incons_basecalls))
     else:
         return ('0',str(incons_basecalls))
@@ -1606,6 +1626,7 @@ def call_methylated_sites_with_SNP_info(inputf, sample, reference_fasta,
                                         generate_mpileup_file=True,
                                         compress_output=True,
                                         buffer_line_number = 100000,
+                                        min_mapq=20,
                                         min_cov=1,binom_test=True,
                                         path_to_samtools="",
                                         remove_chr_prefix=True,
@@ -1676,7 +1697,8 @@ def call_methylated_sites_with_SNP_info(inputf, sample, reference_fasta,
 
     ## Input
     if not generate_mpileup_file:
-        cmd = path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)+" -B -f "+reference_fasta+" "+inputf
+        cmd = path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)+\
+              " -q "+str(min_mapq)+" -B -f "+reference_fasta+" "+inputf
         pipes = subprocess.Popen(shlex.split(cmd),
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
@@ -1686,8 +1708,9 @@ def call_methylated_sites_with_SNP_info(inputf, sample, reference_fasta,
         with open(path_to_files+sample+"_mpileup_output.tsv",'w') as f:
             subprocess.check_call(
                 shlex.split(
-                    path_to_samtools+"samtools mpileup -Q "+
-                    str(min_base_quality)+" -B -f "+reference_fasta+" "+inputf),
+                    path_to_samtools+"samtools mpileup -Q "+str(min_base_quality)
+                    +" -q "+str(min_mapq)
+                    +" -B -f "+reference_fasta+" "+inputf),
                 stdout=f)
 
         fhandle = open(path_to_files+sample+"_mpileup_output.tsv" ,'r')
@@ -2253,7 +2276,7 @@ def filter_files_by_pvalue_combined(input_files,output_file,
 def bam_quality_mch_filter(inputf,
                            outputf,
                            reference_fasta,
-                           quality_cutoff = 30,
+                           min_mapq = 30,
                            min_ch = 3,
                            max_mch_level = 0.7,
                            buffer_line_number = 100000,
@@ -2278,7 +2301,7 @@ def bam_quality_mch_filter(inputf,
     max_mch_level = float(max_mch_level)
 
     # quality filter
-    cmd = path_to_samtools+"samtools view -q"+str(quality_cutoff)+" "+inputf
+    cmd = path_to_samtools+"samtools view -q"+str(min_mapq)+" "+inputf
     pipes = subprocess.Popen(shlex.split(cmd),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
diff --git a/methylpy/parser.py b/methylpy/parser.py
index f7ff1f9..838bb6a 100644
--- a/methylpy/parser.py
+++ b/methylpy/parser.py
@@ -96,6 +96,7 @@ def parse_args():
                                  path_to_aligner=args.path_to_aligner,
                                  aligner_options=args.aligner_options,
                                  merge_by_max_mapq=args.merge_by_max_mapq,
+                                 min_mapq=args.min_mapq,
                                  remove_clonal=args.remove_clonal,
                                  path_to_picard=args.path_to_picard,
                                  keep_clonal_stats=args.keep_clonal_stats,
@@ -142,6 +143,7 @@ def parse_args():
                                     path_to_aligner=args.path_to_aligner,
                                     aligner_options=args.aligner_options,
                                     merge_by_max_mapq=args.merge_by_max_mapq,
+                                    min_mapq=args.min_mapq,
                                     remove_clonal=args.remove_clonal,
                                     path_to_picard=args.path_to_picard,
                                     keep_clonal_stats=args.keep_clonal_stats,
@@ -169,7 +171,7 @@ def parse_args():
         bam_quality_mch_filter(inputf=args.input_file,
                                outputf=args.output_file,
                                reference_fasta=args.ref_fasta,
-                               quality_cutoff=args.quality_cutoff,
+                               min_mapq=args.min_mapq,
                                min_ch=args.min_num_ch,
                                max_mch_level=args.max_mch_level,
                                buffer_line_number=args.buffer_line_number,
@@ -188,6 +190,7 @@ def parse_args():
                              num_downstr_bases=args.num_downstream_bases,
                              generate_mpileup_file=args.generate_mpileup_file,
                              compress_output=args.compress_output,
+                             min_mapq=args.min_mapq,
                              min_cov=args.min_cov,
                              binom_test=args.binom_test,
                              path_to_samtools=args.path_to_samtools,
@@ -207,6 +210,7 @@ def parse_args():
                                 num_downstr_bases=args.num_downstream_bases,
                                 generate_mpileup_file=args.generate_mpileup_file,
                                 compress_output=args.compress_output,
+                                min_mapq=args.min_mapq,
                                 min_cov=args.min_cov,
                                 binom_test=args.binom_test,
                                 path_to_samtools=args.path_to_samtools,
@@ -687,6 +691,11 @@ def add_se_pipeline_subparser(subparsers):
                                help="float indicating the adjusted p-value cutoff you wish to use for " +
                                "determining whether or not a site is methylated")
 
+    parser_se_opt.add_argument("--min-mapq",
+                               type=int,
+                               default=2,
+                               help="Minimum MAPQ for reads to be included.")
+
     parser_se_opt.add_argument("--min-cov",
                                type=int,
                                default=0,
@@ -972,6 +981,11 @@ def add_pe_pipeline_subparser(subparsers):
                                help="float indicating the adjusted p-value cutoff you wish to use for " +
                                "determining whether or not a site is methylated")
 
+    parser_pe_opt.add_argument("--min-mapq",
+                               type=int,
+                               default=2,
+                               help="Minimum MAPQ for reads to be included.")
+
     parser_pe_opt.add_argument("--min-cov",
                                type=int,
                                default=0,
@@ -1100,7 +1114,7 @@ def add_bam_filter_subparser(subparsers):
                               +"sequences you used for mapping")
 
     parser_filter_opt = parser_filter.add_argument_group("optional inputs")
-    parser_filter_opt.add_argument("--quality-cutoff",
+    parser_filter_opt.add_argument("--min-mapq",
                                    type=int,
                                    default=30,
                                    help="Minimum MAPQ for reads to be included.")
@@ -1243,6 +1257,11 @@ def add_call_mc_subparser(subparsers):
                              help="float indicating the adjusted p-value cutoff you wish to use for " +
                              "determining whether or not a site is methylated")
 
+    call_mc_opt.add_argument("--min-mapq",
+                             type=int,
+                             default=2,
+                             help="Minimum MAPQ for reads to be included.")
+
     call_mc_opt.add_argument("--min-cov",
                              type=int,
                              default=0,
diff --git a/setup.py b/setup.py
index bd3bc0e..a14cfcc 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='methylpy',
-    version='1.1.7',
+    version='1.1.8',
     author='Yupeng He',
     author_email='yupeng.he.bioinfo@gmail.com',
     packages=['methylpy'],
diff --git a/test/run_test.py b/test/run_test.py
index cdd95a3..163025e 100644
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -270,7 +270,7 @@ def get_executable_version(exec_name):
                          +"--output-file results/se_bt_processed_reads.filtered.bam "
                          +"--ref-fasta data/chrL.fa "
                          +"--path-to-samtools "+path_to_samtools+" "
-                         +"--quality-cutoff 30 "
+                         +"--min-mapq 30 "
                          +"--min-num-ch 3 "
                          +"--max-mch-level 0.7 "
                          +"--buffer-line-number 100"),
@@ -283,7 +283,7 @@ def get_executable_version(exec_name):
                          +"--output-file results/se_bt2_processed_reads.filtered.bam "
                          +"--ref-fasta data/chrL.fa "
                          +"--path-to-samtools "+path_to_samtools+" "
-                         +"--quality-cutoff 30 "
+                         +"--min-mapq 30 "
                          +"--min-num-ch 3 "
                          +"--max-mch-level 0.7 "
                          +"--buffer-line-number 100"),