From e71a5f2d82a03c0a257107c786d7d19fb309b749 Mon Sep 17 00:00:00 2001 From: Michael Baym Date: Thu, 27 Aug 2015 00:25:14 -0400 Subject: [PATCH] Trying to push the files in --- extractmutationsfromgbk.pl | 178 ++++++++++++++++++++++++++++ marginalia.pl | 237 +++++++++++++++++++++++++++++++++++++ multivcfmerger.pl | 39 ++++++ multivcfparser.pl | 143 ++++++++++++++++++++++ setupone.pl | 207 ++++++++++++++++++++++++++++++++ 5 files changed, 804 insertions(+) create mode 100644 extractmutationsfromgbk.pl create mode 100755 marginalia.pl create mode 100755 multivcfmerger.pl create mode 100755 multivcfparser.pl create mode 100755 setupone.pl diff --git a/extractmutationsfromgbk.pl b/extractmutationsfromgbk.pl new file mode 100644 index 0000000..2e4e633 --- /dev/null +++ b/extractmutationsfromgbk.pl @@ -0,0 +1,178 @@ +# first, bring in the SeqIO module +use Bio::SeqIO; +use Bio::Seq; +use List::MoreUtils qw(any); +use Getopt::Long; +use Bio::Tools::CodonTable; + +my $startdir = '.'; +my $refsource = '.';#'/groups/kishony/Reference_Genomes/'; +my $outdir = '.'; +my $mutfile = "newtest5variants.txt"; +my $outfile = 'mutations.txt'; +my $promoterlength=150; + +GetOptions ('s|startdir=s' => \$startdir, #directory with reads split into folders + 'r|g|refsource=s' => \$refsource, #where is the reference genome? + 'i|mutfile=s' => \$mutfile, #where is the mutations vcf + 'o|outfile=s' => \$outfile, #to what directory should analyses be outputted? + 'runtag=s' => \$runtag, #unique tag for these processes? (to manage bsub dependencies) + 'unixcsv' => \$unixcsv, #are samples.csv and alignment_params.csv unix or windows? + 'postprocess' => \$postprocess, #are samples.csv and alignment_params.csv unix or windows? + 'debug' => \$debug); #debug mode + + + +# Bring in the file and format, or die with a nice +# usage statement if one or both arguments are missing. + + +#my $usage = "perl testparse.pl [file] [format]\n"; +#my $file = shift or die $usage; +#my $format = shift or die $usage; + + +open VCFFILE, $mutfile || die "can't open VCF file $mutfile\n";; + + while ($vcfline = ) { + next if ($vcfline =~ m/$\#/); + + @linesplitarray = split(" ",$vcfline); + #print $vcfline."\n"; + #print $mutpos."\n"; + $mutpos = $linesplitarray[1]; + $refbases{$mutpos} = $linesplitarray[3]; + + + push @mutlocs, $mutpos; + + } + +close VCFFILE; + +#print "<$refsource/genome.fasta"; +open GENOME, "<$refsource/genome.fasta" || warn "can't open gemome"; + $genometopline = ; + $genometopline =~ m/\|.+\|.+\|(.+)\|/; + $gbkfile = $refsource."/".$1.".gb"; +close GENOME; + + +print "Reading genbank file $gbkfile ....\n"; +my $inseq = Bio::SeqIO->new( + -file => "<$gbkfile", + -format => 'GenBank', + ); +$myCodonTable = Bio::Tools::CodonTable->new(); +$myCodonTable->id(11); + + print "Parsing genbank...\n"; + +my $seq = $inseq->next_seq; + + +for my $feat_object ($seq->get_SeqFeatures) { + my $primtag = $feat_object->primary_tag; + #print "primary tag: ", $primtag , "\n"; + + if ($feat_object->primary_tag eq "CDS") { + $startpos = $feat_object->start; + $endpos = $feat_object->end; + $mutdata{$_}{"strand"}=$feat_object->strand; + #if (any { ($startpos < $_ && $endpos>$_) } @mutlocs ) { + foreach (@mutlocs) { + + if ($startpos < $_ && $endpos>$_) { + $mutdata{$_}{"strand"}=$feat_object->strand; + $mutdata{$_}{"start"}=$feat_object->start; + $mutdata{$_}{"end"}=$feat_object->end; + $mutdata{$_}{"promotor"}=0; + #print $feat_object->start,"..."; + #print $feat_object->end," "; + #print (($feat_object->strand eq 1)?"F":"R")."\n"; + for my $tag ($feat_object->get_all_tags) { + #print " tag: ", $tag, "\n"; + for my $value ($feat_object->get_tag_values($tag)) { + #print " value: ", $value, "\n"; + $mutdata{$_}{$tag}=$value; + } + } + + my $sequence_string = $feat_object->seq()->seq; + $mutdata{$_}{"nt_sequence"}=$sequence_string; + + $mutpos=(($feat_object->strand == 1)? ( $_ -$startpos):($endpos-$_)); #0-indexed position + $mutdata{$_}{"nt_pos"}=$mutpos+1; + + $mutcodonstart = $mutpos-($mutpos % 3); + $mutdata{$_}{"aa_pos"}=int($mutcodonstart/3)+1; + #print $mutcodonstart."x y".length($sequence_string)."z a"; + $mutcodon = substr($sequence_string,$mutcodonstart,3); + $mutdata{$_}{"codon"}=$mutcodon; + #print "$mutcodon "; + $altcodon = $mutcodon; + $altcodings = ""; + foreach $base ("A","T","C","G") { + #print "($altcodon) ".($mutpos % 3)." "; + $z = substr $altcodon,($mutpos % 3), 1,$base; + $altcodings = $altcodings.($myCodonTable->translate($altcodon)); + $altcodings =~ s/(.)(.)(.)(.)/$2$1$4$3/ if ($feat_object->strand == -1); + } + $mutdata{$_}{"AAs"}=$altcodings; + + # + } else { #INTERGENIC + + if ( ( (($startpos-$promoterlength) < $_) && (($_ < $startpos) && ($feat_object->strand==1) ) ) || + ( (($endpos+$promoterlength) > $_) && ( ($_ > $endpos) && ($feat_object->strand==-1) )) ){ + + $mutdata{$_}{"strand"}=$feat_object->strand; + $mutdata{$_}{"start"}=$feat_object->start; + $mutdata{$_}{"end"}=$feat_object->end; + $mutdata{$_}{"promotor"}=1; + for my $tag ($feat_object->get_all_tags) { + #print " tag: ", $tag, "\n"; + for my $value ($feat_object->get_tag_values($tag)) { + #print " value: ", $value, "\n"; + $mutdata{$_}{$tag}=$value; + } + } + } + + } + } + + } + + + +} + +open OUTFILE, ">$outfile"; + +foreach (@mutlocs) { + print OUTFILE $_."\t"; + if ($mutdata{$_}{"gene"}) { + print OUTFILE $mutdata{$_}{"gene"}; + } else { + print OUTFILE "yyyY"; + } + print OUTFILE "\t"; + print OUTFILE $mutdata{$_}{"start"}."\t"; + print OUTFILE $mutdata{$_}{"end"}."\t"; + print OUTFILE $mutdata{$_}{"strand"}."\t"; + print OUTFILE $mutdata{$_}{"promotor"}."\t"; + print OUTFILE $refbases{$_}."\t"; + print OUTFILE $mutdata{$_}{"product"}."\t"; + print OUTFILE $mutdata{$_}{"protein_id"}."\t"; + print OUTFILE $mutdata{$_}{"nt_pos"}."\t"; + print OUTFILE $mutdata{$_}{"codon"}."\t"; + print OUTFILE $mutdata{$_}{"locus_tag"}."\t"; + print OUTFILE $mutdata{$_}{"aa_pos"}."\t"; + print OUTFILE $mutdata{$_}{"AAs"}."\t"; + print OUTFILE $mutdata{$_}{"note"}; + #print OUTFILE " ".$mutdata{$_}{"function"}."\n"; + #print OUTFILE " ".$mutdata{$_}{"note"}."\n"; + #print OUTFILE " ".$mutdata{$_}{"translation"}."\n"; + print OUTFILE "\n"; + } \ No newline at end of file diff --git a/marginalia.pl b/marginalia.pl new file mode 100755 index 0000000..c1f1f5b --- /dev/null +++ b/marginalia.pl @@ -0,0 +1,237 @@ +#!/usr/bin/perl +#usage 'perl marginalia.pl -o outdir -q queue'; +use Getopt::Long; + +my $startdir = '.'; +my $refsource = '/groups/kishony/Reference_Genomes/'; +my $outdir = '.'; +my $marginalia = 'marginalia'; +my $queue = 'short -n 2'; +my $runtag = 'pipeline'.int(rand(1000)); +my $samplesheet = ""; +my $gatkbase = "module load dev/java/jdk1.7;java -Xmx8g -jar /groups/kishony/baym/gatklocal/GenomeAnalysisTK.jar -T HaplotypeCaller -nct 2 "; + +GetOptions ('s|startdir=s' => \$startdir, #directory with reads split into folders + 'q|queue=s' => \$queue, #what orchestra queue to use? + 'r|refsource=s' => \$refsource, #where is the reference genome? + 'm|marginalia=s' => \$marginalia, #where are the marginalia scripts? + 'o|outdir=s' => \$outdir, #to what directory should analyses be outputted? + 'runtag=s' => \$runtag, #unique tag for these processes? (to manage bsub dependencies) + 'unixcsv' => \$unixcsv, #are samples.csv and alignment_params.csv unix or windows? + 'postprocess' => \$postprocess, #are samples.csv and alignment_params.csv unix or windows? + 'preprocess' => \$preprocess, #are samples.csv and alignment_params.csv unix or windows? + 'samples=s' => \$samplesheet, #use an alternate sample sheet name + 'debug' => \$debug); #debug mode + + +sub runcommand { #this lets me switch from testing to running easily! +if ($debug) { + print $_[0]."\n"; +} else { + return system $_[0]; +} +} + +#read in the csvs +{local $/ = "\r" unless $unixcsv; #assume it's an excel csv unless the unixcsv flag is on +if ($samplesheet eq "") { + open SAMPLES, "$startdir/samples.csv" or die "Can't find $startdir/samples.csv\n"; +} else { + open SAMPLES, "$startdir/$samplesheet" or die "Can't find $startdir/$samplesheet\n"; + +} +$sampline = ; #chuck the top line +while ($sampline = ) { + $sampnum = $.-2; + chomp $sampline; + @sampdata = split(",",$sampline); + $readsdir[$sampnum]=$sampdata[0]; + $readsprefix[$sampnum]=$sampdata[5]; + $sampleoutname[$sampnum]=$sampdata[3]; + $protocol[$sampnum]=$sampdata[4]; +} +close SAMPLES; + +$maxsamp = $sampnum; + +foreach $sampnum (0..$maxsamp) { + push @{ $memberhash{$sampleoutname[$sampnum]} }, $sampnum; +} + + +open ALIGNPARAMS, "$startdir/alignment_params.csv" or die "Can't find $startdir/alignment_params.csv\n"; +$paramline = ; +while ($paramline = ) { + @paramdata = split(",",$paramline); + $refgenome{$paramdata[0]} = $paramdata[2]; + #print "$refgenome{$paramdata[0]}" +} +close ALIGNPARAMS; +} + +#first dispatch the preanalyzer +###set up the commands +runcommand "mkdir -p $outdir"; +runcommand "mkdir -p $outdir/runlogs"; + +unless ($postprocess) { + #for each one we need to do a few things: + #compile the perl command + $commandbase = "perl $marginalia/setupone.pl"; + $perlcom = ""; + foreach $sampnum (0..$maxsamp) { + if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum) { + $perlcom = ""; + foreach $localsamp (@{$memberhash{$sampleoutname[$sampnum]}}) { + #now need to do this for each one + $readbase = $readsdir[$localsamp].$readsprefix[$localsamp]; + chomp $readbase; + if (-e $readbase."R1.fastq") { + $readbase = $readbase."R"; + } elsif (-e $readbase."_R1.fastq") { + $readbase = $readbase."_R"; + } else { + #defaulting to this behavior + $readbase = $readbase."_R"; + warn "Reads not found at $readbase\n" unless $debug; + } + + $perlcom = $perlcom." -s $readbase"; + } + + $perlcom = $commandbase.$perlcom; + + #reads directory + $perlcom = $perlcom." -t $outdir".'/'.$sampleoutname[$sampnum].'/'; #target directory + $perlcom = $perlcom." -r $refsource$refgenome{$protocol[$sampnum]}"; #reference directory + $perlcom = $perlcom." -p $protocol[$sampnum]"; #protocol name + $perlcom = $perlcom." -runtag $runtag"; #unique tag for this run + + #compile the bsub command + $logbase = "$outdir/runlogs/$sampleoutname[$sampnum]"; + $bsubcommand = "bsub -o $logbase.out -e $logbase.err -q $queue -W 6:0 -J ".$runtag."_".$sampleoutname[$sampnum]."_preprocess"; + runcommand "$bsubcommand $perlcom"; + + } + } +} + +die if $preprocess; + + + +runcommand "mkdir -p $outdir/analyses"; + + +#now dispatch vcfcollate, but make it wait on the vcf processes (<1 second) +#and dispatch the genbank processor (5 seconds) + $waitcom = ""; + $waitcom = '-w "done('.$runtag.'_*_preprocess)"' unless $postprocess; + #$filecom = '"'.$filecom.')"'; + #print $filecom."\n"; + + +$reffasta = $refsource.$refgenome{$protocol[$sampnum]}."/genome.fasta"; + +#### THIS ALL NEEDS TO BE UNCOMMENTED FOR ORCHESTRA +open GENOMEDICT, $refsource.$refgenome{$protocol[$sampnum]}."/genome.dict" or die "Can't find $refsource$refgenome{$protocol[$sampnum]}/genome.dict\n"; +$topline = ; +$nextline = ; +chomp $nextline; +$nextline =~ m/SN:(\S+)\s*LN:(\d+)/; +$contigname = $1; #print $contigname."\n"; +$genomesize = $2; #print $genomesize."\n"; + +close GENOMEDICT; +#$genomesize=50; + +$logdir = "$outdir/runlogs"; +$outdir2 = "$outdir/analyses/gatktranches"; + +$tranchsize = int($genomesize/$maxsamp/5); +$numintervals = int($genomesize/$tranchsize); + +#print "$genomesize,$tranchsize,$numintervals\n"; + + + +runcommand "mkdir -p $outdir2"; + + #for each one we need to do a few things: + #compile the perl command + $bsubbase = "bsub -q $queue -W 11:58 "; + #$perlcom = $perlcom." -R $refsource$refgenome{$protocol[0]}/genome.fasta"; #reference directory + $gatkbase = $gatkbase." -R $reffasta"; #reference directory + + foreach $sampnum (0..$maxsamp) { + if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum) { + $gatkbase = $gatkbase." -I $outdir".'/'.$sampleoutname[$sampnum]."/sickle2051/$protocol[$sampnum]/realigned.bam"; #target directory + } + } + +runcommand "mkdir -p $logdir/gatktranches"; + for (0..$numintervals) { + ### this is to add the files per tranch + #"-o multincttranch.out -e /groups/kishony/baym/maniallgatknobqsr/runlogs/multincttranch.err" + $tranchstart = $_*$tranchsize+1; + $tranchfinish = ($_+1)*$tranchsize; + $tranchfinish = $genomesize if $tranchfinish>$genomesize; + #print $contigname.":".$tranchstart."-".$tranchfinish."\n"; + next if ($tranchstart>$tranchfinish); + next if (-e "$outdir/analyses/gatktranches/gatk.all.multiout.".$_.".vcf"); + $bsubcom = $bsubbase . $waitcom ." -o $logdir/gatktranches/tranch".$_.".out"; + $bsubcom = $bsubcom . " -J ".$runtag."_".$_."_gatktranch"; + $gatkcom = $gatkbase . " -L \\\"".$contigname.":".$tranchstart."-".$tranchfinish."\\\""; + #-o gatk.all.multiout1.vcf" + $gatkcom = $gatkcom . " -o $outdir/analyses/gatktranches/gatk.all.multiout.".$_.".vcf"; + runcommand "$bsubcom \"$gatkcom\""; + } + + + + + + +#now to make the next several in one shot + + +#perl ../marginalia/marginalia.pl -m "../marginalia/" -o staphout2 -q short --preprocess +#perl multitrancher.pl -o staphout2 -q long + +#(in a bsub -- takes 20-30 seconds) +#perl multivcfmerger.pl -s staphout2/analyses/gatktranches -o staphout2/analyses +$perlcom = "perl $marginalia/multivcfmerger.pl -s $outdir/analyses/gatktranches -o $outdir/analyses;"; +#perl ../marginalia/extractmutationsfromgbk.pl -i staphout2/analyses/gatk.all.multiout.merged.vcf -o staphout2/analyses/mutations.txt -r /groups/kishony/Reference_Genomes/SaureusNCTC8325 +$perlcom = $perlcom." module load dev/perl/5.18.1;"; +$perlcom = $perlcom." perl $marginalia/extractmutationsfromgbk.pl -i $outdir/analyses/gatk.all.multiout.merged.vcf -o $outdir/mutations.txt -r $refsource$refgenome{$protocol[$sampnum]};"; +#perl multivcfparser.pl -s staphout2/analyses/gatk.all.multiout.merged.vcf -o staphout2/analyses/ +$perlcom = $perlcom." perl $marginalia/multivcfparser.pl -s $outdir/analyses/gatk.all.multiout.merged.vcf -o $outdir/analyses;"; +#-> copy the readfile.m to the output directory +$perlcom = $perlcom."cp $marginalia/readfiles.m $outdir/analyses/readfiles.m;"; + +$logbase = "$outdir/runlogs/vcfmergeandprocess"; +$waitcom = '-w "done('.$runtag.'_*_gatktranch)"'; +$bsubcom = "bsub -o $logbase.out -e $logbase.err -q short -W 1:0 ".$waitcom." -J ".$runtag."_makecsvs "; +runcommand "$bsubcom \"$perlcom\""; + +#need to run the matlab command to input everything + + +open NAMELIST, ">$outdir/analyses/namelist.txt"; + foreach $sampnum (0..$maxsamp) { + if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum) { + print NAMELIST $sampleoutname[$sampnum]."\n"; + } +} +close NAMELIST; + + +#dispatch readstomatlab.m making it wait on all previous work +# matlab -r "cd marginalia;readfiles('/groups/kishony/baym/symcipmani2/analyses/');quit;" + $logname = $logbase."matlabread"; + $bsubcommand = "bsub -o $logname.out -e $logname.err -q short -W 0:30 -w \"done(".$runtag."*gatktranch)\" -J ".$runtag."_matlabin"; + $matlabcommand = 'matlab -r "cd '.$outdir.'/analyses;readfiles;quit;"'; + runcommand $bsubcommand.' '.$matlabcommand; +die; + + \ No newline at end of file diff --git a/multivcfmerger.pl b/multivcfmerger.pl new file mode 100755 index 0000000..75175aa --- /dev/null +++ b/multivcfmerger.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl +use Getopt::Long; + +my $sourcedir = "."; +my $outdir = "."; + +GetOptions ('s|sourcedir=s' => \$sourcedir, + 'o|outdir=s' => \$outdir, #(the directory with sample name last) + 'debug' => \$debug); + +sub runcommand { #this lets me switch from testing to running easily! +if ($debug) { + print $_[0]."\n"; +} else { + print $_[0]."\n"; + return system $_[0]; +} +} + +runcommand "mkdir -p $outdir"; + +open OUTVCF, ">$outdir/gatk.all.multiout.merged.vcf"; + +$filenum=0; +$maxpos=0; + +while (open INVCF, $sourcedir."/gatk.all.multiout.".$filenum.".vcf") { + print "File number $filenum opened...\n" if ($filenum % 100 == 0); + while ($vcfline = ) { + next if ($vcfline =~ m/$\#/ && $filenum>0); + @linesplitarray = split(" ",$vcfline); + $pos = $linesplitarray[1]; + print OUTVCF $vcfline if ($pos > $maxpos || $vcfline =~ m/$\#/); + } + $filenum++; +} + +$filenum--; +print "Last file was $filenum.\n" \ No newline at end of file diff --git a/multivcfparser.pl b/multivcfparser.pl new file mode 100755 index 0000000..e29aac3 --- /dev/null +++ b/multivcfparser.pl @@ -0,0 +1,143 @@ +#!/usr/bin/perl +use Getopt::Long; +use List::Util qw(max min reduce); + +my $source = "gatk.all.multiout.vcf"; +my $outdir = "."; + +GetOptions ('s|v|invcf|source=s' => \$source, + 'o|outdir=s' => \$outdir, #(the directory with sample name last) + 'debug' => \$debug); + +open MULTIVCF, $source; + +open OUTDP, ">$outdir/depth.csv"; +open OUTAGREE, ">$outdir/agree.csv"; +open OUTCALL, ">$outdir/calls.tsv"; +open OUTQUAL, ">$outdir/quals.csv"; +open OUTVQMQ, ">$outdir/VqMqMqrsBqrsFsQd.csv"; +open OUTREFS, ">$outdir/locsandrefs.tsv"; + + while ($vcfline = ) { + next if ($vcfline =~ m/$\##/); + if ($vcfline =~ m/$\#/) { + @linesplitarray = split(" ",$vcfline); + $numitems = @linesplitarray; + @namearray =@linesplitarray[9..$numitems-1]; + $numsamps = @namearray; + #print $namearray[0]." ".$numsamps." @namearray\n"; + open OUTNAMES, ">$outdir/names.txt"; + print OUTNAMES "$_\n" foreach (@namearray); + close OUTNAMES; + next; + } + + @linesplitarray = split(" ",$vcfline); + #print $vcfline."\n"; + $pos = $linesplitarray[1]; + $ref = $linesplitarray[3]; + $call = $linesplitarray[4]; + print OUTREFS "$pos\t$ref\t$call\n"; + @calls = split(",",$call); + $qual = $linesplitarray[5]; + + $linesplitarray[7] =~ m/MQ=(\d+.?\d*);/; + $mq = $1; + $linesplitarray[7] =~ m/MQRankSum=(\d+.?\d*);/; + $mqrs = $1; + $linesplitarray[7] =~ m/BaseQRankSum=(\d+.?\d*);/; + $bqrs = $1; + $linesplitarray[7] =~ m/FS=(\d+.?\d*);/; + $fs = $1; + $linesplitarray[7] =~ m/QD=(\d+.?\d*);/; + $qd = $1; + + print OUTVQMQ "$qual,$mq,$mqrs,$bqrs,$fs,$qd\n"; + + $numcalls = @calls; + $numentries = @linesplitarray; + + + @tags = split(":",$linesplitarray[8]); #GT AD DP GQ PL + #print "@tags\n"; + my $gtarray=[]; + my $adarray=[]; + my $dparray=[]; + my $gqarray=[]; + my $plarray=[]; + for $i (0..$numsamps-1) { + @values = split(":",$linesplitarray[9+$i]); + #print "@tags @values"; + @qualtags{@tags}=@values; + + #$gtarray[$i] = $qualtags{"GT"}; + #$gqarray[$i] = $qualtags{"GQ"}; #these are diploid-optimized, so we'll ignore + $adarray[$i] = $qualtags{"AD"}; #agree vs disagree + $dparray[$i] = $qualtags{"DP"}; #read depth (sum of AD) + $plarray[$i] = $qualtags{"PL"}; # + # "@keys"; + + @locpl = split(",",$qualtags{"PL"}); + @locad = split(",",$qualtags{"AD"}); + + #this code uses reduce to do argmin; + #$plsiz=@locpl; + #$foo = reduce {$locpl[$a] < $locpl[$b] ? $a : $b } (0..($plsiz-1)); + + $thisdp = $qualtags{"DP"}; + $thiscall="."; + $thisqual=0; + $thisagree=0; + + if ($linesplitarray[9+$i] eq "./.") { + $thiscall="."; + $thisqual=0; + $thisagree=0; + $thisdp=0; + } elsif ($numcalls==1) { + #make a call based on the PL + if ($locpl[0]<$locpl[2]) { #the ref call is more likely + $thiscall=$ref; + $thisqual=$locpl[2]-$locpl[0]; + $thisagree = $locad[0]; + } else { + $thiscall=$calls[0]; + $thisqual=$locpl[0]-$locpl[2]; + $thisagree = $locad[1]; + } + + } elsif ($numcalls==2) { + $minval = reduce {$locpl[$a] < $locpl[$b] ? $a : $b } (0,2,5); + #print $minval." "; + if ($minval == 0) { + $thiscall=$ref; + $thisqual=min($locpl[2],$locpl[5])-$locpl[0]; + $thisagree = $locad[0]; + } elsif ($minval == 2) { + $thiscall=$calls[0]; + $thisagree = $locad[1]; + $thisqual=min($locpl[0],$locpl[5])-$locpl[2]; + } elsif ($minval == 5) { + $thiscall=$calls[1]; + $thisagree = $locad[2]; + $thisqual=min($locpl[0],$locpl[2])-$locpl[5]; + } + + } + $thisdp = 0 if $thisdp eq "."; + $thisagree = 0 if ($thisagree eq "." || $thisagree eq ""); + + print "PROBLEM: $thispl ".$linesplitarray[9+$i] ." $thisqual $i $pos $thiscall\n" if $thisagree eq ""; + + + print OUTDP "$thisdp,"; + print OUTAGREE "$thisagree,"; + print OUTCALL "$thiscall\t"; + print OUTQUAL "$thisqual,"; + } + + print OUTDP "\n"; + print OUTAGREE "\n"; + print OUTCALL "\n"; + print OUTQUAL "\n"; + } diff --git a/setupone.pl b/setupone.pl new file mode 100755 index 0000000..b3a0c28 --- /dev/null +++ b/setupone.pl @@ -0,0 +1,207 @@ +#!/usr/bin/perl +use Getopt::Long; + +my $samtools = '/opt/samtools/bin/samtools'; +my $bcftools = '/opt/samtools/bin/bcftools'; +my $gatk = 'module load dev/java/jdk1.7;java -Xmx4g -jar /groups/kishony/baym/gatklocal/GenomeAnalysisTK.jar'; +my $sickledir = '/home/mhb15/illumina_pipeline/sickle-master'; #sickle directory +my $filtercom = "$sickledir/sickle pe -t sanger -q 20 -l 50 -x -n"; +my $aligncom = '/opt/bowtie2/bowtie2 -p 2 -X 1000 --no-mixed --dovetail --very-sensitive --n-ceil 0,0.001'; #bowtiecommand +my $cutadaptcom = 'cutadapt -a CTGTCTCTTAT'; + +my $source = '.'; #source filepath +my $target = '/hms/scratch1/baym/experiment1/Sample001/'; #default filepath +my $refdir = '/groups/kishony/Reference_Genomes/EcoliMG1655'; #reference directory +my $protname = 'pairedendkeio'; +my $runtag = 'pipeline'; +my $debug; + +GetOptions ('s|source=s' => \@source, + 't|target=s' => \$target, #(the directory with sample name last) + 'f|filter=s' => \$filtercom, + 'a|align=s' => \$aligncom, + 'r|refdir=s' => \$refdir, + 'p|protname=s' => \$protname, + 'runtag=s' => \$runtag, + 'copyfirst' => \$copyfirst, + 'debug' => \$debug); + + +sub runcommand { #this lets me switch from testing to running easily! +if ($debug) { + print $_[0]."\n"; +} else { + print $_[0]."\n"; + return system $_[0]; +} +} +#--sam-rg "ID:'.$source.'-illumina" -- currently not working +#$target =~ /\/([^\/]+)\/$/; #pulls the directory name out to find the filename for now -> replace later +#$readbase = $target.$1."_"; + + +$reffasta = "$refdir/genome.fasta"; +$fromhereflag = 1; + +#make the target directory +$target =~ m/\/([^\/]+)\/$/; +$filename = $1; +runcommand "mkdir -p $target"; + + +$numfilesin = @source; + + + +print "\nCutting adaptors... (at ".localtime().")\n";#trim the reads + +unless (-e "$target/done.cut" && $fromhereflag) {$fromhereflag=0; + $success = 0; + for $file (@source) { + $read1in= $file."1.fastq"; + $read2in= $file."2.fastq"; + + $success += runcommand "$cutadaptcom $read1in >> $target"."readcut1.fastq"; + $success += runcommand "$cutadaptcom $read2in >> $target"."readcut2.fastq"; + runcommand "touch $target/done.cut" if ($success == 0); + } +} + + +print "\nTrimming reads... (at ".localtime().")\n";#trim the reads +$trimtarget=$target."sickle2051"; +runcommand "mkdir -p $trimtarget"; +#don't copy the fastqs to the target base directory -- get them from the source! +$fr1="$trimtarget/filter_reads_1.fastq"; +$fr2="$trimtarget/filter_reads_2.fastq"; +unless (-e "$trimtarget/done.trim" && $fromhereflag) {$fromhereflag=0; + + #$success = runcommand "$sickledir/sickle pe -f $read1in -r $read2in -t sanger -o $fr1 -p $fr2 -s $trimtarget/singles.fastq -q 20 -l 50 -x -n"; + $success = runcommand "$filtercom -f $target/readcut1.fastq -r $target/readcut2.fastq -o $fr1 -p $fr2 -s $trimtarget/singles.fastq"; + runcommand "touch $trimtarget/done.trim" if ($success == 0); +} + + +print "\nAligning reads... (at ".localtime().")\n"; +#align the reads, using four processors +$aligndir=$trimtarget."/".$protname; +runcommand "mkdir -p $aligndir"; +$alignsam = "$aligndir/aligned.sam"; +unless (-e "$aligndir/done.align" && $fromhereflag) {$fromhereflag=0; + $success = runcommand "$aligncom --un-conc $aligndir/unaligned.fastq -x $refdir/genome_bowtie2 -1 $fr1 -2 $fr2 -S $alignsam"; + runcommand "touch $aligndir/done.align" if ($success == 0); +} + +print "\nMaking bam files... (at ".localtime().")\n"; +$alignbam = "$aligndir/aligned.bam"; +$alignsort = "$aligndir/aligned.sorted"; +unless ((-e "$aligndir/done.sambam") && ($fromhereflag)) {$fromhereflag=0; + runcommand "$samtools view -bS $alignsam > $alignbam"; #make bam file + runcommand "$samtools sort $alignbam $alignsort"; #make sorted bam file + $success = runcommand "$samtools index $alignsort.bam $alignsort.bai"; #index sorted bam file + runcommand "touch $aligndir/done.sambam" if ($success == 0); +} + + +#use picard to add rungroups so gatk doesn't bork +print "\nAdding read groups with picard... (at ".localtime().")\n"; +$alignwrg = "$aligndir/alignedwrg.bam"; +$alignwrgindex = "$aligndir/alignedwrg.bai"; +$picardcommand= "java -jar /opt/picard/AddOrReplaceReadGroups.jar"; +unless (-e "$aligndir/done.picard" && $fromhereflag) {$fromhereflag=0; + runcommand "$picardcommand INPUT=$alignsort.bam OUTPUT=$alignwrg SORT_ORDER=coordinate RGLB=MG1655 RGPL=Illumina RGPU=2 RGSM=$filename"; + $success = runcommand "$samtools index $alignwrg $alignwrgindex"; + runcommand "touch $aligndir/done.picard" if ($success == 0); +} + +#realign with GATK +print "\nRealigning with GATK... (at ".localtime().")\n"; +runcommand "module load dev/java/jdk1.7"; +$realigned = "$aligndir/realigned.bam"; +$intervals = "$aligndir/forIndelRealigner.intervals"; +unless (-e "$aligndir/done.gatk" && $fromhereflag) {$fromhereflag=0; + runcommand "$gatk -T RealignerTargetCreator -R $reffasta -I $alignwrg -o $intervals"; + $success = runcommand "$gatk -T IndelRealigner -LOD 1.0 --entropyThreshold 0.10 -R $reffasta -I $alignwrg -targetIntervals $intervals -o $realigned"; + runcommand "touch $aligndir/done.gatk" if ($success == 0); +} + + + +#use gatk to recalibrate quality scores and remake the outputbam +#use gatk recalibrater to recalibrate == note, without bootstrapping this has only been tested for Michael's data + + +# print "\nRecalibrating Base Score Quality with GATK... (at ".localtime().")\n"; +# + $target =~ m/\/([^\/]+)\/$filename\/$/; + $basedir = $1; +# + $recaldir = "$aligndir/recalibrated"; +# $recalbam = "$recaldir/qualrecalibrated.bam"; + runcommand "mkdir -p $recaldir"; +# unless (-e "$recaldir/done.bqsr" && $fromhereflag) {$fromhereflag=0; +# runcommand "$gatk -T BaseRecalibrator -R $reffasta -I $realigned -o $recaldir/recal_data.table --run_without_dbsnp_potentially_ruining_quality"; +# $success = runcommand "$gatk -T PrintReads -R $reffasta -I $realigned -BQSR $recaldir/recal_data.table -o $recalbam"; +# runcommand "touch $recaldir/done.bqsr" if ($success == 0); +# } +$recalbam = $realigned; + +print "\nMaking the pileup in a separate process... (at ".localtime().")\n"; +$pileup = "$recaldir/strain.pileup"; +$pileupouts = "$filename_pileups"; +runcommand "mkdir -p $basedir/runlogs/"; + +$runlogbase = $basedir."/runlogs/".$pileupouts; +unless (-e "$recaldir/done.pileup" && $fromhereflag) {$fromhereflag=0; + $pileupcommand = "$samtools mpileup -q30 -s -O -d3000 -C30 -f $reffasta $recalbam > $pileup"; + $bsubcomm = "bsub -J ".$runtag."_".int(rand(5000))."_pileup -W 3:0 -o ".$runlogbase.".out -e ".$runlogbase.".err -q short "; + $pileupdone = "touch $recaldir/done.pileup"; + runcommand $bsubcomm.'"'.$pileupcommand.";".$pileupdone.'"'; +} + + + +#### EVERYTHING BELOW HERE IS OBSOLETE + + +#print "\nMaking the variant vcf in a separate process... (at ".localtime().")\n"; + +#use GATK to make strain and variant VCFs + +#$samcommand1 = "$samtools mpileup -q30 -s -d3000 -C30 -ugf $refdir/genome.fasta $realigned > $aligndir/strain"; +#$samcommand2 = "$bcftools view -g $aligndir/strain > $aligndir/strain.vcf"; +#$samcommand3 = "$bcftools view -vS $aligndir/strain.vcf > $aligndir/variant.vcf"; +#$vcfouts = "$filename_varvcf"; +#$runlogbase = $basedir."/runlogs/".$vcfouts; +#$bsubcomm = "bsub -J ".$runtag."_".int(rand(5000))."_variant_vcf -W 1:55 -o ".$runlogbase.".out -e ".$runlogbase.".err -q short"; +#$vcfcommand1 = "$bsubcomm \'$gatk -T HaplotypeCaller -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.recal.snps.indels.vcf\'"; +#$vcfcommand2 = "$gatk -T HaplotypeCaller -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.all.vcf --emitRefConfidence BP_RESOLUTION"; +#$vcfcommand1 = "$bsubcomm \'$gatk -T UnifiedGenotyper -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.recal.snps.indels.vcf -ploidy 1 -glm BOTH -nt 4\'"; +#$vcfcommand2 = "$gatk -T UnifiedGenotyper -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.all.vcf -ploidy 1 -glm BOTH -out_mode EMIT_ALL_SITES -nt 4"; + + + +#unless (-e "$recaldir/done.vcfs" && $fromhereflag && !(-z "$recaldir/gatk.output.raw.recal.snps.indels.vcf")) {$fromhereflag=0; +# runcommand $vcfcommand1; +# print "\nMaking the strain vcf... (at ".localtime().")\n"; +# $success = runcommand $vcfcommand2; +# runcommand "touch $recaldir/done.vcfs" if ($success == 0); +#} +#$vcfdone = "touch $aligndir/done.vcfs"; +# runcommand $samcommand1; +# runcommand $samcommand2; +# $success = runcommand $samcommand3; +# runcommand $vcfdone if ($success == 0); + +#print "\nVCFs finished at (at ".localtime().")\n"; + + +# +#print "\nCleaning up... \n"; +#if ((-e "$aligndir/done.pileup")) { +# if (-e "$aligndir/done.vcfs") { +# runcommand "rm $aligndir/aligned.sam; rm $aligndir/strain"; +# } else { +# runcommand "bsub -w \"done($target_vcf)\" -W 2 -q mini \"rm $aligndir/aligned.sam; rm $aligndir/strain\""; +# } +#} \ No newline at end of file