From e71a5f2d82a03c0a257107c786d7d19fb309b749 Mon Sep 17 00:00:00 2001
From: Michael Baym <baym@peregrine-2.local>
Date: Thu, 27 Aug 2015 00:25:14 -0400
Subject: [PATCH] Trying to push the files in

---
 extractmutationsfromgbk.pl | 178 ++++++++++++++++++++++++++++
 marginalia.pl              | 237 +++++++++++++++++++++++++++++++++++++
 multivcfmerger.pl          |  39 ++++++
 multivcfparser.pl          | 143 ++++++++++++++++++++++
 setupone.pl                | 207 ++++++++++++++++++++++++++++++++
 5 files changed, 804 insertions(+)
 create mode 100644 extractmutationsfromgbk.pl
 create mode 100755 marginalia.pl
 create mode 100755 multivcfmerger.pl
 create mode 100755 multivcfparser.pl
 create mode 100755 setupone.pl
diff --git a/extractmutationsfromgbk.pl b/extractmutationsfromgbk.pl
new file mode 100644
index 0000000..2e4e633
--- /dev/null
+++ b/extractmutationsfromgbk.pl
@@ -0,0 +1,178 @@
+# first, bring in the SeqIO module
+use Bio::SeqIO;
+use Bio::Seq;
+use List::MoreUtils qw(any);
+use Getopt::Long;
+use Bio::Tools::CodonTable;
+ 
+my $startdir = '.';
+my $refsource = '.';#'/groups/kishony/Reference_Genomes/';
+my $outdir = '.';
+my $mutfile = "newtest5variants.txt";
+my $outfile = 'mutations.txt';
+my $promoterlength=150;
+
+GetOptions ('s|startdir=s' => \$startdir,	#directory with reads split into folders
+			'r|g|refsource=s' => \$refsource, #where is the reference genome?
+			'i|mutfile=s' => \$mutfile,		#where is the mutations vcf
+			'o|outfile=s' => \$outfile,		#to what directory should analyses be outputted?
+			'runtag=s' => \$runtag,			#unique tag for these processes? (to manage bsub dependencies)
+			'unixcsv' => \$unixcsv,			#are samples.csv and alignment_params.csv unix or windows?
+			'postprocess' => \$postprocess,			#are samples.csv and alignment_params.csv unix or windows?
+			'debug' => \$debug);			#debug mode
+ 
+ 
+ 
+# Bring in the file and format, or die with a nice
+# usage statement if one or both arguments are missing.
+
+
+#my $usage  = "perl testparse.pl [file] [format]\n";
+#my $file   = shift or die $usage;
+#my $format = shift or die $usage;
+ 
+
+open VCFFILE, $mutfile || die "can't open VCF file $mutfile\n";;
+
+	while ($vcfline = <VCFFILE>) {
+	 	 next if ($vcfline =~ m/$\#/);
+	 	 
+		 @linesplitarray = split(" ",$vcfline);
+		 #print $vcfline."\n";
+		 #print $mutpos."\n";
+		 $mutpos = $linesplitarray[1];
+		 $refbases{$mutpos} = $linesplitarray[3];
+		 
+		 
+		 push @mutlocs, $mutpos;
+	
+	}
+	
+close VCFFILE;
+
+#print "<$refsource/genome.fasta";
+open GENOME, "<$refsource/genome.fasta" || warn "can't open gemome";
+	$genometopline = <GENOME>;
+	$genometopline =~ m/\|.+\|.+\|(.+)\|/;
+	$gbkfile = $refsource."/".$1.".gb";
+close GENOME;
+ 
+ 
+print "Reading genbank file $gbkfile  ....\n";
+my $inseq = Bio::SeqIO->new(
+                            -file   => "<$gbkfile",
+                            -format => 'GenBank',
+                            );
+$myCodonTable   = Bio::Tools::CodonTable->new();
+$myCodonTable->id(11);
+ 
+ print "Parsing genbank...\n";
+ 
+my $seq = $inseq->next_seq;
+
+
+for my $feat_object ($seq->get_SeqFeatures) {         
+	my $primtag =  $feat_object->primary_tag; 
+    #print "primary tag: ", $primtag , "\n";  
+  
+	if ($feat_object->primary_tag eq "CDS") {
+		$startpos = $feat_object->start;
+		$endpos = $feat_object->end;
+		$mutdata{$_}{"strand"}=$feat_object->strand;
+	 #if (any { ($startpos < $_ && $endpos>$_) } @mutlocs ) {
+	   foreach (@mutlocs) {  
+		
+	  	if ($startpos < $_ && $endpos>$_) {
+	  	  $mutdata{$_}{"strand"}=$feat_object->strand;
+	  	  $mutdata{$_}{"start"}=$feat_object->start;
+	  	  $mutdata{$_}{"end"}=$feat_object->end;
+	  	  $mutdata{$_}{"promotor"}=0;
+		  #print $feat_object->start,"...";
+		  #print $feat_object->end," ";
+		  #print (($feat_object->strand eq 1)?"F":"R")."\n";
+		  for my $tag ($feat_object->get_all_tags) {             
+			#print "  tag: ", $tag, "\n";
+			for my $value ($feat_object->get_tag_values($tag)) {                
+				#print "    value: ", $value, "\n";
+				$mutdata{$_}{$tag}=$value;     
+			}  
+		  }
+		  
+		  my $sequence_string = $feat_object->seq()->seq;
+      	  $mutdata{$_}{"nt_sequence"}=$sequence_string;
+      	  
+		  $mutpos=(($feat_object->strand == 1)? ( $_ -$startpos):($endpos-$_)); #0-indexed position		  
+      	  $mutdata{$_}{"nt_pos"}=$mutpos+1;
+		  
+		  $mutcodonstart = $mutpos-($mutpos % 3); 
+      	  $mutdata{$_}{"aa_pos"}=int($mutcodonstart/3)+1;
+		  #print $mutcodonstart."x y".length($sequence_string)."z a";
+      	  $mutcodon = substr($sequence_string,$mutcodonstart,3);
+      	  $mutdata{$_}{"codon"}=$mutcodon;
+      	  #print "$mutcodon ";
+      	  $altcodon = $mutcodon;
+      	  $altcodings = "";
+		  foreach $base ("A","T","C","G") {
+		  	#print "($altcodon) ".($mutpos % 3)." ";
+        	$z = substr $altcodon,($mutpos % 3), 1,$base;
+      		$altcodings = $altcodings.($myCodonTable->translate($altcodon));
+      		$altcodings =~ s/(.)(.)(.)(.)/$2$1$4$3/ if ($feat_object->strand == -1);
+      		}
+		  $mutdata{$_}{"AAs"}=$altcodings;
+		
+      #
+    	} else { #INTERGENIC
+    	
+    		if (  ( (($startpos-$promoterlength) < $_)   &&   (($_ < $startpos)   &&   ($feat_object->strand==1)  ) )  ||
+    		    ( (($endpos+$promoterlength) > $_)   &&  ( ($_ > $endpos)   &&   ($feat_object->strand==-1)  ))   ){
+
+	  	 		$mutdata{$_}{"strand"}=$feat_object->strand;
+	  	  		$mutdata{$_}{"start"}=$feat_object->start;
+	  	  		$mutdata{$_}{"end"}=$feat_object->end;
+	  	  		$mutdata{$_}{"promotor"}=1;
+    			for my $tag ($feat_object->get_all_tags) {             
+			#print "  tag: ", $tag, "\n";
+				for my $value ($feat_object->get_tag_values($tag)) {                
+				#print "    value: ", $value, "\n";
+				$mutdata{$_}{$tag}=$value;     
+				}  
+		  		}
+    		}
+    		
+    	} 
+      }
+           
+	}  
+   
+   
+      
+}
+
+open OUTFILE, ">$outfile";
+
+foreach (@mutlocs) {
+	print OUTFILE $_."\t";
+	if ($mutdata{$_}{"gene"}) {
+		print OUTFILE $mutdata{$_}{"gene"};
+	} else {
+		print OUTFILE "yyyY";
+	}
+	print OUTFILE "\t"; 
+	print OUTFILE $mutdata{$_}{"start"}."\t";
+	print OUTFILE $mutdata{$_}{"end"}."\t";
+	print OUTFILE $mutdata{$_}{"strand"}."\t";
+	print OUTFILE $mutdata{$_}{"promotor"}."\t";
+	print OUTFILE $refbases{$_}."\t";
+	print OUTFILE $mutdata{$_}{"product"}."\t";
+	print OUTFILE $mutdata{$_}{"protein_id"}."\t";
+	print OUTFILE $mutdata{$_}{"nt_pos"}."\t";
+	print OUTFILE $mutdata{$_}{"codon"}."\t";
+	print OUTFILE $mutdata{$_}{"locus_tag"}."\t";
+	print OUTFILE $mutdata{$_}{"aa_pos"}."\t";
+	print OUTFILE $mutdata{$_}{"AAs"}."\t";
+	print OUTFILE $mutdata{$_}{"note"};
+	#print OUTFILE " ".$mutdata{$_}{"function"}."\n";
+	#print OUTFILE " ".$mutdata{$_}{"note"}."\n";
+	#print OUTFILE " ".$mutdata{$_}{"translation"}."\n";
+	print OUTFILE "\n";
+	}
\ No newline at end of file
diff --git a/marginalia.pl b/marginalia.pl
new file mode 100755
index 0000000..c1f1f5b
--- /dev/null
+++ b/marginalia.pl
@@ -0,0 +1,237 @@
+#!/usr/bin/perl
+#usage 'perl marginalia.pl -o outdir -q queue';
+use Getopt::Long;
+
+my $startdir = '.';
+my $refsource = '/groups/kishony/Reference_Genomes/';
+my $outdir = '.';
+my $marginalia = 'marginalia';
+my $queue = 'short -n 2';
+my $runtag = 'pipeline'.int(rand(1000));
+my $samplesheet = "";
+my $gatkbase = "module load dev/java/jdk1.7;java -Xmx8g -jar /groups/kishony/baym/gatklocal/GenomeAnalysisTK.jar -T HaplotypeCaller -nct 2 ";
+
+GetOptions ('s|startdir=s' => \$startdir,	#directory with reads split into folders
+			'q|queue=s' => \$queue,			#what orchestra queue to use?
+			'r|refsource=s' => \$refsource, #where is the reference genome?
+			'm|marginalia=s' => \$marginalia, #where are the marginalia scripts?
+			'o|outdir=s' => \$outdir,		#to what directory should analyses be outputted?
+			'runtag=s' => \$runtag,			#unique tag for these processes? (to manage bsub dependencies)
+			'unixcsv' => \$unixcsv,			#are samples.csv and alignment_params.csv unix or windows?
+			'postprocess' => \$postprocess,			#are samples.csv and alignment_params.csv unix or windows?
+			'preprocess' => \$preprocess,			#are samples.csv and alignment_params.csv unix or windows?
+			'samples=s' => \$samplesheet,	#use an alternate sample sheet name
+			'debug' => \$debug);			#debug mode
+			
+			
+sub runcommand { #this lets me switch from testing to running easily!
+if ($debug) {
+	print $_[0]."\n";
+} else {
+	return system $_[0];
+}
+}
+			
+#read in the csvs
+{local $/ = "\r" unless $unixcsv; #assume it's an excel csv unless the unixcsv flag is on
+if ($samplesheet eq "") {
+	open SAMPLES, "$startdir/samples.csv" or die "Can't find $startdir/samples.csv\n";
+} else {
+	open SAMPLES, "$startdir/$samplesheet" or die "Can't find $startdir/$samplesheet\n";
+	
+}
+$sampline = <SAMPLES>; #chuck the top line
+while ($sampline = <SAMPLES>) {
+	$sampnum = $.-2;
+	chomp $sampline;
+	@sampdata = split(",",$sampline);
+	$readsdir[$sampnum]=$sampdata[0];
+	$readsprefix[$sampnum]=$sampdata[5];
+	$sampleoutname[$sampnum]=$sampdata[3];
+	$protocol[$sampnum]=$sampdata[4];
+}
+close SAMPLES;
+
+$maxsamp = $sampnum;
+
+foreach $sampnum (0..$maxsamp) {
+	push @{ $memberhash{$sampleoutname[$sampnum]} }, $sampnum;
+}
+
+
+open ALIGNPARAMS, "$startdir/alignment_params.csv" or die "Can't find $startdir/alignment_params.csv\n";
+$paramline = <ALIGNPARAMS>;
+while ($paramline = <ALIGNPARAMS>) {
+	@paramdata = split(",",$paramline);
+	$refgenome{$paramdata[0]} = $paramdata[2];
+	#print "$refgenome{$paramdata[0]}"
+}
+close ALIGNPARAMS;
+}
+
+#first dispatch the preanalyzer
+###set up the commands
+runcommand "mkdir -p $outdir";
+runcommand "mkdir -p $outdir/runlogs";
+
+unless ($postprocess) {
+	#for each one we need to do a few things:
+	#compile the perl command
+	$commandbase = "perl $marginalia/setupone.pl";
+	$perlcom = "";
+	foreach $sampnum (0..$maxsamp) {
+		if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum) {
+			$perlcom = "";
+			foreach $localsamp (@{$memberhash{$sampleoutname[$sampnum]}}) {
+			#now need to do this for each one
+				$readbase = $readsdir[$localsamp].$readsprefix[$localsamp];
+				chomp $readbase;
+				if (-e $readbase."R1.fastq") {
+					$readbase = $readbase."R";
+				} elsif (-e $readbase."_R1.fastq") {
+					$readbase = $readbase."_R";
+				} else {
+					#defaulting to this behavior
+					$readbase = $readbase."_R";
+					warn "Reads not found at $readbase\n" unless $debug;
+				}
+
+				$perlcom = $perlcom." -s $readbase";
+			}
+		
+			$perlcom = $commandbase.$perlcom;
+		
+											#reads directory
+			$perlcom = $perlcom." -t $outdir".'/'.$sampleoutname[$sampnum].'/'; 	#target directory
+			$perlcom = $perlcom." -r $refsource$refgenome{$protocol[$sampnum]}";	#reference directory
+			$perlcom = $perlcom." -p $protocol[$sampnum]";							#protocol name
+			$perlcom = $perlcom." -runtag $runtag";									#unique tag for this run
+
+			#compile the bsub command
+			$logbase = "$outdir/runlogs/$sampleoutname[$sampnum]";
+			$bsubcommand = "bsub -o $logbase.out -e $logbase.err -q $queue -W 6:0 -J ".$runtag."_".$sampleoutname[$sampnum]."_preprocess";
+			runcommand "$bsubcommand $perlcom";
+		
+		}
+	}
+}
+
+die if $preprocess;
+
+
+
+runcommand "mkdir -p $outdir/analyses";
+
+
+#now dispatch vcfcollate, but make it wait on the vcf processes (<1 second)
+#and dispatch the genbank processor (5 seconds)
+	$waitcom = "";
+	$waitcom = '-w "done('.$runtag.'_*_preprocess)"' unless $postprocess;
+	#$filecom = '"'.$filecom.')"';
+	#print $filecom."\n";
+	
+	
+$reffasta = $refsource.$refgenome{$protocol[$sampnum]}."/genome.fasta";
+
+#### THIS ALL NEEDS TO BE UNCOMMENTED FOR ORCHESTRA
+open GENOMEDICT, $refsource.$refgenome{$protocol[$sampnum]}."/genome.dict" or die "Can't find $refsource$refgenome{$protocol[$sampnum]}/genome.dict\n";
+$topline = <GENOMEDICT>;
+$nextline = <GENOMEDICT>;
+chomp $nextline;
+$nextline =~ m/SN:(\S+)\s*LN:(\d+)/;
+$contigname = $1; #print $contigname."\n";
+$genomesize = $2; #print $genomesize."\n";
+
+close GENOMEDICT;
+#$genomesize=50;
+
+$logdir = "$outdir/runlogs";
+$outdir2 = "$outdir/analyses/gatktranches";
+
+$tranchsize = int($genomesize/$maxsamp/5);
+$numintervals = int($genomesize/$tranchsize);
+
+#print "$genomesize,$tranchsize,$numintervals\n";
+
+
+
+runcommand "mkdir -p $outdir2";
+
+	#for each one we need to do a few things:
+	#compile the perl command
+	$bsubbase = "bsub -q $queue -W 11:58 ";
+	#$perlcom = $perlcom." -R $refsource$refgenome{$protocol[0]}/genome.fasta";	#reference directory
+	$gatkbase = $gatkbase." -R $reffasta";	#reference directory
+	
+	foreach $sampnum (0..$maxsamp) {
+		if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum)  {
+		$gatkbase = $gatkbase." -I $outdir".'/'.$sampleoutname[$sampnum]."/sickle2051/$protocol[$sampnum]/realigned.bam"; 	#target directory	
+		}	
+	}
+	
+runcommand "mkdir -p $logdir/gatktranches";
+	for (0..$numintervals) {
+	### this is to add the files per tranch
+	#"-o multincttranch.out -e /groups/kishony/baym/maniallgatknobqsr/runlogs/multincttranch.err"
+		$tranchstart = $_*$tranchsize+1;
+		$tranchfinish = ($_+1)*$tranchsize;
+		$tranchfinish = $genomesize if $tranchfinish>$genomesize;
+		#print $contigname.":".$tranchstart."-".$tranchfinish."\n";
+		next if ($tranchstart>$tranchfinish);
+		next if (-e "$outdir/analyses/gatktranches/gatk.all.multiout.".$_.".vcf");
+		$bsubcom = $bsubbase . $waitcom ." -o $logdir/gatktranches/tranch".$_.".out";
+		$bsubcom = $bsubcom . " -J ".$runtag."_".$_."_gatktranch";
+		$gatkcom = $gatkbase . " -L \\\"".$contigname.":".$tranchstart."-".$tranchfinish."\\\"";
+		#-o gatk.all.multiout1.vcf"
+		$gatkcom = $gatkcom . " -o $outdir/analyses/gatktranches/gatk.all.multiout.".$_.".vcf";
+		runcommand "$bsubcom \"$gatkcom\""; 
+	}
+	
+	
+	
+	
+	
+	
+#now to make the next several in one shot
+	
+	
+#perl ../marginalia/marginalia.pl -m "../marginalia/" -o staphout2 -q short --preprocess
+#perl multitrancher.pl -o staphout2 -q long
+
+#(in a bsub -- takes 20-30 seconds)
+#perl multivcfmerger.pl -s staphout2/analyses/gatktranches -o staphout2/analyses
+$perlcom = "perl $marginalia/multivcfmerger.pl -s $outdir/analyses/gatktranches -o $outdir/analyses;";
+#perl ../marginalia/extractmutationsfromgbk.pl -i staphout2/analyses/gatk.all.multiout.merged.vcf -o staphout2/analyses/mutations.txt -r /groups/kishony/Reference_Genomes/SaureusNCTC8325
+$perlcom = $perlcom." module load dev/perl/5.18.1;";
+$perlcom = $perlcom." perl $marginalia/extractmutationsfromgbk.pl -i $outdir/analyses/gatk.all.multiout.merged.vcf -o $outdir/mutations.txt -r $refsource$refgenome{$protocol[$sampnum]};";
+#perl multivcfparser.pl -s staphout2/analyses/gatk.all.multiout.merged.vcf -o staphout2/analyses/
+$perlcom = $perlcom." perl $marginalia/multivcfparser.pl -s $outdir/analyses/gatk.all.multiout.merged.vcf -o $outdir/analyses;";
+#-> copy the readfile.m to the output directory
+$perlcom = $perlcom."cp $marginalia/readfiles.m $outdir/analyses/readfiles.m;";
+
+$logbase = "$outdir/runlogs/vcfmergeandprocess";
+$waitcom = '-w "done('.$runtag.'_*_gatktranch)"';
+$bsubcom = "bsub -o $logbase.out -e $logbase.err -q short -W 1:0 ".$waitcom." -J ".$runtag."_makecsvs ";
+runcommand "$bsubcom \"$perlcom\"";	
+	
+#need to run the matlab command to input everything
+	
+	
+open NAMELIST, ">$outdir/analyses/namelist.txt";
+	foreach $sampnum (0..$maxsamp) {
+		if ($memberhash{$sampleoutname[$sampnum]}[0]==$sampnum)  {
+		print NAMELIST $sampleoutname[$sampnum]."\n";
+		}
+}
+close NAMELIST;
+
+
+#dispatch readstomatlab.m making it wait on all previous work
+#  matlab -r "cd marginalia;readfiles('/groups/kishony/baym/symcipmani2/analyses/');quit;"
+		$logname = $logbase."matlabread";
+	$bsubcommand = "bsub -o $logname.out -e $logname.err -q short -W 0:30 -w \"done(".$runtag."*gatktranch)\" -J ".$runtag."_matlabin";
+	$matlabcommand = 'matlab -r "cd '.$outdir.'/analyses;readfiles;quit;"';
+	 runcommand $bsubcommand.' '.$matlabcommand;
+die;
+	
+	
\ No newline at end of file
diff --git a/multivcfmerger.pl b/multivcfmerger.pl
new file mode 100755
index 0000000..75175aa
--- /dev/null
+++ b/multivcfmerger.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+use Getopt::Long;
+
+my $sourcedir = ".";
+my $outdir = ".";
+
+GetOptions ('s|sourcedir=s' => \$sourcedir,
+			'o|outdir=s' => \$outdir, #(the directory with sample name last)
+			'debug' => \$debug);
+
+sub runcommand { #this lets me switch from testing to running easily!
+if ($debug) {
+	print $_[0]."\n";
+} else {
+	print $_[0]."\n";
+	return system $_[0];
+}
+}
+
+runcommand "mkdir -p $outdir";
+
+open OUTVCF, ">$outdir/gatk.all.multiout.merged.vcf";
+
+$filenum=0;
+$maxpos=0;
+
+while (open INVCF, $sourcedir."/gatk.all.multiout.".$filenum.".vcf") {
+	print "File number $filenum opened...\n" if ($filenum % 100 == 0);
+	while ($vcfline = <INVCF>) {
+	 	 next if ($vcfline =~ m/$\#/ && $filenum>0);
+	 	 @linesplitarray = split(" ",$vcfline);
+		 $pos = $linesplitarray[1];
+		 print OUTVCF $vcfline if ($pos > $maxpos || $vcfline =~ m/$\#/);
+	}
+	$filenum++;
+}
+
+$filenum--;
+print "Last file was $filenum.\n"
\ No newline at end of file
diff --git a/multivcfparser.pl b/multivcfparser.pl
new file mode 100755
index 0000000..e29aac3
--- /dev/null
+++ b/multivcfparser.pl
@@ -0,0 +1,143 @@
+#!/usr/bin/perl
+use Getopt::Long;
+use List::Util qw(max min reduce);
+
+my $source = "gatk.all.multiout.vcf";
+my $outdir = ".";
+
+GetOptions ('s|v|invcf|source=s' => \$source,
+			'o|outdir=s' => \$outdir, #(the directory with sample name last)
+			'debug' => \$debug);
+
+open MULTIVCF, $source;
+
+open OUTDP, ">$outdir/depth.csv";
+open OUTAGREE, ">$outdir/agree.csv";
+open OUTCALL, ">$outdir/calls.tsv";
+open OUTQUAL, ">$outdir/quals.csv";
+open OUTVQMQ, ">$outdir/VqMqMqrsBqrsFsQd.csv";
+open OUTREFS, ">$outdir/locsandrefs.tsv";
+
+	while ($vcfline = <MULTIVCF>) {
+	 	 next if ($vcfline =~ m/$\##/);
+	 	 if ($vcfline =~ m/$\#/) {
+	 	 		 @linesplitarray = split(" ",$vcfline);
+	 	 		 $numitems = @linesplitarray;
+	 	 		 @namearray =@linesplitarray[9..$numitems-1];
+	 	 		 $numsamps = @namearray;
+	 	 		 #print $namearray[0]." ".$numsamps." @namearray\n";
+				 open OUTNAMES, ">$outdir/names.txt";
+				 print OUTNAMES "$_\n" foreach (@namearray);
+				 close OUTNAMES;
+	 	 		 next;
+	 	 }
+	 	 
+		 @linesplitarray = split(" ",$vcfline);
+		 #print $vcfline."\n";
+		 $pos = $linesplitarray[1];
+		 $ref = $linesplitarray[3];
+		 $call = $linesplitarray[4];
+		 print OUTREFS "$pos\t$ref\t$call\n";
+		 @calls = split(",",$call);
+		 $qual = $linesplitarray[5];
+		 
+		 $linesplitarray[7] =~ m/MQ=(\d+.?\d*);/;
+		 $mq = $1;
+		 $linesplitarray[7] =~ m/MQRankSum=(\d+.?\d*);/;
+		 $mqrs = $1;
+		 $linesplitarray[7] =~ m/BaseQRankSum=(\d+.?\d*);/;
+		 $bqrs = $1;
+		 $linesplitarray[7] =~ m/FS=(\d+.?\d*);/;
+		 $fs = $1;
+		 $linesplitarray[7] =~ m/QD=(\d+.?\d*);/;
+		 $qd = $1;
+		 
+		 print OUTVQMQ "$qual,$mq,$mqrs,$bqrs,$fs,$qd\n";
+		 
+		 $numcalls = @calls;
+		 $numentries = @linesplitarray;
+		 
+		 
+		 @tags = split(":",$linesplitarray[8]); #GT AD DP GQ PL
+		 #print "@tags\n";
+		my $gtarray=[];
+		my $adarray=[];
+		my $dparray=[];
+		my $gqarray=[];
+		my $plarray=[];
+		for $i (0..$numsamps-1) {
+		 	@values = split(":",$linesplitarray[9+$i]);
+		 	#print "@tags @values";
+		 	@qualtags{@tags}=@values;
+		 	
+		 	#$gtarray[$i] = $qualtags{"GT"};
+			#$gqarray[$i] = $qualtags{"GQ"};  #these are diploid-optimized, so we'll ignore
+			$adarray[$i] = $qualtags{"AD"};	#agree vs disagree
+			$dparray[$i] = $qualtags{"DP"}; #read depth (sum of AD)
+		 	$plarray[$i] = $qualtags{"PL"}; #
+		 	# "@keys";
+		 	
+		 	@locpl = split(",",$qualtags{"PL"});
+		 	@locad = split(",",$qualtags{"AD"});
+		 	
+		 	#this code uses reduce to do argmin;
+		 	#$plsiz=@locpl;
+		 	#$foo = reduce {$locpl[$a] < $locpl[$b] ? $a : $b } (0..($plsiz-1));
+		 	
+		 	$thisdp = $qualtags{"DP"};
+		 	$thiscall=".";
+		 	$thisqual=0;
+		 	$thisagree=0;
+		 	
+		 	if ($linesplitarray[9+$i] eq "./.") {
+		 		$thiscall=".";
+		 		$thisqual=0;
+		 		$thisagree=0;
+		 		$thisdp=0;
+		 	} elsif ($numcalls==1) {
+		 		#make a call based on the PL
+		 		if ($locpl[0]<$locpl[2]) { #the ref call is more likely
+		 			$thiscall=$ref;
+		 			$thisqual=$locpl[2]-$locpl[0];
+		 			$thisagree = $locad[0];
+		 		} else {
+		 			$thiscall=$calls[0];
+		 			$thisqual=$locpl[0]-$locpl[2];
+		 			$thisagree = $locad[1];
+		 		}
+		 	
+		 	} elsif ($numcalls==2) {
+				$minval = reduce {$locpl[$a] < $locpl[$b] ? $a : $b } (0,2,5);
+				#print $minval."  ";
+				if ($minval == 0) {
+					$thiscall=$ref;
+		 			$thisqual=min($locpl[2],$locpl[5])-$locpl[0];
+		 			$thisagree = $locad[0];
+				} elsif ($minval == 2) {
+					$thiscall=$calls[0];
+		 			$thisagree = $locad[1];
+		 			$thisqual=min($locpl[0],$locpl[5])-$locpl[2];
+				} elsif ($minval == 5) {
+					$thiscall=$calls[1];
+		 			$thisagree = $locad[2];
+		 			$thisqual=min($locpl[0],$locpl[2])-$locpl[5];
+				}	
+		 	
+		 	}
+		 	$thisdp = 0 if $thisdp eq ".";
+		 	$thisagree = 0 if ($thisagree eq "." || $thisagree eq "");
+		 	
+		 	print "PROBLEM: $thispl ".$linesplitarray[9+$i] ." $thisqual $i $pos $thiscall\n" if $thisagree eq "";
+		 	
+		 
+		 print OUTDP "$thisdp,";
+		 print OUTAGREE "$thisagree,";
+		 print OUTCALL "$thiscall\t";
+		 print OUTQUAL "$thisqual,";
+		 }
+	
+	print OUTDP "\n";
+	print OUTAGREE "\n";
+	print OUTCALL  "\n";
+	print OUTQUAL  "\n";
+	}
diff --git a/setupone.pl b/setupone.pl
new file mode 100755
index 0000000..b3a0c28
--- /dev/null
+++ b/setupone.pl
@@ -0,0 +1,207 @@
+#!/usr/bin/perl
+use Getopt::Long;
+
+my $samtools = '/opt/samtools/bin/samtools';
+my $bcftools = '/opt/samtools/bin/bcftools';
+my $gatk = 'module load dev/java/jdk1.7;java -Xmx4g -jar /groups/kishony/baym/gatklocal/GenomeAnalysisTK.jar';
+my $sickledir = '/home/mhb15/illumina_pipeline/sickle-master'; #sickle directory
+my $filtercom = "$sickledir/sickle pe -t sanger -q 20 -l 50 -x -n";
+my $aligncom = '/opt/bowtie2/bowtie2 -p 2 -X 1000 --no-mixed --dovetail --very-sensitive --n-ceil 0,0.001'; #bowtiecommand
+my $cutadaptcom = 'cutadapt -a CTGTCTCTTAT';
+
+my $source = '.'; #source filepath
+my $target = '/hms/scratch1/baym/experiment1/Sample001/'; #default filepath
+my $refdir = '/groups/kishony/Reference_Genomes/EcoliMG1655'; #reference directory
+my $protname = 'pairedendkeio';
+my $runtag = 'pipeline';
+my $debug;
+
+GetOptions ('s|source=s' => \@source,
+			't|target=s' => \$target, #(the directory with sample name last)
+			'f|filter=s' => \$filtercom,
+			'a|align=s' => \$aligncom,
+			'r|refdir=s' => \$refdir,
+			'p|protname=s' => \$protname,
+			'runtag=s' => \$runtag,
+			'copyfirst' => \$copyfirst,
+			'debug' => \$debug);
+			
+			
+sub runcommand { #this lets me switch from testing to running easily!
+if ($debug) {
+	print $_[0]."\n";
+} else {
+	print $_[0]."\n";
+	return system $_[0];
+}
+}
+#--sam-rg "ID:'.$source.'-illumina" -- currently not working
+#$target =~ /\/([^\/]+)\/$/; #pulls the directory name out to find the filename for now -> replace later
+#$readbase = $target.$1."_";
+
+
+$reffasta = "$refdir/genome.fasta";
+$fromhereflag = 1;
+
+#make the target directory
+$target =~ m/\/([^\/]+)\/$/;
+$filename = $1;
+runcommand "mkdir -p $target";
+
+
+$numfilesin = @source;
+
+
+
+print "\nCutting adaptors... (at ".localtime().")\n";#trim the reads
+
+unless (-e "$target/done.cut" &&  $fromhereflag) {$fromhereflag=0;
+	$success = 0;
+	for $file (@source) {
+		$read1in= $file."1.fastq";
+		$read2in= $file."2.fastq";
+	
+		$success += runcommand "$cutadaptcom $read1in >> $target"."readcut1.fastq";
+		$success += runcommand "$cutadaptcom $read2in >> $target"."readcut2.fastq";
+		runcommand "touch $target/done.cut"  if ($success == 0);
+	}
+}
+
+
+print "\nTrimming reads... (at ".localtime().")\n";#trim the reads
+$trimtarget=$target."sickle2051";
+runcommand "mkdir -p $trimtarget";
+#don't copy the fastqs to the target base directory -- get them from the source!
+$fr1="$trimtarget/filter_reads_1.fastq";
+$fr2="$trimtarget/filter_reads_2.fastq";
+unless (-e "$trimtarget/done.trim" &&  $fromhereflag) {$fromhereflag=0;
+
+	#$success = runcommand "$sickledir/sickle pe -f $read1in -r $read2in -t sanger -o $fr1 -p $fr2 -s $trimtarget/singles.fastq -q 20 -l 50 -x -n";
+	$success = runcommand "$filtercom -f $target/readcut1.fastq -r $target/readcut2.fastq -o $fr1 -p $fr2 -s $trimtarget/singles.fastq";
+	runcommand "touch $trimtarget/done.trim"  if ($success == 0);
+}
+
+
+print "\nAligning reads... (at ".localtime().")\n";
+#align the reads, using four processors
+$aligndir=$trimtarget."/".$protname;
+runcommand "mkdir -p $aligndir";
+$alignsam = "$aligndir/aligned.sam";
+unless (-e "$aligndir/done.align" &&  $fromhereflag) {$fromhereflag=0;
+	$success = runcommand "$aligncom --un-conc $aligndir/unaligned.fastq -x $refdir/genome_bowtie2 -1 $fr1 -2 $fr2 -S $alignsam";
+	runcommand "touch $aligndir/done.align" if ($success == 0);
+}
+
+print "\nMaking bam files... (at ".localtime().")\n";
+$alignbam = "$aligndir/aligned.bam";
+$alignsort = "$aligndir/aligned.sorted";
+unless ((-e "$aligndir/done.sambam") &&  ($fromhereflag)) {$fromhereflag=0;
+	runcommand "$samtools view -bS $alignsam > $alignbam";						#make bam file
+	runcommand "$samtools sort $alignbam $alignsort";							#make sorted bam file
+	$success = runcommand "$samtools index $alignsort.bam $alignsort.bai";		#index sorted bam file
+	runcommand "touch $aligndir/done.sambam" if ($success == 0);
+} 
+
+
+#use picard to add rungroups so gatk doesn't bork
+print "\nAdding read groups with picard... (at ".localtime().")\n";
+$alignwrg = "$aligndir/alignedwrg.bam";
+$alignwrgindex = "$aligndir/alignedwrg.bai";
+$picardcommand= "java -jar /opt/picard/AddOrReplaceReadGroups.jar";
+unless (-e "$aligndir/done.picard" &&  $fromhereflag) {$fromhereflag=0;
+	runcommand "$picardcommand INPUT=$alignsort.bam OUTPUT=$alignwrg SORT_ORDER=coordinate RGLB=MG1655 RGPL=Illumina RGPU=2 RGSM=$filename";
+	$success = runcommand "$samtools index $alignwrg $alignwrgindex";
+	runcommand "touch $aligndir/done.picard" if ($success == 0);
+}
+
+#realign with GATK
+print "\nRealigning with GATK... (at ".localtime().")\n";
+runcommand "module load dev/java/jdk1.7";
+$realigned = "$aligndir/realigned.bam";
+$intervals = "$aligndir/forIndelRealigner.intervals";
+unless (-e "$aligndir/done.gatk" &&  $fromhereflag) {$fromhereflag=0;
+	runcommand "$gatk -T RealignerTargetCreator -R $reffasta -I $alignwrg -o $intervals";
+	$success = runcommand "$gatk -T IndelRealigner -LOD 1.0 --entropyThreshold 0.10 -R $reffasta -I $alignwrg -targetIntervals $intervals -o $realigned";
+	runcommand "touch $aligndir/done.gatk" if ($success == 0);
+}
+
+
+
+#use gatk to recalibrate quality scores and remake the outputbam
+#use gatk recalibrater to recalibrate == note, without bootstrapping this has only been tested for Michael's data
+
+
+# print "\nRecalibrating Base Score Quality with GATK... (at ".localtime().")\n";
+# 
+ $target =~ m/\/([^\/]+)\/$filename\/$/;
+ $basedir = $1;
+# 
+ $recaldir = "$aligndir/recalibrated";
+# $recalbam = "$recaldir/qualrecalibrated.bam";
+ runcommand "mkdir -p $recaldir";
+# unless (-e "$recaldir/done.bqsr" &&  $fromhereflag) {$fromhereflag=0;
+# 	runcommand "$gatk -T BaseRecalibrator -R $reffasta -I $realigned -o $recaldir/recal_data.table --run_without_dbsnp_potentially_ruining_quality";
+# 	$success = runcommand "$gatk -T PrintReads -R $reffasta -I $realigned -BQSR $recaldir/recal_data.table -o $recalbam";
+# 	runcommand "touch $recaldir/done.bqsr" if ($success == 0);
+# }
+$recalbam = $realigned;
+
+print "\nMaking the pileup in a separate process... (at ".localtime().")\n";
+$pileup = "$recaldir/strain.pileup";
+$pileupouts = "$filename_pileups";
+runcommand "mkdir -p $basedir/runlogs/";
+
+$runlogbase = $basedir."/runlogs/".$pileupouts;
+unless (-e "$recaldir/done.pileup" && $fromhereflag) {$fromhereflag=0;
+	$pileupcommand = "$samtools mpileup -q30 -s -O -d3000 -C30 -f $reffasta $recalbam > $pileup";
+	$bsubcomm = "bsub -J ".$runtag."_".int(rand(5000))."_pileup -W 3:0 -o ".$runlogbase.".out -e ".$runlogbase.".err -q short ";
+	$pileupdone = "touch $recaldir/done.pileup";
+	runcommand $bsubcomm.'"'.$pileupcommand.";".$pileupdone.'"';
+}
+
+
+
+#### EVERYTHING BELOW HERE IS OBSOLETE
+
+
+#print "\nMaking the variant vcf in a separate process... (at ".localtime().")\n";
+
+#use GATK to make strain and variant VCFs
+
+#$samcommand1 = "$samtools mpileup -q30 -s -d3000 -C30 -ugf $refdir/genome.fasta $realigned  > $aligndir/strain";
+#$samcommand2 = "$bcftools view -g $aligndir/strain > $aligndir/strain.vcf";
+#$samcommand3 = "$bcftools view -vS $aligndir/strain.vcf > $aligndir/variant.vcf";
+#$vcfouts = "$filename_varvcf";
+#$runlogbase = $basedir."/runlogs/".$vcfouts;
+#$bsubcomm = "bsub -J ".$runtag."_".int(rand(5000))."_variant_vcf -W 1:55 -o ".$runlogbase.".out -e ".$runlogbase.".err -q short";
+#$vcfcommand1 = "$bsubcomm  \'$gatk -T HaplotypeCaller -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.recal.snps.indels.vcf\'";
+#$vcfcommand2 = "$gatk -T HaplotypeCaller -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.all.vcf --emitRefConfidence BP_RESOLUTION";
+#$vcfcommand1 = "$bsubcomm  \'$gatk -T UnifiedGenotyper -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.recal.snps.indels.vcf  -ploidy 1 -glm BOTH -nt 4\'";
+#$vcfcommand2 = "$gatk -T UnifiedGenotyper -R $reffasta -I $recalbam -o $recaldir/gatk.output.raw.all.vcf  -ploidy 1 -glm BOTH -out_mode EMIT_ALL_SITES -nt 4";
+
+
+
+#unless (-e "$recaldir/done.vcfs" && $fromhereflag && !(-z "$recaldir/gatk.output.raw.recal.snps.indels.vcf")) {$fromhereflag=0;
+#	runcommand $vcfcommand1;
+#	print "\nMaking the strain vcf... (at ".localtime().")\n";
+#	$success = runcommand $vcfcommand2;
+#	runcommand "touch $recaldir/done.vcfs" if ($success == 0);
+#}
+#$vcfdone = "touch $aligndir/done.vcfs";
+#	runcommand $samcommand1;
+#	runcommand $samcommand2;
+#	$success = runcommand $samcommand3;
+#	runcommand $vcfdone if ($success == 0);
+
+#print "\nVCFs finished at (at ".localtime().")\n";
+
+
+#
+#print "\nCleaning up... \n";
+#if ((-e "$aligndir/done.pileup")) {
+#	if (-e "$aligndir/done.vcfs") {
+#		runcommand "rm $aligndir/aligned.sam; rm $aligndir/strain";
+#	} else {
+#		runcommand "bsub -w \"done($target_vcf)\" -W 2 -q mini \"rm $aligndir/aligned.sam; rm $aligndir/strain\"";
+#	}
+#}
\ No newline at end of file