Modifications to several apps for searching the PMR, running TNRunner,

and parsing vst and tpm values for cBioPortal
HuntsmanCancerInstitute · Sep 11, 2023 · 56f6cac · 56f6cac
1 parent 71b68de
commit 56f6cac
Show file tree

Hide file tree

Showing 10 changed files with 761 additions and 99 deletions.
diff --git a/Source/edu/expr/NormalizedCountCBioFormater.java b/Source/edu/expr/NormalizedCountCBioFormater.java
@@ -0,0 +1,167 @@
+package edu.expr;
+
+import java.io.*;
+import java.util.regex.*;
+
+import edu.utah.hci.misc.Gzipper;
+
+import java.util.*;
+import util.gen.IO;
+import util.gen.Misc;
+
+/**Parses a DESeq2 normalized count table, swaps each Ensembl gene ID for its gene symbol
+ * from a lookup file, and writes a gzipped table for import into cBioPortal.*/
+public class NormalizedCountCBioFormater {
+
+	private File ensSymIdFile;
+	private HashMap<String,String> ensSymIds;
+	private File normCountFile;
+	private String dupKeyValDelim = ":";
+	private Pattern dupDelim = Pattern.compile(dupKeyValDelim);
+
+	/**Parses the command line, loads the lookup table, and converts the count file.*/
+	public NormalizedCountCBioFormater (String[] args){
+		processArgs(args);
+		//load key value file skipping duplicate keys and making duplicate values unique
+		IO.pl("Loading Ensembl 2 gene symbol lookup table...");
+		ensSymIds = IO.loadFileIntoHashMapUniqueValues(ensSymIdFile, dupKeyValDelim);
+		IO.pl("Number unique ensembl gene IDs "+ensSymIds.size());
+		//walk the norm count file
+		IO.pl("\nParsing "+normCountFile.getName());
+		parseAndSaveMatches();
+		IO.pl("\nDone!\n");
+	}
+
+	/**Reads the count file line by line and writes each row that has a gene symbol to
+	 * xxx_ForCBio.txt.gz next to the input, with the Ensembl ID replaced by the symbol.
+	 * Rows with the wrong column count, no symbol, or an already written symbol are skipped
+	 * and tallied. Any failure is reported and stops the run instead of falling through to
+	 * the caller's "Done!" with a missing or truncated output file.*/
+	public void parseAndSaveMatches(){
+		try {
+			HashSet<String> geneSymbols = new HashSet<String>();
+			File results = new File (normCountFile.getParentFile(), Misc.removeExtension(normCountFile.getName())+"_ForCBio.txt.gz");
+			BufferedReader in = IO.fetchBufferedReader(normCountFile);
+			//read in the header, this is just a dump of the column names
+			String line = in.readLine();
+			if (line == null) throw new IOException("No header line, is this file empty? "+normCountFile);
+			String[] tokens = Misc.TAB.split(line);
+			int numSamples = tokens.length;
+			IO.pl("\t"+numSamples+"\t#Samples");
+			Gzipper out = new Gzipper (results);
+			out.println("Hugo_Symbol\t"+line);
+
+			//counters
+			int totalLines = 0;
+			int numMatches = 0;
+			int numNoMatches = 0;
+			int numBadColumns = 0;
+			int numDuplicateSymbolsSkipped = 0;
+
+			while ((line = in.readLine())!=null){
+				totalLines++;
+				tokens = Misc.TAB.split(line);
+				//data rows carry one more column than the header, the leading ensembl id
+				if (tokens.length-1 != numSamples) {
+					numBadColumns++;
+					IO.el("\tSkipping, incorrect # columns -> "+line);
+					continue;
+				}
+
+				//look for the ensembl id in the lookup hash
+				String geneSymbol = ensSymIds.get(tokens[0]);
+				if (geneSymbol == null) {
+					numNoMatches++;
+					continue;
+				}
+
+				//drop any uniquifying suffix the loader appended after the delimiter
+				String symbol = geneSymbol;
+				if (geneSymbol.contains(dupKeyValDelim)) symbol = dupDelim.split(geneSymbol)[0];
+
+				//write each symbol once, first row wins; every written symbol is tracked, not just
+				//the suffixed ones, so a plain symbol followed by its suffixed twin is not repeated
+				if (geneSymbols.add(symbol) == false) {
+					IO.el("\tSkipping, duplicate symbol -> "+geneSymbol+ " line # "+totalLines);
+					numDuplicateSymbolsSkipped++;
+					continue;
+				}
+
+				//save the line
+				numMatches++;
+				out.print(symbol);
+				for (int i=1; i< tokens.length; i++) {
+					out.print("\t");
+					out.print(tokens[i]);
+				}
+				out.println();
+			}
+			IO.pl("\t"+totalLines+"\t# Data lines");
+			IO.pl("\t"+numMatches+"\t# Matches written to file");
+			IO.pl("\t"+ numNoMatches+"\t# No Matches");
+			IO.pl("\t"+ numBadColumns+"\t# Incorrect sample number column lines");
+			IO.pl("\t"+ numDuplicateSymbolsSkipped+"\t# Duplicate symbols");
+			//close the reader and writer IO
+			in.close();
+			out.close();
+		} catch (Exception e){
+			e.printStackTrace();
+			Misc.printErrAndExit("\nERROR: failed to convert "+normCountFile);
+		}
+	}
+
+	/**This method will process each argument and assign new variables*/
+	public void processArgs(String[] args){
+		IO.pl("\n"+IO.fetchUSeqVersion()+" Arguments: "+ Misc.stringArrayToString(args, " ") +"\n");
+		Pattern pat = Pattern.compile("-[a-z]");
+		for (int i = 0; i<args.length; i++){
+			String lcArg = args[i].toLowerCase();
+			Matcher mat = pat.matcher(lcArg);
+			if (mat.matches()){
+				//switch on the lower cased flag, the same text the pattern matched, so -N works like -n
+				char test = lcArg.charAt(1);
+				try{
+					switch (test){
+					case 'n': normCountFile = new File(args[i+1]); i++; break;
+					case 'e': ensSymIdFile = new File(args[i+1]); i++; break;
+					case 'h': printDocs(); System.exit(0);
+					default: Misc.printExit("\nError: unknown option! " + mat.group());
+					}
+				}
+				catch (Exception e){
+					//trace first, nothing after printExit is expected to run
+					e.printStackTrace();
+					Misc.printExit("\nSorry, something doesn't look right with this parameter: -"+test+"\n");
+				}
+			}
+		}
+		//check that both required files were provided and can be read
+		if (ensSymIdFile == null || ensSymIdFile.canRead() == false) Misc.printExit("\nCannot find your tab delimited ensembl geneSymbol ID lookup file?\n");
+		if (normCountFile == null || normCountFile.canRead() == false) Misc.printExit("\nCannot find your DESeq2 normalized count file?\n");
+	}
+
+	/**Prints the command line help.*/
+	public static void printDocs(){
+		IO.pl("\n" +
+				"**************************************************************************************\n" +
+				"**                          NormalizedCountCBioFormater:  August 2023               **\n" +
+				"**************************************************************************************\n" +
+				"Parses a DESeq2 output file containing normalized counts, converts the ensembl gene\n" +
+				"names to HUGO gene symbols, and saves the output ready for import into cBioPortal.\n"+
+				"-e Path to a tab delimited txt file containing Ensembl IDs and Gene Symbols.\n" +
+				"-n Path to a normalized count file from DESeq2, e.g. \n"+
+				"write.table(assay(vst(dds, blind=FALSE)), file = 'vst.txt', quote=FALSE, sep ='\\t')\n" +
+				"Example: java -jar pathTo/Apps/NormalizedCountCBioFormater -e ensembIDsGeneSym.txt\n"+
+				"    -n vst.txt\n" +
+		"**************************************************************************************\n");
+	}
+
+	/**Prints the help when called without arguments, otherwise runs the conversion.*/
+	public static void main(String[] args) {
+		if (args.length == 0) {
+			printDocs();
+			System.exit(0);
+		}
+		else new NormalizedCountCBioFormater(args);
+	}
+}
diff --git a/Source/edu/utah/billing/CBiBilling.java b/Source/edu/utah/billing/CBiBilling.java
@@ -52,6 +52,8 @@ public CBiBilling (String[] args) throws Exception{
 
 		parseAWSAccounts();
 
+		//add missing cloud WAFs!
+
 		printInvoices();
 
 		masterAccountInfo.saveUpdatedInfo();
@@ -61,7 +63,7 @@ public CBiBilling (String[] args) throws Exception{
 	}
 
 
-	private void printInvoices() {
+	private void printInvoices() throws IOException {
 		IO.pl("\nPrinting Invoices...");
 		HashMap<String, Float> groupNameHoursBilled = this.masterAccountInfo.getGroupNameHoursBilled();
 
@@ -117,6 +119,7 @@ private void printInvoices() {
 
 			//pull non cancer
 			String cancerStatus = masterAccountInfo.getGroupNameCancerStatus().get(groupName);
+			if (cancerStatus == null) throw new IOException ("ERROR: failed to find the cancer status for "+groupName);
 			if (cancerStatus.contains("Non")) {
 				float additionalBilling = hscCostSharing * totalHoursToBill;
 				nonCancerBilling.add(groupName+"\t"+Num.formatNumber(totalHoursToBill,3)+"\t$"+ Num.formatNumber(additionalBilling, 2));
@@ -150,12 +153,11 @@ private void printInvoices() {
 
 	private void printHSCBilling() {
 		IO.pl("\nHSC Invoice...");
-		IO.pl("GroupName\tHoursBilled\tHSCShare($"+(int)hscCostSharing+"/hour)");
+		IO.pl("GroupName\tHoursBilled\tHSCShare($"+(int)hscCostSharing+"/hour)\tHourlyChartfieldsOnFile");
 		for (String gl: nonCancerBilling) IO.pl(gl);
 		IO.pl("\nTotalHSCCostSharing:\t$"+ Num.formatNumber(totalHscBilling, 2));
 	}
 
-
 	private void printJustCloudInvoices() {
 		//remove any account numbers that were already printed with the hourly invoices
 		TreeMap<String, Float> accountNumberTotals = carahsoftParser.getAwsAccountNumberTotals();
@@ -271,8 +273,8 @@ private float printCloudInvoice(HashSet<String> groupAliases, ArrayList<String>
 	}
 
 
-	private void parseAWSAccounts() {
-		IO.pl("\nParsing AWS accounts...");
+	private void parseAWSAccounts() throws IOException {
+		IO.pl("\nParsing Carahsoft AWS account files...");
 		//any cloud reports?
 		if (cloudReportsDirectory != null) {
 			carahsoftParser = new CarahsoftXlsxParser(cloudReportsDirectory, debug);
@@ -503,7 +505,7 @@ private void parseHeader(String[] cells) throws IOException {
 		}
 
 		//check it contains the required cells
-		String[] toFind = {"Work Type", "Account Name", "Issue Key", "Full name", "Billed Hours", "Issue summary", "Work Description"};
+		String[] toFind = {"CBI - Work Type", "Account Name", "Issue Key", "Full name", "Billed Hours", "Issue summary", "Work Description"};
 
 		for (String tf: toFind) {
 			if (headerKeyIndex.containsKey(tf) == false) throw new IOException("Failed to find the '"+tf+"' header key in "+headerKeyIndex);
@@ -577,18 +579,20 @@ public void processArgs(String[] args){
 	public static void printDocs(){
 		System.out.println("\n" +
 				"**************************************************************************************\n" +
-				"**                                Correlate:    Nov 2008                            **\n" +
+				"**                               CBI Billing:    Sept 2023                          **\n" +
 				"**************************************************************************************\n" +
-				"Calculates all pair-wise Pearson correlation coefficients (r) and if indicated will\n" +
-				"perform a hierarchical clustering on the files.\n\n"+				
-
-				"Parameters:\n" +
-				"-d The full path directory text containing serialized java float[] files (xxx.celp\n"+
-				"      see CelProcessor app).\n"+
-				"-a Files provided are float[][] files (xxx.cela) and need to be collapsed to float[]\n"+
-				"-c Cluster files.\n\n" +
-
-				"Example: java -Xmx256M -jar pathTo/T2/Apps/Correlate -d /Mango/PCels/ -c -a\n\n" +
+				"Generates billing reports for the Cancer Bioinformatics Shared Resource.\n\n"+				
+
+				"Required Parameters:\n" +
+				"-j Path to the exported cvs Jira 'Logged Time' report.\n"+
+				"-m Path to the masterAccountInfo.xlsx spreadsheet updated from the prior month.\n"+
+				"-w Path to a dir containing the hourly and cloud 'WAF Tracking Schedule' xlsx files.\n" +
+				"-c Path to a dir containing the cloud AWS Carahsoft xlsx expense reports. May be empty\n"+
+				"      if are none available.\n"+
+				"-o Path to write the results.\n"+
+
+				"\nExample: java -Xmx256M -jar pathTo/USeq/Apps/CBiBilling -j jiraTime.cvs -m \n"+
+				"   masterAccountInfo.xlsx -w WAFs/ -c Carahsoft/ -o Invoices\n" +
 
 
 		"**************************************************************************************\n");		

diff --git a/Source/edu/utah/billing/CarahsoftXlsxParser.java b/Source/edu/utah/billing/CarahsoftXlsxParser.java
@@ -21,9 +21,10 @@ public class CarahsoftXlsxParser {
 	private int finalCostIndex = -1;
 	private int numParsedLines = 0;
 
-	public CarahsoftXlsxParser(File dir, boolean debug) {
+	public CarahsoftXlsxParser(File dir, boolean debug) throws IOException {
 		File[] xlsxFiles = IO.extractFiles(dir, ".xlsx");
-
+		if (xlsxFiles == null || xlsxFiles.length ==0) throw new IOException("ERROR: failed to find xlsx Carahsoft files in "+dir);
+
 		for (File xlsx: xlsxFiles) parseIt(xlsx);
 
 		float awsTotal = 0.0f;
@@ -48,7 +49,7 @@ public CarahsoftXlsxParser(File dir, boolean debug) {
 		}
 	}
 
-	public static void main(String[] args) {
+	public static void main(String[] args) throws IOException {
 		CarahsoftXlsxParser p = new CarahsoftXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/CarahsoftMay2023"), true);
 
 	}

diff --git a/Source/edu/utah/billing/JiraTicketSummary.java b/Source/edu/utah/billing/JiraTicketSummary.java
@@ -4,6 +4,7 @@
 import java.util.LinkedHashMap;
 
 import util.gen.IO;
+import util.gen.Misc;
 import util.gen.Num;
 
 public class JiraTicketSummary {
@@ -26,35 +27,39 @@ public class JiraTicketSummary {
 	private ArrayList<String> errors = new ArrayList<String>();
 
 	public JiraTicketSummary (String[] cells, String line, LinkedHashMap<String, Integer> headerKeyIndex) {
-
-
-		//pull Work Type, Hourly | Infrastructure | FTE - cannot be blank
-		workType = cells[headerKeyIndex.get("Work Type")].trim();
-		if (workType.length()==0) errors.add("Error: missing 'Work Type' in -> "+line);
-
-		//pull Account Name, Judson-Torres, Robert Lab - might be blank for Infrastructure
-		groupToBill = cells[headerKeyIndex.get("Account Name")].trim();
-		if (workType.startsWith("Infrastructure")== false && groupToBill.length()==0) errors.add("Error: missing 'Account Name' for a non-infrastructure job in -> "+line);
-		if (workType.startsWith("Infrastructure") && groupToBill.length()!=0) errors.add("Error: Infrastructure job has an Account Name -> "+line);
-
-		//pull Issue Key, BSD-642 - cannot be blank
-		jiraTicket = cells[headerKeyIndex.get("Issue Key")].trim();
-		if (jiraTicket.length()==0) errors.add("Error: missing 'Issue Key' in -> "+line);
-
-		//pull Full name, Timothy Parnell - cannot be blank
-		analystName = cells[headerKeyIndex.get("Full name")].trim();
-		if (analystName.length()==0) errors.add("Error: missing 'Full name' in -> "+line);
-
-		//pull Billed Hours, 2 - cannot be blank
-		hoursString = cells[headerKeyIndex.get("Billed Hours")].trim();
-		if (hoursString.length()==0) errors.add("Error: missing 'Billed Hours' in -> "+line);
-
-		//pull Issue summary, ChIP Seq data analysis - can be blank
-		issueSummary = cells[headerKeyIndex.get("Issue summary")].trim();
-
-		//pull Work Description, Evaluate new results. Generate new plots. Annotate peaks. Post to GNomEx and Email. - can be blank
-		workPerformed = cells[headerKeyIndex.get("Work Description")].trim();
-
+		//a missing header key or a row with too few cells throws below and is reported by the catch
+		try {
+			//pull CBI - Work Type, Hourly | Infrastructure | FTE - cannot be blank
+			workType = cells[headerKeyIndex.get("CBI - Work Type")].trim();
+			if (workType.length()==0) errors.add("Error: missing 'CBI - Work Type' in -> "+line);
+
+			//pull Account Name, Judson-Torres, Robert Lab - might be blank for Infrastructure
+			groupToBill = cells[headerKeyIndex.get("Account Name")].trim();
+			if (workType.startsWith("Infrastructure")== false && groupToBill.length()==0) errors.add("Error: missing 'Account Name' for a non-infrastructure job in -> "+line);
+			if (workType.startsWith("Infrastructure") && groupToBill.length()!=0) errors.add("Error: Infrastructure job has an Account Name -> "+line);
+
+			//pull Issue Key, BSD-642 - cannot be blank
+			jiraTicket = cells[headerKeyIndex.get("Issue Key")].trim();
+			if (jiraTicket.length()==0) errors.add("Error: missing 'Issue Key' in -> "+line);
+
+			//pull Full name, Timothy Parnell - cannot be blank
+			analystName = cells[headerKeyIndex.get("Full name")].trim();
+			if (analystName.length()==0) errors.add("Error: missing 'Full name' in -> "+line);
+
+			//pull Billed Hours, 2 - cannot be blank
+			hoursString = cells[headerKeyIndex.get("Billed Hours")].trim();
+			if (hoursString.length()==0) errors.add("Error: missing 'Billed Hours' in -> "+line);
+
+			//pull Issue summary, ChIP Seq data analysis - can be blank
+			issueSummary = cells[headerKeyIndex.get("Issue summary")].trim();
+
+			//pull Work Description, Evaluate new results. Generate new plots. Annotate peaks. Post to GNomEx and Email. - can be blank
+			workPerformed = cells[headerKeyIndex.get("Work Description")].trim();
+		} catch (Exception e) {
+			e.printStackTrace();
+			Misc.printErrAndExit("Bad line, check for extra line return -> "+line);
+		}
+
 	}
 
 	public static final String getToStringHeader() {