Skip to content

Commit

Permalink
Modifications to several apps for searching the PMR, running TNRunner,
Browse files Browse the repository at this point in the history
and parsing vst and tpm values for cBioPortal
  • Loading branch information
u0028003 committed Sep 11, 2023
1 parent 71b68de commit 56f6cac
Show file tree
Hide file tree
Showing 10 changed files with 761 additions and 99 deletions.
167 changes: 167 additions & 0 deletions Source/edu/expr/NormalizedCountCBioFormater.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package edu.expr;

import java.io.*;
import java.util.regex.*;

import edu.utah.hci.misc.Gzipper;

import java.util.*;
import util.gen.IO;
import util.gen.Misc;

public class NormalizedCountCBioFormater {

private File ensSymIdFile;
private HashMap<String,String> ensSymIds;
private File normCountFile;
private String dupKeyValDelim = ":";
private Pattern dupDelim = Pattern.compile(dupKeyValDelim);

public NormalizedCountCBioFormater (String[] args){
processArgs(args);

//load key value file skipping duplicate keys and making duplicate values unique
IO.pl("Loading Ensembl 2 gene symbol lookup table...");
ensSymIds = IO.loadFileIntoHashMapUniqueValues(ensSymIdFile, dupKeyValDelim);
IO.pl("Number unique ensembl gene IDs "+ensSymIds.size());

//walk the norm count file
IO.pl("\nParsing "+normCountFile.getName());
parseAndSaveMatches();

IO.pl("\nDone!\n");
}


//here, test and run it!

public void parseAndSaveMatches(){
try {
HashSet<String> geneSymbols = new HashSet<String>();
File results = new File (normCountFile.getParentFile(), Misc.removeExtension(normCountFile.getName())+"_ForCBio.txt.gz");
Gzipper out = new Gzipper (results);
BufferedReader in = IO.fetchBufferedReader(normCountFile);

//read in the header, this is just a dump of the column names
String line = in.readLine();
String[] tokens = Misc.TAB.split(line);
int numSamples = tokens.length;
IO.pl("\t"+numSamples+"\t#Samples");
out.println("Hugo_Symbol\t"+line);

//counters
int totalLines = 0;
int numMatches = 0;
int numNoMatches = 0;
int numBadColumns = 0;
int numDuplicateSymbolsSkipped = 0;

while ((line = in.readLine())!=null){
totalLines++;
tokens = Misc.TAB.split(line);
if (tokens.length-1 != numSamples) {
numBadColumns++;
IO.el("\tSkipping, incorrect # columns -> "+line);
continue;
}

//look for the ensembl id in the lookup hash
String ensId = tokens[0];
String geneSymbol = ensSymIds.get(ensId);
if (geneSymbol!= null) {

//does it contain a value delimiter? If so check it
if (geneSymbol.contains(dupKeyValDelim)) {
String[] symbolKey = dupDelim.split(geneSymbol);
//already seen?
if (geneSymbols.contains(symbolKey[0])) {
IO.el("\tSkipping, duplicate symbol -> "+geneSymbol+ " line # "+totalLines);
numDuplicateSymbolsSkipped++;
continue;
}
else {
geneSymbols.add(symbolKey[0]);
geneSymbol = symbolKey[0];
}
}

//save the line
numMatches++;
out.print(geneSymbol);
for (int i=1; i< tokens.length; i++) {
out.print("\t");
out.print(tokens[i]);
}
out.println();
}
else numNoMatches++;
}
IO.pl("\t"+totalLines+"\t# Data lines");
IO.pl("\t"+numMatches+"\t# Matches written to file");
IO.pl("\t"+ numNoMatches+"\t# No Matches");
IO.pl("\t"+ numBadColumns+"\t# Incorrect sample number column lines");
IO.pl("\t"+ numDuplicateSymbolsSkipped+"\t# Duplicate symbols");

//close the reader and writer IO
in.close();
out.close();
} catch (Exception e){
e.printStackTrace();
}
}

/**This method will process each argument and assign new varibles*/
public void processArgs(String[] args){
IO.pl("\n"+IO.fetchUSeqVersion()+" Arguments: "+ Misc.stringArrayToString(args, " ") +"\n");
Pattern pat = Pattern.compile("-[a-z]");
for (int i = 0; i<args.length; i++){
String lcArg = args[i].toLowerCase();
Matcher mat = pat.matcher(lcArg);
if (mat.matches()){
char test = args[i].charAt(1);
try{
switch (test){
case 'n': normCountFile = new File(args[i+1]); i++; break;
case 'e': ensSymIdFile = new File(args[i+1]); i++; break;
case 'h': printDocs(); System.exit(0);
default: Misc.printExit("\nError: unknown option! " + mat.group());
}
}
catch (Exception e){
Misc.printExit("\nSorry, something doesn't look right with this parameter: -"+test+"\n");
e.printStackTrace();
}
}
}
//check agilentFiles
if (ensSymIdFile == null ) Misc.printExit("\nCannot find your tab delimited ensembl geneSymbol ID lookup file?\n");
if (normCountFile == null) Misc.printExit("\nCannot find your DESeq2 normalized count file?\n");
}

public static void printDocs(){
IO.pl("\n" +
"**************************************************************************************\n" +
"** NormalizedCountCBioFormater: August 2023 **\n" +
"**************************************************************************************\n" +
"Parses a DESeq2 output file containing normalized counts, converts the ensembl gene\n" +
"names to HUGO gene symbols, and saves the output ready for import into cBioPortal.\n"+

"-e Path a tab delimited txt file containing Ensembl IDs and Gene Symbols.\n" +
"-n Path to a normalized count file from DESeq2, e.g. \n"+
"write.table(assay(vst(dds, blind=FALSE)), file = 'vst.txt', quote=FALSE, sep ='\\t')'.\n" +

"Example: java -jar pathTo/Apps/NormalizedCountCBioFormater -e ensembIDsGeneSym.txt\n"+
" -n vst.txt\n" +
"**************************************************************************************\n");
}

public static void main(String[] args) {
if (args.length == 0) {
printDocs();
System.exit(0);
}
else new NormalizedCountCBioFormater(args);
}


}
38 changes: 21 additions & 17 deletions Source/edu/utah/billing/CBiBilling.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public CBiBilling (String[] args) throws Exception{

parseAWSAccounts();

//add missing cloud WAFs!

printInvoices();

masterAccountInfo.saveUpdatedInfo();
Expand All @@ -61,7 +63,7 @@ public CBiBilling (String[] args) throws Exception{
}


private void printInvoices() {
private void printInvoices() throws IOException {
IO.pl("\nPrinting Invoices...");
HashMap<String, Float> groupNameHoursBilled = this.masterAccountInfo.getGroupNameHoursBilled();

Expand Down Expand Up @@ -117,6 +119,7 @@ private void printInvoices() {

//pull non cancer
String cancerStatus = masterAccountInfo.getGroupNameCancerStatus().get(groupName);
if (cancerStatus == null) throw new IOException ("ERROR: failed to find the cancer status for "+groupName);
if (cancerStatus.contains("Non")) {
float additionalBilling = hscCostSharing * totalHoursToBill;
nonCancerBilling.add(groupName+"\t"+Num.formatNumber(totalHoursToBill,3)+"\t$"+ Num.formatNumber(additionalBilling, 2));
Expand Down Expand Up @@ -150,12 +153,11 @@ private void printInvoices() {

private void printHSCBilling() {
IO.pl("\nHSC Invoice...");
IO.pl("GroupName\tHoursBilled\tHSCShare($"+(int)hscCostSharing+"/hour)");
IO.pl("GroupName\tHoursBilled\tHSCShare($"+(int)hscCostSharing+"/hour)\tHourlyChartfieldsOnFile");
for (String gl: nonCancerBilling) IO.pl(gl);
IO.pl("\nTotalHSCCostSharing:\t$"+ Num.formatNumber(totalHscBilling, 2));
}


private void printJustCloudInvoices() {
//remove any account numbers that were already printed with the hourly invoices
TreeMap<String, Float> accountNumberTotals = carahsoftParser.getAwsAccountNumberTotals();
Expand Down Expand Up @@ -271,8 +273,8 @@ private float printCloudInvoice(HashSet<String> groupAliases, ArrayList<String>
}


private void parseAWSAccounts() {
IO.pl("\nParsing AWS accounts...");
private void parseAWSAccounts() throws IOException {
IO.pl("\nParsing Carahsoft AWS account files...");
//any cloud reports?
if (cloudReportsDirectory != null) {
carahsoftParser = new CarahsoftXlsxParser(cloudReportsDirectory, debug);
Expand Down Expand Up @@ -503,7 +505,7 @@ private void parseHeader(String[] cells) throws IOException {
}

//check it contains the required cells
String[] toFind = {"Work Type", "Account Name", "Issue Key", "Full name", "Billed Hours", "Issue summary", "Work Description"};
String[] toFind = {"CBI - Work Type", "Account Name", "Issue Key", "Full name", "Billed Hours", "Issue summary", "Work Description"};

for (String tf: toFind) {
if (headerKeyIndex.containsKey(tf) == false) throw new IOException("Failed to find the '"+tf+"' header key in "+headerKeyIndex);
Expand Down Expand Up @@ -577,18 +579,20 @@ public void processArgs(String[] args){
public static void printDocs(){
System.out.println("\n" +
"**************************************************************************************\n" +
"** Correlate: Nov 2008 **\n" +
"** CBI Billing: Sept 2023 **\n" +
"**************************************************************************************\n" +
"Calculates all pair-wise Pearson correlation coefficients (r) and if indicated will\n" +
"perform a hierarchical clustering on the files.\n\n"+

"Parameters:\n" +
"-d The full path directory text containing serialized java float[] files (xxx.celp\n"+
" see CelProcessor app).\n"+
"-a Files provided are float[][] files (xxx.cela) and need to be collapsed to float[]\n"+
"-c Cluster files.\n\n" +

"Example: java -Xmx256M -jar pathTo/T2/Apps/Correlate -d /Mango/PCels/ -c -a\n\n" +
"Generates billing reports for the Cancer Bioinformatics Shared Resource.\n\n"+

"Required Parameters:\n" +
"-j Path to the exported cvs Jira 'Logged Time' report.\n"+
"-m Path to the masterAccountInfo.xlsx spreadsheet updated from the prior month.\n"+
"-w Path to a dir containing the hourly and cloud 'WAF Tracking Schedule' xlsx files.\n" +
"-c Path to a dir containing the cloud AWS Carahsoft xlsx expense reports. May be empty\n"+
" if are none available.\n"+
"-o Path to write the results.\n"+

"\nExample: java -Xmx256M -jar pathTo/USeq/Apps/CBiBilling -j jiraTime.cvs -m \n"+
" masterAccountInfo.xlsx -w WAFs/ -c Carahsoft/ -o Invoices\n" +


"**************************************************************************************\n");
Expand Down
7 changes: 4 additions & 3 deletions Source/edu/utah/billing/CarahsoftXlsxParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ public class CarahsoftXlsxParser {
private int finalCostIndex = -1;
private int numParsedLines = 0;

public CarahsoftXlsxParser(File dir, boolean debug) {
public CarahsoftXlsxParser(File dir, boolean debug) throws IOException {
File[] xlsxFiles = IO.extractFiles(dir, ".xlsx");

if (xlsxFiles == null || xlsxFiles.length ==0) throw new IOException("ERROR: failed to find xlsx Carahsoft files in "+dir);

for (File xlsx: xlsxFiles) parseIt(xlsx);

float awsTotal = 0.0f;
Expand All @@ -48,7 +49,7 @@ public CarahsoftXlsxParser(File dir, boolean debug) {
}
}

public static void main(String[] args) {
public static void main(String[] args) throws IOException {
CarahsoftXlsxParser p = new CarahsoftXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/CarahsoftMay2023"), true);

}
Expand Down
63 changes: 34 additions & 29 deletions Source/edu/utah/billing/JiraTicketSummary.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.util.LinkedHashMap;

import util.gen.IO;
import util.gen.Misc;
import util.gen.Num;

public class JiraTicketSummary {
Expand All @@ -26,35 +27,39 @@ public class JiraTicketSummary {
private ArrayList<String> errors = new ArrayList<String>();

public JiraTicketSummary (String[] cells, String line, LinkedHashMap<String, Integer> headerKeyIndex) {


//pull Work Type, Hourly | Infrastructure | FTE - cannot be blank
workType = cells[headerKeyIndex.get("Work Type")].trim();
if (workType.length()==0) errors.add("Error: missing 'Work Type' in -> "+line);

//pull Account Name, Judson-Torres, Robert Lab - might be blank for Infrastructure
groupToBill = cells[headerKeyIndex.get("Account Name")].trim();
if (workType.startsWith("Infrastructure")== false && groupToBill.length()==0) errors.add("Error: missing 'Account Name' for a non-infrastructure job in -> "+line);
if (workType.startsWith("Infrastructure") && groupToBill.length()!=0) errors.add("Error: Infrastructure job has an Account Name -> "+line);

//pull Issue Key, BSD-642 - cannot be blank
jiraTicket = cells[headerKeyIndex.get("Issue Key")].trim();
if (jiraTicket.length()==0) errors.add("Error: missing 'Issue Key' in -> "+line);

//pull Full name, Timothy Parnell - cannot be blank
analystName = cells[headerKeyIndex.get("Full name")].trim();
if (analystName.length()==0) errors.add("Error: missing 'Full name' in -> "+line);

//pull Billed Hours, 2 - cannot be blank
hoursString = cells[headerKeyIndex.get("Billed Hours")].trim();
if (hoursString.length()==0) errors.add("Error: missing 'Billed Hours' in -> "+line);

//pull Issue summary, ChIP Seq data analysis - can be blank
issueSummary = cells[headerKeyIndex.get("Issue summary")].trim();

//pull Work Description, Evaluate new results. Generate new plots. Annotate peaks. Post to GNomEx and Email. - can be blank
workPerformed = cells[headerKeyIndex.get("Work Description")].trim();


try {
//pull Work Type, Hourly | Infrastructure | FTE - cannot be blank
workType = cells[headerKeyIndex.get("CBI - Work Type")].trim();
if (workType.length()==0) errors.add("Error: missing 'Work Type' in -> "+line);

//pull Account Name, Judson-Torres, Robert Lab - might be blank for Infrastructure
groupToBill = cells[headerKeyIndex.get("Account Name")].trim();
if (workType.startsWith("Infrastructure")== false && groupToBill.length()==0) errors.add("Error: missing 'Account Name' for a non-infrastructure job in -> "+line);
if (workType.startsWith("Infrastructure") && groupToBill.length()!=0) errors.add("Error: Infrastructure job has an Account Name -> "+line);

//pull Issue Key, BSD-642 - cannot be blank
jiraTicket = cells[headerKeyIndex.get("Issue Key")].trim();
if (jiraTicket.length()==0) errors.add("Error: missing 'Issue Key' in -> "+line);

//pull Full name, Timothy Parnell - cannot be blank
analystName = cells[headerKeyIndex.get("Full name")].trim();
if (analystName.length()==0) errors.add("Error: missing 'Full name' in -> "+line);

//pull Billed Hours, 2 - cannot be blank
hoursString = cells[headerKeyIndex.get("Billed Hours")].trim();
if (hoursString.length()==0) errors.add("Error: missing 'Billed Hours' in -> "+line);

//pull Issue summary, ChIP Seq data analysis - can be blank
issueSummary = cells[headerKeyIndex.get("Issue summary")].trim();

//pull Work Description, Evaluate new results. Generate new plots. Annotate peaks. Post to GNomEx and Email. - can be blank
workPerformed = cells[headerKeyIndex.get("Work Description")].trim();
} catch (Exception e) {
e.printStackTrace();
Misc.printErrAndExit("Bad line, check for extra line return -> "+line);
}

}

public static final String getToStringHeader() {
Expand Down
Loading

0 comments on commit 56f6cac

Please sign in to comment.