Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
evolarjun committed Oct 16, 2023
2 parents 185a69f + c0f7904 commit 01dbead
Show file tree
Hide file tree
Showing 15 changed files with 475 additions and 355 deletions.
36 changes: 23 additions & 13 deletions amr_report.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ map <string/*accession*/, Susceptible> accession2susceptible;

bool cdsExist = false;
bool print_node = false;
bool print_node_raw = false;

bool reportPseudo = false;
//const string stopCodonS ("[stop]");
Expand Down Expand Up @@ -698,7 +699,12 @@ struct BlastAlignment : Alignment
}
}
if (print_node)
td << famId;
{
if (! print_node_raw && allele () && ! refExactlyMatched ())
td << gene;
else
td << famId;
}
IMPLY ((isMutationProt () && ! seqChange. empty () && mut && ! seqChange. replacement), hasMutation ());
if ( ! isMutationProt ()
|| (! seqChange. empty () && mut && ! seqChange. replacement) // resistant mutation
Expand Down Expand Up @@ -808,7 +814,7 @@ struct BlastAlignment : Alignment
}
public:
string fusion2geneSymbols () const
{ ASSERT (! isMutationProt ());
{ ASSERT (! isMutationProt ());
if (fusions. empty ())
return getGeneSymbol ();
string s;
Expand Down Expand Up @@ -954,15 +960,14 @@ struct BlastAlignment : Alignment
&& refCoverage () >= br. ref_coverage - frac_delta
;
}
bool partialPseudo () const
{ return partial ()
&& ! cdss. empty ()
&& ! truncatedCds ();
}
bool pseudo () const
{ if (stopCodon)
return true;
if ( partial ()
&& ! cdss. empty ()
&& ! truncatedCds ()
)
return true;
return false;
{ return stopCodon
|| partialPseudo ();
}
bool good () const
{ if (refAccession. empty ())
Expand Down Expand Up @@ -999,7 +1004,7 @@ struct BlastAlignment : Alignment
&& targetEnd <= other. targetEnd + mismatchTailTarget ()
)
return true;
if ( ! pseudo ()
if ( ! partialPseudo ()
|| isMutationProt ()
|| other. isMutationProt ()
|| fusion2geneSymbols () != other. fusion2geneSymbols ()
Expand Down Expand Up @@ -1102,6 +1107,7 @@ struct BlastAlignment : Alignment
)
if ( ! targetProt
|| ( ! isMutationProt () // PD-4722
&& ! other. isMutationProt () // PD-4755
&& fusion2geneSymbols () != other. fusion2geneSymbols ()
)
) // PD-4687
Expand Down Expand Up @@ -1397,7 +1403,7 @@ struct Batch
}
catch (const exception &e)
{
throw runtime_error ("Cannot read " + famFName +", line " + toString (f. lineNum) + "\n" + e. what ());
throw runtime_error ("Cannot read " + famFName +", " + f. lineStr () + "\n" + e. what ());
}
}
{
Expand Down Expand Up @@ -2150,7 +2156,8 @@ struct ThisApplication : Application

// Output
addKey ("out", "Identifiers of the reported input proteins");
addFlag ("print_node", "Print FAM.id");
addFlag ("print_node", "Print FAM.id replaced by FAM.parent for non-exact allele matches");
addFlag ("print_node_raw", "Print FAM.id");
addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP"); // or FRAME_SHIFT
addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist");
addFlag ("non_reportable", "Report non-reportable families");
Expand Down Expand Up @@ -2195,6 +2202,7 @@ struct ThisApplication : Application
equidistant = getFlag ("report_equidistant");
const string outFName = getArg ("out");
print_node = getFlag ("print_node");
print_node_raw = getFlag ("print_node_raw");
reportPseudo = getFlag ("pseudo");
const bool force_cds_report = getFlag ("force_cds_report");
const bool non_reportable = getFlag ("non_reportable");
Expand All @@ -2221,6 +2229,8 @@ struct ThisApplication : Application
QC_ASSERT (! lcl);
QC_ASSERT (! bedP);
}

QC_IMPLY (print_node_raw, print_node);


if (ident_min == -1.0)
Expand Down
104 changes: 64 additions & 40 deletions amrfinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,15 @@
* gunzip (optional)
*
* Release changes:
* 3.11.26 10/16/2023 PD-4772 Remove prodigal GFF format from AMRFinderPlus
* 3.11.25 10/13/2023 PD-4771 Revert removing '*' from Prodigal output to ensure ALLELEP and EXACTP matches ??
* 3.11.24 10/12/2023 PD-4769 --print_node prints FAM.id replaced by FAM.parent for non-exact allele matches
* 3.11.23 10/06/2023 PD-4764 Remove '*' from Prodigal output to ensure ALLELEP and EXACTP matches
* 10/05/2023 PD-4761 Remove protein sequences with >= 20 Xs
* 3.11.22 10/05/2023 PD-4754 Prodigal GFF
* 3.11.21 10/02/2023 PD-4755 bug: calling fusion2geneSymbols() for a mutation protein
* 3.11.20 09/06/2023 PD-4722 bug: calling fusion2geneSymbols() for a mutation protein
* TERM codes are printed only when output is to screen
* color codes are printed only when output is to screen
* 3.11.19 08/09/2023 PD-4698 if a pseudogene is overlapped by a different gene on the length >= 20 aa with the same gene symbol then the pseudogene is not reported
* 08/04/2023 PD-4706 protein match overrides a nucleotide match for point mutations
* 3.11.18 07/25/2023 parameter order in instruction; "can be gzipped" is added to help
Expand Down Expand Up @@ -281,6 +288,7 @@ constexpr size_t threads_def = 4;
// Cf. amr_report.cpp
constexpr double ident_min_def = 0.9;
constexpr double partial_coverage_min_def = 0.5;
const string ambigS ("20");


#define HELP \
Expand Down Expand Up @@ -310,7 +318,10 @@ struct ThisApplication : ShellApplication
addKey ("protein", "Input protein FASTA file (can be gzipped)", "", 'p', "PROT_FASTA");
addKey ("nucleotide", "Input nucleotide FASTA file (can be gzipped)", "", 'n', "NUC_FASTA");
addKey ("gff", "GFF file for protein locations (can be gzipped). Protein id should be in the attribute 'Name=<id>' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE");
addKey ("annotation_format", "Type of GFF file: " + Gff::names. toString (", "), "genbank", 'a', "ANNOTATION_FORMAT");

string annots (Gff::names. toString (", "));
replaceStr (annots, ", prodigal", ""); // PD-4772 ??
addKey ("annotation_format", "Type of GFF file: " + annots, "genbank", 'a', "ANNOTATION_FORMAT");

addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR");
addFlag ("database_version", "Print database version", 'V');
Expand Down Expand Up @@ -359,31 +370,25 @@ struct ThisApplication : ShellApplication



bool fastaCheck (const string &fName,
void fastaCheck (const string &fName,
bool prot,
const string &qcS,
const string &logFName,
size_t &nSeq,
size_t &len_max,
size_t &len_total) const
// Input: fName: quoted
// Return: false <=> hyphen in protein FASTA
size_t &len_total,
const string &outFName) const
// Input: fName, outFName: quoted
{
try
ASSERT (fName != logFName);
if (! outFName. empty ())
{
exec (fullProg ("fasta_check") + fName + " " + (prot ? "-aa" : "-len "+ tmp + "/len") + (prot ? "" : " -hyphen") + qcS + " -log " + logFName + " > " + tmp + "/nseq", logFName);
}
catch (...)
{
if (prot)
{
LineInput f (logFName);
while (f. nextLine ())
if (contains (f. line, "hyphen in the sequence")) // Cf. fasta_check.cpp
return false;
}
throw;
}
ASSERT (outFName != fName);
ASSERT (outFName != logFName);
}
exec (fullProg ("fasta_check") + fName + " " + (prot ? "-aa -stop_codon -ambig_max " + ambigS + prependS (outFName, " -out ") : "-len " + tmp + "/len -hyphen -ambig") + qcS + " -log " + logFName + " > " + tmp + "/nseq", logFName);
// "-stop_codon" PD-4771 ??

const StringVector vec (tmp + "/nseq", (size_t) 10, true);
if (vec. size () != 3)
throw runtime_error (string (prot ? "Protein" : "DNA") + " fasta_check failed: " + vec. toString ("\n"));
Expand All @@ -393,7 +398,6 @@ struct ThisApplication : ShellApplication
QC_ASSERT (nSeq);
QC_ASSERT (len_max);
QC_ASSERT (len_total);
return true;
}


Expand Down Expand Up @@ -942,26 +946,45 @@ struct ThisApplication : ShellApplication
size_t nProt = 0;
size_t protLen_max = 0;
size_t protLen_total = 0;
if (! fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total))

try
{
fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total, noString);
}
catch (...)
{
bool fixable = false;
{
LineInput f (logFName);
while (f. nextLine ())
// Cf. fasta_check.cpp
if (contains (f. line, "Hyphen in the sequence"))
{
const Warning warning (stderr);
stderr << "Ignoring dash '-' characters in the sequences of the protein file " << prot;
fixable = true;
break;
}
else if (contains (f. line, "Too many ambiguities"))
{
const Warning warning (stderr);
stderr << "Removing sequences with >= " << ambigS << " Xs from the protein file " << prot;
fixable = true;
break;
}
else if (contains (f. line, "'*' at the sequence end"))
{
const Warning warning (stderr);
stderr << "Removing '*' from the ends of protein sequences in " << prot;
fixable = true;
break;
}
}
if (! fixable)
throw;
prot1 = shellQuote (tmp + "/prot");
OFStream outF (unQuote (prot1));
LineInput f (unQuote (prot_flat));
while (f. nextLine ())
{
trimTrailing (f. line);
if (f. line. empty ())
continue;
if (f. line [0] != '>')
replaceStr (f. line, "-", "");
outF << f. line << endl;
}
{
const Warning warning (stderr);
stderr << "Ignoring dash '-' characters in the sequences of the protein file " << prot;
}
EXEC_ASSERT (fastaCheck (prot1, true, qcS, logFName, nProt, protLen_max, protLen_total));
}
fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total, prot1);
}

// gff_check
if (! emptyArg (gff) && ! contains (parm, "-bed"))
Expand All @@ -984,6 +1007,7 @@ struct ThisApplication : ShellApplication
}
break;
case Gff::microscope: gffProtMatchP = true; break;
case Gff::prodigal: gffProtMatchP = true; break;
default: break;
}
if (gffProtMatchP)
Expand Down Expand Up @@ -1067,7 +1091,7 @@ struct ThisApplication : ShellApplication
size_t nDna = 0;
size_t dnaLen_max = 0;
size_t dnaLen_total = 0;
EXEC_ASSERT (fastaCheck (dna_flat, false, qcS, logFName, nDna, dnaLen_max, dnaLen_total));
/*EXEC_ASSERT (*/ fastaCheck (dna_flat, false, qcS, logFName, nDna, dnaLen_max, dnaLen_total, noString); // );
const string blastx (/*"tblastn"*/ dnaLen_max > 100000 ? "tblastn" : "blastx"); // PAR // SB-3643

stderr. section ("Running " + blastx);
Expand Down
Loading

0 comments on commit 01dbead

Please sign in to comment.