Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up and test sample metadata barplot computations #313

Merged
merged 27 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ba65a87
MNT: Abstract+simplify sm barplots/getObsCountsBy
fedarko Aug 6, 2020
c0e023c
STY: prettier
fedarko Aug 6, 2020
9703416
Merge branch 'master' of https://github.com/biocore/empress into sm-b…
fedarko Aug 6, 2020
5a5479b
MNT: ++ instead of += 1 consistently in biom table
fedarko Aug 6, 2020
13e0272
MNT: document+rename getObsCountsAndTotalBy()
fedarko Aug 6, 2020
898a0b3
MNT: add sped up? freq map code
fedarko Aug 6, 2020
32174c0
STY: prettify
fedarko Aug 6, 2020
d10c4ed
MNT: use 2D arrays internally for freq map comp
fedarko Aug 7, 2020
b59ba6e
MNT: remove now-unused code i added earlier
fedarko Aug 7, 2020
e541686
STY: prettify
fedarko Aug 7, 2020
c1440e6
DOC: improve documentation for freq map #298
fedarko Aug 7, 2020
3cb4771
DOC: add extra context to sm barplot drawing func
fedarko Aug 7, 2020
65dfcc2
STY: fix redundant variable declaration
fedarko Aug 7, 2020
f08638d
TST: test getFrequencyMap() - close #298
fedarko Aug 7, 2020
e91538a
DOC: document getfrequencymap output a bit more
fedarko Aug 7, 2020
7083d67
grammar
fedarko Aug 8, 2020
3735832
Add extra freqmap test
fedarko Aug 8, 2020
1ece062
DOC: improve uniqueVal docs in sm barplot drawing
fedarko Aug 8, 2020
f685f99
TST: expand getFrequencyMap() tests
fedarko Aug 8, 2020
7890a17
Update empress/support_files/js/biom-table.js
fedarko Aug 11, 2020
2b8626d
MNT: rename remaining fIdx2... vars
fedarko Aug 11, 2020
5fa827b
MNT: fID2freqs -> fID2Freqs
fedarko Aug 11, 2020
dc6476b
DOC: more explicit about smb proportions in README
fedarko Aug 11, 2020
3aeec27
DOC: clarify presentFeatureIndices usage a bit?
fedarko Aug 11, 2020
d2b7d37
DOC: add note re a future optimization #313
fedarko Aug 11, 2020
80770a8
MNT: Simplify iteration in SM barplot drawing
fedarko Aug 11, 2020
472a248
STY: prettify
fedarko Aug 11, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 91 additions & 2 deletions empress/support_files/js/biom-table.js
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ define(["underscore", "util"], function (_, util) {
// This sample actually contains the feature!
cVal = scope._sm[sIdx][colIdx];
// Update our output Object's count info accordingly.
valueToCountOfSampleWithObs[cVal] += 1;
valueToCountOfSampleWithObs[cVal]++;
}
});
return valueToCountOfSampleWithObs;
Expand Down Expand Up @@ -472,13 +472,102 @@ define(["underscore", "util"], function (_, util) {
var sampleIdx = scope._getSampleIndexFromID(sID);
var cVal = scope._sm[sampleIdx][colIdx];
if (_.has(valueToSampleCount, cVal)) {
valueToSampleCount[cVal] += 1;
valueToSampleCount[cVal]++;
} else {
valueToSampleCount[cVal] = 1;
}
});
return valueToSampleCount;
};

/**
* Maps each feature ID in the table to a "frequencies" Object for a sample
* metadata field.
*
* Each "frequencies" Object contains information on the number of samples
* from each unique sample metadata value that contain the feature ID in
* question. Keys in these objects are unique sample metadata values, and
* values in these objects are the proportion of samples containing the
* feature that have this unique value. Only frequency information for
* unique values where at least 1 sample with this value contains the
* feature is included in a given "frequencies" Object.
*
* This function is designed to be reasonably fast, which is a big part of
* why this works on the order of "each feature ID in the table" rather
* than on a feature-per-feature basis. (The reason for this design is that
* this is used for generating sample metadata barplots, and that was
* previously very slow on large trees: see issue #298 on GitHub. Thanks
* to Yoshiki for discussing this with me.)
*
* @param {String} col Sample metadata column
*
* @return {Object} fID2freqs
*
* @throws {Error} If the sample metadata column is unrecognized.
*/
BIOMTable.prototype.getFrequencyMap = function (col) {
var scope = this;
var colIdx = this._getSampleMetadataColIndex(col);
var fIdx2counts = [];
var fIdx2sampleCt = [];
fedarko marked this conversation as resolved.
Show resolved Hide resolved
var containingSampleCount, cVal, cValIdx;

// Find unique (sorted) values in this sample metadata column; map
// sample metadata values to a consistent index. (Using an index to
// store this data means we can store the sample metadata values for
// each feature in an Array rather than in an Object for now.)
var uniqueSMVals = this.getUniqueSampleValues(col);
var numUniqueSMVals = uniqueSMVals.length;
var smVal2Idx = {};
_.each(uniqueSMVals, function (smVal, c) {
smVal2Idx[smVal] = c;
});

// Assign each feature an empty counts array with all 0s. Also set
// things up so we can keep track of the total number of samples
// containing each feature easily.
var i, emptyCounts;
_.each(this._fIDs, function (fID, fIdx) {
emptyCounts = [];
for (i = 0; i < numUniqueSMVals; i++) {
emptyCounts.push(0);
}
fIdx2counts.push(emptyCounts);
fIdx2sampleCt.push(0);
});

// Iterate through each sample of the BIOM table, storing unique s.m.
// value counts and total sample counts for each feature
_.each(this._tbl, function (presentFeatureIndices, sIdx) {
// Figure out what metadata value this sample has at the column.
cVal = scope._sm[sIdx][colIdx];
cValIdx = smVal2Idx[cVal];
// Increment s.m. value counts for each feature present in this
// sample
_.each(presentFeatureIndices, function (fIdx) {
fIdx2counts[fIdx][cValIdx]++;
fIdx2sampleCt[fIdx]++;
});
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_.each(this._tbl, function (presentFeatureIndices, sIdx) {
// Figure out what metadata value this sample has at the column.
cVal = scope._sm[sIdx][colIdx];
cValIdx = smVal2Idx[cVal];
// Increment s.m. value counts for each feature present in this
// sample
_.each(presentFeatureIndices, function (fIdx) {
fIdx2counts[fIdx][cValIdx]++;
fIdx2sampleCt[fIdx]++;
});
_.each(this._tbl, function (samples, sIdx) {
// Figure out what metadata value this sample has at the column.
cVal = scope._sm[sIdx][colIdx];
cValIdx = smVal2Idx[cVal];
// Increment s.m. value counts for each feature present in this
// sample
_.each(samples, function (fIdx) {
fIdx2counts[fIdx][cValIdx]++;
fIdx2sampleCt[fIdx]++;
});

Since _tbl is a 2D array of samples by features, this may make more sense to developers who are not too familiar with biom-table. Minor suggestion, feel free to ignore. (:

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's ok I'd like to keep this as is -- there are a couple of other places in the BIOM table JS where presentFeatureIndices is used when iterating over the table in this way. I added some context to the comment before this loop, so hopefully that makes things clearer as a compromise :) You're totally correct, though -- we should make this code as non-intimidating as possible, since these functions are all getting used pretty frequently ...

});

// Convert counts to frequencies
// Also, return an Object where the keys are feature IDs pointing to
// other Objects where the keys are sample metadata values, rather than
// a 2D array (which is how fIdx2counts has been stored)
var fID2freqs = {};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a note: it might be less memory-expensive/more efficient to iterate through this as a 2D array, especially for large trees. Storing (even temporarily) an object that uses fIDs as keys can be quite memory intensive, and JS has a bit more overhead (although not a ton) when iterating over objects compared to arrays. I would keep this the same for now, and we can come back to this if memory becomes a problem when working with large trees.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood and agreed. Added a TODO to the code detailing this optimization.

One weird-ish thing is that none of the Empress JS code besides the BIOM table knows anything about feature indices -- it only stores information about feature names (aka IDs), as far as I know. Moving the index stuff to empress.js / treeData might be a hassle, but I think it could save a decent amount of space -- I guess in-the-BIOM-table tip names are technically stored in two places (treeData and the BIOM table).

var totalSampleCount;
_.each(this._fIDs, function (fID, fIdx) {
totalSampleCount = fIdx2sampleCt[fIdx];
fID2freqs[fID] = {};
_.each(fIdx2counts[fIdx], function (count, smValIdx) {
if (count > 0) {
fID2freqs[fID][uniqueSMVals[smValIdx]] =
count / totalSampleCount;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should either let the user know that the sample stacked barplots are displaying proportions instead of counts, or we should just display the counts (or allow the user to choose).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point -- and supporting displaying "variable-length" stacked barplots (where e.g. 1 sample has a given fixed length, so a tip present in 2 samples vs. a tip present in 20 samples will display differently) would be a cool feature to add. I've added an issue for this at #322, and updated the README to be clearer about proportions being displayed.

}
});
});
return fID2freqs;
};

return BIOMTable;
});
53 changes: 27 additions & 26 deletions empress/support_files/js/empress.js
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ define([
this.getNodeInfo(rNode, "highestchildyr")
);
}
// iterate throught the tree in postorder, skip root
// iterate through the tree in postorder, skip root
for (var i = 1; i < tree.size; i++) {
// name of current node
var nodeInd = i;
Expand Down Expand Up @@ -1139,50 +1139,51 @@ define([
);
var colorer = new Colorer(layer.colorBySMColorMap, sortedUniqueValues);
var sm2color = colorer.getMapRGB();
// Do most of the hard work: compute the frequencies for each tip
var feature2freqs = this._biom.getFrequencyMap(layer.colorBySMField);
// Bar thickness
var halfyrscf = this._yrscf / 2;
for (i = 1; i < this._tree.size; i++) {
if (this._tree.isleaf(this._tree.postorderselect(i))) {
var node = this._treeData[i];
var name = this.getNodeInfo(node, "name");
var freqs = feature2freqs[name];
// Don't draw bars for tips that aren't in the BIOM table
// (Note that this is only for the sample metadata barplots --
// these tips could still ostensibly have associated
// feature metadata)
if (this._biom.getObsIDsDifference([name]).length > 0) {
if (_.isUndefined(freqs)) {
continue;
}
fedarko marked this conversation as resolved.
Show resolved Hide resolved
// Figure how many samples across each unique value in the
// selected sample metadata field contain this tip. (This is
// computed the same way as the information shown in the
// selected node menu's "Sample Presence Information" section.)
var spi = this.computeTipSamplePresence(name, [
layer.colorBySMField,
])[layer.colorBySMField];

// Sum the values of the sample presence information, getting
// us the total number of samples containing this tip.
// JS doesn't have a built-in sum() function, so I couldn't
// think of a better way to do this. Taken from
// https://underscorejs.org/#reduce.
var totalSampleCt = _.reduce(
_.values(spi),
function (a, b) {
return a + b;
},
0
);
var prevSectionMaxX = prevLayerMaxX;
// NOTE: currently we iterate through all of sortedUniqueValues
// once for every tip in the table, detecting and skipping
// unique values where no samples contain this tip.
// The reason we do things this way, rather than just
// iterating directly over the keys of this tip's Object within
// the frequency map, is that we want to ensure that unique
// values are processed in the same order for every tip (so for
// a "body site" barplot you'd always see e.g. gut, left palm,
// right palm, tongue in that order).
//
// Ideally we'd skip having to do this full iteration, though,
// and only look at the unique values containing this tip from
// the start (saving time). This might require refactoring the
// output of BiomTable.getFrequencyMap(), though.
for (var v = 0; v < sortedUniqueValues.length; v++) {
var smVal = sortedUniqueValues[v];
var ct = spi[smVal];
if (ct > 0) {
var freq = freqs[smVal];
// Ignore sample metadata values where no sample with this
// value contains this tip. We can detect this using
// !_.isUndefined() because freqs should only include
// entries for metadata values where this feature is
// present in at least one sample with that value.
if (!_.isUndefined(freq)) {
var sectionColor = sm2color[smVal];
// Assign each unique sample metadata value a length
// proportional to its, well, proportion within the sample
// presence information for this tip.
var barSectionLen =
layer.lengthSM * (ct / totalSampleCt);
var barSectionLen = layer.lengthSM * freq;
var thisSectionMaxX = prevSectionMaxX + barSectionLen;
var y = this.getY(node);
var ty = y + halfyrscf;
Expand Down
86 changes: 86 additions & 0 deletions tests/test-biom-table.js
Original file line number Diff line number Diff line change
Expand Up @@ -690,5 +690,91 @@ require(["jquery", "underscore", "BiomTable"], function ($, _, BiomTable) {
"Test: error thrown if unrecognized metadata col passed"
);
});
test("Test getFrequencyMap", function () {
    // The shared fixture table is provided on the test context; grab it
    // once so the closures below do not need a separate alias for `this`.
    var fixtureTable = this.biomTable;

    // Expected output for field "f1": one entry per feature ID, each
    // mapping a sample metadata value to the proportion of the
    // feature-containing samples that have that value. Values where no
    // sample contains the feature are left out entirely.
    var expectedF1 = {
        o1: { a: 1 },
        o2: { a: 0.5, c: 0.5 },
        o3: { a: 0.5, c: 0.5 },
        o4: { a: 0.5, b: 0.5 },
        o5: { a: 2 / 3, b: 1 / 3 },
        o6: { a: 0.5, c: 0.5 },
        o7: { a: 1 },
        o8: { b: 1 },
        o9: { a: 1 },
        o10: { a: 1 },
    };
    deepEqual(
        fixtureTable.getFrequencyMap("f1"),
        expectedF1,
        "Test frequency map for field f1"
    );

    // Same check for field "f4", whose metadata values are numeric-looking
    // keys.
    var expectedF4 = {
        o1: { 4: 0.5, 3: 0.5 },
        o2: { 4: 0.5, 1: 0.5 },
        o3: { 3: 0.5, 1: 0.5 },
        o4: { 4: 0.5, 2: 0.5 },
        o5: { 4: 1 / 3, 3: 1 / 3, 5: 1 / 3 },
        o6: { 3: 0.5, 1: 0.5 },
        o7: { 4: 0.5, 3: 0.5 },
        o8: { 2: 0.5, 5: 0.5 },
        o9: { 3: 1 },
        o10: { 4: 1 },
    };
    deepEqual(
        fixtureTable.getFrequencyMap("f4"),
        expectedF4,
        "Test frequency map for field f4"
    );

    // A small table in which every metadata entry is "m", so each feature's
    // frequencies collapse to a single value with proportion 1.
    var smolSampleIDs = ["s1", "s2", "s3"];
    var smolFeatureIDs = ["o1", "o2", "o3", "o4"];
    var smolTable = new BiomTable(
        smolSampleIDs,
        smolFeatureIDs,
        { s1: 0, s2: 1, s3: 2 },
        { o1: 0, o2: 1, o3: 2, o4: 3 },
        [
            [0, 1],
            [2, 3],
            [0, 3],
        ],
        ["f1"],
        [["m"], ["m"], ["m"]]
    );
    var expectedSmol = {
        o1: { m: 1 },
        o2: { m: 1 },
        o3: { m: 1 },
        o4: { m: 1 },
    };
    deepEqual(
        smolTable.getFrequencyMap("f1"),
        expectedSmol,
        "Test frequency map when all features unique to same group"
    );

    // A small table in which each table row lists a single, distinct
    // feature index and each metadata entry is distinct ("x", "y", "z").
    var funkySampleIDs = ["s1", "s2", "s3"];
    var funkyFeatureIDs = ["o1", "o2", "o3"];
    var funkyTable = new BiomTable(
        funkySampleIDs,
        funkyFeatureIDs,
        { s1: 0, s2: 1, s3: 2 },
        { o1: 0, o2: 1, o3: 2 },
        [[0], [1], [2]],
        ["f1"],
        [["x"], ["y"], ["z"]]
    );
    var expectedFunky = {
        o1: { x: 1 },
        o2: { y: 1 },
        o3: { z: 1 },
    };
    deepEqual(
        funkyTable.getFrequencyMap("f1"),
        expectedFunky,
        "Test frequency map when all features unique to different group"
    );

    // An unrecognized sample metadata column must raise an error whose
    // message matches the pattern below.
    var requestBadField = function () {
        fixtureTable.getFrequencyMap("badfield");
    };
    throws(
        requestBadField,
        /Sample metadata column "badfield" not in BIOM table./,
        "Test error thrown if unrecognized metadata col passed"
    );
});
});
});