Skip to content

Commit

Permalink
Metrics for indexing failures due to version conflicts (#119067)
Browse files Browse the repository at this point in the history
This exposes new OTel node and index based metrics for indexing failures due to version conflicts.

In addition, the /_cat/shards, /_cat/indices and /_cat/nodes APIs also expose the same metric, under the newly added column iifvc.

Relates: #107601
  • Loading branch information
albertzaharovits authored Jan 8, 2025
1 parent 447dcaa commit 12eb1cf
Show file tree
Hide file tree
Showing 23 changed files with 578 additions and 66 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/119067.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 119067
summary: Metrics for indexing failures due to version conflicts
area: CRUD
type: feature
issues: []
3 changes: 3 additions & 0 deletions docs/reference/cat/nodes.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,9 @@ Number of indexing operations, such as `1`.
`indexing.index_failed`, `iif`, `indexingIndexFailed`::
Number of failed indexing operations, such as `0`.

`indexing.index_failed_due_to_version_conflict`, `iifvc`, `indexingIndexFailedDueToVersionConflict`::
Number of failed indexing operations due to version conflict, such as `0`.

`merges.current`, `mc`, `mergesCurrent`::
Number of current merge operations, such as `0`.

Expand Down
3 changes: 3 additions & 0 deletions docs/reference/cat/shards.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ Number of indexing operations, such as `1`.
`indexing.index_failed`, `iif`, `indexingIndexFailed`::
Number of failed indexing operations, such as `0`.

`indexing.index_failed_due_to_version_conflict`, `iifvc`, `indexingIndexFailedDueToVersionConflict`::
Number of failed indexing operations due to version conflict, such as `0`.

`merges.current`, `mc`, `mergesCurrent`::
Number of current merge operations, such as `0`.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ private static ShardStats getShardStats(IndexMetadata indexMeta, int shardIndex,
CommonStats stats = new CommonStats();
stats.docs = new DocsStats(100, 0, randomByteSizeValue().getBytes());
stats.store = new StoreStats();
stats.indexing = new IndexingStats(new IndexingStats.Stats(1, 1, 1, 1, 1, 1, 1, 1, false, 1, targetWriteLoad, 1));
stats.indexing = new IndexingStats(new IndexingStats.Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, false, 1, targetWriteLoad, 1));
return new ShardStats(shardRouting, new ShardPath(false, path, path, shardId), stats, null, null, null, false, 0);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
indexing.index_time .+ \n
indexing.index_total .+ \n
indexing.index_failed .+ \n
indexing.index_failed_due_to_version_conflict .+ \n
merges.current .+ \n
merges.current_docs .+ \n
merges.current_size .+ \n
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Expand Down Expand Up @@ -76,6 +77,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
static final String STANDARD_INDEXING_COUNT = "es.indices.standard.indexing.total";
static final String STANDARD_INDEXING_TIME = "es.indices.standard.indexing.time";
static final String STANDARD_INDEXING_FAILURE = "es.indices.standard.indexing.failure.total";
static final String STANDARD_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT = "es.indices.standard.indexing.failure.version_conflict.total";

static final String TIME_SERIES_INDEX_COUNT = "es.indices.time_series.total";
static final String TIME_SERIES_BYTES_SIZE = "es.indices.time_series.size";
Expand All @@ -89,6 +91,8 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
static final String TIME_SERIES_INDEXING_COUNT = "es.indices.time_series.indexing.total";
static final String TIME_SERIES_INDEXING_TIME = "es.indices.time_series.indexing.time";
static final String TIME_SERIES_INDEXING_FAILURE = "es.indices.time_series.indexing.failure.total";
static final String TIME_SERIES_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT =
"es.indices.time_series.indexing.failure.version_conflict.total";

static final String LOGSDB_INDEX_COUNT = "es.indices.logsdb.total";
static final String LOGSDB_BYTES_SIZE = "es.indices.logsdb.size";
Expand All @@ -102,6 +106,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
static final String LOGSDB_INDEXING_COUNT = "es.indices.logsdb.indexing.total";
static final String LOGSDB_INDEXING_TIME = "es.indices.logsdb.indexing.time";
static final String LOGSDB_INDEXING_FAILURE = "es.indices.logsdb.indexing.failure.total";
static final String LOGSDB_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT = "es.indices.logsdb.indexing.failure.version_conflict.total";

public void testIndicesMetrics() {
String indexNode = internalCluster().startNode();
Expand Down Expand Up @@ -132,7 +137,9 @@ public void testIndicesMetrics() {
STANDARD_INDEXING_TIME,
greaterThanOrEqualTo(0L),
STANDARD_INDEXING_FAILURE,
equalTo(indexing1.getIndexFailedCount() - indexing0.getIndexCount())
equalTo(indexing1.getIndexFailedCount() - indexing0.getIndexFailedCount()),
STANDARD_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(indexing1.getIndexFailedDueToVersionConflictCount() - indexing0.getIndexFailedDueToVersionConflictCount())
)
);

Expand All @@ -155,7 +162,9 @@ public void testIndicesMetrics() {
TIME_SERIES_INDEXING_TIME,
greaterThanOrEqualTo(0L),
TIME_SERIES_INDEXING_FAILURE,
equalTo(indexing2.getIndexFailedCount() - indexing1.getIndexFailedCount())
equalTo(indexing1.getIndexFailedCount() - indexing0.getIndexFailedCount()),
TIME_SERIES_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(indexing1.getIndexFailedDueToVersionConflictCount() - indexing0.getIndexFailedDueToVersionConflictCount())
)
);

Expand All @@ -177,36 +186,50 @@ public void testIndicesMetrics() {
LOGSDB_INDEXING_TIME,
greaterThanOrEqualTo(0L),
LOGSDB_INDEXING_FAILURE,
equalTo(indexing3.getIndexFailedCount() - indexing2.getIndexFailedCount())
equalTo(indexing3.getIndexFailedCount() - indexing2.getIndexFailedCount()),
LOGSDB_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(indexing3.getIndexFailedDueToVersionConflictCount() - indexing2.getIndexFailedDueToVersionConflictCount())
)
);
// already collected indexing stats
collectThenAssertMetrics(
telemetry,
4,
Map<String, Matcher<Long>> zeroMatchers = new HashMap<>();
zeroMatchers.putAll(
Map.of(
STANDARD_INDEXING_COUNT,
equalTo(0L),
STANDARD_INDEXING_TIME,
equalTo(0L),
STANDARD_INDEXING_FAILURE,
equalTo(0L),

STANDARD_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(0L)
)
);
zeroMatchers.putAll(
Map.of(
TIME_SERIES_INDEXING_COUNT,
equalTo(0L),
TIME_SERIES_INDEXING_TIME,
equalTo(0L),
TIME_SERIES_INDEXING_FAILURE,
equalTo(0L),

TIME_SERIES_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(0L)
)
);
zeroMatchers.putAll(
Map.of(
LOGSDB_INDEXING_COUNT,
equalTo(0L),
LOGSDB_INDEXING_TIME,
equalTo(0L),
LOGSDB_INDEXING_FAILURE,
equalTo(0L),
LOGSDB_INDEXING_FAILURE_DUE_TO_VERSION_CONFLICT,
equalTo(0L)
)
);
collectThenAssertMetrics(telemetry, 4, zeroMatchers);
String searchNode = internalCluster().startDataOnlyNode();
indicesService = internalCluster().getInstance(IndicesService.class, searchNode);
telemetry = internalCluster().getInstance(PluginsService.class, searchNode)
Expand Down
Loading

0 comments on commit 12eb1cf

Please sign in to comment.