Skip to content

Commit

Permalink
Add live setting for verbose index metrics (#608)
Browse files Browse the repository at this point in the history
* Add live setting for verbose index metrics

* Address review comment
  • Loading branch information
aprudhomme authored Nov 13, 2023
1 parent b43244c commit 92ece91
Show file tree
Hide file tree
Showing 13 changed files with 983 additions and 536 deletions.
2 changes: 2 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,8 @@ message IndexLiveSettings {
google.protobuf.Int32Value defaultTerminateAfter = 13;
// Merge precopy would be stopped after this time, or 0 for no checks, default: 0
google.protobuf.UInt64Value maxMergePreCopyDurationSec = 14;
// Collect and publish additional index metrics, which may be more expensive in terms of volume, memory and/or compute, default: false
google.protobuf.BoolValue verboseMetrics = 15;
}

message IndexStateInfo {
Expand Down
1,084 changes: 549 additions & 535 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,13 @@
"required": false,
"type": "string",
"format": "uint64"
},
{
"name": "liveSettings.verboseMetrics",
"description": "Collect and publish additional index metrics, which may be more expensive in terms of volume, memory and/or compute, default: false.",
"in": "query",
"required": false,
"type": "boolean"
}
],
"tags": [
Expand Down Expand Up @@ -3630,6 +3637,10 @@
"type": "string",
"format": "uint64",
"title": "Merge precopy would be stopped after this time, or 0 for no checks, default: 0"
},
"verboseMetrics": {
"type": "boolean",
"title": "Collect and publish additional index metrics, which may be more expensive in terms of volume, memory and/or compute, default: false"
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package com.yelp.nrtsearch.server.cli;

import com.google.protobuf.BoolValue;
import com.google.protobuf.DoubleValue;
import com.google.protobuf.Int32Value;
import com.google.protobuf.UInt64Value;
Expand Down Expand Up @@ -110,6 +111,12 @@ public class LiveSettingsV2Command implements Callable<Integer> {
description = "Maximum time allowed for merge precopy in seconds")
private Long maxMergePreCopyDurationSec;

@CommandLine.Option(
names = {"--verboseMetrics"},
description =
"If additional index metrics should be collected and published, must be 'true' or 'false'")
private String verboseMetrics;

@Override
public Integer call() throws Exception {
LuceneServerClient client = baseCmd.getClient();
Expand Down Expand Up @@ -172,6 +179,10 @@ public Integer call() throws Exception {
liveSettingsBuilder.setMaxMergePreCopyDurationSec(
UInt64Value.newBuilder().setValue(maxMergePreCopyDurationSec));
}
if (verboseMetrics != null) {
liveSettingsBuilder.setVerboseMetrics(
BoolValue.newBuilder().setValue(parseBoolean(verboseMetrics)).build());
}

IndexLiveSettings indexLiveSettings = liveSettingsBuilder.build();
if (!indexLiveSettings.getAllFields().isEmpty()) {
Expand All @@ -183,4 +194,15 @@ public Integer call() throws Exception {
}
return 0;
}

private Boolean parseBoolean(String booleanStr) {
String lowerCaseStr = booleanStr.toLowerCase();
if ("true".equals(lowerCaseStr)) {
return Boolean.TRUE;
} else if ("false".equals(lowerCaseStr)) {
return Boolean.FALSE;
} else {
throw new IllegalArgumentException("Invalid boolean string: " + booleanStr);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ private void registerMetrics(GlobalState globalState) {
new DirSizeCollector(globalState).register(collectorRegistry);
new ProcStatCollector().register(collectorRegistry);
new MergeSchedulerCollector(globalState).register(collectorRegistry);
new VerboseIndexCollector(globalState).register(collectorRegistry);
}

/** Main launches the server from the command line. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,9 @@ public abstract IndexWriterConfig getIndexWriterConfig(

public abstract Map<String, Lookup> getSuggesters();

/** Get if additional index metrics should be collected and published. */
public abstract boolean getVerboseMetrics();

@Override
public void close() throws IOException {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import com.yelp.nrtsearch.server.luceneserver.search.SearchCutoffWrapper.CollectionTimeoutException;
import com.yelp.nrtsearch.server.luceneserver.search.SearchRequestProcessor;
import com.yelp.nrtsearch.server.luceneserver.search.SearcherResult;
import com.yelp.nrtsearch.server.monitoring.VerboseIndexCollector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
Expand Down Expand Up @@ -287,7 +288,12 @@ public SearchResponse handle(IndexState indexState, SearchRequest searchRequest)

// if we are out of time, don't bother with serialization
DeadlineUtils.checkDeadline("SearchHandler: end", "SEARCH");
return searchContext.getResponseBuilder().build();
SearchResponse searchResponse = searchContext.getResponseBuilder().build();
if (!warming && searchContext.getIndexState().getVerboseMetrics()) {
VerboseIndexCollector.updateSearchResponseMetrics(
searchResponse, searchContext.getIndexState().getName());
}
return searchResponse;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ public class ImmutableIndexState extends IndexState {
.setDefaultSearchTimeoutCheckEvery(Int32Value.newBuilder().setValue(0).build())
.setDefaultTerminateAfter(Int32Value.newBuilder().setValue(0).build())
.setMaxMergePreCopyDurationSec(UInt64Value.newBuilder().setValue(0))
.setVerboseMetrics(BoolValue.newBuilder().setValue(false).build())
.build();

// Live Settings
Expand All @@ -181,6 +182,7 @@ public class ImmutableIndexState extends IndexState {
private final int defaultSearchTimeoutCheckEvery;
private final int defaultTerminateAfter;
private final long maxMergePreCopyDurationSec;
private final boolean verboseMetrics;

private final IndexStateManager indexStateManager;
private final String uniqueName;
Expand Down Expand Up @@ -267,6 +269,7 @@ public ImmutableIndexState(
mergedLiveSettings.getDefaultSearchTimeoutCheckEvery().getValue();
defaultTerminateAfter = mergedLiveSettings.getDefaultTerminateAfter().getValue();
maxMergePreCopyDurationSec = mergedLiveSettings.getMaxMergePreCopyDurationSec().getValue();
verboseMetrics = mergedLiveSettings.getVerboseMetrics().getValue();

// If there is previous shard state, use it. Otherwise, initialize the shard.
if (previousShardState != null) {
Expand Down Expand Up @@ -830,6 +833,11 @@ public Map<String, Lookup> getSuggesters() {
throw new UnsupportedOperationException();
}

@Override
public boolean getVerboseMetrics() {
return verboseMetrics;
}

@Override
public void initWarmer(Archiver archiver) {
initWarmer(archiver, uniqueName);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Copyright 2022 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.monitoring;

import com.yelp.nrtsearch.server.grpc.SearchResponse;
import com.yelp.nrtsearch.server.grpc.SearchResponse.Diagnostics;
import com.yelp.nrtsearch.server.luceneserver.GlobalState;
import io.prometheus.client.Collector;
import io.prometheus.client.Counter;
import io.prometheus.client.Summary;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Collector for detailed index metrics that may be expensive to produce or publish. Collection can
* be toggled with the verbose metrics index live setting.
*/
public class VerboseIndexCollector extends Collector {
private static final Logger logger = LoggerFactory.getLogger(VerboseIndexCollector.class);
private final GlobalState globalState;

private static final Summary searchResponseSizeBytes =
Summary.build()
.name("nrt_search_response_size_bytes")
.help("Size of response protobuf message")
.quantile(0.0, 0)
.quantile(0.5, 0.01)
.quantile(0.95, 0.005)
.quantile(0.99, 0.005)
.quantile(1.0, 0)
.labelNames("index")
.create();

private static final Summary searchResponseTotalHits =
Summary.build()
.name("nrt_search_response_total_hits")
.help("Number of total hits for queries")
.quantile(0.0, 0)
.quantile(0.5, 0.01)
.quantile(0.95, 0.005)
.quantile(0.99, 0.005)
.quantile(1.0, 0)
.labelNames("index")
.create();

private static final Summary searchStageLatencyMs =
Summary.build()
.name("nrt_search_stage_latency_ms")
.help("Latency of various search operations (ms)")
.quantile(0.0, 0)
.quantile(0.5, 0.01)
.quantile(0.95, 0.005)
.quantile(0.99, 0.005)
.quantile(1.0, 0)
.labelNames("index", "stage")
.create();

private static final Counter searchTimeoutCount =
Counter.build()
.name("nrt_search_timeout_count")
.help("Number of requests that hit the recall timeout")
.labelNames("index")
.create();

private static final Counter searchTerminatedEarlyCount =
Counter.build()
.name("nrt_search_terminated_early_count")
.help("Number of requests that terminated early")
.labelNames("index")
.create();

public static void updateSearchResponseMetrics(SearchResponse searchResponse, String index) {
searchResponseSizeBytes.labels(index).observe(searchResponse.getSerializedSize());
searchResponseTotalHits.labels(index).observe(searchResponse.getTotalHits().getValue());
if (searchResponse.getHitTimeout()) {
searchTimeoutCount.labels(index).inc();
}
if (searchResponse.getTerminatedEarly()) {
searchTerminatedEarlyCount.labels(index).inc();
}

Diagnostics diagnostics = searchResponse.getDiagnostics();
searchStageLatencyMs.labels(index, "recall").observe(diagnostics.getFirstPassSearchTimeMs());
searchStageLatencyMs.labels(index, "highlight").observe(diagnostics.getHighlightTimeMs());
searchStageLatencyMs.labels(index, "fetch").observe(diagnostics.getGetFieldsTimeMs());
for (Map.Entry<String, Double> entry : diagnostics.getFacetTimeMsMap().entrySet()) {
searchStageLatencyMs.labels(index, "facet:" + entry.getKey()).observe(entry.getValue());
}
for (Map.Entry<String, Double> entry : diagnostics.getRescorersTimeMsMap().entrySet()) {
searchStageLatencyMs.labels(index, "rescorer:" + entry.getKey()).observe(entry.getValue());
}
}

public VerboseIndexCollector(GlobalState globalState) {
this.globalState = globalState;
}

@Override
public List<MetricFamilySamples> collect() {
List<MetricFamilySamples> mfs = new ArrayList<>();

try {
boolean publishMetrics = false;
Set<String> indexNames = globalState.getIndexNames();
for (String indexName : indexNames) {
if (globalState.getIndex(indexName).getVerboseMetrics()) {
publishMetrics = true;
break;
}
}
if (publishMetrics) {
mfs.addAll(searchResponseSizeBytes.collect());
mfs.addAll(searchResponseTotalHits.collect());
mfs.addAll(searchStageLatencyMs.collect());
mfs.addAll(searchTimeoutCount.collect());
mfs.addAll(searchTerminatedEarlyCount.collect());
}
} catch (Exception e) {
logger.warn("Error getting verbose index metrics: ", e);
}

return mfs;
}
}
Loading

0 comments on commit 92ece91

Please sign in to comment.