Add prometheus metrics to serving (#316)
* Add prometheus metrics to serving

* Add subsystem

* Use histogram instead of summary (see the timer sketch below)

* Change timer to histogram timer

* Fix configuration for serving chart for prometheus

* Fix application.yaml reference in chart
Chen Zhiling authored and feast-ci-bot committed Nov 25, 2019
1 parent 31aaed4 commit a79c77b
Showing 9 changed files with 140 additions and 9 deletions.
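The "use histogram instead of summary" change follows common Prometheus guidance: bucketed histograms can be summed across serving replicas and turned into quantiles at query time with histogram_quantile(), while client-side Summary quantiles cannot be aggregated. Below is a minimal sketch of the histogram-timer pattern the diff adopts; it is not Feast code, the metric, subsystem, and label values are illustrative, and it assumes the io.prometheus simpleclient 0.8.0 API used in this commit.

import io.prometheus.client.Histogram;

public class TimerPatternSketch {

  // A bucketed latency histogram with a "method" label (all names here are illustrative).
  static final Histogram latency = Histogram.build()
      .buckets(2, 4, 6, 8, 10)
      .name("request_latency_ms")
      .subsystem("example")
      .help("Request latency.")
      .labelNames("method")
      .register();

  static void handleRequest() {
    // startTimer() returns a Histogram.Timer; observeDuration() records the elapsed time.
    Histogram.Timer timer = latency.labels("getOnlineFeatures").startTimer();
    try {
      // ... do the work being timed ...
    } finally {
      timer.observeDuration();
    }
  }
}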
@@ -9,6 +9,13 @@ metadata:
chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
annotations:
{{- if .Values.prometheus.enabled }}
{{ $config := index .Values "application.yaml" }}
prometheus.io/path: /metrics
prometheus.io/port: "{{ $config.server.port }}"
prometheus.io/scrape: "true"
{{- end }}
spec:
replicas: {{ .Values.replicaCount }}
selector:
5 changes: 5 additions & 0 deletions infra/charts/feast/charts/feast-serving/values.yaml
@@ -63,6 +63,8 @@ application.yaml:
grpc:
port: 6566
enable-reflection: true
server:
port: 8080
spring:
main:
web-application-type: none
@@ -177,6 +179,9 @@ ingress:
# - host: chart-example.local
# port: http

prometheus:
enabled: true

resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
24 changes: 24 additions & 0 deletions serving/pom.xml
@@ -168,6 +168,30 @@
<version>0.31.0</version>
</dependency>

<!-- The client -->
<dependency>
<groupId>io.prometheus</groupId>
<artifactId>simpleclient</artifactId>
<version>0.8.0</version>
</dependency>
<!-- Hotspot JVM metrics-->
<dependency>
<groupId>io.prometheus</groupId>
<artifactId>simpleclient_hotspot</artifactId>
<version>0.8.0</version>
</dependency>
<!-- Exposition HTTPServer-->
<dependency>
<groupId>io.prometheus</groupId>
<artifactId>simpleclient_servlet</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>io.prometheus</groupId>
<artifactId>simpleclient_spring_boot</artifactId>
<version>0.8.0</version>
</dependency>

<!-- Google Cloud -->
<dependency>
<groupId>com.google.cloud</groupId>
@@ -3,19 +3,29 @@
import feast.serving.FeastProperties;
import io.opentracing.Tracer;
import io.opentracing.noop.NoopTracerFactory;
import io.prometheus.client.hotspot.DefaultExports;
import io.prometheus.client.exporter.MetricsServlet;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.web.servlet.ServletRegistrationBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class InstrumentationConfig {

private FeastProperties feastProperties;

@Autowired
public InstrumentationConfig(FeastProperties feastProperties) {
this.feastProperties = feastProperties;
}

@Bean
public ServletRegistrationBean servletRegistrationBean() {
DefaultExports.initialize();
return new ServletRegistrationBean(new MetricsServlet(), "/metrics");
}

@Bean
public Tracer tracer() {
if (!feastProperties.getTracing().isEnabled()) {
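For context on how the servlet registration above connects to the metrics added elsewhere in this commit: the no-argument MetricsServlet constructor serves CollectorRegistry.defaultRegistry, which is also where DefaultExports.initialize() and the .register() calls in Metrics.java put their collectors. The following standalone sketch, which is not part of the commit, dumps that registry in the same text exposition format a scrape of /metrics would return; it assumes simpleclient_common (a transitive dependency of simpleclient_servlet) is on the classpath and uses a hypothetical demo counter.

import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Counter;
import io.prometheus.client.exporter.common.TextFormat;
import io.prometheus.client.hotspot.DefaultExports;
import java.io.StringWriter;
import java.io.Writer;

public class MetricsDumpSketch {
  public static void main(String[] args) throws Exception {
    DefaultExports.initialize();   // JVM collectors, as in InstrumentationConfig
    Counter demo = Counter.build()
        .name("demo_requests")
        .subsystem("example")
        .help("Demo counter registered to the default registry.")
        .register();
    demo.inc();

    // Write the registry in Prometheus text format 0.0.4, the format /metrics serves.
    Writer writer = new StringWriter();
    TextFormat.write004(writer, CollectorRegistry.defaultRegistry.metricFamilySamples());
    System.out.println(writer);
  }
}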
@@ -1,6 +1,8 @@
package feast.serving.service;

import static feast.serving.util.BigQueryUtil.getTimestampLimitQuery;
import static feast.serving.util.Metrics.requestCount;
import static feast.serving.util.Metrics.requestLatency;

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
@@ -37,6 +39,7 @@
import feast.serving.ServingAPIProto.JobType;
import feast.serving.util.BigQueryUtil;
import io.grpc.Status;
import io.prometheus.client.Histogram.Timer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -100,11 +103,13 @@ public GetOnlineFeaturesResponse getOnlineFeatures(GetOnlineFeaturesRequest getF
*/
@Override
public GetBatchFeaturesResponse getBatchFeatures(GetBatchFeaturesRequest getFeaturesRequest) {

Timer getBatchFeaturesTimer = requestLatency.labels("getBatchFeatures").startTimer();
List<FeatureSetSpec> featureSetSpecs =
getFeaturesRequest.getFeatureSetsList().stream()
.map(featureSet ->
specService.getFeatureSet(featureSet.getName(), featureSet.getVersion())
.map(featureSet -> {
requestCount.labels(featureSet.getName()).inc();
return specService.getFeatureSet(featureSet.getName(), featureSet.getVersion());
}
)
.collect(Collectors.toList());

@@ -233,6 +238,7 @@ public GetBatchFeaturesResponse getBatchFeatures(GetBatchFeaturesRequest getFeat
})
.start();

getBatchFeaturesTimer.observeDuration();
return GetBatchFeaturesResponse.newBuilder().setJob(feastJob).build();
}

13 changes: 13 additions & 0 deletions serving/src/main/java/feast/serving/service/CachedSpecService.java
@@ -16,6 +16,7 @@
import feast.core.StoreProto.Store.Subscription;
import feast.serving.exception.SpecRetrievalException;
import io.grpc.StatusRuntimeException;
import io.prometheus.client.Gauge;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -40,6 +41,15 @@ public class CachedSpecService {
private final LoadingCache<String, FeatureSetSpec> featureSetSpecCache;
private Store store;

private static Gauge featureSetsCount = Gauge.build().name("feature_set_count")
.subsystem("feast_serving")
.help("number of feature sets served by this instance")
.register();
private static Gauge cacheLastUpdated = Gauge.build().name("cache_last_updated")
.subsystem("feast_serving")
.help("epoch time of the last time the cache was updated")
.register();

public CachedSpecService(CoreSpecService coreService, Path configPath) {
this.configPath = configPath;
this.coreService = coreService;
@@ -102,6 +112,9 @@ public void populateCache() {
this.store = updateStore(readConfig(configPath));
Map<String, FeatureSetSpec> featureSetSpecMap = getFeatureSetSpecMap();
featureSetSpecCache.putAll(featureSetSpecMap);

featureSetsCount.set(featureSetSpecCache.size());
cacheLastUpdated.set(System.currentTimeMillis());
}

public void scheduledPopulateCache() {
@@ -16,6 +16,11 @@

package feast.serving.service;

import static feast.serving.util.Metrics.missingKeyCount;
import static feast.serving.util.Metrics.requestLatency;
import static feast.serving.util.Metrics.requestCount;
import static feast.serving.util.Metrics.staleKeyCount;

import com.google.common.collect.Maps;
import com.google.protobuf.AbstractMessageLite;
import com.google.protobuf.Duration;
@@ -41,6 +46,7 @@
import io.grpc.Status;
import io.opentracing.Scope;
import io.opentracing.Tracer;
import io.prometheus.client.Histogram.Timer;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -61,7 +67,9 @@ public RedisServingService(JedisPool jedisPool, CachedSpecService specService, T
this.tracer = tracer;
}

/** {@inheritDoc} */
/**
* {@inheritDoc}
*/
@Override
public GetFeastServingInfoResponse getFeastServingInfo(
GetFeastServingInfoRequest getFeastServingInfoRequest) {
@@ -70,10 +78,13 @@ public GetFeastServingInfoResponse getFeastServingInfo(
.build();
}

/** {@inheritDoc} */
/**
* {@inheritDoc}
*/
@Override
public GetOnlineFeaturesResponse getOnlineFeatures(GetOnlineFeaturesRequest request) {
try (Scope scope = tracer.buildSpan("Redis-getOnlineFeatures").startActive(true)) {
Timer getOnlineFeaturesTimer = requestLatency.labels("getOnlineFeatures").startTimer();
GetOnlineFeaturesResponse.Builder getOnlineFeaturesResponseBuilder =
GetOnlineFeaturesResponse.newBuilder();

@@ -114,6 +125,7 @@ public GetOnlineFeaturesResponse getOnlineFeatures(GetOnlineFeaturesRequest requ
featureValuesMap.values().stream()
.map(m -> FieldValues.newBuilder().putAllFields(m).build())
.collect(Collectors.toList());
getOnlineFeaturesTimer.observeDuration();
return getOnlineFeaturesResponseBuilder.addAllFieldValues(fieldValues).build();
}
}
@@ -166,9 +178,11 @@ private RedisKey makeRedisKey(
for (int i = 0; i < featureSetEntityNames.size(); i++) {
String entityName = featureSetEntityNames.get(i);

if (!fieldsMap.containsKey(entityName)){
if (!fieldsMap.containsKey(entityName)) {
throw Status.INVALID_ARGUMENT
.withDescription(String.format("Entity row fields \"%s\" does not contain required entity field \"%s\"", fieldsMap.keySet().toString(), entityName))
.withDescription(String
.format("Entity row fields \"%s\" does not contain required entity field \"%s\"",
fieldsMap.keySet().toString(), entityName))
.asRuntimeException();
}

@@ -186,33 +200,45 @@ private void sendAndProcessMultiGet(
throws InvalidProtocolBufferException {

List<byte[]> jedisResps = sendMultiGet(redisKeys);

Timer processResponseTimer = requestLatency.labels("processResponse")
.startTimer();
try (Scope scope = tracer.buildSpan("Redis-processResponse").startActive(true)) {
String featureSetId =
String.format("%s:%d", featureSetRequest.getName(), featureSetRequest.getVersion());

Map<String, Value> nullValues =
featureSetRequest.getFeatureNamesList().stream()
.collect(
Collectors.toMap(
name -> featureSetId + ":" + name, name -> Value.newBuilder().build()));

for (int i = 0; i < jedisResps.size(); i++) {
EntityRow entityRow = entityRows.get(i);
Map<String, Value> featureValues = featureValuesMap.get(entityRow);

byte[] jedisResponse = jedisResps.get(i);
if (jedisResponse == null) {
missingKeyCount.labels(featureSetRequest.getName()).inc();
featureValues.putAll(nullValues);
continue;
}

FeatureRow featureRow = FeatureRow.parseFrom(jedisResponse);

boolean stale = isStale(featureSetRequest, entityRow, featureRow);
if (stale) {
staleKeyCount.labels(featureSetRequest.getName()).inc();
featureValues.putAll(nullValues);
continue;
}

requestCount.labels(featureSetRequest.getName()).inc();
featureRow.getFieldsList().stream()
.filter(f -> featureSetRequest.getFeatureNamesList().contains(f.getName()))
.forEach(f -> featureValues.put(featureSetId + ":" + f.getName(), f.getValue()));
}
} finally {
processResponseTimer.observeDuration();
}
}

@@ -237,6 +263,7 @@ private boolean isStale(
*/
private List<byte[]> sendMultiGet(List<RedisKey> keys) {
try (Scope scope = tracer.buildSpan("Redis-sendMultiGet").startActive(true)) {
Timer sendMultiGetTimer = requestLatency.labels("sendMultiGet").startTimer();
try (Jedis jedis = jedisPool.getResource()) {
byte[][] binaryKeys =
keys.stream()
Expand All @@ -249,6 +276,8 @@ private List<byte[]> sendMultiGet(List<RedisKey> keys) {
.withDescription("Unable to retrieve feature from Redis")
.withCause(e)
.asRuntimeException();
} finally {
sendMultiGetTimer.observeDuration();
}
}
}
37 changes: 37 additions & 0 deletions serving/src/main/java/feast/serving/util/Metrics.java
@@ -0,0 +1,37 @@
package feast.serving.util;

import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.prometheus.client.Summary;

public class Metrics {

public static final Histogram requestLatency = Histogram.build()
.buckets(2, 4, 6, 8, 10, 15, 20, 25, 30, 35, 50)
.name("request_latency_ms")
.subsystem("feast_serving")
.help("Request latency in milliseconds.")
.labelNames("method")
.register();

public static final Counter requestCount = Counter.build()
.name("request_feature_count")
.subsystem("feast_serving")
.help("number of feature rows requested")
.labelNames("feature_set_name")
.register();

public static final Counter missingKeyCount = Counter.build()
.name("missing_feature_count")
.subsystem("feast_serving")
.help("number requested feature rows that were not found")
.labelNames("feature_set_name")
.register();

public static final Counter staleKeyCount = Counter.build()
.name("stale_feature_count")
.subsystem("feast_serving")
.help("number requested feature rows that were stale")
.labelNames("feature_set_name")
.register();
}
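
A short usage sketch, not part of the commit, showing how the subsystem set above shows up in the exported names: a counter built with subsystem "feast_serving" and name "request_feature_count" is exposed as feast_serving_request_feature_count. The feature set label value below is hypothetical, and the lookup assumes simpleclient 0.8.0 naming (newer client versions append _total to counter sample names).

import feast.serving.util.Metrics;
import io.prometheus.client.CollectorRegistry;

public class MetricsNameSketch {
  public static void main(String[] args) {
    // Hypothetical feature set name, used only to exercise the counter.
    Metrics.requestCount.labels("customer_features").inc();

    // Read the sample back from the default registry by its fully prefixed name.
    Double value = CollectorRegistry.defaultRegistry.getSampleValue(
        "feast_serving_request_feature_count",
        new String[] {"feature_set_name"},
        new String[] {"customer_features"});
    System.out.println(value);  // 1.0
  }
}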
2 changes: 1 addition & 1 deletion serving/src/main/resources/application.yml
@@ -55,6 +55,6 @@ grpc:

server:
# The port number on which the Tomcat webserver that serves REST API endpoints should listen
# It is set by default to 8080 so it does not conflict with Tomcat webserver on Feast Core
# It is set by default to 8081 so it does not conflict with Tomcat webserver on Feast Core
# if both Feast Core and Serving are running on the same machine
port: ${SERVER_PORT:8081}
