Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Per-operation and client error metric #6443

Open
wants to merge 23 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .changesets/exp_njm_operation_error_metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
### Experimental per-operation error metrics ([PR #6443](https://github.com/apollographql/router/pull/6443))

Adds a new experimental OpenTelemetry metric, `apollo.router.operations.error`, that counts errors at a per-operation and per-client level. The metric carries the following attributes:
* Operation name
* Operation type (query/mutation/subscription)
* Apollo operation ID
* Client name
* Client version
* Error code

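This metric is disabled by default; to enable it, set `experimental_otlp_error_metrics: enabled` under `telemetry.apollo.errors`. It is currently only sent to GraphOS and is not available in 3rd-party OTel destinations.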

By [@bonnici](https://github.com/bonnici) in https://github.com/apollographql/router/pull/6443
Original file line number Diff line number Diff line change
Expand Up @@ -2651,6 +2651,10 @@ expression: "&schema"
"ErrorsConfiguration": {
"additionalProperties": false,
"properties": {
"experimental_otlp_error_metrics": {
"$ref": "#/definitions/OtlpErrorMetricsMode",
"description": "#/definitions/OtlpErrorMetricsMode"
},
"subgraph": {
"$ref": "#/definitions/SubgraphErrorConfig",
"description": "#/definitions/SubgraphErrorConfig"
Expand Down Expand Up @@ -4357,6 +4361,25 @@ expression: "&schema"
}
]
},
"OtlpErrorMetricsMode": {
"description": "Open Telemetry error metrics mode",
"oneOf": [
{
"description": "Do not store OTLP error metrics",
"enum": [
"disabled"
],
"type": "string"
},
{
"description": "Send OTLP error metrics to Apollo Studio",
"enum": [
"enabled"
],
"type": "string"
}
]
},
"PersistedQueries": {
"additionalProperties": false,
"description": "Persisted Queries (PQ) configuration",
Expand Down
14 changes: 14 additions & 0 deletions apollo-router/src/plugins/telemetry/apollo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ pub(crate) struct Config {
pub(crate) struct ErrorsConfiguration {
/// Handling of errors coming from subgraph
pub(crate) subgraph: SubgraphErrorConfig,

/// Configuration for storing and sending error metrics via OTLP
pub(crate) experimental_otlp_error_metrics: OtlpErrorMetricsMode,
}

#[derive(Debug, Clone, Deserialize, JsonSchema, Default)]
Expand Down Expand Up @@ -160,6 +163,17 @@ impl SubgraphErrorConfig {
}
}

// Switch for the experimental per-operation OTLP error metrics.
// Deserialized from the lowercase strings `disabled` / `enabled` (see `rename_all` below);
// `Disabled` is the `Default`, so the metrics are off unless explicitly enabled.
// NOTE: the `///` texts on this enum appear verbatim as `description` values in the
// configuration schema snapshot, so rewording them also changes that snapshot.
/// Open Telemetry error metrics mode
#[derive(Clone, Default, Debug, Deserialize, JsonSchema, Copy)]
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub(crate) enum OtlpErrorMetricsMode {
/// Do not store OTLP error metrics
#[default]
Disabled,
/// Send OTLP error metrics to Apollo Studio
Enabled,
}

/// Returns `SamplerOption::TraceIdRatioBased(0.01)`.
///
/// NOTE(review): judging by the name, this is the default for the field-level
/// instrumentation sampler setting, and `0.01` presumably means a 1% ratio —
/// confirm at the use site.
const fn default_field_level_instrumentation_sampler() -> SamplerOption {
SamplerOption::TraceIdRatioBased(0.01)
}
Expand Down
2 changes: 1 addition & 1 deletion apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ pub(crate) mod utils;

// Tracing consts
pub(crate) const CLIENT_NAME: &str = "apollo_telemetry::client_name";
const CLIENT_VERSION: &str = "apollo_telemetry::client_version";
pub(crate) const CLIENT_VERSION: &str = "apollo_telemetry::client_version";
const SUBGRAPH_FTV1: &str = "apollo_telemetry::subgraph_ftv1";
pub(crate) const STUDIO_EXCLUDE: &str = "apollo_telemetry::studio::exclude";
pub(crate) const SUPERGRAPH_SCHEMA_ID_CONTEXT_KEY: &str = "apollo::supergraph_schema_id";
Expand Down
76 changes: 65 additions & 11 deletions apollo-router/src/services/router/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,14 @@ use crate::cache::DeduplicatingCache;
use crate::configuration::Batching;
use crate::configuration::BatchingMode;
use crate::context::CONTAINS_GRAPHQL_ERROR;
use crate::context::OPERATION_KIND;
use crate::context::OPERATION_NAME;
use crate::graphql;
use crate::http_ext;
#[cfg(test)]
use crate::plugin::test::MockSupergraphService;
use crate::plugins::telemetry::apollo::OtlpErrorMetricsMode;
use crate::plugins::telemetry::config::Conf;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_BODY;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_HEADERS;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_URI;
Expand All @@ -54,9 +58,12 @@ use crate::plugins::telemetry::config_new::events::log_event;
use crate::plugins::telemetry::config_new::events::DisplayRouterRequest;
use crate::plugins::telemetry::config_new::events::DisplayRouterResponse;
use crate::plugins::telemetry::config_new::events::RouterResponseBodyExtensionType;
use crate::plugins::telemetry::CLIENT_NAME;
use crate::plugins::telemetry::CLIENT_VERSION;
use crate::protocols::multipart::Multipart;
use crate::protocols::multipart::ProtocolMode;
use crate::query_planner::InMemoryCachePlanner;
use crate::query_planner::APOLLO_OPERATION_ID;
use crate::router_factory::RouterFactory;
use crate::services::layers::apq::APQLayer;
use crate::services::layers::content_negotiation;
Expand Down Expand Up @@ -102,6 +109,7 @@ pub(crate) struct RouterService {
persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
}

impl RouterService {
Expand All @@ -111,13 +119,15 @@ impl RouterService {
persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
) -> Self {
RouterService {
supergraph_creator,
apq_layer,
persisted_query_layer,
query_analysis_layer,
batching,
oltp_error_metrics_mode,
}
}
}
Expand Down Expand Up @@ -309,7 +319,7 @@ impl RouterService {
&& (accepts_json || accepts_wildcard)
{
if !response.errors.is_empty() {
Self::count_errors(&response.errors);
self.count_errors(&response.errors, &context);
}

parts
Expand Down Expand Up @@ -346,7 +356,7 @@ impl RouterService {
}

if !response.errors.is_empty() {
Self::count_errors(&response.errors);
self.count_errors(&response.errors, &context);
}

// Useful when you're using a proxy like nginx which enable proxy_buffering by default (http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_buffering)
Expand All @@ -373,12 +383,8 @@ impl RouterService {

Ok(RouterResponse { response, context })
} else {
u64_counter!(
"apollo.router.graphql_error",
"Number of GraphQL error responses returned by the router",
1,
code = "INVALID_ACCEPT_HEADER"
);
self.count_error_codes(vec![Some("INVALID_ACCEPT_HEADER")], &context);

// Useful for selector in spans/instruments/events
context.insert_json_value(
CONTAINS_GRAPHQL_ERROR,
Expand Down Expand Up @@ -812,12 +818,47 @@ impl RouterService {
Ok(graphql_requests)
}

fn count_errors(errors: &[graphql::Error]) {
fn count_errors(&self, errors: &[graphql::Error], context: &Context) {
let codes = errors
.iter()
.map(|e| e.extensions.get("code").and_then(|c| c.as_str()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC for errors without a code, it would count against the "empty" code e.g. code="". I think we could consider defining an explicit UNKNOWN code but it seems fine to support this with an empty code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this is an empty string. I can easily add a fallback code but I think an empty string is just as good.

.collect();
self.count_error_codes(codes, context);
bonnici marked this conversation as resolved.
Show resolved Hide resolved
}

fn count_error_codes(&self, codes: Vec<Option<&str>>, context: &Context) {
let unwrap_context_string = |context_key: &str| -> String {
context
.get::<_, String>(context_key)
.unwrap_or_default()
.unwrap_or_default()
};

let operation_id = unwrap_context_string(APOLLO_OPERATION_ID);
let operation_name = unwrap_context_string(OPERATION_NAME);
let operation_kind = unwrap_context_string(OPERATION_KIND);
let client_name = unwrap_context_string(CLIENT_NAME);
let client_version = unwrap_context_string(CLIENT_VERSION);

let mut map = HashMap::new();
for error in errors {
let code = error.extensions.get("code").and_then(|c| c.as_str());
for code in &codes {
let entry = map.entry(code).or_insert(0u64);
*entry += 1;

if matches!(self.oltp_error_metrics_mode, OtlpErrorMetricsMode::Enabled) {
let code_str = code.unwrap_or_default().to_string();
u64_counter!(
"apollo.router.operations.error",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that the Router team has asked for this to be a private metric - would that mean we would need to call it apollo_private.router.operations.error? Ultimately we don't want it to be, so I'm fine with it not being private at all.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should pass the "private" filter regex and not pass the "public" filter regex, so it should be private. I don't really know how to test that, though, because it is still coming through in prometheus.

    /// Builds a `FilterMeterProvider` around `delegate`, configured with an
    /// `allow` pattern.
    ///
    /// NOTE(review): presumably only instruments whose names match the pattern
    /// are exposed through the returned provider — confirm against
    /// `FilterMeterProvider`.
    ///
    /// # Panics
    /// Panics if the hard-coded pattern fails to compile.
    pub(crate) fn private<T: Into<MeterProvider>>(delegate: T) -> Self {
        // Pattern handed to the builder's `allow` setting.
        let allowed_names = Regex::new(
            r"apollo\.(graphos\.cloud|router\.(operations?|lifecycle|config|schema|query|query_planning|telemetry|instance))(\..*|$)|apollo_router_uplink_fetch_count_total|apollo_router_uplink_fetch_duration_seconds",
        )
        .expect("regex should have been valid");

        FilterMeterProvider::builder()
            .delegate(delegate)
            .allow(allowed_names)
            .build()
    }

    /// Builds a `FilterMeterProvider` around `delegate`, configured with a
    /// `deny` pattern.
    ///
    /// NOTE(review): presumably instruments whose names match the pattern are
    /// hidden from the returned provider — confirm against
    /// `FilterMeterProvider`.
    ///
    /// # Panics
    /// Panics if the hard-coded pattern fails to compile.
    pub(crate) fn public<T: Into<MeterProvider>>(delegate: T) -> Self {
        // Pattern handed to the builder's `deny` setting.
        let denied_names = Regex::new(r"apollo\.router\.(config|entities|instance|operations\.(connectors|fetch|request_size|response_size)|schema\.connectors)(\..*|$)")
            .expect("regex should have been valid");

        FilterMeterProvider::builder()
            .delegate(delegate)
            .deny(denied_names)
            .build()
    }

"Number of errors returned by operation",
1,
"apollo.operation.id" = operation_id.clone(),
"graphql.operation.name" = operation_name.clone(),
"graphql.operation.type" = operation_kind.clone(),
"apollo.client.name" = client_name.clone(),
"apollo.client.version" = client_version.clone(),
"graphql.error.extensions.code" = code_str
);
}
}

for (code, count) in map {
Expand Down Expand Up @@ -865,6 +906,7 @@ pub(crate) struct RouterCreator {
pub(crate) persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
}

impl ServiceFactory<router::Request> for RouterCreator {
Expand Down Expand Up @@ -916,13 +958,24 @@ impl RouterCreator {
// For now just call activate to make the gauges work on the happy path.
apq_layer.activate();

let oltp_error_metrics_mode = match configuration.apollo_plugins.plugins.get("telemetry") {
Some(telemetry_config) => {
match serde_json::from_value::<Conf>(telemetry_config.clone()) {
Ok(conf) => conf.apollo.errors.experimental_otlp_error_metrics,
_ => OtlpErrorMetricsMode::default(),
}
}
_ => OtlpErrorMetricsMode::default(),
};

Ok(Self {
supergraph_creator,
static_page,
apq_layer,
query_analysis_layer,
persisted_query_layer,
batching: configuration.batching.clone(),
oltp_error_metrics_mode,
})
}

Expand All @@ -940,6 +993,7 @@ impl RouterCreator {
self.persisted_query_layer.clone(),
self.query_analysis_layer.clone(),
self.batching.clone(),
self.oltp_error_metrics_mode,
));

ServiceBuilder::new()
Expand Down
107 changes: 107 additions & 0 deletions apollo-router/src/services/router/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,29 @@ use http::HeaderValue;
use http::Method;
use http::Uri;
use mime::APPLICATION_JSON;
use opentelemetry_api::KeyValue;
use serde_json_bytes::json;
use tower::ServiceExt;
use tower_service::Service;

use crate::context::OPERATION_KIND;
use crate::context::OPERATION_NAME;
use crate::graphql;
use crate::metrics::FutureMetricsExt;
use crate::plugins::telemetry::CLIENT_NAME;
use crate::plugins::telemetry::CLIENT_VERSION;
use crate::query_planner::APOLLO_OPERATION_ID;
use crate::services::router;
use crate::services::router::service::from_supergraph_mock_callback;
use crate::services::router::service::from_supergraph_mock_callback_and_configuration;
use crate::services::router::service::process_vary_header;
use crate::services::subgraph;
use crate::services::supergraph;
use crate::services::SupergraphRequest;
use crate::services::SupergraphResponse;
use crate::services::MULTIPART_DEFER_CONTENT_TYPE;
use crate::test_harness::make_fake_batch;
use crate::Configuration;
use crate::Context;

// Test Vary processing
Expand Down Expand Up @@ -590,3 +599,101 @@ async fn escaped_quotes_in_string_literal() {
// The string literal made it through unchanged:
assert!(subgraph_query.contains(r#"reviewsForAuthor(authorID: "\"1\"")"#));
}

/// With `telemetry.apollo.errors.experimental_otlp_error_metrics` set to
/// `enabled`, a response carrying two errors with distinct extension codes
/// must increment `apollo.router.operations.error` once per code, tagged with
/// the operation and client attributes seeded into the request context.
#[tokio::test]
async fn it_stores_operation_error_when_config_is_enabled() {
    async {
        let query = "query operationName { __typename }";
        let operation_name = "operationName";
        let operation_type = "query";
        let operation_id = "opId";
        let client_name = "client";
        let client_version = "version";

        // Turn the experimental metric on through the telemetry plugin configuration.
        let mut config = Configuration::default();
        config.apollo_plugins.plugins.insert(
            "telemetry".to_string(),
            serde_json::json!({
                "apollo": {
                    "errors": {
                        "experimental_otlp_error_metrics": "enabled"
                    }
                }
            }),
        );

        // The mocked supergraph always answers with two errors, each with its own code.
        let mut router_service = from_supergraph_mock_callback_and_configuration(
            move |req| {
                let example_response = graphql::Response::builder()
                    .data(json!({"data": null}))
                    .errors(vec![
                        graphql::Error::builder()
                            .message("some error")
                            .extension_code("SOME_ERROR_CODE")
                            .build(),
                        graphql::Error::builder()
                            .message("some other error")
                            .extension_code("SOME_OTHER_ERROR_CODE")
                            .build(),
                    ])
                    .build();

                Ok(SupergraphResponse::new_from_graphql_response(
                    example_response,
                    req.context,
                ))
            },
            Arc::new(config),
        )
        .await;

        // Seed the context with the values the metric attributes are read from.
        let context = Context::new();
        context.insert_json_value(APOLLO_OPERATION_ID, operation_id.into());
        context.insert_json_value(OPERATION_NAME, operation_name.into());
        // Store the operation kind ("query"), not the whole query document, so the
        // `graphql.operation.type` assertions below match what was seeded here
        // rather than relying on the value being overwritten later.
        context.insert_json_value(OPERATION_KIND, operation_type.into());
        context.insert_json_value(CLIENT_NAME, client_name.into());
        context.insert_json_value(CLIENT_VERSION, client_version.into());

        let post_request = supergraph::Request::builder()
            .query(query)
            .operation_name(operation_name)
            .header(CONTENT_TYPE, APPLICATION_JSON.essence_str())
            .uri(Uri::from_static("/"))
            .method(Method::POST)
            .context(context)
            .build()
            .unwrap();

        router_service
            .call(post_request.try_into().unwrap())
            .await
            .unwrap();

        // One increment per distinct error code, each carrying the full attribute set.
        assert_counter!(
            "apollo.router.operations.error",
            1,
            &[
                KeyValue::new("apollo.operation.id", operation_id),
                KeyValue::new("graphql.operation.name", operation_name),
                KeyValue::new("graphql.operation.type", operation_type),
                KeyValue::new("apollo.client.name", client_name),
                KeyValue::new("apollo.client.version", client_version),
                KeyValue::new("graphql.error.extensions.code", "SOME_ERROR_CODE"),
            ]
        );
        assert_counter!(
            "apollo.router.operations.error",
            1,
            &[
                KeyValue::new("apollo.operation.id", operation_id),
                KeyValue::new("graphql.operation.name", operation_name),
                KeyValue::new("graphql.operation.type", operation_type),
                KeyValue::new("apollo.client.name", client_name),
                KeyValue::new("apollo.client.version", client_version),
                KeyValue::new("graphql.error.extensions.code", "SOME_OTHER_ERROR_CODE"),
            ]
        );
    }
    .with_metrics()
    .await;
}
Loading