-
Notifications
You must be signed in to change notification settings - Fork 275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Per-operation and client error metric #6443
base: dev
Are you sure you want to change the base?
Changes from 16 commits
e369240
98eef0a
4e4d728
38d4517
b6e97be
f98b608
ff44170
5284c2b
90eaebd
498bbeb
d2e2610
2145777
a46fee7
da4e95c
e7b18e9
4a753af
054c87d
0561648
9b996e6
c213521
3a09ee8
abea7ac
872a8f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
### Experimental per-operation error metrics ([PR #6443](https://github.com/apollographql/router/pull/6443)) | ||
|
||
Adds a new experimental OpenTelemetry metric that includes error counts at a per-operation and per-client level. These metrics contain the following attributes: | ||
* Operation name | ||
* Operation type (query/mutation/subscription) | ||
* Apollo operation ID | ||
* Client name | ||
* Client version | ||
* Error code | ||
|
||
This metric is currently only sent to GraphOS and is not available in 3rd-party OTel destinations. | ||
|
||
By [@bonnici](https://github.com/bonnici) in https://github.com/apollographql/router/pull/6443 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,10 +42,14 @@ use crate::cache::DeduplicatingCache; | |
use crate::configuration::Batching; | ||
use crate::configuration::BatchingMode; | ||
use crate::context::CONTAINS_GRAPHQL_ERROR; | ||
use crate::context::OPERATION_KIND; | ||
use crate::context::OPERATION_NAME; | ||
use crate::graphql; | ||
use crate::http_ext; | ||
#[cfg(test)] | ||
use crate::plugin::test::MockSupergraphService; | ||
use crate::plugins::telemetry::apollo::OtlpErrorMetricsMode; | ||
use crate::plugins::telemetry::config::Conf; | ||
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_BODY; | ||
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_HEADERS; | ||
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_URI; | ||
|
@@ -54,9 +58,12 @@ use crate::plugins::telemetry::config_new::events::log_event; | |
use crate::plugins::telemetry::config_new::events::DisplayRouterRequest; | ||
use crate::plugins::telemetry::config_new::events::DisplayRouterResponse; | ||
use crate::plugins::telemetry::config_new::events::RouterResponseBodyExtensionType; | ||
use crate::plugins::telemetry::CLIENT_NAME; | ||
use crate::plugins::telemetry::CLIENT_VERSION; | ||
use crate::protocols::multipart::Multipart; | ||
use crate::protocols::multipart::ProtocolMode; | ||
use crate::query_planner::InMemoryCachePlanner; | ||
use crate::query_planner::APOLLO_OPERATION_ID; | ||
use crate::router_factory::RouterFactory; | ||
use crate::services::layers::apq::APQLayer; | ||
use crate::services::layers::content_negotiation; | ||
|
@@ -102,6 +109,7 @@ pub(crate) struct RouterService { | |
persisted_query_layer: Arc<PersistedQueryLayer>, | ||
query_analysis_layer: QueryAnalysisLayer, | ||
batching: Batching, | ||
oltp_error_metrics_mode: OtlpErrorMetricsMode, | ||
} | ||
|
||
impl RouterService { | ||
|
@@ -111,13 +119,15 @@ impl RouterService { | |
persisted_query_layer: Arc<PersistedQueryLayer>, | ||
query_analysis_layer: QueryAnalysisLayer, | ||
batching: Batching, | ||
oltp_error_metrics_mode: OtlpErrorMetricsMode, | ||
) -> Self { | ||
RouterService { | ||
supergraph_creator, | ||
apq_layer, | ||
persisted_query_layer, | ||
query_analysis_layer, | ||
batching, | ||
oltp_error_metrics_mode, | ||
} | ||
} | ||
} | ||
|
@@ -309,7 +319,7 @@ impl RouterService { | |
&& (accepts_json || accepts_wildcard) | ||
{ | ||
if !response.errors.is_empty() { | ||
Self::count_errors(&response.errors); | ||
self.count_errors(&response.errors, &context); | ||
} | ||
|
||
parts | ||
|
@@ -346,7 +356,7 @@ impl RouterService { | |
} | ||
|
||
if !response.errors.is_empty() { | ||
Self::count_errors(&response.errors); | ||
self.count_errors(&response.errors, &context); | ||
} | ||
|
||
// Useful when you're using a proxy like nginx which enable proxy_buffering by default (http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_buffering) | ||
|
@@ -373,12 +383,8 @@ impl RouterService { | |
|
||
Ok(RouterResponse { response, context }) | ||
} else { | ||
u64_counter!( | ||
"apollo.router.graphql_error", | ||
"Number of GraphQL error responses returned by the router", | ||
1, | ||
code = "INVALID_ACCEPT_HEADER" | ||
); | ||
self.count_error_codes(vec![Some("INVALID_ACCEPT_HEADER")], &context); | ||
|
||
// Useful for selector in spans/instruments/events | ||
context.insert_json_value( | ||
CONTAINS_GRAPHQL_ERROR, | ||
|
@@ -812,12 +818,47 @@ impl RouterService { | |
Ok(graphql_requests) | ||
} | ||
|
||
fn count_errors(errors: &[graphql::Error]) { | ||
fn count_errors(&self, errors: &[graphql::Error], context: &Context) { | ||
let codes = errors | ||
.iter() | ||
.map(|e| e.extensions.get("code").and_then(|c| c.as_str())) | ||
.collect(); | ||
self.count_error_codes(codes, context); | ||
bonnici marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
fn count_error_codes(&self, codes: Vec<Option<&str>>, context: &Context) { | ||
let unwrap_context_string = |context_key: &str| -> String { | ||
context | ||
.get::<_, String>(context_key) | ||
.unwrap_or_default() | ||
.unwrap_or_default() | ||
}; | ||
|
||
let operation_id = unwrap_context_string(APOLLO_OPERATION_ID); | ||
let operation_name = unwrap_context_string(OPERATION_NAME); | ||
let operation_kind = unwrap_context_string(OPERATION_KIND); | ||
let client_name = unwrap_context_string(CLIENT_NAME); | ||
let client_version = unwrap_context_string(CLIENT_VERSION); | ||
|
||
let mut map = HashMap::new(); | ||
for error in errors { | ||
let code = error.extensions.get("code").and_then(|c| c.as_str()); | ||
for code in &codes { | ||
let entry = map.entry(code).or_insert(0u64); | ||
*entry += 1; | ||
|
||
if matches!(self.oltp_error_metrics_mode, OtlpErrorMetricsMode::Enabled) { | ||
let code_str = code.unwrap_or_default().to_string(); | ||
u64_counter!( | ||
"apollo.router.operations.error", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know that the Router team has asked for this to be a private metric - would that mean we would need to call it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should pass the "private" filter regex and not pass the "public" filter regex, so it should be private. I don't really know how to test that thought because it is still coming through in prometheus.
|
||
"Number of errors returned by operation", | ||
1, | ||
"apollo.operation.id" = operation_id.clone(), | ||
"graphql.operation.name" = operation_name.clone(), | ||
"graphql.operation.type" = operation_kind.clone(), | ||
"apollo.client.name" = client_name.clone(), | ||
"apollo.client.version" = client_version.clone(), | ||
"graphql.error.extensions.code" = code_str | ||
); | ||
} | ||
} | ||
|
||
for (code, count) in map { | ||
|
@@ -865,6 +906,7 @@ pub(crate) struct RouterCreator { | |
pub(crate) persisted_query_layer: Arc<PersistedQueryLayer>, | ||
query_analysis_layer: QueryAnalysisLayer, | ||
batching: Batching, | ||
oltp_error_metrics_mode: OtlpErrorMetricsMode, | ||
} | ||
|
||
impl ServiceFactory<router::Request> for RouterCreator { | ||
|
@@ -916,13 +958,24 @@ impl RouterCreator { | |
// For now just call activate to make the gauges work on the happy path. | ||
apq_layer.activate(); | ||
|
||
let oltp_error_metrics_mode = match configuration.apollo_plugins.plugins.get("telemetry") { | ||
Some(telemetry_config) => { | ||
match serde_json::from_value::<Conf>(telemetry_config.clone()) { | ||
Ok(conf) => conf.apollo.errors.experimental_otlp_error_metrics, | ||
_ => OtlpErrorMetricsMode::default(), | ||
} | ||
} | ||
_ => OtlpErrorMetricsMode::default(), | ||
}; | ||
|
||
Ok(Self { | ||
supergraph_creator, | ||
static_page, | ||
apq_layer, | ||
query_analysis_layer, | ||
persisted_query_layer, | ||
batching: configuration.batching.clone(), | ||
oltp_error_metrics_mode, | ||
}) | ||
} | ||
|
||
|
@@ -940,6 +993,7 @@ impl RouterCreator { | |
self.persisted_query_layer.clone(), | ||
self.query_analysis_layer.clone(), | ||
self.batching.clone(), | ||
self.oltp_error_metrics_mode, | ||
)); | ||
|
||
ServiceBuilder::new() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC for errors without a code, it would count against the "empty" code e.g.
code=""
. I think we could consider defining an explicitUNKNOWN
code but it seems fine to support this with an empty code.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this is an empty string. I can easily add a fallback code but I think an empty string is just as good.