Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Per-operation and client error metric #6443

Open
wants to merge 23 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .changesets/exp_njm_operation_error_metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
### Experimental per-operation error metrics ([PR #6443](https://github.com/apollographql/router/pull/6443))

Adds a new experimental OpenTelemetry metric that reports error counts at a per-operation and per-client level. The metric carries the following attributes:
* Operation name
* Operation type (query/mutation/subscription)
* Apollo operation ID
* Client name
* Client version
* Error code

This metric is currently sent only to GraphOS and is not available in third-party OTel destinations. It can be enabled with the configuration `telemetry.apollo.errors.experimental_otlp_error_metrics: enabled`.

By [@bonnici](https://github.com/bonnici) in https://github.com/apollographql/router/pull/6443
Original file line number Diff line number Diff line change
Expand Up @@ -2601,6 +2601,10 @@ expression: "&schema"
"ErrorsConfiguration": {
"additionalProperties": false,
"properties": {
"experimental_otlp_error_metrics": {
"$ref": "#/definitions/OtlpErrorMetricsMode",
"description": "#/definitions/OtlpErrorMetricsMode"
},
"subgraph": {
"$ref": "#/definitions/SubgraphErrorConfig",
"description": "#/definitions/SubgraphErrorConfig"
Expand Down Expand Up @@ -4318,6 +4322,25 @@ expression: "&schema"
}
]
},
"OtlpErrorMetricsMode": {
"description": "Open Telemetry error metrics mode",
"oneOf": [
{
"description": "Do not store OTLP error metrics",
"enum": [
"disabled"
],
"type": "string"
},
{
"description": "Send OTLP error metrics to Apollo Studio",
"enum": [
"enabled"
],
"type": "string"
}
]
},
"PersistedQueries": {
"additionalProperties": false,
"description": "Persisted Queries (PQ) configuration",
Expand Down
14 changes: 14 additions & 0 deletions apollo-router/src/plugins/telemetry/apollo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ pub(crate) struct Config {
pub(crate) struct ErrorsConfiguration {
/// Handling of errors coming from subgraph
pub(crate) subgraph: SubgraphErrorConfig,

/// Configuration for storing and sending error metrics via OTLP
pub(crate) experimental_otlp_error_metrics: OtlpErrorMetricsMode,
}

#[derive(Debug, Clone, Deserialize, JsonSchema, Default)]
Expand Down Expand Up @@ -160,6 +163,17 @@ impl SubgraphErrorConfig {
}
}

/// Open Telemetry error metrics mode
// The variants deserialize from their lowercase names (`disabled` / `enabled`) because of
// `rename_all = "lowercase"`, and `Disabled` is the `Default` value.
// NOTE(review): the `///` text on this enum and its variants appears verbatim as the
// `description` entries for `OtlpErrorMetricsMode` in the configuration JSON schema snapshot,
// so rewording it presumably requires regenerating that snapshot — confirm before editing.
#[derive(Clone, Default, Debug, Deserialize, JsonSchema, Copy)]
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub(crate) enum OtlpErrorMetricsMode {
/// Do not store OTLP error metrics
#[default]
Disabled,
/// Send OTLP error metrics to Apollo Studio
Enabled,
}

/// Returns the default field-level instrumentation sampler: a
/// `SamplerOption::TraceIdRatioBased` sampler with a ratio of `0.01`.
// NOTE(review): presumably a ratio of 0.01 means roughly 1% of traces are sampled — confirm
// against the `SamplerOption` definition.
const fn default_field_level_instrumentation_sampler() -> SamplerOption {
SamplerOption::TraceIdRatioBased(0.01)
}
Expand Down
2 changes: 1 addition & 1 deletion apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ pub(crate) mod utils;

// Tracing consts
pub(crate) const CLIENT_NAME: &str = "apollo::telemetry::client_name";
const CLIENT_VERSION: &str = "apollo::telemetry::client_version";
pub(crate) const CLIENT_VERSION: &str = "apollo::telemetry::client_version";
const SUBGRAPH_FTV1: &str = "apollo::telemetry::subgraph_ftv1";
pub(crate) const STUDIO_EXCLUDE: &str = "apollo::telemetry::studio_exclude";
pub(crate) const SUPERGRAPH_SCHEMA_ID_CONTEXT_KEY: &str = "apollo::supergraph_schema_id";
Expand Down
85 changes: 76 additions & 9 deletions apollo-router/src/services/router/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,16 @@ use crate::cache::DeduplicatingCache;
use crate::configuration::Batching;
use crate::configuration::BatchingMode;
use crate::context::CONTAINS_GRAPHQL_ERROR;
use crate::context::OPERATION_KIND;
use crate::context::OPERATION_NAME;
use crate::graphql;
use crate::http_ext;
use crate::json_ext::Object;
use crate::json_ext::Value;
#[cfg(test)]
use crate::plugin::test::MockSupergraphService;
use crate::plugins::telemetry::apollo::OtlpErrorMetricsMode;
use crate::plugins::telemetry::config::Conf;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_BODY;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_HEADERS;
use crate::plugins::telemetry::config_new::attributes::HTTP_REQUEST_URI;
Expand All @@ -54,9 +60,12 @@ use crate::plugins::telemetry::config_new::events::log_event;
use crate::plugins::telemetry::config_new::events::DisplayRouterRequest;
use crate::plugins::telemetry::config_new::events::DisplayRouterResponse;
use crate::plugins::telemetry::config_new::events::RouterResponseBodyExtensionType;
use crate::plugins::telemetry::CLIENT_NAME;
use crate::plugins::telemetry::CLIENT_VERSION;
use crate::protocols::multipart::Multipart;
use crate::protocols::multipart::ProtocolMode;
use crate::query_planner::InMemoryCachePlanner;
use crate::query_planner::APOLLO_OPERATION_ID;
use crate::router_factory::RouterFactory;
use crate::services::layers::apq::APQLayer;
use crate::services::layers::content_negotiation;
Expand Down Expand Up @@ -102,6 +111,7 @@ pub(crate) struct RouterService {
persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
}

impl RouterService {
Expand All @@ -111,13 +121,15 @@ impl RouterService {
persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
) -> Self {
RouterService {
supergraph_creator,
apq_layer,
persisted_query_layer,
query_analysis_layer,
batching,
oltp_error_metrics_mode,
}
}
}
Expand Down Expand Up @@ -309,7 +321,7 @@ impl RouterService {
&& (accepts_json || accepts_wildcard)
{
if !response.errors.is_empty() {
Self::count_errors(&response.errors);
self.count_errors(&response.errors, &context);
}

parts
Expand Down Expand Up @@ -346,7 +358,7 @@ impl RouterService {
}

if !response.errors.is_empty() {
Self::count_errors(&response.errors);
self.count_errors(&response.errors, &context);
}

// Useful when you're using a proxy like nginx which enable proxy_buffering by default (http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_buffering)
Expand All @@ -373,12 +385,8 @@ impl RouterService {

Ok(RouterResponse { response, context })
} else {
u64_counter!(
"apollo.router.graphql_error",
"Number of GraphQL error responses returned by the router",
1,
code = "INVALID_ACCEPT_HEADER"
);
self.count_error_codes(vec!["INVALID_ACCEPT_HEADER"], &context);

// Useful for selector in spans/instruments/events
context.insert_json_value(
CONTAINS_GRAPHQL_ERROR,
Expand Down Expand Up @@ -812,12 +820,40 @@ impl RouterService {
Ok(graphql_requests)
}

fn count_errors(errors: &[graphql::Error]) {
fn count_errors(&self, errors: &Vec<graphql::Error>, context: &Context) {
let unwrap_context_string = |context_key: &str| -> String {
context
.get::<_, String>(context_key)
.unwrap_or_default()
.unwrap_or_default()
};

let operation_id = unwrap_context_string(APOLLO_OPERATION_ID);
let operation_name = unwrap_context_string(OPERATION_NAME);
let operation_kind = unwrap_context_string(OPERATION_KIND);
let client_name = unwrap_context_string(CLIENT_NAME);
let client_version = unwrap_context_string(CLIENT_VERSION);

let mut map = HashMap::new();
for error in errors {
let code = error.extensions.get("code").and_then(|c| c.as_str());
let entry = map.entry(code).or_insert(0u64);
*entry += 1;

if matches!(self.oltp_error_metrics_mode, OtlpErrorMetricsMode::Enabled) {
let code_str = code.unwrap_or_default().to_string();
u64_counter!(
"apollo.router.operations.error",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that the Router team has asked for this to be a private metric - would that mean we would need to call it apollo_private.router.operations.error? Ultimately we don't want it to be, so I'm fine with it not being private at all.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should pass the "private" filter regex and not pass the "public" filter regex, so it should be private. I don't really know how to test that, though, because it is still coming through in Prometheus.

    pub(crate) fn private<T: Into<MeterProvider>>(delegate: T) -> Self {
        FilterMeterProvider::builder()
            .delegate(delegate)
            .allow(
                Regex::new(
                    r"apollo\.(graphos\.cloud|router\.(operations?|lifecycle|config|schema|query|query_planning|telemetry|instance))(\..*|$)|apollo_router_uplink_fetch_count_total|apollo_router_uplink_fetch_duration_seconds",
                )
                .expect("regex should have been valid"),
            )
            .build()
    }

    pub(crate) fn public<T: Into<MeterProvider>>(delegate: T) -> Self {
        FilterMeterProvider::builder()
            .delegate(delegate)
            .deny(
                Regex::new(r"apollo\.router\.(config|entities|instance|operations\.(connectors|fetch|request_size|response_size)|schema\.connectors)(\..*|$)")
                    .expect("regex should have been valid"),
            )
            .build()
    }

"Number of errors returned by operation",
1,
"apollo.operation.id" = operation_id.clone(),
"graphql.operation.name" = operation_name.clone(),
"graphql.operation.type" = operation_kind.clone(),
"apollo.client.name" = client_name.clone(),
"apollo.client.version" = client_version.clone(),
"graphql.error.extensions.code" = code_str
);
}
}

for (code, count) in map {
Expand All @@ -840,6 +876,24 @@ impl RouterService {
}
}
}

fn count_error_codes(&self, codes: Vec<&str>, context: &Context) {
let errors = codes
.iter()
.map(|c| {
let mut extensions = Object::new();
extensions.insert("code", Value::String((*c).into()));
graphql::Error {
message: "".into(),
locations: vec![],
path: None,
extensions,
}
})
.collect();

self.count_errors(&errors, context);
}
}

struct TranslateError<'a> {
Expand All @@ -865,6 +919,7 @@ pub(crate) struct RouterCreator {
pub(crate) persisted_query_layer: Arc<PersistedQueryLayer>,
query_analysis_layer: QueryAnalysisLayer,
batching: Batching,
oltp_error_metrics_mode: OtlpErrorMetricsMode,
}

impl ServiceFactory<router::Request> for RouterCreator {
Expand Down Expand Up @@ -916,13 +971,24 @@ impl RouterCreator {
// For now just call activate to make the gauges work on the happy path.
apq_layer.activate();

let oltp_error_metrics_mode = match configuration.apollo_plugins.plugins.get("telemetry") {
Some(telemetry_config) => {
match serde_json::from_value::<Conf>(telemetry_config.clone()) {
Ok(conf) => conf.apollo.errors.experimental_otlp_error_metrics,
_ => OtlpErrorMetricsMode::default(),
}
}
_ => OtlpErrorMetricsMode::default(),
};

Ok(Self {
supergraph_creator,
static_page,
apq_layer,
query_analysis_layer,
persisted_query_layer,
batching: configuration.batching.clone(),
oltp_error_metrics_mode,
})
}

Expand All @@ -940,6 +1006,7 @@ impl RouterCreator {
self.persisted_query_layer.clone(),
self.query_analysis_layer.clone(),
self.batching.clone(),
self.oltp_error_metrics_mode,
));

ServiceBuilder::new()
Expand Down
Loading
Loading