From a8597852689db6fca7856845d0d33de934d8c63d Mon Sep 17 00:00:00 2001 From: Simon Worthington Date: Tue, 23 Jan 2024 08:58:04 +1100 Subject: [PATCH 1/5] Add authentication that supports usernames and passwords This commit adds `ask` mode authentication that allows node operators to configure Bacalhau to ask the user for arbitrary information to be used as a credential. This method can be used to implement basic usernames and passwords, shared secrets, security questions and even 2FA. The associated policy additionally returns a JSON Schema to show what information is required. The CLI uses the schema to ask the user for the right information. In the future, the Web UI will do this as well. --- apps/job-info-consumer/consumer/go.mod | 2 +- apps/job-info-consumer/consumer/go.sum | 2 +- cmd/util/auth/ask.go | 77 +++++++++++++++++++++ cmd/util/auth/auth.go | 9 +-- docs/docs/dev/auth_flow.md | 19 ++++++ docs/docs/running-node/auth.md | 19 ++++++ go.mod | 5 +- go.sum | 5 +- ops/aws/canary/lambda/go.mod | 3 +- ops/aws/canary/lambda/go.sum | 2 +- pkg/authn/ask/ask_ns_example.rego | 34 ++++++++++ pkg/authn/ask/ask_ns_password.rego | 67 ++++++++++++++++++ pkg/authn/ask/ask_ns_test_password.rego | 71 +++++++++++++++++++ pkg/authn/ask/authenticator.go | 90 +++++++++++++++++++++++++ pkg/authn/ask/authenticator_test.go | 82 ++++++++++++++++++++++ pkg/authn/ask/gen_password/main.go | 43 ++++++++++++ pkg/authn/types.go | 3 + pkg/lib/policy/policy.go | 2 +- pkg/lib/policy/scrypt.go | 60 +++++++++++++++++ pkg/node/factories.go | 21 ++++-- 20 files changed, 598 insertions(+), 18 deletions(-) create mode 100644 cmd/util/auth/ask.go create mode 100644 pkg/authn/ask/ask_ns_example.rego create mode 100644 pkg/authn/ask/ask_ns_password.rego create mode 100644 pkg/authn/ask/ask_ns_test_password.rego create mode 100644 pkg/authn/ask/authenticator.go create mode 100644 pkg/authn/ask/authenticator_test.go create mode 100644 pkg/authn/ask/gen_password/main.go create mode 100644 
pkg/lib/policy/scrypt.go diff --git a/apps/job-info-consumer/consumer/go.mod b/apps/job-info-consumer/consumer/go.mod index 0412bfe53d..9ab10658d4 100644 --- a/apps/job-info-consumer/consumer/go.mod +++ b/apps/job-info-consumer/consumer/go.mod @@ -323,7 +323,7 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.26.0 // indirect go4.org v0.0.0-20230225012048-214862532bf5 // indirect - golang.org/x/crypto v0.17.0 // indirect + golang.org/x/crypto v0.18.0 // indirect golang.org/x/exp v0.0.0-20230321023759-10a507213a29 // indirect golang.org/x/mod v0.14.0 // indirect golang.org/x/net v0.19.0 // indirect diff --git a/apps/job-info-consumer/consumer/go.sum b/apps/job-info-consumer/consumer/go.sum index 149f0c38f1..6188c5be1c 100644 --- a/apps/job-info-consumer/consumer/go.sum +++ b/apps/job-info-consumer/consumer/go.sum @@ -1183,7 +1183,7 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= diff --git a/cmd/util/auth/ask.go b/cmd/util/auth/ask.go new file mode 100644 index 0000000000..e1f6aee673 --- /dev/null +++ b/cmd/util/auth/ask.go @@ -0,0 +1,77 @@ +package auth + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "os" + + 
"github.com/santhosh-tekuri/jsonschema/v5" + "github.com/spf13/cobra" + "golang.org/x/term" +) + +// Returns a responder that responds to authentication requirements of type +// `authn.MethodTypeAsk`. Reads the JSON Schema returned by the `ask` endpoint +// and uses it to ask appropriate questions to the user on their terminal, and +// then returns their response as serialized JSON. +func askResponder(cmd *cobra.Command) responder { + return func(request *json.RawMessage) ([]byte, error) { + compiler := jsonschema.NewCompiler() + compiler.ExtractAnnotations = true + + if err := compiler.AddResource("", bytes.NewReader(*request)); err != nil { + return nil, err + } + + schema, err := compiler.Compile("") + if err != nil { + return nil, err + } + + response := make(map[string]any, len(schema.Properties)) + for _, name := range schema.Required { + subschema := schema.Properties[name] + + if len(subschema.Types) < 1 { + return nil, fmt.Errorf("invalid schema: property %q has no type", name) + } + + typ := subschema.Types[0] + if typ == "object" { + return nil, fmt.Errorf("invalid schema: property %q has non-scalar type", name) + } + + fmt.Fprintf(cmd.ErrOrStderr(), "%s: ", name) + + var input []byte + var err error + + // If the property is marked as write only, assume it is a sensitive + // value and make sure we don't display it in the terminal + if subschema.WriteOnly { + input, err = term.ReadPassword(int(os.Stdin.Fd())) + fmt.Fprintln(cmd.ErrOrStderr()) + } else { + reader := bufio.NewScanner(cmd.InOrStdin()) + if reader.Scan() { + input = reader.Bytes() + } + err = reader.Err() + } + + if err != nil { + return nil, err + } + response[name] = string(input) + } + + respBytes, err := json.Marshal(response) + if err != nil { + return nil, err + } + + return respBytes, schema.Validate(response) + } +} diff --git a/cmd/util/auth/auth.go b/cmd/util/auth/auth.go index 25b1158c6c..74b376752e 100644 --- a/cmd/util/auth/auth.go +++ b/cmd/util/auth/auth.go @@ -19,11 +19,12 
@@ import ( type responder = func(request *json.RawMessage) (response []byte, err error) -var supportedMethods map[authn.MethodType]responder = map[authn.MethodType]responder{ - authn.MethodTypeChallenge: challenge.Respond, -} - func RunAuthenticationFlow(cmd *cobra.Command) (string, error) { + supportedMethods := map[authn.MethodType]responder{ + authn.MethodTypeChallenge: challenge.Respond, + authn.MethodTypeAsk: askResponder(cmd), + } + client := util.GetAPIClientV2(cmd.Context()) methods, err := client.Auth().Methods(&apimodels.ListAuthnMethodsRequest{}) if err != nil { diff --git a/docs/docs/dev/auth_flow.md b/docs/docs/dev/auth_flow.md index f20f1434e9..53f0c1cbdd 100644 --- a/docs/docs/dev/auth_flow.md +++ b/docs/docs/dev/auth_flow.md @@ -83,6 +83,25 @@ return to the endpoint. } ``` +### `ask` authentication + +This method requires the user to manually input some information. This method +can be used to implement username and password authentication, shared secret +authentication, and even 2FA or security question auth. + +The required information is represented by a JSON Schema in the object itself. +The implementation should parse the JSON Schema and ask the user questions to +populate an object that validates against it. + +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://bacalhau.org/auth/ask", + "type": "object", + "$ref": "https://json-schema.org/draft/2020-12/schema" +} +``` + ## 2. Run the authn flow and submit the result for an access token The user agent decides which authentication method to use (e.g. by asking the diff --git a/docs/docs/running-node/auth.md b/docs/docs/running-node/auth.md index 4958b5fa9a..2a409e0f91 100644 --- a/docs/docs/running-node/auth.md +++ b/docs/docs/running-node/auth.md @@ -47,3 +47,22 @@ include acceptable client IDs, found by running `bacalhau id`. Once the node is restarted, only keys in the allowed list will be able to access any API.
+ +## Username and password access + +Users can authenticate using a username and password instead of specifying a +private key for access. Again, this requires installation of an appropriate +policy on the server. + + curl -sL https://raw.githubusercontent.com/bacalhau-project/bacalhau/main/pkg/authn/ask/ask_ns_password.rego -o ~/.bacalhau/ask_ns_password.rego + bacalhau config set Node.Auth.Methods.Password.Type ask + bacalhau config set Node.Auth.Methods.Password.PolicyPath ~/.bacalhau/ask_ns_password.rego + +Passwords are not stored in plaintext and are salted. The downloaded policy +expects password hashes and salts generated by `scrypt`. To generate a salted +password hash, the helper script in `pkg/authn/ask/gen_password` can be used: + + cd pkg/authn/ask/gen_password && go run . + +This will ask for a password and generate a salt and hash to authenticate with +it. Add the username, with the encoded hash and salt, to the `userlist` in `ask_ns_password.rego`. diff --git a/go.mod b/go.mod index 891812c0a5..7027a857fb 100644 --- a/go.mod +++ b/go.mod @@ -56,6 +56,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/ricochet2200/go-disk-usage/du v0.0.0-20210707232629-ac9918953285 github.com/rs/zerolog v1.31.0 + github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/spf13/cobra v1.8.0 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.16.0 @@ -82,7 +83,7 @@ require ( go.uber.org/mock v0.4.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.26.0 - golang.org/x/crypto v0.17.0 + golang.org/x/crypto v0.18.0 golang.org/x/exp v0.0.0-20230321023759-10a507213a29 gopkg.in/alessio/shellescape.v1 v1.0.0-20170105083845-52074bc9df61 k8s.io/apimachinery v0.29.0 @@ -373,7 +374,7 @@ require ( golang.org/x/oauth2 v0.13.0 // indirect golang.org/x/sync v0.6.0 golang.org/x/sys v0.16.0 // indirect - golang.org/x/term v0.16.0 // indirect + golang.org/x/term v0.16.0 golang.org/x/text v0.14.0 // indirect golang.org/x/time v0.5.0 golang.org/x/tools v0.16.1 // indirect diff --git a/go.sum b/go.sum index
6e43170e77..c5045d1b6b 100644 --- a/go.sum +++ b/go.sum @@ -1069,6 +1069,8 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA= github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA= +github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= +github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= @@ -1341,8 +1343,9 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= diff --git a/ops/aws/canary/lambda/go.mod 
b/ops/aws/canary/lambda/go.mod index 04056f3ff5..14aca0427b 100644 --- a/ops/aws/canary/lambda/go.mod +++ b/ops/aws/canary/lambda/go.mod @@ -304,14 +304,13 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.26.0 // indirect go4.org v0.0.0-20230225012048-214862532bf5 // indirect - golang.org/x/crypto v0.17.0 // indirect + golang.org/x/crypto v0.18.0 // indirect golang.org/x/exp v0.0.0-20230321023759-10a507213a29 // indirect golang.org/x/mod v0.14.0 // indirect golang.org/x/net v0.19.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect golang.org/x/sync v0.6.0 // indirect golang.org/x/sys v0.16.0 // indirect - golang.org/x/term v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.16.1 // indirect diff --git a/ops/aws/canary/lambda/go.sum b/ops/aws/canary/lambda/go.sum index d7671afce6..8c8475f75a 100644 --- a/ops/aws/canary/lambda/go.sum +++ b/ops/aws/canary/lambda/go.sum @@ -1176,7 +1176,7 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= diff --git a/pkg/authn/ask/ask_ns_example.rego b/pkg/authn/ask/ask_ns_example.rego new file mode 100644 index 0000000000..abebbf467b --- /dev/null 
+++ b/pkg/authn/ask/ask_ns_example.rego @@ -0,0 +1,34 @@ +package bacalhau.authn + +import rego.v1 + +schema := { + "type": "object", + "properties": {"magic": {"type": "string"}}, + "required": ["magic"], +} + +token := t if { + input.magic == "open sesame" + + t := io.jwt.encode_sign( + { + "typ": "JWT", + "alg": "RS256", + }, + { + "iss": input.nodeId, + "sub": "aladdin", + "aud": [input.nodeId], + "iat": now, + "exp": one_month, + "ns": { + # Read-only access to all namespaces + "*": read_only, + # Writable access to own namespace + "genie": full_access, + }, + }, + input.signingKey, + ) +} diff --git a/pkg/authn/ask/ask_ns_password.rego b/pkg/authn/ask/ask_ns_password.rego new file mode 100644 index 0000000000..35e4604f5f --- /dev/null +++ b/pkg/authn/ask/ask_ns_password.rego @@ -0,0 +1,67 @@ +package bacalhau.authn + +import rego.v1 + +# Implements a policy where clients that supply a valid username and password +# are permitted access. Anonymous users are not permitted. +# +# Modify the `userlist` to control what users are permitted access. +# Modify the `ns` key of the token to control what namespaces they can access. + +schema := { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "username": {"type": "string"}, + "password": {"type": "string", "writeOnly": true}, + }, + "required": ["username", "password"], +} + +now := time.now_ns() / 1000 + +one_month := time.add_date(time.now_ns(), 0, 1, 0) / 1000 + +# userlist should be a map of usernames to scrypt-hashed passwords and salts. in +# a simple live setup they can be hard coded here as a map, and then apply +# appropriate file permissions to this policy. 
+userlist := { + # "username": ["hash", "salt"] +} + +valid_user := input.ask.username if { + input.ask.username in userlist + + hash := userlist[input.ask.username][0] + salt := userlist[input.ask.username][1] + hash == scrypt(input.ask.password, salt) +} + +token := io.jwt.encode_sign( + { + "typ": "JWT", + "alg": "RS256", + }, + { + "iss": input.nodeId, + "sub": valid_user, + "aud": [input.nodeId], + "iat": now, + "exp": one_month, + "ns": { + # Read-only access to all namespaces + "*": read_only, + # Writable access to own namespace + valid_user: full_access, + }, + }, + input.signingKey, +) + +namespace_read := 1 +namespace_write := 2 +namespace_download := 4 +namespace_cancel := 8 + +read_only := bits.and(namespace_read, namespace_download) +full_access := bits.and(bits.and(namespace_write, namespace_cancel), read_only) diff --git a/pkg/authn/ask/ask_ns_test_password.rego b/pkg/authn/ask/ask_ns_test_password.rego new file mode 100644 index 0000000000..c174d3afdb --- /dev/null +++ b/pkg/authn/ask/ask_ns_test_password.rego @@ -0,0 +1,71 @@ +package bacalhau.authn + +import rego.v1 + +# Implements a policy where clients that supply a valid username and password +# are permitted access. Anonymous users are not permitted. +# +# Modify the `userlist` to control what users are permitted access. +# Modify the `ns` key of the token to control what namespaces they can access. + +schema := { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "username": {"type": "string"}, + "password": {"type": "string", "writeOnly": true}, + }, + "required": ["username", "password"], +} + +now := time.now_ns() / 1000 + +one_month := time.add_date(time.now_ns(), 0, 1, 0) / 1000 + +# userlist should be a map of usernames to scrypt-hashed passwords and salts. in +# a simple live setup they can be hard coded here as a map, and then apply +# appropriate file permissions to this policy. 
+userlist := {"username": [ + # hash corresponding to "password" + "SN1U4DxjUhzyYG6p6nQ4by0IpudU8wdNs7Fpp42Ky9M=", + # a randomly generated salt + "d9ucnhE5kHEqm0YEWqN5qJmrHB+IDqjuPEwLkmZ9BGs=", +]} + +valid_user := user if { + some input.ask.username, _ in userlist + + user := input.ask.username + hash := userlist[input.ask.username][0] + salt := userlist[input.ask.username][1] + hash == scrypt(input.ask.password, salt) +} + +token := io.jwt.encode_sign( + { + "typ": "JWT", + "alg": "RS256", + }, + { + "iss": input.nodeId, + "sub": valid_user, + "aud": [input.nodeId], + "iat": now, + "exp": one_month, + "ns": { + # Read-only access to all namespaces + "*": read_only, + # Writable access to own namespace + valid_user: full_access, + }, + }, + input.signingKey, +) + +namespace_read := 1 +namespace_write := 2 +namespace_download := 4 +namespace_cancel := 8 + +read_only := bits.and(namespace_read, namespace_download) +full_access := bits.and(bits.and(namespace_write, namespace_cancel), read_only) diff --git a/pkg/authn/ask/authenticator.go b/pkg/authn/ask/authenticator.go new file mode 100644 index 0000000000..c81b73bc92 --- /dev/null +++ b/pkg/authn/ask/authenticator.go @@ -0,0 +1,90 @@ +package ask + +import ( + "context" + "crypto/rsa" + "encoding/json" + + "github.com/bacalhau-project/bacalhau/pkg/authn" + "github.com/bacalhau-project/bacalhau/pkg/lib/policy" + "github.com/lestrrat-go/jwx/jwk" + "github.com/pkg/errors" + "github.com/samber/lo" +) + +type policyData struct { + SigningKey jwk.Key `json:"signingKey"` + NodeID string `json:"nodeId"` + Ask map[string]string `json:"ask"` +} + +type requiredSchema = map[string]any + +const schemaRule = "bacalhau.authn.schema" + +type askAuthenticator struct { + authnPolicy *policy.Policy + key jwk.Key + nodeID string + + validate policy.Query[policyData, string] + schema policy.Query[any, requiredSchema] +} + +func NewAuthenticator(p *policy.Policy, key *rsa.PrivateKey, nodeID string) authn.Authenticator { + return 
askAuthenticator{ + authnPolicy: p, + key: lo.Must(jwk.New(key)), + nodeID: nodeID, + validate: policy.AddQuery[policyData, string](p, authn.PolicyTokenRule), + schema: policy.AddQuery[any, requiredSchema](p, schemaRule), + } +} + +// Authenticate implements authn.Authenticator. +func (authenticator askAuthenticator) Authenticate(ctx context.Context, req []byte) (authn.Authentication, error) { + var userInput map[string]string + err := json.Unmarshal(req, &userInput) + if err != nil { + return authn.Error(errors.Wrap(err, "invalid authentication data")) + } + + input := policyData{ + SigningKey: authenticator.key, + NodeID: authenticator.nodeID, + Ask: userInput, + } + + token, err := authenticator.validate(ctx, input) + if errors.Is(err, policy.ErrNoResult) { + return authn.Failed("credentials rejected"), nil + } else if err != nil { + return authn.Error(err) + } + + return authn.Authentication{Success: true, Token: token}, nil +} + +func (authenticator askAuthenticator) Schema(ctx context.Context) ([]byte, error) { + schema, err := authenticator.schema(ctx, nil) + if err != nil { + return nil, err + } + + return json.Marshal(schema) +} + +// IsInstalled implements authn.Authenticator. +func (authenticator askAuthenticator) IsInstalled(ctx context.Context) (bool, error) { + schema, err := authenticator.Schema(ctx) + return err == nil && schema != nil, err +} + +// Requirement implements authn.Authenticator. 
+func (authenticator askAuthenticator) Requirement() authn.Requirement { + params := lo.Must(authenticator.Schema(context.TODO())) + return authn.Requirement{ + Type: authn.MethodTypeAsk, + Params: (*json.RawMessage)(&params), + } +} diff --git a/pkg/authn/ask/authenticator_test.go b/pkg/authn/ask/authenticator_test.go new file mode 100644 index 0000000000..8da457a762 --- /dev/null +++ b/pkg/authn/ask/authenticator_test.go @@ -0,0 +1,82 @@ +//go:build unit || !integration + +package ask + +import ( + "context" + "crypto/rand" + "crypto/rsa" + "embed" + "encoding/json" + "testing" + + "github.com/bacalhau-project/bacalhau/pkg/authn" + "github.com/bacalhau-project/bacalhau/pkg/lib/policy" + "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/stretchr/testify/require" +) + +//go:embed *.rego +var policies embed.FS + +func setup(t *testing.T) authn.Authenticator { + logger.ConfigureTestLogging(t) + + authPolicy, err := (policy.FromFS(policies, "ask_ns_test_password.rego")) + require.NoError(t, err) + + rsaKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + + return NewAuthenticator(authPolicy, rsaKey, "node") +} + +func try(t *testing.T, authenticator authn.Authenticator, r any) authn.Authentication { + req, err := json.Marshal(r) + require.NoError(t, err) + + auth, err := authenticator.Authenticate(context.Background(), req) + require.NoError(t, err) + return auth +} + +func TestRequirement(t *testing.T) { + authenticator := setup(t) + + requirement := authenticator.Requirement() + require.Equal(t, authn.MethodTypeAsk, requirement.Type) + require.NoError(t, json.Unmarshal(*requirement.Params, &requiredSchema{})) +} + +func TestUnknownUser(t *testing.T) { + authenticator := setup(t) + + auth := try(t, authenticator, map[string]string{ + "username": "robert", + "password": "password", + }) + require.False(t, auth.Success, auth.Reason) + require.Empty(t, auth.Token) +} + +func TestIncorrectPassword(t *testing.T) { + authenticator :=
setup(t) + + auth := try(t, authenticator, map[string]string{ + "username": "username", + "password": "username", + }) + require.False(t, auth.Success, auth.Reason) + require.Empty(t, auth.Token) +} + +func TestGoodResponse(t *testing.T) { + authenticator := setup(t) + + auth := try(t, authenticator, map[string]string{ + "username": "username", + "password": "password", + }) + require.True(t, auth.Success, auth.Reason) + require.NotEmpty(t, auth.Token) +} diff --git a/pkg/authn/ask/gen_password/main.go b/pkg/authn/ask/gen_password/main.go new file mode 100644 index 0000000000..25e0706aa1 --- /dev/null +++ b/pkg/authn/ask/gen_password/main.go @@ -0,0 +1,43 @@ +package main + +import ( + "crypto/rand" + "encoding/json" + "fmt" + "os" + + "github.com/bacalhau-project/bacalhau/pkg/lib/policy" + "golang.org/x/term" +) + +const saltLength = 32 + +func main() { + fmt.Fprintf(os.Stderr, "Password: ") + + password, err := term.ReadPassword(int(os.Stdin.Fd())) + if err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + + salt := make([]byte, saltLength) + _, err = rand.Read(salt) + if err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + + hash, err := policy.Scrypt(password, salt) + if err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + + output := [][]byte{hash, salt} + err = json.NewEncoder(os.Stdout).Encode(output) + if err != nil { + fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } +} diff --git a/pkg/authn/types.go b/pkg/authn/types.go index b5716bac00..f582adaef8 100644 --- a/pkg/authn/types.go +++ b/pkg/authn/types.go @@ -40,6 +40,9 @@ const ( // An authentication method that provides a challenge string that the user // must sign using their private key. MethodTypeChallenge MethodType = "challenge" + + // An authentication method that asks the user to supply some credentials. 
+ MethodTypeAsk MethodType = "ask" ) // Requirement represents information about how to authenticate using a diff --git a/pkg/lib/policy/policy.go b/pkg/lib/policy/policy.go index 80b95c95e6..285f956ab7 100644 --- a/pkg/lib/policy/policy.go +++ b/pkg/lib/policy/policy.go @@ -67,7 +67,7 @@ type Query[Input, Output any] func(ctx context.Context, input Input) (Output, er // certain input type and returns a function that will execute the query when // given input of that type. func AddQuery[Input, Output any](runner *Policy, rule string) Query[Input, Output] { - opts := append(runner.modules, rego.Query("data."+rule)) + opts := append(runner.modules, rego.Query("data."+rule), scryptFn) query := lo.Must(rego.New(opts...).PrepareForEval(context.Background())) return func(ctx context.Context, t Input) (Output, error) { diff --git a/pkg/lib/policy/scrypt.go b/pkg/lib/policy/scrypt.go new file mode 100644 index 0000000000..15e7e41da7 --- /dev/null +++ b/pkg/lib/policy/scrypt.go @@ -0,0 +1,60 @@ +package policy + +import ( + "encoding/base64" + + "github.com/open-policy-agent/opa/ast" + "github.com/open-policy-agent/opa/rego" + "github.com/open-policy-agent/opa/types" + "golang.org/x/crypto/scrypt" +) + +// See https://pkg.go.dev/golang.org/x/crypto/scrypt +const ( + n = 32768 + r = 8 + p = 1 + keyLen = 32 +) + +func Scrypt(password, salt []byte) ([]byte, error) { + return scrypt.Key(password, salt, n, r, p, keyLen) +} + +// scryptFn exposes the `scrypt` password hashing primitive to Rego. 
+var scryptFn = rego.Function2( + &rego.Function{ + Name: "scrypt", + Description: "Run the scrypt key derivation function", + Decl: types.NewFunction(types.Args(types.S, types.S), types.S), + Memoize: true, + Nondeterministic: false, + }, + func(bctx rego.BuiltinContext, passwordTerm, saltTerm *ast.Term) (*ast.Term, error) { + var password, salt string + if err := ast.As(passwordTerm.Value, &password); err != nil { + return nil, err + } + if err := ast.As(saltTerm.Value, &salt); err != nil { + return nil, err + } + + saltBytes, err := base64.StdEncoding.DecodeString(salt) + if err != nil { + return nil, err + } + + passwordBytes := []byte(password) + hash, err := Scrypt(passwordBytes, saltBytes) + if err != nil { + return nil, err + } + + value, err := ast.InterfaceToValue(base64.StdEncoding.EncodeToString(hash)) + if err != nil { + return nil, err + } + + return ast.NewTerm(value), nil + }, +) diff --git a/pkg/node/factories.go b/pkg/node/factories.go index f692e5227b..9c424d1d15 100644 --- a/pkg/node/factories.go +++ b/pkg/node/factories.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/bacalhau-project/bacalhau/pkg/authn" + "github.com/bacalhau-project/bacalhau/pkg/authn/ask" "github.com/bacalhau-project/bacalhau/pkg/authn/challenge" "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/executor" @@ -136,26 +137,36 @@ func NewStandardAuthenticatorsFactory() AuthenticatorsFactory { return AuthenticatorsFactoryFunc( func(ctx context.Context, nodeConfig NodeConfig) (authn.Provider, error) { var allErr error - authns := make(map[string]authn.Authenticator, len(nodeConfig.AuthConfig.Methods)) + privKey, allErr := config.GetClientPrivateKey() + if allErr != nil { + return nil, allErr + } + authns := make(map[string]authn.Authenticator, len(nodeConfig.AuthConfig.Methods)) for name, authnConfig := range nodeConfig.AuthConfig.Methods { switch authnConfig.Type { case authn.MethodTypeChallenge: - privKey, err := config.GetClientPrivateKey()
+ methodPolicy, err := policy.FromPathOrDefault(authnConfig.PolicyPath, challenge.AnonymousModePolicy) if err != nil { allErr = multierr.Append(allErr, err) continue } - methodPolicy, err := policy.FromPathOrDefault(authnConfig.PolicyPath, challenge.AnonymousModePolicy) + authns[name] = challenge.NewAuthenticator( + methodPolicy, + challenge.NewStringMarshaller(nodeConfig.NodeID), + privKey, + nodeConfig.NodeID, + ) + case authn.MethodTypeAsk: + methodPolicy, err := policy.FromPath(authnConfig.PolicyPath) if err != nil { allErr = multierr.Append(allErr, err) continue } - authns[name] = challenge.NewAuthenticator( + authns[name] = ask.NewAuthenticator( methodPolicy, - challenge.NewStringMarshaller(nodeConfig.NodeID), privKey, nodeConfig.NodeID, ) From 88dd223902de3589f29c4abfc47088d815894190 Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 26 Jan 2024 10:37:14 -0800 Subject: [PATCH 2/5] clean: tidy up metrics suite --- ops/metrics/otel-collector-config,yaml | 0 ops/metrics/prometheus/prometheus.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 ops/metrics/otel-collector-config,yaml diff --git a/ops/metrics/otel-collector-config,yaml b/ops/metrics/otel-collector-config,yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ops/metrics/prometheus/prometheus.yml b/ops/metrics/prometheus/prometheus.yml index 7e33e0d582..1e7a9903d6 100644 --- a/ops/metrics/prometheus/prometheus.yml +++ b/ops/metrics/prometheus/prometheus.yml @@ -1,5 +1,5 @@ scrape_configs: - job_name: 'otel-collector' - scrape_interval: 5s + scrape_interval: 1s static_configs: - targets: ['opentelemetry-collector:9095'] From 3e34681a88a35e77b69c3d8ac63590475dbbc31f Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 26 Jan 2024 10:38:52 -0800 Subject: [PATCH 3/5] feat: add timer, gauge, and counter metric helpers --- pkg/telemetry/counter.go | 38 ++++++++++++++++++++++++++++++++++++++ pkg/telemetry/guage.go | 36 ++++++++++++++++++++++++++++++++++++ 
pkg/telemetry/timer.go | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 pkg/telemetry/counter.go create mode 100644 pkg/telemetry/guage.go create mode 100644 pkg/telemetry/timer.go diff --git a/pkg/telemetry/counter.go b/pkg/telemetry/counter.go new file mode 100644 index 0000000000..0705a6707b --- /dev/null +++ b/pkg/telemetry/counter.go @@ -0,0 +1,38 @@ +package telemetry + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Counter is a synchronous Instrument which supports non-negative increments +// Example uses for Counter: +// - count the number of bytes received +// - count the number of requests completed +// - count the number of accounts created +// - count the number of checkpoints run +// - count the number of HTTP 5xx errors +type Counter struct { + counter metric.Int64Counter +} + +func NewCounter(meter metric.Meter, name string, description string) (*Counter, error) { + counter, err := meter.Int64Counter(name, metric.WithDescription(description)) + if err != nil { + return nil, err + } + + return &Counter{ + counter: counter, + }, nil +} + +func (c *Counter) Inc(ctx context.Context, attrs ...attribute.KeyValue) { + c.counter.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +func (c *Counter) Add(ctx context.Context, num int64, attrs ...attribute.KeyValue) { + c.counter.Add(ctx, num, metric.WithAttributes(attrs...)) +} diff --git a/pkg/telemetry/guage.go b/pkg/telemetry/guage.go new file mode 100644 index 0000000000..15ad479e1e --- /dev/null +++ b/pkg/telemetry/guage.go @@ -0,0 +1,36 @@ +package telemetry + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Gauge is a synchronous Instrument which supports increments and decrements. +// Note: if the value is monotonically increasing, use Counter instead. 
+// Example uses for Gauge: +// - the number of active requests +// - the number of items in a queue +type Gauge struct { + gauge metric.Int64UpDownCounter +} + +func NewGauge(meter metric.Meter, name string, description string) (*Gauge, error) { + gauge, err := meter.Int64UpDownCounter(name, metric.WithDescription(description)) + if err != nil { + return nil, err + } + + return &Gauge{ + gauge: gauge, + }, nil +} + +func (g *Gauge) Inc(ctx context.Context, attrs ...attribute.KeyValue) { + g.gauge.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +func (g *Gauge) Dec(ctx context.Context, attrs ...attribute.KeyValue) { + g.gauge.Add(ctx, -1, metric.WithAttributes(attrs...)) +} diff --git a/pkg/telemetry/timer.go b/pkg/telemetry/timer.go new file mode 100644 index 0000000000..9a8da613a2 --- /dev/null +++ b/pkg/telemetry/timer.go @@ -0,0 +1,40 @@ +package telemetry + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Timer measures the duration of an event. +type Timer struct { + startTime time.Time + durationRecorder metric.Int64Histogram +} + +func NewTimer(durationRecorder metric.Int64Histogram) *Timer { + return &Timer{ + durationRecorder: durationRecorder, + } +} + +// Start begins the timer by recording the current time. +func (t *Timer) Start() { + t.startTime = time.Now() +} + +// Stop ends the timer and records the duration since Start was called. +// `attrs` are optional attributes that can be added to the duration metric for additional context. +func (t *Timer) Stop(ctx context.Context, attrs ...attribute.KeyValue) { + if t.startTime.IsZero() { + // Handle the case where Stop is called without Start being called. + return + } + + // Calculate the duration and record it using the OpenTelemetry histogram. 
+ duration := time.Since(t.startTime).Milliseconds() + t.durationRecorder.Record(ctx, duration, metric.WithAttributes(attrs...)) + t.startTime = time.Time{} // Reset the start time for future use. +} From 7a5f9c840ab11d74a60c1bf8b9d4350bb164be90 Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 26 Jan 2024 10:40:37 -0800 Subject: [PATCH 4/5] feat: add job and execution metrics: - Job Duration - Number of Jobs Submitted - Active Docker Executions - Active WASM Executions - Basic Node Info (metric isn't valuable, labels are) --- pkg/compute/executor.go | 4 ++++ pkg/compute/metrics.go | 25 ++++++++++++++++--------- pkg/executor/docker/handler.go | 3 +++ pkg/executor/docker/metrics.go | 20 ++++++++++++++++++++ pkg/executor/wasm/handler.go | 4 +++- pkg/executor/wasm/metrics.go | 20 ++++++++++++++++++++ pkg/models/execution.go | 3 ++- pkg/models/job.go | 11 +++++++++-- pkg/models/task.go | 12 +++++++++++- pkg/node/metrics/node.go | 20 ++++++++++++++++++++ pkg/node/node.go | 20 ++++++++++++++++---- pkg/requester/endpoint.go | 9 +++++---- pkg/requester/metrics.go | 19 +++++++++++++++++++ 13 files changed, 148 insertions(+), 22 deletions(-) create mode 100644 pkg/executor/docker/metrics.go create mode 100644 pkg/executor/wasm/metrics.go create mode 100644 pkg/node/metrics/node.go create mode 100644 pkg/requester/metrics.go diff --git a/pkg/compute/executor.go b/pkg/compute/executor.go index 37e2d16935..66d9ce33dd 100644 --- a/pkg/compute/executor.go +++ b/pkg/compute/executor.go @@ -11,6 +11,7 @@ import ( "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/telemetry" "github.com/bacalhau-project/bacalhau/pkg/compute/store" "github.com/bacalhau-project/bacalhau/pkg/executor" @@ -298,11 +299,14 @@ func (e *BaseExecutor) Run(ctx context.Context, state store.LocalExecutionState) Str("execution", execution.ID). 
Logger().WithContext(ctx) + stopwatch := telemetry.NewTimer(jobDurationMilliseconds) + stopwatch.Start() operation := "Running" defer func() { if err != nil { e.handleFailure(ctx, state, err, operation) } + stopwatch.Stop(ctx, state.Execution.Job.MetricAttributes()...) }() res := e.Start(ctx, execution) diff --git a/pkg/compute/metrics.go b/pkg/compute/metrics.go index 398993f92c..3e453f5cfc 100644 --- a/pkg/compute/metrics.go +++ b/pkg/compute/metrics.go @@ -1,30 +1,37 @@ package compute import ( + "github.com/samber/lo" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/metric" ) // Metrics for monitoring compute nodes: var ( - meter = otel.GetMeterProvider().Meter("compute") - jobsReceived, _ = meter.Int64Counter( + meter = otel.GetMeterProvider().Meter("compute") + jobsReceived = lo.Must(meter.Int64Counter( "jobs_received", metric.WithDescription("Number of jobs received by the compute node"), - ) + )) - jobsAccepted, _ = meter.Int64Counter( + jobsAccepted = lo.Must(meter.Int64Counter( "jobs_accepted", metric.WithDescription("Number of jobs bid on and accepted by the compute node"), - ) + )) - jobsCompleted, _ = meter.Int64Counter( + jobsCompleted = lo.Must(meter.Int64Counter( "jobs_completed", metric.WithDescription("Number of jobs completed by the compute node."), - ) + )) - jobsFailed, _ = meter.Int64Counter( + jobsFailed = lo.Must(meter.Int64Counter( "jobs_failed", metric.WithDescription("Number of jobs failed by the compute node."), - ) + )) + + jobDurationMilliseconds = lo.Must(meter.Int64Histogram( + "job_duration_milliseconds", + metric.WithDescription("Duration of a job on the compute node in milliseconds."), + metric.WithUnit("ms"), + )) ) diff --git a/pkg/executor/docker/handler.go b/pkg/executor/docker/handler.go index 68acac15d8..f382991897 100644 --- a/pkg/executor/docker/handler.go +++ b/pkg/executor/docker/handler.go @@ -13,6 +13,7 @@ import ( "github.com/pkg/errors" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + 
"go.opentelemetry.io/otel/attribute" "go.uber.org/atomic" "github.com/bacalhau-project/bacalhau/pkg/docker" @@ -52,6 +53,7 @@ type executionHandler struct { //nolint:funlen func (h *executionHandler) run(ctx context.Context) { + ActiveExecutions.Inc(ctx, attribute.String("executor_id", h.ID)) h.running.Store(true) defer func() { destroyTimeout := time.Second * 10 @@ -60,6 +62,7 @@ func (h *executionHandler) run(ctx context.Context) { } h.running.Store(false) close(h.waitCh) + ActiveExecutions.Dec(ctx, attribute.String("executor_id", h.ID)) }() // start the container h.logger.Info().Msg("starting container execution") diff --git a/pkg/executor/docker/metrics.go b/pkg/executor/docker/metrics.go new file mode 100644 index 0000000000..22b93d7906 --- /dev/null +++ b/pkg/executor/docker/metrics.go @@ -0,0 +1,20 @@ +package docker + +import ( + "github.com/samber/lo" + "go.opentelemetry.io/otel" + + "github.com/bacalhau-project/bacalhau/pkg/telemetry" +) + +var ( + dockerExecutorMeter = otel.GetMeterProvider().Meter("docker-executor") +) + +var ( + ActiveExecutions = lo.Must(telemetry.NewGauge( + dockerExecutorMeter, + "docker_active_executions", + "Number of active docker executions", + )) +) diff --git a/pkg/executor/wasm/handler.go b/pkg/executor/wasm/handler.go index 77fd67d671..eca30131af 100644 --- a/pkg/executor/wasm/handler.go +++ b/pkg/executor/wasm/handler.go @@ -19,7 +19,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/executor" wasmmodels "github.com/bacalhau-project/bacalhau/pkg/executor/wasm/models" wasmlogs "github.com/bacalhau-project/bacalhau/pkg/logger/wasm" - models "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/storage" "github.com/bacalhau-project/bacalhau/pkg/telemetry" "github.com/bacalhau-project/bacalhau/pkg/util/closer" @@ -62,6 +62,7 @@ type executionHandler struct { //nolint:funlen func (h *executionHandler) run(ctx context.Context) { + 
ActiveExecutions.Inc(ctx) defer func() { if r := recover(); r != nil { h.logger.Error(). @@ -70,6 +71,7 @@ func (h *executionHandler) run(ctx context.Context) { // TODO don't do this. h.result = &models.RunCommandResult{} } + ActiveExecutions.Dec(ctx) }() var wasmCtx context.Context diff --git a/pkg/executor/wasm/metrics.go b/pkg/executor/wasm/metrics.go new file mode 100644 index 0000000000..55de051bd6 --- /dev/null +++ b/pkg/executor/wasm/metrics.go @@ -0,0 +1,20 @@ +package wasm + +import ( + "github.com/samber/lo" + "go.opentelemetry.io/otel" + + "github.com/bacalhau-project/bacalhau/pkg/telemetry" +) + +var ( + wasmExecutorMeter = otel.GetMeterProvider().Meter("wasm-executor") +) + +var ( + ActiveExecutions = lo.Must(telemetry.NewGauge( + wasmExecutorMeter, + "wasm_active_executions", + "Number of active WASM executions", + )) +) diff --git a/pkg/models/execution.go b/pkg/models/execution.go index 124388ca36..7b04eadba7 100644 --- a/pkg/models/execution.go +++ b/pkg/models/execution.go @@ -6,8 +6,9 @@ import ( "errors" "time" - "github.com/bacalhau-project/bacalhau/pkg/lib/validate" "github.com/hashicorp/go-multierror" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" ) // ExecutionStateType The state of an execution. An execution represents a single attempt to execute a job on a node. diff --git a/pkg/models/job.go b/pkg/models/job.go index 592367f2a3..0f07a02061 100644 --- a/pkg/models/job.go +++ b/pkg/models/job.go @@ -7,9 +7,11 @@ import ( "strings" "time" - "github.com/bacalhau-project/bacalhau/pkg/lib/validate" "github.com/hashicorp/go-multierror" + "go.opentelemetry.io/otel/attribute" "golang.org/x/exp/maps" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" ) type JobStateType int @@ -67,7 +69,7 @@ func (s *JobStateType) UnmarshalText(text []byte) (err error) { type Job struct { // ID is a unique identifier assigned to this job. // It helps to distinguish jobs with the same name after they have been deleted and re-created. 
- //The ID is generated by the server and should not be set directly by the client. + // The ID is generated by the server and should not be set directly by the client. ID string `json:"ID"` // Name is the logical name of the job used to refer to it. @@ -114,6 +116,11 @@ type Job struct { ModifyTime int64 `json:"ModifyTime"` } +func (j *Job) MetricAttributes() []attribute.KeyValue { + // TODO(forrest): will need to re-think how we tag metrics from jobs with more than one task when ever that happens. + return append(j.Task().MetricAttributes(), attribute.String("job_type", j.Type)) +} + func (j *Job) String() string { return j.ID } diff --git a/pkg/models/task.go b/pkg/models/task.go index c5524e145a..24311d3573 100644 --- a/pkg/models/task.go +++ b/pkg/models/task.go @@ -4,9 +4,11 @@ import ( "errors" "fmt" - "github.com/bacalhau-project/bacalhau/pkg/lib/validate" "github.com/hashicorp/go-multierror" + "go.opentelemetry.io/otel/attribute" "golang.org/x/exp/maps" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" ) type Task struct { @@ -38,6 +40,14 @@ type Task struct { Timeouts *TimeoutConfig `json:"Timeouts,omitempty"` } +func (t *Task) MetricAttributes() []attribute.KeyValue { + return []attribute.KeyValue{ + attribute.String("task_engine", t.Engine.Type), + attribute.String("task_publisher", t.Publisher.Type), + attribute.String("task_network", t.Network.Type.String()), + } +} + func (t *Task) Normalize() { // Ensure that an empty and nil map are treated the same if t.Meta == nil { diff --git a/pkg/node/metrics/node.go b/pkg/node/metrics/node.go new file mode 100644 index 0000000000..526f5b19ac --- /dev/null +++ b/pkg/node/metrics/node.go @@ -0,0 +1,20 @@ +package metrics + +import ( + "github.com/samber/lo" + "go.opentelemetry.io/otel" + + "github.com/bacalhau-project/bacalhau/pkg/telemetry" +) + +var ( + nodeMeter = otel.GetMeterProvider().Meter("bacalhau-node") +) + +var ( + NodeInfo = lo.Must(telemetry.NewCounter( + nodeMeter, + 
"bacalhau_node_info", + "A static metric with labels describing the bacalhau node", + )) +) diff --git a/pkg/node/node.go b/pkg/node/node.go index a8b2c8c494..ba619ae2cc 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -5,6 +5,12 @@ import ( "fmt" "time" + "github.com/hashicorp/go-multierror" + "github.com/imdario/mergo" + "github.com/labstack/echo/v4" + "github.com/libp2p/go-libp2p/core/host" + "go.opentelemetry.io/otel/attribute" + "github.com/bacalhau-project/bacalhau/pkg/authz" pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" @@ -14,6 +20,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport" + "github.com/bacalhau-project/bacalhau/pkg/node/metrics" "github.com/bacalhau-project/bacalhau/pkg/publicapi" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/agent" @@ -24,10 +31,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/system" "github.com/bacalhau-project/bacalhau/pkg/transport" "github.com/bacalhau-project/bacalhau/pkg/version" - "github.com/hashicorp/go-multierror" - "github.com/imdario/mergo" - "github.com/labstack/echo/v4" - "github.com/libp2p/go-libp2p/core/host" ) type FeatureConfig struct { @@ -365,6 +368,15 @@ func NewNode( return errors.ErrorOrNil() }) + metrics.NodeInfo.Add(ctx, 1, + attribute.String("node_id", config.NodeID), + attribute.String("node_network_transport", config.NetworkConfig.Type), + attribute.Bool("node_is_compute", config.IsComputeNode), + attribute.Bool("node_is_requester", config.IsRequesterNode), + attribute.StringSlice("node_engines", executors.Keys(ctx)), + attribute.StringSlice("node_publishers", publishers.Keys(ctx)), + attribute.StringSlice("node_storages", storageProviders.Keys(ctx)), + ) node := &Node{ ID: config.NodeID, CleanupManager: 
config.CleanupManager, diff --git a/pkg/requester/endpoint.go b/pkg/requester/endpoint.go index 9342f6352a..d5047b6550 100644 --- a/pkg/requester/endpoint.go +++ b/pkg/requester/endpoint.go @@ -5,18 +5,17 @@ import ( "fmt" "time" - "github.com/bacalhau-project/bacalhau/pkg/models/migration/legacy" "github.com/google/uuid" "github.com/rs/zerolog/log" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/bacalhau-project/bacalhau/pkg/compute" "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/models/migration/legacy" + "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/requester/jobtransform" "github.com/bacalhau-project/bacalhau/pkg/storage" "github.com/bacalhau-project/bacalhau/pkg/system" @@ -128,6 +127,8 @@ func (e *BaseEndpoint) SubmitJob(ctx context.Context, data model.JobCreatePayloa } } + JobsSubmitted.Inc(ctx, job.MetricAttributes()...) 
+ err = e.store.CreateJob(ctx, *job) if err != nil { return nil, err diff --git a/pkg/requester/metrics.go b/pkg/requester/metrics.go new file mode 100644 index 0000000000..69f2324878 --- /dev/null +++ b/pkg/requester/metrics.go @@ -0,0 +1,19 @@ +package requester + +import ( + "github.com/samber/lo" + "go.opentelemetry.io/otel" + + "github.com/bacalhau-project/bacalhau/pkg/telemetry" +) + +var ( + requesterMeter = otel.GetMeterProvider().Meter("requester") +) + +var ( + JobsSubmitted = lo.Must(telemetry.NewCounter( + requesterMeter, + "job_submitted", + "Number of jobs submitted")) +) From 7414c044c01567dfea334bcc39b3fd94b47ebd37 Mon Sep 17 00:00:00 2001 From: frrist Date: Fri, 26 Jan 2024 10:41:31 -0800 Subject: [PATCH 5/5] feat: update metrics suite dashboard --- .../provisioning/dashboards/dashboard.json | 1224 ++++++++++++++++- 1 file changed, 1211 insertions(+), 13 deletions(-) diff --git a/ops/metrics/grafana/provisioning/dashboards/dashboard.json b/ops/metrics/grafana/provisioning/dashboards/dashboard.json index 52aa74cf85..e6b25d8cb9 100644 --- a/ops/metrics/grafana/provisioning/dashboards/dashboard.json +++ b/ops/metrics/grafana/provisioning/dashboards/dashboard.json @@ -31,6 +31,13 @@ "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -50,11 +57,95 @@ "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 10, + "w": 5, "x": 0, "y": 0 }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_bacalhau_node_info_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + 
"instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Node Metadata", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "node_engines", + "node_id", + "node_is_compute", + "node_is_requester", + "node_network_transport", + "node_publishers", + "node_storages" + ], + "mode": "rows" + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 5, + "y": 0 + }, "id": 2, "options": { "colorMode": "value", @@ -105,16 +196,83 @@ "mode": "thresholds" }, "mappings": [], + "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 7, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_jobs_accepted_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Jobs Accepted", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ { - "color": "red", - "value": 80 + "color": "green", + "value": null } ] }, @@ -123,12 +281,12 @@ "overrides": [] }, "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 8 + "h": 10, + "w": 2, + "x": 9, + "y": 0 }, - "id": 1, + "id": 5, "options": { "colorMode": "value", "graphMode": "area", @@ -166,22 +324,1062 @@ ], "title": "Jobs Completed", "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 11, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_docker_active_executions", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Docker Executions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + 
"overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 13, + "y": 0 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_wasm_active_executions", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active WASM Executions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "rate(bacalhau_job_duration_milliseconds_sum[5m])\n/\nrate(bacalhau_job_duration_milliseconds_count[5m])", + "instant": false, + "legendFormat": "{{task_engine}}", + "range": true, + "refId": "A" + } + ], + "title": "Average Job Duration over 5mins", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": 
"rate(bacalhau_http_server_duration_milliseconds_sum[5m])\n/\nrate(bacalhau_http_server_duration_milliseconds_count[5m])", + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "A" + } + ], + "title": "Average HTTP Request Duration over 5min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "bacalhau_http_server_request_content_length_bytes_total", + "hide": false, + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "B" + } + ], + "title": "HTTP Request Content Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid":
"P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "bacalhau_http_server_response_content_length_bytes_total", + "hide": false, + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "B" + } + ], + "title": "HTTP Response Content Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, 
+ "x": 0, + "y": 42 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_cancelable", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Cancelable", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 2, + "y": 42 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_inflight", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Inflight", + "type": "stat" + }, + { + "datasource": {
+ "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 4, + "y": 42 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_pending", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Pending", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 6, + "y": 42 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type":
"prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_waiting", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Waiting", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 8, + "y": 42 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_ack_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Ack Faults", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 10, + "y": 42 + }, + "id": 15, + "options": { + "colorMode": "value", +
"graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_dequeue_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Dequeue Faults", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 12, + "y": 42 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_process_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Process Faults", + "type": "stat" } ], - "refresh": "", + "refresh": "10s", "schemaVersion": 39, "tags": [], "templating": { "list": [] }, "time": { - "from": "now-5m", + 
"from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Bacalhau Metrics", "uid": "cbe6c668-d74b-4a27-be8b-431c19b2d4ca", - "version": 1, + "version": 2, "weekStart": "" } \ No newline at end of file