From b5a4918cee732119b150de11a22a00e4b006c137 Mon Sep 17 00:00:00 2001 From: Forrest <6546409+frrist@users.noreply.github.com> Date: Thu, 3 Oct 2024 09:31:56 -0700 Subject: [PATCH] fix: only collect OS type; fix method name; add readme (#4579) ### What: - Fix NodeID option name (cosmetic change) - Hash the NodeID before recording it. - Only collect OS type, instead of all details. - Add a README.md describing what is collected, and how to opt out of collection --------- Co-authored-by: frrist --- cmd/cli/serve/serve.go | 2 +- pkg/analytics/README.md | 118 +++++++++++++++++++++++++++++++++++++ pkg/analytics/analytics.go | 8 +-- 3 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 pkg/analytics/README.md diff --git a/cmd/cli/serve/serve.go b/cmd/cli/serve/serve.go index 04b69e92d6..b61884d342 100644 --- a/cmd/cli/serve/serve.go +++ b/cmd/cli/serve/serve.go @@ -260,7 +260,7 @@ func serve(cmd *cobra.Command, cfg types.Bacalhau, fsRepo *repo.FsRepo) error { if !cfg.DisableAnalytics { err = analytics.SetupAnalyticsProvider(ctx, - analytics.WithNodeNodeID(sysmeta.NodeName), + analytics.WithNodeID(sysmeta.NodeName), analytics.WithInstallationID(system.InstallationID()), analytics.WithInstanceID(sysmeta.InstanceID), analytics.WithNodeType(isRequesterNode, isComputeNode), diff --git a/pkg/analytics/README.md b/pkg/analytics/README.md new file mode 100644 index 0000000000..7c61f5f060 --- /dev/null +++ b/pkg/analytics/README.md @@ -0,0 +1,118 @@ +# What Data is shared by users of Bacalhau? + +When a job is submitted or completed, data is collected about it to help track, manage, and optimize its execution. + +## What information is collected on the bacalhau agent: + +- **Node Type**: One of: ‘hybrid’, ‘orchestrator’, ‘compute’. +- **Node Version**: The version of bacalhau the node is running. +- **Node ID**: A hashed version of the identifier of the bacalhau node. +- **Installation ID**: The identifier associated with the installation of bacalhau. 
+- **Instance ID**: An anonymous identifier of the bacalhau node. +- **Operating System Type**: The name of the operating system the bacalhau node is running on. + +## **What information is collected on job submissions and completions:** + +1. **Job Identification** + - **ID**: A unique identifier for the job. + - **Namespace Hash**: A hashed version of the job’s namespace, used for grouping related jobs. + - **Name Set**: Whether a specific name was set for the job. + - **Type**: The type of job you’re running. + - **Count**: The number of tasks associated with the job. + - **Labels & Metadata Counts**: The number of labels and metadata entries attached to the job. +2. **State and Timing Information (Terminal Jobs Only)** + - **State**: The current state of the job (e.g., completed, failed). + - **Creation & Modification Times**: When the job was created and last modified. +3. **Versioning and Revisions** + - **Version & Revision**: These fields help track changes to the job’s configuration over time. +4. **Task-Specific Information** + - **Task Name Hash**: A hashed version of the task name for internal tracking. + - **Task Engine & Publisher Types**: The type of engine and publisher used for the task. + - **Environment Variables & Metadata**: The number of environment variables and metadata entries tied to the task. + - **Input Source Types**: The types of input sources for the task (e.g., file, database). + - **Result Paths Count**: The number of result paths generated by the task. +5. **Resource Allocation** + - **CPU, Memory, Disk, GPU Usage**: The amount of CPU, memory, disk, and GPU resources requested by the task. + - **Network Details**: The network type and number of network domains used by the task. +6. **Timeouts** + - **Execution Timeout**: The maximum allowed time for the task to run. + - **Queue Timeout**: The maximum time the task can wait in the queue. 
+ - **Total Timeout**: The total allowed time for the job, including both queue and execution time. +7. **Warnings and Errors (Submitted Jobs Only)** + - Any warnings or errors that occurred during the job submission or execution process. + +## **What Information is Collected on Job Execution** + +When a job is executed, detailed information about the execution process is collected to help monitor and optimize performance, as well as assist with troubleshooting. Here’s a breakdown of what is collected: + +1. **Execution Identification** + - **Execution ID**: A unique identifier for the execution. + - **Job ID**: The identifier for the associated job. + - **Evaluation ID**: An identifier linking the execution to its evaluation process. + - **Node Name Hash**: A hashed version of the name of the node where the execution is running. + - **Namespace Hash**: A hashed version of the namespace under which the execution is running. +2. **Execution Metadata** + - **Execution Name Set**: Whether a specific name was set for the execution. + - **Previous & Next Executions**: Links to any preceding or subsequent executions, if applicable. + - **Follow-up Evaluation ID**: An identifier for any follow-up evaluations related to the execution. + - **Revision**: A version number that tracks changes to the execution configuration over time. + - **Creation & Modification Times**: Timestamps indicating when the execution was created and last modified. +3. **Resource Allocation** + - **Total CPU Units**: The total CPU resources allocated for the execution. + - **Total Memory, Disk, and GPU Usage**: The memory, disk space, and GPU resources used by the execution. +4. **Execution States** + - **Desired State**: The intended state of the execution (e.g., running, completed). + - **Compute State & Message**: The actual state of the execution, including any details about its progress or errors. 
+ - **Compute Error Code**: An error code related to any issues with the execution's state on the compute node. +5. **Published Results** + - **Published Result Type**: The type of result produced by the execution, such as output files or data. +6. **Run Command Results** + - **Run Output Details**: Information about the command’s execution, including: + - **Exit Code**: The exit code returned by the executed task (typically 0 for success). + - **RunResultStdoutTruncated**: Whether stdout was truncated during execution. + - **RunResultStderrTruncated**: Whether stderr was truncated during execution. + +# How do users opt out of sharing data? + +To opt out of sharing data, users may run one of the following commands before starting their bacalhau node: +**Disable collection via `config set`** + +```bash +bacalhau config set DisableAnalytics true +``` + +**Disable collection via environment variable** + +```bash +export BACALHAU_DISABLEANALYTICS=true +``` + +**Disable collection via editing the config file** + +```bash +echo 'disableanalytics: true' >> ~/.bacalhau/config.yaml +``` + +**Disable collection via a config flag** + +```bash +bacalhau --config=DisableAnalytics=true +``` + +## **How can users verify they have opted out?** + +```bash +bacalhau config list | grep disableanalytics +``` + +Expected output when collection is disabled: + +```bash +disableanalytics true No description available +``` + +Expected output when collection is enabled: + +```bash +disableanalytics false No description available +``` diff --git a/pkg/analytics/analytics.go b/pkg/analytics/analytics.go index 8143bfe654..0b6c2ee0cd 100644 --- a/pkg/analytics/analytics.go +++ b/pkg/analytics/analytics.go @@ -23,7 +23,7 @@ const DefaultOtelCollectorEndpoint = "t.bacalhau.org:4317" const ( NodeInstallationIDKey = "installation_id" NodeInstanceIDKey = "instance_id" - NodeIDKey = "node_id" + NodeIDHashKey = "node_id_hash" NodeTypeKey = "node_type" NodeVersionKey = "node_version" ) @@ -41,9 +41,9 
@@ func WithEndpoint(endpoint string) Option { } } -func WithNodeNodeID(id string) Option { +func WithNodeID(id string) Option { return func(c *Config) { - c.attributes = append(c.attributes, attribute.String(NodeIDKey, id)) + c.attributes = append(c.attributes, attribute.String(NodeIDHashKey, hashString(id))) } } @@ -108,7 +108,7 @@ func SetupAnalyticsProvider(ctx context.Context, opts ...Option) error { // Create a new resource with auto-detected host information res, err := resource.New(ctx, - resource.WithOS(), + resource.WithOSType(), resource.WithSchemaURL(semconv.SchemaURL), resource.WithAttributes(config.attributes...), )