Skip to content

Commit dca8719

Browse files
authored
Support Running Orchestrator behind a Reverse Proxy (#4724)
If we need to put an Orchestrator behind a TLS-terminating reverse proxy, the NATS server must be configured in a very specific way, and the NATS clients (compute nodes) must also be configured accordingly. The NATS server should advertise that TLS is available, although it is not. Also, the compute node should enforce TLS communication for NATS, because the reverse proxy supports TLS. See link: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls#tls-terminating-reverse-proxies Sample Orchestrator Node config: ```yaml NameProvider: "uuid" API: Port: 1234 Orchestrator: Enabled: true Auth: Token: "i_am_very_secret_token" SupportReverseProxy: true ``` Sample Compute Node Config: ```yaml NameProvider: "uuid" API: Port: 1234 Compute: Enabled: true Orchestrators: - nats://bacalhau-traefik-node:4222 Auth: Token: "i_am_very_secret_token" TLS: RequireTLS: true ``` Please see the integration tests in this commit; they include a very detailed test suite covering all cases. Linear: https://linear.app/expanso/issue/ENG-379/bacalhau-to-support-tls-behind-reverse-proxy <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit ## Release Notes - **New Features** - Introduced support for TLS communication and reverse proxy configurations in compute and orchestrator nodes. - Added new properties in the API schema to enhance configuration options. - **Bug Fixes** - Improved error handling for NATS connections based on TLS requirements. - **Documentation** - Updated Swagger API documentation to include new properties and configurations. - **Tests** - Added a new test suite to validate orchestrator functionality behind a reverse proxy. - **Chores** - Introduced new Docker Compose configurations for enhanced service orchestration. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 5e3ab41 commit dca8719

21 files changed

+488
-24
lines changed

.cspell/custom-dictionary.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -438,4 +438,5 @@ buildvcs
438438
Nilf
439439
IMDS
440440
tlsca
441-
Lenf
441+
Lenf
442+
traefik

pkg/config/types/compute.go

+3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ type ComputeAuth struct {
2323
type ComputeTLS struct {
2424
// CACert specifies the CA file path that the compute node trusts when connecting to orchestrator.
2525
CACert string `yaml:"CACert,omitempty" json:"CACert,omitempty"`
26+
27+
// RequireTLS specifies if the compute node enforces encrypted communication with orchestrator.
28+
RequireTLS bool `yaml:"RequireTLS,omitempty" json:"RequireTLS,omitempty"`
2629
}
2730

2831
type Heartbeat struct {

pkg/config/types/generated_constants.go

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const ComputeHeartbeatIntervalKey = "Compute.Heartbeat.Interval"
2626
const ComputeHeartbeatResourceUpdateIntervalKey = "Compute.Heartbeat.ResourceUpdateInterval"
2727
const ComputeOrchestratorsKey = "Compute.Orchestrators"
2828
const ComputeTLSCACertKey = "Compute.TLS.CACert"
29+
const ComputeTLSRequireTLSKey = "Compute.TLS.RequireTLS"
2930
const DataDirKey = "DataDir"
3031
const DisableAnalyticsKey = "DisableAnalytics"
3132
const EnginesDisabledKey = "Engines.Disabled"
@@ -92,6 +93,7 @@ const OrchestratorSchedulerHousekeepingIntervalKey = "Orchestrator.Scheduler.Hou
9293
const OrchestratorSchedulerHousekeepingTimeoutKey = "Orchestrator.Scheduler.HousekeepingTimeout"
9394
const OrchestratorSchedulerQueueBackoffKey = "Orchestrator.Scheduler.QueueBackoff"
9495
const OrchestratorSchedulerWorkerCountKey = "Orchestrator.Scheduler.WorkerCount"
96+
const OrchestratorSupportReverseProxyKey = "Orchestrator.SupportReverseProxy"
9597
const OrchestratorTLSCACertKey = "Orchestrator.TLS.CACert"
9698
const OrchestratorTLSServerCertKey = "Orchestrator.TLS.ServerCert"
9799
const OrchestratorTLSServerKeyKey = "Orchestrator.TLS.ServerKey"

pkg/config/types/generated_descriptions.go

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ var ConfigDescriptions = map[string]string{
2828
ComputeHeartbeatResourceUpdateIntervalKey: "ResourceUpdateInterval specifies the time between updates of resource information to the orchestrator.",
2929
ComputeOrchestratorsKey: "Orchestrators specifies a list of orchestrator endpoints that this compute node connects to.",
3030
ComputeTLSCACertKey: "CACert specifies the CA file path that the compute node trusts when connecting to orchestrator.",
31+
ComputeTLSRequireTLSKey: "RequireTLS specifies if the compute node enforces encrypted communication with orchestrator.",
3132
DataDirKey: "DataDir specifies a location on disk where the bacalhau node will maintain state.",
3233
DisableAnalyticsKey: "DisableAnalytics, when true, disables sharing anonymous analytics data with the Bacalhau development team",
3334
EnginesDisabledKey: "Disabled specifies a list of engines that are disabled.",
@@ -94,6 +95,7 @@ var ConfigDescriptions = map[string]string{
9495
OrchestratorSchedulerHousekeepingTimeoutKey: "HousekeepingTimeout specifies the maximum time allowed for a single housekeeping run.",
9596
OrchestratorSchedulerQueueBackoffKey: "QueueBackoff specifies the time to wait before retrying a failed job.",
9697
OrchestratorSchedulerWorkerCountKey: "WorkerCount specifies the number of concurrent workers for job scheduling.",
98+
OrchestratorSupportReverseProxyKey: "SupportReverseProxy configures the orchestrator node to run behind a reverse proxy",
9799
OrchestratorTLSCACertKey: "CACert specifies the CA file path that the orchestrator node trusts when connecting to NATS server.",
98100
OrchestratorTLSServerCertKey: "ServerCert specifies the certificate file path given to NATS server to serve TLS connections.",
99101
OrchestratorTLSServerKeyKey: "ServerKey specifies the private key file path given to NATS server to serve TLS connections.",

pkg/config/types/orchestrator.go

+2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ type Orchestrator struct {
1717
NodeManager NodeManager `yaml:"NodeManager,omitempty" json:"NodeManager,omitempty"`
1818
Scheduler Scheduler `yaml:"Scheduler,omitempty" json:"Scheduler,omitempty"`
1919
EvaluationBroker EvaluationBroker `yaml:"EvaluationBroker,omitempty" json:"EvaluationBroker,omitempty"`
20+
// SupportReverseProxy configures the orchestrator node to run behind a reverse proxy
21+
SupportReverseProxy bool `yaml:"SupportReverseProxy,omitempty" json:"SupportReverseProxy,omitempty"`
2022
}
2123

2224
type OrchestratorAuth struct {

pkg/nats/transport/nats.go

+50-6
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,6 @@ import (
66
"fmt"
77
"strings"
88

9-
"github.com/nats-io/nats-server/v2/server"
10-
"github.com/nats-io/nats.go"
11-
"github.com/rs/zerolog/log"
12-
139
"github.com/bacalhau-project/bacalhau/pkg/bacerrors"
1410
"github.com/bacalhau-project/bacalhau/pkg/compute"
1511
"github.com/bacalhau-project/bacalhau/pkg/compute/logstream"
@@ -20,6 +16,9 @@ import (
2016
nats_pubsub "github.com/bacalhau-project/bacalhau/pkg/nats/pubsub"
2117
"github.com/bacalhau-project/bacalhau/pkg/pubsub"
2218
"github.com/bacalhau-project/bacalhau/pkg/routing"
19+
"github.com/nats-io/nats-server/v2/server"
20+
"github.com/nats-io/nats.go"
21+
"github.com/rs/zerolog/log"
2322
)
2423

2524
const NodeInfoSubjectPrefix = "node.info."
@@ -60,8 +59,15 @@ type NATSTransportConfig struct {
6059
// Used by the Nats Client when node acts as orchestrator
6160
ServerTLSCACert string
6261

63-
// Use by the Nats Client when node acts as compute
62+
// Used by the Nats Client when node acts as compute
6463
ClientTLSCACert string
64+
65+
// Used to configure Orchestrator (actually the NATS server) to run behind
66+
// a reverse proxy
67+
ServerSupportReverseProxy bool
68+
69+
// Used to configure compute node nats client to require TLS connection
70+
ComputeClientRequireTLS bool
6571
}
6672

6773
func (c *NATSTransportConfig) Validate() error {
@@ -168,6 +174,21 @@ func NewNATSTransport(ctx context.Context,
168174
serverOpts.TLSConfig = serverTLSConfig
169175
}
170176

177+
if config.ServerSupportReverseProxy {
178+
// If the ServerSupportReverseProxy is enabled, we need to set
179+
// serverOpts.TLSConfig to an empty config, if it is null.
180+
// The reason is that, unfortunately, this is the only way the NATS server will
181+
// work behind a reverse proxy, that's how NATS documentation recommends doing it.
182+
// See: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls#tls-terminating-reverse-proxies
183+
serverOpts.AllowNonTLS = true
184+
185+
// We need to make sure not to override TLS configuration if it was set. Maybe the operator wants TLS
186+
// between reverse proxy and NATS server, up to them.
187+
if serverOpts.TLSConfig == nil {
188+
serverOpts.TLSConfig, _ = server.GenTLSConfig(&server.TLSConfigOpts{})
189+
}
190+
}
191+
171192
// Only set cluster options if cluster peers are provided. Jetstream doesn't
172193
// like the setting to be present with no values, or with values that are
173194
// a local address (e.g. it can't RAFT to itself).
@@ -194,7 +215,25 @@ func NewNATSTransport(ctx context.Context,
194215
return nil, err
195216
}
196217

197-
config.Orchestrators = append(config.Orchestrators, sm.Server.ClientURL())
218+
if config.ServerSupportReverseProxy {
219+
// Server.ClientURL() (in core NATS code), will check if TLSConfig of the server
220+
// is not null, and changes the URL Scheme from "nats" to "tls". When running
221+
// the server with ServerSupportReverseProxy setting, almost all the time
222+
// the NATS server will not be supporting TLS. This will make the orchestrator NATS client
223+
// fail, since it was given the "tls://" NATS server URL to connect to, but the
224+
// server does not support TLS. It is unfortunate that the ClientURL method does that.
225+
// So here, we are checking, if NATS server was not started with a cert and key, and at the
226+
// same time it was started with ServerSupportReverseProxy set to true, then we will change
227+
// the URL scheme back to "nats://" from "tls://".
228+
229+
clientURL := sm.Server.ClientURL()
230+
if strings.HasPrefix(clientURL, "tls://") && config.ServerTLSCert == "" {
231+
clientURL = strings.Replace(clientURL, "tls://", "nats://", 1)
232+
}
233+
config.Orchestrators = append(config.Orchestrators, clientURL)
234+
} else {
235+
config.Orchestrators = append(config.Orchestrators, sm.Server.ClientURL())
236+
}
198237
}
199238

200239
nc, err := CreateClient(ctx, config)
@@ -273,6 +312,11 @@ func CreateClient(ctx context.Context, config *NATSTransportConfig) (*nats_helpe
273312
nats.MaxReconnects(-1),
274313
}
275314

315+
// When Compute Node requires TLS, enforce it
316+
if config.ComputeClientRequireTLS {
317+
clientOptions = append(clientOptions, nats.TLSHandshakeFirst())
318+
}
319+
276320
// We need to do this logic since the Nats Transport Layer does not differentiate
277321
// between orchestrator mode and compute mode
278322
if config.ServerTLSCert == "" && config.ClientTLSCACert != "" {

pkg/node/node.go

+19-17
Original file line numberDiff line numberDiff line change
@@ -349,23 +349,25 @@ func createTransport(ctx context.Context, cfg NodeConfig) (*nats_transport.NATST
349349

350350
// TODO: revisit how we setup the transport layer for compute only, orchestrator only and hybrid nodes
351351
config := &nats_transport.NATSTransportConfig{
352-
NodeID: cfg.NodeID,
353-
Host: cfg.BacalhauConfig.Orchestrator.Host,
354-
Port: cfg.BacalhauConfig.Orchestrator.Port,
355-
AdvertisedAddress: cfg.BacalhauConfig.Orchestrator.Advertise,
356-
AuthSecret: cfg.BacalhauConfig.Orchestrator.Auth.Token,
357-
Orchestrators: cfg.BacalhauConfig.Compute.Orchestrators,
358-
StoreDir: storeDir,
359-
ClusterName: cfg.BacalhauConfig.Orchestrator.Cluster.Name,
360-
ClusterPort: cfg.BacalhauConfig.Orchestrator.Cluster.Port,
361-
ClusterPeers: cfg.BacalhauConfig.Orchestrator.Cluster.Peers,
362-
ClusterAdvertisedAddress: cfg.BacalhauConfig.Orchestrator.Cluster.Advertise,
363-
IsRequesterNode: cfg.BacalhauConfig.Orchestrator.Enabled,
364-
ServerTLSCACert: cfg.BacalhauConfig.Orchestrator.TLS.CACert,
365-
ServerTLSCert: cfg.BacalhauConfig.Orchestrator.TLS.ServerCert,
366-
ServerTLSKey: cfg.BacalhauConfig.Orchestrator.TLS.ServerKey,
367-
ServerTLSTimeout: cfg.BacalhauConfig.Orchestrator.TLS.ServerTimeout,
368-
ClientTLSCACert: cfg.BacalhauConfig.Compute.TLS.CACert,
352+
NodeID: cfg.NodeID,
353+
Host: cfg.BacalhauConfig.Orchestrator.Host,
354+
Port: cfg.BacalhauConfig.Orchestrator.Port,
355+
AdvertisedAddress: cfg.BacalhauConfig.Orchestrator.Advertise,
356+
AuthSecret: cfg.BacalhauConfig.Orchestrator.Auth.Token,
357+
Orchestrators: cfg.BacalhauConfig.Compute.Orchestrators,
358+
StoreDir: storeDir,
359+
ClusterName: cfg.BacalhauConfig.Orchestrator.Cluster.Name,
360+
ClusterPort: cfg.BacalhauConfig.Orchestrator.Cluster.Port,
361+
ClusterPeers: cfg.BacalhauConfig.Orchestrator.Cluster.Peers,
362+
ClusterAdvertisedAddress: cfg.BacalhauConfig.Orchestrator.Cluster.Advertise,
363+
IsRequesterNode: cfg.BacalhauConfig.Orchestrator.Enabled,
364+
ServerTLSCACert: cfg.BacalhauConfig.Orchestrator.TLS.CACert,
365+
ServerTLSCert: cfg.BacalhauConfig.Orchestrator.TLS.ServerCert,
366+
ServerTLSKey: cfg.BacalhauConfig.Orchestrator.TLS.ServerKey,
367+
ServerTLSTimeout: cfg.BacalhauConfig.Orchestrator.TLS.ServerTimeout,
368+
ServerSupportReverseProxy: cfg.BacalhauConfig.Orchestrator.SupportReverseProxy,
369+
ClientTLSCACert: cfg.BacalhauConfig.Compute.TLS.CACert,
370+
ComputeClientRequireTLS: cfg.BacalhauConfig.Compute.TLS.RequireTLS,
369371
}
370372

371373
if cfg.BacalhauConfig.Compute.Enabled && !cfg.BacalhauConfig.Orchestrator.Enabled {

pkg/swagger/docs.go

+8
Original file line numberDiff line numberDiff line change
@@ -2400,6 +2400,10 @@ const docTemplate = `{
24002400
"CACert": {
24012401
"description": "CACert specifies the CA file path that the compute node trusts when connecting to orchestrator.",
24022402
"type": "string"
2403+
},
2404+
"RequireTLS": {
2405+
"description": "RequireTLS specifies if the compute node enforces encrypted communication with orchestrator.",
2406+
"type": "boolean"
24032407
}
24042408
}
24052409
},
@@ -2714,6 +2718,10 @@ const docTemplate = `{
27142718
"Scheduler": {
27152719
"$ref": "#/definitions/types.Scheduler"
27162720
},
2721+
"SupportReverseProxy": {
2722+
"description": "SupportReverseProxy configures the orchestrator node to run behind a reverse proxy",
2723+
"type": "boolean"
2724+
},
27172725
"TLS": {
27182726
"description": "TLS specifies the TLS related configuration on the orchestrator for when compute nodes need to connect.",
27192727
"allOf": [

pkg/swagger/swagger.json

+8
Original file line numberDiff line numberDiff line change
@@ -2396,6 +2396,10 @@
23962396
"CACert": {
23972397
"description": "CACert specifies the CA file path that the compute node trusts when connecting to orchestrator.",
23982398
"type": "string"
2399+
},
2400+
"RequireTLS": {
2401+
"description": "RequireTLS specifies if the compute node enforces encrypted communication with orchestrator.",
2402+
"type": "boolean"
23992403
}
24002404
}
24012405
},
@@ -2710,6 +2714,10 @@
27102714
"Scheduler": {
27112715
"$ref": "#/definitions/types.Scheduler"
27122716
},
2717+
"SupportReverseProxy": {
2718+
"description": "SupportReverseProxy configures the orchestrator node to run behind a reverse proxy",
2719+
"type": "boolean"
2720+
},
27132721
"TLS": {
27142722
"description": "TLS specifies the TLS related configuration on the orchestrator for when compute nodes need to connect.",
27152723
"allOf": [
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package test_integration
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
"testing"
8+
"time"
9+
10+
"bacalhau/integration_tests/utils"
11+
12+
"github.com/google/uuid"
13+
"github.com/stretchr/testify/suite"
14+
)
15+
16+
type OrchestratorBehindReverseProxySuite struct {
17+
BaseDockerComposeTestSuite
18+
}
19+
20+
func NewOrchestratorBehindReverseProxySuite() *OrchestratorBehindReverseProxySuite {
21+
s := &OrchestratorBehindReverseProxySuite{}
22+
s.GlobalRunIdentifier = globalTestExecutionId
23+
s.SuiteRunIdentifier = strings.ToLower(strings.Split(uuid.New().String(), "-")[0])
24+
return s
25+
}
26+
27+
func (s *OrchestratorBehindReverseProxySuite) SetupSuite() {
28+
// In this test suite, the orchestrator is running behind a reverse proxy, and all
29+
// the NATS traffic between orchestrator and compute Node go through a real reverse proxy (Traefik)
30+
31+
rawDockerComposeFilePath := "./common_assets/docker_compose_files/orchestrator-compute-traefik-custom-startup.yml"
32+
s.Context, s.Cancel = context.WithCancel(context.Background())
33+
34+
traefikConfigFile := s.commonAssets("nodes_configs/9_traefik_static_config.yaml")
35+
traefikStartCommand := fmt.Sprintf("--configFile=%s", traefikConfigFile)
36+
37+
orchestratorConfigFile := s.commonAssets("nodes_configs/9_orchestrator_node_behind_reverse_proxy.yaml")
38+
orchestratorStartCommand := fmt.Sprintf("bacalhau serve --config=%s", orchestratorConfigFile)
39+
40+
computeConfigFile := s.commonAssets("nodes_configs/9_compute_node_with_enforced_tls_nats.yaml")
41+
computeStartCommand := fmt.Sprintf("bacalhau serve --config=%s", computeConfigFile)
42+
extraRenderingData := map[string]interface{}{
43+
"OrchestratorStartCommand": orchestratorStartCommand,
44+
"ComputeStartCommand": computeStartCommand,
45+
"TraefikStartCommand": traefikStartCommand,
46+
}
47+
s.BaseDockerComposeTestSuite.SetupSuite(rawDockerComposeFilePath, extraRenderingData)
48+
}
49+
50+
func (s *OrchestratorBehindReverseProxySuite) TearDownSuite() {
51+
s.T().Log("Tearing down [Test Suite] in OrchestratorBehindReverseProxySuite...")
52+
s.BaseDockerComposeTestSuite.TearDownSuite()
53+
}
54+
55+
func (s *OrchestratorBehindReverseProxySuite) TestRunHelloWorldJobWithOrchestratorBehindReverseProxy() {
56+
result, err := s.executeCommandInDefaultJumpbox(
57+
[]string{
58+
"bacalhau",
59+
"job",
60+
"run",
61+
"--wait=false",
62+
"--id-only",
63+
"/bacalhau_integration_tests/common_assets/job_specs/hello_world.yml",
64+
})
65+
s.Require().NoError(err)
66+
67+
jobID, err := utils.ExtractJobIDFromShortOutput(result)
68+
s.Require().NoError(err)
69+
70+
_, err = s.waitForJobToComplete(jobID, 30*time.Second)
71+
s.Require().NoError(err)
72+
73+
resultDescription, err := s.executeCommandInDefaultJumpbox([]string{"bacalhau", "job", "describe", jobID})
74+
s.Require().NoError(err)
75+
s.Require().Contains(resultDescription, "hello bacalhau world", resultDescription)
76+
}
77+
78+
func (s *OrchestratorBehindReverseProxySuite) TestNatsConnectionWillFailWithoutRequireTLS() {
79+
_, err := s.executeCommandInDefaultJumpbox(
80+
[]string{
81+
"nats",
82+
"--server=nats://i_am_very_secret_token@bacalhau-traefik-node:4222",
83+
"--no-tlsfirst",
84+
"pub",
85+
"node.info",
86+
"helloworld",
87+
})
88+
s.Require().Error(err)
89+
s.Require().ErrorContains(err, "error: read tcp")
90+
s.Require().ErrorContains(err, "timeout")
91+
}
92+
93+
func (s *OrchestratorBehindReverseProxySuite) TestNatsTLSConnectionWillFailWithoutGoingThroughReverseProxy() {
94+
_, err := s.executeCommandInDefaultJumpbox(
95+
[]string{
96+
"nats",
97+
"--server=nats://i_am_very_secret_token@bacalhau-orchestrator-node:4222",
98+
"--tlsca=/bacalhau_integration_tests/common_assets/certificates/nats_custom/nats_root_ca.crt",
99+
"--tlsfirst",
100+
"pub",
101+
"node.info",
102+
"helloworld",
103+
})
104+
s.Require().Error(err)
105+
s.Require().ErrorContains(err, "error: tls: first record does not look like a TLS handshake")
106+
}
107+
108+
func (s *OrchestratorBehindReverseProxySuite) TestNatsConnectionWillSucceedWithRequireTLS() {
109+
result, err := s.executeCommandInDefaultJumpbox(
110+
[]string{
111+
"nats",
112+
"--server=nats://i_am_very_secret_token@bacalhau-traefik-node:4222",
113+
"--tlsfirst",
114+
"pub",
115+
"node.info",
116+
"helloworld",
117+
})
118+
s.Require().NoError(err)
119+
s.Require().Contains(result, `Published 10 bytes to "node.info"`)
120+
}
121+
122+
func TestOrchestratorBehindReverseProxySuite(t *testing.T) {
123+
suite.Run(t, NewOrchestratorBehindReverseProxySuite())
124+
}

test_integration/base_suite.go

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ func (s *BaseDockerComposeTestSuite) SetupSuite(dockerComposeFilePath string, re
3939
"OrchestratorImageName": fmt.Sprintf("bacalhau-test-orchestrator-%s:%s", s.GlobalRunIdentifier, s.GlobalRunIdentifier),
4040
"ComputeImageName": fmt.Sprintf("bacalhau-test-compute-%s:%s", s.GlobalRunIdentifier, s.GlobalRunIdentifier),
4141
"JumpboxImageName": fmt.Sprintf("bacalhau-test-jumpbox-%s:%s", s.GlobalRunIdentifier, s.GlobalRunIdentifier),
42+
"TraefikImageName": fmt.Sprintf("bacalhau-test-traefik-%s:%s", s.GlobalRunIdentifier, s.GlobalRunIdentifier),
4243
}
4344

4445
// Merge Rendering Data

0 commit comments

Comments
 (0)