Skip to content

Commit 703f5ed

Browse files
committed
Run non-cancellation safe cluster controller leader tasks in select expression
To avoid losing progress, this commit runs all cluster controller leader tasks that are not cancellation safe as part of the top-level select expression instead of a select arm. Select arms can be cancelled and therefore we would lose progress in this case. This applies to trimming the logs and updating the scheduler when there is a logs or partition table update.
1 parent ecc429b commit 703f5ed

File tree

3 files changed

+84
-39
lines changed

3 files changed

+84
-39
lines changed

crates/admin/src/cluster_controller/logs_controller.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010

1111
mod nodeset_selection;
1212

13+
use futures::never::Never;
14+
use rand::prelude::IteratorRandom;
15+
use rand::{thread_rng, RngCore};
1316
use std::collections::HashMap;
1417
use std::iter;
1518
use std::num::NonZeroU8;
1619
use std::ops::Deref;
1720
use std::sync::Arc;
1821
use std::time::Duration;
19-
20-
use rand::prelude::IteratorRandom;
21-
use rand::{thread_rng, RngCore};
2222
use tokio::sync::Semaphore;
2323
use tokio::task::JoinSet;
2424
use tracing::{debug, trace, trace_span, Instrument};
@@ -1210,7 +1210,7 @@ impl LogsController {
12101210
});
12111211
}
12121212

1213-
pub async fn run_async_operations(&mut self) -> Result<()> {
1213+
pub async fn run_async_operations(&mut self) -> Result<Never> {
12141214
loop {
12151215
if self.async_operations.is_empty() {
12161216
futures::future::pending().await

crates/admin/src/cluster_controller/service.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ impl<T: TransportConnect> Service<T> {
272272
state.reconfigure(configuration);
273273
}
274274
result = state.run() => {
275-
result?
275+
let leader_event = result?;
276+
state.on_leader_event(leader_event).await?;
276277
}
277278
_ = &mut shutdown => {
278279
self.health_status.update(AdminStatus::Unknown);

crates/admin/src/cluster_controller/service/state.rs

+78-34
Original file line numberDiff line numberDiff line change
@@ -87,18 +87,19 @@ where
8787

8888
Ok(())
8989
}
90-
}
9190

92-
impl<T> ClusterControllerState<T>
93-
where
94-
T: TransportConnect,
95-
{
96-
pub async fn run(&mut self) -> anyhow::Result<()> {
91+
pub async fn on_leader_event(&mut self, leader_event: LeaderEvent) -> anyhow::Result<()> {
9792
match self {
98-
Self::Follower => {
99-
futures::future::pending::<()>().await;
100-
Ok(())
101-
}
93+
ClusterControllerState::Follower => Ok(()),
94+
ClusterControllerState::Leader(leader) => leader.on_leader_event(leader_event).await,
95+
}
96+
}
97+
98+
/// Runs the cluster controller state related tasks. It returns a [`LeaderEvent`] which needs to
99+
/// be processed by calling [`Self::on_leader_event`].
100+
pub async fn run(&mut self) -> anyhow::Result<LeaderEvent> {
101+
match self {
102+
Self::Follower => futures::future::pending::<anyhow::Result<_>>().await,
102103
Self::Leader(leader) => leader.run().await,
103104
}
104105
}
@@ -125,6 +126,15 @@ where
125126
}
126127
}
127128

129+
/// Events that are emitted by a leading cluster controller that need to be processed explicitly
130+
/// because their operations are not cancellation safe.
131+
#[derive(Debug)]
132+
pub enum LeaderEvent {
133+
TrimLogs,
134+
LogsUpdate,
135+
PartitionTableUpdate,
136+
}
137+
128138
pub struct Leader<T> {
129139
metadata: Metadata,
130140
bifrost: Bifrost,
@@ -224,48 +234,82 @@ where
224234
create_log_trim_interval(&configuration.admin);
225235
}
226236

227-
async fn run(&mut self) -> anyhow::Result<()> {
228-
let bifrost_admin = BifrostAdmin::new(
229-
&self.bifrost,
230-
&self.metadata_writer,
231-
&self.metadata_store_client,
232-
);
233-
237+
async fn run(&mut self) -> anyhow::Result<LeaderEvent> {
234238
loop {
235239
tokio::select! {
236240
_ = self.find_logs_tail_interval.tick() => {
237241
self.logs_controller.find_logs_tail();
238242
}
239243
_ = OptionFuture::from(self.log_trim_interval.as_mut().map(|interval| interval.tick())) => {
240-
let result = self.trim_logs(bifrost_admin).await;
241-
242-
if let Err(err) = result {
243-
warn!("Could not trim the logs. This can lead to increased disk usage: {err}");
244-
}
244+
return Ok(LeaderEvent::TrimLogs);
245245
}
246246
result = self.logs_controller.run_async_operations() => {
247247
result?;
248248
}
249249
Ok(_) = self.logs_watcher.changed() => {
250-
self.logs_controller.on_logs_update(self.metadata.logs_ref())?;
251-
// tell the scheduler about potentially newly provisioned logs
252-
self.scheduler.on_logs_update(self.logs.live_load(), self.partition_table.live_load()).await?
250+
return Ok(LeaderEvent::LogsUpdate);
251+
253252
}
254253
Ok(_) = self.partition_table_watcher.changed() => {
255-
let partition_table = self.partition_table.live_load();
256-
let logs = self.logs.live_load();
257-
258-
self.logs_controller.on_partition_table_update(partition_table);
259-
self.scheduler.on_logs_update(logs, partition_table).await?;
254+
return Ok(LeaderEvent::PartitionTableUpdate);
260255
}
261256
}
262257
}
263258
}
264259

265-
async fn trim_logs(
266-
&self,
267-
bifrost_admin: BifrostAdmin<'_>,
268-
) -> Result<(), restate_bifrost::Error> {
260+
pub async fn on_leader_event(&mut self, leader_event: LeaderEvent) -> anyhow::Result<()> {
261+
match leader_event {
262+
LeaderEvent::TrimLogs => {
263+
self.trim_logs().await;
264+
}
265+
LeaderEvent::LogsUpdate => {
266+
self.on_logs_update().await?;
267+
}
268+
LeaderEvent::PartitionTableUpdate => {
269+
self.on_partition_table_update().await?;
270+
}
271+
}
272+
273+
Ok(())
274+
}
275+
276+
async fn on_logs_update(&mut self) -> anyhow::Result<()> {
277+
self.logs_controller
278+
.on_logs_update(self.metadata.logs_ref())?;
279+
// tell the scheduler about potentially newly provisioned logs
280+
self.scheduler
281+
.on_logs_update(self.logs.live_load(), self.partition_table.live_load())
282+
.await?;
283+
284+
Ok(())
285+
}
286+
287+
async fn on_partition_table_update(&mut self) -> anyhow::Result<()> {
288+
let partition_table = self.partition_table.live_load();
289+
let logs = self.logs.live_load();
290+
291+
self.logs_controller
292+
.on_partition_table_update(partition_table);
293+
self.scheduler.on_logs_update(logs, partition_table).await?;
294+
295+
Ok(())
296+
}
297+
298+
async fn trim_logs(&self) {
299+
let result = self.trim_logs_inner().await;
300+
301+
if let Err(err) = result {
302+
warn!("Could not trim the logs. This can lead to increased disk usage: {err}");
303+
}
304+
}
305+
306+
async fn trim_logs_inner(&self) -> Result<(), restate_bifrost::Error> {
307+
let bifrost_admin = BifrostAdmin::new(
308+
&self.bifrost,
309+
&self.metadata_writer,
310+
&self.metadata_store_client,
311+
);
312+
269313
let cluster_state = self.cluster_state_watcher.current();
270314

271315
let mut persisted_lsns_per_partition: BTreeMap<

0 commit comments

Comments
 (0)