Skip to content

Commit 8a6c3f7

Browse files
authored
Merge pull request #352 from conductor-oss/fix/join-latency-issues
Reduce FORK/JOIN latency
2 parents fa9420f + 95e8c12 commit 8a6c3f7

File tree

6 files changed

+113
-16
lines changed

6 files changed

+113
-16
lines changed

core/src/main/java/com/netflix/conductor/core/config/ConductorProperties.java

+36
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import org.springframework.util.unit.DataSize;
2525
import org.springframework.util.unit.DataUnit;
2626

27+
import com.netflix.conductor.model.TaskModel;
28+
2729
@ConfigurationProperties("conductor.app")
2830
public class ConductorProperties {
2931

@@ -226,6 +228,24 @@ public class ConductorProperties {
226228
/** Used to limit the size of task execution logs. */
227229
private int taskExecLogSizeLimit = 10;
228230

231+
/**
232+
* This property defines the number of polls (executions) after which SystemTasks
233+
* implementing getEvaluationOffset should begin postponing the next execution.
234+
*
235+
* @see
236+
* com.netflix.conductor.core.execution.tasks.WorkflowSystemTask#getEvaluationOffset(TaskModel,
237+
* long)
238+
* @see com.netflix.conductor.core.execution.tasks.Join#getEvaluationOffset(TaskModel, long)
239+
*/
240+
private int systemTaskPostponeThreshold = 200;
241+
242+
/**
243+
* Timeout used by {@link com.netflix.conductor.core.execution.tasks.SystemTaskWorker} when
244+
* polling, i.e. the call to {@link com.netflix.conductor.dao.QueueDAO#pop(String, int, int)}.
245+
*/
246+
@DurationUnit(ChronoUnit.MILLIS)
247+
private Duration systemTaskQueuePopTimeout = Duration.ofMillis(100);
248+
229249
public String getStack() {
230250
return stack;
231251
}
@@ -567,4 +587,20 @@ public Map<String, Object> getAll() {
567587
props.forEach((key, value) -> map.put(key.toString(), value));
568588
return map;
569589
}
590+
591+
public void setSystemTaskPostponeThreshold(int systemTaskPostponeThreshold) {
592+
this.systemTaskPostponeThreshold = systemTaskPostponeThreshold;
593+
}
594+
595+
public int getSystemTaskPostponeThreshold() {
596+
return systemTaskPostponeThreshold;
597+
}
598+
599+
public Duration getSystemTaskQueuePopTimeout() {
600+
return systemTaskQueuePopTimeout;
601+
}
602+
603+
public void setSystemTaskQueuePopTimeout(Duration systemTaskQueuePopTimeout) {
604+
this.systemTaskQueuePopTimeout = systemTaskQueuePopTimeout;
605+
}
570606
}

core/src/main/java/com/netflix/conductor/core/execution/tasks/Join.java

+17-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
import org.springframework.stereotype.Component;
2121

22+
import com.netflix.conductor.annotations.VisibleForTesting;
2223
import com.netflix.conductor.common.utils.TaskUtils;
24+
import com.netflix.conductor.core.config.ConductorProperties;
2325
import com.netflix.conductor.core.execution.WorkflowExecutor;
2426
import com.netflix.conductor.model.TaskModel;
2527
import com.netflix.conductor.model.WorkflowModel;
@@ -29,8 +31,13 @@
2931
@Component(TASK_TYPE_JOIN)
3032
public class Join extends WorkflowSystemTask {
3133

32-
public Join() {
34+
@VisibleForTesting static final double EVALUATION_OFFSET_BASE = 1.2;
35+
36+
private final ConductorProperties properties;
37+
38+
public Join(ConductorProperties properties) {
3339
super(TASK_TYPE_JOIN);
40+
this.properties = properties;
3441
}
3542

3643
@Override
@@ -117,12 +124,17 @@ public boolean execute(
117124
}
118125

119126
@Override
120-
public Optional<Long> getEvaluationOffset(TaskModel taskModel, long defaultOffset) {
121-
int index = taskModel.getPollCount() > 0 ? taskModel.getPollCount() - 1 : 0;
122-
if (index == 0) {
127+
public Optional<Long> getEvaluationOffset(TaskModel taskModel, long maxOffset) {
128+
int pollCount = taskModel.getPollCount();
129+
// Assuming pollInterval = 50ms and systemTaskPostponeThreshold = 200, this will cause
130+
// a JOIN task to be evaluated continuously during the first 10 seconds and the FORK/JOIN
131+
// will end with minimal delay.
132+
if (pollCount <= properties.getSystemTaskPostponeThreshold()) {
123133
return Optional.of(0L);
124134
}
125-
return Optional.of(Math.min((long) Math.pow(2, index), defaultOffset));
135+
136+
double exp = pollCount - properties.getSystemTaskPostponeThreshold();
137+
return Optional.of(Math.min((long) Math.pow(EVALUATION_OFFSET_BASE, exp), maxOffset));
126138
}
127139

128140
public boolean isAsync() {

core/src/main/java/com/netflix/conductor/core/execution/tasks/SystemTaskWorker.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class SystemTaskWorker extends LifecycleAwareComponent {
5252
private final AsyncSystemTaskExecutor asyncSystemTaskExecutor;
5353
private final ConductorProperties properties;
5454
private final ExecutionService executionService;
55+
private final int queuePopTimeout;
5556

5657
ConcurrentHashMap<String, ExecutionConfig> queueExecutionConfigMap = new ConcurrentHashMap<>();
5758

@@ -67,6 +68,7 @@ public SystemTaskWorker(
6768
this.queueDAO = queueDAO;
6869
this.pollInterval = properties.getSystemTaskWorkerPollInterval().toMillis();
6970
this.executionService = executionService;
71+
this.queuePopTimeout = (int) properties.getSystemTaskQueuePopTimeout().toMillis();
7072

7173
LOGGER.info("SystemTaskWorker initialized with {} threads", threadCount);
7274
}
@@ -114,7 +116,8 @@ void pollAndExecute(WorkflowSystemTask systemTask, String queueName) {
114116

115117
LOGGER.debug("Polling queue: {} with {} slots acquired", queueName, messagesToAcquire);
116118

117-
List<String> polledTaskIds = queueDAO.pop(queueName, messagesToAcquire, 200);
119+
List<String> polledTaskIds =
120+
queueDAO.pop(queueName, messagesToAcquire, queuePopTimeout);
118121

119122
Monitors.recordTaskPoll(queueName);
120123
LOGGER.debug("Polling queue:{}, got {} tasks", queueName, polledTaskIds.size());

core/src/main/java/com/netflix/conductor/core/execution/tasks/WorkflowSystemTask.java

+13-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,19 @@ public boolean execute(
6565
*/
6666
public void cancel(WorkflowModel workflow, TaskModel task, WorkflowExecutor workflowExecutor) {}
6767

68-
public Optional<Long> getEvaluationOffset(TaskModel taskModel, long defaultOffset) {
68+
/**
69+
* Determines the time in seconds by which the next execution of a task will be postponed after
70+
* an execution. By default, this method returns {@code Optional.empty()}.
71+
*
72+
* <p>WorkflowSystemTasks may override this method to define a custom evaluation offset based on
73+
* the task's behavior or requirements.
74+
*
75+
* @param taskModel task model
76+
* @param maxOffset the max recommended offset value to use
77+
* @return an {@code Optional<Long>} specifying the evaluation offset in seconds, or {@code
78+
* Optional.empty()} if no postponement is required
79+
*/
80+
public Optional<Long> getEvaluationOffset(TaskModel taskModel, long maxOffset) {
6981
return Optional.empty();
7082
}
7183

core/src/test/java/com/netflix/conductor/core/execution/TestDeciderOutcomes.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ public Switch switchTask() {
128128

129129
@Bean(TASK_TYPE_JOIN)
130130
public Join join() {
131-
return new Join();
131+
return new Join(new ConductorProperties());
132132
}
133133

134134
@Bean
@@ -595,7 +595,8 @@ public void testOptionalWithDynamicFork() {
595595

596596
assertEquals(TaskModel.Status.SCHEDULED, outcome.tasksToBeScheduled.get(0).getStatus());
597597
System.out.println(outcome.tasksToBeScheduled.get(0));
598-
new Join().execute(workflow, outcome.tasksToBeScheduled.get(0), null);
598+
new Join(new ConductorProperties())
599+
.execute(workflow, outcome.tasksToBeScheduled.get(0), null);
599600
assertEquals(TaskModel.Status.COMPLETED, outcome.tasksToBeScheduled.get(0).getStatus());
600601
}
601602

core/src/test/java/com/netflix/conductor/core/execution/tasks/TestJoin.java

+40-7
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.junit.Test;
2020

2121
import com.netflix.conductor.common.metadata.workflow.WorkflowTask;
22+
import com.netflix.conductor.core.config.ConductorProperties;
2223
import com.netflix.conductor.core.execution.WorkflowExecutor;
2324
import com.netflix.conductor.model.TaskModel;
2425
import com.netflix.conductor.model.WorkflowModel;
@@ -27,6 +28,9 @@
2728
import static org.mockito.Mockito.mock;
2829

2930
public class TestJoin {
31+
32+
private final ConductorProperties properties = new ConductorProperties();
33+
3034
private final WorkflowExecutor executor = mock(WorkflowExecutor.class);
3135

3236
private TaskModel createTask(
@@ -65,7 +69,7 @@ public void testShouldNotMarkJoinAsCompletedWithErrorsWhenNotDone() {
6569
// task2 is not scheduled yet, so the join is not completed
6670
var wfJoinPair = createJoinWorkflow(List.of(task1), "task2");
6771

68-
var join = new Join();
72+
var join = new Join(properties);
6973
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
7074
assertFalse(result);
7175
}
@@ -77,7 +81,7 @@ public void testJoinCompletesSuccessfullyWhenAllTasksSucceed() {
7781

7882
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
7983

80-
var join = new Join();
84+
var join = new Join(properties);
8185
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
8286
assertTrue("Join task should execute successfully when all tasks succeed", result);
8387
assertEquals(
@@ -93,7 +97,7 @@ public void testJoinWaitsWhenAnyTaskIsNotTerminal() {
9397

9498
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
9599

96-
var join = new Join();
100+
var join = new Join(properties);
97101
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
98102
assertFalse("Join task should wait when any task is not in terminal state", result);
99103
}
@@ -107,7 +111,7 @@ public void testJoinFailsWhenMandatoryTaskFails() {
107111

108112
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
109113

110-
var join = new Join();
114+
var join = new Join(properties);
111115
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
112116
assertTrue("Join task should be executed when a mandatory task fails", result);
113117
assertEquals(
@@ -125,7 +129,7 @@ public void testJoinCompletesWithErrorsWhenOnlyOptionalTasksFail() {
125129

126130
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
127131

128-
var join = new Join();
132+
var join = new Join(properties);
129133
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
130134
assertTrue("Join task should be executed when only optional tasks fail", result);
131135
assertEquals(
@@ -143,7 +147,7 @@ public void testJoinAggregatesFailureReasonsCorrectly() {
143147

144148
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
145149

146-
var join = new Join();
150+
var join = new Join(properties);
147151
var result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
148152
assertTrue("Join task should be executed when tasks fail", result);
149153
assertEquals(
@@ -174,7 +178,7 @@ public void testJoinWaitsForAllTasksBeforeFailingDueToPermissiveTaskFailure() {
174178
var wfJoinPair = createJoinWorkflow(List.of(task1, task2));
175179

176180
// First execution: Task 2 is not yet terminal.
177-
var join = new Join();
181+
var join = new Join(properties);
178182
boolean result = join.execute(wfJoinPair.getLeft(), wfJoinPair.getRight(), executor);
179183
assertFalse("Join task should wait as not all tasks are terminal", result);
180184

@@ -189,4 +193,33 @@ public void testJoinWaitsForAllTasksBeforeFailingDueToPermissiveTaskFailure() {
189193
TaskModel.Status.FAILED,
190194
wfJoinPair.getRight().getStatus());
191195
}
196+
197+
@Test
198+
public void testEvaluationOffsetWhenPollCountIsBelowThreshold() {
199+
var join = new Join(properties);
200+
var taskModel = createTask("join1", TaskModel.Status.COMPLETED, false, false);
201+
taskModel.setPollCount(properties.getSystemTaskPostponeThreshold() - 1);
202+
var opt = join.getEvaluationOffset(taskModel, 30L);
203+
assertEquals(0L, (long) opt.orElseThrow());
204+
}
205+
206+
@Test
207+
public void testEvaluationOffsetWhenPollCountIsAboveThreshold() {
208+
final var maxOffset = 30L;
209+
var join = new Join(properties);
210+
var taskModel = createTask("join1", TaskModel.Status.COMPLETED, false, false);
211+
212+
taskModel.setPollCount(properties.getSystemTaskPostponeThreshold() + 1);
213+
var opt = join.getEvaluationOffset(taskModel, maxOffset);
214+
assertEquals(1L, (long) opt.orElseThrow());
215+
216+
taskModel.setPollCount(properties.getSystemTaskPostponeThreshold() + 10);
217+
opt = join.getEvaluationOffset(taskModel, maxOffset);
218+
long expected = (long) Math.pow(Join.EVALUATION_OFFSET_BASE, 10);
219+
assertEquals(expected, (long) opt.orElseThrow());
220+
221+
taskModel.setPollCount(properties.getSystemTaskPostponeThreshold() + 40);
222+
opt = join.getEvaluationOffset(taskModel, maxOffset);
223+
assertEquals(maxOffset, (long) opt.orElseThrow());
224+
}
192225
}

0 commit comments

Comments
 (0)