Skip to content

Commit 8cf5a6c

Browse files
committed
(core) add ANR / livelock protection
This one has quite far-reaching consequences. The privilege-separated part will now monitor a shared timestamp counter. If that counter stalls for more than a fixed threshold of about 5 seconds, an interrupt signal is sent. This interrupt signal hooks into the lua state machine and generates a scripting error, prompting the normal script error recover. If that does not resolve the situation, the child is forcibly killed and we move into hard crash recovery. Next iterations to consider for this tactic is to also have a SIGINT per time-out restriction, and a chain-loader version used by the _lwa stage.
1 parent 72ccb5d commit 8cf5a6c

File tree

9 files changed

+144
-24
lines changed

9 files changed

+144
-24
lines changed

HACKING.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,17 @@ debugging quickly becomes very disruptive, and tracing sources becomes more
158158
important to get valuable information. In this section a few such sources and
159159
tactics are listed.
160160

161+
## Watchdog
162+
163+
When running a low level platform where arcan acts as the display server,
164+
there is a watchdog active that will first SIGINT (causing the Lua VM to reset)
165+
and if watchdog still isn't set, SIGKILL. When attaching a debugger, disable
166+
the watchdog with:
167+
168+
set arcan_watchdog_ping=0
169+
170+
which is allocated in arcan\_conductor\_enable\_watchdog().
171+
161172
## Engine invocation
162173

163174
The -q and -g arguments are useful tools for improving debugging. The -q

src/engine/arcan_conductor.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <stdbool.h>
1515
#include <stdio.h>
1616
#include <unistd.h>
17+
#include <stdatomic.h>
1718

1819
#include "arcan_math.h"
1920
#include "arcan_general.h"
@@ -30,6 +31,16 @@
3031
#include "../platform/platform.h"
3132
#include "../platform/video_platform.h"
3233

34+
/* defined in platform.h, used in psep open */
35+
uint64_t* _Atomic volatile arcan_watchdog_ping = NULL;
36+
37+
void arcan_conductor_enable_watchdog()
38+
{
39+
arcan_watchdog_ping = arcan_alloc_mem(system_page_size,
40+
ARCAN_MEM_SHARED, ARCAN_MEM_BZERO, ARCAN_MEMALIGN_NATURAL);
41+
atomic_store(arcan_watchdog_ping, arcan_timemillis());
42+
}
43+
3344
/*
3445
* checklist:
3546
* [ ] actual setup to realtime- plot the different timings and stages
@@ -545,6 +556,9 @@ static void conductor_cycle(int nticks)
545556
*
546557
* and tag transforms handlers being one tick off
547558
*/
559+
if (arcan_watchdog_ping)
560+
atomic_store(arcan_watchdog_ping, arcan_timemillis());
561+
548562
arcan_lua_tick(main_lua_context, nticks, conductor.tick_count);
549563
outcb(nticks);
550564

src/engine/arcan_conductor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ enum synch_method {
1919
int arcan_conductor_run(arcan_tick_cb cb);
2020
#endif
2121

22+
/* [ called from platform ] */
23+
void arcan_conductor_enable_watchdog();
24+
2225
/* [ called from platform ]
2326
* Specify the relationship between display, gpu, synch and object - displays
2427
* are never 'lost' though they can be disabled with a bad vobj. */

src/engine/arcan_lua.c

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ static bool tgtevent(arcan_vobj_id dst, arcan_event ev);
349349
static int alua_exposefuncs(lua_State* ctx, unsigned char debugfuncs);
350350
static bool grabapplfunction(lua_State* ctx, const char* funame, size_t funlen);
351351
static void rectrigger(const char* msg, ...);
352+
static void wraperr(lua_State* ctx, int errc, const char* src);
352353

353354
static char* colon_escape(char* in)
354355
{
@@ -7195,6 +7196,22 @@ void arcan_lua_dostring(lua_State* ctx, const char* code)
71957196
(void)luaL_dostring(ctx, code);
71967197
}
71977198

7199+
7200+
static jmp_buf watchdog_dst;
7201+
static void error_hook(lua_State* ctx, lua_Debug* ar)
7202+
{
7203+
/* will longjump into the pcall error handler which will trigger recovery */
7204+
luaL_error(ctx, "ANR - Application Not Responding");
7205+
}
7206+
7207+
static void sig_watchdog(int sig, siginfo_t* info, void* unused)
7208+
{
7209+
if (getppid() == info->si_pid){
7210+
/* set a hook that we can use to then invoke our error handler path */
7211+
lua_sethook(luactx.last_ctx, error_hook, LUA_MASKCOUNT, 1);
7212+
}
7213+
}
7214+
71987215
lua_State* arcan_lua_alloc()
71997216
{
72007217
lua_State* res = luaL_newstate();
@@ -7204,6 +7221,14 @@ lua_State* arcan_lua_alloc()
72047221
if (res)
72057222
luaL_openlibs(res);
72067223

7224+
/* watchdog has triggered with an ANR, continue the bouncy castle towards
7225+
* the 'normal' scripting recovery stage so we get information on where
7226+
* this comes from */
7227+
sigaction(SIGUSR1, &(struct sigaction){
7228+
.sa_sigaction = &sig_watchdog,
7229+
.sa_flags = SA_SIGINFO
7230+
}, NULL);
7231+
72077232
luactx.last_ctx = res;
72087233
return res;
72097234
}
@@ -7266,11 +7291,13 @@ static int alua_shutdown(lua_State *ctx)
72667291
}
72677292

72687293
/*
7269-
* There are three error functions,
7270-
* [wraperr] is called on a lua_pcall() failure or from panic()
7294+
* There are four error paths,
7295+
* [wraperr] is called on a lua_pcall() failure, panic or parent watchdog
72717296
* [panic] is called by the lua VM on an internal failure
72727297
* [rectrigger] is called on C API functions from Lua with bad arguments
72737298
* (through arcan_fatal redefinition)
7299+
* [SIGINT-handler] is used by the parent process if we fail to ping the
7300+
* watchdog (conductor processing)
72747301
*
72757302
* + the special 'alua_call' wrapper that may forward here with
72767303
* the results from calling either _fatal entry point or debug:traceback
@@ -7380,7 +7407,6 @@ static void rectrigger(const char* msg, ...)
73807407
longjmp(arcanmain_recover_state, ARCAN_LUA_RECOVERY_SWITCH);
73817408
}
73827409

7383-
73847410
static void alua_call(
73857411
struct arcan_luactx* ctx, int nargs, int retc, const char* src)
73867412
{

src/engine/arcan_main.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,6 @@ static struct arcan_strarr arr_hooks = {0};
6161
* This allows a simpler recovery script to adopt orphaned frameservers.
6262
*/
6363
jmp_buf arcanmain_recover_state;
64-
65-
/*
66-
* default, probed / replaced on some systems
67-
*/
68-
extern int system_page_size;
69-
70-
/*
71-
* set manually from debugger to disable as much timing sensitive
72-
* behavior as posible in order to minimise effects of breakpoints
73-
*/
74-
bool arcan_in_debug;
75-
7664
struct arcan_luactx* main_lua_context;
7765

7866
static const struct option longopts[] = {
@@ -332,6 +320,10 @@ static void add_hookscript(const char* instr)
332320

333321
int MAIN_REDIR(int argc, char* argv[])
334322
{
323+
/* needed first as device_init might fork and need some shared mem */
324+
system_page_size = sysconf(_SC_PAGE_SIZE);
325+
arcan_conductor_enable_watchdog();
326+
335327
/*
336328
* these are, in contrast to normal video_init/event_init, only set once
337329
* and not rerun on appl- switch etc. typically no-ops but may be used to
@@ -675,7 +667,6 @@ int MAIN_REDIR(int argc, char* argv[])
675667

676668
/* setup device polling, cleanup, ... */
677669
arcan_led_init();
678-
system_page_size = sysconf(_SC_PAGE_SIZE);
679670

680671
/*
681672
* fallback implementation resides here and a little further down in the "if

src/engine/arcan_mem.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ enum arcan_memtypes {
7777
*/
7878
ARCAN_MEM_STRINGBUF,
7979

80+
/*
81+
* Used for memory that needs to be shared/forked into a child process
82+
*/
83+
ARCAN_MEM_SHARED,
84+
8085
/*
8186
* Use- specific buffer associated with a video object (container
8287
* for 3d model, container for frameserver etc.) SMALL to TINY

src/platform/platform.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,26 @@ unsigned long long arcan_timemillis();
3131
* existing ones due to platform specific expansion rules
3232
*/
3333

34+
/*
35+
* Updated in the conductor stage with the latest known timestamp, Checked in
36+
* the platform/posix/psep_open and is used to either send SIGINT (first) then
37+
* if another timeout arrives, SIGKILL.
38+
*
39+
* The SIGINT is used in arcan_lua.c and used to trigger wraperr, which in turn
40+
* will log the event and rebuild the VM.
41+
*
42+
* This serves to protect against livelocks in general, but in particular for
43+
* the worst sinners, lua scripts getting stuck in while- loops from bad
44+
* programming, and complex interactions in the egl-dri platform.
45+
*/
46+
extern uint64_t* _Atomic volatile arcan_watchdog_ping;
47+
#define WATCHDOG_ANR_TIMEOUT_MS 5000
48+
49+
/*
50+
* default, probed / replaced on some systems
51+
*/
52+
extern int system_page_size;
53+
3454
/*
3555
* Execute and wait- for completion for the specified target. This will shut
3656
* down as much engine- locked resources as possible while still possible to

src/platform/posix/mem.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,16 @@ void* arcan_alloc_mem(size_t nb,
156156
size_t total;
157157

158158
switch (type){
159+
/* this is not safe to _free at the moment since we don't track metadata yet,
160+
* the refactor adding obsd/musl-new heap allocator should take this into account */
161+
case ARCAN_MEM_SHARED:
162+
rptr = mmap(NULL, system_page_size,
163+
PROT_READ | PROT_WRITE,
164+
MAP_SHARED | MAP_ANONYMOUS, -1, 0
165+
);
166+
total = system_page_size;
167+
168+
break;
159169
case ARCAN_MEM_BINDING:
160170
case ARCAN_MEM_THREADCTX:
161171
case ARCAN_MEM_VBUFFER:
@@ -289,7 +299,6 @@ void arcan_mem_free(void* inptr)
289299
* automatically shrink, but rather reset and flag
290300
* as unused */
291301

292-
293302
free(inptr);
294303
}
295304

src/platform/posix/psep_open.c

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <stdlib.h>
1010
#include <stdio.h>
1111
#include <stdint.h>
12+
#include <stdatomic.h>
1213
#include <inttypes.h>
1314
#include <ctype.h>
1415
#include <grp.h>
@@ -29,6 +30,9 @@
2930
#include <stdlib.h>
3031
#include <string.h>
3132
#include <unistd.h>
33+
#include "platform.h"
34+
35+
static uint64_t watchdog_anr_sent;
3236

3337
#if defined(__OpenBSD__) || defined(__FreeBSD__)
3438
#include <sys/event.h>
@@ -378,7 +382,7 @@ static int access_device(const char* path, int arg, bool release, bool* keep)
378382
struct stat devst;
379383
*keep = false;
380384

381-
/* special case, substitute TTY for the active tty device (if known) */
385+
/* special case 1, substitute TTY for the active tty device (if known) */
382386
if (strcmp(path, "TTY") == 0){
383387
if (!got_tty.active)
384388
return -1;
@@ -388,7 +392,7 @@ static int access_device(const char* path, int arg, bool release, bool* keep)
388392
}
389393
}
390394

391-
/* special case 2 - activate TTY (GRAPHICS switch) deferred to the first frame
395+
/* special case 3 - activate TTY (GRAPHICS switch) deferred to the first frame
392396
* so that any error output is actually visible after init but before first
393397
* composition, return this as an 'invalid file'. */
394398
if (strcmp(path, "TTYGRAPHICS") == 0){
@@ -430,6 +434,8 @@ static int access_device(const char* path, int arg, bool release, bool* keep)
430434
}
431435
*keep = true;
432436
if (whitelist[ind].mode & MODE_DRM){
437+
if (arcan_watchdog_ping)
438+
atomic_store(arcan_watchdog_ping, arcan_timemillis());
433439
drmSetMaster(whitelist[ind].fd);
434440
}
435441
return whitelist[ind].fd;
@@ -573,13 +579,47 @@ static void check_child(pid_t child, bool die)
573579
}
574580
}
575581
/* or we want the child to soft-die? */
576-
else if (die)
577-
kill(SIGTERM, child);
578-
579-
if (die){
582+
else if (die){
583+
kill(child, SIGTERM);
580584
release_devices();
581585
_exit(WEXITSTATUS(st));
582586
}
587+
588+
/* first send a watchdog signal, SIGUSR1, child will check if it comes from
589+
* ppid() or not - and use that as an 'ANR' (if it comes from the lua context,
590+
* second time around, it's killing time */
591+
uint64_t ts = arcan_watchdog_ping ? atomic_load(arcan_watchdog_ping) : 0;
592+
if (ts && arcan_timemillis() - ts > WATCHDOG_ANR_TIMEOUT_MS){
593+
594+
/* only trigger if there is a drm device open */
595+
bool found = false;
596+
for (size_t i = 0; i < COUNT_OF(whitelist); i++){
597+
if (whitelist[i].fd != -1 && (whitelist[i].mode & MODE_DRM)){
598+
found = true;
599+
break;
600+
}
601+
}
602+
if (!found)
603+
return;
604+
605+
/* the other option here would be to longjmp back into main, we ignore that as
606+
* that would cause more complexity with pledge/unveil, so shutdown and assume
607+
* an outer service manager would relaunch us if possible - we want explicit
608+
* relaunch in order to not act as a fork() oracle for ASLR break */
609+
if (watchdog_anr_sent){
610+
/* if (arcan_timemillis() - watchdog_anr_sent > WATCHDOG_ANR_TIMEOUT_MS){
611+
kill(child, SIGTERM);
612+
release_devices();
613+
_exit(EXIT_FAILURE);
614+
}*/
615+
}
616+
else {
617+
kill(child, SIGUSR1);
618+
watchdog_anr_sent = arcan_timemillis();
619+
}
620+
}
621+
else
622+
watchdog_anr_sent = 0;
583623
}
584624

585625
#if defined(__OpenBSD__) || defined(__FreeBSD__)
@@ -664,7 +704,7 @@ static void parent_loop(pid_t child, int netlink)
664704

665705
int rv = poll(pfd, netlink == -1 ? 1 : 2, 1000);
666706
if (-1 == rv && (errno != EAGAIN && errno != EINTR))
667-
check_child(child, true); /* don't to stronger than this */
707+
check_child(child, true); /* don't go stronger than this */
668708

669709
if (rv == 0)
670710
return;
@@ -760,6 +800,7 @@ static bool drop_privileges()
760800
static int psock = -1;
761801
void platform_device_init()
762802
{
803+
763804
/*
764805
* Signs of these environment variables indicate that we are in a context where
765806
* other display servers exist, drop out immediately so that we don't risk

0 commit comments

Comments
 (0)