From c281aba30e69a501fc183d068894bfa47f891700 Mon Sep 17 00:00:00 2001 From: Nick Mathewson Date: Sun, 24 Oct 2010 11:38:29 -0400 Subject: [PATCH] Fix a nasty bug related to use of dup() with epoll on Linux Current versions of the Linux kernel don't seem to remove the struct epitem for a given (file,fd) combo when the fd is closed unless the file itself is also completely closed. This means that if you do: fd = dup(fd_orig); add(fd); close(fd); dup2(fd_orig, fd); add(fd); you will get an EEXIST when you should have gotten a success. This could cause warnings and dropped events when using dup and epoll. The solution is pretty simple: when we get an EEXIST from EPOLL_CTL_ADD, we retry with EPOLL_CTL_MOD. Unit test included to demonstrate the bug. Found due to the patient efforts of Gilad Benjamini; diagnosed with help from Nicholas Marriott. --- epoll.c | 32 +++++++++++++--------- test/regress.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 13 deletions(-) diff --git a/epoll.c b/epoll.c index b574bf4d59..9c8f0e99e7 100644 --- a/epoll.c +++ b/epoll.c @@ -155,7 +155,6 @@ epoll_apply_changes(struct event_base *base) int op, events; for (i = 0; i < changelist->n_changes; ++i) { - int precautionary_add = 0; ch = &changelist->changes[i]; events = 0; @@ -203,13 +202,12 @@ epoll_apply_changes(struct event_base *base) on the fd, we need to try the ADD anyway, in case the fd was closed at some in the middle. If it wasn't, the ADD operation - will fail with; that's okay. - */ - precautionary_add = 1; + will fail with EEXIST; and we retry it as a + MOD. */ + op = EPOLL_CTL_ADD; } else if (ch->old_events) { op = EPOLL_CTL_MOD; } - } else if ((ch->read_change & EV_CHANGE_DEL) || (ch->write_change & EV_CHANGE_DEL)) { /* If we're deleting anything, we'll want to do a MOD @@ -255,14 +253,22 @@ epoll_apply_changes(struct event_base *base) (int)epev.events, ch->fd)); } - } else if (op == EPOLL_CTL_ADD && errno == EEXIST && - precautionary_add) { - /* If a precautionary ADD operation fails with - EEXIST, that's fine too. - */ - event_debug(("Epoll ADD(%d) on fd %d gave %s: ADD was redundant", - (int)epev.events, - ch->fd, strerror(errno))); + } else if (op == EPOLL_CTL_ADD && errno == EEXIST) { + /* If an ADD operation fails with EEXIST, + * either the operation was redundant (as with a + * precautionary add), or we ran into a fun + * kernel bug where using dup*() to duplicate the + * same file into the same fd gives you the same epitem + * rather than a fresh one. For the second case, + * we must retry with MOD. */ + if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { + event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", + (int)epev.events, ch->fd); + } else { + event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", + (int)epev.events, + ch->fd)); + } } else if (op == EPOLL_CTL_DEL && (errno == ENOENT || errno == EBADF || errno == EPERM)) { diff --git a/test/regress.c b/test/regress.c index 21c921b6e8..0cc4017025 100644 --- a/test/regress.c +++ b/test/regress.c @@ -2061,6 +2061,76 @@ test_event_pending(void *ptr) } } +#ifndef WIN32 +/* You can't do this test on windows, since dup2 doesn't work on sockets */ + +static void +dfd_cb(evutil_socket_t fd, short e, void *data) +{ + *(int*)data = (int)e; +} + +/* Regression test for our workaround for a fun epoll/linux related bug + * where fd2 = dup(fd1); add(fd2); close(fd2); dup2(fd1,fd2); add(fd2) + * will get you an EEXIST */ +static void +test_dup_fd(void *arg) +{ + struct basic_test_data *data = arg; + struct event_base *base = data->base; + struct event *ev1=NULL, *ev2=NULL; + int fd, dfd=-1; + int ev1_got, ev2_got; + + tt_int_op(write(data->pair[0], "Hello world", + strlen("Hello world")), >, 0); + fd = data->pair[1]; + + dfd = dup(fd); + tt_int_op(dfd, >=, 0); + + ev1 = event_new(base, fd, EV_READ|EV_PERSIST, dfd_cb, &ev1_got); + ev2 = event_new(base, dfd, EV_READ|EV_PERSIST, dfd_cb, &ev2_got); + ev1_got = ev2_got = 0; + event_add(ev1, NULL); + event_add(ev2, NULL); + event_base_loop(base, EVLOOP_ONCE); + tt_int_op(ev1_got, ==, EV_READ); + tt_int_op(ev2_got, ==, EV_READ); + + /* Now close and delete dfd then dispatch. We need to do the + * dispatch here so that when we add it later, we think there + * was an intermediate delete. */ + close(dfd); + event_del(ev2); + ev1_got = ev2_got = 0; + event_base_loop(base, EVLOOP_ONCE); + tt_want_int_op(ev1_got, ==, EV_READ); + tt_int_op(ev2_got, ==, 0); + + /* Re-duplicate the fd. We need to get the same duplicated + * value that we closed to provoke the epoll quirk. Also, we + * need to change the events to write, or else the old lingering + * read event will make the test pass whether the change was + * successful or not. */ + tt_int_op(dup2(fd, dfd), ==, dfd); + event_free(ev2); + ev2 = event_new(base, dfd, EV_WRITE|EV_PERSIST, dfd_cb, &ev2_got); + event_add(ev2, NULL); + ev1_got = ev2_got = 0; + event_base_loop(base, EVLOOP_ONCE); + tt_want_int_op(ev1_got, ==, EV_READ); + tt_int_op(ev2_got, ==, EV_WRITE); + +end: + if (ev1) + event_free(ev1); + if (ev2) + event_free(ev2); + close(dfd); +} +#endif + #ifdef _EVENT_DISABLE_MM_REPLACEMENT static void test_mm_functions(void *arg) @@ -2229,6 +2299,9 @@ struct testcase_t main_testcases[] = { { "event_once", test_event_once, TT_ISOLATED, &basic_setup, NULL }, { "event_pending", test_event_pending, TT_ISOLATED, &basic_setup, NULL }, +#ifndef WIN32 + { "dup_fd", test_dup_fd, TT_ISOLATED, &basic_setup, NULL }, +#endif { "mm_functions", test_mm_functions, TT_FORK, NULL, NULL }, BASIC(many_events, TT_ISOLATED),