lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler()
authorStefan Metzmacher <metze@samba.org>
Wed, 12 Oct 2022 15:26:16 +0000 (17:26 +0200)
committerStefan Metzmacher <metze@samba.org>
Wed, 19 Oct 2022 16:14:36 +0000 (16:14 +0000)
There were some reports that strace output an LDAP server socket is in
CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to
epoll() each time).

In the tstream_bsd code the problem happens when we have a pending
writev_send, while there's no readv_send pending. In that case
we still ask for TEVENT_FD_READ in order to notice connection errors
early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE.
And there are situations where we do that over and over again.

It happens like this with a Linux kernel:

    tcp_fin() has this:
        struct tcp_sock *tp = tcp_sk(sk);

        inet_csk_schedule_ack(sk);

        sk->sk_shutdown |= RCV_SHUTDOWN;
        sock_set_flag(sk, SOCK_DONE);

        switch (sk->sk_state) {
        case TCP_SYN_RECV:
        case TCP_ESTABLISHED:
                /* Move to CLOSE_WAIT */
                tcp_set_state(sk, TCP_CLOSE_WAIT);
                inet_csk_enter_pingpong_mode(sk);
                break;

It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but
sk->sk_err is not changed to indicate an error.

    tcp_sendmsg_locked has this:
    ...
        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;

        while (msg_data_left(msg)) {
                int copy = 0;

                skb = tcp_write_queue_tail(sk);
                if (skb)
                        copy = size_goal - skb->len;

                if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
                        bool first_skb;

    new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_space;

    ...

    wait_for_space:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                if (copied)
                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
                                 TCP_NAGLE_PUSH, size_goal);

                err = sk_stream_wait_memory(sk, &timeo);
                if (err != 0)
                        goto do_error;

It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't
hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns
-EAGAIN.

    tcp_poll has this:

        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering
TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN.

So we need to always clear TEVENT_FD_READ if we don't
have readable handler in order to avoid burning cpu.
But we turn it on again after a timeout of 1 second
in order to monitor the error state of the connection.

And now that our tsocket_bsd_error() helper checks for POLLRDHUP,
we can check if the socket is in an error state before calling the
writable handler when TEVENT_FD_READ was reported.
Only on error we'll call the writable handler, which will pick
the error without calling writev().

BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Ralph Boehme <slow@samba.org>
lib/tsocket/tsocket_bsd.c
selftest/knownfail.d/tstream [deleted file]

index 72499561f2d89611f443aa0a386f5c216eeb677e..daba33eb9d77f6666bef504e4a23c064dad61b6d 100644 (file)
@@ -1754,6 +1754,9 @@ struct tstream_bsd {
        void (*readable_handler)(void *private_data);
        void *writeable_private;
        void (*writeable_handler)(void *private_data);
+
+       struct tevent_context *error_ctx;
+       struct tevent_timer *error_timer;
 };
 
 bool tstream_bsd_optimize_readv(struct tstream_context *stream,
@@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream,
        return old;
 }
 
+static void tstream_bsd_error_timer(struct tevent_context *ev,
+                                   struct tevent_timer *te,
+                                   struct timeval current_time,
+                                   void *private_data)
+{
+       struct tstream_bsd *bsds =
+               talloc_get_type(private_data,
+               struct tstream_bsd);
+
+       TALLOC_FREE(bsds->error_timer);
+
+       /*
+        * Turn on TEVENT_FD_READABLE() again
+        * if we have a writeable_handler that
+        * wants to monitor the connection
+        * for errors.
+        */
+       if (bsds->writeable_handler != NULL) {
+               TEVENT_FD_READABLE(bsds->fde);
+       }
+}
+
 static void tstream_bsd_fde_handler(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags,
@@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev,
        }
        if (flags & TEVENT_FD_READ) {
                if (!bsds->readable_handler) {
-                       if (bsds->writeable_handler) {
+                       struct timeval recheck_time;
+
+                       /*
+                        * In order to avoid cpu-spinning
+                        * we no longer want to get TEVENT_FD_READ
+                        */
+                       TEVENT_FD_NOT_READABLE(bsds->fde);
+
+                       if (!bsds->writeable_handler) {
+                               return;
+                       }
+
+                       /*
+                        * If we have a writeable handler we
+                        * want that to report connection errors
+                        * early.
+                        *
+                        * So we check if the socket is in an
+                        * error state.
+                        */
+                       if (bsds->error == 0) {
+                               int ret = tsocket_bsd_error(bsds->fd);
+
+                               if (ret == -1) {
+                                       bsds->error = errno;
+                               }
+                       }
+
+                       if (bsds->error != 0) {
+                               /*
+                                * Let the writeable handler report the error
+                                */
+                               bsds->writeable_handler(bsds->writeable_private);
+                               return;
+                       }
+
+                       /*
+                        * Here we called TEVENT_FD_NOT_READABLE() without
+                        * calling into the writeable handler.
+                        *
+                        * So we may have to wait for the kernels tcp stack
+                        * to report TEVENT_FD_WRITE in order to let
+                        * make progress and turn on TEVENT_FD_READABLE()
+                        * again.
+                        *
+                        * As a fallback we use a timer that turns on
+                        * TEVENT_FD_READABLE() again after a timeout of
+                        * 1 second.
+                        */
+
+                       if (bsds->error_timer != NULL) {
+                               return;
+                       }
+
+                       recheck_time = timeval_current_ofs(1, 0);
+                       bsds->error_timer = tevent_add_timer(bsds->error_ctx,
+                                                            bsds,
+                                                            recheck_time,
+                                                            tstream_bsd_error_timer,
+                                                            bsds);
+                       if (bsds->error_timer == NULL) {
+                               bsds->error = ENOMEM;
+                               /*
+                                * Let the writeable handler report the error
+                                */
                                bsds->writeable_handler(bsds->writeable_private);
                                return;
                        }
-                       TEVENT_FD_NOT_READABLE(bsds->fde);
                        return;
                }
                bsds->readable_handler(bsds->readable_private);
@@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds,
                TEVENT_FD_READABLE(bsds->fde);
        }
 
+       TALLOC_FREE(bsds->error_timer);
+
        bsds->readable_handler = handler;
        bsds->readable_private = private_data;
 
@@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
                bsds->writeable_handler = NULL;
                bsds->writeable_private = NULL;
                TEVENT_FD_NOT_WRITEABLE(bsds->fde);
-
+               TALLOC_FREE(bsds->error_timer);
+               bsds->error_ctx = NULL;
                return 0;
        }
 
@@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
                }
                bsds->event_ptr = NULL;
                TALLOC_FREE(bsds->fde);
+               TALLOC_FREE(bsds->error_timer);
+               bsds->error_ctx = NULL;
        }
 
        if (tevent_fd_get_flags(bsds->fde) == 0) {
@@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds,
 
        bsds->writeable_handler = handler;
        bsds->writeable_private = private_data;
+       bsds->error_ctx = ev;
 
        return 0;
 }
@@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data)
        }
        err = tsocket_bsd_error_from_errno(ret, errno, &retry);
        if (retry) {
-               /* retry later */
+               /*
+                * retry later...
+                *
+                * make sure we also wait readable again
+                * in order to notice errors early
+                */
+               TEVENT_FD_READABLE(bsds->fde);
+               TALLOC_FREE(bsds->error_timer);
                return;
        }
        if (err != 0) {
@@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data)
        }
 
        if (state->count > 0) {
-               /* we have more to read */
+               /*
+                * we have more to write
+                *
+                * make sure we also wait readable again
+                * in order to notice errors early
+                */
+               TEVENT_FD_READABLE(bsds->fde);
                return;
        }
 
@@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx,
                goto post;
        }
 
+       TALLOC_FREE(bsds->error_timer);
+       bsds->error_ctx = NULL;
        TALLOC_FREE(bsds->fde);
        ret = close(bsds->fd);
        bsds->fd = -1;
@@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = {
 
 static int tstream_bsd_destructor(struct tstream_bsd *bsds)
 {
+       TALLOC_FREE(bsds->error_timer);
+       bsds->error_ctx = NULL;
        TALLOC_FREE(bsds->fde);
        if (bsds->fd != -1) {
                close(bsds->fd);
diff --git a/selftest/knownfail.d/tstream b/selftest/knownfail.d/tstream
deleted file mode 100644 (file)
index 4d67b2c..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-^samba.unittests.tsocket_tstream.test_tstream_disconnected_tcp_client_spin
-^samba.unittests.tsocket_tstream.test_tstream_disconnected_unix_client_spin
-^samba.unittests.tsocket_tstream.test_tstream_more_tcp_client_spin
-^samba.unittests.tsocket_tstream.test_tstream_more_unix_client_spin
-^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_tcp_client_spin
-^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_unix_client_spin