net/tcp_fastopen: Disable active side TFO in certain scenarios
authorWei Wang <weiwan@google.com>
Thu, 20 Apr 2017 21:45:46 +0000 (14:45 -0700)
committerDavid S. Miller <davem@davemloft.net>
Mon, 24 Apr 2017 18:27:17 +0000 (14:27 -0400)
Middlebox firewall issues can potentially cause the server's data to be
blackholed after a successful 3WHS using TFO. The following are the related
reports from Apple:
https://www.nanog.org/sites/default/files/Paasch_Network_Support.pdf
Slide 31 identifies an issue where the client ACK to the server's data
sent during a TFO'd handshake is dropped.
C ---> syn-data ---> S
C <--- syn/ack ----- S
C (accept & write)
C <---- data ------- S
C ----- ACK -> X     S
[retry and timeout]

https://www.ietf.org/proceedings/94/slides/slides-94-tcpm-13.pdf
Slide 5 shows a similar situation that the server's data gets dropped
after 3WHS.
C ---- syn-data ---> S
C <--- syn/ack ----- S
C ---- ack --------> S
S (accept & write)
C?  X <- data ------ S
[retry and timeout]

This is the worst failure b/c the client can not detect such behavior to
mitigate the situation (such as disabling TFO). Failing to proceed, the
application (e.g., SSL library) may simply timeout and retry with TFO
again, and the process repeats indefinitely.

The proposed solution is to disable active TFO globally under the
following circumstances:
1. client side TFO socket detects out of order FIN
2. client side TFO socket receives out of order RST

We disable active side TFO globally for 1hr at first. Then if it
happens again, we disable it for 2h, then 4h, 8h, ...
We reset the timeout to 1hr if a client side TFO socket not opened
on loopback has successfully received data segments from the server.
We examine this condition during close().

The rationale behind it is that when such a firewall issue happens,
application running on the client should eventually close the socket as
it is not able to get the data it is expecting. Or application running
on the server should close the socket as it is not able to receive any
response from client.
In both cases, out of order FIN or RST will get received on the client
given that the firewall will not block them as no data are in those
frames.
And we want to disable active TFO globally as it helps if the middle box
is very close to the client and most of the connections are likely to
fail.

Also, add a debug sysctl:
  tcp_fastopen_blackhole_timeout_sec:
    the initial timeout to use when a firewall blackhole issue happens.
    This can be set and read.
    Setting it to 0 disables the active disable logic.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/linux/tcp.h
include/net/tcp.h
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp.c
net/ipv4/tcp_fastopen.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c

index b1c6500e7a8df4d7377b291e9afc09363e66cd17..974ab47ae53a81c27b2b57533db813288139fd7b 100644 (file)
@@ -602,6 +602,14 @@ tcp_fastopen - INTEGER
        Note that that additional client or server features are only
        effective if the basic support (0x1 and 0x2) are enabled respectively.
 
+tcp_fastopen_blackhole_timeout_sec - INTEGER
+       Initial time period in seconds to disable Fastopen on active TCP sockets
+       when a TFO firewall blackhole issue happens.
+       This time period will grow exponentially when more blackhole issues
+       get detected right after Fastopen is re-enabled and will reset to
+       initial value when the blackhole issue goes away.
+       By default, it is set to 1hr.
+
 tcp_syn_retries - INTEGER
        Number of times initial SYNs for an active TCP connection attempt
        will be retransmitted. Should not be higher than 127. Default value
index cfc2d9506ce8077af1ec92eb7086fd52ce4fe1ac..cbe5b602a2d349fdeb1e878305f37b4da1e6cc86 100644 (file)
@@ -233,6 +233,7 @@ struct tcp_sock {
        u8      syn_data:1,     /* SYN includes data */
                syn_fastopen:1, /* SYN includes Fast Open option */
                syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
+               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
                save_syn:1,     /* Save headers of SYN packet */
                is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
index cc6ae0a95201f0adc52c2c46b429566806da6745..c1abc2abbdcbf2d13f7fb6e0b6d8fae719aceddf 100644 (file)
@@ -1506,6 +1506,12 @@ struct tcp_fastopen_context {
        struct rcu_head         rcu;
 };
 
+extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
+void tcp_fastopen_active_disable(void);
+bool tcp_fastopen_active_should_disable(struct sock *sk);
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
+void tcp_fastopen_active_timeout_reset(void);
+
 /* Latencies incurred by various limits for a sender. They are
  * chronograph-like stats that are mutually exclusive.
  */
index ddac9e64b7022452202cdb0697cbfee82ed1727b..86957e9cd6c6748ac00aa0307154bb131c43f1da 100644 (file)
@@ -350,6 +350,19 @@ static int proc_udp_early_demux(struct ctl_table *table, int write,
        return ret;
 }
 
+static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
+                                            int write,
+                                            void __user *buffer,
+                                            size_t *lenp, loff_t *ppos)
+{ /* proc handler behind tcp_fastopen_blackhole_timeout_sec */
+       int ret; /* status of proc_dointvec_minmax(), returned as-is */
+
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (write && ret == 0) /* successful write: zero the disable count */
+               tcp_fastopen_active_timeout_reset();
+       return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
        {
                .procname       = "tcp_timestamps",
@@ -399,6 +412,14 @@ static struct ctl_table ipv4_table[] = {
                .maxlen         = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
                .proc_handler   = proc_tcp_fastopen_key,
        },
+       {
+               .procname       = "tcp_fastopen_blackhole_timeout_sec",
+               .data           = &sysctl_tcp_fastopen_blackhole_timeout,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_tfo_blackhole_detect_timeout,
+               .extra1         = &zero,
+       },
        {
                .procname       = "tcp_abort_on_overflow",
                .data           = &sysctl_tcp_abort_on_overflow,
index 04843ae77b9ecacb3e4f2e81096f11d35ae1915e..efc976ae66ae5b82d496323634c3030fb71c6c92 100644 (file)
@@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags)
        tcp_clear_xmit_timers(sk);
        __skb_queue_purge(&sk->sk_receive_queue);
        tcp_write_queue_purge(sk);
+       tcp_fastopen_active_disable_ofo_check(sk);
        skb_rbtree_purge(&tp->out_of_order_queue);
 
        inet->inet_dport = 0;
index 8ea4e9787f82ba65cd07b4c2b663df76fe4eb143..ff2d30ffc6f3e0b2d817deff8d93b07ebba9044e 100644 (file)
@@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
                cookie->len = -1;
                return false;
        }
+
+       /* Firewall blackhole issue check */
+       if (tcp_fastopen_active_should_disable(sk)) {
+               cookie->len = -1;
+               return false;
+       }
+
        if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
                cookie->len = -1;
                return true;
@@ -380,3 +387,97 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
        return false;
 }
 EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+
+/*
+ * The following code block is to deal with middle box issues with TFO:
+ * Middlebox firewall issues can potentially cause server's data being
+ * blackholed after a successful 3WHS using TFO.
+ * The proposed solution is to disable active TFO globally under the
+ * following circumstances:
+ *   1. client side TFO socket receives out of order FIN
+ *   2. client side TFO socket receives out of order RST
+ * We disable active side TFO globally for 1hr at first. Then if it
+ * happens again, we disable it for 2h, then 4h, 8h, ...
+ * And we reset the timeout back to 1hr when we see a successful active
+ * TFO connection with data exchanges.
+ */
+
+/* Initial blackhole disable period in seconds; defaults to 1hr */
+unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
+static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
+static unsigned long tfo_active_disable_stamp __read_mostly;
+
+/* Disable active TFO: count one more blackhole event and stamp jiffies,
+ * which (re)starts the period checked by tcp_fastopen_active_should_disable()
+ */
+void tcp_fastopen_active_disable(void)
+{
+       atomic_inc(&tfo_active_disable_times);
+       tfo_active_disable_stamp = jiffies;
+}
+
+/* Reset tfo_active_disable_times to 0, which ends any disable period */
+void tcp_fastopen_active_timeout_reset(void)
+{
+       atomic_set(&tfo_active_disable_times, 0);
+}
+
+/* Return true while jiffies is within the active TFO disable period
+ * (initial timeout * 2^min(times - 1, 6) seconds past the last disable);
+ * otherwise return false, marking sk as a probe if TFO had been disabled.
+ */
+bool tcp_fastopen_active_should_disable(struct sock *sk)
+{
+       int tfo_da_times = atomic_read(&tfo_active_disable_times);
+       int multiplier;
+       unsigned long timeout; /* NOTE(review): confirm no 32-bit wrap */
+
+       if (!tfo_da_times)
+               return false;
+
+       /* Limit timeout to max: 2^6 * initial timeout */
+       multiplier = 1 << min(tfo_da_times - 1, 6);
+       timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
+       if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+               return true;
+
+       /* Mark check bit so we can check for successful active TFO
+        * condition and reset tfo_active_disable_times
+        */
+       tcp_sk(sk)->syn_fastopen_ch = 1;
+       return false;
+}
+
+/* For a socket whose SYN carried the Fast Open option: disable active TFO
+ * if no data segment was received and a FIN is the only skb in the ofo queue.
+ * Otherwise, if data arrived on a socket marked as a re-enabling probe
+ * (syn_fastopen_ch) and the disable count is non-zero, reset that count
+ * unless the socket's dst device is loopback.
+ */
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct rb_node *p;
+       struct sk_buff *skb;
+       struct dst_entry *dst;
+
+       if (!tp->syn_fastopen)
+               return;
+
+       if (!tp->data_segs_in) {
+               p = rb_first(&tp->out_of_order_queue);
+               if (p && !rb_next(p)) {
+                       skb = rb_entry(p, struct sk_buff, rbnode);
+                       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+                               tcp_fastopen_active_disable();
+                               return;
+                       }
+               }
+       } else if (tp->syn_fastopen_ch &&
+                  atomic_read(&tfo_active_disable_times)) {
+               dst = sk_dst_get(sk);
+               if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+                       tcp_fastopen_active_timeout_reset();
+               dst_release(dst);
+       }
+}
index 341f021f02a2931cd75b2e1e71af9729fc4c7895..9f342a67dc741d2fffe45c123b31b4af9ae39e12 100644 (file)
@@ -5300,8 +5300,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
                if (rst_seq_match)
                        tcp_reset(sk);
-               else
+               else {
+                       /* Disable TFO if RST is out-of-order
+                        * and no data has been received
+                        * for current active TFO socket
+                        */
+                       if (tp->syn_fastopen && !tp->data_segs_in &&
+                           sk->sk_state == TCP_ESTABLISHED)
+                               tcp_fastopen_active_disable();
                        tcp_send_challenge_ack(sk, skb);
+               }
                goto discard;
        }
 
@@ -6044,9 +6052,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                        break;
                }
 
-               if (tp->linger2 < 0 ||
-                   (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
-                    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+               if (tp->linger2 < 0) {
+                       tcp_done(sk);
+                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+                       return 1;
+               }
+               if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+                   after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+                       /* Receive out of order FIN after close() */
+                       if (tp->syn_fastopen && th->fin)
+                               tcp_fastopen_active_disable();
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return 1;
index 20cbd2f07f281717c1cb4e901c4c4e22f7c46bd6..cbbafe546c0f5c5f43531eaf24f5b460264785c6 100644 (file)
@@ -1855,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
        /* Cleanup up the write buffer. */
        tcp_write_queue_purge(sk);
 
+       /* Check if we want to disable active TFO */
+       tcp_fastopen_active_disable_ofo_check(sk);
+
        /* Cleans up our, hopefully empty, out_of_order_queue. */
        skb_rbtree_purge(&tp->out_of_order_queue);