Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
[sfrench/cifs-2.6.git] / net / ipv4 / route.c
index 75fb8864be67fdb743054173d03b46aff1758500..bf4e4adc2d0002c3acc016bfa7e128a477150b33 100644 (file)
@@ -1341,6 +1341,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
        return NULL;
 }
 
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+       struct fib_info *fi = res->fi;
+       struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
+       struct net_device *dev = nh->nh_dev;
+       u32 mtu = 0;
+
+       if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+           fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+               mtu = fi->fib_mtu;
+
+       if (likely(!mtu)) {
+               struct fib_nh_exception *fnhe;
+
+               fnhe = find_exception(nh, daddr);
+               if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+                       mtu = fnhe->fnhe_pmtu;
+       }
+
+       if (likely(!mtu))
+               mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+       return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+}
+
 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
 {
@@ -2563,11 +2594,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 /* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
-                       struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-                       u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+                       struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+                       struct sk_buff *skb, u32 portid, u32 seq)
 {
-       struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
@@ -2663,7 +2693,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
                        }
                } else
 #endif
-                       if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+                       if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
                                goto nla_put_failure;
        }
 
@@ -2678,43 +2708,93 @@ nla_put_failure:
        return -EMSGSIZE;
 }
 
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+                                                  u8 ip_proto, __be16 sport,
+                                                  __be16 dport)
+{
+       struct sk_buff *skb;
+       struct iphdr *iph;
+
+       skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       if (!skb)
+               return NULL;
+
+       /* Reserve room for dummy headers, this skb can pass
+        * through good chunk of routing engine.
+        */
+       skb_reset_mac_header(skb);
+       skb_reset_network_header(skb);
+       skb->protocol = htons(ETH_P_IP);
+       iph = skb_put(skb, sizeof(struct iphdr));
+       iph->protocol = ip_proto;
+       iph->saddr = src;
+       iph->daddr = dst;
+       iph->version = 0x4;
+       iph->frag_off = 0;
+       iph->ihl = 0x5;
+       skb_set_transport_header(skb, skb->len);
+
+       switch (iph->protocol) {
+       case IPPROTO_UDP: {
+               struct udphdr *udph;
+
+               udph = skb_put_zero(skb, sizeof(struct udphdr));
+               udph->source = sport;
+               udph->dest = dport;
+               udph->len = sizeof(struct udphdr);
+               udph->check = 0;
+               break;
+       }
+       case IPPROTO_TCP: {
+               struct tcphdr *tcph;
+
+               tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+               tcph->source    = sport;
+               tcph->dest      = dport;
+               tcph->doff      = sizeof(struct tcphdr) / 4;
+               tcph->rst = 1;
+               tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+                                           src, dst, 0);
+               break;
+       }
+       case IPPROTO_ICMP: {
+               struct icmphdr *icmph;
+
+               icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+               icmph->type = ICMP_ECHO;
+               icmph->code = 0;
+       }
+       }
+
+       return skb;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
 {
        struct net *net = sock_net(in_skb->sk);
-       struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
+       u32 table_id = RT_TABLE_MAIN;
+       __be16 sport = 0, dport = 0;
        struct fib_result res = {};
+       u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
+       struct sk_buff *skb;
+       struct rtmsg *rtm;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
+       kuid_t uid;
        u32 iif;
        int err;
        int mark;
-       struct sk_buff *skb;
-       u32 table_id = RT_TABLE_MAIN;
-       kuid_t uid;
 
        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
                          extack);
        if (err < 0)
-               goto errout;
+               return err;
 
        rtm = nlmsg_data(nlh);
-
-       skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-       if (!skb) {
-               err = -ENOBUFS;
-               goto errout;
-       }
-
-       /* Reserve room for dummy headers, this skb can pass
-          through good chunk of routing engine.
-        */
-       skb_reset_mac_header(skb);
-       skb_reset_network_header(skb);
-
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2724,14 +2804,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
        else
                uid = (iif ? INVALID_UID : current_uid());
 
-       /* Bugfix: need to give ip_route_input enough of an IP header to
-        * not gag.
-        */
-       ip_hdr(skb)->protocol = IPPROTO_UDP;
-       ip_hdr(skb)->saddr = src;
-       ip_hdr(skb)->daddr = dst;
+       if (tb[RTA_IP_PROTO]) {
+               err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+                                                 &ip_proto, extack);
+               if (err)
+                       return err;
+       }
 
-       skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+       if (tb[RTA_SPORT])
+               sport = nla_get_be16(tb[RTA_SPORT]);
+
+       if (tb[RTA_DPORT])
+               dport = nla_get_be16(tb[RTA_DPORT]);
+
+       skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+       if (!skb)
+               return -ENOBUFS;
 
        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
@@ -2740,6 +2828,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
+       if (sport)
+               fl4.fl4_sport = sport;
+       if (dport)
+               fl4.fl4_dport = dport;
+       fl4.flowi4_proto = ip_proto;
 
        rcu_read_lock();
 
@@ -2749,10 +2842,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
-                       goto errout_free;
+                       goto errout_rcu;
                }
 
-               skb->protocol   = htons(ETH_P_IP);
+               fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2772,7 +2865,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
        }
 
        if (err)
-               goto errout_free;
+               goto errout_rcu;
 
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;
@@ -2780,34 +2873,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;
 
+       /* reset skb for netlink reply msg */
+       skb_trim(skb, 0);
+       skb_reset_network_header(skb);
+       skb_reset_transport_header(skb);
+       skb_reset_mac_header(skb);
+
        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
-                       goto errout_free;
+                       goto errout_rcu;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
-               err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+               err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
-               goto errout_free;
+               goto errout_rcu;
 
        rcu_read_unlock();
 
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-       return err;
 
 errout_free:
+       return err;
+errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
-       goto errout;
+       goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)