Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
index a96d5b385d8fa25bab416d3f6bc836e743b3ca99..05eb7bc36156a3e571f728ad33d61b1a41a81a1c 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/seq_file.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/jhash.h>
 #include <net/net_namespace.h>
 #include <net/snmp.h>
 #include <net/ipv6.h>
@@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt)
 {
        if (!list_empty(&rt->rt6i_uncached)) {
                struct uncached_list *ul = rt->rt6i_uncached_list;
+               struct net *net = dev_net(rt->dst.dev);
 
                spin_lock_bh(&ul->lock);
                list_del(&rt->rt6i_uncached);
+               atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
                spin_unlock_bh(&ul->lock);
        }
 }
@@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        1, DST_OBSOLETE_FORCE_CHK, flags);
 
-       if (rt)
+       if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+       }
 
        return rt;
 }
@@ -369,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
 
        if (rt) {
                rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
-               if (rt->rt6i_pcpu) {
-                       int cpu;
-
-                       for_each_possible_cpu(cpu) {
-                               struct rt6_info **p;
-
-                               p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
-                               /* no one shares rt */
-                               *p =  NULL;
-                       }
-               } else {
+               if (!rt->rt6i_pcpu) {
                        dst_release_immediate(&rt->dst);
                        return NULL;
                }
@@ -392,6 +390,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
+       struct rt6_exception_bucket *bucket;
        struct dst_entry *from = dst->from;
        struct inet6_dev *idev;
 
@@ -404,6 +403,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
+       if (bucket) {
+               rt->rt6i_exception_bucket = NULL;
+               kfree(bucket);
+       }
 
        dst->from = NULL;
        dst_release(from);
@@ -478,7 +482,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 }
 
 /*
- *     Route lookup. Any table->tb6_lock is implied.
+ *     Route lookup. rcu_read_lock() should be held.
  */
 
 static inline struct rt6_info *rt6_device_match(struct net *net,
@@ -493,7 +497,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
        if (!oif && ipv6_addr_any(saddr))
                goto out;
 
-       for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+       for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
                struct net_device *dev = sprt->dst.dev;
 
                if (oif) {
@@ -702,6 +706,7 @@ out:
 }
 
 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+                                    struct rt6_info *leaf,
                                     struct rt6_info *rr_head,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
@@ -711,7 +716,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
        match = NULL;
        cont = NULL;
-       for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+       for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -720,7 +725,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }
 
-       for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+       for (rt = leaf; rt && rt != rr_head;
+            rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -732,37 +738,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
        if (match || !cont)
                return match;
 
-       for (rt = cont; rt; rt = rt->dst.rt6_next)
+       for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
        return match;
 }
 
-static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+                                  int oif, int strict)
 {
+       struct rt6_info *leaf = rcu_dereference(fn->leaf);
        struct rt6_info *match, *rt0;
-       struct net *net;
        bool do_rr = false;
+       int key_plen;
 
-       rt0 = fn->rr_ptr;
+       if (!leaf || leaf == net->ipv6.ip6_null_entry)
+               return net->ipv6.ip6_null_entry;
+
+       rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
-               fn->rr_ptr = rt0 = fn->leaf;
+               rt0 = leaf;
 
-       match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+       /* Double check to make sure fn is not an intermediate node
+        * and fn->leaf does not points to its child's leaf
+        * (This might happen if all routes under fn are deleted from
+        * the tree and fib6_repair_tree() is called on the node.)
+        */
+       key_plen = rt0->rt6i_dst.plen;
+#ifdef CONFIG_IPV6_SUBTREES
+       if (rt0->rt6i_src.plen)
+               key_plen = rt0->rt6i_src.plen;
+#endif
+       if (fn->fn_bit != key_plen)
+               return net->ipv6.ip6_null_entry;
+
+       match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
                             &do_rr);
 
        if (do_rr) {
-               struct rt6_info *next = rt0->dst.rt6_next;
+               struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
 
                /* no entries matched; do round-robin */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
-                       next = fn->leaf;
-
-               if (next != rt0)
-                       fn->rr_ptr = next;
+                       next = leaf;
+
+               if (next != rt0) {
+                       spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+                       /* make sure next is not being deleted from the tree */
+                       if (next->rt6i_node)
+                               rcu_assign_pointer(fn->rr_ptr, next);
+                       spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+               }
        }
 
-       net = dev_net(rt0->dst.dev);
        return match ? match : net->ipv6.ip6_null_entry;
 }
 
@@ -850,13 +878,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
 {
-       struct fib6_node *pn;
+       struct fib6_node *pn, *sn;
        while (1) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;
-               pn = fn->parent;
-               if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
-                       fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+               pn = rcu_dereference(fn->parent);
+               sn = FIB6_SUBTREE(pn);
+               if (sn && sn != fn)
+                       fn = fib6_lookup(sn, NULL, saddr);
                else
                        fn = pn;
                if (fn->fn_flags & RTN_RTINFO)
@@ -864,29 +893,59 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
        }
 }
 
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
+                         bool null_fallback)
+{
+       struct rt6_info *rt = *prt;
+
+       if (dst_hold_safe(&rt->dst))
+               return true;
+       if (null_fallback) {
+               rt = net->ipv6.ip6_null_entry;
+               dst_hold(&rt->dst);
+       } else {
+               rt = NULL;
+       }
+       *prt = rt;
+       return false;
+}
+
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
-       struct rt6_info *rt;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       rt = fn->leaf;
-       rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
-       if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-               rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
+       rt = rcu_dereference(fn->leaf);
+       if (!rt) {
+               rt = net->ipv6.ip6_null_entry;
+       } else {
+               rt = rt6_device_match(net, rt, &fl6->saddr,
+                                     fl6->flowi6_oif, flags);
+               if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+                       rt = rt6_multipath_select(rt, fl6,
+                                                 fl6->flowi6_oif, flags);
+       }
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }
-       dst_use(&rt->dst, jiffies);
-       read_unlock_bh(&table->tb6_lock);
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
+
+       if (ip6_hold_safe(net, &rt, true))
+               dst_use_noref(&rt->dst, jiffies);
+
+       rcu_read_unlock();
 
-       trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+       trace_fib6_table_lookup(net, rt, table, fl6);
 
        return rt;
 
@@ -938,9 +997,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
        struct fib6_table *table;
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
        return err;
 }
@@ -1038,7 +1097,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
        return pcpu_rt;
 }
 
-/* It should be called with read_lock_bh(&tb6_lock) acquired */
+/* It should be called with rcu_read_lock() acquired */
 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 {
        struct rt6_info *pcpu_rt, **p;
@@ -1046,16 +1105,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
        p = this_cpu_ptr(rt->rt6i_pcpu);
        pcpu_rt = *p;
 
-       if (pcpu_rt) {
-               dst_hold(&pcpu_rt->dst);
+       if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
                rt6_dst_from_metrics_check(pcpu_rt);
-       }
+
        return pcpu_rt;
 }
 
 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 {
-       struct fib6_table *table = rt->rt6i_table;
        struct rt6_info *pcpu_rt, *prev, **p;
 
        pcpu_rt = ip6_rt_pcpu_alloc(rt);
@@ -1066,36 +1123,526 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
                return net->ipv6.ip6_null_entry;
        }
 
-       read_lock_bh(&table->tb6_lock);
-       if (rt->rt6i_pcpu) {
-               p = this_cpu_ptr(rt->rt6i_pcpu);
-               prev = cmpxchg(p, NULL, pcpu_rt);
-               if (prev) {
-                       /* If someone did it before us, return prev instead */
-                       dst_release_immediate(&pcpu_rt->dst);
-                       pcpu_rt = prev;
-               }
-       } else {
-               /* rt has been removed from the fib6 tree
-                * before we have a chance to acquire the read_lock.
-                * In this case, don't brother to create a pcpu rt
-                * since rt is going away anyway.  The next
-                * dst_check() will trigger a re-lookup.
-                */
-               dst_release_immediate(&pcpu_rt->dst);
-               pcpu_rt = rt;
-       }
        dst_hold(&pcpu_rt->dst);
+       p = this_cpu_ptr(rt->rt6i_pcpu);
+       prev = cmpxchg(p, NULL, pcpu_rt);
+       BUG_ON(prev);
+
        rt6_dst_from_metrics_check(pcpu_rt);
-       read_unlock_bh(&table->tb6_lock);
        return pcpu_rt;
 }
 
+/* exception hash table implementation
+ */
+static DEFINE_SPINLOCK(rt6_exception_lock);
+
+/* Remove rt6_ex from hash table and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
+                                struct rt6_exception *rt6_ex)
+{
+       struct net *net;
+
+       if (!bucket || !rt6_ex)
+               return;
+
+       net = dev_net(rt6_ex->rt6i->dst.dev);
+       rt6_ex->rt6i->rt6i_node = NULL;
+       hlist_del_rcu(&rt6_ex->hlist);
+       rt6_release(rt6_ex->rt6i);
+       kfree_rcu(rt6_ex, rcu);
+       WARN_ON_ONCE(!bucket->depth);
+       bucket->depth--;
+       net->ipv6.rt6_stats->fib_rt_cache--;
+}
+
+/* Remove oldest rt6_ex in bucket and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
+{
+       struct rt6_exception *rt6_ex, *oldest = NULL;
+
+       if (!bucket)
+               return;
+
+       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+               if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
+                       oldest = rt6_ex;
+       }
+       rt6_remove_exception(bucket, oldest);
+}
+
+static u32 rt6_exception_hash(const struct in6_addr *dst,
+                             const struct in6_addr *src)
+{
+       static u32 seed __read_mostly;
+       u32 val;
+
+       net_get_random_once(&seed, sizeof(seed));
+       val = jhash(dst, sizeof(*dst), seed);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       if (src)
+               val = jhash(src, sizeof(*src), val);
+#endif
+       return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rt6_exception_lock
+ */
+static struct rt6_exception *
+__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
+                             const struct in6_addr *daddr,
+                             const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rcu_read_lock()
+ */
+static struct rt6_exception *
+__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
+                        const struct in6_addr *daddr,
+                        const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+static int rt6_insert_exception(struct rt6_info *nrt,
+                               struct rt6_info *ort)
+{
+       struct net *net = dev_net(ort->dst.dev);
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err = 0;
+
+       /* ort can't be a cache or pcpu route */
+       if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+               ort = (struct rt6_info *)ort->dst.from;
+       WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
+
+       spin_lock_bh(&rt6_exception_lock);
+
+       if (ort->exception_bucket_flushed) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+       if (!bucket) {
+               bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
+                                GFP_ATOMIC);
+               if (!bucket) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+       }
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates ort is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (ort->rt6i_src.plen)
+               src_key = &nrt->rt6i_src.addr;
+#endif
+
+       /* Update rt6i_prefsrc as it could be changed
+        * in rt6_remove_prefsrc()
+        */
+       nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+       /* rt6_mtu_change() might lower mtu on ort.
+        * Only insert this exception route if its mtu
+        * is less than ort's mtu value.
+        */
+       if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex)
+               rt6_remove_exception(bucket, rt6_ex);
+
+       rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
+       if (!rt6_ex) {
+               err = -ENOMEM;
+               goto out;
+       }
+       rt6_ex->rt6i = nrt;
+       rt6_ex->stamp = jiffies;
+       atomic_inc(&nrt->rt6i_ref);
+       nrt->rt6i_node = ort->rt6i_node;
+       hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
+       bucket->depth++;
+       net->ipv6.rt6_stats->fib_rt_cache++;
+
+       if (bucket->depth > FIB6_MAX_DEPTH)
+               rt6_exception_remove_oldest(bucket);
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+
+       /* Update fn->fn_sernum to invalidate all cached dst */
+       if (!err) {
+               fib6_update_sernum(ort);
+               fib6_force_start_gc(net);
+       }
+
+       return err;
+}
+
+void rt6_flush_exceptions(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       spin_lock_bh(&rt6_exception_lock);
+       /* Prevent rt6_insert_exception() from recreating the bucket list */
+       rt->exception_bucket_flushed = 1;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+       if (!bucket)
+               goto out;
+
+       for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+               hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
+                       rt6_remove_exception(bucket, rt6_ex);
+               WARN_ON_ONCE(bucket->depth);
+               bucket++;
+       }
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+/* Find cached rt in the hash table inside passed in rt
+ * Caller has to hold rcu_read_lock()
+ */
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr)
+{
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       struct rt6_info *res = NULL;
+
+       bucket = rcu_dereference(rt->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates rt is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (rt->rt6i_src.plen)
+               src_key = saddr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+
+       if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+               res = rt6_ex->rt6i;
+
+       return res;
+}
+
+/* Remove the passed in cached rt from the hash table that contains it */
+int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return -EINVAL;
+
+       if (!rcu_access_pointer(from->rt6i_exception_bucket))
+               return -ENOENT;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_spinlock(&bucket,
+                                              &rt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex) {
+               rt6_remove_exception(bucket, rt6_ex);
+               err = 0;
+       } else {
+               err = -ENOENT;
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+       return err;
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return;
+
+       rcu_read_lock();
+       bucket = rcu_dereference(from->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket,
+                                         &rt->rt6i_dst.addr,
+                                         src_key);
+       if (rt6_ex)
+               rt6_ex->stamp = jiffies;
+
+       rcu_read_unlock();
+}
+
+static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+                               /* For RTF_CACHE with rt6i_pmtu == 0
+                                * (i.e. a redirected route),
+                                * the metrics of its rt->dst.from has already
+                                * been updated.
+                                */
+                               if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
+                                       entry->rt6i_pmtu = mtu;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
+
+static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+                                       struct in6_addr *gateway)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                    lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+
+                               if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
+                                   RTF_CACHE_GATEWAY &&
+                                   ipv6_addr_equal(gateway,
+                                                   &entry->rt6i_gateway)) {
+                                       rt6_remove_exception(bucket, rt6_ex);
+                               }
+                       }
+                       bucket++;
+               }
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
+                                     struct rt6_exception *rt6_ex,
+                                     struct fib6_gc_args *gc_args,
+                                     unsigned long now)
+{
+       struct rt6_info *rt = rt6_ex->rt6i;
+
+       /* we are pruning and obsoleting aged-out and non-gateway exceptions
+        * even if others have still references to them, so that on next
+        * dst_check() such references can be dropped.
+        * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
+        * expired, independently from their aging, as per RFC 8201 section 4
+        */
+       if (!(rt->rt6i_flags & RTF_EXPIRES) &&
+           time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+               RT6_TRACE("aging clone %p\n", rt);
+               rt6_remove_exception(bucket, rt6_ex);
+               return;
+       } else if (rt->rt6i_flags & RTF_GATEWAY) {
+               struct neighbour *neigh;
+               __u8 neigh_flags = 0;
+
+               neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
+               if (neigh) {
+                       neigh_flags = neigh->flags;
+                       neigh_release(neigh);
+               }
+               if (!(neigh_flags & NTF_ROUTER)) {
+                       RT6_TRACE("purging route %p via non-router but gateway\n",
+                                 rt);
+                       rt6_remove_exception(bucket, rt6_ex);
+                       return;
+               }
+       } else if (__rt6_check_expired(rt)) {
+               RT6_TRACE("purging expired route %p\n", rt);
+               rt6_remove_exception(bucket, rt6_ex);
+               return;
+       }
+       gc_args->more++;
+}
+
+void rt6_age_exceptions(struct rt6_info *rt,
+                       struct fib6_gc_args *gc_args,
+                       unsigned long now)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               rt6_age_examine_exception(bucket, rt6_ex,
+                                                         gc_args, now);
+                       }
+                       bucket++;
+               }
+       }
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6, int flags)
 {
        struct fib6_node *fn, *saved_fn;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        int strict = 0;
 
        strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -1103,7 +1650,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;
@@ -1112,7 +1659,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                oif = 0;
 
 redo_rt6_select:
-       rt = rt6_select(fn, oif, strict);
+       rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
                rt = rt6_multipath_select(rt, fl6, oif, strict);
        if (rt == net->ipv6.ip6_null_entry) {
@@ -1127,14 +1674,23 @@ redo_rt6_select:
                }
        }
 
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
 
-       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
-
-               rt6_dst_from_metrics_check(rt);
-
-               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+       if (rt == net->ipv6.ip6_null_entry) {
+               rcu_read_unlock();
+               dst_hold(&rt->dst);
+               trace_fib6_table_lookup(net, rt, table, fl6);
+               return rt;
+       } else if (rt->rt6i_flags & RTF_CACHE) {
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+                       rt6_dst_from_metrics_check(rt);
+               }
+               rcu_read_unlock();
+               trace_fib6_table_lookup(net, rt, table, fl6);
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(rt->rt6i_flags & RTF_GATEWAY))) {
@@ -1146,8 +1702,14 @@ redo_rt6_select:
 
                struct rt6_info *uncached_rt;
 
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+               } else {
+                       rcu_read_unlock();
+                       uncached_rt = rt;
+                       goto uncached_rt_out;
+               }
+               rcu_read_unlock();
 
                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
                dst_release(&rt->dst);
@@ -1157,12 +1719,14 @@ redo_rt6_select:
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
+                       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }
 
-               trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
+uncached_rt_out:
+               trace_fib6_table_lookup(net, uncached_rt, table, fl6);
                return uncached_rt;
 
        } else {
@@ -1170,26 +1734,28 @@ redo_rt6_select:
 
                struct rt6_info *pcpu_rt;
 
-               rt->dst.lastuse = jiffies;
-               rt->dst.__use++;
+               dst_use_noref(&rt->dst, jiffies);
+               local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(rt);
 
-               if (pcpu_rt) {
-                       read_unlock_bh(&table->tb6_lock);
-               } else {
-                       /* We have to do the read_unlock first
-                        * because rt6_make_pcpu_route() may trigger
-                        * ip6_dst_gc() which will take the write_lock.
-                        */
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       pcpu_rt = rt6_make_pcpu_route(rt);
-                       dst_release(&rt->dst);
+               if (!pcpu_rt) {
+                       /* atomic_inc_not_zero() is needed when using RCU */
+                       if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+                               /* No dst_hold() on rt is needed because grabbing
+                                * rt->rt6i_ref makes sure rt can't be released.
+                                */
+                               pcpu_rt = rt6_make_pcpu_route(rt);
+                               rt6_release(rt);
+                       } else {
+                               /* rt is already removed from tree */
+                               pcpu_rt = net->ipv6.ip6_null_entry;
+                               dst_hold(&pcpu_rt->dst);
+                       }
                }
-
-               trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
+               local_bh_enable();
+               rcu_read_unlock();
+               trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
                return pcpu_rt;
-
        }
 }
 EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -1328,6 +1894,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 
                new = &rt->dst;
                new->__use = 1;
@@ -1491,23 +2058,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 
        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
+               /* update rt6_ex->stamp for cache */
+               if (rt6->rt6i_flags & RTF_CACHE)
+                       rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct rt6_info *nrt6;
 
                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
-
-                       /* ip6_ins_rt(nrt6) will bump the
-                        * rt6->rt6i_node->fn_sernum
-                        * which will fail the next rt6_check() and
-                        * invalidate the sk->sk_dst_cache.
-                        */
-                       ip6_ins_rt(nrt6);
-                       /* Release the reference taken in
-                        * ip6_rt_cache_alloc()
-                        */
-                       dst_release(&nrt6->dst);
+                       if (rt6_insert_exception(nrt6, rt6))
+                               dst_release_immediate(&nrt6->dst);
                }
        }
 }
@@ -1571,7 +2132,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             int flags)
 {
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
 
        /* Get the "current" route for this destination and
@@ -1584,10 +2145,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
         * routes.
         */
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt6_check_expired(rt))
                        continue;
                if (rt->dst.error)
@@ -1596,8 +2157,23 @@ restart:
                        continue;
                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
                        continue;
-               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+               /* rt_cache's gateway might be different from its 'parent'
+                * in the case of an ip redirect.
+                * So we keep searching in the exception table if the gateway
+                * is different.
+                */
+               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+                       rt_cache = rt6_find_cached_rt(rt,
+                                                     &fl6->daddr,
+                                                     &fl6->saddr);
+                       if (rt_cache &&
+                           ipv6_addr_equal(&rdfl->gateway,
+                                           &rt_cache->rt6i_gateway)) {
+                               rt = rt_cache;
+                               break;
+                       }
                        continue;
+               }
                break;
        }
 
@@ -1615,11 +2191,11 @@ restart:
        }
 
 out:
-       dst_hold(&rt->dst);
+       ip6_hold_safe(net, &rt, true);
 
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
-       trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+       trace_fib6_table_lookup(net, rt, table, fl6);
        return rt;
 };
 
@@ -1766,6 +2342,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
+       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
 
        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
 
@@ -1801,6 +2378,7 @@ out:
 static int ip6_convert_metrics(struct mx6_config *mxc,
                               const struct fib6_config *cfg)
 {
+       struct net *net = cfg->fc_nlinfo.nl_net;
        bool ecn_ca = false;
        struct nlattr *nla;
        int remaining;
@@ -1826,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
                        char tmp[TCP_CA_NAME_MAX];
 
                        nla_strlcpy(tmp, nla, sizeof(tmp));
-                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
                        if (val == TCP_CA_UNSPEC)
                                goto err;
                } else {
@@ -1901,6 +2479,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
                goto out;
        }
 
+       /* RTF_CACHE is an internal flag; can not be set by userspace */
+       if (cfg->fc_flags & RTF_CACHE) {
+               NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
+               goto out;
+       }
+
        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
@@ -2216,9 +2800,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
        }
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_del(rt, info);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
 out:
        ip6_rt_put(rt);
@@ -2244,7 +2828,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
        if (rt == net->ipv6.ip6_null_entry)
                goto out_put;
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
 
        if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
                struct rt6_info *sibling, *next_sibling;
@@ -2274,7 +2858,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 
        err = fib6_del(rt, info);
 out_unlock:
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 out_put:
        ip6_rt_put(rt);
 
@@ -2288,9 +2872,9 @@ out_put:
 static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_table *table;
        struct fib6_node *fn;
-       struct rt6_info *rt;
        int err = -ESRCH;
 
        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
@@ -2299,17 +2883,22 @@ static int ip6_route_del(struct fib6_config *cfg,
                return err;
        }
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
-                        &cfg->fc_src, cfg->fc_src_len);
+                        &cfg->fc_src, cfg->fc_src_len,
+                        !(cfg->fc_flags & RTF_CACHE));
 
        if (fn) {
-               for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
-                       if ((rt->rt6i_flags & RTF_CACHE) &&
-                           !(cfg->fc_flags & RTF_CACHE))
-                               continue;
+               for_each_fib6_node_rt_rcu(fn) {
+                       if (cfg->fc_flags & RTF_CACHE) {
+                               rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
+                                                             &cfg->fc_src);
+                               if (!rt_cache)
+                                       continue;
+                               rt = rt_cache;
+                       }
                        if (cfg->fc_ifindex &&
                            (!rt->dst.dev ||
                             rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -2321,8 +2910,9 @@ static int ip6_route_del(struct fib6_config *cfg,
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
                                continue;
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
+                       if (!dst_hold_safe(&rt->dst))
+                               break;
+                       rcu_read_unlock();
 
                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
@@ -2331,7 +2921,7 @@ static int ip6_route_del(struct fib6_config *cfg,
                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        return err;
 }
@@ -2435,8 +3025,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        nrt->rt6i_protocol = RTPROT_REDIRECT;
        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
 
-       if (ip6_ins_rt(nrt))
-               goto out_release;
+       /* No need to remove rt from the exception table if rt is
+        * a cached route because rt6_insert_exception() will
+        * take care of it
+        */
+       if (rt6_insert_exception(nrt, rt)) {
+               dst_release_immediate(&nrt->dst);
+               goto out;
+       }
 
        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
@@ -2444,17 +3040,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
-       if (rt->rt6i_flags & RTF_CACHE) {
-               rt = (struct rt6_info *) dst_clone(&rt->dst);
-               ip6_del_rt(rt);
-       }
-
-out_release:
-       /* Release the reference taken in
-        * ip6_rt_cache_alloc()
-        */
-       dst_release(&nrt->dst);
-
 out:
        neigh_release(neigh);
 }
@@ -2511,23 +3096,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
+       rcu_read_lock();
+       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (!fn)
                goto out;
 
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt->dst.dev->ifindex != ifindex)
                        continue;
                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
                        continue;
-               dst_hold(&rt->dst);
+               ip6_hold_safe(NULL, &rt, false);
                break;
        }
 out:
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2573,16 +3158,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (dev == rt->dst.dev &&
                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
                        break;
        }
        if (rt)
-               dst_hold(&rt->dst);
-       read_unlock_bh(&table->tb6_lock);
+               ip6_hold_safe(NULL, &rt, false);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2620,17 +3205,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table)
        struct rt6_info *rt;
 
 restart:
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
                    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       ip6_del_rt(rt);
+                       if (dst_hold_safe(&rt->dst)) {
+                               rcu_read_unlock();
+                               ip6_del_rt(rt);
+                       } else {
+                               rcu_read_unlock();
+                       }
                        goto restart;
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
 }
@@ -2818,8 +3406,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
        if (((void *)rt->dst.dev == dev || !dev) &&
            rt != net->ipv6.ip6_null_entry &&
            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+               spin_lock_bh(&rt6_exception_lock);
                /* remove prefsrc entry */
                rt->rt6i_prefsrc.plen = 0;
+               /* need to update cache as well */
+               rt6_exceptions_remove_prefsrc(rt);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -2836,18 +3428,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 }
 
 #define RTF_RA_ROUTER          (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
-#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
 
 /* Remove routers and update dst entries when gateway turn into host. */
 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
 {
        struct in6_addr *gateway = (struct in6_addr *)arg;
 
-       if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
-            ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
-            ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+       if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+           ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
                return -1;
        }
+
+       /* Further clean up cached routes in exception table.
+        * This is needed because cached route may have a different
+        * gateway than its 'parent' in the case of an ip redirect.
+        */
+       rt6_exceptions_clean_tohost(rt, gateway);
+
        return 0;
 }
 
@@ -2926,19 +3523,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
-               if (rt->rt6i_flags & RTF_CACHE) {
-                       /* For RTF_CACHE with rt6i_pmtu == 0
-                        * (i.e. a redirected route),
-                        * the metrics of its rt->dst.from has already
-                        * been updated.
-                        */
-                       if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
-                               rt->rt6i_pmtu = arg->mtu;
-               } else if (dst_mtu(&rt->dst) >= arg->mtu ||
-                          (dst_mtu(&rt->dst) < arg->mtu &&
-                           dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+               spin_lock_bh(&rt6_exception_lock);
+               if (dst_mtu(&rt->dst) >= arg->mtu ||
+                   (dst_mtu(&rt->dst) < arg->mtu &&
+                    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
+               rt6_exceptions_update_pmtu(rt, arg->mtu);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -3839,7 +4431,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
-                  net->ipv6.rt6_stats->fib_rt_alloc,
+                  atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),