Merge tag 'ipvs-for-v4.18' of http://git.kernel.org/pub/scm/linux/kernel/git/horms...
authorPablo Neira Ayuso <pablo@netfilter.org>
Thu, 26 Apr 2018 22:16:14 +0000 (00:16 +0200)
committerPablo Neira Ayuso <pablo@netfilter.org>
Thu, 26 Apr 2018 22:16:14 +0000 (00:16 +0200)
Simon Horman says:

====================
IPVS Updates for v4.18

please consider these IPVS enhancements for v4.18.

* Whitepace cleanup

* Add Maglev hashing algorithm as a IPVS scheduler

  Inju Song says "Implements the Google's Maglev hashing algorithm as a
  IPVS scheduler.  Basically it provides consistent hashing but offers some
  special features about disruption and load balancing.

  1) minimal disruption: when the set of destinations changes,
     a connection will likely be sent to the same destination
     as it was before.

  2) load balancing: each destination will receive an almost
     equal number of connections.

 Seel also: [3.4 Consistent Hasing] in
 https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 "

* Fix to correct implementation of Knuth's multiplicative hashing
  which is used in sh/dh/lblc/lblcr algorithms. Instead the
  implementation provided by the hash_32() macro is used.
====================

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
include/net/ip_vs.h
net/netfilter/ipvs/Kconfig
net/netfilter/ipvs/Makefile
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/ipvs/ip_vs_dh.c
net/netfilter/ipvs/ip_vs_lblc.c
net/netfilter/ipvs/ip_vs_lblcr.c
net/netfilter/ipvs/ip_vs_mh.c [new file with mode: 0644]
net/netfilter/ipvs/ip_vs_proto_tcp.c
net/netfilter/ipvs/ip_vs_sh.c

index eb0bec043c9618b6f44a5bcb3d2138056672c851..0ac795b41ab80029ba4149f70933994dd2db64e2 100644 (file)
@@ -668,6 +668,7 @@ struct ip_vs_dest {
        volatile unsigned int   flags;          /* dest status flags */
        atomic_t                conn_flags;     /* flags to copy to conn */
        atomic_t                weight;         /* server weight */
+       atomic_t                last_weight;    /* server latest weight */
 
        refcount_t              refcnt;         /* reference counter */
        struct ip_vs_stats      stats;          /* statistics */
index b32fb0dbe237dcffb0a979b8c270c18d9f56e449..05dc1b77e466f4d556037b8bb6464c151ccb89a6 100644 (file)
@@ -225,6 +225,25 @@ config     IP_VS_SH
          If you want to compile it in kernel, say Y. To compile it as a
          module, choose M here. If unsure, say N.
 
+config IP_VS_MH
+       tristate "maglev hashing scheduling"
+       ---help---
+         The maglev consistent hashing scheduling algorithm provides the
+         Google's Maglev hashing algorithm as a IPVS scheduler. It assigns
+         network connections to the servers through looking up a statically
+         assigned special hash table called the lookup table. Maglev hashing
+         is to assign a preference list of all the lookup table positions
+         to each destination.
+
+         Through this operation, The maglev hashing gives an almost equal
+         share of the lookup table to each of the destinations and provides
+         minimal disruption by using the lookup table. When the set of
+         destinations changes, a connection will likely be sent to the same
+         destination as it was before.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
 config IP_VS_SED
        tristate "shortest expected delay scheduling"
        ---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
          needs to be large enough to effectively fit all the destinations
          multiplied by their respective weights.
 
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+       int "IPVS maglev hashing table index of size (the prime numbers)"
+       range 8 17
+       default 12
+       ---help---
+         The maglev hashing scheduler maps source IPs to destinations
+         stored in a hash table. This table is assigned by a preference
+         list of the positions to each destination until all slots in
+         the table are filled. The index determines the prime for size of
+         the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+         65521 or 131071. When using weights to allow destinations to
+         receive more connections, the table is assigned an amount
+         proportional to the weights specified. The table needs to be large
+         enough to effectively fit all the destinations multiplied by their
+         respective weights.
+
 comment 'IPVS application helper'
 
 config IP_VS_FTP
index c552993fa4b9181e68e7d9680babec0462288719..bfce2677fda2610b681effae44d27b465b64795a 100644 (file)
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
index 5ebde4b15810c5c219ad64c1bb0b6378a38d468c..b91bb70ece920d06f1dc266c2f3579a25bd28234 100644 (file)
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
        if (add && udest->af != svc->af)
                ipvs->mixed_address_family_dests++;
 
+       /* keep the last_weight with latest non-0 weight */
+       if (add || udest->weight != 0)
+               atomic_set(&dest->last_weight, udest->weight);
+
        /* set the weight and the flags */
        atomic_set(&dest->weight, udest->weight);
        conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
index 75f798f8e83b706701787706c6fad2facad35155..07459e71d9072387531d760dbbde7ab23fc5f145 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/hash.h>
 
 #include <net/ip_vs.h>
 
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
                addr_fold = addr->ip6[0]^addr->ip6[1]^
                            addr->ip6[2]^addr->ip6[3];
 #endif
-       return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+       return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
 }
 
 
index 3057e453bf317c500cc77865ec4ed6d2a4eb479a..08147fc6400ccf65ca7b8688826aaba1f3fa85c5 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/jiffies.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
                addr_fold = addr->ip6[0]^addr->ip6[1]^
                            addr->ip6[2]^addr->ip6[3];
 #endif
-       return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+       return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
 }
 
 
index 92adc04557ed954c769e309e9e51d945ac474620..9b6a6c9e9cfa143cf8dc3967472dcb65cb722a60 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/jiffies.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
                addr_fold = addr->ip6[0]^addr->ip6[1]^
                            addr->ip6[2]^addr->ip6[3];
 #endif
-       return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+       return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644 (file)
index 0000000..0f795b1
--- /dev/null
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS:       Maglev Hashing scheduling module
+ *
+ * Authors:    Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm is to assign a preference list of all the lookup
+ * table positions to each destination and populate the table with
+ * the most-preferred position of destinations. Then it is to select
+ * destination with the hash key of source IP address through looking
+ * up a the lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hasing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK  IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT      IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup {
+       struct ip_vs_dest __rcu *dest;  /* real server (cache) */
+};
+
+struct ip_vs_mh_dest_setup {
+       unsigned int    offset; /* starting offset */
+       unsigned int    skip;   /* skip */
+       unsigned int    perm;   /* next_offset */
+       int             turns;  /* weight / gcd() and rshift */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+                      8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX      12
+#endif
+#define IP_VS_MH_TAB_BITS              (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX             (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state {
+       struct rcu_head                 rcu_head;
+       struct ip_vs_mh_lookup          *lookup;
+       struct ip_vs_mh_dest_setup      *dest_setup;
+       hsiphash_key_t                  hash1, hash2;
+       int                             gcd;
+       int                             rshift;
+};
+
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+                                       hsiphash_key_t *hash2)
+{
+       hash1->key[0] = 2654435761UL;
+       hash1->key[1] = 2654435761UL;
+
+       hash2->key[0] = 2654446892UL;
+       hash2->key[1] = 2654446892UL;
+}
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+       return atomic_read(&dest->weight) <= 0 ||
+              dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Returns hash value for IPVS MH entry */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+                __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+       unsigned int v;
+       __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+                           addr->ip6[2] ^ addr->ip6[3];
+#endif
+       v = (offset + ntohs(port) + ntohl(addr_fold));
+       return hsiphash(&v, sizeof(v), key);
+}
+
+/* Reset all the hash buckets of the specified table. */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+       int i;
+       struct ip_vs_mh_lookup *l;
+       struct ip_vs_dest *dest;
+
+       l = &s->lookup[0];
+       for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+               dest = rcu_dereference_protected(l->dest, 1);
+               if (dest) {
+                       ip_vs_dest_put(dest);
+                       RCU_INIT_POINTER(l->dest, NULL);
+               }
+               l++;
+       }
+}
+
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+                             struct ip_vs_service *svc)
+{
+       struct list_head *p;
+       struct ip_vs_mh_dest_setup *ds;
+       struct ip_vs_dest *dest;
+       int lw;
+
+       /* If gcd is smaller then 1, number of dests or
+        * all last_weight of dests are zero. So, skip
+        * permutation for the dests.
+        */
+       if (s->gcd < 1)
+               return 0;
+
+       /* Set dest_setup for the dests permutation */
+       p = &svc->destinations;
+       ds = &s->dest_setup[0];
+       while ((p = p->next) != &svc->destinations) {
+               dest = list_entry(p, struct ip_vs_dest, n_list);
+
+               ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+                                             dest->port, &s->hash1, 0) %
+                                             IP_VS_MH_TAB_SIZE;
+               ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+                                           dest->port, &s->hash2, 0) %
+                                           (IP_VS_MH_TAB_SIZE - 1) + 1;
+               ds->perm = ds->offset;
+
+               lw = atomic_read(&dest->last_weight);
+               ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
+               ds++;
+       }
+
+       return 0;
+}
+
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+                            struct ip_vs_service *svc)
+{
+       int n, c, dt_count;
+       unsigned long *table;
+       struct list_head *p;
+       struct ip_vs_mh_dest_setup *ds;
+       struct ip_vs_dest *dest, *new_dest;
+
+       /* If gcd is smaller then 1, number of dests or
+        * all last_weight of dests are zero. So, skip
+        * the population for the dests and reset lookup table.
+        */
+       if (s->gcd < 1) {
+               ip_vs_mh_reset(s);
+               return 0;
+       }
+
+       table =  kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+                        sizeof(unsigned long), GFP_KERNEL);
+       if (!table)
+               return -ENOMEM;
+
+       p = &svc->destinations;
+       n = 0;
+       dt_count = 0;
+       while (n < IP_VS_MH_TAB_SIZE) {
+               if (p == &svc->destinations)
+                       p = p->next;
+
+               ds = &s->dest_setup[0];
+               while (p != &svc->destinations) {
+                       /* Ignore added server with zero weight */
+                       if (ds->turns < 1) {
+                               p = p->next;
+                               ds++;
+                               continue;
+                       }
+
+                       c = ds->perm;
+                       while (test_bit(c, table)) {
+                               /* Add skip, mod IP_VS_MH_TAB_SIZE */
+                               ds->perm += ds->skip;
+                               if (ds->perm >= IP_VS_MH_TAB_SIZE)
+                                       ds->perm -= IP_VS_MH_TAB_SIZE;
+                               c = ds->perm;
+                       }
+
+                       __set_bit(c, table);
+
+                       dest = rcu_dereference_protected(s->lookup[c].dest, 1);
+                       new_dest = list_entry(p, struct ip_vs_dest, n_list);
+                       if (dest != new_dest) {
+                               if (dest)
+                                       ip_vs_dest_put(dest);
+                               ip_vs_dest_hold(new_dest);
+                               RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
+                       }
+
+                       if (++n == IP_VS_MH_TAB_SIZE)
+                               goto out;
+
+                       if (++dt_count >= ds->turns) {
+                               dt_count = 0;
+                               p = p->next;
+                               ds++;
+                       }
+               }
+       }
+
+out:
+       kfree(table);
+       return 0;
+}
+
+/* Get ip_vs_dest associated with supplied parameters. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+            const union nf_inet_addr *addr, __be16 port)
+{
+       unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
+                                            % IP_VS_MH_TAB_SIZE;
+       struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
+
+       return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+                     const union nf_inet_addr *addr, __be16 port)
+{
+       unsigned int offset, roffset;
+       unsigned int hash, ihash;
+       struct ip_vs_dest *dest;
+
+       /* First try the dest it's supposed to go to */
+       ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+                                &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+       dest = rcu_dereference(s->lookup[ihash].dest);
+       if (!dest)
+               return NULL;
+       if (!is_unavailable(dest))
+               return dest;
+
+       IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
+                     IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+       /* If the original dest is unavailable, loop around the table
+        * starting from ihash to find a new dest
+        */
+       for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+               roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+               hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+                                       roffset) % IP_VS_MH_TAB_SIZE;
+               dest = rcu_dereference(s->lookup[hash].dest);
+               if (!dest)
+                       break;
+               if (!is_unavailable(dest))
+                       return dest;
+               IP_VS_DBG_BUF(6,
+                             "MH: selected unavailable server %s:%u (offset %u), reselecting",
+                             IP_VS_DBG_ADDR(dest->af, &dest->addr),
+                             ntohs(dest->port), roffset);
+       }
+
+       return NULL;
+}
+
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+                            struct ip_vs_service *svc)
+{
+       int ret;
+
+       if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+               return -EINVAL;
+
+       if (svc->num_dests >= 1) {
+               s->dest_setup = kcalloc(svc->num_dests,
+                                       sizeof(struct ip_vs_mh_dest_setup),
+                                       GFP_KERNEL);
+               if (!s->dest_setup)
+                       return -ENOMEM;
+       }
+
+       ip_vs_mh_permutate(s, svc);
+
+       ret = ip_vs_mh_populate(s, svc);
+       if (ret < 0)
+               goto out;
+
+       IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+                     IP_VS_DBG_ADDR(svc->af, &svc->addr),
+                     ntohs(svc->port));
+
+out:
+       if (svc->num_dests >= 1) {
+               kfree(s->dest_setup);
+               s->dest_setup = NULL;
+       }
+       return ret;
+}
+
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest;
+       int weight;
+       int g = 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               weight = atomic_read(&dest->last_weight);
+               if (weight > 0) {
+                       if (g > 0)
+                               g = gcd(weight, g);
+                       else
+                               g = weight;
+               }
+       }
+       return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+       struct ip_vs_dest *dest;
+       int new_weight, weight = 0;
+       int mw, shift;
+
+       /* If gcd is smaller then 1, number of dests or
+        * all last_weight of dests are zero. So, return
+        * shift value as zero.
+        */
+       if (gcd < 1)
+               return 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               new_weight = atomic_read(&dest->last_weight);
+               if (new_weight > weight)
+                       weight = new_weight;
+       }
+
+       /* Because gcd is greater than zero,
+        * the maximum weight and gcd are always greater than zero
+        */
+       mw = weight / gcd;
+
+       /* shift = occupied bits of weight/gcd - MH highest bits */
+       shift = fls(mw) - IP_VS_MH_TAB_BITS;
+       return (shift >= 0) ? shift : 0;
+}
+
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+       struct ip_vs_mh_state *s;
+
+       s = container_of(head, struct ip_vs_mh_state, rcu_head);
+       kfree(s->lookup);
+       kfree(s);
+}
+
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+       int ret;
+       struct ip_vs_mh_state *s;
+
+       /* Allocate the MH table for this service */
+       s = kzalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+                           GFP_KERNEL);
+       if (!s->lookup) {
+               kfree(s);
+               return -ENOMEM;
+       }
+
+       generate_hash_secret(&s->hash1, &s->hash2);
+       s->gcd = ip_vs_mh_gcd_weight(svc);
+       s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+       IP_VS_DBG(6,
+                 "MH lookup table (memory=%zdbytes) allocated for current service\n",
+                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+       /* Assign the lookup table with current dests */
+       ret = ip_vs_mh_reassign(s, svc);
+       if (ret < 0) {
+               ip_vs_mh_reset(s);
+               ip_vs_mh_state_free(&s->rcu_head);
+               return ret;
+       }
+
+       /* No more failures, attach state */
+       svc->sched_data = s;
+       return 0;
+}
+
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_mh_state *s = svc->sched_data;
+
+       /* Got to clean up lookup entry here */
+       ip_vs_mh_reset(s);
+
+       call_rcu(&s->rcu_head, ip_vs_mh_state_free);
+       IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+                                struct ip_vs_dest *dest)
+{
+       struct ip_vs_mh_state *s = svc->sched_data;
+
+       s->gcd = ip_vs_mh_gcd_weight(svc);
+       s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+       /* Assign the lookup table with the updated service */
+       return ip_vs_mh_reassign(s, svc);
+}
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+       __be16 _ports[2], *ports;
+
+       /* At this point we know that we have a valid packet of some kind.
+        * Because ICMP packets are only guaranteed to have the first 8
+        * bytes, let's just grab the ports.  Fortunately they're in the
+        * same position for all three of the protocols we care about.
+        */
+       switch (iph->protocol) {
+       case IPPROTO_TCP:
+       case IPPROTO_UDP:
+       case IPPROTO_SCTP:
+               ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+                                          &_ports);
+               if (unlikely(!ports))
+                       return 0;
+
+               if (likely(!ip_vs_iph_inverse(iph)))
+                       return ports[0];
+               else
+                       return ports[1];
+       default:
+               return 0;
+       }
+}
+
+/* Maglev Hashing scheduling */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+                 struct ip_vs_iphdr *iph)
+{
+       struct ip_vs_dest *dest;
+       struct ip_vs_mh_state *s;
+       __be16 port = 0;
+       const union nf_inet_addr *hash_addr;
+
+       hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+       IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+       if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+               port = ip_vs_mh_get_port(skb, iph);
+
+       s = (struct ip_vs_mh_state *)svc->sched_data;
+
+       if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+               dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+       else
+               dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+       if (!dest) {
+               ip_vs_scheduler_err(svc, "no destination available");
+               return NULL;
+       }
+
+       IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+                     IP_VS_DBG_ADDR(svc->af, hash_addr),
+                     ntohs(port),
+                     IP_VS_DBG_ADDR(dest->af, &dest->addr),
+                     ntohs(dest->port));
+
+       return dest;
+}
+
+/* IPVS MH Scheduler structure */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+       .name =                 "mh",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list  =              LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+       .init_service =         ip_vs_mh_init_svc,
+       .done_service =         ip_vs_mh_done_svc,
+       .add_dest =             ip_vs_mh_dest_changed,
+       .del_dest =             ip_vs_mh_dest_changed,
+       .upd_dest =             ip_vs_mh_dest_changed,
+       .schedule =             ip_vs_mh_schedule,
+};
+
+static int __init ip_vs_mh_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+       rcu_barrier();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
index bcd9b7bde4ee3da423456800436912e06dd46958..569631d2b2a10d32c149491b5140158020917831 100644 (file)
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
        return tcp_state_active_table[state];
 }
 
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
 /*     INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
 /*     INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
index 16aaac6eedc963dee56f088c8933dc962bbd27c6..1e01c782583a62efd4258c133c7b2c8f97bc2e85 100644 (file)
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
                addr_fold = addr->ip6[0]^addr->ip6[1]^
                            addr->ip6[2]^addr->ip6[3];
 #endif
-       return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+       return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+                                IP_VS_SH_TAB_BITS)) &
                IP_VS_SH_TAB_MASK;
 }