1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36 #include <linux/mutex.h>
37
38 #include <net/net_namespace.h>
39 #include <net/ip.h>
40 #include <net/route.h>
41 #include <net/sock.h>
42
43 #include <asm/uaccess.h>
44
45 #include <net/ip_vs.h>
46
47 /* mutex serializing IPVS sockopt handling; [gs]etsockopt may sleep */
48 static DEFINE_MUTEX(__ip_vs_mutex);
49
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
72
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_expire_quiescent_template = 0;
82 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
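/* Note (assumption based on how the sync code consumes this pair, not on
 * anything in this file): a connection is reported to the sync daemon when
 * its packet count modulo the second value equals the first, i.e. after
 * roughly 3 packets and then every 50 packets with these defaults. */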
83 int sysctl_ip_vs_nat_icmp_send = 0;
84
85
86 #ifdef CONFIG_IP_VS_DEBUG
87 static int sysctl_ip_vs_debug_level = 0;
88
89 int ip_vs_get_debug_level(void)
90 {
91         return sysctl_ip_vs_debug_level;
92 }
93 #endif
94
95 /*
96  *      update_defense_level is called from keventd and from sysctl,
97  *      so it needs to protect itself from softirqs
98  */
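/*
 *      The drop_entry, drop_packet and secure_tcp sysctls share the value
 *      scheme implemented by the switches below:
 *        0 - the defense is always off
 *        1 - the defense is armed; it switches on when memory runs low
 *            (the value is then bumped to 2)
 *        2 - the defense is currently on due to memory pressure; it falls
 *            back to 1 once enough memory is available again
 *        3 - the defense is always on
 */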
99 static void update_defense_level(void)
100 {
101         struct sysinfo i;
102         static int old_secure_tcp = 0;
103         int availmem;
104         int nomem;
105         int to_change = -1;
106
107         /* we only count free and buffered memory (in pages) */
108         si_meminfo(&i);
109         availmem = i.freeram + i.bufferram;
110         /* however, in Linux 2.5+ i.bufferram is the total page cache size,
111            so we need to adjust it */
112         /* si_swapinfo(&i); */
113         /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115         nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117         local_bh_disable();
118
119         /* drop_entry */
120         spin_lock(&__ip_vs_dropentry_lock);
121         switch (sysctl_ip_vs_drop_entry) {
122         case 0:
123                 atomic_set(&ip_vs_dropentry, 0);
124                 break;
125         case 1:
126                 if (nomem) {
127                         atomic_set(&ip_vs_dropentry, 1);
128                         sysctl_ip_vs_drop_entry = 2;
129                 } else {
130                         atomic_set(&ip_vs_dropentry, 0);
131                 }
132                 break;
133         case 2:
134                 if (nomem) {
135                         atomic_set(&ip_vs_dropentry, 1);
136                 } else {
137                         atomic_set(&ip_vs_dropentry, 0);
138                         sysctl_ip_vs_drop_entry = 1;
139                 }
140                 break;
141         case 3:
142                 atomic_set(&ip_vs_dropentry, 1);
143                 break;
144         }
145         spin_unlock(&__ip_vs_dropentry_lock);
146
147         /* drop_packet */
148         spin_lock(&__ip_vs_droppacket_lock);
149         switch (sysctl_ip_vs_drop_packet) {
150         case 0:
151                 ip_vs_drop_rate = 0;
152                 break;
153         case 1:
154                 if (nomem) {
155                         ip_vs_drop_rate = ip_vs_drop_counter
156                                 = sysctl_ip_vs_amemthresh /
157                                 (sysctl_ip_vs_amemthresh-availmem);
158                         sysctl_ip_vs_drop_packet = 2;
159                 } else {
160                         ip_vs_drop_rate = 0;
161                 }
162                 break;
163         case 2:
164                 if (nomem) {
165                         ip_vs_drop_rate = ip_vs_drop_counter
166                                 = sysctl_ip_vs_amemthresh /
167                                 (sysctl_ip_vs_amemthresh-availmem);
168                 } else {
169                         ip_vs_drop_rate = 0;
170                         sysctl_ip_vs_drop_packet = 1;
171                 }
172                 break;
173         case 3:
174                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175                 break;
176         }
177         spin_unlock(&__ip_vs_droppacket_lock);
178
179         /* secure_tcp */
180         write_lock(&__ip_vs_securetcp_lock);
181         switch (sysctl_ip_vs_secure_tcp) {
182         case 0:
183                 if (old_secure_tcp >= 2)
184                         to_change = 0;
185                 break;
186         case 1:
187                 if (nomem) {
188                         if (old_secure_tcp < 2)
189                                 to_change = 1;
190                         sysctl_ip_vs_secure_tcp = 2;
191                 } else {
192                         if (old_secure_tcp >= 2)
193                                 to_change = 0;
194                 }
195                 break;
196         case 2:
197                 if (nomem) {
198                         if (old_secure_tcp < 2)
199                                 to_change = 1;
200                 } else {
201                         if (old_secure_tcp >= 2)
202                                 to_change = 0;
203                         sysctl_ip_vs_secure_tcp = 1;
204                 }
205                 break;
206         case 3:
207                 if (old_secure_tcp < 2)
208                         to_change = 1;
209                 break;
210         }
211         old_secure_tcp = sysctl_ip_vs_secure_tcp;
212         if (to_change >= 0)
213                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214         write_unlock(&__ip_vs_securetcp_lock);
215
216         local_bh_enable();
217 }
218
219
220 /*
221  *      Timer for checking the defense
222  */
223 #define DEFENSE_TIMER_PERIOD    1*HZ
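/* i.e. re-evaluate the defense levels roughly once per second */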
224 static void defense_work_handler(struct work_struct *work);
225 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         update_defense_level();
230         if (atomic_read(&ip_vs_dropentry))
231                 ip_vs_random_dropentry();
232
233         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234 }
235
236 int
237 ip_vs_use_count_inc(void)
238 {
239         return try_module_get(THIS_MODULE);
240 }
241
242 void
243 ip_vs_use_count_dec(void)
244 {
245         module_put(THIS_MODULE);
246 }
247
248
249 /*
250  *      Hash table: for virtual service lookups
251  */
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261 /*
262  *      Hash table: for real service lookups
263  */
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270 /*
271  *      Trash for destinations
272  */
273 static LIST_HEAD(ip_vs_dest_trash);
274
275 /*
276  *      FTP & NULL virtual service counters
277  */
278 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
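/* These count how many FTP (port 21) and catch-all (port 0) virtual
 * services exist; ip_vs_service_get() below only tries the FTPPORT and
 * port-zero fallback lookups while the respective counter is non-zero. */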
280
281
282 /*
283  *      Returns hash value for virtual service
284  */
285 static __inline__ unsigned
286 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
287 {
288         register unsigned porth = ntohs(port);
289
290         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291                 & IP_VS_SVC_TAB_MASK;
292 }
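/*
 *      Note on the key above: the port is taken in host order and its high
 *      bits are folded down (porth >> IP_VS_SVC_TAB_BITS) so that the upper
 *      byte of the port also influences the bucket once the protocol and
 *      the host-order address have been XORed in and the result is masked
 *      to IP_VS_SVC_TAB_SIZE buckets.
 */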
293
294 /*
295  *      Returns hash value of fwmark for virtual service lookup
296  */
297 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298 {
299         return fwmark & IP_VS_SVC_TAB_MASK;
300 }
301
302 /*
303  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304  *      or in the ip_vs_svc_fwm_table by fwmark.
305  *      Should be called with locked tables.
306  */
307 static int ip_vs_svc_hash(struct ip_vs_service *svc)
308 {
309         unsigned hash;
310
311         if (svc->flags & IP_VS_SVC_F_HASHED) {
312                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313                           "called from %p\n", __builtin_return_address(0));
314                 return 0;
315         }
316
317         if (svc->fwmark == 0) {
318                 /*
319                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
320                  */
321                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323         } else {
324                 /*
325                  *  Hash it by fwmark in ip_vs_svc_fwm_table
326                  */
327                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329         }
330
331         svc->flags |= IP_VS_SVC_F_HASHED;
332         /* increase its refcnt because it is referenced by the svc table */
333         atomic_inc(&svc->refcnt);
334         return 1;
335 }
336
337
338 /*
339  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340  *      Should be called with locked tables.
341  */
342 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343 {
344         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346                           "called from %p\n", __builtin_return_address(0));
347                 return 0;
348         }
349
350         if (svc->fwmark == 0) {
351                 /* Remove it from the ip_vs_svc_table */
352                 list_del(&svc->s_list);
353         } else {
354                 /* Remove it from the ip_vs_svc_fwm_table */
355                 list_del(&svc->f_list);
356         }
357
358         svc->flags &= ~IP_VS_SVC_F_HASHED;
359         atomic_dec(&svc->refcnt);
360         return 1;
361 }
362
363
364 /*
365  *      Get service by {proto,addr,port} in the service table.
366  */
367 static __inline__ struct ip_vs_service *
368 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
369 {
370         unsigned hash;
371         struct ip_vs_service *svc;
372
373         /* Check for "full" addressed entries */
374         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377                 if ((svc->addr == vaddr)
378                     && (svc->port == vport)
379                     && (svc->protocol == protocol)) {
380                         /* HIT */
381                         atomic_inc(&svc->usecnt);
382                         return svc;
383                 }
384         }
385
386         return NULL;
387 }
388
389
390 /*
391  *      Get service by {fwmark} in the service table.
392  */
393 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394 {
395         unsigned hash;
396         struct ip_vs_service *svc;
397
398         /* Check for fwmark addressed entries */
399         hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402                 if (svc->fwmark == fwmark) {
403                         /* HIT */
404                         atomic_inc(&svc->usecnt);
405                         return svc;
406                 }
407         }
408
409         return NULL;
410 }
411
412 struct ip_vs_service *
413 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
414 {
415         struct ip_vs_service *svc;
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423                 goto out;
424
425         /*
426          *      Check the table hashed by <protocol,addr,port>
427          *      for "full" addressed entries
428          */
429         svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431         if (svc == NULL
432             && protocol == IPPROTO_TCP
433             && atomic_read(&ip_vs_ftpsvc_counter)
434             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435                 /*
436                  * Check if ftp service entry exists, the packet
437                  * might belong to FTP data connections.
438                  */
439                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440         }
441
442         if (svc == NULL
443             && atomic_read(&ip_vs_nullsvc_counter)) {
444                 /*
445                  * Check if the catch-all port (port zero) exists
446                  */
447                 svc = __ip_vs_service_get(protocol, vaddr, 0);
448         }
449
450   out:
451         read_unlock(&__ip_vs_svc_lock);
452
453         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454                   fwmark, ip_vs_proto_name(protocol),
455                   NIPQUAD(vaddr), ntohs(vport),
456                   svc?"hit":"not hit");
457
458         return svc;
459 }
460
461
462 static inline void
463 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464 {
465         atomic_inc(&svc->refcnt);
466         dest->svc = svc;
467 }
468
469 static inline void
470 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
471 {
472         struct ip_vs_service *svc = dest->svc;
473
474         dest->svc = NULL;
475         if (atomic_dec_and_test(&svc->refcnt))
476                 kfree(svc);
477 }
478
479
480 /*
481  *      Returns hash value for real service
482  */
483 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
484 {
485         register unsigned porth = ntohs(port);
486
487         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488                 & IP_VS_RTAB_MASK;
489 }
490
491 /*
492  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493  *      should be called with locked tables.
494  */
495 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496 {
497         unsigned hash;
498
499         if (!list_empty(&dest->d_list)) {
500                 return 0;
501         }
502
503         /*
504          *      Hash by proto,addr,port,
505          *      which are the parameters of the real service.
506          */
507         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508         list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510         return 1;
511 }
512
513 /*
514  *      UNhashes ip_vs_dest from ip_vs_rtable.
515  *      should be called with locked tables.
516  */
517 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518 {
519         /*
520          * Remove it from the ip_vs_rtable table.
521          */
522         if (!list_empty(&dest->d_list)) {
523                 list_del(&dest->d_list);
524                 INIT_LIST_HEAD(&dest->d_list);
525         }
526
527         return 1;
528 }
529
530 /*
531  *      Lookup real service by <proto,addr,port> in the real service table.
532  */
533 struct ip_vs_dest *
534 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
535 {
536         unsigned hash;
537         struct ip_vs_dest *dest;
538
539         /*
540          *      Check for "full" addressed entries
541          *      Return the first found entry
542          */
543         hash = ip_vs_rs_hashkey(daddr, dport);
544
545         read_lock(&__ip_vs_rs_lock);
546         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547                 if ((dest->addr == daddr)
548                     && (dest->port == dport)
549                     && ((dest->protocol == protocol) ||
550                         dest->vfwmark)) {
551                         /* HIT */
552                         read_unlock(&__ip_vs_rs_lock);
553                         return dest;
554                 }
555         }
556         read_unlock(&__ip_vs_rs_lock);
557
558         return NULL;
559 }
560
561 /*
562  *      Lookup destination by {addr,port} in the given service
563  */
564 static struct ip_vs_dest *
565 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
566 {
567         struct ip_vs_dest *dest;
568
569         /*
570          * Find the destination for the given service
571          */
572         list_for_each_entry(dest, &svc->destinations, n_list) {
573                 if ((dest->addr == daddr) && (dest->port == dport)) {
574                         /* HIT */
575                         return dest;
576                 }
577         }
578
579         return NULL;
580 }
581
582 /*
583  * Find destination by {daddr,dport,vaddr,protocol}
584  * Created to be used in ip_vs_process_message() in
585  * the backup synchronization daemon. It finds the
586  * destination to be bound to the received connection
587  * on the backup.
588  *
589  * ip_vs_lookup_real_service() looked promising, but
590  * does not seem to work as expected.
591  */
592 struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
593                                     __be32 vaddr, __be16 vport, __u16 protocol)
594 {
595         struct ip_vs_dest *dest;
596         struct ip_vs_service *svc;
597
598         svc = ip_vs_service_get(0, protocol, vaddr, vport);
599         if (!svc)
600                 return NULL;
601         dest = ip_vs_lookup_dest(svc, daddr, dport);
602         if (dest)
603                 atomic_inc(&dest->refcnt);
604         ip_vs_service_put(svc);
605         return dest;
606 }
607
608 /*
609  *  Lookup dest by {svc,addr,port} in the destination trash.
610  *  The destination trash is used to hold the destinations that are removed
611  *  from the service table but are still referenced by some conn entries.
612  *  The trash exists because a dest may be only temporarily down (taken
613  *  out of service by the administrator or by a monitor program); such a
614  *  dest can be picked back from the trash, the remaining connections to
615  *  it can continue, and its counting information remains useful for
616  *  scheduling.
617  */
618 static struct ip_vs_dest *
619 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
620 {
621         struct ip_vs_dest *dest, *nxt;
622
623         /*
624          * Find the destination in trash
625          */
626         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
627                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
628                           "dest->refcnt=%d\n",
629                           dest->vfwmark,
630                           NIPQUAD(dest->addr), ntohs(dest->port),
631                           atomic_read(&dest->refcnt));
632                 if (dest->addr == daddr &&
633                     dest->port == dport &&
634                     dest->vfwmark == svc->fwmark &&
635                     dest->protocol == svc->protocol &&
636                     (svc->fwmark ||
637                      (dest->vaddr == svc->addr &&
638                       dest->vport == svc->port))) {
639                         /* HIT */
640                         return dest;
641                 }
642
643                 /*
644                  * Try to purge the destination from trash if not referenced
645                  */
646                 if (atomic_read(&dest->refcnt) == 1) {
647                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
648                                   "from trash\n",
649                                   dest->vfwmark,
650                                   NIPQUAD(dest->addr), ntohs(dest->port));
651                         list_del(&dest->n_list);
652                         ip_vs_dst_reset(dest);
653                         __ip_vs_unbind_svc(dest);
654                         kfree(dest);
655                 }
656         }
657
658         return NULL;
659 }
660
661
662 /*
663  *  Clean up all the destinations in the trash
664  *  Called by the ip_vs_control_cleanup()
665  *
666  *  When ip_vs_control_cleanup() is invoked at ipvs module exit,
667  *  the service tables must have been flushed and all the connections
668  *  have expired, and the refcnt of each destination in the trash must
669  *  be 1, so we simply release them here.
670  */
671 static void ip_vs_trash_cleanup(void)
672 {
673         struct ip_vs_dest *dest, *nxt;
674
675         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
676                 list_del(&dest->n_list);
677                 ip_vs_dst_reset(dest);
678                 __ip_vs_unbind_svc(dest);
679                 kfree(dest);
680         }
681 }
682
683
684 static void
685 ip_vs_zero_stats(struct ip_vs_stats *stats)
686 {
687         spin_lock_bh(&stats->lock);
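        /* zero every counter field that precedes 'lock' in struct
         * ip_vs_stats; 'lock' itself and anything after it are left
         * intact, and the estimator state is reset separately below */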
688         memset(stats, 0, (char *)&stats->lock - (char *)stats);
689         spin_unlock_bh(&stats->lock);
690         ip_vs_zero_estimator(stats);
691 }
692
693 /*
694  *      Update a destination in the given service
695  */
696 static void
697 __ip_vs_update_dest(struct ip_vs_service *svc,
698                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
699 {
700         int conn_flags;
701
702         /* set the weight and the flags */
703         atomic_set(&dest->weight, udest->weight);
704         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
705
706         /* check if local node and update the flags */
707         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
708                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
709                         | IP_VS_CONN_F_LOCALNODE;
710         }
711
712         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
713         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
714                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
715         } else {
716                 /*
717                  *    Put the real service in ip_vs_rtable if not present.
718                  *    For now only for NAT!
719                  */
720                 write_lock_bh(&__ip_vs_rs_lock);
721                 ip_vs_rs_hash(dest);
722                 write_unlock_bh(&__ip_vs_rs_lock);
723         }
724         atomic_set(&dest->conn_flags, conn_flags);
725
726         /* bind the service */
727         if (!dest->svc) {
728                 __ip_vs_bind_svc(dest, svc);
729         } else {
730                 if (dest->svc != svc) {
731                         __ip_vs_unbind_svc(dest);
732                         ip_vs_zero_stats(&dest->stats);
733                         __ip_vs_bind_svc(dest, svc);
734                 }
735         }
736
737         /* set the dest status flags */
738         dest->flags |= IP_VS_DEST_F_AVAILABLE;
739
740         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
741                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
742         dest->u_threshold = udest->u_threshold;
743         dest->l_threshold = udest->l_threshold;
744 }
745
746
747 /*
748  *      Create a destination for the given service
749  */
750 static int
751 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
752                struct ip_vs_dest **dest_p)
753 {
754         struct ip_vs_dest *dest;
755         unsigned atype;
756
757         EnterFunction(2);
758
759         atype = inet_addr_type(udest->addr);
760         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
761                 return -EINVAL;
762
763         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
764         if (dest == NULL) {
765                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
766                 return -ENOMEM;
767         }
768
769         dest->protocol = svc->protocol;
770         dest->vaddr = svc->addr;
771         dest->vport = svc->port;
772         dest->vfwmark = svc->fwmark;
773         dest->addr = udest->addr;
774         dest->port = udest->port;
775
776         atomic_set(&dest->activeconns, 0);
777         atomic_set(&dest->inactconns, 0);
778         atomic_set(&dest->persistconns, 0);
779         atomic_set(&dest->refcnt, 0);
780
781         INIT_LIST_HEAD(&dest->d_list);
782         spin_lock_init(&dest->dst_lock);
783         spin_lock_init(&dest->stats.lock);
784         __ip_vs_update_dest(svc, dest, udest);
785         ip_vs_new_estimator(&dest->stats);
786
787         *dest_p = dest;
788
789         LeaveFunction(2);
790         return 0;
791 }
792
793
794 /*
795  *      Add a destination into an existing service
796  */
797 static int
798 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
799 {
800         struct ip_vs_dest *dest;
801         __be32 daddr = udest->addr;
802         __be16 dport = udest->port;
803         int ret;
804
805         EnterFunction(2);
806
807         if (udest->weight < 0) {
808                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
809                 return -ERANGE;
810         }
811
812         if (udest->l_threshold > udest->u_threshold) {
813                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
814                           "upper threshold\n");
815                 return -ERANGE;
816         }
817
818         /*
819          * Check if the dest already exists in the list
820          */
821         dest = ip_vs_lookup_dest(svc, daddr, dport);
822         if (dest != NULL) {
823                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
824                 return -EEXIST;
825         }
826
827         /*
828          * Check if the dest already exists in the trash and
829          * is from the same service
830          */
831         dest = ip_vs_trash_get_dest(svc, daddr, dport);
832         if (dest != NULL) {
833                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
834                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
835                           NIPQUAD(daddr), ntohs(dport),
836                           atomic_read(&dest->refcnt),
837                           dest->vfwmark,
838                           NIPQUAD(dest->vaddr),
839                           ntohs(dest->vport));
840                 __ip_vs_update_dest(svc, dest, udest);
841
842                 /*
843                  * Get the destination from the trash
844                  */
845                 list_del(&dest->n_list);
846
847                 ip_vs_new_estimator(&dest->stats);
848
849                 write_lock_bh(&__ip_vs_svc_lock);
850
851                 /*
852                  * Wait until all other svc users go away.
853                  */
854                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
855
856                 list_add(&dest->n_list, &svc->destinations);
857                 svc->num_dests++;
858
859                 /* call the update_service function of its scheduler */
860                 svc->scheduler->update_service(svc);
861
862                 write_unlock_bh(&__ip_vs_svc_lock);
863                 return 0;
864         }
865
866         /*
867          * Allocate and initialize the dest structure
868          */
869         ret = ip_vs_new_dest(svc, udest, &dest);
870         if (ret) {
871                 return ret;
872         }
873
874         /*
875          * Add the dest entry into the list
876          */
877         atomic_inc(&dest->refcnt);
878
879         write_lock_bh(&__ip_vs_svc_lock);
880
881         /*
882          * Wait until all other svc users go away.
883          */
884         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
885
886         list_add(&dest->n_list, &svc->destinations);
887         svc->num_dests++;
888
889         /* call the update_service function of its scheduler */
890         svc->scheduler->update_service(svc);
891
892         write_unlock_bh(&__ip_vs_svc_lock);
893
894         LeaveFunction(2);
895
896         return 0;
897 }
898
899
900 /*
901  *      Edit a destination in the given service
902  */
903 static int
904 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
905 {
906         struct ip_vs_dest *dest;
907         __be32 daddr = udest->addr;
908         __be16 dport = udest->port;
909
910         EnterFunction(2);
911
912         if (udest->weight < 0) {
913                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
914                 return -ERANGE;
915         }
916
917         if (udest->l_threshold > udest->u_threshold) {
918                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
919                           "upper threshold\n");
920                 return -ERANGE;
921         }
922
923         /*
924          *  Lookup the destination list
925          */
926         dest = ip_vs_lookup_dest(svc, daddr, dport);
927         if (dest == NULL) {
928                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
929                 return -ENOENT;
930         }
931
932         __ip_vs_update_dest(svc, dest, udest);
933
934         write_lock_bh(&__ip_vs_svc_lock);
935
936         /* Wait until all other svc users go away */
937         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
938
939         /* call the update_service, because server weight may be changed */
940         svc->scheduler->update_service(svc);
941
942         write_unlock_bh(&__ip_vs_svc_lock);
943
944         LeaveFunction(2);
945
946         return 0;
947 }
948
949
950 /*
951  *      Delete a destination (must be already unlinked from the service)
952  */
953 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
954 {
955         ip_vs_kill_estimator(&dest->stats);
956
957         /*
958          *  Remove it from the d-linked list with the real services.
959          */
960         write_lock_bh(&__ip_vs_rs_lock);
961         ip_vs_rs_unhash(dest);
962         write_unlock_bh(&__ip_vs_rs_lock);
963
964         /*
965          *  Decrease the refcnt of the dest, and free the dest
966          *  if nobody refers to it (refcnt=0). Otherwise, throw
967          *  the destination into the trash.
968          */
969         if (atomic_dec_and_test(&dest->refcnt)) {
970                 ip_vs_dst_reset(dest);
971                 /* simply decrease svc->refcnt here, let the caller check
972                    and release the service if nobody refers to it.
973                    Only user context can release destination and service,
974                    and only one user context can update virtual service at a
975                    time, so the operation here is OK */
976                 atomic_dec(&dest->svc->refcnt);
977                 kfree(dest);
978         } else {
979                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
980                           "dest->refcnt=%d\n",
981                           NIPQUAD(dest->addr), ntohs(dest->port),
982                           atomic_read(&dest->refcnt));
983                 list_add(&dest->n_list, &ip_vs_dest_trash);
984                 atomic_inc(&dest->refcnt);
985         }
986 }
987
988
989 /*
990  *      Unlink a destination from the given service
991  */
992 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
993                                 struct ip_vs_dest *dest,
994                                 int svcupd)
995 {
996         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
997
998         /*
999          *  Remove it from the d-linked destination list.
1000          */
1001         list_del(&dest->n_list);
1002         svc->num_dests--;
1003         if (svcupd) {
1004                 /*
1005                  *  Call the update_service function of its scheduler
1006                  */
1007                 svc->scheduler->update_service(svc);
1008         }
1009 }
1010
1011
1012 /*
1013  *      Delete a destination server in the given service
1014  */
1015 static int
1016 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1017 {
1018         struct ip_vs_dest *dest;
1019         __be32 daddr = udest->addr;
1020         __be16 dport = udest->port;
1021
1022         EnterFunction(2);
1023
1024         dest = ip_vs_lookup_dest(svc, daddr, dport);
1025         if (dest == NULL) {
1026                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1027                 return -ENOENT;
1028         }
1029
1030         write_lock_bh(&__ip_vs_svc_lock);
1031
1032         /*
1033          *      Wait until all other svc users go away.
1034          */
1035         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1036
1037         /*
1038          *      Unlink dest from the service
1039          */
1040         __ip_vs_unlink_dest(svc, dest, 1);
1041
1042         write_unlock_bh(&__ip_vs_svc_lock);
1043
1044         /*
1045          *      Delete the destination
1046          */
1047         __ip_vs_del_dest(dest);
1048
1049         LeaveFunction(2);
1050
1051         return 0;
1052 }
1053
1054
1055 /*
1056  *      Add a service into the service hash table
1057  */
1058 static int
1059 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1060 {
1061         int ret = 0;
1062         struct ip_vs_scheduler *sched = NULL;
1063         struct ip_vs_service *svc = NULL;
1064
1065         /* increase the module use count */
1066         ip_vs_use_count_inc();
1067
1068         /* Lookup the scheduler by 'u->sched_name' */
1069         sched = ip_vs_scheduler_get(u->sched_name);
1070         if (sched == NULL) {
1071                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1072                            u->sched_name);
1073                 ret = -ENOENT;
1074                 goto out_mod_dec;
1075         }
1076
1077         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1078         if (svc == NULL) {
1079                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1080                 ret = -ENOMEM;
1081                 goto out_err;
1082         }
1083
1084         /* I'm the first user of the service */
1085         atomic_set(&svc->usecnt, 1);
1086         atomic_set(&svc->refcnt, 0);
1087
1088         svc->protocol = u->protocol;
1089         svc->addr = u->addr;
1090         svc->port = u->port;
1091         svc->fwmark = u->fwmark;
1092         svc->flags = u->flags;
1093         svc->timeout = u->timeout * HZ;
1094         svc->netmask = u->netmask;
1095
1096         INIT_LIST_HEAD(&svc->destinations);
1097         rwlock_init(&svc->sched_lock);
1098         spin_lock_init(&svc->stats.lock);
1099
1100         /* Bind the scheduler */
1101         ret = ip_vs_bind_scheduler(svc, sched);
1102         if (ret)
1103                 goto out_err;
1104         sched = NULL;
1105
1106         /* Update the virtual service counters */
1107         if (svc->port == FTPPORT)
1108                 atomic_inc(&ip_vs_ftpsvc_counter);
1109         else if (svc->port == 0)
1110                 atomic_inc(&ip_vs_nullsvc_counter);
1111
1112         ip_vs_new_estimator(&svc->stats);
1113         ip_vs_num_services++;
1114
1115         /* Hash the service into the service table */
1116         write_lock_bh(&__ip_vs_svc_lock);
1117         ip_vs_svc_hash(svc);
1118         write_unlock_bh(&__ip_vs_svc_lock);
1119
1120         *svc_p = svc;
1121         return 0;
1122
1123   out_err:
1124         if (svc != NULL) {
1125                 if (svc->scheduler)
1126                         ip_vs_unbind_scheduler(svc);
1127                 if (svc->inc) {
1128                         local_bh_disable();
1129                         ip_vs_app_inc_put(svc->inc);
1130                         local_bh_enable();
1131                 }
1132                 kfree(svc);
1133         }
1134         ip_vs_scheduler_put(sched);
1135
1136   out_mod_dec:
1137         /* decrease the module use count */
1138         ip_vs_use_count_dec();
1139
1140         return ret;
1141 }
1142
1143
1144 /*
1145  *      Edit a service and bind it with a new scheduler
1146  */
1147 static int
1148 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1149 {
1150         struct ip_vs_scheduler *sched, *old_sched;
1151         int ret = 0;
1152
1153         /*
1154          * Lookup the scheduler, by 'u->sched_name'
1155          */
1156         sched = ip_vs_scheduler_get(u->sched_name);
1157         if (sched == NULL) {
1158                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1159                            u->sched_name);
1160                 return -ENOENT;
1161         }
1162         old_sched = sched;
1163
1164         write_lock_bh(&__ip_vs_svc_lock);
1165
1166         /*
1167          * Wait until all other svc users go away.
1168          */
1169         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1170
1171         /*
1172          * Set the flags and timeout value
1173          */
1174         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1175         svc->timeout = u->timeout * HZ;
1176         svc->netmask = u->netmask;
1177
1178         old_sched = svc->scheduler;
1179         if (sched != old_sched) {
1180                 /*
1181                  * Unbind the old scheduler
1182                  */
1183                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1184                         old_sched = sched;
1185                         goto out;
1186                 }
1187
1188                 /*
1189                  * Bind the new scheduler
1190                  */
1191                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1192                         /*
1193                          * If ip_vs_bind_scheduler fails, restore the old
1194                          * scheduler.
1195                          * The most likely reason for failure is lack of memory.
1196                          *
1197                          * The open question is whether the old scheduler can be
1198                          * restored in all cases. TODO: if at some point it
1199                          * cannot be restored, we must delete the service,
1200                          * otherwise the system may crash.
1201                          */
1202                         ip_vs_bind_scheduler(svc, old_sched);
1203                         old_sched = sched;
1204                         goto out;
1205                 }
1206         }
1207
1208   out:
1209         write_unlock_bh(&__ip_vs_svc_lock);
1210
1211         if (old_sched)
1212                 ip_vs_scheduler_put(old_sched);
1213
1214         return ret;
1215 }
1216
1217
1218 /*
1219  *      Delete a service from the service list
1220  *      - The service must be unlinked, unlocked and not referenced!
1221  *      - We are called under _bh lock
1222  */
1223 static void __ip_vs_del_service(struct ip_vs_service *svc)
1224 {
1225         struct ip_vs_dest *dest, *nxt;
1226         struct ip_vs_scheduler *old_sched;
1227
1228         ip_vs_num_services--;
1229         ip_vs_kill_estimator(&svc->stats);
1230
1231         /* Unbind scheduler */
1232         old_sched = svc->scheduler;
1233         ip_vs_unbind_scheduler(svc);
1234         if (old_sched)
1235                 ip_vs_scheduler_put(old_sched);
1236
1237         /* Unbind app inc */
1238         if (svc->inc) {
1239                 ip_vs_app_inc_put(svc->inc);
1240                 svc->inc = NULL;
1241         }
1242
1243         /*
1244          *    Unlink the whole destination list
1245          */
1246         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1247                 __ip_vs_unlink_dest(svc, dest, 0);
1248                 __ip_vs_del_dest(dest);
1249         }
1250
1251         /*
1252          *    Update the virtual service counters
1253          */
1254         if (svc->port == FTPPORT)
1255                 atomic_dec(&ip_vs_ftpsvc_counter);
1256         else if (svc->port == 0)
1257                 atomic_dec(&ip_vs_nullsvc_counter);
1258
1259         /*
1260          *    Free the service if nobody refers to it
1261          */
1262         if (atomic_read(&svc->refcnt) == 0)
1263                 kfree(svc);
1264
1265         /* decrease the module use count */
1266         ip_vs_use_count_dec();
1267 }
1268
1269 /*
1270  *      Delete a service from the service list
1271  */
1272 static int ip_vs_del_service(struct ip_vs_service *svc)
1273 {
1274         if (svc == NULL)
1275                 return -EEXIST;
1276
1277         /*
1278          * Unhash it from the service table
1279          */
1280         write_lock_bh(&__ip_vs_svc_lock);
1281
1282         ip_vs_svc_unhash(svc);
1283
1284         /*
1285          * Wait until all the svc users go away.
1286          */
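        /* '> 1': the caller is expected to still hold its own usecnt
         * reference on svc; contrast with ip_vs_flush() below, which holds
         * no reference and therefore waits for '> 0'. */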
1287         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1288
1289         __ip_vs_del_service(svc);
1290
1291         write_unlock_bh(&__ip_vs_svc_lock);
1292
1293         return 0;
1294 }
1295
1296
1297 /*
1298  *      Flush all the virtual services
1299  */
1300 static int ip_vs_flush(void)
1301 {
1302         int idx;
1303         struct ip_vs_service *svc, *nxt;
1304
1305         /*
1306          * Flush the service table hashed by <protocol,addr,port>
1307          */
1308         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1309                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1310                         write_lock_bh(&__ip_vs_svc_lock);
1311                         ip_vs_svc_unhash(svc);
1312                         /*
1313                          * Wait until all the svc users go away.
1314                          */
1315                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1316                         __ip_vs_del_service(svc);
1317                         write_unlock_bh(&__ip_vs_svc_lock);
1318                 }
1319         }
1320
1321         /*
1322          * Flush the service table hashed by fwmark
1323          */
1324         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1325                 list_for_each_entry_safe(svc, nxt,
1326                                          &ip_vs_svc_fwm_table[idx], f_list) {
1327                         write_lock_bh(&__ip_vs_svc_lock);
1328                         ip_vs_svc_unhash(svc);
1329                         /*
1330                          * Wait until all the svc users go away.
1331                          */
1332                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1333                         __ip_vs_del_service(svc);
1334                         write_unlock_bh(&__ip_vs_svc_lock);
1335                 }
1336         }
1337
1338         return 0;
1339 }
1340
1341
1342 /*
1343  *      Zero counters in a service or all services
1344  */
1345 static int ip_vs_zero_service(struct ip_vs_service *svc)
1346 {
1347         struct ip_vs_dest *dest;
1348
1349         write_lock_bh(&__ip_vs_svc_lock);
1350         list_for_each_entry(dest, &svc->destinations, n_list) {
1351                 ip_vs_zero_stats(&dest->stats);
1352         }
1353         ip_vs_zero_stats(&svc->stats);
1354         write_unlock_bh(&__ip_vs_svc_lock);
1355         return 0;
1356 }
1357
1358 static int ip_vs_zero_all(void)
1359 {
1360         int idx;
1361         struct ip_vs_service *svc;
1362
1363         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1364                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1365                         ip_vs_zero_service(svc);
1366                 }
1367         }
1368
1369         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1370                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1371                         ip_vs_zero_service(svc);
1372                 }
1373         }
1374
1375         ip_vs_zero_stats(&ip_vs_stats);
1376         return 0;
1377 }
1378
1379
1380 static int
1381 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1382                      void __user *buffer, size_t *lenp, loff_t *ppos)
1383 {
1384         int *valp = table->data;
1385         int val = *valp;
1386         int rc;
1387
1388         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1389         if (write && (*valp != val)) {
1390                 if ((*valp < 0) || (*valp > 3)) {
1391                         /* Restore the correct value */
1392                         *valp = val;
1393                 } else {
1394                         update_defense_level();
1395                 }
1396         }
1397         return rc;
1398 }
1399
1400
1401 static int
1402 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1403                        void __user *buffer, size_t *lenp, loff_t *ppos)
1404 {
1405         int *valp = table->data;
1406         int val[2];
1407         int rc;
1408
1409         /* backup the value first */
1410         memcpy(val, valp, sizeof(val));
1411
1412         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1413         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1414                 /* Restore the correct value */
1415                 memcpy(valp, val, sizeof(val));
1416         }
1417         return rc;
1418 }
1419
1420
1421 /*
1422  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1423  */
1424
1425 static struct ctl_table vs_vars[] = {
1426         {
1427                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1428                 .procname       = "amemthresh",
1429                 .data           = &sysctl_ip_vs_amemthresh,
1430                 .maxlen         = sizeof(int),
1431                 .mode           = 0644,
1432                 .proc_handler   = &proc_dointvec,
1433         },
1434 #ifdef CONFIG_IP_VS_DEBUG
1435         {
1436                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1437                 .procname       = "debug_level",
1438                 .data           = &sysctl_ip_vs_debug_level,
1439                 .maxlen         = sizeof(int),
1440                 .mode           = 0644,
1441                 .proc_handler   = &proc_dointvec,
1442         },
1443 #endif
1444         {
1445                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1446                 .procname       = "am_droprate",
1447                 .data           = &sysctl_ip_vs_am_droprate,
1448                 .maxlen         = sizeof(int),
1449                 .mode           = 0644,
1450                 .proc_handler   = &proc_dointvec,
1451         },
1452         {
1453                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1454                 .procname       = "drop_entry",
1455                 .data           = &sysctl_ip_vs_drop_entry,
1456                 .maxlen         = sizeof(int),
1457                 .mode           = 0644,
1458                 .proc_handler   = &proc_do_defense_mode,
1459         },
1460         {
1461                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1462                 .procname       = "drop_packet",
1463                 .data           = &sysctl_ip_vs_drop_packet,
1464                 .maxlen         = sizeof(int),
1465                 .mode           = 0644,
1466                 .proc_handler   = &proc_do_defense_mode,
1467         },
1468         {
1469                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1470                 .procname       = "secure_tcp",
1471                 .data           = &sysctl_ip_vs_secure_tcp,
1472                 .maxlen         = sizeof(int),
1473                 .mode           = 0644,
1474                 .proc_handler   = &proc_do_defense_mode,
1475         },
1476 #if 0
1477         {
1478                 .ctl_name       = NET_IPV4_VS_TO_ES,
1479                 .procname       = "timeout_established",
1480                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1481                 .maxlen         = sizeof(int),
1482                 .mode           = 0644,
1483                 .proc_handler   = &proc_dointvec_jiffies,
1484         },
1485         {
1486                 .ctl_name       = NET_IPV4_VS_TO_SS,
1487                 .procname       = "timeout_synsent",
1488                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1489                 .maxlen         = sizeof(int),
1490                 .mode           = 0644,
1491                 .proc_handler   = &proc_dointvec_jiffies,
1492         },
1493         {
1494                 .ctl_name       = NET_IPV4_VS_TO_SR,
1495                 .procname       = "timeout_synrecv",
1496                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1497                 .maxlen         = sizeof(int),
1498                 .mode           = 0644,
1499                 .proc_handler   = &proc_dointvec_jiffies,
1500         },
1501         {
1502                 .ctl_name       = NET_IPV4_VS_TO_FW,
1503                 .procname       = "timeout_finwait",
1504                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1505                 .maxlen         = sizeof(int),
1506                 .mode           = 0644,
1507                 .proc_handler   = &proc_dointvec_jiffies,
1508         },
1509         {
1510                 .ctl_name       = NET_IPV4_VS_TO_TW,
1511                 .procname       = "timeout_timewait",
1512                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1513                 .maxlen         = sizeof(int),
1514                 .mode           = 0644,
1515                 .proc_handler   = &proc_dointvec_jiffies,
1516         },
1517         {
1518                 .ctl_name       = NET_IPV4_VS_TO_CL,
1519                 .procname       = "timeout_close",
1520                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1521                 .maxlen         = sizeof(int),
1522                 .mode           = 0644,
1523                 .proc_handler   = &proc_dointvec_jiffies,
1524         },
1525         {
1526                 .ctl_name       = NET_IPV4_VS_TO_CW,
1527                 .procname       = "timeout_closewait",
1528                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1529                 .maxlen         = sizeof(int),
1530                 .mode           = 0644,
1531                 .proc_handler   = &proc_dointvec_jiffies,
1532         },
1533         {
1534                 .ctl_name       = NET_IPV4_VS_TO_LA,
1535                 .procname       = "timeout_lastack",
1536                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1537                 .maxlen         = sizeof(int),
1538                 .mode           = 0644,
1539                 .proc_handler   = &proc_dointvec_jiffies,
1540         },
1541         {
1542                 .ctl_name       = NET_IPV4_VS_TO_LI,
1543                 .procname       = "timeout_listen",
1544                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1545                 .maxlen         = sizeof(int),
1546                 .mode           = 0644,
1547                 .proc_handler   = &proc_dointvec_jiffies,
1548         },
1549         {
1550                 .ctl_name       = NET_IPV4_VS_TO_SA,
1551                 .procname       = "timeout_synack",
1552                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1553                 .maxlen         = sizeof(int),
1554                 .mode           = 0644,
1555                 .proc_handler   = &proc_dointvec_jiffies,
1556         },
1557         {
1558                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1559                 .procname       = "timeout_udp",
1560                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1561                 .maxlen         = sizeof(int),
1562                 .mode           = 0644,
1563                 .proc_handler   = &proc_dointvec_jiffies,
1564         },
1565         {
1566                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1567                 .procname       = "timeout_icmp",
1568                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1569                 .maxlen         = sizeof(int),
1570                 .mode           = 0644,
1571                 .proc_handler   = &proc_dointvec_jiffies,
1572         },
1573 #endif
1574         {
1575                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1576                 .procname       = "cache_bypass",
1577                 .data           = &sysctl_ip_vs_cache_bypass,
1578                 .maxlen         = sizeof(int),
1579                 .mode           = 0644,
1580                 .proc_handler   = &proc_dointvec,
1581         },
1582         {
1583                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1584                 .procname       = "expire_nodest_conn",
1585                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1586                 .maxlen         = sizeof(int),
1587                 .mode           = 0644,
1588                 .proc_handler   = &proc_dointvec,
1589         },
1590         {
1591                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1592                 .procname       = "expire_quiescent_template",
1593                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1594                 .maxlen         = sizeof(int),
1595                 .mode           = 0644,
1596                 .proc_handler   = &proc_dointvec,
1597         },
1598         {
1599                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1600                 .procname       = "sync_threshold",
1601                 .data           = &sysctl_ip_vs_sync_threshold,
1602                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1603                 .mode           = 0644,
1604                 .proc_handler   = &proc_do_sync_threshold,
1605         },
1606         {
1607                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1608                 .procname       = "nat_icmp_send",
1609                 .data           = &sysctl_ip_vs_nat_icmp_send,
1610                 .maxlen         = sizeof(int),
1611                 .mode           = 0644,
1612                 .proc_handler   = &proc_dointvec,
1613         },
1614         { .ctl_name = 0 }
1615 };
1616
1617 static ctl_table vs_table[] = {
1618         {
1619                 .ctl_name       = NET_IPV4_VS,
1620                 .procname       = "vs",
1621                 .mode           = 0555,
1622                 .child          = vs_vars
1623         },
1624         { .ctl_name = 0 }
1625 };
1626
1627 static ctl_table ipvs_ipv4_table[] = {
1628         {
1629                 .ctl_name       = NET_IPV4,
1630                 .procname       = "ipv4",
1631                 .mode           = 0555,
1632                 .child          = vs_table,
1633         },
1634         { .ctl_name = 0 }
1635 };
1636
1637 static ctl_table vs_root_table[] = {
1638         {
1639                 .ctl_name       = CTL_NET,
1640                 .procname       = "net",
1641                 .mode           = 0555,
1642                 .child          = ipvs_ipv4_table,
1643         },
1644         { .ctl_name = 0 }
1645 };
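
/*
 * The tables above chain CTL_NET -> NET_IPV4 -> NET_IPV4_VS, so the
 * vs_vars entries are published under /proc/sys/net/ipv4/vs/ once
 * register_sysctl_table(vs_root_table) runs in ip_vs_control_init()
 * below (e.g. sysctl_ip_vs_cache_bypass appears as
 * /proc/sys/net/ipv4/vs/cache_bypass).
 */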
1646
1647 static struct ctl_table_header * sysctl_header;
1648
1649 #ifdef CONFIG_PROC_FS
1650
1651 struct ip_vs_iter {
1652         struct list_head *table;
1653         int bucket;
1654 };
1655
1656 /*
1657  *      Write the contents of the VS rule table to a /proc file,
1658  *      kept only for backward compatibility.
1659  */
1660 static inline const char *ip_vs_fwd_name(unsigned flags)
1661 {
1662         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1663         case IP_VS_CONN_F_LOCALNODE:
1664                 return "Local";
1665         case IP_VS_CONN_F_TUNNEL:
1666                 return "Tunnel";
1667         case IP_VS_CONN_F_DROUTE:
1668                 return "Route";
1669         default:
1670                 return "Masq";
1671         }
1672 }
1673
1674
1675 /* Get the Nth entry across the two service hash tables */
1676 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1677 {
1678         struct ip_vs_iter *iter = seq->private;
1679         int idx;
1680         struct ip_vs_service *svc;
1681
1682         /* look in the table hashed by protocol */
1683         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1684                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1685                         if (pos-- == 0){
1686                                 iter->table = ip_vs_svc_table;
1687                                 iter->bucket = idx;
1688                                 return svc;
1689                         }
1690                 }
1691         }
1692
1693         /* then look in the table hashed by fwmark */
1694         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1695                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1696                         if (pos-- == 0) {
1697                                 iter->table = ip_vs_svc_fwm_table;
1698                                 iter->bucket = idx;
1699                                 return svc;
1700                         }
1701                 }
1702         }
1703
1704         return NULL;
1705 }
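
/*
 * Note: *pos counts entries across both hash tables; the iterator
 * remembers which table and bucket the walk stopped in so that
 * ip_vs_info_seq_next() can resume from the following entry.
 */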
1706
1707 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1708 {
1709
1710         read_lock_bh(&__ip_vs_svc_lock);
1711         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1712 }
1713
1714
1715 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1716 {
1717         struct list_head *e;
1718         struct ip_vs_iter *iter;
1719         struct ip_vs_service *svc;
1720
1721         ++*pos;
1722         if (v == SEQ_START_TOKEN)
1723                 return ip_vs_info_array(seq, 0);
1724
1725         svc = v;
1726         iter = seq->private;
1727
1728         if (iter->table == ip_vs_svc_table) {
1729                 /* next service in table hashed by protocol */
1730                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1731                         return list_entry(e, struct ip_vs_service, s_list);
1732
1733
1734                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1735                         list_for_each_entry(svc, &ip_vs_svc_table[iter->bucket],
1736                                             s_list) {
1737                                 return svc;
1738                         }
1739                 }
1740
1741                 iter->table = ip_vs_svc_fwm_table;
1742                 iter->bucket = -1;
1743                 goto scan_fwmark;
1744         }
1745
1746         /* next service in the table hashed by fwmark */
1747         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1748                 return list_entry(e, struct ip_vs_service, f_list);
1749
1750  scan_fwmark:
1751         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1752                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1753                                     f_list)
1754                         return svc;
1755         }
1756
1757         return NULL;
1758 }
1759
1760 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1761 {
1762         read_unlock_bh(&__ip_vs_svc_lock);
1763 }
1764
1765
1766 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1767 {
1768         if (v == SEQ_START_TOKEN) {
1769                 seq_printf(seq,
1770                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1771                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1772                 seq_puts(seq,
1773                          "Prot LocalAddress:Port Scheduler Flags\n");
1774                 seq_puts(seq,
1775                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1776         } else {
1777                 const struct ip_vs_service *svc = v;
1778                 const struct ip_vs_iter *iter = seq->private;
1779                 const struct ip_vs_dest *dest;
1780
1781                 if (iter->table == ip_vs_svc_table)
1782                         seq_printf(seq, "%s  %08X:%04X %s ",
1783                                    ip_vs_proto_name(svc->protocol),
1784                                    ntohl(svc->addr),
1785                                    ntohs(svc->port),
1786                                    svc->scheduler->name);
1787                 else
1788                         seq_printf(seq, "FWM  %08X %s ",
1789                                    svc->fwmark, svc->scheduler->name);
1790
1791                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1792                         seq_printf(seq, "persistent %d %08X\n",
1793                                 svc->timeout,
1794                                 ntohl(svc->netmask));
1795                 else
1796                         seq_putc(seq, '\n');
1797
1798                 list_for_each_entry(dest, &svc->destinations, n_list) {
1799                         seq_printf(seq,
1800                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1801                                    ntohl(dest->addr), ntohs(dest->port),
1802                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1803                                    atomic_read(&dest->weight),
1804                                    atomic_read(&dest->activeconns),
1805                                    atomic_read(&dest->inactconns));
1806                 }
1807         }
1808         return 0;
1809 }
1810
1811 static const struct seq_operations ip_vs_info_seq_ops = {
1812         .start = ip_vs_info_seq_start,
1813         .next  = ip_vs_info_seq_next,
1814         .stop  = ip_vs_info_seq_stop,
1815         .show  = ip_vs_info_seq_show,
1816 };
1817
1818 static int ip_vs_info_open(struct inode *inode, struct file *file)
1819 {
1820         return seq_open_private(file, &ip_vs_info_seq_ops,
1821                         sizeof(struct ip_vs_iter));
1822 }
1823
1824 static const struct file_operations ip_vs_info_fops = {
1825         .owner   = THIS_MODULE,
1826         .open    = ip_vs_info_open,
1827         .read    = seq_read,
1828         .llseek  = seq_lseek,
1829         .release = seq_release_private,
1830 };
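
/*
 * These file operations back the /proc/net/ip_vs listing created in
 * ip_vs_control_init(); seq_open_private() allocates the per-reader
 * struct ip_vs_iter used by the iterator callbacks above.
 */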
1831
1832 #endif /* CONFIG_PROC_FS */
1833
1834 struct ip_vs_stats ip_vs_stats;
1835
1836 #ifdef CONFIG_PROC_FS
1837 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1838 {
1839
1840 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841         seq_puts(seq,
1842                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1843         seq_puts(seq,
1844                  "   Conns  Packets  Packets            Bytes            Bytes\n");
1845
1846         spin_lock_bh(&ip_vs_stats.lock);
1847         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1848                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1849                    (unsigned long long) ip_vs_stats.inbytes,
1850                    (unsigned long long) ip_vs_stats.outbytes);
1851
1852 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1853         seq_puts(seq,
1854                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1855         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
1856                         ip_vs_stats.cps,
1857                         ip_vs_stats.inpps,
1858                         ip_vs_stats.outpps,
1859                         ip_vs_stats.inbps,
1860                         ip_vs_stats.outbps);
1861         spin_unlock_bh(&ip_vs_stats.lock);
1862
1863         return 0;
1864 }
1865
1866 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1867 {
1868         return single_open(file, ip_vs_stats_show, NULL);
1869 }
1870
1871 static const struct file_operations ip_vs_stats_fops = {
1872         .owner = THIS_MODULE,
1873         .open = ip_vs_stats_seq_open,
1874         .read = seq_read,
1875         .llseek = seq_lseek,
1876         .release = single_release,
1877 };
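
/*
 * Backs /proc/net/ip_vs_stats (also created in ip_vs_control_init());
 * single_open() is sufficient here because the whole statistics table
 * is emitted by one ip_vs_stats_show() call.
 */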
1878
1879 #endif /* CONFIG_PROC_FS */
1880
1881 /*
1882  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1883  */
1884 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1885 {
1886         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1887                   u->tcp_timeout,
1888                   u->tcp_fin_timeout,
1889                   u->udp_timeout);
1890
1891 #ifdef CONFIG_IP_VS_PROTO_TCP
1892         if (u->tcp_timeout) {
1893                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1894                         = u->tcp_timeout * HZ;
1895         }
1896
1897         if (u->tcp_fin_timeout) {
1898                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1899                         = u->tcp_fin_timeout * HZ;
1900         }
1901 #endif
1902
1903 #ifdef CONFIG_IP_VS_PROTO_UDP
1904         if (u->udp_timeout) {
1905                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1906                         = u->udp_timeout * HZ;
1907         }
1908 #endif
1909         return 0;
1910 }
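
/*
 * Timeouts arrive from userspace in seconds and are stored in jiffies
 * (hence the "* HZ"); a zero value leaves the corresponding timeout
 * unchanged.
 */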
1911
1912
1913 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1914 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1915 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1916                                  sizeof(struct ip_vs_dest_user))
1917 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1918 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1919 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1920
1921 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1922         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1925         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1926         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1927         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1928         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1929         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1930         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1931         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1932         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1933 };
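
/*
 * set_arglen[] gives the exact number of bytes each IP_VS_SO_SET_*
 * command must pass in; do_ip_vs_set_ctl() rejects any other length
 * with -EINVAL before copying the argument from userspace.
 */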
1934
1935 static int
1936 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1937 {
1938         int ret;
1939         unsigned char arg[MAX_ARG_LEN];
1940         struct ip_vs_service_user *usvc;
1941         struct ip_vs_service *svc;
1942         struct ip_vs_dest_user *udest;
1943
1944         if (!capable(CAP_NET_ADMIN))
1945                 return -EPERM;
1946
1947         if (len != set_arglen[SET_CMDID(cmd)]) {
1948                 IP_VS_ERR("set_ctl: len %u != %u\n",
1949                           len, set_arglen[SET_CMDID(cmd)]);
1950                 return -EINVAL;
1951         }
1952
1953         if (copy_from_user(arg, user, len) != 0)
1954                 return -EFAULT;
1955
1956         /* increase the module use count */
1957         ip_vs_use_count_inc();
1958
1959         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1960                 ret = -ERESTARTSYS;
1961                 goto out_dec;
1962         }
1963
1964         if (cmd == IP_VS_SO_SET_FLUSH) {
1965                 /* Flush all virtual services */
1966                 ret = ip_vs_flush();
1967                 goto out_unlock;
1968         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1969                 /* Set timeout values for (tcp tcpfin udp) */
1970                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1971                 goto out_unlock;
1972         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1973                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1974                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1975                 goto out_unlock;
1976         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1977                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1978                 ret = stop_sync_thread(dm->state);
1979                 goto out_unlock;
1980         }
1981
1982         usvc = (struct ip_vs_service_user *)arg;
1983         udest = (struct ip_vs_dest_user *)(usvc + 1);
1984
1985         if (cmd == IP_VS_SO_SET_ZERO) {
1986                 /* if no service address is set, zero the counters in all services */
1987                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1988                         ret = ip_vs_zero_all();
1989                         goto out_unlock;
1990                 }
1991         }
1992
1993         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1994         if (usvc->protocol != IPPROTO_TCP && usvc->protocol != IPPROTO_UDP) {
1995                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1996                           usvc->protocol, NIPQUAD(usvc->addr),
1997                           ntohs(usvc->port), usvc->sched_name);
1998                 ret = -EFAULT;
1999                 goto out_unlock;
2000         }
2001
2002         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2003         if (usvc->fwmark == 0)
2004                 svc = __ip_vs_service_get(usvc->protocol,
2005                                           usvc->addr, usvc->port);
2006         else
2007                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2008
2009         if (cmd != IP_VS_SO_SET_ADD
2010             && (svc == NULL || svc->protocol != usvc->protocol)) {
2011                 ret = -ESRCH;
2012                 goto out_unlock;
2013         }
2014
2015         switch (cmd) {
2016         case IP_VS_SO_SET_ADD:
2017                 if (svc != NULL)
2018                         ret = -EEXIST;
2019                 else
2020                         ret = ip_vs_add_service(usvc, &svc);
2021                 break;
2022         case IP_VS_SO_SET_EDIT:
2023                 ret = ip_vs_edit_service(svc, usvc);
2024                 break;
2025         case IP_VS_SO_SET_DEL:
2026                 ret = ip_vs_del_service(svc);
2027                 if (!ret)
2028                         goto out_unlock;
2029                 break;
2030         case IP_VS_SO_SET_ZERO:
2031                 ret = ip_vs_zero_service(svc);
2032                 break;
2033         case IP_VS_SO_SET_ADDDEST:
2034                 ret = ip_vs_add_dest(svc, udest);
2035                 break;
2036         case IP_VS_SO_SET_EDITDEST:
2037                 ret = ip_vs_edit_dest(svc, udest);
2038                 break;
2039         case IP_VS_SO_SET_DELDEST:
2040                 ret = ip_vs_del_dest(svc, udest);
2041                 break;
2042         default:
2043                 ret = -EINVAL;
2044         }
2045
2046         if (svc)
2047                 ip_vs_service_put(svc);
2048
2049   out_unlock:
2050         mutex_unlock(&__ip_vs_mutex);
2051   out_dec:
2052         /* decrease the module use count */
2053         ip_vs_use_count_dec();
2054
2055         return ret;
2056 }
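
/*
 * Illustrative sketch only (not part of this file): a CAP_NET_ADMIN
 * process reaches the handler above through a plain setsockopt() call
 * on an IPv4 socket, e.g. to flush every virtual service:
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_FLUSH, NULL, 0);
 *
 * The zero option length matches set_arglen[] for IP_VS_SO_SET_FLUSH.
 */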
2057
2058
2059 static void
2060 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2061 {
2062         spin_lock_bh(&src->lock);
2063         memcpy(dst, src, (char*)&src->lock - (char*)src);
2064         spin_unlock_bh(&src->lock);
2065 }
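
/*
 * Copies every byte of struct ip_vs_stats that precedes the embedded
 * spinlock, so this relies on struct ip_vs_stats_user matching the
 * counter layout at the start of struct ip_vs_stats; the lock itself
 * is never exposed to the caller.
 */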
2066
2067 static void
2068 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2069 {
2070         dst->protocol = src->protocol;
2071         dst->addr = src->addr;
2072         dst->port = src->port;
2073         dst->fwmark = src->fwmark;
2074         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2075         dst->flags = src->flags;
2076         dst->timeout = src->timeout / HZ;
2077         dst->netmask = src->netmask;
2078         dst->num_dests = src->num_dests;
2079         ip_vs_copy_stats(&dst->stats, &src->stats);
2080 }
2081
2082 static inline int
2083 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2084                             struct ip_vs_get_services __user *uptr)
2085 {
2086         int idx, count=0;
2087         struct ip_vs_service *svc;
2088         struct ip_vs_service_entry entry;
2089         int ret = 0;
2090
2091         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2092                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2093                         if (count >= get->num_services)
2094                                 goto out;
2095                         memset(&entry, 0, sizeof(entry));
2096                         ip_vs_copy_service(&entry, svc);
2097                         if (copy_to_user(&uptr->entrytable[count],
2098                                          &entry, sizeof(entry))) {
2099                                 ret = -EFAULT;
2100                                 goto out;
2101                         }
2102                         count++;
2103                 }
2104         }
2105
2106         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2107                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2108                         if (count >= get->num_services)
2109                                 goto out;
2110                         memset(&entry, 0, sizeof(entry));
2111                         ip_vs_copy_service(&entry, svc);
2112                         if (copy_to_user(&uptr->entrytable[count],
2113                                          &entry, sizeof(entry))) {
2114                                 ret = -EFAULT;
2115                                 goto out;
2116                         }
2117                         count++;
2118                 }
2119         }
2120   out:
2121         return ret;
2122 }
2123
2124 static inline int
2125 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2126                          struct ip_vs_get_dests __user *uptr)
2127 {
2128         struct ip_vs_service *svc;
2129         int ret = 0;
2130
2131         if (get->fwmark)
2132                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2133         else
2134                 svc = __ip_vs_service_get(get->protocol,
2135                                           get->addr, get->port);
2136         if (svc) {
2137                 int count = 0;
2138                 struct ip_vs_dest *dest;
2139                 struct ip_vs_dest_entry entry;
2140
2141                 list_for_each_entry(dest, &svc->destinations, n_list) {
2142                         if (count >= get->num_dests)
2143                                 break;
2144
                             /* zero stack padding, as __ip_vs_get_service_entries() does */
                             memset(&entry, 0, sizeof(entry));
2145                         entry.addr = dest->addr;
2146                         entry.port = dest->port;
2147                         entry.conn_flags = atomic_read(&dest->conn_flags);
2148                         entry.weight = atomic_read(&dest->weight);
2149                         entry.u_threshold = dest->u_threshold;
2150                         entry.l_threshold = dest->l_threshold;
2151                         entry.activeconns = atomic_read(&dest->activeconns);
2152                         entry.inactconns = atomic_read(&dest->inactconns);
2153                         entry.persistconns = atomic_read(&dest->persistconns);
2154                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2155                         if (copy_to_user(&uptr->entrytable[count],
2156                                          &entry, sizeof(entry))) {
2157                                 ret = -EFAULT;
2158                                 break;
2159                         }
2160                         count++;
2161                 }
2162                 ip_vs_service_put(svc);
2163         } else
2164                 ret = -ESRCH;
2165         return ret;
2166 }
2167
2168 static inline void
2169 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2170 {
2171 #ifdef CONFIG_IP_VS_PROTO_TCP
2172         u->tcp_timeout =
2173                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2174         u->tcp_fin_timeout =
2175                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2176 #endif
2177 #ifdef CONFIG_IP_VS_PROTO_UDP
2178         u->udp_timeout =
2179                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2180 #endif
2181 }
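
/*
 * Inverse of ip_vs_set_timeout(): report the current per-protocol
 * timeouts back to userspace in seconds rather than jiffies.
 */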
2182
2183
2184 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2185 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2186 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2187 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2188 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2189 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2190 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2191
2192 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2193         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2194         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2195         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2196         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2197         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2198         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2199         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2200 };
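
/*
 * get_arglen[] holds the minimum buffer size for each IP_VS_SO_GET_*
 * command; the 64 bytes for IP_VS_SO_GET_VERSION match the local
 * buf[64] in do_ip_vs_get_ctl(), and IP_VS_SO_GET_DAEMON covers both
 * the master and the backup daemon entries.
 */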
2201
2202 static int
2203 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2204 {
2205         unsigned char arg[128];
2206         int ret = 0;
2207
2208         if (!capable(CAP_NET_ADMIN))
2209                 return -EPERM;
2210
2211         if (*len < get_arglen[GET_CMDID(cmd)]) {
2212                 IP_VS_ERR("get_ctl: len %u < %u\n",
2213                           *len, get_arglen[GET_CMDID(cmd)]);
2214                 return -EINVAL;
2215         }
2216
2217         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2218                 return -EFAULT;
2219
2220         if (mutex_lock_interruptible(&__ip_vs_mutex))
2221                 return -ERESTARTSYS;
2222
2223         switch (cmd) {
2224         case IP_VS_SO_GET_VERSION:
2225         {
2226                 char buf[64];
2227
2228                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2229                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2230                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2231                         ret = -EFAULT;
2232                         goto out;
2233                 }
2234                 *len = strlen(buf)+1;
2235         }
2236         break;
2237
2238         case IP_VS_SO_GET_INFO:
2239         {
2240                 struct ip_vs_getinfo info;
2241                 info.version = IP_VS_VERSION_CODE;
2242                 info.size = IP_VS_CONN_TAB_SIZE;
2243                 info.num_services = ip_vs_num_services;
2244                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2245                         ret = -EFAULT;
2246         }
2247         break;
2248
2249         case IP_VS_SO_GET_SERVICES:
2250         {
2251                 struct ip_vs_get_services *get;
2252                 int size;
2253
2254                 get = (struct ip_vs_get_services *)arg;
2255                 size = sizeof(*get) +
2256                         sizeof(struct ip_vs_service_entry) * get->num_services;
2257                 if (*len != size) {
2258                         IP_VS_ERR("length: %u != %u\n", *len, size);
2259                         ret = -EINVAL;
2260                         goto out;
2261                 }
2262                 ret = __ip_vs_get_service_entries(get, user);
2263         }
2264         break;
2265
2266         case IP_VS_SO_GET_SERVICE:
2267         {
2268                 struct ip_vs_service_entry *entry;
2269                 struct ip_vs_service *svc;
2270
2271                 entry = (struct ip_vs_service_entry *)arg;
2272                 if (entry->fwmark)
2273                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2274                 else
2275                         svc = __ip_vs_service_get(entry->protocol,
2276                                                   entry->addr, entry->port);
2277                 if (svc) {
2278                         ip_vs_copy_service(entry, svc);
2279                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2280                                 ret = -EFAULT;
2281                         ip_vs_service_put(svc);
2282                 } else
2283                         ret = -ESRCH;
2284         }
2285         break;
2286
2287         case IP_VS_SO_GET_DESTS:
2288         {
2289                 struct ip_vs_get_dests *get;
2290                 int size;
2291
2292                 get = (struct ip_vs_get_dests *)arg;
2293                 size = sizeof(*get) +
2294                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2295                 if (*len != size) {
2296                         IP_VS_ERR("length: %u != %u\n", *len, size);
2297                         ret = -EINVAL;
2298                         goto out;
2299                 }
2300                 ret = __ip_vs_get_dest_entries(get, user);
2301         }
2302         break;
2303
2304         case IP_VS_SO_GET_TIMEOUT:
2305         {
2306                 struct ip_vs_timeout_user t;
2307
2308                 __ip_vs_get_timeouts(&t);
2309                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2310                         ret = -EFAULT;
2311         }
2312         break;
2313
2314         case IP_VS_SO_GET_DAEMON:
2315         {
2316                 struct ip_vs_daemon_user d[2];
2317
2318                 memset(&d, 0, sizeof(d));
2319                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2320                         d[0].state = IP_VS_STATE_MASTER;
2321                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2322                         d[0].syncid = ip_vs_master_syncid;
2323                 }
2324                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2325                         d[1].state = IP_VS_STATE_BACKUP;
2326                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2327                         d[1].syncid = ip_vs_backup_syncid;
2328                 }
2329                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2330                         ret = -EFAULT;
2331         }
2332         break;
2333
2334         default:
2335                 ret = -EINVAL;
2336         }
2337
2338   out:
2339         mutex_unlock(&__ip_vs_mutex);
2340         return ret;
2341 }
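
/*
 * Illustrative sketch only (not part of this file): the GET side is
 * reached through getsockopt(), e.g. to read the connection table size
 * and service count returned by IP_VS_SO_GET_INFO above:
 *
 *	struct ip_vs_getinfo info;
 *	socklen_t len = sizeof(info);
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len);
 *
 * A *len smaller than GET_INFO_ARG_LEN is rejected with -EINVAL.
 */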
2342
2343
2344 static struct nf_sockopt_ops ip_vs_sockopts = {
2345         .pf             = PF_INET,
2346         .set_optmin     = IP_VS_BASE_CTL,
2347         .set_optmax     = IP_VS_SO_SET_MAX+1,
2348         .set            = do_ip_vs_set_ctl,
2349         .get_optmin     = IP_VS_BASE_CTL,
2350         .get_optmax     = IP_VS_SO_GET_MAX+1,
2351         .get            = do_ip_vs_get_ctl,
2352         .owner          = THIS_MODULE,
2353 };
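
/*
 * The [optmin, optmax) ranges route every IP_VS_SO_SET_* and
 * IP_VS_SO_GET_* socket option on PF_INET sockets to the two handlers
 * above once nf_register_sockopt() succeeds in ip_vs_control_init().
 */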
2354
2355
2356 int ip_vs_control_init(void)
2357 {
2358         int ret;
2359         int idx;
2360
2361         EnterFunction(2);
2362
2363         ret = nf_register_sockopt(&ip_vs_sockopts);
2364         if (ret) {
2365                 IP_VS_ERR("cannot register sockopt.\n");
2366                 return ret;
2367         }
2368
2369         proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2370         proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
2371
2372         sysctl_header = register_sysctl_table(vs_root_table);
2373
2374         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2375         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2376                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2377                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2378         }
2379         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2380                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2381         }
2382
2383         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2384         spin_lock_init(&ip_vs_stats.lock);
2385         ip_vs_new_estimator(&ip_vs_stats);
2386
2387         /* Hook the defense timer */
2388         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2389
2390         LeaveFunction(2);
2391         return 0;
2392 }
2393
2394
2395 void ip_vs_control_cleanup(void)
2396 {
2397         EnterFunction(2);
2398         ip_vs_trash_cleanup();
2399         cancel_rearming_delayed_work(&defense_work);
2400         cancel_work_sync(&defense_work.work);
2401         ip_vs_kill_estimator(&ip_vs_stats);
2402         unregister_sysctl_table(sysctl_header);
2403         proc_net_remove(&init_net, "ip_vs_stats");
2404         proc_net_remove(&init_net, "ip_vs");
2405         nf_unregister_sockopt(&ip_vs_sockopts);
2406         LeaveFunction(2);
2407 }