Merge master.kernel.org:/home/rmk/linux-2.6-arm
[sfrench/cifs-2.6.git] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36 #include <linux/mutex.h>
37
38 #include <net/ip.h>
39 #include <net/route.h>
40 #include <net/sock.h>
41
42 #include <asm/uaccess.h>
43
44 #include <net/ip_vs.h>
45
46 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
47 static DEFINE_MUTEX(__ip_vs_mutex);
48
49 /* lock for service table */
50 static DEFINE_RWLOCK(__ip_vs_svc_lock);
51
52 /* lock for table with the real services */
53 static DEFINE_RWLOCK(__ip_vs_rs_lock);
54
55 /* lock for state and timeout tables */
56 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
57
58 /* lock for drop entry handling */
59 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
60
61 /* lock for drop packet handling */
62 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
63
64 /* 1/rate drop and drop-entry variables */
65 int ip_vs_drop_rate = 0;
66 int ip_vs_drop_counter = 0;
67 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
68
69 /* number of virtual services */
70 static int ip_vs_num_services = 0;
71
72 /* sysctl variables */
73 static int sysctl_ip_vs_drop_entry = 0;
74 static int sysctl_ip_vs_drop_packet = 0;
75 static int sysctl_ip_vs_secure_tcp = 0;
76 static int sysctl_ip_vs_amemthresh = 1024;
77 static int sysctl_ip_vs_am_droprate = 10;
78 int sysctl_ip_vs_cache_bypass = 0;
79 int sysctl_ip_vs_expire_nodest_conn = 0;
80 int sysctl_ip_vs_expire_quiescent_template = 0;
81 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
82 int sysctl_ip_vs_nat_icmp_send = 0;
83
84
#ifdef CONFIG_IP_VS_DEBUG
/* IPVS debug level, zero by default */
static int sysctl_ip_vs_debug_level;

/* Return the current value of sysctl_ip_vs_debug_level. */
int ip_vs_get_debug_level(void)
{
        int level = sysctl_ip_vs_debug_level;

        return level;
}
#endif
93
94 /*
95  *      update_defense_level is called from keventd and from sysctl,
96  *      so it needs to protect itself from softirqs
97  */
98 static void update_defense_level(void)
99 {
100         struct sysinfo i;
101         static int old_secure_tcp = 0;
102         int availmem;
103         int nomem;
104         int to_change = -1;
105
106         /* we only count free and buffered memory (in pages) */
107         si_meminfo(&i);
108         availmem = i.freeram + i.bufferram;
109         /* however in linux 2.5 the i.bufferram is total page cache size,
110            we need adjust it */
111         /* si_swapinfo(&i); */
112         /* availmem = availmem - (i.totalswap - i.freeswap); */
113
114         nomem = (availmem < sysctl_ip_vs_amemthresh);
115
116         local_bh_disable();
117
118         /* drop_entry */
119         spin_lock(&__ip_vs_dropentry_lock);
120         switch (sysctl_ip_vs_drop_entry) {
121         case 0:
122                 atomic_set(&ip_vs_dropentry, 0);
123                 break;
124         case 1:
125                 if (nomem) {
126                         atomic_set(&ip_vs_dropentry, 1);
127                         sysctl_ip_vs_drop_entry = 2;
128                 } else {
129                         atomic_set(&ip_vs_dropentry, 0);
130                 }
131                 break;
132         case 2:
133                 if (nomem) {
134                         atomic_set(&ip_vs_dropentry, 1);
135                 } else {
136                         atomic_set(&ip_vs_dropentry, 0);
137                         sysctl_ip_vs_drop_entry = 1;
138                 };
139                 break;
140         case 3:
141                 atomic_set(&ip_vs_dropentry, 1);
142                 break;
143         }
144         spin_unlock(&__ip_vs_dropentry_lock);
145
146         /* drop_packet */
147         spin_lock(&__ip_vs_droppacket_lock);
148         switch (sysctl_ip_vs_drop_packet) {
149         case 0:
150                 ip_vs_drop_rate = 0;
151                 break;
152         case 1:
153                 if (nomem) {
154                         ip_vs_drop_rate = ip_vs_drop_counter
155                                 = sysctl_ip_vs_amemthresh /
156                                 (sysctl_ip_vs_amemthresh-availmem);
157                         sysctl_ip_vs_drop_packet = 2;
158                 } else {
159                         ip_vs_drop_rate = 0;
160                 }
161                 break;
162         case 2:
163                 if (nomem) {
164                         ip_vs_drop_rate = ip_vs_drop_counter
165                                 = sysctl_ip_vs_amemthresh /
166                                 (sysctl_ip_vs_amemthresh-availmem);
167                 } else {
168                         ip_vs_drop_rate = 0;
169                         sysctl_ip_vs_drop_packet = 1;
170                 }
171                 break;
172         case 3:
173                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
174                 break;
175         }
176         spin_unlock(&__ip_vs_droppacket_lock);
177
178         /* secure_tcp */
179         write_lock(&__ip_vs_securetcp_lock);
180         switch (sysctl_ip_vs_secure_tcp) {
181         case 0:
182                 if (old_secure_tcp >= 2)
183                         to_change = 0;
184                 break;
185         case 1:
186                 if (nomem) {
187                         if (old_secure_tcp < 2)
188                                 to_change = 1;
189                         sysctl_ip_vs_secure_tcp = 2;
190                 } else {
191                         if (old_secure_tcp >= 2)
192                                 to_change = 0;
193                 }
194                 break;
195         case 2:
196                 if (nomem) {
197                         if (old_secure_tcp < 2)
198                                 to_change = 1;
199                 } else {
200                         if (old_secure_tcp >= 2)
201                                 to_change = 0;
202                         sysctl_ip_vs_secure_tcp = 1;
203                 }
204                 break;
205         case 3:
206                 if (old_secure_tcp < 2)
207                         to_change = 1;
208                 break;
209         }
210         old_secure_tcp = sysctl_ip_vs_secure_tcp;
211         if (to_change >= 0)
212                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
213         write_unlock(&__ip_vs_securetcp_lock);
214
215         local_bh_enable();
216 }
217
218
219 /*
220  *      Timer for checking the defense
221  */
222 #define DEFENSE_TIMER_PERIOD    1*HZ
223 static void defense_work_handler(struct work_struct *work);
224 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
225
226 static void defense_work_handler(struct work_struct *work)
227 {
228         update_defense_level();
229         if (atomic_read(&ip_vs_dropentry))
230                 ip_vs_random_dropentry();
231
232         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
233 }
234
235 int
236 ip_vs_use_count_inc(void)
237 {
238         return try_module_get(THIS_MODULE);
239 }
240
241 void
242 ip_vs_use_count_dec(void)
243 {
244         module_put(THIS_MODULE);
245 }
246
247
248 /*
249  *      Hash table: for virtual service lookups
250  */
251 #define IP_VS_SVC_TAB_BITS 8
252 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
253 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
254
255 /* the service table hashed by <protocol, addr, port> */
256 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
257 /* the service table hashed by fwmark */
258 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
259
260 /*
261  *      Hash table: for real service lookups
262  */
263 #define IP_VS_RTAB_BITS 4
264 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
265 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
266
267 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
268
269 /*
270  *      Trash for destinations
271  */
272 static LIST_HEAD(ip_vs_dest_trash);
273
274 /*
275  *      FTP & NULL virtual service counters
276  */
277 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
278 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
279
280
281 /*
282  *      Returns hash value for virtual service
283  */
284 static __inline__ unsigned
285 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
286 {
287         register unsigned porth = ntohs(port);
288
289         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
290                 & IP_VS_SVC_TAB_MASK;
291 }
292
293 /*
294  *      Returns hash value of fwmark for virtual service lookup
295  */
296 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
297 {
298         return fwmark & IP_VS_SVC_TAB_MASK;
299 }
300
301 /*
302  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
303  *      or in the ip_vs_svc_fwm_table by fwmark.
304  *      Should be called with locked tables.
305  */
306 static int ip_vs_svc_hash(struct ip_vs_service *svc)
307 {
308         unsigned hash;
309
310         if (svc->flags & IP_VS_SVC_F_HASHED) {
311                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
312                           "called from %p\n", __builtin_return_address(0));
313                 return 0;
314         }
315
316         if (svc->fwmark == 0) {
317                 /*
318                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
319                  */
320                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
321                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
322         } else {
323                 /*
324                  *  Hash it by fwmark in ip_vs_svc_fwm_table
325                  */
326                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
327                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
328         }
329
330         svc->flags |= IP_VS_SVC_F_HASHED;
331         /* increase its refcnt because it is referenced by the svc table */
332         atomic_inc(&svc->refcnt);
333         return 1;
334 }
335
336
337 /*
338  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
339  *      Should be called with locked tables.
340  */
341 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
342 {
343         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
344                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
345                           "called from %p\n", __builtin_return_address(0));
346                 return 0;
347         }
348
349         if (svc->fwmark == 0) {
350                 /* Remove it from the ip_vs_svc_table table */
351                 list_del(&svc->s_list);
352         } else {
353                 /* Remove it from the ip_vs_svc_fwm_table table */
354                 list_del(&svc->f_list);
355         }
356
357         svc->flags &= ~IP_VS_SVC_F_HASHED;
358         atomic_dec(&svc->refcnt);
359         return 1;
360 }
361
362
363 /*
364  *      Get service by {proto,addr,port} in the service table.
365  */
366 static __inline__ struct ip_vs_service *
367 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
368 {
369         unsigned hash;
370         struct ip_vs_service *svc;
371
372         /* Check for "full" addressed entries */
373         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
374
375         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
376                 if ((svc->addr == vaddr)
377                     && (svc->port == vport)
378                     && (svc->protocol == protocol)) {
379                         /* HIT */
380                         atomic_inc(&svc->usecnt);
381                         return svc;
382                 }
383         }
384
385         return NULL;
386 }
387
388
389 /*
390  *      Get service by {fwmark} in the service table.
391  */
392 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
393 {
394         unsigned hash;
395         struct ip_vs_service *svc;
396
397         /* Check for fwmark addressed entries */
398         hash = ip_vs_svc_fwm_hashkey(fwmark);
399
400         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
401                 if (svc->fwmark == fwmark) {
402                         /* HIT */
403                         atomic_inc(&svc->usecnt);
404                         return svc;
405                 }
406         }
407
408         return NULL;
409 }
410
411 struct ip_vs_service *
412 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
413 {
414         struct ip_vs_service *svc;
415
416         read_lock(&__ip_vs_svc_lock);
417
418         /*
419          *      Check the table hashed by fwmark first
420          */
421         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
422                 goto out;
423
424         /*
425          *      Check the table hashed by <protocol,addr,port>
426          *      for "full" addressed entries
427          */
428         svc = __ip_vs_service_get(protocol, vaddr, vport);
429
430         if (svc == NULL
431             && protocol == IPPROTO_TCP
432             && atomic_read(&ip_vs_ftpsvc_counter)
433             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
434                 /*
435                  * Check if ftp service entry exists, the packet
436                  * might belong to FTP data connections.
437                  */
438                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
439         }
440
441         if (svc == NULL
442             && atomic_read(&ip_vs_nullsvc_counter)) {
443                 /*
444                  * Check if the catch-all port (port zero) exists
445                  */
446                 svc = __ip_vs_service_get(protocol, vaddr, 0);
447         }
448
449   out:
450         read_unlock(&__ip_vs_svc_lock);
451
452         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
453                   fwmark, ip_vs_proto_name(protocol),
454                   NIPQUAD(vaddr), ntohs(vport),
455                   svc?"hit":"not hit");
456
457         return svc;
458 }
459
460
461 static inline void
462 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
463 {
464         atomic_inc(&svc->refcnt);
465         dest->svc = svc;
466 }
467
468 static inline void
469 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
470 {
471         struct ip_vs_service *svc = dest->svc;
472
473         dest->svc = NULL;
474         if (atomic_dec_and_test(&svc->refcnt))
475                 kfree(svc);
476 }
477
478
479 /*
480  *      Returns hash value for real service
481  */
482 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
483 {
484         register unsigned porth = ntohs(port);
485
486         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
487                 & IP_VS_RTAB_MASK;
488 }
489
490 /*
491  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
492  *      should be called with locked tables.
493  */
494 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
495 {
496         unsigned hash;
497
498         if (!list_empty(&dest->d_list)) {
499                 return 0;
500         }
501
502         /*
503          *      Hash by proto,addr,port,
504          *      which are the parameters of the real service.
505          */
506         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
507         list_add(&dest->d_list, &ip_vs_rtable[hash]);
508
509         return 1;
510 }
511
512 /*
513  *      UNhashes ip_vs_dest from ip_vs_rtable.
514  *      should be called with locked tables.
515  */
516 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
517 {
518         /*
519          * Remove it from the ip_vs_rtable table.
520          */
521         if (!list_empty(&dest->d_list)) {
522                 list_del(&dest->d_list);
523                 INIT_LIST_HEAD(&dest->d_list);
524         }
525
526         return 1;
527 }
528
529 /*
530  *      Lookup real service by <proto,addr,port> in the real service table.
531  */
532 struct ip_vs_dest *
533 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
534 {
535         unsigned hash;
536         struct ip_vs_dest *dest;
537
538         /*
539          *      Check for "full" addressed entries
540          *      Return the first found entry
541          */
542         hash = ip_vs_rs_hashkey(daddr, dport);
543
544         read_lock(&__ip_vs_rs_lock);
545         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
546                 if ((dest->addr == daddr)
547                     && (dest->port == dport)
548                     && ((dest->protocol == protocol) ||
549                         dest->vfwmark)) {
550                         /* HIT */
551                         read_unlock(&__ip_vs_rs_lock);
552                         return dest;
553                 }
554         }
555         read_unlock(&__ip_vs_rs_lock);
556
557         return NULL;
558 }
559
560 /*
561  *      Lookup destination by {addr,port} in the given service
562  */
563 static struct ip_vs_dest *
564 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
565 {
566         struct ip_vs_dest *dest;
567
568         /*
569          * Find the destination for the given service
570          */
571         list_for_each_entry(dest, &svc->destinations, n_list) {
572                 if ((dest->addr == daddr) && (dest->port == dport)) {
573                         /* HIT */
574                         return dest;
575                 }
576         }
577
578         return NULL;
579 }
580
581
582 /*
583  *  Lookup dest by {svc,addr,port} in the destination trash.
584  *  The destination trash is used to hold the destinations that are removed
585  *  from the service table but are still referenced by some conn entries.
586  *  The reason to add the destination trash is when the dest is temporary
587  *  down (either by administrator or by monitor program), the dest can be
588  *  picked back from the trash, the remaining connections to the dest can
589  *  continue, and the counting information of the dest is also useful for
590  *  scheduling.
591  */
592 static struct ip_vs_dest *
593 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
594 {
595         struct ip_vs_dest *dest, *nxt;
596
597         /*
598          * Find the destination in trash
599          */
600         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
601                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
602                           "dest->refcnt=%d\n",
603                           dest->vfwmark,
604                           NIPQUAD(dest->addr), ntohs(dest->port),
605                           atomic_read(&dest->refcnt));
606                 if (dest->addr == daddr &&
607                     dest->port == dport &&
608                     dest->vfwmark == svc->fwmark &&
609                     dest->protocol == svc->protocol &&
610                     (svc->fwmark ||
611                      (dest->vaddr == svc->addr &&
612                       dest->vport == svc->port))) {
613                         /* HIT */
614                         return dest;
615                 }
616
617                 /*
618                  * Try to purge the destination from trash if not referenced
619                  */
620                 if (atomic_read(&dest->refcnt) == 1) {
621                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
622                                   "from trash\n",
623                                   dest->vfwmark,
624                                   NIPQUAD(dest->addr), ntohs(dest->port));
625                         list_del(&dest->n_list);
626                         ip_vs_dst_reset(dest);
627                         __ip_vs_unbind_svc(dest);
628                         kfree(dest);
629                 }
630         }
631
632         return NULL;
633 }
634
635
636 /*
637  *  Clean up all the destinations in the trash
638  *  Called by the ip_vs_control_cleanup()
639  *
640  *  When the ip_vs_control_clearup is activated by ipvs module exit,
641  *  the service tables must have been flushed and all the connections
642  *  are expired, and the refcnt of each destination in the trash must
643  *  be 1, so we simply release them here.
644  */
645 static void ip_vs_trash_cleanup(void)
646 {
647         struct ip_vs_dest *dest, *nxt;
648
649         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
650                 list_del(&dest->n_list);
651                 ip_vs_dst_reset(dest);
652                 __ip_vs_unbind_svc(dest);
653                 kfree(dest);
654         }
655 }
656
657
658 static void
659 ip_vs_zero_stats(struct ip_vs_stats *stats)
660 {
661         spin_lock_bh(&stats->lock);
662         memset(stats, 0, (char *)&stats->lock - (char *)stats);
663         spin_unlock_bh(&stats->lock);
664         ip_vs_zero_estimator(stats);
665 }
666
667 /*
668  *      Update a destination in the given service
669  */
670 static void
671 __ip_vs_update_dest(struct ip_vs_service *svc,
672                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
673 {
674         int conn_flags;
675
676         /* set the weight and the flags */
677         atomic_set(&dest->weight, udest->weight);
678         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
679
680         /* check if local node and update the flags */
681         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
682                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
683                         | IP_VS_CONN_F_LOCALNODE;
684         }
685
686         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
687         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
688                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
689         } else {
690                 /*
691                  *    Put the real service in ip_vs_rtable if not present.
692                  *    For now only for NAT!
693                  */
694                 write_lock_bh(&__ip_vs_rs_lock);
695                 ip_vs_rs_hash(dest);
696                 write_unlock_bh(&__ip_vs_rs_lock);
697         }
698         atomic_set(&dest->conn_flags, conn_flags);
699
700         /* bind the service */
701         if (!dest->svc) {
702                 __ip_vs_bind_svc(dest, svc);
703         } else {
704                 if (dest->svc != svc) {
705                         __ip_vs_unbind_svc(dest);
706                         ip_vs_zero_stats(&dest->stats);
707                         __ip_vs_bind_svc(dest, svc);
708                 }
709         }
710
711         /* set the dest status flags */
712         dest->flags |= IP_VS_DEST_F_AVAILABLE;
713
714         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
715                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
716         dest->u_threshold = udest->u_threshold;
717         dest->l_threshold = udest->l_threshold;
718 }
719
720
721 /*
722  *      Create a destination for the given service
723  */
724 static int
725 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
726                struct ip_vs_dest **dest_p)
727 {
728         struct ip_vs_dest *dest;
729         unsigned atype;
730
731         EnterFunction(2);
732
733         atype = inet_addr_type(udest->addr);
734         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
735                 return -EINVAL;
736
737         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
738         if (dest == NULL) {
739                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
740                 return -ENOMEM;
741         }
742
743         dest->protocol = svc->protocol;
744         dest->vaddr = svc->addr;
745         dest->vport = svc->port;
746         dest->vfwmark = svc->fwmark;
747         dest->addr = udest->addr;
748         dest->port = udest->port;
749
750         atomic_set(&dest->activeconns, 0);
751         atomic_set(&dest->inactconns, 0);
752         atomic_set(&dest->persistconns, 0);
753         atomic_set(&dest->refcnt, 0);
754
755         INIT_LIST_HEAD(&dest->d_list);
756         spin_lock_init(&dest->dst_lock);
757         spin_lock_init(&dest->stats.lock);
758         __ip_vs_update_dest(svc, dest, udest);
759         ip_vs_new_estimator(&dest->stats);
760
761         *dest_p = dest;
762
763         LeaveFunction(2);
764         return 0;
765 }
766
767
768 /*
769  *      Add a destination into an existing service
770  */
771 static int
772 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
773 {
774         struct ip_vs_dest *dest;
775         __be32 daddr = udest->addr;
776         __be16 dport = udest->port;
777         int ret;
778
779         EnterFunction(2);
780
781         if (udest->weight < 0) {
782                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
783                 return -ERANGE;
784         }
785
786         if (udest->l_threshold > udest->u_threshold) {
787                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
788                           "upper threshold\n");
789                 return -ERANGE;
790         }
791
792         /*
793          * Check if the dest already exists in the list
794          */
795         dest = ip_vs_lookup_dest(svc, daddr, dport);
796         if (dest != NULL) {
797                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
798                 return -EEXIST;
799         }
800
801         /*
802          * Check if the dest already exists in the trash and
803          * is from the same service
804          */
805         dest = ip_vs_trash_get_dest(svc, daddr, dport);
806         if (dest != NULL) {
807                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
808                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
809                           NIPQUAD(daddr), ntohs(dport),
810                           atomic_read(&dest->refcnt),
811                           dest->vfwmark,
812                           NIPQUAD(dest->vaddr),
813                           ntohs(dest->vport));
814                 __ip_vs_update_dest(svc, dest, udest);
815
816                 /*
817                  * Get the destination from the trash
818                  */
819                 list_del(&dest->n_list);
820
821                 ip_vs_new_estimator(&dest->stats);
822
823                 write_lock_bh(&__ip_vs_svc_lock);
824
825                 /*
826                  * Wait until all other svc users go away.
827                  */
828                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
829
830                 list_add(&dest->n_list, &svc->destinations);
831                 svc->num_dests++;
832
833                 /* call the update_service function of its scheduler */
834                 svc->scheduler->update_service(svc);
835
836                 write_unlock_bh(&__ip_vs_svc_lock);
837                 return 0;
838         }
839
840         /*
841          * Allocate and initialize the dest structure
842          */
843         ret = ip_vs_new_dest(svc, udest, &dest);
844         if (ret) {
845                 return ret;
846         }
847
848         /*
849          * Add the dest entry into the list
850          */
851         atomic_inc(&dest->refcnt);
852
853         write_lock_bh(&__ip_vs_svc_lock);
854
855         /*
856          * Wait until all other svc users go away.
857          */
858         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
859
860         list_add(&dest->n_list, &svc->destinations);
861         svc->num_dests++;
862
863         /* call the update_service function of its scheduler */
864         svc->scheduler->update_service(svc);
865
866         write_unlock_bh(&__ip_vs_svc_lock);
867
868         LeaveFunction(2);
869
870         return 0;
871 }
872
873
874 /*
875  *      Edit a destination in the given service
876  */
877 static int
878 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
879 {
880         struct ip_vs_dest *dest;
881         __be32 daddr = udest->addr;
882         __be16 dport = udest->port;
883
884         EnterFunction(2);
885
886         if (udest->weight < 0) {
887                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
888                 return -ERANGE;
889         }
890
891         if (udest->l_threshold > udest->u_threshold) {
892                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
893                           "upper threshold\n");
894                 return -ERANGE;
895         }
896
897         /*
898          *  Lookup the destination list
899          */
900         dest = ip_vs_lookup_dest(svc, daddr, dport);
901         if (dest == NULL) {
902                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
903                 return -ENOENT;
904         }
905
906         __ip_vs_update_dest(svc, dest, udest);
907
908         write_lock_bh(&__ip_vs_svc_lock);
909
910         /* Wait until all other svc users go away */
911         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
912
913         /* call the update_service, because server weight may be changed */
914         svc->scheduler->update_service(svc);
915
916         write_unlock_bh(&__ip_vs_svc_lock);
917
918         LeaveFunction(2);
919
920         return 0;
921 }
922
923
924 /*
925  *      Delete a destination (must be already unlinked from the service)
926  */
927 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
928 {
929         ip_vs_kill_estimator(&dest->stats);
930
931         /*
932          *  Remove it from the d-linked list with the real services.
933          */
934         write_lock_bh(&__ip_vs_rs_lock);
935         ip_vs_rs_unhash(dest);
936         write_unlock_bh(&__ip_vs_rs_lock);
937
938         /*
939          *  Decrease the refcnt of the dest, and free the dest
940          *  if nobody refers to it (refcnt=0). Otherwise, throw
941          *  the destination into the trash.
942          */
943         if (atomic_dec_and_test(&dest->refcnt)) {
944                 ip_vs_dst_reset(dest);
945                 /* simply decrease svc->refcnt here, let the caller check
946                    and release the service if nobody refers to it.
947                    Only user context can release destination and service,
948                    and only one user context can update virtual service at a
949                    time, so the operation here is OK */
950                 atomic_dec(&dest->svc->refcnt);
951                 kfree(dest);
952         } else {
953                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
954                           "dest->refcnt=%d\n",
955                           NIPQUAD(dest->addr), ntohs(dest->port),
956                           atomic_read(&dest->refcnt));
957                 list_add(&dest->n_list, &ip_vs_dest_trash);
958                 atomic_inc(&dest->refcnt);
959         }
960 }
961
962
963 /*
964  *      Unlink a destination from the given service
965  */
966 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
967                                 struct ip_vs_dest *dest,
968                                 int svcupd)
969 {
970         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
971
972         /*
973          *  Remove it from the d-linked destination list.
974          */
975         list_del(&dest->n_list);
976         svc->num_dests--;
977         if (svcupd) {
978                 /*
979                  *  Call the update_service function of its scheduler
980                  */
981                 svc->scheduler->update_service(svc);
982         }
983 }
984
985
986 /*
987  *      Delete a destination server in the given service
988  */
989 static int
990 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
991 {
992         struct ip_vs_dest *dest;
993         __be32 daddr = udest->addr;
994         __be16 dport = udest->port;
995
996         EnterFunction(2);
997
998         dest = ip_vs_lookup_dest(svc, daddr, dport);
999         if (dest == NULL) {
1000                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1001                 return -ENOENT;
1002         }
1003
1004         write_lock_bh(&__ip_vs_svc_lock);
1005
1006         /*
1007          *      Wait until all other svc users go away.
1008          */
1009         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1010
1011         /*
1012          *      Unlink dest from the service
1013          */
1014         __ip_vs_unlink_dest(svc, dest, 1);
1015
1016         write_unlock_bh(&__ip_vs_svc_lock);
1017
1018         /*
1019          *      Delete the destination
1020          */
1021         __ip_vs_del_dest(dest);
1022
1023         LeaveFunction(2);
1024
1025         return 0;
1026 }
1027
1028
1029 /*
1030  *      Add a service into the service hash table
1031  */
1032 static int
1033 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1034 {
1035         int ret = 0;
1036         struct ip_vs_scheduler *sched = NULL;
1037         struct ip_vs_service *svc = NULL;
1038
1039         /* increase the module use count */
1040         ip_vs_use_count_inc();
1041
1042         /* Lookup the scheduler by 'u->sched_name' */
1043         sched = ip_vs_scheduler_get(u->sched_name);
1044         if (sched == NULL) {
1045                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1046                            u->sched_name);
1047                 ret = -ENOENT;
1048                 goto out_mod_dec;
1049         }
1050
1051         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1052         if (svc == NULL) {
1053                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1054                 ret = -ENOMEM;
1055                 goto out_err;
1056         }
1057
1058         /* I'm the first user of the service */
1059         atomic_set(&svc->usecnt, 1);
1060         atomic_set(&svc->refcnt, 0);
1061
1062         svc->protocol = u->protocol;
1063         svc->addr = u->addr;
1064         svc->port = u->port;
1065         svc->fwmark = u->fwmark;
1066         svc->flags = u->flags;
1067         svc->timeout = u->timeout * HZ;
1068         svc->netmask = u->netmask;
1069
1070         INIT_LIST_HEAD(&svc->destinations);
1071         rwlock_init(&svc->sched_lock);
1072         spin_lock_init(&svc->stats.lock);
1073
1074         /* Bind the scheduler */
1075         ret = ip_vs_bind_scheduler(svc, sched);
1076         if (ret)
1077                 goto out_err;
1078         sched = NULL;
1079
1080         /* Update the virtual service counters */
1081         if (svc->port == FTPPORT)
1082                 atomic_inc(&ip_vs_ftpsvc_counter);
1083         else if (svc->port == 0)
1084                 atomic_inc(&ip_vs_nullsvc_counter);
1085
1086         ip_vs_new_estimator(&svc->stats);
1087         ip_vs_num_services++;
1088
1089         /* Hash the service into the service table */
1090         write_lock_bh(&__ip_vs_svc_lock);
1091         ip_vs_svc_hash(svc);
1092         write_unlock_bh(&__ip_vs_svc_lock);
1093
1094         *svc_p = svc;
1095         return 0;
1096
1097   out_err:
1098         if (svc != NULL) {
1099                 if (svc->scheduler)
1100                         ip_vs_unbind_scheduler(svc);
1101                 if (svc->inc) {
1102                         local_bh_disable();
1103                         ip_vs_app_inc_put(svc->inc);
1104                         local_bh_enable();
1105                 }
1106                 kfree(svc);
1107         }
1108         ip_vs_scheduler_put(sched);
1109
1110   out_mod_dec:
1111         /* decrease the module use count */
1112         ip_vs_use_count_dec();
1113
1114         return ret;
1115 }
1116
1117
1118 /*
1119  *      Edit a service and bind it with a new scheduler
1120  */
1121 static int
1122 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1123 {
1124         struct ip_vs_scheduler *sched, *old_sched;
1125         int ret = 0;
1126
1127         /*
1128          * Lookup the scheduler, by 'u->sched_name'
1129          */
1130         sched = ip_vs_scheduler_get(u->sched_name);
1131         if (sched == NULL) {
1132                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1133                            u->sched_name);
1134                 return -ENOENT;
1135         }
1136         old_sched = sched;
1137
1138         write_lock_bh(&__ip_vs_svc_lock);
1139
1140         /*
1141          * Wait until all other svc users go away.
1142          */
1143         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1144
1145         /*
1146          * Set the flags and timeout value
1147          */
1148         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1149         svc->timeout = u->timeout * HZ;
1150         svc->netmask = u->netmask;
1151
1152         old_sched = svc->scheduler;
1153         if (sched != old_sched) {
1154                 /*
1155                  * Unbind the old scheduler
1156                  */
1157                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1158                         old_sched = sched;
1159                         goto out;
1160                 }
1161
1162                 /*
1163                  * Bind the new scheduler
1164                  */
1165                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1166                         /*
1167                          * If ip_vs_bind_scheduler fails, restore the old
1168                          * scheduler.
1169                          * The main reason of failure is out of memory.
1170                          *
1171                          * The question is if the old scheduler can be
1172                          * restored all the time. TODO: if it cannot be
1173                          * restored some time, we must delete the service,
1174                          * otherwise the system may crash.
1175                          */
1176                         ip_vs_bind_scheduler(svc, old_sched);
1177                         old_sched = sched;
1178                         goto out;
1179                 }
1180         }
1181
1182   out:
1183         write_unlock_bh(&__ip_vs_svc_lock);
1184
1185         if (old_sched)
1186                 ip_vs_scheduler_put(old_sched);
1187
1188         return ret;
1189 }
1190
1191
1192 /*
1193  *      Delete a service from the service list
1194  *      - The service must be unlinked, unlocked and not referenced!
1195  *      - We are called under _bh lock
1196  */
1197 static void __ip_vs_del_service(struct ip_vs_service *svc)
1198 {
1199         struct ip_vs_dest *dest, *nxt;
1200         struct ip_vs_scheduler *old_sched;
1201
1202         ip_vs_num_services--;
1203         ip_vs_kill_estimator(&svc->stats);
1204
1205         /* Unbind scheduler */
1206         old_sched = svc->scheduler;
1207         ip_vs_unbind_scheduler(svc);
1208         if (old_sched)
1209                 ip_vs_scheduler_put(old_sched);
1210
1211         /* Unbind app inc */
1212         if (svc->inc) {
1213                 ip_vs_app_inc_put(svc->inc);
1214                 svc->inc = NULL;
1215         }
1216
1217         /*
1218          *    Unlink the whole destination list
1219          */
1220         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1221                 __ip_vs_unlink_dest(svc, dest, 0);
1222                 __ip_vs_del_dest(dest);
1223         }
1224
1225         /*
1226          *    Update the virtual service counters
1227          */
1228         if (svc->port == FTPPORT)
1229                 atomic_dec(&ip_vs_ftpsvc_counter);
1230         else if (svc->port == 0)
1231                 atomic_dec(&ip_vs_nullsvc_counter);
1232
1233         /*
1234          *    Free the service if nobody refers to it
1235          */
1236         if (atomic_read(&svc->refcnt) == 0)
1237                 kfree(svc);
1238
1239         /* decrease the module use count */
1240         ip_vs_use_count_dec();
1241 }
1242
1243 /*
1244  *      Delete a service from the service list
1245  */
1246 static int ip_vs_del_service(struct ip_vs_service *svc)
1247 {
1248         if (svc == NULL)
1249                 return -EEXIST;
1250
1251         /*
1252          * Unhash it from the service table
1253          */
1254         write_lock_bh(&__ip_vs_svc_lock);
1255
1256         ip_vs_svc_unhash(svc);
1257
1258         /*
1259          * Wait until all the svc users go away.
1260          */
1261         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1262
1263         __ip_vs_del_service(svc);
1264
1265         write_unlock_bh(&__ip_vs_svc_lock);
1266
1267         return 0;
1268 }
1269
1270
1271 /*
1272  *      Flush all the virtual services
1273  */
1274 static int ip_vs_flush(void)
1275 {
1276         int idx;
1277         struct ip_vs_service *svc, *nxt;
1278
1279         /*
1280          * Flush the service table hashed by <protocol,addr,port>
1281          */
1282         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1283                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1284                         write_lock_bh(&__ip_vs_svc_lock);
1285                         ip_vs_svc_unhash(svc);
1286                         /*
1287                          * Wait until all the svc users go away.
1288                          */
1289                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1290                         __ip_vs_del_service(svc);
1291                         write_unlock_bh(&__ip_vs_svc_lock);
1292                 }
1293         }
1294
1295         /*
1296          * Flush the service table hashed by fwmark
1297          */
1298         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1299                 list_for_each_entry_safe(svc, nxt,
1300                                          &ip_vs_svc_fwm_table[idx], f_list) {
1301                         write_lock_bh(&__ip_vs_svc_lock);
1302                         ip_vs_svc_unhash(svc);
1303                         /*
1304                          * Wait until all the svc users go away.
1305                          */
1306                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1307                         __ip_vs_del_service(svc);
1308                         write_unlock_bh(&__ip_vs_svc_lock);
1309                 }
1310         }
1311
1312         return 0;
1313 }
1314
1315
1316 /*
1317  *      Zero counters in a service or all services
1318  */
1319 static int ip_vs_zero_service(struct ip_vs_service *svc)
1320 {
1321         struct ip_vs_dest *dest;
1322
1323         write_lock_bh(&__ip_vs_svc_lock);
1324         list_for_each_entry(dest, &svc->destinations, n_list) {
1325                 ip_vs_zero_stats(&dest->stats);
1326         }
1327         ip_vs_zero_stats(&svc->stats);
1328         write_unlock_bh(&__ip_vs_svc_lock);
1329         return 0;
1330 }
1331
1332 static int ip_vs_zero_all(void)
1333 {
1334         int idx;
1335         struct ip_vs_service *svc;
1336
1337         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1338                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1339                         ip_vs_zero_service(svc);
1340                 }
1341         }
1342
1343         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1344                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1345                         ip_vs_zero_service(svc);
1346                 }
1347         }
1348
1349         ip_vs_zero_stats(&ip_vs_stats);
1350         return 0;
1351 }
1352
1353
1354 static int
1355 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1356                      void __user *buffer, size_t *lenp, loff_t *ppos)
1357 {
1358         int *valp = table->data;
1359         int val = *valp;
1360         int rc;
1361
1362         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1363         if (write && (*valp != val)) {
1364                 if ((*valp < 0) || (*valp > 3)) {
1365                         /* Restore the correct value */
1366                         *valp = val;
1367                 } else {
1368                         update_defense_level();
1369                 }
1370         }
1371         return rc;
1372 }
1373
1374
1375 static int
1376 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1377                        void __user *buffer, size_t *lenp, loff_t *ppos)
1378 {
1379         int *valp = table->data;
1380         int val[2];
1381         int rc;
1382
1383         /* backup the value first */
1384         memcpy(val, valp, sizeof(val));
1385
1386         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1387         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1388                 /* Restore the correct value */
1389                 memcpy(valp, val, sizeof(val));
1390         }
1391         return rc;
1392 }
1393
1394
1395 /*
1396  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1397  */
1398
1399 static struct ctl_table vs_vars[] = {
1400         {
1401                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1402                 .procname       = "amemthresh",
1403                 .data           = &sysctl_ip_vs_amemthresh,
1404                 .maxlen         = sizeof(int),
1405                 .mode           = 0644,
1406                 .proc_handler   = &proc_dointvec,
1407         },
1408 #ifdef CONFIG_IP_VS_DEBUG
1409         {
1410                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1411                 .procname       = "debug_level",
1412                 .data           = &sysctl_ip_vs_debug_level,
1413                 .maxlen         = sizeof(int),
1414                 .mode           = 0644,
1415                 .proc_handler   = &proc_dointvec,
1416         },
1417 #endif
1418         {
1419                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1420                 .procname       = "am_droprate",
1421                 .data           = &sysctl_ip_vs_am_droprate,
1422                 .maxlen         = sizeof(int),
1423                 .mode           = 0644,
1424                 .proc_handler   = &proc_dointvec,
1425         },
1426         {
1427                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1428                 .procname       = "drop_entry",
1429                 .data           = &sysctl_ip_vs_drop_entry,
1430                 .maxlen         = sizeof(int),
1431                 .mode           = 0644,
1432                 .proc_handler   = &proc_do_defense_mode,
1433         },
1434         {
1435                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1436                 .procname       = "drop_packet",
1437                 .data           = &sysctl_ip_vs_drop_packet,
1438                 .maxlen         = sizeof(int),
1439                 .mode           = 0644,
1440                 .proc_handler   = &proc_do_defense_mode,
1441         },
1442         {
1443                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1444                 .procname       = "secure_tcp",
1445                 .data           = &sysctl_ip_vs_secure_tcp,
1446                 .maxlen         = sizeof(int),
1447                 .mode           = 0644,
1448                 .proc_handler   = &proc_do_defense_mode,
1449         },
1450 #if 0
1451         {
1452                 .ctl_name       = NET_IPV4_VS_TO_ES,
1453                 .procname       = "timeout_established",
1454                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1455                 .maxlen         = sizeof(int),
1456                 .mode           = 0644,
1457                 .proc_handler   = &proc_dointvec_jiffies,
1458         },
1459         {
1460                 .ctl_name       = NET_IPV4_VS_TO_SS,
1461                 .procname       = "timeout_synsent",
1462                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1463                 .maxlen         = sizeof(int),
1464                 .mode           = 0644,
1465                 .proc_handler   = &proc_dointvec_jiffies,
1466         },
1467         {
1468                 .ctl_name       = NET_IPV4_VS_TO_SR,
1469                 .procname       = "timeout_synrecv",
1470                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1471                 .maxlen         = sizeof(int),
1472                 .mode           = 0644,
1473                 .proc_handler   = &proc_dointvec_jiffies,
1474         },
1475         {
1476                 .ctl_name       = NET_IPV4_VS_TO_FW,
1477                 .procname       = "timeout_finwait",
1478                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1479                 .maxlen         = sizeof(int),
1480                 .mode           = 0644,
1481                 .proc_handler   = &proc_dointvec_jiffies,
1482         },
1483         {
1484                 .ctl_name       = NET_IPV4_VS_TO_TW,
1485                 .procname       = "timeout_timewait",
1486                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1487                 .maxlen         = sizeof(int),
1488                 .mode           = 0644,
1489                 .proc_handler   = &proc_dointvec_jiffies,
1490         },
1491         {
1492                 .ctl_name       = NET_IPV4_VS_TO_CL,
1493                 .procname       = "timeout_close",
1494                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1495                 .maxlen         = sizeof(int),
1496                 .mode           = 0644,
1497                 .proc_handler   = &proc_dointvec_jiffies,
1498         },
1499         {
1500                 .ctl_name       = NET_IPV4_VS_TO_CW,
1501                 .procname       = "timeout_closewait",
1502                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1503                 .maxlen         = sizeof(int),
1504                 .mode           = 0644,
1505                 .proc_handler   = &proc_dointvec_jiffies,
1506         },
1507         {
1508                 .ctl_name       = NET_IPV4_VS_TO_LA,
1509                 .procname       = "timeout_lastack",
1510                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1511                 .maxlen         = sizeof(int),
1512                 .mode           = 0644,
1513                 .proc_handler   = &proc_dointvec_jiffies,
1514         },
1515         {
1516                 .ctl_name       = NET_IPV4_VS_TO_LI,
1517                 .procname       = "timeout_listen",
1518                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1519                 .maxlen         = sizeof(int),
1520                 .mode           = 0644,
1521                 .proc_handler   = &proc_dointvec_jiffies,
1522         },
1523         {
1524                 .ctl_name       = NET_IPV4_VS_TO_SA,
1525                 .procname       = "timeout_synack",
1526                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1527                 .maxlen         = sizeof(int),
1528                 .mode           = 0644,
1529                 .proc_handler   = &proc_dointvec_jiffies,
1530         },
1531         {
1532                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1533                 .procname       = "timeout_udp",
1534                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1535                 .maxlen         = sizeof(int),
1536                 .mode           = 0644,
1537                 .proc_handler   = &proc_dointvec_jiffies,
1538         },
1539         {
1540                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1541                 .procname       = "timeout_icmp",
1542                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1543                 .maxlen         = sizeof(int),
1544                 .mode           = 0644,
1545                 .proc_handler   = &proc_dointvec_jiffies,
1546         },
1547 #endif
1548         {
1549                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1550                 .procname       = "cache_bypass",
1551                 .data           = &sysctl_ip_vs_cache_bypass,
1552                 .maxlen         = sizeof(int),
1553                 .mode           = 0644,
1554                 .proc_handler   = &proc_dointvec,
1555         },
1556         {
1557                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1558                 .procname       = "expire_nodest_conn",
1559                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1560                 .maxlen         = sizeof(int),
1561                 .mode           = 0644,
1562                 .proc_handler   = &proc_dointvec,
1563         },
1564         {
1565                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1566                 .procname       = "expire_quiescent_template",
1567                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1568                 .maxlen         = sizeof(int),
1569                 .mode           = 0644,
1570                 .proc_handler   = &proc_dointvec,
1571         },
1572         {
1573                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1574                 .procname       = "sync_threshold",
1575                 .data           = &sysctl_ip_vs_sync_threshold,
1576                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1577                 .mode           = 0644,
1578                 .proc_handler   = &proc_do_sync_threshold,
1579         },
1580         {
1581                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1582                 .procname       = "nat_icmp_send",
1583                 .data           = &sysctl_ip_vs_nat_icmp_send,
1584                 .maxlen         = sizeof(int),
1585                 .mode           = 0644,
1586                 .proc_handler   = &proc_dointvec,
1587         },
1588         { .ctl_name = 0 }
1589 };
1590
1591 static ctl_table vs_table[] = {
1592         {
1593                 .ctl_name       = NET_IPV4_VS,
1594                 .procname       = "vs",
1595                 .mode           = 0555,
1596                 .child          = vs_vars
1597         },
1598         { .ctl_name = 0 }
1599 };
1600
1601 static ctl_table ipvs_ipv4_table[] = {
1602         {
1603                 .ctl_name       = NET_IPV4,
1604                 .procname       = "ipv4",
1605                 .mode           = 0555,
1606                 .child          = vs_table,
1607         },
1608         { .ctl_name = 0 }
1609 };
1610
1611 static ctl_table vs_root_table[] = {
1612         {
1613                 .ctl_name       = CTL_NET,
1614                 .procname       = "net",
1615                 .mode           = 0555,
1616                 .child          = ipvs_ipv4_table,
1617         },
1618         { .ctl_name = 0 }
1619 };
1620
/* sysctl table header; presumably filled in when vs_root_table is
   registered elsewhere in this file -- confirm against the init code */
static struct ctl_table_header *sysctl_header;
1622
1623 #ifdef CONFIG_PROC_FS
1624
/*
 *      Cursor kept in seq->private while walking the service tables.
 */
struct ip_vs_iter {
        struct list_head *table;        /* ip_vs_svc_table or ip_vs_svc_fwm_table */
        int bucket;                     /* index of the current hash bucket */
};
1629
1630 /*
1631  *      Write the contents of the VS rule table to a PROCfs file.
1632  *      (It is kept just for backward compatibility)
1633  */
1634 static inline const char *ip_vs_fwd_name(unsigned flags)
1635 {
1636         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1637         case IP_VS_CONN_F_LOCALNODE:
1638                 return "Local";
1639         case IP_VS_CONN_F_TUNNEL:
1640                 return "Tunnel";
1641         case IP_VS_CONN_F_DROUTE:
1642                 return "Route";
1643         default:
1644                 return "Masq";
1645         }
1646 }
1647
1648
1649 /* Get the Nth entry in the two lists */
1650 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1651 {
1652         struct ip_vs_iter *iter = seq->private;
1653         int idx;
1654         struct ip_vs_service *svc;
1655
1656         /* look in hash by protocol */
1657         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1658                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1659                         if (pos-- == 0){
1660                                 iter->table = ip_vs_svc_table;
1661                                 iter->bucket = idx;
1662                                 return svc;
1663                         }
1664                 }
1665         }
1666
1667         /* keep looking in fwmark */
1668         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1669                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1670                         if (pos-- == 0) {
1671                                 iter->table = ip_vs_svc_fwm_table;
1672                                 iter->bucket = idx;
1673                                 return svc;
1674                         }
1675                 }
1676         }
1677
1678         return NULL;
1679 }
1680
1681 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1682 {
1683
1684         read_lock_bh(&__ip_vs_svc_lock);
1685         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1686 }
1687
1688
1689 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1690 {
1691         struct list_head *e;
1692         struct ip_vs_iter *iter;
1693         struct ip_vs_service *svc;
1694
1695         ++*pos;
1696         if (v == SEQ_START_TOKEN)
1697                 return ip_vs_info_array(seq,0);
1698
1699         svc = v;
1700         iter = seq->private;
1701
1702         if (iter->table == ip_vs_svc_table) {
1703                 /* next service in table hashed by protocol */
1704                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1705                         return list_entry(e, struct ip_vs_service, s_list);
1706
1707
1708                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1709                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1710                                             s_list) {
1711                                 return svc;
1712                         }
1713                 }
1714
1715                 iter->table = ip_vs_svc_fwm_table;
1716                 iter->bucket = -1;
1717                 goto scan_fwmark;
1718         }
1719
1720         /* next service in hashed by fwmark */
1721         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1722                 return list_entry(e, struct ip_vs_service, f_list);
1723
1724  scan_fwmark:
1725         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1726                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1727                                     f_list)
1728                         return svc;
1729         }
1730
1731         return NULL;
1732 }
1733
1734 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1735 {
1736         read_unlock_bh(&__ip_vs_svc_lock);
1737 }
1738
1739
1740 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1741 {
1742         if (v == SEQ_START_TOKEN) {
1743                 seq_printf(seq,
1744                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1745                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1746                 seq_puts(seq,
1747                          "Prot LocalAddress:Port Scheduler Flags\n");
1748                 seq_puts(seq,
1749                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1750         } else {
1751                 const struct ip_vs_service *svc = v;
1752                 const struct ip_vs_iter *iter = seq->private;
1753                 const struct ip_vs_dest *dest;
1754
1755                 if (iter->table == ip_vs_svc_table)
1756                         seq_printf(seq, "%s  %08X:%04X %s ",
1757                                    ip_vs_proto_name(svc->protocol),
1758                                    ntohl(svc->addr),
1759                                    ntohs(svc->port),
1760                                    svc->scheduler->name);
1761                 else
1762                         seq_printf(seq, "FWM  %08X %s ",
1763                                    svc->fwmark, svc->scheduler->name);
1764
1765                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1766                         seq_printf(seq, "persistent %d %08X\n",
1767                                 svc->timeout,
1768                                 ntohl(svc->netmask));
1769                 else
1770                         seq_putc(seq, '\n');
1771
1772                 list_for_each_entry(dest, &svc->destinations, n_list) {
1773                         seq_printf(seq,
1774                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1775                                    ntohl(dest->addr), ntohs(dest->port),
1776                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1777                                    atomic_read(&dest->weight),
1778                                    atomic_read(&dest->activeconns),
1779                                    atomic_read(&dest->inactconns));
1780                 }
1781         }
1782         return 0;
1783 }
1784
1785 static const struct seq_operations ip_vs_info_seq_ops = {
1786         .start = ip_vs_info_seq_start,
1787         .next  = ip_vs_info_seq_next,
1788         .stop  = ip_vs_info_seq_stop,
1789         .show  = ip_vs_info_seq_show,
1790 };
1791
1792 static int ip_vs_info_open(struct inode *inode, struct file *file)
1793 {
1794         struct seq_file *seq;
1795         int rc = -ENOMEM;
1796         struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);
1797
1798         if (!s)
1799                 goto out;
1800
1801         rc = seq_open(file, &ip_vs_info_seq_ops);
1802         if (rc)
1803                 goto out_kfree;
1804
1805         seq          = file->private_data;
1806         seq->private = s;
1807 out:
1808         return rc;
1809 out_kfree:
1810         kfree(s);
1811         goto out;
1812 }
1813
1814 static const struct file_operations ip_vs_info_fops = {
1815         .owner   = THIS_MODULE,
1816         .open    = ip_vs_info_open,
1817         .read    = seq_read,
1818         .llseek  = seq_lseek,
1819         .release = seq_release_private,
1820 };
1821
1822 #endif
1823
1824 struct ip_vs_stats ip_vs_stats;
1825
1826 #ifdef CONFIG_PROC_FS
1827 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1828 {
1829
1830 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1831         seq_puts(seq,
1832                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1833         seq_printf(seq,
1834                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1835
1836         spin_lock_bh(&ip_vs_stats.lock);
1837         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1838                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1839                    (unsigned long long) ip_vs_stats.inbytes,
1840                    (unsigned long long) ip_vs_stats.outbytes);
1841
1842 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1843         seq_puts(seq,
1844                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1845         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1846                         ip_vs_stats.cps,
1847                         ip_vs_stats.inpps,
1848                         ip_vs_stats.outpps,
1849                         ip_vs_stats.inbps,
1850                         ip_vs_stats.outbps);
1851         spin_unlock_bh(&ip_vs_stats.lock);
1852
1853         return 0;
1854 }
1855
1856 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1857 {
1858         return single_open(file, ip_vs_stats_show, NULL);
1859 }
1860
1861 static const struct file_operations ip_vs_stats_fops = {
1862         .owner = THIS_MODULE,
1863         .open = ip_vs_stats_seq_open,
1864         .read = seq_read,
1865         .llseek = seq_lseek,
1866         .release = single_release,
1867 };
1868
1869 #endif
1870
1871 /*
1872  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1873  */
1874 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1875 {
1876         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1877                   u->tcp_timeout,
1878                   u->tcp_fin_timeout,
1879                   u->udp_timeout);
1880
1881 #ifdef CONFIG_IP_VS_PROTO_TCP
1882         if (u->tcp_timeout) {
1883                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1884                         = u->tcp_timeout * HZ;
1885         }
1886
1887         if (u->tcp_fin_timeout) {
1888                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1889                         = u->tcp_fin_timeout * HZ;
1890         }
1891 #endif
1892
1893 #ifdef CONFIG_IP_VS_PROTO_UDP
1894         if (u->udp_timeout) {
1895                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1896                         = u->udp_timeout * HZ;
1897         }
1898 #endif
1899         return 0;
1900 }
1901
1902
1903 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1904 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1905 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1906                                  sizeof(struct ip_vs_dest_user))
1907 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1908 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1909 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1910
1911 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1912         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1913         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1914         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1915         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1916         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1917         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1923 };
1924
1925 static int
1926 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1927 {
1928         int ret;
1929         unsigned char arg[MAX_ARG_LEN];
1930         struct ip_vs_service_user *usvc;
1931         struct ip_vs_service *svc;
1932         struct ip_vs_dest_user *udest;
1933
1934         if (!capable(CAP_NET_ADMIN))
1935                 return -EPERM;
1936
1937         if (len != set_arglen[SET_CMDID(cmd)]) {
1938                 IP_VS_ERR("set_ctl: len %u != %u\n",
1939                           len, set_arglen[SET_CMDID(cmd)]);
1940                 return -EINVAL;
1941         }
1942
1943         if (copy_from_user(arg, user, len) != 0)
1944                 return -EFAULT;
1945
1946         /* increase the module use count */
1947         ip_vs_use_count_inc();
1948
1949         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1950                 ret = -ERESTARTSYS;
1951                 goto out_dec;
1952         }
1953
1954         if (cmd == IP_VS_SO_SET_FLUSH) {
1955                 /* Flush the virtual service */
1956                 ret = ip_vs_flush();
1957                 goto out_unlock;
1958         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1959                 /* Set timeout values for (tcp tcpfin udp) */
1960                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1961                 goto out_unlock;
1962         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1963                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1964                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1965                 goto out_unlock;
1966         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1967                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1968                 ret = stop_sync_thread(dm->state);
1969                 goto out_unlock;
1970         }
1971
1972         usvc = (struct ip_vs_service_user *)arg;
1973         udest = (struct ip_vs_dest_user *)(usvc + 1);
1974
1975         if (cmd == IP_VS_SO_SET_ZERO) {
1976                 /* if no service address is set, zero counters in all */
1977                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1978                         ret = ip_vs_zero_all();
1979                         goto out_unlock;
1980                 }
1981         }
1982
1983         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1984         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1985                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1986                           usvc->protocol, NIPQUAD(usvc->addr),
1987                           ntohs(usvc->port), usvc->sched_name);
1988                 ret = -EFAULT;
1989                 goto out_unlock;
1990         }
1991
1992         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1993         if (usvc->fwmark == 0)
1994                 svc = __ip_vs_service_get(usvc->protocol,
1995                                           usvc->addr, usvc->port);
1996         else
1997                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1998
1999         if (cmd != IP_VS_SO_SET_ADD
2000             && (svc == NULL || svc->protocol != usvc->protocol)) {
2001                 ret = -ESRCH;
2002                 goto out_unlock;
2003         }
2004
2005         switch (cmd) {
2006         case IP_VS_SO_SET_ADD:
2007                 if (svc != NULL)
2008                         ret = -EEXIST;
2009                 else
2010                         ret = ip_vs_add_service(usvc, &svc);
2011                 break;
2012         case IP_VS_SO_SET_EDIT:
2013                 ret = ip_vs_edit_service(svc, usvc);
2014                 break;
2015         case IP_VS_SO_SET_DEL:
2016                 ret = ip_vs_del_service(svc);
2017                 if (!ret)
2018                         goto out_unlock;
2019                 break;
2020         case IP_VS_SO_SET_ZERO:
2021                 ret = ip_vs_zero_service(svc);
2022                 break;
2023         case IP_VS_SO_SET_ADDDEST:
2024                 ret = ip_vs_add_dest(svc, udest);
2025                 break;
2026         case IP_VS_SO_SET_EDITDEST:
2027                 ret = ip_vs_edit_dest(svc, udest);
2028                 break;
2029         case IP_VS_SO_SET_DELDEST:
2030                 ret = ip_vs_del_dest(svc, udest);
2031                 break;
2032         default:
2033                 ret = -EINVAL;
2034         }
2035
2036         if (svc)
2037                 ip_vs_service_put(svc);
2038
2039   out_unlock:
2040         mutex_unlock(&__ip_vs_mutex);
2041   out_dec:
2042         /* decrease the module use count */
2043         ip_vs_use_count_dec();
2044
2045         return ret;
2046 }
2047
2048
2049 static void
2050 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2051 {
2052         spin_lock_bh(&src->lock);
2053         memcpy(dst, src, (char*)&src->lock - (char*)src);
2054         spin_unlock_bh(&src->lock);
2055 }
2056
2057 static void
2058 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 {
2060         dst->protocol = src->protocol;
2061         dst->addr = src->addr;
2062         dst->port = src->port;
2063         dst->fwmark = src->fwmark;
2064         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2065         dst->flags = src->flags;
2066         dst->timeout = src->timeout / HZ;
2067         dst->netmask = src->netmask;
2068         dst->num_dests = src->num_dests;
2069         ip_vs_copy_stats(&dst->stats, &src->stats);
2070 }
2071
2072 static inline int
2073 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2074                             struct ip_vs_get_services __user *uptr)
2075 {
2076         int idx, count=0;
2077         struct ip_vs_service *svc;
2078         struct ip_vs_service_entry entry;
2079         int ret = 0;
2080
2081         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2082                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2083                         if (count >= get->num_services)
2084                                 goto out;
2085                         memset(&entry, 0, sizeof(entry));
2086                         ip_vs_copy_service(&entry, svc);
2087                         if (copy_to_user(&uptr->entrytable[count],
2088                                          &entry, sizeof(entry))) {
2089                                 ret = -EFAULT;
2090                                 goto out;
2091                         }
2092                         count++;
2093                 }
2094         }
2095
2096         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2097                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2098                         if (count >= get->num_services)
2099                                 goto out;
2100                         memset(&entry, 0, sizeof(entry));
2101                         ip_vs_copy_service(&entry, svc);
2102                         if (copy_to_user(&uptr->entrytable[count],
2103                                          &entry, sizeof(entry))) {
2104                                 ret = -EFAULT;
2105                                 goto out;
2106                         }
2107                         count++;
2108                 }
2109         }
2110   out:
2111         return ret;
2112 }
2113
2114 static inline int
2115 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2116                          struct ip_vs_get_dests __user *uptr)
2117 {
2118         struct ip_vs_service *svc;
2119         int ret = 0;
2120
2121         if (get->fwmark)
2122                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2123         else
2124                 svc = __ip_vs_service_get(get->protocol,
2125                                           get->addr, get->port);
2126         if (svc) {
2127                 int count = 0;
2128                 struct ip_vs_dest *dest;
2129                 struct ip_vs_dest_entry entry;
2130
2131                 list_for_each_entry(dest, &svc->destinations, n_list) {
2132                         if (count >= get->num_dests)
2133                                 break;
2134
2135                         entry.addr = dest->addr;
2136                         entry.port = dest->port;
2137                         entry.conn_flags = atomic_read(&dest->conn_flags);
2138                         entry.weight = atomic_read(&dest->weight);
2139                         entry.u_threshold = dest->u_threshold;
2140                         entry.l_threshold = dest->l_threshold;
2141                         entry.activeconns = atomic_read(&dest->activeconns);
2142                         entry.inactconns = atomic_read(&dest->inactconns);
2143                         entry.persistconns = atomic_read(&dest->persistconns);
2144                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2145                         if (copy_to_user(&uptr->entrytable[count],
2146                                          &entry, sizeof(entry))) {
2147                                 ret = -EFAULT;
2148                                 break;
2149                         }
2150                         count++;
2151                 }
2152                 ip_vs_service_put(svc);
2153         } else
2154                 ret = -ESRCH;
2155         return ret;
2156 }
2157
2158 static inline void
2159 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2160 {
2161 #ifdef CONFIG_IP_VS_PROTO_TCP
2162         u->tcp_timeout =
2163                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2164         u->tcp_fin_timeout =
2165                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2166 #endif
2167 #ifdef CONFIG_IP_VS_PROTO_UDP
2168         u->udp_timeout =
2169                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2170 #endif
2171 }
2172
2173
2174 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2175 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2176 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2177 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2178 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2179 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2180 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2181
2182 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2183         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2184         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2185         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2186         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2187         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2188         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2190 };
2191
2192 static int
2193 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2194 {
2195         unsigned char arg[128];
2196         int ret = 0;
2197
2198         if (!capable(CAP_NET_ADMIN))
2199                 return -EPERM;
2200
2201         if (*len < get_arglen[GET_CMDID(cmd)]) {
2202                 IP_VS_ERR("get_ctl: len %u < %u\n",
2203                           *len, get_arglen[GET_CMDID(cmd)]);
2204                 return -EINVAL;
2205         }
2206
2207         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2208                 return -EFAULT;
2209
2210         if (mutex_lock_interruptible(&__ip_vs_mutex))
2211                 return -ERESTARTSYS;
2212
2213         switch (cmd) {
2214         case IP_VS_SO_GET_VERSION:
2215         {
2216                 char buf[64];
2217
2218                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2219                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2220                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2221                         ret = -EFAULT;
2222                         goto out;
2223                 }
2224                 *len = strlen(buf)+1;
2225         }
2226         break;
2227
2228         case IP_VS_SO_GET_INFO:
2229         {
2230                 struct ip_vs_getinfo info;
2231                 info.version = IP_VS_VERSION_CODE;
2232                 info.size = IP_VS_CONN_TAB_SIZE;
2233                 info.num_services = ip_vs_num_services;
2234                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2235                         ret = -EFAULT;
2236         }
2237         break;
2238
2239         case IP_VS_SO_GET_SERVICES:
2240         {
2241                 struct ip_vs_get_services *get;
2242                 int size;
2243
2244                 get = (struct ip_vs_get_services *)arg;
2245                 size = sizeof(*get) +
2246                         sizeof(struct ip_vs_service_entry) * get->num_services;
2247                 if (*len != size) {
2248                         IP_VS_ERR("length: %u != %u\n", *len, size);
2249                         ret = -EINVAL;
2250                         goto out;
2251                 }
2252                 ret = __ip_vs_get_service_entries(get, user);
2253         }
2254         break;
2255
2256         case IP_VS_SO_GET_SERVICE:
2257         {
2258                 struct ip_vs_service_entry *entry;
2259                 struct ip_vs_service *svc;
2260
2261                 entry = (struct ip_vs_service_entry *)arg;
2262                 if (entry->fwmark)
2263                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2264                 else
2265                         svc = __ip_vs_service_get(entry->protocol,
2266                                                   entry->addr, entry->port);
2267                 if (svc) {
2268                         ip_vs_copy_service(entry, svc);
2269                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2270                                 ret = -EFAULT;
2271                         ip_vs_service_put(svc);
2272                 } else
2273                         ret = -ESRCH;
2274         }
2275         break;
2276
2277         case IP_VS_SO_GET_DESTS:
2278         {
2279                 struct ip_vs_get_dests *get;
2280                 int size;
2281
2282                 get = (struct ip_vs_get_dests *)arg;
2283                 size = sizeof(*get) +
2284                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2285                 if (*len != size) {
2286                         IP_VS_ERR("length: %u != %u\n", *len, size);
2287                         ret = -EINVAL;
2288                         goto out;
2289                 }
2290                 ret = __ip_vs_get_dest_entries(get, user);
2291         }
2292         break;
2293
2294         case IP_VS_SO_GET_TIMEOUT:
2295         {
2296                 struct ip_vs_timeout_user t;
2297
2298                 __ip_vs_get_timeouts(&t);
2299                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2300                         ret = -EFAULT;
2301         }
2302         break;
2303
2304         case IP_VS_SO_GET_DAEMON:
2305         {
2306                 struct ip_vs_daemon_user d[2];
2307
2308                 memset(&d, 0, sizeof(d));
2309                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2310                         d[0].state = IP_VS_STATE_MASTER;
2311                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2312                         d[0].syncid = ip_vs_master_syncid;
2313                 }
2314                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2315                         d[1].state = IP_VS_STATE_BACKUP;
2316                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2317                         d[1].syncid = ip_vs_backup_syncid;
2318                 }
2319                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2320                         ret = -EFAULT;
2321         }
2322         break;
2323
2324         default:
2325                 ret = -EINVAL;
2326         }
2327
2328   out:
2329         mutex_unlock(&__ip_vs_mutex);
2330         return ret;
2331 }
2332
2333
2334 static struct nf_sockopt_ops ip_vs_sockopts = {
2335         .pf             = PF_INET,
2336         .set_optmin     = IP_VS_BASE_CTL,
2337         .set_optmax     = IP_VS_SO_SET_MAX+1,
2338         .set            = do_ip_vs_set_ctl,
2339         .get_optmin     = IP_VS_BASE_CTL,
2340         .get_optmax     = IP_VS_SO_GET_MAX+1,
2341         .get            = do_ip_vs_get_ctl,
2342 };
2343
2344
2345 int ip_vs_control_init(void)
2346 {
2347         int ret;
2348         int idx;
2349
2350         EnterFunction(2);
2351
2352         ret = nf_register_sockopt(&ip_vs_sockopts);
2353         if (ret) {
2354                 IP_VS_ERR("cannot register sockopt.\n");
2355                 return ret;
2356         }
2357
2358         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2359         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2360
2361         sysctl_header = register_sysctl_table(vs_root_table);
2362
2363         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2364         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2365                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2366                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2367         }
2368         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2369                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2370         }
2371
2372         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2373         spin_lock_init(&ip_vs_stats.lock);
2374         ip_vs_new_estimator(&ip_vs_stats);
2375
2376         /* Hook the defense timer */
2377         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2378
2379         LeaveFunction(2);
2380         return 0;
2381 }
2382
2383
2384 void ip_vs_control_cleanup(void)
2385 {
2386         EnterFunction(2);
2387         ip_vs_trash_cleanup();
2388         cancel_rearming_delayed_work(&defense_work);
2389         cancel_work_sync(&defense_work.work);
2390         ip_vs_kill_estimator(&ip_vs_stats);
2391         unregister_sysctl_table(sysctl_header);
2392         proc_net_remove("ip_vs_stats");
2393         proc_net_remove("ip_vs");
2394         nf_unregister_sockopt(&ip_vs_sockopts);
2395         LeaveFunction(2);
2396 }