Merge tag 'spi-v4.1-rc1' into spi-linus
[sfrench/cifs-2.6.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/* Forward declarations of the rtnetlink notification helpers used below;
 * their definitions appear later in the file.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    Qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/* Protects the list of registered TC modules (qdisc_base) and the
 * default_qdisc_ops selection. It is a pure SMP lock.
 *
 * Taken for writing by register_qdisc(), unregister_qdisc() and
 * qdisc_set_default(); taken for reading by qdisc_get_default() and
 * qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
/* The list of all installed queueing disciplines: head of a singly
 * linked chain threaded through Qdisc_ops::next, NULL when empty.
 */

static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
/* Get default qdisc if not otherwise specified.
 *
 * Copies the id of the current default_qdisc_ops into @name, writing at
 * most @len bytes (strlcpy semantics, so the result is NUL-terminated
 * when @len is non-zero). The copy is done under the read side of
 * qdisc_mod_lock.
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
277                 struct Qdisc *root = qdisc_dev(q)->qdisc;
278
279                 WARN_ON_ONCE(root == &noop_qdisc);
280                 list_add_tail(&q->list, &root->list);
281         }
282 }
283 EXPORT_SYMBOL(qdisc_list_add);
284
285 void qdisc_list_del(struct Qdisc *q)
286 {
287         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
288                 list_del(&q->list);
289 }
290 EXPORT_SYMBOL(qdisc_list_del);
291
292 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
293 {
294         struct Qdisc *q;
295
296         q = qdisc_match_from_root(dev->qdisc, handle);
297         if (q)
298                 goto out;
299
300         if (dev_ingress_queue(dev))
301                 q = qdisc_match_from_root(
302                         dev_ingress_queue(dev)->qdisc_sleeping,
303                         handle);
304 out:
305         return q;
306 }
307
308 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
309 {
310         unsigned long cl;
311         struct Qdisc *leaf;
312         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
313
314         if (cops == NULL)
315                 return NULL;
316         cl = cops->get(p, classid);
317
318         if (cl == 0)
319                 return NULL;
320         leaf = cops->leaf(p, cl);
321         cops->put(p, cl);
322         return leaf;
323 }
324
325 /* Find queueing discipline by name */
326
327 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
328 {
329         struct Qdisc_ops *q = NULL;
330
331         if (kind) {
332                 read_lock(&qdisc_mod_lock);
333                 for (q = qdisc_base; q; q = q->next) {
334                         if (nla_strcmp(kind, q->id) == 0) {
335                                 if (!try_module_get(q->owner))
336                                         q = NULL;
337                                 break;
338                         }
339                 }
340                 read_unlock(&qdisc_mod_lock);
341         }
342         return q;
343 }
344
345 /* The linklayer setting were not transferred from iproute2, in older
346  * versions, and the rate tables lookup systems have been dropped in
347  * the kernel. To keep backward compatible with older iproute2 tc
348  * utils, we detect the linklayer setting by detecting if the rate
349  * table were modified.
350  *
351  * For linklayer ATM table entries, the rate table will be aligned to
352  * 48 bytes, thus some table entries will contain the same value.  The
353  * mpu (min packet unit) is also encoded into the old rate table, thus
354  * starting from the mpu, we find low and high table entries for
355  * mapping this cell.  If these entries contain the same value, when
356  * the rate tables have been modified for linklayer ATM.
357  *
358  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
359  * and then roundup to the next cell, calc the table entry one below,
360  * and compare.
361  */
362 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
363 {
364         int low       = roundup(r->mpu, 48);
365         int high      = roundup(low+1, 48);
366         int cell_low  = low >> r->cell_log;
367         int cell_high = (high >> r->cell_log) - 1;
368
369         /* rtab is too inaccurate at rates > 100Mbit/s */
370         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
371                 pr_debug("TC linklayer: Giving up ATM detection\n");
372                 return TC_LINKLAYER_ETHERNET;
373         }
374
375         if ((cell_high > cell_low) && (cell_high < 256)
376             && (rtab[cell_low] == rtab[cell_high])) {
377                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
378                          cell_low, cell_high, rtab[cell_high]);
379                 return TC_LINKLAYER_ATM;
380         }
381         return TC_LINKLAYER_ETHERNET;
382 }
383
/* Head of the singly linked cache of reference-counted rate tables,
 * managed by qdisc_get_rtab() and qdisc_put_rtab().
 * NOTE(review): no lock is taken around this list in this file;
 * presumably callers are serialized by RTNL - confirm.
 */
static struct qdisc_rate_table *qdisc_rtab_list;
385
386 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
387 {
388         struct qdisc_rate_table *rtab;
389
390         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
391             nla_len(tab) != TC_RTAB_SIZE)
392                 return NULL;
393
394         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
395                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
396                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
397                         rtab->refcnt++;
398                         return rtab;
399                 }
400         }
401
402         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
403         if (rtab) {
404                 rtab->rate = *r;
405                 rtab->refcnt = 1;
406                 memcpy(rtab->data, nla_data(tab), 1024);
407                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
408                         r->linklayer = __detect_linklayer(r, rtab->data);
409                 rtab->next = qdisc_rtab_list;
410                 qdisc_rtab_list = rtab;
411         }
412         return rtab;
413 }
414 EXPORT_SYMBOL(qdisc_get_rtab);
415
416 void qdisc_put_rtab(struct qdisc_rate_table *tab)
417 {
418         struct qdisc_rate_table *rtab, **rtabp;
419
420         if (!tab || --tab->refcnt)
421                 return;
422
423         for (rtabp = &qdisc_rtab_list;
424              (rtab = *rtabp) != NULL;
425              rtabp = &rtab->next) {
426                 if (rtab == tab) {
427                         *rtabp = rtab->next;
428                         kfree(rtab);
429                         return;
430                 }
431         }
432 }
433 EXPORT_SYMBOL(qdisc_put_rtab);
434
/* Cache of shared, reference-counted size tables, and the spinlock
 * that guards both the list and the tables' reference counts
 * (see qdisc_get_stab() / qdisc_put_stab()).
 */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy for the attributes nested inside TCA_STAB:
 * TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA the raw
 * table bytes (interpreted as u16 entries by qdisc_get_stab()).
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
442
443 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
444 {
445         struct nlattr *tb[TCA_STAB_MAX + 1];
446         struct qdisc_size_table *stab;
447         struct tc_sizespec *s;
448         unsigned int tsize = 0;
449         u16 *tab = NULL;
450         int err;
451
452         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
453         if (err < 0)
454                 return ERR_PTR(err);
455         if (!tb[TCA_STAB_BASE])
456                 return ERR_PTR(-EINVAL);
457
458         s = nla_data(tb[TCA_STAB_BASE]);
459
460         if (s->tsize > 0) {
461                 if (!tb[TCA_STAB_DATA])
462                         return ERR_PTR(-EINVAL);
463                 tab = nla_data(tb[TCA_STAB_DATA]);
464                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
465         }
466
467         if (tsize != s->tsize || (!tab && tsize > 0))
468                 return ERR_PTR(-EINVAL);
469
470         spin_lock(&qdisc_stab_lock);
471
472         list_for_each_entry(stab, &qdisc_stab_list, list) {
473                 if (memcmp(&stab->szopts, s, sizeof(*s)))
474                         continue;
475                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
476                         continue;
477                 stab->refcnt++;
478                 spin_unlock(&qdisc_stab_lock);
479                 return stab;
480         }
481
482         spin_unlock(&qdisc_stab_lock);
483
484         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
485         if (!stab)
486                 return ERR_PTR(-ENOMEM);
487
488         stab->refcnt = 1;
489         stab->szopts = *s;
490         if (tsize > 0)
491                 memcpy(stab->data, tab, tsize * sizeof(u16));
492
493         spin_lock(&qdisc_stab_lock);
494         list_add_tail(&stab->list, &qdisc_stab_list);
495         spin_unlock(&qdisc_stab_lock);
496
497         return stab;
498 }
499
500 static void stab_kfree_rcu(struct rcu_head *head)
501 {
502         kfree(container_of(head, struct qdisc_size_table, rcu));
503 }
504
505 void qdisc_put_stab(struct qdisc_size_table *tab)
506 {
507         if (!tab)
508                 return;
509
510         spin_lock(&qdisc_stab_lock);
511
512         if (--tab->refcnt == 0) {
513                 list_del(&tab->list);
514                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
515         }
516
517         spin_unlock(&qdisc_stab_lock);
518 }
519 EXPORT_SYMBOL(qdisc_put_stab);
520
521 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
522 {
523         struct nlattr *nest;
524
525         nest = nla_nest_start(skb, TCA_STAB);
526         if (nest == NULL)
527                 goto nla_put_failure;
528         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
529                 goto nla_put_failure;
530         nla_nest_end(skb, nest);
531
532         return skb->len;
533
534 nla_put_failure:
535         return -1;
536 }
537
/**
 * __qdisc_calculate_pkt_len - compute the scheduler-visible packet length
 * @skb:  packet whose length is being accounted
 * @stab: size table to apply
 *
 * Starts from skb->len plus the configured overhead. If the table has
 * entries, the length is mapped to a slot and replaced by the table
 * value scaled by size_log. The result, clamped to at least 1, is stored
 * in the skb's qdisc control block.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* No table entries: only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Slot beyond the table: extrapolate as whole multiples of
		 * the last entry plus the entry for the remainder.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	/* Never report a length below 1. */
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
565
566 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
567 {
568         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
569                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
570                         txt, qdisc->ops->id, qdisc->handle >> 16);
571                 qdisc->flags |= TCQ_F_WARN_NONWC;
572         }
573 }
574 EXPORT_SYMBOL(qdisc_warn_nonwc);
575
/* hrtimer callback for a qdisc watchdog: marks the owning qdisc as
 * unthrottled and schedules its root qdisc, all inside an RCU read-side
 * section. The timer is one-shot (HRTIMER_NORESTART).
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
588
/* Initialize watchdog @wd for @qdisc: set up a CLOCK_MONOTONIC hrtimer
 * in absolute, pinned mode whose expiry handler is qdisc_watchdog().
 * The timer is not started here.
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
596
597 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
598 {
599         if (test_bit(__QDISC_STATE_DEACTIVATED,
600                      &qdisc_root_sleeping(wd->qdisc)->state))
601                 return;
602
603         if (throttle)
604                 qdisc_throttled(wd->qdisc);
605
606         hrtimer_start(&wd->timer,
607                       ns_to_ktime(expires),
608                       HRTIMER_MODE_ABS_PINNED);
609 }
610 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
611
/* Cancel the watchdog timer of @wd and mark its qdisc as unthrottled. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
618
619 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
620 {
621         unsigned int size = n * sizeof(struct hlist_head), i;
622         struct hlist_head *h;
623
624         if (size <= PAGE_SIZE)
625                 h = kmalloc(size, GFP_KERNEL);
626         else
627                 h = (struct hlist_head *)
628                         __get_free_pages(GFP_KERNEL, get_order(size));
629
630         if (h != NULL) {
631                 for (i = 0; i < n; i++)
632                         INIT_HLIST_HEAD(&h[i]);
633         }
634         return h;
635 }
636
637 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
638 {
639         unsigned int size = n * sizeof(struct hlist_head);
640
641         if (size <= PAGE_SIZE)
642                 kfree(h);
643         else
644                 free_pages((unsigned long)h, get_order(size));
645 }
646
647 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
648 {
649         struct Qdisc_class_common *cl;
650         struct hlist_node *next;
651         struct hlist_head *nhash, *ohash;
652         unsigned int nsize, nmask, osize;
653         unsigned int i, h;
654
655         /* Rehash when load factor exceeds 0.75 */
656         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
657                 return;
658         nsize = clhash->hashsize * 2;
659         nmask = nsize - 1;
660         nhash = qdisc_class_hash_alloc(nsize);
661         if (nhash == NULL)
662                 return;
663
664         ohash = clhash->hash;
665         osize = clhash->hashsize;
666
667         sch_tree_lock(sch);
668         for (i = 0; i < osize; i++) {
669                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
670                         h = qdisc_class_hash(cl->classid, nmask);
671                         hlist_add_head(&cl->hnode, &nhash[h]);
672                 }
673         }
674         clhash->hash     = nhash;
675         clhash->hashsize = nsize;
676         clhash->hashmask = nmask;
677         sch_tree_unlock(sch);
678
679         qdisc_class_hash_free(ohash, osize);
680 }
681 EXPORT_SYMBOL(qdisc_class_hash_grow);
682
683 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
684 {
685         unsigned int size = 4;
686
687         clhash->hash = qdisc_class_hash_alloc(size);
688         if (clhash->hash == NULL)
689                 return -ENOMEM;
690         clhash->hashsize  = size;
691         clhash->hashmask  = size - 1;
692         clhash->hashelems = 0;
693         return 0;
694 }
695 EXPORT_SYMBOL(qdisc_class_hash_init);
696
/* Free the bucket array of @clhash. Only the array is released here;
 * entries still linked in the buckets are not touched.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
702
703 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
704                              struct Qdisc_class_common *cl)
705 {
706         unsigned int h;
707
708         INIT_HLIST_NODE(&cl->hnode);
709         h = qdisc_class_hash(cl->classid, clhash->hashmask);
710         hlist_add_head(&cl->hnode, &clhash->hash[h]);
711         clhash->hashelems++;
712 }
713 EXPORT_SYMBOL(qdisc_class_hash_insert);
714
/* Unlink class @cl from @clhash and decrement the element count.
 * No locking is done here.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
722
723 /* Allocate an unique handle from space managed by kernel
724  * Possible range is [8000-FFFF]:0000 (0x8000 values)
725  */
726 static u32 qdisc_alloc_handle(struct net_device *dev)
727 {
728         int i = 0x8000;
729         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
730
731         do {
732                 autohandle += TC_H_MAKE(0x10000U, 0);
733                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
734                         autohandle = TC_H_MAKE(0x80000000U, 0);
735                 if (!qdisc_lookup(dev, autohandle))
736                         return autohandle;
737                 cond_resched();
738         } while (--i > 0);
739
740         return 0;
741 }
742
/**
 * qdisc_tree_decrease_qlen - propagate a queue-length drop to ancestors
 * @sch: qdisc whose queue shrank
 * @n:   number of packets removed
 *
 * Walks up from @sch through each parent qdisc, calling the parent's
 * qlen_notify() class op (if any), subtracting @n from the parent's
 * q.qlen and accounting the packets as drops. The walk stops at the
 * top of the tree, at an ingress parent, or when a parent handle cannot
 * be resolved.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0)
		return;
	/* n is unsigned; interpreted as int, negative values count as 0 drops. */
	drops = max_t(int, n, 0);
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only a root-parented qdisc may legitimately fail the lookup. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		__qdisc_qstats_drop(sch, drops);
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
773
/* Send a qdisc change notification when at least one of @old / @new is
 * set, then destroy @old if there is one. The return value of
 * qdisc_notify() is ignored.
 */
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}
784
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * Returns 0 on success. With a NULL parent, -ENOENT is returned when an
 * ingress qdisc is involved but the device has no ingress queue. With a
 * parent, -EOPNOTSUPP is returned if it has no graft class op, -ENOENT
 * if the class does not exist, or the error from the parent's ->graft().
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Device-level graft: replace the qdisc on the TX queues,
		 * or on the single ingress queue.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		/* The device is quiesced while its qdiscs are swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* A qdisc with its own ->attach() handles the per-queue
		 * grafting itself, so the loop below is skipped.
		 */
		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional queue sharing "new". */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* Reference held through dev->qdisc. */
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Class-level graft: delegate to the parent's class ops. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
866
/* lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns qdisc_rx_lock to the lock of ingress qdiscs and
 * qdisc_tx_lock to the lock of all others.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
870
871 /*
872    Allocate and initialize new qdisc.
873
874    Parameters are passed via opt.
875  */
876
877 static struct Qdisc *
878 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
879              struct Qdisc *p, u32 parent, u32 handle,
880              struct nlattr **tca, int *errp)
881 {
882         int err;
883         struct nlattr *kind = tca[TCA_KIND];
884         struct Qdisc *sch;
885         struct Qdisc_ops *ops;
886         struct qdisc_size_table *stab;
887
888         ops = qdisc_lookup_ops(kind);
889 #ifdef CONFIG_MODULES
890         if (ops == NULL && kind != NULL) {
891                 char name[IFNAMSIZ];
892                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
893                         /* We dropped the RTNL semaphore in order to
894                          * perform the module load.  So, even if we
895                          * succeeded in loading the module we have to
896                          * tell the caller to replay the request.  We
897                          * indicate this using -EAGAIN.
898                          * We replay the request because the device may
899                          * go away in the mean time.
900                          */
901                         rtnl_unlock();
902                         request_module("sch_%s", name);
903                         rtnl_lock();
904                         ops = qdisc_lookup_ops(kind);
905                         if (ops != NULL) {
906                                 /* We will try again qdisc_lookup_ops,
907                                  * so don't keep a reference.
908                                  */
909                                 module_put(ops->owner);
910                                 err = -EAGAIN;
911                                 goto err_out;
912                         }
913                 }
914         }
915 #endif
916
917         err = -ENOENT;
918         if (ops == NULL)
919                 goto err_out;
920
921         sch = qdisc_alloc(dev_queue, ops);
922         if (IS_ERR(sch)) {
923                 err = PTR_ERR(sch);
924                 goto err_out2;
925         }
926
927         sch->parent = parent;
928
929         if (handle == TC_H_INGRESS) {
930                 sch->flags |= TCQ_F_INGRESS;
931                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
932                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
933         } else {
934                 if (handle == 0) {
935                         handle = qdisc_alloc_handle(dev);
936                         err = -ENOMEM;
937                         if (handle == 0)
938                                 goto err_out3;
939                 }
940                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
941                 if (!netif_is_multiqueue(dev))
942                         sch->flags |= TCQ_F_ONETXQUEUE;
943         }
944
945         sch->handle = handle;
946
947         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
948                 if (qdisc_is_percpu_stats(sch)) {
949                         sch->cpu_bstats =
950                                 netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
951                         if (!sch->cpu_bstats)
952                                 goto err_out4;
953
954                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
955                         if (!sch->cpu_qstats)
956                                 goto err_out4;
957                 }
958
959                 if (tca[TCA_STAB]) {
960                         stab = qdisc_get_stab(tca[TCA_STAB]);
961                         if (IS_ERR(stab)) {
962                                 err = PTR_ERR(stab);
963                                 goto err_out4;
964                         }
965                         rcu_assign_pointer(sch->stab, stab);
966                 }
967                 if (tca[TCA_RATE]) {
968                         spinlock_t *root_lock;
969
970                         err = -EOPNOTSUPP;
971                         if (sch->flags & TCQ_F_MQROOT)
972                                 goto err_out4;
973
974                         if ((sch->parent != TC_H_ROOT) &&
975                             !(sch->flags & TCQ_F_INGRESS) &&
976                             (!p || !(p->flags & TCQ_F_MQROOT)))
977                                 root_lock = qdisc_root_sleeping_lock(sch);
978                         else
979                                 root_lock = qdisc_lock(sch);
980
981                         err = gen_new_estimator(&sch->bstats,
982                                                 sch->cpu_bstats,
983                                                 &sch->rate_est,
984                                                 root_lock,
985                                                 tca[TCA_RATE]);
986                         if (err)
987                                 goto err_out4;
988                 }
989
990                 qdisc_list_add(sch);
991
992                 return sch;
993         }
994 err_out3:
995         dev_put(dev);
996         kfree((char *) sch - sch->padded);
997 err_out2:
998         module_put(ops->owner);
999 err_out:
1000         *errp = err;
1001         return NULL;
1002
1003 err_out4:
1004         free_percpu(sch->cpu_bstats);
1005         free_percpu(sch->cpu_qstats);
1006         /*
1007          * Any broken qdiscs that would require a ops->reset() here?
1008          * The qdisc was never in action so it shouldn't be necessary.
1009          */
1010         qdisc_put_stab(rtnl_dereference(sch->stab));
1011         if (ops->destroy)
1012                 ops->destroy(sch);
1013         goto err_out3;
1014 }
1015
1016 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1017 {
1018         struct qdisc_size_table *ostab, *stab = NULL;
1019         int err = 0;
1020
1021         if (tca[TCA_OPTIONS]) {
1022                 if (sch->ops->change == NULL)
1023                         return -EINVAL;
1024                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1025                 if (err)
1026                         return err;
1027         }
1028
1029         if (tca[TCA_STAB]) {
1030                 stab = qdisc_get_stab(tca[TCA_STAB]);
1031                 if (IS_ERR(stab))
1032                         return PTR_ERR(stab);
1033         }
1034
1035         ostab = rtnl_dereference(sch->stab);
1036         rcu_assign_pointer(sch->stab, stab);
1037         qdisc_put_stab(ostab);
1038
1039         if (tca[TCA_RATE]) {
1040                 /* NB: ignores errors from replace_estimator
1041                    because change can't be undone. */
1042                 if (sch->flags & TCQ_F_MQROOT)
1043                         goto out;
1044                 gen_replace_estimator(&sch->bstats,
1045                                       sch->cpu_bstats,
1046                                       &sch->rate_est,
1047                                       qdisc_root_sleeping_lock(sch),
1048                                       tca[TCA_RATE]);
1049         }
1050 out:
1051         return 0;
1052 }
1053
1054 struct check_loop_arg {
1055         struct qdisc_walker     w;
1056         struct Qdisc            *p;
1057         int                     depth;
1058 };
1059
1060 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1061
1062 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1063 {
1064         struct check_loop_arg   arg;
1065
1066         if (q->ops->cl_ops == NULL)
1067                 return 0;
1068
1069         arg.w.stop = arg.w.skip = arg.w.count = 0;
1070         arg.w.fn = check_loop_fn;
1071         arg.depth = depth;
1072         arg.p = p;
1073         q->ops->cl_ops->walk(q, &arg.w);
1074         return arg.w.stop ? -ELOOP : 0;
1075 }
1076
1077 static int
1078 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1079 {
1080         struct Qdisc *leaf;
1081         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1082         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1083
1084         leaf = cops->leaf(q, cl);
1085         if (leaf) {
1086                 if (leaf == arg->p || arg->depth > 7)
1087                         return -ELOOP;
1088                 return check_loop(leaf, arg->p, arg->depth + 1);
1089         }
1090         return 0;
1091 }
1092
1093 /*
1094  * Delete/get qdisc.
1095  */
1096
1097 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1098 {
1099         struct net *net = sock_net(skb->sk);
1100         struct tcmsg *tcm = nlmsg_data(n);
1101         struct nlattr *tca[TCA_MAX + 1];
1102         struct net_device *dev;
1103         u32 clid;
1104         struct Qdisc *q = NULL;
1105         struct Qdisc *p = NULL;
1106         int err;
1107
1108         if ((n->nlmsg_type != RTM_GETQDISC) &&
1109             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1110                 return -EPERM;
1111
1112         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1113         if (err < 0)
1114                 return err;
1115
1116         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1117         if (!dev)
1118                 return -ENODEV;
1119
1120         clid = tcm->tcm_parent;
1121         if (clid) {
1122                 if (clid != TC_H_ROOT) {
1123                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1124                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1125                                 if (!p)
1126                                         return -ENOENT;
1127                                 q = qdisc_leaf(p, clid);
1128                         } else if (dev_ingress_queue(dev)) {
1129                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1130                         }
1131                 } else {
1132                         q = dev->qdisc;
1133                 }
1134                 if (!q)
1135                         return -ENOENT;
1136
1137                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1138                         return -EINVAL;
1139         } else {
1140                 q = qdisc_lookup(dev, tcm->tcm_handle);
1141                 if (!q)
1142                         return -ENOENT;
1143         }
1144
1145         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1146                 return -EINVAL;
1147
1148         if (n->nlmsg_type == RTM_DELQDISC) {
1149                 if (!clid)
1150                         return -EINVAL;
1151                 if (q->handle == 0)
1152                         return -ENOENT;
1153                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1154                 if (err != 0)
1155                         return err;
1156         } else {
1157                 qdisc_notify(net, skb, n, clid, NULL, q);
1158         }
1159         return 0;
1160 }
1161
1162 /*
1163  * Create/change qdisc.
1164  */
1165
1166 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1167 {
1168         struct net *net = sock_net(skb->sk);
1169         struct tcmsg *tcm;
1170         struct nlattr *tca[TCA_MAX + 1];
1171         struct net_device *dev;
1172         u32 clid;
1173         struct Qdisc *q, *p;
1174         int err;
1175
1176         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1177                 return -EPERM;
1178
1179 replay:
1180         /* Reinit, just in case something touches this. */
1181         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1182         if (err < 0)
1183                 return err;
1184
1185         tcm = nlmsg_data(n);
1186         clid = tcm->tcm_parent;
1187         q = p = NULL;
1188
1189         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1190         if (!dev)
1191                 return -ENODEV;
1192
1193
1194         if (clid) {
1195                 if (clid != TC_H_ROOT) {
1196                         if (clid != TC_H_INGRESS) {
1197                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1198                                 if (!p)
1199                                         return -ENOENT;
1200                                 q = qdisc_leaf(p, clid);
1201                         } else if (dev_ingress_queue_create(dev)) {
1202                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1203                         }
1204                 } else {
1205                         q = dev->qdisc;
1206                 }
1207
1208                 /* It may be default qdisc, ignore it */
1209                 if (q && q->handle == 0)
1210                         q = NULL;
1211
1212                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1213                         if (tcm->tcm_handle) {
1214                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1215                                         return -EEXIST;
1216                                 if (TC_H_MIN(tcm->tcm_handle))
1217                                         return -EINVAL;
1218                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1219                                 if (!q)
1220                                         goto create_n_graft;
1221                                 if (n->nlmsg_flags & NLM_F_EXCL)
1222                                         return -EEXIST;
1223                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1224                                         return -EINVAL;
1225                                 if (q == p ||
1226                                     (p && check_loop(q, p, 0)))
1227                                         return -ELOOP;
1228                                 atomic_inc(&q->refcnt);
1229                                 goto graft;
1230                         } else {
1231                                 if (!q)
1232                                         goto create_n_graft;
1233
1234                                 /* This magic test requires explanation.
1235                                  *
1236                                  *   We know, that some child q is already
1237                                  *   attached to this parent and have choice:
1238                                  *   either to change it or to create/graft new one.
1239                                  *
1240                                  *   1. We are allowed to create/graft only
1241                                  *   if CREATE and REPLACE flags are set.
1242                                  *
1243                                  *   2. If EXCL is set, requestor wanted to say,
1244                                  *   that qdisc tcm_handle is not expected
1245                                  *   to exist, so that we choose create/graft too.
1246                                  *
1247                                  *   3. The last case is when no flags are set.
1248                                  *   Alas, it is sort of hole in API, we
1249                                  *   cannot decide what to do unambiguously.
1250                                  *   For now we select create/graft, if
1251                                  *   user gave KIND, which does not match existing.
1252                                  */
1253                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1254                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1255                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1256                                      (tca[TCA_KIND] &&
1257                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1258                                         goto create_n_graft;
1259                         }
1260                 }
1261         } else {
1262                 if (!tcm->tcm_handle)
1263                         return -EINVAL;
1264                 q = qdisc_lookup(dev, tcm->tcm_handle);
1265         }
1266
1267         /* Change qdisc parameters */
1268         if (q == NULL)
1269                 return -ENOENT;
1270         if (n->nlmsg_flags & NLM_F_EXCL)
1271                 return -EEXIST;
1272         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1273                 return -EINVAL;
1274         err = qdisc_change(q, tca);
1275         if (err == 0)
1276                 qdisc_notify(net, skb, n, clid, NULL, q);
1277         return err;
1278
1279 create_n_graft:
1280         if (!(n->nlmsg_flags & NLM_F_CREATE))
1281                 return -ENOENT;
1282         if (clid == TC_H_INGRESS) {
1283                 if (dev_ingress_queue(dev))
1284                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1285                                          tcm->tcm_parent, tcm->tcm_parent,
1286                                          tca, &err);
1287                 else
1288                         err = -ENOENT;
1289         } else {
1290                 struct netdev_queue *dev_queue;
1291
1292                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1293                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1294                 else if (p)
1295                         dev_queue = p->dev_queue;
1296                 else
1297                         dev_queue = netdev_get_tx_queue(dev, 0);
1298
1299                 q = qdisc_create(dev, dev_queue, p,
1300                                  tcm->tcm_parent, tcm->tcm_handle,
1301                                  tca, &err);
1302         }
1303         if (q == NULL) {
1304                 if (err == -EAGAIN)
1305                         goto replay;
1306                 return err;
1307         }
1308
1309 graft:
1310         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1311         if (err) {
1312                 if (q)
1313                         qdisc_destroy(q);
1314                 return err;
1315         }
1316
1317         return 0;
1318 }
1319
1320 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1321                          u32 portid, u32 seq, u16 flags, int event)
1322 {
1323         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1324         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1325         struct tcmsg *tcm;
1326         struct nlmsghdr  *nlh;
1327         unsigned char *b = skb_tail_pointer(skb);
1328         struct gnet_dump d;
1329         struct qdisc_size_table *stab;
1330         __u32 qlen;
1331
1332         cond_resched();
1333         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1334         if (!nlh)
1335                 goto out_nlmsg_trim;
1336         tcm = nlmsg_data(nlh);
1337         tcm->tcm_family = AF_UNSPEC;
1338         tcm->tcm__pad1 = 0;
1339         tcm->tcm__pad2 = 0;
1340         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1341         tcm->tcm_parent = clid;
1342         tcm->tcm_handle = q->handle;
1343         tcm->tcm_info = atomic_read(&q->refcnt);
1344         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1345                 goto nla_put_failure;
1346         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1347                 goto nla_put_failure;
1348         qlen = q->q.qlen;
1349
1350         stab = rtnl_dereference(q->stab);
1351         if (stab && qdisc_dump_stab(skb, stab) < 0)
1352                 goto nla_put_failure;
1353
1354         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1355                                          qdisc_root_sleeping_lock(q), &d) < 0)
1356                 goto nla_put_failure;
1357
1358         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1359                 goto nla_put_failure;
1360
1361         if (qdisc_is_percpu_stats(q)) {
1362                 cpu_bstats = q->cpu_bstats;
1363                 cpu_qstats = q->cpu_qstats;
1364         }
1365
1366         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
1367             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1368             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1369                 goto nla_put_failure;
1370
1371         if (gnet_stats_finish_copy(&d) < 0)
1372                 goto nla_put_failure;
1373
1374         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1375         return skb->len;
1376
1377 out_nlmsg_trim:
1378 nla_put_failure:
1379         nlmsg_trim(skb, b);
1380         return -1;
1381 }
1382
1383 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1384 {
1385         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1386 }
1387
1388 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1389                         struct nlmsghdr *n, u32 clid,
1390                         struct Qdisc *old, struct Qdisc *new)
1391 {
1392         struct sk_buff *skb;
1393         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1394
1395         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1396         if (!skb)
1397                 return -ENOBUFS;
1398
1399         if (old && !tc_qdisc_dump_ignore(old)) {
1400                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1401                                   0, RTM_DELQDISC) < 0)
1402                         goto err_out;
1403         }
1404         if (new && !tc_qdisc_dump_ignore(new)) {
1405                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1406                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1407                         goto err_out;
1408         }
1409
1410         if (skb->len)
1411                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1412                                       n->nlmsg_flags & NLM_F_ECHO);
1413
1414 err_out:
1415         kfree_skb(skb);
1416         return -EINVAL;
1417 }
1418
1419 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1420                               struct netlink_callback *cb,
1421                               int *q_idx_p, int s_q_idx)
1422 {
1423         int ret = 0, q_idx = *q_idx_p;
1424         struct Qdisc *q;
1425
1426         if (!root)
1427                 return 0;
1428
1429         q = root;
1430         if (q_idx < s_q_idx) {
1431                 q_idx++;
1432         } else {
1433                 if (!tc_qdisc_dump_ignore(q) &&
1434                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1435                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1436                         goto done;
1437                 q_idx++;
1438         }
1439         list_for_each_entry(q, &root->list, list) {
1440                 if (q_idx < s_q_idx) {
1441                         q_idx++;
1442                         continue;
1443                 }
1444                 if (!tc_qdisc_dump_ignore(q) &&
1445                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1446                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1447                         goto done;
1448                 q_idx++;
1449         }
1450
1451 out:
1452         *q_idx_p = q_idx;
1453         return ret;
1454 done:
1455         ret = -1;
1456         goto out;
1457 }
1458
1459 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1460 {
1461         struct net *net = sock_net(skb->sk);
1462         int idx, q_idx;
1463         int s_idx, s_q_idx;
1464         struct net_device *dev;
1465
1466         s_idx = cb->args[0];
1467         s_q_idx = q_idx = cb->args[1];
1468
1469         idx = 0;
1470         ASSERT_RTNL();
1471         for_each_netdev(net, dev) {
1472                 struct netdev_queue *dev_queue;
1473
1474                 if (idx < s_idx)
1475                         goto cont;
1476                 if (idx > s_idx)
1477                         s_q_idx = 0;
1478                 q_idx = 0;
1479
1480                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1481                         goto done;
1482
1483                 dev_queue = dev_ingress_queue(dev);
1484                 if (dev_queue &&
1485                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1486                                        &q_idx, s_q_idx) < 0)
1487                         goto done;
1488
1489 cont:
1490                 idx++;
1491         }
1492
1493 done:
1494         cb->args[0] = idx;
1495         cb->args[1] = q_idx;
1496
1497         return skb->len;
1498 }
1499
1500
1501
1502 /************************************************
1503  *      Traffic classes manipulation.           *
1504  ************************************************/
1505
1506
1507
1508 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1509 {
1510         struct net *net = sock_net(skb->sk);
1511         struct tcmsg *tcm = nlmsg_data(n);
1512         struct nlattr *tca[TCA_MAX + 1];
1513         struct net_device *dev;
1514         struct Qdisc *q = NULL;
1515         const struct Qdisc_class_ops *cops;
1516         unsigned long cl = 0;
1517         unsigned long new_cl;
1518         u32 portid;
1519         u32 clid;
1520         u32 qid;
1521         int err;
1522
1523         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1524             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1525                 return -EPERM;
1526
1527         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1528         if (err < 0)
1529                 return err;
1530
1531         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1532         if (!dev)
1533                 return -ENODEV;
1534
1535         /*
1536            parent == TC_H_UNSPEC - unspecified parent.
1537            parent == TC_H_ROOT   - class is root, which has no parent.
1538            parent == X:0         - parent is root class.
1539            parent == X:Y         - parent is a node in hierarchy.
1540            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1541
1542            handle == 0:0         - generate handle from kernel pool.
1543            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1544            handle == X:Y         - clear.
1545            handle == X:0         - root class.
1546          */
1547
1548         /* Step 1. Determine qdisc handle X:0 */
1549
1550         portid = tcm->tcm_parent;
1551         clid = tcm->tcm_handle;
1552         qid = TC_H_MAJ(clid);
1553
1554         if (portid != TC_H_ROOT) {
1555                 u32 qid1 = TC_H_MAJ(portid);
1556
1557                 if (qid && qid1) {
1558                         /* If both majors are known, they must be identical. */
1559                         if (qid != qid1)
1560                                 return -EINVAL;
1561                 } else if (qid1) {
1562                         qid = qid1;
1563                 } else if (qid == 0)
1564                         qid = dev->qdisc->handle;
1565
1566                 /* Now qid is genuine qdisc handle consistent
1567                  * both with parent and child.
1568                  *
1569                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1570                  */
1571                 if (portid)
1572                         portid = TC_H_MAKE(qid, portid);
1573         } else {
1574                 if (qid == 0)
1575                         qid = dev->qdisc->handle;
1576         }
1577
1578         /* OK. Locate qdisc */
1579         q = qdisc_lookup(dev, qid);
1580         if (!q)
1581                 return -ENOENT;
1582
1583         /* An check that it supports classes */
1584         cops = q->ops->cl_ops;
1585         if (cops == NULL)
1586                 return -EINVAL;
1587
1588         /* Now try to get class */
1589         if (clid == 0) {
1590                 if (portid == TC_H_ROOT)
1591                         clid = qid;
1592         } else
1593                 clid = TC_H_MAKE(qid, clid);
1594
1595         if (clid)
1596                 cl = cops->get(q, clid);
1597
1598         if (cl == 0) {
1599                 err = -ENOENT;
1600                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1601                     !(n->nlmsg_flags & NLM_F_CREATE))
1602                         goto out;
1603         } else {
1604                 switch (n->nlmsg_type) {
1605                 case RTM_NEWTCLASS:
1606                         err = -EEXIST;
1607                         if (n->nlmsg_flags & NLM_F_EXCL)
1608                                 goto out;
1609                         break;
1610                 case RTM_DELTCLASS:
1611                         err = -EOPNOTSUPP;
1612                         if (cops->delete)
1613                                 err = cops->delete(q, cl);
1614                         if (err == 0)
1615                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1616                         goto out;
1617                 case RTM_GETTCLASS:
1618                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1619                         goto out;
1620                 default:
1621                         err = -EINVAL;
1622                         goto out;
1623                 }
1624         }
1625
1626         new_cl = cl;
1627         err = -EOPNOTSUPP;
1628         if (cops->change)
1629                 err = cops->change(q, clid, portid, tca, &new_cl);
1630         if (err == 0)
1631                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1632
1633 out:
1634         if (cl)
1635                 cops->put(q, cl);
1636
1637         return err;
1638 }
1639
1640
1641 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1642                           unsigned long cl,
1643                           u32 portid, u32 seq, u16 flags, int event)
1644 {
1645         struct tcmsg *tcm;
1646         struct nlmsghdr  *nlh;
1647         unsigned char *b = skb_tail_pointer(skb);
1648         struct gnet_dump d;
1649         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1650
1651         cond_resched();
1652         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1653         if (!nlh)
1654                 goto out_nlmsg_trim;
1655         tcm = nlmsg_data(nlh);
1656         tcm->tcm_family = AF_UNSPEC;
1657         tcm->tcm__pad1 = 0;
1658         tcm->tcm__pad2 = 0;
1659         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1660         tcm->tcm_parent = q->handle;
1661         tcm->tcm_handle = q->handle;
1662         tcm->tcm_info = 0;
1663         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1664                 goto nla_put_failure;
1665         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1666                 goto nla_put_failure;
1667
1668         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1669                                          qdisc_root_sleeping_lock(q), &d) < 0)
1670                 goto nla_put_failure;
1671
1672         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1673                 goto nla_put_failure;
1674
1675         if (gnet_stats_finish_copy(&d) < 0)
1676                 goto nla_put_failure;
1677
1678         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1679         return skb->len;
1680
1681 out_nlmsg_trim:
1682 nla_put_failure:
1683         nlmsg_trim(skb, b);
1684         return -1;
1685 }
1686
1687 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1688                          struct nlmsghdr *n, struct Qdisc *q,
1689                          unsigned long cl, int event)
1690 {
1691         struct sk_buff *skb;
1692         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1693
1694         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1695         if (!skb)
1696                 return -ENOBUFS;
1697
1698         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1699                 kfree_skb(skb);
1700                 return -EINVAL;
1701         }
1702
1703         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1704                               n->nlmsg_flags & NLM_F_ECHO);
1705 }
1706
1707 struct qdisc_dump_args {
1708         struct qdisc_walker     w;
1709         struct sk_buff          *skb;
1710         struct netlink_callback *cb;
1711 };
1712
1713 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1714 {
1715         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1716
1717         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1718                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1719 }
1720
1721 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1722                                 struct tcmsg *tcm, struct netlink_callback *cb,
1723                                 int *t_p, int s_t)
1724 {
1725         struct qdisc_dump_args arg;
1726
1727         if (tc_qdisc_dump_ignore(q) ||
1728             *t_p < s_t || !q->ops->cl_ops ||
1729             (tcm->tcm_parent &&
1730              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1731                 (*t_p)++;
1732                 return 0;
1733         }
1734         if (*t_p > s_t)
1735                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1736         arg.w.fn = qdisc_class_dump;
1737         arg.skb = skb;
1738         arg.cb = cb;
1739         arg.w.stop  = 0;
1740         arg.w.skip = cb->args[1];
1741         arg.w.count = 0;
1742         q->ops->cl_ops->walk(q, &arg.w);
1743         cb->args[1] = arg.w.count;
1744         if (arg.w.stop)
1745                 return -1;
1746         (*t_p)++;
1747         return 0;
1748 }
1749
1750 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1751                                struct tcmsg *tcm, struct netlink_callback *cb,
1752                                int *t_p, int s_t)
1753 {
1754         struct Qdisc *q;
1755
1756         if (!root)
1757                 return 0;
1758
1759         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1760                 return -1;
1761
1762         list_for_each_entry(q, &root->list, list) {
1763                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1764                         return -1;
1765         }
1766
1767         return 0;
1768 }
1769
1770 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1771 {
1772         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1773         struct net *net = sock_net(skb->sk);
1774         struct netdev_queue *dev_queue;
1775         struct net_device *dev;
1776         int t, s_t;
1777
1778         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1779                 return 0;
1780         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1781         if (!dev)
1782                 return 0;
1783
1784         s_t = cb->args[0];
1785         t = 0;
1786
1787         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1788                 goto done;
1789
1790         dev_queue = dev_ingress_queue(dev);
1791         if (dev_queue &&
1792             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1793                                 &t, s_t) < 0)
1794                 goto done;
1795
1796 done:
1797         cb->args[0] = t;
1798
1799         dev_put(dev);
1800         return skb->len;
1801 }
1802
1803 /* Main classifier routine: scans classifier chain attached
1804  * to this qdisc, (optionally) tests for protocol and asks
1805  * specific classifiers.
1806  */
1807 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1808                        struct tcf_result *res)
1809 {
1810         __be16 protocol = tc_skb_protocol(skb);
1811         int err;
1812
1813         for (; tp; tp = rcu_dereference_bh(tp->next)) {
1814                 if (tp->protocol != protocol &&
1815                     tp->protocol != htons(ETH_P_ALL))
1816                         continue;
1817                 err = tp->classify(skb, tp, res);
1818
1819                 if (err >= 0) {
1820 #ifdef CONFIG_NET_CLS_ACT
1821                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1822                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1823 #endif
1824                         return err;
1825                 }
1826         }
1827         return -1;
1828 }
1829 EXPORT_SYMBOL(tc_classify_compat);
1830
/* Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain.  The number of restarts is kept in the verdict bits of
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate-limited notice is
 * logged and TC_ACT_SHOT is returned.
 *
 * Returns the classification result of the final pass, or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;

	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		tp = head;

		if (verd >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		/* Account for this restart before going round again. */
		verd++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1860
1861 bool tcf_destroy(struct tcf_proto *tp, bool force)
1862 {
1863         if (tp->ops->destroy(tp, force)) {
1864                 module_put(tp->ops->owner);
1865                 kfree_rcu(tp, rcu);
1866                 return true;
1867         }
1868
1869         return false;
1870 }
1871
1872 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1873 {
1874         struct tcf_proto *tp;
1875
1876         while ((tp = rtnl_dereference(*fl)) != NULL) {
1877                 RCU_INIT_POINTER(*fl, tp->next);
1878                 tcf_destroy(tp, true);
1879         }
1880 }
1881 EXPORT_SYMBOL(tcf_destroy_chain);
1882
1883 #ifdef CONFIG_PROC_FS
1884 static int psched_show(struct seq_file *seq, void *v)
1885 {
1886         struct timespec ts;
1887
1888         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1889         seq_printf(seq, "%08x %08x %08x %08x\n",
1890                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1891                    1000000,
1892                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1893
1894         return 0;
1895 }
1896
1897 static int psched_open(struct inode *inode, struct file *file)
1898 {
1899         return single_open(file, psched_show, NULL);
1900 }
1901
1902 static const struct file_operations psched_fops = {
1903         .owner = THIS_MODULE,
1904         .open = psched_open,
1905         .read  = seq_read,
1906         .llseek = seq_lseek,
1907         .release = single_release,
1908 };
1909
1910 static int __net_init psched_net_init(struct net *net)
1911 {
1912         struct proc_dir_entry *e;
1913
1914         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1915         if (e == NULL)
1916                 return -ENOMEM;
1917
1918         return 0;
1919 }
1920
1921 static void __net_exit psched_net_exit(struct net *net)
1922 {
1923         remove_proc_entry("psched", net->proc_net);
1924 }
1925 #else
1926 static int __net_init psched_net_init(struct net *net)
1927 {
1928         return 0;
1929 }
1930
1931 static void __net_exit psched_net_exit(struct net *net)
1932 {
1933 }
1934 #endif
1935
1936 static struct pernet_operations psched_net_ops = {
1937         .init = psched_net_init,
1938         .exit = psched_net_exit,
1939 };
1940
1941 static int __init pktsched_init(void)
1942 {
1943         int err;
1944
1945         err = register_pernet_subsys(&psched_net_ops);
1946         if (err) {
1947                 pr_err("pktsched_init: "
1948                        "cannot initialize per netns operations\n");
1949                 return err;
1950         }
1951
1952         register_qdisc(&pfifo_fast_ops);
1953         register_qdisc(&pfifo_qdisc_ops);
1954         register_qdisc(&bfifo_qdisc_ops);
1955         register_qdisc(&pfifo_head_drop_qdisc_ops);
1956         register_qdisc(&mq_qdisc_ops);
1957
1958         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1959         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1960         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1961         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1962         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1963         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1964
1965         return 0;
1966 }
1967
1968 subsys_initcall(pktsched_init);