Merge tag 'mips_fixes_4.15_2' of git://git.kernel.org/pub/scm/linux/kernel/git/jhogan...
[sfrench/cifs-2.6.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39
40 /*
41
42    Short review.
43    -------------
44
45    This file consists of two interrelated parts:
46
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54
55    qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets into "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59
60    In turn, classes may have child qdiscs (as rule, queues)
61    attached to them etc. etc. etc.
62
63    The goal of the routines in this file is to translate
64    information supplied by the user in the form of handles
65    into a form more intelligible to the kernel, to perform sanity
66    checks and the part of the work that is common to all qdiscs,
67    and to provide rtnetlink notifications.
68
69    All real intelligent work is done inside qdisc modules.
70
71
72
73    Every discipline has two major routines: enqueue and dequeue.
74
75    ---dequeue
76
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83
84    ---enqueue
85
86    enqueue returns 0, if packet was enqueued successfully.
87    If packet (this one or another one) was dropped, it returns
88    not zero error code.
89    NET_XMIT_DROP        - this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93
94    Auxiliary routines:
95
96    ---peek
97
98    like dequeue but without removing a packet from the queue
99
100    ---reset
101
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104
105    ---init
106
107    initializes newly created qdisc.
108
109    ---destroy
110
111    destroys resources allocated by init and during lifetime of qdisc.
112
113    ---change
114
115    changes qdisc parameters.
116  */
117
118 /* Protects list of registered TC modules. It is pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120
121
122 /************************************************
123  *      Queueing disciplines manipulation.      *
124  ************************************************/
125
126
127 /* The list of all installed queueing disciplines. */
128
129 static struct Qdisc_ops *qdisc_base;
130
131 /* Register/unregister queueing discipline */
132
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135         struct Qdisc_ops *q, **qp;
136         int rc = -EEXIST;
137
138         write_lock(&qdisc_mod_lock);
139         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140                 if (!strcmp(qops->id, q->id))
141                         goto out;
142
143         if (qops->enqueue == NULL)
144                 qops->enqueue = noop_qdisc_ops.enqueue;
145         if (qops->peek == NULL) {
146                 if (qops->dequeue == NULL)
147                         qops->peek = noop_qdisc_ops.peek;
148                 else
149                         goto out_einval;
150         }
151         if (qops->dequeue == NULL)
152                 qops->dequeue = noop_qdisc_ops.dequeue;
153
154         if (qops->cl_ops) {
155                 const struct Qdisc_class_ops *cops = qops->cl_ops;
156
157                 if (!(cops->find && cops->walk && cops->leaf))
158                         goto out_einval;
159
160                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161                         goto out_einval;
162         }
163
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170
171 out_einval:
172         rc = -EINVAL;
173         goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179         struct Qdisc_ops *q, **qp;
180         int err = -ENOENT;
181
182         write_lock(&qdisc_mod_lock);
183         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184                 if (q == qops)
185                         break;
186         if (q) {
187                 *qp = q->next;
188                 q->next = NULL;
189                 err = 0;
190         }
191         write_unlock(&qdisc_mod_lock);
192         return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199         read_lock(&qdisc_mod_lock);
200         strlcpy(name, default_qdisc_ops->id, len);
201         read_unlock(&qdisc_mod_lock);
202 }
203
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206         struct Qdisc_ops *q = NULL;
207
208         for (q = qdisc_base; q; q = q->next) {
209                 if (!strcmp(name, q->id)) {
210                         if (!try_module_get(q->owner))
211                                 q = NULL;
212                         break;
213                 }
214         }
215
216         return q;
217 }
218
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222         const struct Qdisc_ops *ops;
223
224         if (!capable(CAP_NET_ADMIN))
225                 return -EPERM;
226
227         write_lock(&qdisc_mod_lock);
228         ops = qdisc_lookup_default(name);
229         if (!ops) {
230                 /* Not found, drop lock and try to load module */
231                 write_unlock(&qdisc_mod_lock);
232                 request_module("sch_%s", name);
233                 write_lock(&qdisc_mod_lock);
234
235                 ops = qdisc_lookup_default(name);
236         }
237
238         if (ops) {
239                 /* Set new default */
240                 module_put(default_qdisc_ops->owner);
241                 default_qdisc_ops = ops;
242         }
243         write_unlock(&qdisc_mod_lock);
244
245         return ops ? 0 : -ENOENT;
246 }
247
248 #ifdef CONFIG_NET_SCH_DEFAULT
249 /* Late initcall: make CONFIG_DEFAULT_NET_SCH the default qdisc via qdisc_set_default(); its return value is the initcall result */
250 static int __init sch_default_qdisc(void)
251 {
252         return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
253 }
254 late_initcall(sch_default_qdisc);
255 #endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264         struct Qdisc *q;
265
266         if (!qdisc_dev(root))
267                 return (root->handle == handle ? root : NULL);
268
269         if (!(root->flags & TCQ_F_BUILTIN) &&
270             root->handle == handle)
271                 return root;
272
273         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(dev->qdisc, handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320         unsigned long cl;
321         struct Qdisc *leaf;
322         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323
324         if (cops == NULL)
325                 return NULL;
326         cl = cops->find(p, classid);
327
328         if (cl == 0)
329                 return NULL;
330         leaf = cops->leaf(p, cl);
331         return leaf;
332 }
333
334 /* Find queueing discipline by name */
335
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338         struct Qdisc_ops *q = NULL;
339
340         if (kind) {
341                 read_lock(&qdisc_mod_lock);
342                 for (q = qdisc_base; q; q = q->next) {
343                         if (nla_strcmp(kind, q->id) == 0) {
344                                 if (!try_module_get(q->owner))
345                                         q = NULL;
346                                 break;
347                         }
348                 }
349                 read_unlock(&qdisc_mod_lock);
350         }
351         return q;
352 }
353
354 /* Older versions of iproute2 did not transfer the linklayer setting,
355  * and the rate table lookup systems have been dropped in
356  * the kernel. To stay backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting whether the rate
358  * table was modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, then
365  * the rate tables have been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373         int low       = roundup(r->mpu, 48);
374         int high      = roundup(low+1, 48);
375         int cell_low  = low >> r->cell_log;
376         int cell_high = (high >> r->cell_log) - 1;
377
378         /* rtab is too inaccurate at rates > 100Mbit/s */
379         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380                 pr_debug("TC linklayer: Giving up ATM detection\n");
381                 return TC_LINKLAYER_ETHERNET;
382         }
383
384         if ((cell_high > cell_low) && (cell_high < 256)
385             && (rtab[cell_low] == rtab[cell_high])) {
386                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387                          cell_low, cell_high, rtab[cell_high]);
388                 return TC_LINKLAYER_ATM;
389         }
390         return TC_LINKLAYER_ETHERNET;
391 }
392
/* Head of the singly linked list of rate tables handed out by qdisc_get_rtab() and released by qdisc_put_rtab() */
393 static struct qdisc_rate_table *qdisc_rtab_list;
394
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396                                         struct nlattr *tab)
397 {
398         struct qdisc_rate_table *rtab;
399
400         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
401             nla_len(tab) != TC_RTAB_SIZE)
402                 return NULL;
403
404         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
405                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
406                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
407                         rtab->refcnt++;
408                         return rtab;
409                 }
410         }
411
412         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
413         if (rtab) {
414                 rtab->rate = *r;
415                 rtab->refcnt = 1;
416                 memcpy(rtab->data, nla_data(tab), 1024);
417                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
418                         r->linklayer = __detect_linklayer(r, rtab->data);
419                 rtab->next = qdisc_rtab_list;
420                 qdisc_rtab_list = rtab;
421         }
422         return rtab;
423 }
424 EXPORT_SYMBOL(qdisc_get_rtab);
425
426 void qdisc_put_rtab(struct qdisc_rate_table *tab)
427 {
428         struct qdisc_rate_table *rtab, **rtabp;
429
430         if (!tab || --tab->refcnt)
431                 return;
432
433         for (rtabp = &qdisc_rtab_list;
434              (rtab = *rtabp) != NULL;
435              rtabp = &rtab->next) {
436                 if (rtab == tab) {
437                         *rtabp = rtab->next;
438                         kfree(rtab);
439                         return;
440                 }
441         }
442 }
443 EXPORT_SYMBOL(qdisc_put_rtab);
444
/* List of the size tables created by qdisc_get_stab(); entries are removed in qdisc_put_stab() */
445 static LIST_HEAD(qdisc_stab_list);
446
/* Netlink policy qdisc_get_stab() uses to parse the nested size-table attributes */
447 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
448         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
449         [TCA_STAB_DATA] = { .type = NLA_BINARY },
450 };
451
452 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
453 {
454         struct nlattr *tb[TCA_STAB_MAX + 1];
455         struct qdisc_size_table *stab;
456         struct tc_sizespec *s;
457         unsigned int tsize = 0;
458         u16 *tab = NULL;
459         int err;
460
461         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, NULL);
462         if (err < 0)
463                 return ERR_PTR(err);
464         if (!tb[TCA_STAB_BASE])
465                 return ERR_PTR(-EINVAL);
466
467         s = nla_data(tb[TCA_STAB_BASE]);
468
469         if (s->tsize > 0) {
470                 if (!tb[TCA_STAB_DATA])
471                         return ERR_PTR(-EINVAL);
472                 tab = nla_data(tb[TCA_STAB_DATA]);
473                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
474         }
475
476         if (tsize != s->tsize || (!tab && tsize > 0))
477                 return ERR_PTR(-EINVAL);
478
479         list_for_each_entry(stab, &qdisc_stab_list, list) {
480                 if (memcmp(&stab->szopts, s, sizeof(*s)))
481                         continue;
482                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
483                         continue;
484                 stab->refcnt++;
485                 return stab;
486         }
487
488         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
489         if (!stab)
490                 return ERR_PTR(-ENOMEM);
491
492         stab->refcnt = 1;
493         stab->szopts = *s;
494         if (tsize > 0)
495                 memcpy(stab->data, tab, tsize * sizeof(u16));
496
497         list_add_tail(&stab->list, &qdisc_stab_list);
498
499         return stab;
500 }
501
502 static void stab_kfree_rcu(struct rcu_head *head)
503 {
504         kfree(container_of(head, struct qdisc_size_table, rcu));
505 }
506
507 void qdisc_put_stab(struct qdisc_size_table *tab)
508 {
509         if (!tab)
510                 return;
511
512         if (--tab->refcnt == 0) {
513                 list_del(&tab->list);
514                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
515         }
516 }
517 EXPORT_SYMBOL(qdisc_put_stab);
518
519 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
520 {
521         struct nlattr *nest;
522
523         nest = nla_nest_start(skb, TCA_STAB);
524         if (nest == NULL)
525                 goto nla_put_failure;
526         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
527                 goto nla_put_failure;
528         nla_nest_end(skb, nest);
529
530         return skb->len;
531
532 nla_put_failure:
533         return -1;
534 }
535
536 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
537                                const struct qdisc_size_table *stab)
538 {
539         int pkt_len, slot;
540
541         pkt_len = skb->len + stab->szopts.overhead;
542         if (unlikely(!stab->szopts.tsize))
543                 goto out;
544
545         slot = pkt_len + stab->szopts.cell_align;
546         if (unlikely(slot < 0))
547                 slot = 0;
548
549         slot >>= stab->szopts.cell_log;
550         if (likely(slot < stab->szopts.tsize))
551                 pkt_len = stab->data[slot];
552         else
553                 pkt_len = stab->data[stab->szopts.tsize - 1] *
554                                 (slot / stab->szopts.tsize) +
555                                 stab->data[slot % stab->szopts.tsize];
556
557         pkt_len <<= stab->szopts.size_log;
558 out:
559         if (unlikely(pkt_len < 1))
560                 pkt_len = 1;
561         qdisc_skb_cb(skb)->pkt_len = pkt_len;
562 }
563 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
564
565 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
566 {
567         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
568                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
569                         txt, qdisc->ops->id, qdisc->handle >> 16);
570                 qdisc->flags |= TCQ_F_WARN_NONWC;
571         }
572 }
573 EXPORT_SYMBOL(qdisc_warn_nonwc);
574
575 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
576 {
577         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
578                                                  timer);
579
580         rcu_read_lock();
581         __netif_schedule(qdisc_root(wd->qdisc));
582         rcu_read_unlock();
583
584         return HRTIMER_NORESTART;
585 }
586
587 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
588 {
589         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
590         wd->timer.function = qdisc_watchdog;
591         wd->qdisc = qdisc;
592 }
593 EXPORT_SYMBOL(qdisc_watchdog_init);
594
595 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
596 {
597         if (test_bit(__QDISC_STATE_DEACTIVATED,
598                      &qdisc_root_sleeping(wd->qdisc)->state))
599                 return;
600
601         if (wd->last_expires == expires)
602                 return;
603
604         wd->last_expires = expires;
605         hrtimer_start(&wd->timer,
606                       ns_to_ktime(expires),
607                       HRTIMER_MODE_ABS_PINNED);
608 }
609 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
610
/* Cancel the hrtimer of watchdog @wd by calling hrtimer_cancel() on it */
611 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
612 {
613         hrtimer_cancel(&wd->timer);
614 }
615 EXPORT_SYMBOL(qdisc_watchdog_cancel);
616
617 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
618 {
619         struct hlist_head *h;
620         unsigned int i;
621
622         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
623
624         if (h != NULL) {
625                 for (i = 0; i < n; i++)
626                         INIT_HLIST_HEAD(&h[i]);
627         }
628         return h;
629 }
630
631 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
632 {
633         struct Qdisc_class_common *cl;
634         struct hlist_node *next;
635         struct hlist_head *nhash, *ohash;
636         unsigned int nsize, nmask, osize;
637         unsigned int i, h;
638
639         /* Rehash when load factor exceeds 0.75 */
640         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
641                 return;
642         nsize = clhash->hashsize * 2;
643         nmask = nsize - 1;
644         nhash = qdisc_class_hash_alloc(nsize);
645         if (nhash == NULL)
646                 return;
647
648         ohash = clhash->hash;
649         osize = clhash->hashsize;
650
651         sch_tree_lock(sch);
652         for (i = 0; i < osize; i++) {
653                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
654                         h = qdisc_class_hash(cl->classid, nmask);
655                         hlist_add_head(&cl->hnode, &nhash[h]);
656                 }
657         }
658         clhash->hash     = nhash;
659         clhash->hashsize = nsize;
660         clhash->hashmask = nmask;
661         sch_tree_unlock(sch);
662
663         kvfree(ohash);
664 }
665 EXPORT_SYMBOL(qdisc_class_hash_grow);
666
667 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
668 {
669         unsigned int size = 4;
670
671         clhash->hash = qdisc_class_hash_alloc(size);
672         if (clhash->hash == NULL)
673                 return -ENOMEM;
674         clhash->hashsize  = size;
675         clhash->hashmask  = size - 1;
676         clhash->hashelems = 0;
677         return 0;
678 }
679 EXPORT_SYMBOL(qdisc_class_hash_init);
680
/* Free the bucket array of @clhash with kvfree(); the classes themselves are not touched here */
681 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
682 {
683         kvfree(clhash->hash);
684 }
685 EXPORT_SYMBOL(qdisc_class_hash_destroy);
686
687 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
688                              struct Qdisc_class_common *cl)
689 {
690         unsigned int h;
691
692         INIT_HLIST_NODE(&cl->hnode);
693         h = qdisc_class_hash(cl->classid, clhash->hashmask);
694         hlist_add_head(&cl->hnode, &clhash->hash[h]);
695         clhash->hashelems++;
696 }
697 EXPORT_SYMBOL(qdisc_class_hash_insert);
698
/* Unlink class @cl from its hash bucket and decrement the element count of @clhash */
699 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
700                              struct Qdisc_class_common *cl)
701 {
702         hlist_del(&cl->hnode);
703         clhash->hashelems--;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_remove);
706
707 /* Allocate an unique handle from space managed by kernel
708  * Possible range is [8000-FFFF]:0000 (0x8000 values)
709  */
710 static u32 qdisc_alloc_handle(struct net_device *dev)
711 {
712         int i = 0x8000;
713         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
714
715         do {
716                 autohandle += TC_H_MAKE(0x10000U, 0);
717                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
718                         autohandle = TC_H_MAKE(0x80000000U, 0);
719                 if (!qdisc_lookup(dev, autohandle))
720                         return autohandle;
721                 cond_resched();
722         } while (--i > 0);
723
724         return 0;
725 }
726
727 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
728                                unsigned int len)
729 {
730         const struct Qdisc_class_ops *cops;
731         unsigned long cl;
732         u32 parentid;
733         bool notify;
734         int drops;
735
736         if (n == 0 && len == 0)
737                 return;
738         drops = max_t(int, n, 0);
739         rcu_read_lock();
740         while ((parentid = sch->parent)) {
741                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
742                         break;
743
744                 if (sch->flags & TCQ_F_NOPARENT)
745                         break;
746                 /* Notify parent qdisc only if child qdisc becomes empty.
747                  *
748                  * If child was empty even before update then backlog
749                  * counter is screwed and we skip notification because
750                  * parent class is already passive.
751                  */
752                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
753                 /* TODO: perform the search on a per txq basis */
754                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
755                 if (sch == NULL) {
756                         WARN_ON_ONCE(parentid != TC_H_ROOT);
757                         break;
758                 }
759                 cops = sch->ops->cl_ops;
760                 if (notify && cops->qlen_notify) {
761                         cl = cops->find(sch, parentid);
762                         cops->qlen_notify(sch, cl);
763                 }
764                 sch->q.qlen -= n;
765                 sch->qstats.backlog -= len;
766                 __qdisc_qstats_drop(sch, drops);
767         }
768         rcu_read_unlock();
769 }
770 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
771
772 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
773                          u32 portid, u32 seq, u16 flags, int event)
774 {
775         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
776         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
777         struct tcmsg *tcm;
778         struct nlmsghdr  *nlh;
779         unsigned char *b = skb_tail_pointer(skb);
780         struct gnet_dump d;
781         struct qdisc_size_table *stab;
782         __u32 qlen;
783
784         cond_resched();
785         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
786         if (!nlh)
787                 goto out_nlmsg_trim;
788         tcm = nlmsg_data(nlh);
789         tcm->tcm_family = AF_UNSPEC;
790         tcm->tcm__pad1 = 0;
791         tcm->tcm__pad2 = 0;
792         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
793         tcm->tcm_parent = clid;
794         tcm->tcm_handle = q->handle;
795         tcm->tcm_info = refcount_read(&q->refcnt);
796         if (nla_put_string(skb, TCA_KIND, q->ops->id))
797                 goto nla_put_failure;
798         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
799                 goto nla_put_failure;
800         if (q->ops->dump && q->ops->dump(q, skb) < 0)
801                 goto nla_put_failure;
802         qlen = q->q.qlen;
803
804         stab = rtnl_dereference(q->stab);
805         if (stab && qdisc_dump_stab(skb, stab) < 0)
806                 goto nla_put_failure;
807
808         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
809                                          NULL, &d, TCA_PAD) < 0)
810                 goto nla_put_failure;
811
812         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
813                 goto nla_put_failure;
814
815         if (qdisc_is_percpu_stats(q)) {
816                 cpu_bstats = q->cpu_bstats;
817                 cpu_qstats = q->cpu_qstats;
818         }
819
820         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
821                                   &d, cpu_bstats, &q->bstats) < 0 ||
822             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
823             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
824                 goto nla_put_failure;
825
826         if (gnet_stats_finish_copy(&d) < 0)
827                 goto nla_put_failure;
828
829         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
830         return skb->len;
831
832 out_nlmsg_trim:
833 nla_put_failure:
834         nlmsg_trim(skb, b);
835         return -1;
836 }
837
838 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
839 {
840         if (q->flags & TCQ_F_BUILTIN)
841                 return true;
842         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
843                 return true;
844
845         return false;
846 }
847
848 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
849                         struct nlmsghdr *n, u32 clid,
850                         struct Qdisc *old, struct Qdisc *new)
851 {
852         struct sk_buff *skb;
853         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
854
855         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
856         if (!skb)
857                 return -ENOBUFS;
858
859         if (old && !tc_qdisc_dump_ignore(old, false)) {
860                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
861                                   0, RTM_DELQDISC) < 0)
862                         goto err_out;
863         }
864         if (new && !tc_qdisc_dump_ignore(new, false)) {
865                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
866                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
867                         goto err_out;
868         }
869
870         if (skb->len)
871                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
872                                       n->nlmsg_flags & NLM_F_ECHO);
873
874 err_out:
875         kfree_skb(skb);
876         return -EINVAL;
877 }
878
879 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
880                                struct nlmsghdr *n, u32 clid,
881                                struct Qdisc *old, struct Qdisc *new)
882 {
883         if (new || old)
884                 qdisc_notify(net, skb, n, clid, old, new);
885
886         if (old)
887                 qdisc_destroy(old);
888 }
889
890 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
891  * to device "dev".
892  *
893  * When appropriate send a netlink notification using 'skb'
894  * and "n".
895  *
896  * On success, destroy old qdisc.
897  */
898
899 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
900                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
901                        struct Qdisc *new, struct Qdisc *old)
902 {
903         struct Qdisc *q = old;
904         struct net *net = dev_net(dev);
905         int err = 0;
906
            /* No parent qdisc: graft directly onto the device's queues. */
907         if (parent == NULL) {
908                 unsigned int i, num_q, ingress;
909
910                 ingress = 0;
911                 num_q = dev->num_tx_queues;
                    /* An ingress qdisc (old or new) uses the single ingress
                     * queue instead of the tx queues; it must exist.
                     */
912                 if ((q && q->flags & TCQ_F_INGRESS) ||
913                     (new && new->flags & TCQ_F_INGRESS)) {
914                         num_q = 1;
915                         ingress = 1;
916                         if (!dev_ingress_queue(dev))
917                                 return -ENOENT;
918                 }
919
                    /* A device that is up is deactivated for the swap and
                     * activated again at the end of this branch.
                     */
920                 if (dev->flags & IFF_UP)
921                         dev_deactivate(dev);
922
                    /* A qdisc with an ->attach() hook skips the per-queue
                     * grafting loop; in the non-ingress case below the hook
                     * is called instead.
                     */
923                 if (new && new->ops->attach)
924                         goto skip;
925
926                 for (i = 0; i < num_q; i++) {
927                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
928
929                         if (!ingress)
930                                 dev_queue = netdev_get_tx_queue(dev, i);
931
932                         old = dev_graft_qdisc(dev_queue, new);
                            /* every queue after the first takes another
                             * reference on the shared new qdisc
                             */
933                         if (new && i > 0)
934                                 qdisc_refcount_inc(new);
935
936                         if (!ingress)
937                                 qdisc_destroy(old);
938                 }
939
940 skip:
941                 if (!ingress) {
                            /* Notify about and destroy the previous
                             * dev->qdisc, then install the new one, falling
                             * back to noop_qdisc when new is NULL.
                             */
942                         notify_and_destroy(net, skb, n, classid,
943                                            dev->qdisc, new);
944                         if (new && !new->ops->attach)
945                                 qdisc_refcount_inc(new);
946                         dev->qdisc = new ? : &noop_qdisc;
947
948                         if (new && new->ops->attach)
949                                 new->ops->attach(new);
950                 } else {
951                         notify_and_destroy(net, skb, n, classid, old, new);
952                 }
953
954                 if (dev->flags & IFF_UP)
955                         dev_activate(dev);
956         } else {
                    /* Grafting under a class: delegate to the parent's class
                     * ops.  -EOPNOTSUPP if the parent has no ->graft(),
                     * -ENOENT if classid does not resolve to a class.
                     */
957                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
958
959                 err = -EOPNOTSUPP;
960                 if (cops && cops->graft) {
961                         unsigned long cl = cops->find(parent, classid);
962
963                         if (cl)
964                                 err = cops->graft(parent, cl, new, &old);
965                         else
966                                 err = -ENOENT;
967                 }
                    /* only a successful graft is announced, and only then
                     * is the old qdisc destroyed
                     */
968                 if (!err)
969                         notify_and_destroy(net, skb, n, classid, old, new);
970         }
971         return err;
972 }
973
974 /* lockdep annotation is needed for ingress; egress gets it only for name */
975 static struct lock_class_key qdisc_tx_lock;
976 static struct lock_class_key qdisc_rx_lock;
977
978 /*
979    Allocate and initialize new qdisc.
980
981    Parameters are passed via opt.
982  */
983
984 static struct Qdisc *qdisc_create(struct net_device *dev,
985                                   struct netdev_queue *dev_queue,
986                                   struct Qdisc *p, u32 parent, u32 handle,
987                                   struct nlattr **tca, int *errp)
988 {
989         int err;
990         struct nlattr *kind = tca[TCA_KIND];
991         struct Qdisc *sch;
992         struct Qdisc_ops *ops;
993         struct qdisc_size_table *stab;
994
995         ops = qdisc_lookup_ops(kind);
996 #ifdef CONFIG_MODULES
997         if (ops == NULL && kind != NULL) {
998                 char name[IFNAMSIZ];
999                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1000                         /* We dropped the RTNL semaphore in order to
1001                          * perform the module load.  So, even if we
1002                          * succeeded in loading the module we have to
1003                          * tell the caller to replay the request.  We
1004                          * indicate this using -EAGAIN.
1005                          * We replay the request because the device may
1006                          * go away in the mean time.
1007                          */
1008                         rtnl_unlock();
1009                         request_module("sch_%s", name);
1010                         rtnl_lock();
1011                         ops = qdisc_lookup_ops(kind);
1012                         if (ops != NULL) {
1013                                 /* We will try again qdisc_lookup_ops,
1014                                  * so don't keep a reference.
1015                                  */
1016                                 module_put(ops->owner);
1017                                 err = -EAGAIN;
1018                                 goto err_out;
1019                         }
1020                 }
1021         }
1022 #endif
1023
1024         err = -ENOENT;
1025         if (ops == NULL)
1026                 goto err_out;
1027
1028         sch = qdisc_alloc(dev_queue, ops);
1029         if (IS_ERR(sch)) {
1030                 err = PTR_ERR(sch);
1031                 goto err_out2;
1032         }
1033
1034         sch->parent = parent;
1035
1036         if (handle == TC_H_INGRESS) {
1037                 sch->flags |= TCQ_F_INGRESS;
1038                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1039                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1040         } else {
1041                 if (handle == 0) {
1042                         handle = qdisc_alloc_handle(dev);
1043                         err = -ENOMEM;
1044                         if (handle == 0)
1045                                 goto err_out3;
1046                 }
1047                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1048                 if (!netif_is_multiqueue(dev))
1049                         sch->flags |= TCQ_F_ONETXQUEUE;
1050         }
1051
1052         sch->handle = handle;
1053
1054         /* This exist to keep backward compatible with a userspace
1055          * loophole, what allowed userspace to get IFF_NO_QUEUE
1056          * facility on older kernels by setting tx_queue_len=0 (prior
1057          * to qdisc init), and then forgot to reinit tx_queue_len
1058          * before again attaching a qdisc.
1059          */
1060         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1061                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1062                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1063         }
1064
1065         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
1066                 if (tca[TCA_STAB]) {
1067                         stab = qdisc_get_stab(tca[TCA_STAB]);
1068                         if (IS_ERR(stab)) {
1069                                 err = PTR_ERR(stab);
1070                                 goto err_out4;
1071                         }
1072                         rcu_assign_pointer(sch->stab, stab);
1073                 }
1074                 if (tca[TCA_RATE]) {
1075                         seqcount_t *running;
1076
1077                         err = -EOPNOTSUPP;
1078                         if (sch->flags & TCQ_F_MQROOT)
1079                                 goto err_out4;
1080
1081                         if ((sch->parent != TC_H_ROOT) &&
1082                             !(sch->flags & TCQ_F_INGRESS) &&
1083                             (!p || !(p->flags & TCQ_F_MQROOT)))
1084                                 running = qdisc_root_sleeping_running(sch);
1085                         else
1086                                 running = &sch->running;
1087
1088                         err = gen_new_estimator(&sch->bstats,
1089                                                 sch->cpu_bstats,
1090                                                 &sch->rate_est,
1091                                                 NULL,
1092                                                 running,
1093                                                 tca[TCA_RATE]);
1094                         if (err)
1095                                 goto err_out4;
1096                 }
1097
1098                 qdisc_hash_add(sch, false);
1099
1100                 return sch;
1101         }
1102         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1103         if (ops->destroy)
1104                 ops->destroy(sch);
1105 err_out3:
1106         dev_put(dev);
1107         qdisc_free(sch);
1108 err_out2:
1109         module_put(ops->owner);
1110 err_out:
1111         *errp = err;
1112         return NULL;
1113
1114 err_out4:
1115         /*
1116          * Any broken qdiscs that would require a ops->reset() here?
1117          * The qdisc was never in action so it shouldn't be necessary.
1118          */
1119         qdisc_put_stab(rtnl_dereference(sch->stab));
1120         if (ops->destroy)
1121                 ops->destroy(sch);
1122         goto err_out3;
1123 }
1124
1125 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1126 {
1127         struct qdisc_size_table *ostab, *stab = NULL;
1128         int err = 0;
1129
1130         if (tca[TCA_OPTIONS]) {
1131                 if (sch->ops->change == NULL)
1132                         return -EINVAL;
1133                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1134                 if (err)
1135                         return err;
1136         }
1137
1138         if (tca[TCA_STAB]) {
1139                 stab = qdisc_get_stab(tca[TCA_STAB]);
1140                 if (IS_ERR(stab))
1141                         return PTR_ERR(stab);
1142         }
1143
1144         ostab = rtnl_dereference(sch->stab);
1145         rcu_assign_pointer(sch->stab, stab);
1146         qdisc_put_stab(ostab);
1147
1148         if (tca[TCA_RATE]) {
1149                 /* NB: ignores errors from replace_estimator
1150                    because change can't be undone. */
1151                 if (sch->flags & TCQ_F_MQROOT)
1152                         goto out;
1153                 gen_replace_estimator(&sch->bstats,
1154                                       sch->cpu_bstats,
1155                                       &sch->rate_est,
1156                                       NULL,
1157                                       qdisc_root_sleeping_running(sch),
1158                                       tca[TCA_RATE]);
1159         }
1160 out:
1161         return 0;
1162 }
1163
1164 struct check_loop_arg {
1165         struct qdisc_walker     w;
1166         struct Qdisc            *p;
1167         int                     depth;
1168 };
1169
1170 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1171                          struct qdisc_walker *w);
1172
1173 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1174 {
1175         struct check_loop_arg   arg;
1176
1177         if (q->ops->cl_ops == NULL)
1178                 return 0;
1179
1180         arg.w.stop = arg.w.skip = arg.w.count = 0;
1181         arg.w.fn = check_loop_fn;
1182         arg.depth = depth;
1183         arg.p = p;
1184         q->ops->cl_ops->walk(q, &arg.w);
1185         return arg.w.stop ? -ELOOP : 0;
1186 }
1187
1188 static int
1189 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1190 {
1191         struct Qdisc *leaf;
1192         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1193         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1194
1195         leaf = cops->leaf(q, cl);
1196         if (leaf) {
1197                 if (leaf == arg->p || arg->depth > 7)
1198                         return -ELOOP;
1199                 return check_loop(leaf, arg->p, arg->depth + 1);
1200         }
1201         return 0;
1202 }
1203
1204 /*
1205  * Delete/get qdisc.
1206  */
1207
1208 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1209                         struct netlink_ext_ack *extack)
1210 {
1211         struct net *net = sock_net(skb->sk);
1212         struct tcmsg *tcm = nlmsg_data(n);
1213         struct nlattr *tca[TCA_MAX + 1];
1214         struct net_device *dev;
1215         u32 clid;
1216         struct Qdisc *q = NULL;
1217         struct Qdisc *p = NULL;
1218         int err;
1219
1220         if ((n->nlmsg_type != RTM_GETQDISC) &&
1221             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1222                 return -EPERM;
1223
1224         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1225         if (err < 0)
1226                 return err;
1227
1228         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1229         if (!dev)
1230                 return -ENODEV;
1231
1232         clid = tcm->tcm_parent;
1233         if (clid) {
1234                 if (clid != TC_H_ROOT) {
1235                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1236                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1237                                 if (!p)
1238                                         return -ENOENT;
1239                                 q = qdisc_leaf(p, clid);
1240                         } else if (dev_ingress_queue(dev)) {
1241                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1242                         }
1243                 } else {
1244                         q = dev->qdisc;
1245                 }
1246                 if (!q)
1247                         return -ENOENT;
1248
1249                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1250                         return -EINVAL;
1251         } else {
1252                 q = qdisc_lookup(dev, tcm->tcm_handle);
1253                 if (!q)
1254                         return -ENOENT;
1255         }
1256
1257         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1258                 return -EINVAL;
1259
1260         if (n->nlmsg_type == RTM_DELQDISC) {
1261                 if (!clid)
1262                         return -EINVAL;
1263                 if (q->handle == 0)
1264                         return -ENOENT;
1265                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1266                 if (err != 0)
1267                         return err;
1268         } else {
1269                 qdisc_notify(net, skb, n, clid, NULL, q);
1270         }
1271         return 0;
1272 }
1273
1274 /*
1275  * Create/change qdisc.
1276  */
1277
1278 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1279                            struct netlink_ext_ack *extack)
1280 {
1281         struct net *net = sock_net(skb->sk);
1282         struct tcmsg *tcm;
1283         struct nlattr *tca[TCA_MAX + 1];
1284         struct net_device *dev;
1285         u32 clid;
1286         struct Qdisc *q, *p;
1287         int err;
1288
1289         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1290                 return -EPERM;
1291
1292 replay:
1293         /* Reinit, just in case something touches this. */
1294         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1295         if (err < 0)
1296                 return err;
1297
1298         tcm = nlmsg_data(n);
1299         clid = tcm->tcm_parent;
1300         q = p = NULL;
1301
1302         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1303         if (!dev)
1304                 return -ENODEV;
1305
1306
1307         if (clid) {
1308                 if (clid != TC_H_ROOT) {
1309                         if (clid != TC_H_INGRESS) {
1310                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1311                                 if (!p)
1312                                         return -ENOENT;
1313                                 q = qdisc_leaf(p, clid);
1314                         } else if (dev_ingress_queue_create(dev)) {
1315                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1316                         }
1317                 } else {
1318                         q = dev->qdisc;
1319                 }
1320
1321                 /* It may be default qdisc, ignore it */
1322                 if (q && q->handle == 0)
1323                         q = NULL;
1324
1325                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1326                         if (tcm->tcm_handle) {
1327                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1328                                         return -EEXIST;
1329                                 if (TC_H_MIN(tcm->tcm_handle))
1330                                         return -EINVAL;
1331                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1332                                 if (!q)
1333                                         goto create_n_graft;
1334                                 if (n->nlmsg_flags & NLM_F_EXCL)
1335                                         return -EEXIST;
1336                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1337                                         return -EINVAL;
1338                                 if (q == p ||
1339                                     (p && check_loop(q, p, 0)))
1340                                         return -ELOOP;
1341                                 qdisc_refcount_inc(q);
1342                                 goto graft;
1343                         } else {
1344                                 if (!q)
1345                                         goto create_n_graft;
1346
1347                                 /* This magic test requires explanation.
1348                                  *
1349                                  *   We know, that some child q is already
1350                                  *   attached to this parent and have choice:
1351                                  *   either to change it or to create/graft new one.
1352                                  *
1353                                  *   1. We are allowed to create/graft only
1354                                  *   if CREATE and REPLACE flags are set.
1355                                  *
1356                                  *   2. If EXCL is set, requestor wanted to say,
1357                                  *   that qdisc tcm_handle is not expected
1358                                  *   to exist, so that we choose create/graft too.
1359                                  *
1360                                  *   3. The last case is when no flags are set.
1361                                  *   Alas, it is sort of hole in API, we
1362                                  *   cannot decide what to do unambiguously.
1363                                  *   For now we select create/graft, if
1364                                  *   user gave KIND, which does not match existing.
1365                                  */
1366                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1367                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1368                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1369                                      (tca[TCA_KIND] &&
1370                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1371                                         goto create_n_graft;
1372                         }
1373                 }
1374         } else {
1375                 if (!tcm->tcm_handle)
1376                         return -EINVAL;
1377                 q = qdisc_lookup(dev, tcm->tcm_handle);
1378         }
1379
1380         /* Change qdisc parameters */
1381         if (q == NULL)
1382                 return -ENOENT;
1383         if (n->nlmsg_flags & NLM_F_EXCL)
1384                 return -EEXIST;
1385         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1386                 return -EINVAL;
1387         err = qdisc_change(q, tca);
1388         if (err == 0)
1389                 qdisc_notify(net, skb, n, clid, NULL, q);
1390         return err;
1391
1392 create_n_graft:
1393         if (!(n->nlmsg_flags & NLM_F_CREATE))
1394                 return -ENOENT;
1395         if (clid == TC_H_INGRESS) {
1396                 if (dev_ingress_queue(dev))
1397                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1398                                          tcm->tcm_parent, tcm->tcm_parent,
1399                                          tca, &err);
1400                 else
1401                         err = -ENOENT;
1402         } else {
1403                 struct netdev_queue *dev_queue;
1404
1405                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1406                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1407                 else if (p)
1408                         dev_queue = p->dev_queue;
1409                 else
1410                         dev_queue = netdev_get_tx_queue(dev, 0);
1411
1412                 q = qdisc_create(dev, dev_queue, p,
1413                                  tcm->tcm_parent, tcm->tcm_handle,
1414                                  tca, &err);
1415         }
1416         if (q == NULL) {
1417                 if (err == -EAGAIN)
1418                         goto replay;
1419                 return err;
1420         }
1421
1422 graft:
1423         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1424         if (err) {
1425                 if (q)
1426                         qdisc_destroy(q);
1427                 return err;
1428         }
1429
1430         return 0;
1431 }
1432
1433 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1434                               struct netlink_callback *cb,
1435                               int *q_idx_p, int s_q_idx, bool recur,
1436                               bool dump_invisible)
1437 {
1438         int ret = 0, q_idx = *q_idx_p;
1439         struct Qdisc *q;
1440         int b;
1441
1442         if (!root)
1443                 return 0;
1444
1445         q = root;
1446         if (q_idx < s_q_idx) {
1447                 q_idx++;
1448         } else {
1449                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1450                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1451                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1452                                   RTM_NEWQDISC) <= 0)
1453                         goto done;
1454                 q_idx++;
1455         }
1456
1457         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1458          * itself has already been dumped.
1459          *
1460          * If we've already dumped the top-level (ingress) qdisc above and the global
1461          * qdisc hashtable, we don't want to hit it again
1462          */
1463         if (!qdisc_dev(root) || !recur)
1464                 goto out;
1465
1466         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1467                 if (q_idx < s_q_idx) {
1468                         q_idx++;
1469                         continue;
1470                 }
1471                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1472                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1473                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1474                                   RTM_NEWQDISC) <= 0)
1475                         goto done;
1476                 q_idx++;
1477         }
1478
1479 out:
1480         *q_idx_p = q_idx;
1481         return ret;
1482 done:
1483         ret = -1;
1484         goto out;
1485 }
1486
1487 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1488 {
1489         struct net *net = sock_net(skb->sk);
1490         int idx, q_idx;
1491         int s_idx, s_q_idx;
1492         struct net_device *dev;
1493         const struct nlmsghdr *nlh = cb->nlh;
1494         struct nlattr *tca[TCA_MAX + 1];
1495         int err;
1496
1497         s_idx = cb->args[0];
1498         s_q_idx = q_idx = cb->args[1];
1499
1500         idx = 0;
1501         ASSERT_RTNL();
1502
1503         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1504         if (err < 0)
1505                 return err;
1506
1507         for_each_netdev(net, dev) {
1508                 struct netdev_queue *dev_queue;
1509
1510                 if (idx < s_idx)
1511                         goto cont;
1512                 if (idx > s_idx)
1513                         s_q_idx = 0;
1514                 q_idx = 0;
1515
1516                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1517                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1518                         goto done;
1519
1520                 dev_queue = dev_ingress_queue(dev);
1521                 if (dev_queue &&
1522                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1523                                        &q_idx, s_q_idx, false,
1524                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1525                         goto done;
1526
1527 cont:
1528                 idx++;
1529         }
1530
1531 done:
1532         cb->args[0] = idx;
1533         cb->args[1] = q_idx;
1534
1535         return skb->len;
1536 }
1537
1538
1539
1540 /************************************************
1541  *      Traffic classes manipulation.           *
1542  ************************************************/
1543
1544 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1545                           unsigned long cl,
1546                           u32 portid, u32 seq, u16 flags, int event)
1547 {
1548         struct tcmsg *tcm;
1549         struct nlmsghdr  *nlh;
1550         unsigned char *b = skb_tail_pointer(skb);
1551         struct gnet_dump d;
1552         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1553
1554         cond_resched();
1555         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1556         if (!nlh)
1557                 goto out_nlmsg_trim;
1558         tcm = nlmsg_data(nlh);
1559         tcm->tcm_family = AF_UNSPEC;
1560         tcm->tcm__pad1 = 0;
1561         tcm->tcm__pad2 = 0;
1562         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1563         tcm->tcm_parent = q->handle;
1564         tcm->tcm_handle = q->handle;
1565         tcm->tcm_info = 0;
1566         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1567                 goto nla_put_failure;
1568         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1569                 goto nla_put_failure;
1570
1571         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1572                                          NULL, &d, TCA_PAD) < 0)
1573                 goto nla_put_failure;
1574
1575         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1576                 goto nla_put_failure;
1577
1578         if (gnet_stats_finish_copy(&d) < 0)
1579                 goto nla_put_failure;
1580
1581         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1582         return skb->len;
1583
1584 out_nlmsg_trim:
1585 nla_put_failure:
1586         nlmsg_trim(skb, b);
1587         return -1;
1588 }
1589
1590 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1591                          struct nlmsghdr *n, struct Qdisc *q,
1592                          unsigned long cl, int event)
1593 {
1594         struct sk_buff *skb;
1595         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1596
1597         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1598         if (!skb)
1599                 return -ENOBUFS;
1600
1601         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1602                 kfree_skb(skb);
1603                 return -EINVAL;
1604         }
1605
1606         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1607                               n->nlmsg_flags & NLM_F_ECHO);
1608 }
1609
1610 static int tclass_del_notify(struct net *net,
1611                              const struct Qdisc_class_ops *cops,
1612                              struct sk_buff *oskb, struct nlmsghdr *n,
1613                              struct Qdisc *q, unsigned long cl)
1614 {
1615         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1616         struct sk_buff *skb;
1617         int err = 0;
1618
1619         if (!cops->delete)
1620                 return -EOPNOTSUPP;
1621
1622         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1623         if (!skb)
1624                 return -ENOBUFS;
1625
1626         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1627                            RTM_DELTCLASS) < 0) {
1628                 kfree_skb(skb);
1629                 return -EINVAL;
1630         }
1631
1632         err = cops->delete(q, cl);
1633         if (err) {
1634                 kfree_skb(skb);
1635                 return err;
1636         }
1637
1638         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1639                               n->nlmsg_flags & NLM_F_ECHO);
1640 }
1641
1642 #ifdef CONFIG_NET_CLS
1643
1644 struct tcf_bind_args {
1645         struct tcf_walker w;
1646         u32 classid;
1647         unsigned long cl;
1648 };
1649
1650 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1651 {
1652         struct tcf_bind_args *a = (void *)arg;
1653
1654         if (tp->ops->bind_class) {
1655                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1656
1657                 sch_tree_lock(q);
1658                 tp->ops->bind_class(n, a->classid, a->cl);
1659                 sch_tree_unlock(q);
1660         }
1661         return 0;
1662 }
1663
1664 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1665                            unsigned long new_cl)
1666 {
1667         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1668         struct tcf_block *block;
1669         struct tcf_chain *chain;
1670         unsigned long cl;
1671
1672         cl = cops->find(q, portid);
1673         if (!cl)
1674                 return;
1675         block = cops->tcf_block(q, cl);
1676         if (!block)
1677                 return;
1678         list_for_each_entry(chain, &block->chain_list, list) {
1679                 struct tcf_proto *tp;
1680
1681                 for (tp = rtnl_dereference(chain->filter_chain);
1682                      tp; tp = rtnl_dereference(tp->next)) {
1683                         struct tcf_bind_args arg = {};
1684
1685                         arg.w.fn = tcf_node_bind;
1686                         arg.classid = clid;
1687                         arg.cl = new_cl;
1688                         tp->ops->walk(tp, &arg.w);
1689                 }
1690         }
1691 }
1692
1693 #else
1694
1695 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1696                            unsigned long new_cl)
1697 {
1698 }
1699
1700 #endif
1701
1702 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1703                          struct netlink_ext_ack *extack)
1704 {
1705         struct net *net = sock_net(skb->sk);
1706         struct tcmsg *tcm = nlmsg_data(n);
1707         struct nlattr *tca[TCA_MAX + 1];
1708         struct net_device *dev;
1709         struct Qdisc *q = NULL;
1710         const struct Qdisc_class_ops *cops;
1711         unsigned long cl = 0;
1712         unsigned long new_cl;
1713         u32 portid;
1714         u32 clid;
1715         u32 qid;
1716         int err;
1717
1718         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1719             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1720                 return -EPERM;
1721
1722         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1723         if (err < 0)
1724                 return err;
1725
1726         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1727         if (!dev)
1728                 return -ENODEV;
1729
1730         /*
1731            parent == TC_H_UNSPEC - unspecified parent.
1732            parent == TC_H_ROOT   - class is root, which has no parent.
1733            parent == X:0         - parent is root class.
1734            parent == X:Y         - parent is a node in hierarchy.
1735            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1736
1737            handle == 0:0         - generate handle from kernel pool.
1738            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1739            handle == X:Y         - clear.
1740            handle == X:0         - root class.
1741          */
1742
1743         /* Step 1. Determine qdisc handle X:0 */
1744
1745         portid = tcm->tcm_parent;
1746         clid = tcm->tcm_handle;
1747         qid = TC_H_MAJ(clid);
1748
1749         if (portid != TC_H_ROOT) {
1750                 u32 qid1 = TC_H_MAJ(portid);
1751
1752                 if (qid && qid1) {
1753                         /* If both majors are known, they must be identical. */
1754                         if (qid != qid1)
1755                                 return -EINVAL;
1756                 } else if (qid1) {
1757                         qid = qid1;
1758                 } else if (qid == 0)
1759                         qid = dev->qdisc->handle;
1760
1761                 /* Now qid is genuine qdisc handle consistent
1762                  * both with parent and child.
1763                  *
1764                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1765                  */
1766                 if (portid)
1767                         portid = TC_H_MAKE(qid, portid);
1768         } else {
1769                 if (qid == 0)
1770                         qid = dev->qdisc->handle;
1771         }
1772
1773         /* OK. Locate qdisc */
1774         q = qdisc_lookup(dev, qid);
1775         if (!q)
1776                 return -ENOENT;
1777
1778         /* An check that it supports classes */
1779         cops = q->ops->cl_ops;
1780         if (cops == NULL)
1781                 return -EINVAL;
1782
1783         /* Now try to get class */
1784         if (clid == 0) {
1785                 if (portid == TC_H_ROOT)
1786                         clid = qid;
1787         } else
1788                 clid = TC_H_MAKE(qid, clid);
1789
1790         if (clid)
1791                 cl = cops->find(q, clid);
1792
1793         if (cl == 0) {
1794                 err = -ENOENT;
1795                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1796                     !(n->nlmsg_flags & NLM_F_CREATE))
1797                         goto out;
1798         } else {
1799                 switch (n->nlmsg_type) {
1800                 case RTM_NEWTCLASS:
1801                         err = -EEXIST;
1802                         if (n->nlmsg_flags & NLM_F_EXCL)
1803                                 goto out;
1804                         break;
1805                 case RTM_DELTCLASS:
1806                         err = tclass_del_notify(net, cops, skb, n, q, cl);
1807                         /* Unbind the class with flilters with 0 */
1808                         tc_bind_tclass(q, portid, clid, 0);
1809                         goto out;
1810                 case RTM_GETTCLASS:
1811                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1812                         goto out;
1813                 default:
1814                         err = -EINVAL;
1815                         goto out;
1816                 }
1817         }
1818
1819         new_cl = cl;
1820         err = -EOPNOTSUPP;
1821         if (cops->change)
1822                 err = cops->change(q, clid, portid, tca, &new_cl);
1823         if (err == 0) {
1824                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1825                 /* We just create a new class, need to do reverse binding. */
1826                 if (cl != new_cl)
1827                         tc_bind_tclass(q, portid, clid, new_cl);
1828         }
1829 out:
1830         return err;
1831 }
1832
1833 struct qdisc_dump_args {
1834         struct qdisc_walker     w;
1835         struct sk_buff          *skb;
1836         struct netlink_callback *cb;
1837 };
1838
1839 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1840                             struct qdisc_walker *arg)
1841 {
1842         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1843
1844         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1845                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1846                               RTM_NEWTCLASS);
1847 }
1848
1849 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1850                                 struct tcmsg *tcm, struct netlink_callback *cb,
1851                                 int *t_p, int s_t)
1852 {
1853         struct qdisc_dump_args arg;
1854
1855         if (tc_qdisc_dump_ignore(q, false) ||
1856             *t_p < s_t || !q->ops->cl_ops ||
1857             (tcm->tcm_parent &&
1858              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1859                 (*t_p)++;
1860                 return 0;
1861         }
1862         if (*t_p > s_t)
1863                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1864         arg.w.fn = qdisc_class_dump;
1865         arg.skb = skb;
1866         arg.cb = cb;
1867         arg.w.stop  = 0;
1868         arg.w.skip = cb->args[1];
1869         arg.w.count = 0;
1870         q->ops->cl_ops->walk(q, &arg.w);
1871         cb->args[1] = arg.w.count;
1872         if (arg.w.stop)
1873                 return -1;
1874         (*t_p)++;
1875         return 0;
1876 }
1877
1878 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1879                                struct tcmsg *tcm, struct netlink_callback *cb,
1880                                int *t_p, int s_t)
1881 {
1882         struct Qdisc *q;
1883         int b;
1884
1885         if (!root)
1886                 return 0;
1887
1888         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1889                 return -1;
1890
1891         if (!qdisc_dev(root))
1892                 return 0;
1893
1894         if (tcm->tcm_parent) {
1895                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
1896                 if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1897                         return -1;
1898                 return 0;
1899         }
1900         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1901                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1902                         return -1;
1903         }
1904
1905         return 0;
1906 }
1907
1908 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1909 {
1910         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1911         struct net *net = sock_net(skb->sk);
1912         struct netdev_queue *dev_queue;
1913         struct net_device *dev;
1914         int t, s_t;
1915
1916         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1917                 return 0;
1918         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1919         if (!dev)
1920                 return 0;
1921
1922         s_t = cb->args[0];
1923         t = 0;
1924
1925         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1926                 goto done;
1927
1928         dev_queue = dev_ingress_queue(dev);
1929         if (dev_queue &&
1930             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1931                                 &t, s_t) < 0)
1932                 goto done;
1933
1934 done:
1935         cb->args[0] = t;
1936
1937         dev_put(dev);
1938         return skb->len;
1939 }
1940
1941 #ifdef CONFIG_PROC_FS
1942 static int psched_show(struct seq_file *seq, void *v)
1943 {
1944         seq_printf(seq, "%08x %08x %08x %08x\n",
1945                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1946                    1000000,
1947                    (u32)NSEC_PER_SEC / hrtimer_resolution);
1948
1949         return 0;
1950 }
1951
1952 static int psched_open(struct inode *inode, struct file *file)
1953 {
1954         return single_open(file, psched_show, NULL);
1955 }
1956
1957 static const struct file_operations psched_fops = {
1958         .owner = THIS_MODULE,
1959         .open = psched_open,
1960         .read  = seq_read,
1961         .llseek = seq_lseek,
1962         .release = single_release,
1963 };
1964
1965 static int __net_init psched_net_init(struct net *net)
1966 {
1967         struct proc_dir_entry *e;
1968
1969         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1970         if (e == NULL)
1971                 return -ENOMEM;
1972
1973         return 0;
1974 }
1975
1976 static void __net_exit psched_net_exit(struct net *net)
1977 {
1978         remove_proc_entry("psched", net->proc_net);
1979 }
1980 #else
1981 static int __net_init psched_net_init(struct net *net)
1982 {
1983         return 0;
1984 }
1985
1986 static void __net_exit psched_net_exit(struct net *net)
1987 {
1988 }
1989 #endif
1990
1991 static struct pernet_operations psched_net_ops = {
1992         .init = psched_net_init,
1993         .exit = psched_net_exit,
1994 };
1995
1996 static int __init pktsched_init(void)
1997 {
1998         int err;
1999
2000         err = register_pernet_subsys(&psched_net_ops);
2001         if (err) {
2002                 pr_err("pktsched_init: "
2003                        "cannot initialize per netns operations\n");
2004                 return err;
2005         }
2006
2007         register_qdisc(&pfifo_fast_ops);
2008         register_qdisc(&pfifo_qdisc_ops);
2009         register_qdisc(&bfifo_qdisc_ops);
2010         register_qdisc(&pfifo_head_drop_qdisc_ops);
2011         register_qdisc(&mq_qdisc_ops);
2012         register_qdisc(&noqueue_qdisc_ops);
2013
2014         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2015         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2016         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2017                       0);
2018         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2019         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2020         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2021                       0);
2022
2023         return 0;
2024 }
2025
2026 subsys_initcall(pktsched_init);