Merge tag 'pci-v5.1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci
[sfrench/cifs-2.6.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38
39 /*
40
41    Short review.
42    -------------
43
44    This file consists of two interrelated parts:
45
46    1. queueing disciplines manager frontend.
47    2. traffic classes manager frontend.
48
49    Generally, queueing discipline ("qdisc") is a black box,
50    which is able to enqueue packets and to dequeue them (when
51    device is ready to send something) in order and at times
52    determined by algorithm hidden in it.
53
   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)
58
59    In turn, classes may have child qdiscs (as rule, queues)
60    attached to them etc. etc. etc.
61
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.
67
68    All real intelligent work is done inside qdisc modules.
69
70
71
72    Every discipline has two major routines: enqueue and dequeue.
73
74    ---dequeue
75
   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue; however, q->q.qlen must still be valid.
82
83    ---enqueue
84
85    enqueue returns 0, if packet was enqueued successfully.
86    If packet (this one or another one) was dropped, it returns
87    not zero error code.
88    NET_XMIT_DROP        - this packet dropped
89      Expected action: do not backoff, but wait until queue will clear.
90    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
91      Expected action: backoff or ignore
92
93    Auxiliary routines:
94
95    ---peek
96
97    like dequeue but without removing a packet from the queue
98
99    ---reset
100
101    returns qdisc to initial state: purge all buffers, clear all
102    timers, counters (except for statistics) etc.
103
104    ---init
105
106    initializes newly created qdisc.
107
108    ---destroy
109
110    destroys resources allocated by init and during lifetime of qdisc.
111
112    ---change
113
114    changes qdisc parameters.
115  */
116
/* Protects the list of registered TC modules (the qdisc_base chain) and
 * updates of default_qdisc_ops. It is a pure SMP read/write lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
119
120
121 /************************************************
122  *      Queueing disciplines manipulation.      *
123  ************************************************/
124
125
/* The list of all installed queueing disciplines: a singly linked chain
 * of Qdisc_ops threaded through ->next, NULL when empty.
 */

static struct Qdisc_ops *qdisc_base;
129
130 /* Register/unregister queueing discipline */
131
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134         struct Qdisc_ops *q, **qp;
135         int rc = -EEXIST;
136
137         write_lock(&qdisc_mod_lock);
138         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139                 if (!strcmp(qops->id, q->id))
140                         goto out;
141
142         if (qops->enqueue == NULL)
143                 qops->enqueue = noop_qdisc_ops.enqueue;
144         if (qops->peek == NULL) {
145                 if (qops->dequeue == NULL)
146                         qops->peek = noop_qdisc_ops.peek;
147                 else
148                         goto out_einval;
149         }
150         if (qops->dequeue == NULL)
151                 qops->dequeue = noop_qdisc_ops.dequeue;
152
153         if (qops->cl_ops) {
154                 const struct Qdisc_class_ops *cops = qops->cl_ops;
155
156                 if (!(cops->find && cops->walk && cops->leaf))
157                         goto out_einval;
158
159                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160                         goto out_einval;
161         }
162
163         qops->next = NULL;
164         *qp = qops;
165         rc = 0;
166 out:
167         write_unlock(&qdisc_mod_lock);
168         return rc;
169
170 out_einval:
171         rc = -EINVAL;
172         goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
175
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178         struct Qdisc_ops *q, **qp;
179         int err = -ENOENT;
180
181         write_lock(&qdisc_mod_lock);
182         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183                 if (q == qops)
184                         break;
185         if (q) {
186                 *qp = q->next;
187                 q->next = NULL;
188                 err = 0;
189         }
190         write_unlock(&qdisc_mod_lock);
191         return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strlcpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel config. */
static int __init sch_default_qdisc(void)
{
        int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

        return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(dev->qdisc, handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(dev->qdisc, handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339
340         if (cops == NULL)
341                 return NULL;
342         cl = cops->find(p, classid);
343
344         if (cl == 0)
345                 return NULL;
346         return cops->leaf(p, cl);
347 }
348
349 /* Find queueing discipline by name */
350
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353         struct Qdisc_ops *q = NULL;
354
355         if (kind) {
356                 read_lock(&qdisc_mod_lock);
357                 for (q = qdisc_base; q; q = q->next) {
358                         if (nla_strcmp(kind, q->id) == 0) {
359                                 if (!try_module_get(q->owner))
360                                         q = NULL;
361                                 break;
362                         }
363                 }
364                 read_unlock(&qdisc_mod_lock);
365         }
366         return q;
367 }
368
369 /* The linklayer setting were not transferred from iproute2, in older
370  * versions, and the rate tables lookup systems have been dropped in
371  * the kernel. To keep backward compatible with older iproute2 tc
372  * utils, we detect the linklayer setting by detecting if the rate
373  * table were modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find low and high table entries for
379  * mapping this cell.  If these entries contain the same value, when
380  * the rate tables have been modified for linklayer ATM.
381  *
382  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
383  * and then roundup to the next cell, calc the table entry one below,
384  * and compare.
385  */
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388         int low       = roundup(r->mpu, 48);
389         int high      = roundup(low+1, 48);
390         int cell_low  = low >> r->cell_log;
391         int cell_high = (high >> r->cell_log) - 1;
392
393         /* rtab is too inaccurate at rates > 100Mbit/s */
394         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395                 pr_debug("TC linklayer: Giving up ATM detection\n");
396                 return TC_LINKLAYER_ETHERNET;
397         }
398
399         if ((cell_high > cell_low) && (cell_high < 256)
400             && (rtab[cell_low] == rtab[cell_high])) {
401                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402                          cell_low, cell_high, rtab[cell_high]);
403                 return TC_LINKLAYER_ATM;
404         }
405         return TC_LINKLAYER_ETHERNET;
406 }
407
/* Head of the singly linked list (via ->next) of shared, reference-counted
 * rate tables handed out by qdisc_get_rtab() and released by
 * qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
409
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411                                         struct nlattr *tab,
412                                         struct netlink_ext_ack *extack)
413 {
414         struct qdisc_rate_table *rtab;
415
416         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
417             nla_len(tab) != TC_RTAB_SIZE) {
418                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419                 return NULL;
420         }
421
422         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
425                         rtab->refcnt++;
426                         return rtab;
427                 }
428         }
429
430         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431         if (rtab) {
432                 rtab->rate = *r;
433                 rtab->refcnt = 1;
434                 memcpy(rtab->data, nla_data(tab), 1024);
435                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
436                         r->linklayer = __detect_linklayer(r, rtab->data);
437                 rtab->next = qdisc_rtab_list;
438                 qdisc_rtab_list = rtab;
439         } else {
440                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441         }
442         return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448         struct qdisc_rate_table *rtab, **rtabp;
449
450         if (!tab || --tab->refcnt)
451                 return;
452
453         for (rtabp = &qdisc_rtab_list;
454              (rtab = *rtabp) != NULL;
455              rtabp = &rtab->next) {
456                 if (rtab == tab) {
457                         *rtabp = rtab->next;
458                         kfree(rtab);
459                         return;
460                 }
461         }
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464
/* List of shared, reference-counted size tables handed out by
 * qdisc_get_stab() and released by qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for the nested size-table attribute parsed in
 * qdisc_get_stab(): TCA_STAB_BASE carries a struct tc_sizespec,
 * TCA_STAB_DATA the binary table payload.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
471
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473                                                struct netlink_ext_ack *extack)
474 {
475         struct nlattr *tb[TCA_STAB_MAX + 1];
476         struct qdisc_size_table *stab;
477         struct tc_sizespec *s;
478         unsigned int tsize = 0;
479         u16 *tab = NULL;
480         int err;
481
482         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
483         if (err < 0)
484                 return ERR_PTR(err);
485         if (!tb[TCA_STAB_BASE]) {
486                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
487                 return ERR_PTR(-EINVAL);
488         }
489
490         s = nla_data(tb[TCA_STAB_BASE]);
491
492         if (s->tsize > 0) {
493                 if (!tb[TCA_STAB_DATA]) {
494                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
495                         return ERR_PTR(-EINVAL);
496                 }
497                 tab = nla_data(tb[TCA_STAB_DATA]);
498                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
499         }
500
501         if (tsize != s->tsize || (!tab && tsize > 0)) {
502                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
503                 return ERR_PTR(-EINVAL);
504         }
505
506         list_for_each_entry(stab, &qdisc_stab_list, list) {
507                 if (memcmp(&stab->szopts, s, sizeof(*s)))
508                         continue;
509                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
510                         continue;
511                 stab->refcnt++;
512                 return stab;
513         }
514
515         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
516         if (!stab)
517                 return ERR_PTR(-ENOMEM);
518
519         stab->refcnt = 1;
520         stab->szopts = *s;
521         if (tsize > 0)
522                 memcpy(stab->data, tab, tsize * sizeof(u16));
523
524         list_add_tail(&stab->list, &qdisc_stab_list);
525
526         return stab;
527 }
528
529 void qdisc_put_stab(struct qdisc_size_table *tab)
530 {
531         if (!tab)
532                 return;
533
534         if (--tab->refcnt == 0) {
535                 list_del(&tab->list);
536                 kfree_rcu(tab, rcu);
537         }
538 }
539 EXPORT_SYMBOL(qdisc_put_stab);
540
541 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
542 {
543         struct nlattr *nest;
544
545         nest = nla_nest_start(skb, TCA_STAB);
546         if (nest == NULL)
547                 goto nla_put_failure;
548         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
549                 goto nla_put_failure;
550         nla_nest_end(skb, nest);
551
552         return skb->len;
553
554 nla_put_failure:
555         return -1;
556 }
557
558 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
559                                const struct qdisc_size_table *stab)
560 {
561         int pkt_len, slot;
562
563         pkt_len = skb->len + stab->szopts.overhead;
564         if (unlikely(!stab->szopts.tsize))
565                 goto out;
566
567         slot = pkt_len + stab->szopts.cell_align;
568         if (unlikely(slot < 0))
569                 slot = 0;
570
571         slot >>= stab->szopts.cell_log;
572         if (likely(slot < stab->szopts.tsize))
573                 pkt_len = stab->data[slot];
574         else
575                 pkt_len = stab->data[stab->szopts.tsize - 1] *
576                                 (slot / stab->szopts.tsize) +
577                                 stab->data[slot % stab->szopts.tsize];
578
579         pkt_len <<= stab->szopts.size_log;
580 out:
581         if (unlikely(pkt_len < 1))
582                 pkt_len = 1;
583         qdisc_skb_cb(skb)->pkt_len = pkt_len;
584 }
585 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
586
587 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
588 {
589         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
590                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
591                         txt, qdisc->ops->id, qdisc->handle >> 16);
592                 qdisc->flags |= TCQ_F_WARN_NONWC;
593         }
594 }
595 EXPORT_SYMBOL(qdisc_warn_nonwc);
596
597 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
598 {
599         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
600                                                  timer);
601
602         rcu_read_lock();
603         __netif_schedule(qdisc_root(wd->qdisc));
604         rcu_read_unlock();
605
606         return HRTIMER_NORESTART;
607 }
608
609 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
610                                  clockid_t clockid)
611 {
612         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
613         wd->timer.function = qdisc_watchdog;
614         wd->qdisc = qdisc;
615 }
616 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
617
618 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
619 {
620         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
621 }
622 EXPORT_SYMBOL(qdisc_watchdog_init);
623
624 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
625 {
626         if (test_bit(__QDISC_STATE_DEACTIVATED,
627                      &qdisc_root_sleeping(wd->qdisc)->state))
628                 return;
629
630         if (wd->last_expires == expires)
631                 return;
632
633         wd->last_expires = expires;
634         hrtimer_start(&wd->timer,
635                       ns_to_ktime(expires),
636                       HRTIMER_MODE_ABS_PINNED);
637 }
638 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
639
640 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
641 {
642         hrtimer_cancel(&wd->timer);
643 }
644 EXPORT_SYMBOL(qdisc_watchdog_cancel);
645
646 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
647 {
648         struct hlist_head *h;
649         unsigned int i;
650
651         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
652
653         if (h != NULL) {
654                 for (i = 0; i < n; i++)
655                         INIT_HLIST_HEAD(&h[i]);
656         }
657         return h;
658 }
659
660 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
661 {
662         struct Qdisc_class_common *cl;
663         struct hlist_node *next;
664         struct hlist_head *nhash, *ohash;
665         unsigned int nsize, nmask, osize;
666         unsigned int i, h;
667
668         /* Rehash when load factor exceeds 0.75 */
669         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
670                 return;
671         nsize = clhash->hashsize * 2;
672         nmask = nsize - 1;
673         nhash = qdisc_class_hash_alloc(nsize);
674         if (nhash == NULL)
675                 return;
676
677         ohash = clhash->hash;
678         osize = clhash->hashsize;
679
680         sch_tree_lock(sch);
681         for (i = 0; i < osize; i++) {
682                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
683                         h = qdisc_class_hash(cl->classid, nmask);
684                         hlist_add_head(&cl->hnode, &nhash[h]);
685                 }
686         }
687         clhash->hash     = nhash;
688         clhash->hashsize = nsize;
689         clhash->hashmask = nmask;
690         sch_tree_unlock(sch);
691
692         kvfree(ohash);
693 }
694 EXPORT_SYMBOL(qdisc_class_hash_grow);
695
696 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
697 {
698         unsigned int size = 4;
699
700         clhash->hash = qdisc_class_hash_alloc(size);
701         if (!clhash->hash)
702                 return -ENOMEM;
703         clhash->hashsize  = size;
704         clhash->hashmask  = size - 1;
705         clhash->hashelems = 0;
706         return 0;
707 }
708 EXPORT_SYMBOL(qdisc_class_hash_init);
709
710 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
711 {
712         kvfree(clhash->hash);
713 }
714 EXPORT_SYMBOL(qdisc_class_hash_destroy);
715
716 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
717                              struct Qdisc_class_common *cl)
718 {
719         unsigned int h;
720
721         INIT_HLIST_NODE(&cl->hnode);
722         h = qdisc_class_hash(cl->classid, clhash->hashmask);
723         hlist_add_head(&cl->hnode, &clhash->hash[h]);
724         clhash->hashelems++;
725 }
726 EXPORT_SYMBOL(qdisc_class_hash_insert);
727
728 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
729                              struct Qdisc_class_common *cl)
730 {
731         hlist_del(&cl->hnode);
732         clhash->hashelems--;
733 }
734 EXPORT_SYMBOL(qdisc_class_hash_remove);
735
736 /* Allocate an unique handle from space managed by kernel
737  * Possible range is [8000-FFFF]:0000 (0x8000 values)
738  */
739 static u32 qdisc_alloc_handle(struct net_device *dev)
740 {
741         int i = 0x8000;
742         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
743
744         do {
745                 autohandle += TC_H_MAKE(0x10000U, 0);
746                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
747                         autohandle = TC_H_MAKE(0x80000000U, 0);
748                 if (!qdisc_lookup(dev, autohandle))
749                         return autohandle;
750                 cond_resched();
751         } while (--i > 0);
752
753         return 0;
754 }
755
756 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
757 {
758         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
759         const struct Qdisc_class_ops *cops;
760         unsigned long cl;
761         u32 parentid;
762         bool notify;
763         int drops;
764
765         if (n == 0 && len == 0)
766                 return;
767         drops = max_t(int, n, 0);
768         rcu_read_lock();
769         while ((parentid = sch->parent)) {
770                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
771                         break;
772
773                 if (sch->flags & TCQ_F_NOPARENT)
774                         break;
775                 /* Notify parent qdisc only if child qdisc becomes empty.
776                  *
777                  * If child was empty even before update then backlog
778                  * counter is screwed and we skip notification because
779                  * parent class is already passive.
780                  *
781                  * If the original child was offloaded then it is allowed
782                  * to be seem as empty, so the parent is notified anyway.
783                  */
784                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
785                                                        !qdisc_is_offloaded);
786                 /* TODO: perform the search on a per txq basis */
787                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
788                 if (sch == NULL) {
789                         WARN_ON_ONCE(parentid != TC_H_ROOT);
790                         break;
791                 }
792                 cops = sch->ops->cl_ops;
793                 if (notify && cops->qlen_notify) {
794                         cl = cops->find(sch, parentid);
795                         cops->qlen_notify(sch, cl);
796                 }
797                 sch->q.qlen -= n;
798                 sch->qstats.backlog -= len;
799                 __qdisc_qstats_drop(sch, drops);
800         }
801         rcu_read_unlock();
802 }
803 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
804
805 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
806                               void *type_data)
807 {
808         struct net_device *dev = qdisc_dev(sch);
809         int err;
810
811         sch->flags &= ~TCQ_F_OFFLOADED;
812         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
813                 return 0;
814
815         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
816         if (err == -EOPNOTSUPP)
817                 return 0;
818
819         if (!err)
820                 sch->flags |= TCQ_F_OFFLOADED;
821
822         return err;
823 }
824 EXPORT_SYMBOL(qdisc_offload_dump_helper);
825
826 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
827                                 struct Qdisc *new, struct Qdisc *old,
828                                 enum tc_setup_type type, void *type_data,
829                                 struct netlink_ext_ack *extack)
830 {
831         bool any_qdisc_is_offloaded;
832         int err;
833
834         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
835                 return;
836
837         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
838
839         /* Don't report error if the graft is part of destroy operation. */
840         if (!err || !new || new == &noop_qdisc)
841                 return;
842
843         /* Don't report error if the parent, the old child and the new
844          * one are not offloaded.
845          */
846         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
847         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
848         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
849
850         if (any_qdisc_is_offloaded)
851                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
852 }
853 EXPORT_SYMBOL(qdisc_offload_graft_helper);
854
855 static void qdisc_offload_graft_root(struct net_device *dev,
856                                      struct Qdisc *new, struct Qdisc *old,
857                                      struct netlink_ext_ack *extack)
858 {
859         struct tc_root_qopt_offload graft_offload = {
860                 .command        = TC_ROOT_GRAFT,
861                 .handle         = new ? new->handle : 0,
862                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
863                                   (old && old->flags & TCQ_F_INGRESS),
864         };
865
866         qdisc_offload_graft_helper(dev, NULL, new, old,
867                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
868 }
869
/*
 * Append one netlink message describing qdisc "q" to "skb".
 *
 * The message is a struct tcmsg header (tcm_parent is taken from "clid",
 * tcm_info carries the value of q->refcnt) followed by TCA_KIND, the
 * ingress/egress block index when the qdisc reports a non-zero one,
 * whatever q->ops->dump() adds, TCA_HW_OFFLOAD, the size table if one
 * is attached, and the statistics.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back
 * to the tail position it had on entry and -1 is returned.
 */
870 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
871                          u32 portid, u32 seq, u16 flags, int event)
872 {
873         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
874         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
875         struct tcmsg *tcm;
876         struct nlmsghdr  *nlh;
877         unsigned char *b = skb_tail_pointer(skb);
878         struct gnet_dump d;
879         struct qdisc_size_table *stab;
880         u32 block_index;
881         __u32 qlen;
882
883         cond_resched();
884         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
885         if (!nlh)
886                 goto out_nlmsg_trim;
887         tcm = nlmsg_data(nlh);
888         tcm->tcm_family = AF_UNSPEC;
889         tcm->tcm__pad1 = 0;
890         tcm->tcm__pad2 = 0;
891         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
892         tcm->tcm_parent = clid;
893         tcm->tcm_handle = q->handle;
894         tcm->tcm_info = refcount_read(&q->refcnt);
895         if (nla_put_string(skb, TCA_KIND, q->ops->id))
896                 goto nla_put_failure;
            /* A block index of 0 is not reported at all. */
897         if (q->ops->ingress_block_get) {
898                 block_index = q->ops->ingress_block_get(q);
899                 if (block_index &&
900                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
901                         goto nla_put_failure;
902         }
903         if (q->ops->egress_block_get) {
904                 block_index = q->ops->egress_block_get(q);
905                 if (block_index &&
906                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
907                         goto nla_put_failure;
908         }
909         if (q->ops->dump && q->ops->dump(q, skb) < 0)
910                 goto nla_put_failure;
911         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
912                 goto nla_put_failure;
913         qlen = qdisc_qlen_sum(q);
914
915         stab = rtnl_dereference(q->stab);
916         if (stab && qdisc_dump_stab(skb, stab) < 0)
917                 goto nla_put_failure;
918
919         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
920                                          NULL, &d, TCA_PAD) < 0)
921                 goto nla_put_failure;
922
923         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
924                 goto nla_put_failure;
925
            /* The per-cpu counters are handed on only when the qdisc keeps
             * them; otherwise both pointers stay NULL.
             */
926         if (qdisc_is_percpu_stats(q)) {
927                 cpu_bstats = q->cpu_bstats;
928                 cpu_qstats = q->cpu_qstats;
929         }
930
931         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
932                                   &d, cpu_bstats, &q->bstats) < 0 ||
933             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
934             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
935                 goto nla_put_failure;
936
937         if (gnet_stats_finish_copy(&d) < 0)
938                 goto nla_put_failure;
939
            /* The message length is everything written since entry. */
940         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
941         return skb->len;
942
            /* Both labels discard the partially written message. */
943 out_nlmsg_trim:
944 nla_put_failure:
945         nlmsg_trim(skb, b);
946         return -1;
947 }
948
949 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
950 {
951         if (q->flags & TCQ_F_BUILTIN)
952                 return true;
953         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
954                 return true;
955
956         return false;
957 }
958
959 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
960                         struct nlmsghdr *n, u32 clid,
961                         struct Qdisc *old, struct Qdisc *new)
962 {
963         struct sk_buff *skb;
964         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
965
966         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
967         if (!skb)
968                 return -ENOBUFS;
969
970         if (old && !tc_qdisc_dump_ignore(old, false)) {
971                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
972                                   0, RTM_DELQDISC) < 0)
973                         goto err_out;
974         }
975         if (new && !tc_qdisc_dump_ignore(new, false)) {
976                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
977                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
978                         goto err_out;
979         }
980
981         if (skb->len)
982                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
983                                       n->nlmsg_flags & NLM_F_ECHO);
984
985 err_out:
986         kfree_skb(skb);
987         return -EINVAL;
988 }
989
990 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
991                                struct nlmsghdr *n, u32 clid,
992                                struct Qdisc *old, struct Qdisc *new)
993 {
994         if (new || old)
995                 qdisc_notify(net, skb, n, clid, old, new);
996
997         if (old)
998                 qdisc_put(old);
999 }
1000
1001 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1002  * to device "dev".
1003  *
1004  * When appropriate send a netlink notification using 'skb'
1005  * and "n".
1006  *
1007  * On success, destroy old qdisc.
      *
      * "new" may be NULL, in which case the old qdisc is only removed
      * (the device root then falls back to noop_qdisc).
      *
      * Returns 0 on success.  Errors returned from here: -ENOENT when an
      * ingress graft is requested on a device without ingress queue or
      * when the class is not found, -EOPNOTSUPP when the parent has no
      * class ops or no graft op, or whatever the parent's graft op
      * returns.
1008  */
1009
1010 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1011                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1012                        struct Qdisc *new, struct Qdisc *old,
1013                        struct netlink_ext_ack *extack)
1014 {
1015         struct Qdisc *q = old;
1016         struct net *net = dev_net(dev);
1017
             /* No parent: the qdisc is grafted directly onto the device. */
1018         if (parent == NULL) {
1019                 unsigned int i, num_q, ingress;
1020
1021                 ingress = 0;
1022                 num_q = dev->num_tx_queues;
                     /* An ingress graft touches only the single ingress
                      * queue instead of all tx queues.
                      */
1023                 if ((q && q->flags & TCQ_F_INGRESS) ||
1024                     (new && new->flags & TCQ_F_INGRESS)) {
1025                         num_q = 1;
1026                         ingress = 1;
1027                         if (!dev_ingress_queue(dev)) {
1028                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1029                                 return -ENOENT;
1030                         }
1031                 }
1032
                     /* A running device is deactivated for the duration of
                      * the graft and reactivated at the end of this branch.
                      */
1033                 if (dev->flags & IFF_UP)
1034                         dev_deactivate(dev);
1035
1036                 qdisc_offload_graft_root(dev, new, old, extack);
1037
                     /* Qdiscs providing ->attach() are not grafted onto the
                      * queues by the loop below; ->attach() is called later.
                      */
1038                 if (new && new->ops->attach)
1039                         goto skip;
1040
1041                 for (i = 0; i < num_q; i++) {
1042                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1043
1044                         if (!ingress)
1045                                 dev_queue = netdev_get_tx_queue(dev, i);
1046
1047                         old = dev_graft_qdisc(dev_queue, new);
                             /* One extra reference on "new" for every queue
                              * after the first one.
                              */
1048                         if (new && i > 0)
1049                                 qdisc_refcount_inc(new);
1050
1051                         if (!ingress)
1052                                 qdisc_put(old);
1053                 }
1054
1055 skip:
1056                 if (!ingress) {
                             /* The qdisc reported and released here is the
                              * current dev->qdisc, not the "old" argument.
                              */
1057                         notify_and_destroy(net, skb, n, classid,
1058                                            dev->qdisc, new);
1059                         if (new && !new->ops->attach)
1060                                 qdisc_refcount_inc(new);
1061                         dev->qdisc = new ? : &noop_qdisc;
1062
1063                         if (new && new->ops->attach)
1064                                 new->ops->attach(new);
1065                 } else {
1066                         notify_and_destroy(net, skb, n, classid, old, new);
1067                 }
1068
1069                 if (dev->flags & IFF_UP)
1070                         dev_activate(dev);
1071         } else {
                     /* Graft below a class of "parent" via its class ops. */
1072                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1073                 unsigned long cl;
1074                 int err;
1075
1076                 /* Only support running class lockless if parent is lockless */
1077                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1078                     parent && !(parent->flags & TCQ_F_NOLOCK))
1079                         new->flags &= ~TCQ_F_NOLOCK;
1080
1081                 if (!cops || !cops->graft)
1082                         return -EOPNOTSUPP;
1083
1084                 cl = cops->find(parent, classid);
1085                 if (!cl) {
1086                         NL_SET_ERR_MSG(extack, "Specified class not found");
1087                         return -ENOENT;
1088                 }
1089
                     /* ->graft() hands back the previous child through &old. */
1090                 err = cops->graft(parent, cl, new, &old, extack);
1091                 if (err)
1092                         return err;
1093                 notify_and_destroy(net, skb, n, classid, old, new);
1094         }
1095         return 0;
1096 }
1097
1098 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1099                                    struct netlink_ext_ack *extack)
1100 {
1101         u32 block_index;
1102
1103         if (tca[TCA_INGRESS_BLOCK]) {
1104                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1105
1106                 if (!block_index) {
1107                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1108                         return -EINVAL;
1109                 }
1110                 if (!sch->ops->ingress_block_set) {
1111                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1112                         return -EOPNOTSUPP;
1113                 }
1114                 sch->ops->ingress_block_set(sch, block_index);
1115         }
1116         if (tca[TCA_EGRESS_BLOCK]) {
1117                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1118
1119                 if (!block_index) {
1120                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1121                         return -EINVAL;
1122                 }
1123                 if (!sch->ops->egress_block_set) {
1124                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1125                         return -EOPNOTSUPP;
1126                 }
1127                 sch->ops->egress_block_set(sch, block_index);
1128         }
1129         return 0;
1130 }
1131
1132 /*
1133    Allocate and initialize new qdisc.
1134
1135    Parameters are passed via opt.

        The qdisc kind is taken from tca[TCA_KIND].  "handle" is either
        TC_H_INGRESS (the qdisc is then flagged TCQ_F_INGRESS), 0 (a handle
        is allocated here) or the handle to use.  "p" is only consulted when
        choosing the seqcount for a rate estimator.

        Returns the new qdisc, already added to the qdisc hash, or NULL with
        the error code stored in *errp.  -EAGAIN in *errp means a module
        providing the kind was loaded with RTNL dropped and the caller has
        to replay the request.
1136  */
1137
1138 static struct Qdisc *qdisc_create(struct net_device *dev,
1139                                   struct netdev_queue *dev_queue,
1140                                   struct Qdisc *p, u32 parent, u32 handle,
1141                                   struct nlattr **tca, int *errp,
1142                                   struct netlink_ext_ack *extack)
1143 {
1144         int err;
1145         struct nlattr *kind = tca[TCA_KIND];
1146         struct Qdisc *sch;
1147         struct Qdisc_ops *ops;
1148         struct qdisc_size_table *stab;
1149
1150         ops = qdisc_lookup_ops(kind);
1151 #ifdef CONFIG_MODULES
1152         if (ops == NULL && kind != NULL) {
1153                 char name[IFNAMSIZ];
1154                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1155                         /* We dropped the RTNL semaphore in order to
1156                          * perform the module load.  So, even if we
1157                          * succeeded in loading the module we have to
1158                          * tell the caller to replay the request.  We
1159                          * indicate this using -EAGAIN.
1160                          * We replay the request because the device may
1161                          * go away in the mean time.
1162                          */
1163                         rtnl_unlock();
1164                         request_module("sch_%s", name);
1165                         rtnl_lock();
1166                         ops = qdisc_lookup_ops(kind);
1167                         if (ops != NULL) {
1168                                 /* We will try again qdisc_lookup_ops,
1169                                  * so don't keep a reference.
1170                                  */
1171                                 module_put(ops->owner);
1172                                 err = -EAGAIN;
1173                                 goto err_out;
1174                         }
1175                 }
1176         }
1177 #endif
1178
1179         err = -ENOENT;
1180         if (!ops) {
1181                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1182                 goto err_out;
1183         }
1184
1185         sch = qdisc_alloc(dev_queue, ops, extack);
1186         if (IS_ERR(sch)) {
1187                 err = PTR_ERR(sch);
1188                 goto err_out2;
1189         }
1190
1191         sch->parent = parent;
1192
1193         if (handle == TC_H_INGRESS) {
1194                 sch->flags |= TCQ_F_INGRESS;
1195                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1196         } else {
1197                 if (handle == 0) {
1198                         handle = qdisc_alloc_handle(dev);
1199                         if (handle == 0) {
1200                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1201                                 err = -ENOSPC;
1202                                 goto err_out3;
1203                         }
1204                 }
1205                 if (!netif_is_multiqueue(dev))
1206                         sch->flags |= TCQ_F_ONETXQUEUE;
1207         }
1208
1209         sch->handle = handle;
1210
1211         /* This exists to stay backward compatible with a userspace
1212          * loophole that allowed userspace to get the IFF_NO_QUEUE
1213          * facility on older kernels by setting tx_queue_len=0 (prior
1214          * to qdisc init) and then forgetting to reinit tx_queue_len
1215          * before attaching a qdisc again.
1216          */
1217         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1218                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1219                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1220         }
1221
1222         err = qdisc_block_indexes_set(sch, tca, extack);
1223         if (err)
1224                 goto err_out3;
1225
1226         if (ops->init) {
1227                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1228                 if (err != 0)
1229                         goto err_out5;
1230         }
1231
1232         if (tca[TCA_STAB]) {
1233                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1234                 if (IS_ERR(stab)) {
1235                         err = PTR_ERR(stab);
1236                         goto err_out4;
1237                 }
1238                 rcu_assign_pointer(sch->stab, stab);
1239         }
1240         if (tca[TCA_RATE]) {
1241                 seqcount_t *running;
1242
1243                 err = -EOPNOTSUPP;
1244                 if (sch->flags & TCQ_F_MQROOT) {
1245                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1246                         goto err_out4;
1247                 }
1248
                     /* Use the seqcount returned by
                      * qdisc_root_sleeping_running() unless this qdisc is
                      * a root, an ingress qdisc or a child of a
                      * TCQ_F_MQROOT parent; those use their own.
                      */
1249                 if (sch->parent != TC_H_ROOT &&
1250                     !(sch->flags & TCQ_F_INGRESS) &&
1251                     (!p || !(p->flags & TCQ_F_MQROOT)))
1252                         running = qdisc_root_sleeping_running(sch);
1253                 else
1254                         running = &sch->running;
1255
1256                 err = gen_new_estimator(&sch->bstats,
1257                                         sch->cpu_bstats,
1258                                         &sch->rate_est,
1259                                         NULL,
1260                                         running,
1261                                         tca[TCA_RATE]);
1262                 if (err) {
1263                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1264                         goto err_out4;
1265                 }
1266         }
1267
1268         qdisc_hash_add(sch, false);
1269
1270         return sch;
1271
             /* Error unwinding.  The labels fall through in this order:
              * err_out5 -> err_out3 -> err_out2 -> err_out; err_out4 sits
              * after the return and jumps back to err_out3.
              */
1272 err_out5:
1273         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1274         if (ops->destroy)
1275                 ops->destroy(sch);
1276 err_out3:
             /* NOTE(review): dev_put() presumably balances a reference taken
              * in qdisc_alloc(), which is not visible here - confirm.
              */
1277         dev_put(dev);
1278         qdisc_free(sch);
1279 err_out2:
1280         module_put(ops->owner);
1281 err_out:
1282         *errp = err;
1283         return NULL;
1284
1285 err_out4:
1286         /*
1287          * Any broken qdiscs that would require a ops->reset() here?
1288          * The qdisc was never in action so it shouldn't be necessary.
1289          */
1290         qdisc_put_stab(rtnl_dereference(sch->stab));
1291         if (ops->destroy)
1292                 ops->destroy(sch);
1293         goto err_out3;
1294 }
1295
1296 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1297                         struct netlink_ext_ack *extack)
1298 {
1299         struct qdisc_size_table *ostab, *stab = NULL;
1300         int err = 0;
1301
1302         if (tca[TCA_OPTIONS]) {
1303                 if (!sch->ops->change) {
1304                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1305                         return -EINVAL;
1306                 }
1307                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1308                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1309                         return -EOPNOTSUPP;
1310                 }
1311                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1312                 if (err)
1313                         return err;
1314         }
1315
1316         if (tca[TCA_STAB]) {
1317                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1318                 if (IS_ERR(stab))
1319                         return PTR_ERR(stab);
1320         }
1321
1322         ostab = rtnl_dereference(sch->stab);
1323         rcu_assign_pointer(sch->stab, stab);
1324         qdisc_put_stab(ostab);
1325
1326         if (tca[TCA_RATE]) {
1327                 /* NB: ignores errors from replace_estimator
1328                    because change can't be undone. */
1329                 if (sch->flags & TCQ_F_MQROOT)
1330                         goto out;
1331                 gen_replace_estimator(&sch->bstats,
1332                                       sch->cpu_bstats,
1333                                       &sch->rate_est,
1334                                       NULL,
1335                                       qdisc_root_sleeping_running(sch),
1336                                       tca[TCA_RATE]);
1337         }
1338 out:
1339         return 0;
1340 }
1341
/*
 * Walker state for check_loop().  "w" has to stay the first member:
 * check_loop_fn() casts the struct qdisc_walker pointer it receives
 * back to a struct check_loop_arg pointer.
 */
1342 struct check_loop_arg {
1343         struct qdisc_walker     w;
             /* qdisc that must not be found among the leaves */
1344         struct Qdisc            *p;
             /* current nesting level of the check */
1345         int                     depth;
1346 };
1347
1348 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1349                          struct qdisc_walker *w);
1350
1351 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1352 {
1353         struct check_loop_arg   arg;
1354
1355         if (q->ops->cl_ops == NULL)
1356                 return 0;
1357
1358         arg.w.stop = arg.w.skip = arg.w.count = 0;
1359         arg.w.fn = check_loop_fn;
1360         arg.depth = depth;
1361         arg.p = p;
1362         q->ops->cl_ops->walk(q, &arg.w);
1363         return arg.w.stop ? -ELOOP : 0;
1364 }
1365
1366 static int
1367 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1368 {
1369         struct Qdisc *leaf;
1370         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1371         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1372
1373         leaf = cops->leaf(q, cl);
1374         if (leaf) {
1375                 if (leaf == arg->p || arg->depth > 7)
1376                         return -ELOOP;
1377                 return check_loop(leaf, arg->p, arg->depth + 1);
1378         }
1379         return 0;
1380 }
1381
/*
 * Netlink attribute policy for the TCA_* attributes.  It is handed to
 * nlmsg_parse() by the qdisc get/delete, modify and dump handlers in
 * this file.  Attributes without an entry here carry no type
 * constraint in this table.
 */
1382 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1383         [TCA_KIND]              = { .type = NLA_STRING },
1384         [TCA_RATE]              = { .type = NLA_BINARY,
1385                                     .len = sizeof(struct tc_estimator) },
1386         [TCA_STAB]              = { .type = NLA_NESTED },
1387         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1388         [TCA_CHAIN]             = { .type = NLA_U32 },
1389         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1390         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1391 };
1392
1393 /*
1394  * Delete/get qdisc.
1395  */
1396
/*
 * Handle a request to look up or delete a qdisc.
 *
 * Any message type other than RTM_GETQDISC requires CAP_NET_ADMIN in the
 * user namespace of the socket's network namespace.  The qdisc is
 * located through tcm_parent when that is non-zero (root of the device,
 * the ingress queue, or the leaf of a class) and through tcm_handle
 * otherwise.  If TCA_KIND is supplied it has to match the qdisc found.
 *
 * For RTM_DELQDISC the qdisc is removed by grafting NULL in its place;
 * for other types a notification describing the qdisc is sent and its
 * result is not checked.
 *
 * Returns 0 on success or a negative error code.
 */
1397 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1398                         struct netlink_ext_ack *extack)
1399 {
1400         struct net *net = sock_net(skb->sk);
1401         struct tcmsg *tcm = nlmsg_data(n);
1402         struct nlattr *tca[TCA_MAX + 1];
1403         struct net_device *dev;
1404         u32 clid;
1405         struct Qdisc *q = NULL;
1406         struct Qdisc *p = NULL;
1407         int err;
1408
1409         if ((n->nlmsg_type != RTM_GETQDISC) &&
1410             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1411                 return -EPERM;
1412
1413         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1414                           extack);
1415         if (err < 0)
1416                 return err;
1417
1418         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1419         if (!dev)
1420                 return -ENODEV;
1421
1422         clid = tcm->tcm_parent;
1423         if (clid) {
                     /* Lookup by parent: class leaf, ingress queue or
                      * device root.
                      */
1424                 if (clid != TC_H_ROOT) {
1425                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1426                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1427                                 if (!p) {
1428                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1429                                         return -ENOENT;
1430                                 }
1431                                 q = qdisc_leaf(p, clid);
1432                         } else if (dev_ingress_queue(dev)) {
1433                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1434                         }
1435                 } else {
1436                         q = dev->qdisc;
1437                 }
1438                 if (!q) {
1439                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1440                         return -ENOENT;
1441                 }
1442
                     /* A handle, when given, must agree with the qdisc
                      * found through the parent.
                      */
1443                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1444                         NL_SET_ERR_MSG(extack, "Invalid handle");
1445                         return -EINVAL;
1446                 }
1447         } else {
                     /* No parent given: lookup by handle only. */
1448                 q = qdisc_lookup(dev, tcm->tcm_handle);
1449                 if (!q) {
1450                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1451                         return -ENOENT;
1452                 }
1453         }
1454
1455         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1456                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1457                 return -EINVAL;
1458         }
1459
1460         if (n->nlmsg_type == RTM_DELQDISC) {
1461                 if (!clid) {
1462                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1463                         return -EINVAL;
1464                 }
1465                 if (q->handle == 0) {
1466                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1467                         return -ENOENT;
1468                 }
                     /* Deleting means grafting "no qdisc" over q. */
1469                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1470                 if (err != 0)
1471                         return err;
1472         } else {
1473                 qdisc_notify(net, skb, n, clid, NULL, q);
1474         }
1475         return 0;
1476 }
1477
1478 /*
1479  * Create/change qdisc.
1480  */
1481
/*
 * Handle a request to create, replace or change a qdisc.
 *
 * Requires CAP_NET_ADMIN.  Depending on tcm_parent, tcm_handle and the
 * NLM_F_CREATE / NLM_F_REPLACE / NLM_F_EXCL flags of the request, one of
 * three things happens:
 *   - the parameters of an existing qdisc are changed (qdisc_change()),
 *   - an existing qdisc, found by handle, is grafted at the requested
 *     parent ("graft" label),
 *   - a new qdisc is created and grafted ("create_n_graft" label).
 *
 * When qdisc_create() reports -EAGAIN the whole request is processed
 * again from the "replay" label.
 *
 * Returns 0 on success or a negative error code.
 */
1482 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1483                            struct netlink_ext_ack *extack)
1484 {
1485         struct net *net = sock_net(skb->sk);
1486         struct tcmsg *tcm;
1487         struct nlattr *tca[TCA_MAX + 1];
1488         struct net_device *dev;
1489         u32 clid;
1490         struct Qdisc *q, *p;
1491         int err;
1492
1493         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1494                 return -EPERM;
1495
1496 replay:
1497         /* Reinit, just in case something touches this. */
1498         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1499                           extack);
1500         if (err < 0)
1501                 return err;
1502
1503         tcm = nlmsg_data(n);
1504         clid = tcm->tcm_parent;
1505         q = p = NULL;
1506
1507         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1508         if (!dev)
1509                 return -ENODEV;
1510
1511
1512         if (clid) {
                     /* Find the qdisc currently attached at the requested
                      * parent: class leaf, ingress queue or device root.
                      */
1513                 if (clid != TC_H_ROOT) {
1514                         if (clid != TC_H_INGRESS) {
1515                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1516                                 if (!p) {
1517                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1518                                         return -ENOENT;
1519                                 }
1520                                 q = qdisc_leaf(p, clid);
1521                         } else if (dev_ingress_queue_create(dev)) {
1522                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1523                         }
1524                 } else {
1525                         q = dev->qdisc;
1526                 }
1527
1528                 /* It may be default qdisc, ignore it */
1529                 if (q && q->handle == 0)
1530                         q = NULL;
1531
                     /* Reached unless the request names exactly the qdisc
                      * that already sits at this parent.
                      */
1532                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1533                         if (tcm->tcm_handle) {
1534                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1535                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1536                                         return -EEXIST;
1537                                 }
1538                                 if (TC_H_MIN(tcm->tcm_handle)) {
1539                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1540                                         return -EINVAL;
1541                                 }
1542                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1543                                 if (!q)
1544                                         goto create_n_graft;
1545                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1546                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1547                                         return -EEXIST;
1548                                 }
1549                                 if (tca[TCA_KIND] &&
1550                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1551                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1552                                         return -EINVAL;
1553                                 }
1554                                 if (q == p ||
1555                                     (p && check_loop(q, p, 0))) {
1556                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1557                                         return -ELOOP;
1558                                 }
                                     /* The existing qdisc gets grafted a
                                      * second time, so take a reference.
                                      */
1559                                 qdisc_refcount_inc(q);
1560                                 goto graft;
1561                         } else {
1562                                 if (!q)
1563                                         goto create_n_graft;
1564
1565                                 /* This magic test requires explanation.
1566                                  *
1567                                  *   We know, that some child q is already
1568                                  *   attached to this parent and have choice:
1569                                  *   either to change it or to create/graft new one.
1570                                  *
1571                                  *   1. We are allowed to create/graft only
1572                                  *   if CREATE and REPLACE flags are set.
1573                                  *
1574                                  *   2. If EXCL is set, requestor wanted to say,
1575                                  *   that qdisc tcm_handle is not expected
1576                                  *   to exist, so that we choose create/graft too.
1577                                  *
1578                                  *   3. The last case is when no flags are set.
1579                                  *   Alas, it is sort of hole in API, we
1580                                  *   cannot decide what to do unambiguously.
1581                                  *   For now we select create/graft, if
1582                                  *   user gave KIND, which does not match existing.
1583                                  */
1584                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1585                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1586                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1587                                      (tca[TCA_KIND] &&
1588                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1589                                         goto create_n_graft;
1590                         }
1591                 }
1592         } else {
                     /* No parent given: the qdisc must be named by handle. */
1593                 if (!tcm->tcm_handle) {
1594                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1595                         return -EINVAL;
1596                 }
1597                 q = qdisc_lookup(dev, tcm->tcm_handle);
1598         }
1599
1600         /* Change qdisc parameters */
1601         if (!q) {
1602                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1603                 return -ENOENT;
1604         }
1605         if (n->nlmsg_flags & NLM_F_EXCL) {
1606                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1607                 return -EEXIST;
1608         }
1609         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1610                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1611                 return -EINVAL;
1612         }
1613         err = qdisc_change(q, tca, extack);
1614         if (err == 0)
1615                 qdisc_notify(net, skb, n, clid, NULL, q);
1616         return err;
1617
1618 create_n_graft:
1619         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1620                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1621                 return -ENOENT;
1622         }
1623         if (clid == TC_H_INGRESS) {
                     /* For ingress tcm_parent is passed as the handle too. */
1624                 if (dev_ingress_queue(dev)) {
1625                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1626                                          tcm->tcm_parent, tcm->tcm_parent,
1627                                          tca, &err, extack);
1628                 } else {
1629                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1630                         err = -ENOENT;
1631                 }
1632         } else {
1633                 struct netdev_queue *dev_queue;
1634
                     /* Queue choice: the parent's select_queue() op if it
                      * has one, else the parent's own queue, else tx
                      * queue 0 of the device.
                      */
1635                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1636                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1637                 else if (p)
1638                         dev_queue = p->dev_queue;
1639                 else
1640                         dev_queue = netdev_get_tx_queue(dev, 0);
1641
1642                 q = qdisc_create(dev, dev_queue, p,
1643                                  tcm->tcm_parent, tcm->tcm_handle,
1644                                  tca, &err, extack);
1645         }
1646         if (q == NULL) {
1647                 if (err == -EAGAIN)
1648                         goto replay;
1649                 return err;
1650         }
1651
1652 graft:
1653         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1654         if (err) {
                     /* Grafting failed: drop the reference held on q. */
1655                 if (q)
1656                         qdisc_put(q);
1657                 return err;
1658         }
1659
1660         return 0;
1661 }
1662
1663 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1664                               struct netlink_callback *cb,
1665                               int *q_idx_p, int s_q_idx, bool recur,
1666                               bool dump_invisible)
1667 {
1668         int ret = 0, q_idx = *q_idx_p;
1669         struct Qdisc *q;
1670         int b;
1671
1672         if (!root)
1673                 return 0;
1674
1675         q = root;
1676         if (q_idx < s_q_idx) {
1677                 q_idx++;
1678         } else {
1679                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1680                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1681                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1682                                   RTM_NEWQDISC) <= 0)
1683                         goto done;
1684                 q_idx++;
1685         }
1686
1687         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1688          * itself has already been dumped.
1689          *
1690          * If we've already dumped the top-level (ingress) qdisc above and the global
1691          * qdisc hashtable, we don't want to hit it again
1692          */
1693         if (!qdisc_dev(root) || !recur)
1694                 goto out;
1695
1696         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1697                 if (q_idx < s_q_idx) {
1698                         q_idx++;
1699                         continue;
1700                 }
1701                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1702                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1703                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1704                                   RTM_NEWQDISC) <= 0)
1705                         goto done;
1706                 q_idx++;
1707         }
1708
1709 out:
1710         *q_idx_p = q_idx;
1711         return ret;
1712 done:
1713         ret = -1;
1714         goto out;
1715 }
1716
1717 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1718 {
1719         struct net *net = sock_net(skb->sk);
1720         int idx, q_idx;
1721         int s_idx, s_q_idx;
1722         struct net_device *dev;
1723         const struct nlmsghdr *nlh = cb->nlh;
1724         struct nlattr *tca[TCA_MAX + 1];
1725         int err;
1726
1727         s_idx = cb->args[0];
1728         s_q_idx = q_idx = cb->args[1];
1729
1730         idx = 0;
1731         ASSERT_RTNL();
1732
1733         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1734                           rtm_tca_policy, cb->extack);
1735         if (err < 0)
1736                 return err;
1737
1738         for_each_netdev(net, dev) {
1739                 struct netdev_queue *dev_queue;
1740
1741                 if (idx < s_idx)
1742                         goto cont;
1743                 if (idx > s_idx)
1744                         s_q_idx = 0;
1745                 q_idx = 0;
1746
1747                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1748                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1749                         goto done;
1750
1751                 dev_queue = dev_ingress_queue(dev);
1752                 if (dev_queue &&
1753                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1754                                        &q_idx, s_q_idx, false,
1755                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1756                         goto done;
1757
1758 cont:
1759                 idx++;
1760         }
1761
1762 done:
1763         cb->args[0] = idx;
1764         cb->args[1] = q_idx;
1765
1766         return skb->len;
1767 }
1768
1769
1770
1771 /************************************************
1772  *      Traffic classes manipulation.           *
1773  ************************************************/
1774
1775 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1776                           unsigned long cl,
1777                           u32 portid, u32 seq, u16 flags, int event)
1778 {
1779         struct tcmsg *tcm;
1780         struct nlmsghdr  *nlh;
1781         unsigned char *b = skb_tail_pointer(skb);
1782         struct gnet_dump d;
1783         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1784
1785         cond_resched();
1786         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1787         if (!nlh)
1788                 goto out_nlmsg_trim;
1789         tcm = nlmsg_data(nlh);
1790         tcm->tcm_family = AF_UNSPEC;
1791         tcm->tcm__pad1 = 0;
1792         tcm->tcm__pad2 = 0;
1793         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1794         tcm->tcm_parent = q->handle;
1795         tcm->tcm_handle = q->handle;
1796         tcm->tcm_info = 0;
1797         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1798                 goto nla_put_failure;
1799         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1800                 goto nla_put_failure;
1801
1802         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1803                                          NULL, &d, TCA_PAD) < 0)
1804                 goto nla_put_failure;
1805
1806         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1807                 goto nla_put_failure;
1808
1809         if (gnet_stats_finish_copy(&d) < 0)
1810                 goto nla_put_failure;
1811
1812         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1813         return skb->len;
1814
1815 out_nlmsg_trim:
1816 nla_put_failure:
1817         nlmsg_trim(skb, b);
1818         return -1;
1819 }
1820
1821 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1822                          struct nlmsghdr *n, struct Qdisc *q,
1823                          unsigned long cl, int event)
1824 {
1825         struct sk_buff *skb;
1826         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1827         int err = 0;
1828
1829         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1830         if (!skb)
1831                 return -ENOBUFS;
1832
1833         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1834                 kfree_skb(skb);
1835                 return -EINVAL;
1836         }
1837
1838         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1839                              n->nlmsg_flags & NLM_F_ECHO);
1840         if (err > 0)
1841                 err = 0;
1842         return err;
1843 }
1844
1845 static int tclass_del_notify(struct net *net,
1846                              const struct Qdisc_class_ops *cops,
1847                              struct sk_buff *oskb, struct nlmsghdr *n,
1848                              struct Qdisc *q, unsigned long cl)
1849 {
1850         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1851         struct sk_buff *skb;
1852         int err = 0;
1853
1854         if (!cops->delete)
1855                 return -EOPNOTSUPP;
1856
1857         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1858         if (!skb)
1859                 return -ENOBUFS;
1860
1861         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1862                            RTM_DELTCLASS) < 0) {
1863                 kfree_skb(skb);
1864                 return -EINVAL;
1865         }
1866
1867         err = cops->delete(q, cl);
1868         if (err) {
1869                 kfree_skb(skb);
1870                 return err;
1871         }
1872
1873         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1874                              n->nlmsg_flags & NLM_F_ECHO);
1875         if (err > 0)
1876                 err = 0;
1877         return err;
1878 }
1879
1880 #ifdef CONFIG_NET_CLS
1881
/*
 * Argument bundle handed to a classifier's walk() while rebinding filters.
 * tcf_node_bind() receives a pointer to the embedded walker and casts it
 * straight back to this structure, so @w has to remain the first member.
 */
1882 struct tcf_bind_args {
1883         struct tcf_walker w;	/* walker passed to tp->ops->walk(); keep first */
1884         u32 classid;	/* class id forwarded to ->bind_class() */
1885         unsigned long cl;	/* class value forwarded to ->bind_class() */
1886 };
1887
1888 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1889 {
1890         struct tcf_bind_args *a = (void *)arg;
1891
1892         if (tp->ops->bind_class) {
1893                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1894
1895                 sch_tree_lock(q);
1896                 tp->ops->bind_class(n, a->classid, a->cl);
1897                 sch_tree_unlock(q);
1898         }
1899         return 0;
1900 }
1901
1902 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1903                            unsigned long new_cl)
1904 {
1905         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1906         struct tcf_block *block;
1907         struct tcf_chain *chain;
1908         unsigned long cl;
1909
1910         cl = cops->find(q, portid);
1911         if (!cl)
1912                 return;
1913         block = cops->tcf_block(q, cl, NULL);
1914         if (!block)
1915                 return;
1916         for (chain = tcf_get_next_chain(block, NULL);
1917              chain;
1918              chain = tcf_get_next_chain(block, chain)) {
1919                 struct tcf_proto *tp;
1920
1921                 for (tp = tcf_get_next_proto(chain, NULL, true);
1922                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1923                         struct tcf_bind_args arg = {};
1924
1925                         arg.w.fn = tcf_node_bind;
1926                         arg.classid = clid;
1927                         arg.cl = new_cl;
1928                         tp->ops->walk(tp, &arg.w, true);
1929                 }
1930         }
1931 }
1932
1933 #else
1934
/*
 * Variant compiled when CONFIG_NET_CLS is not defined: deliberately empty,
 * so callers can invoke tc_bind_tclass() unconditionally.
 */
1935 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1936                            unsigned long new_cl)
1937 {
1938 }
1939
1940 #endif
1941
1942 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1943                          struct netlink_ext_ack *extack)
1944 {
1945         struct net *net = sock_net(skb->sk);
1946         struct tcmsg *tcm = nlmsg_data(n);
1947         struct nlattr *tca[TCA_MAX + 1];
1948         struct net_device *dev;
1949         struct Qdisc *q = NULL;
1950         const struct Qdisc_class_ops *cops;
1951         unsigned long cl = 0;
1952         unsigned long new_cl;
1953         u32 portid;
1954         u32 clid;
1955         u32 qid;
1956         int err;
1957
1958         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1959             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1960                 return -EPERM;
1961
1962         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1963                           extack);
1964         if (err < 0)
1965                 return err;
1966
1967         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1968         if (!dev)
1969                 return -ENODEV;
1970
1971         /*
1972            parent == TC_H_UNSPEC - unspecified parent.
1973            parent == TC_H_ROOT   - class is root, which has no parent.
1974            parent == X:0         - parent is root class.
1975            parent == X:Y         - parent is a node in hierarchy.
1976            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1977
1978            handle == 0:0         - generate handle from kernel pool.
1979            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1980            handle == X:Y         - clear.
1981            handle == X:0         - root class.
1982          */
1983
1984         /* Step 1. Determine qdisc handle X:0 */
1985
1986         portid = tcm->tcm_parent;
1987         clid = tcm->tcm_handle;
1988         qid = TC_H_MAJ(clid);
1989
1990         if (portid != TC_H_ROOT) {
1991                 u32 qid1 = TC_H_MAJ(portid);
1992
1993                 if (qid && qid1) {
1994                         /* If both majors are known, they must be identical. */
1995                         if (qid != qid1)
1996                                 return -EINVAL;
1997                 } else if (qid1) {
1998                         qid = qid1;
1999                 } else if (qid == 0)
2000                         qid = dev->qdisc->handle;
2001
2002                 /* Now qid is genuine qdisc handle consistent
2003                  * both with parent and child.
2004                  *
2005                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2006                  */
2007                 if (portid)
2008                         portid = TC_H_MAKE(qid, portid);
2009         } else {
2010                 if (qid == 0)
2011                         qid = dev->qdisc->handle;
2012         }
2013
2014         /* OK. Locate qdisc */
2015         q = qdisc_lookup(dev, qid);
2016         if (!q)
2017                 return -ENOENT;
2018
2019         /* An check that it supports classes */
2020         cops = q->ops->cl_ops;
2021         if (cops == NULL)
2022                 return -EINVAL;
2023
2024         /* Now try to get class */
2025         if (clid == 0) {
2026                 if (portid == TC_H_ROOT)
2027                         clid = qid;
2028         } else
2029                 clid = TC_H_MAKE(qid, clid);
2030
2031         if (clid)
2032                 cl = cops->find(q, clid);
2033
2034         if (cl == 0) {
2035                 err = -ENOENT;
2036                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2037                     !(n->nlmsg_flags & NLM_F_CREATE))
2038                         goto out;
2039         } else {
2040                 switch (n->nlmsg_type) {
2041                 case RTM_NEWTCLASS:
2042                         err = -EEXIST;
2043                         if (n->nlmsg_flags & NLM_F_EXCL)
2044                                 goto out;
2045                         break;
2046                 case RTM_DELTCLASS:
2047                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2048                         /* Unbind the class with flilters with 0 */
2049                         tc_bind_tclass(q, portid, clid, 0);
2050                         goto out;
2051                 case RTM_GETTCLASS:
2052                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2053                         goto out;
2054                 default:
2055                         err = -EINVAL;
2056                         goto out;
2057                 }
2058         }
2059
2060         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2061                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2062                 return -EOPNOTSUPP;
2063         }
2064
2065         new_cl = cl;
2066         err = -EOPNOTSUPP;
2067         if (cops->change)
2068                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2069         if (err == 0) {
2070                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2071                 /* We just create a new class, need to do reverse binding. */
2072                 if (cl != new_cl)
2073                         tc_bind_tclass(q, portid, clid, new_cl);
2074         }
2075 out:
2076         return err;
2077 }
2078
/*
 * Walker state used while dumping the classes of one qdisc.
 * qdisc_class_dump() casts the walker pointer it is given back to this
 * structure, so @w has to remain the first member.
 */
2079 struct qdisc_dump_args {
2080         struct qdisc_walker     w;	/* walker passed to cl_ops->walk(); keep first */
2081         struct sk_buff          *skb;	/* message buffer the classes are written to */
2082         struct netlink_callback *cb;	/* dump callback: source of portid and sequence number */
2083 };
2084
2085 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2086                             struct qdisc_walker *arg)
2087 {
2088         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2089
2090         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2091                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2092                               RTM_NEWTCLASS);
2093 }
2094
2095 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2096                                 struct tcmsg *tcm, struct netlink_callback *cb,
2097                                 int *t_p, int s_t)
2098 {
2099         struct qdisc_dump_args arg;
2100
2101         if (tc_qdisc_dump_ignore(q, false) ||
2102             *t_p < s_t || !q->ops->cl_ops ||
2103             (tcm->tcm_parent &&
2104              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2105                 (*t_p)++;
2106                 return 0;
2107         }
2108         if (*t_p > s_t)
2109                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2110         arg.w.fn = qdisc_class_dump;
2111         arg.skb = skb;
2112         arg.cb = cb;
2113         arg.w.stop  = 0;
2114         arg.w.skip = cb->args[1];
2115         arg.w.count = 0;
2116         q->ops->cl_ops->walk(q, &arg.w);
2117         cb->args[1] = arg.w.count;
2118         if (arg.w.stop)
2119                 return -1;
2120         (*t_p)++;
2121         return 0;
2122 }
2123
2124 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2125                                struct tcmsg *tcm, struct netlink_callback *cb,
2126                                int *t_p, int s_t)
2127 {
2128         struct Qdisc *q;
2129         int b;
2130
2131         if (!root)
2132                 return 0;
2133
2134         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2135                 return -1;
2136
2137         if (!qdisc_dev(root))
2138                 return 0;
2139
2140         if (tcm->tcm_parent) {
2141                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2142                 if (q && q != root &&
2143                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2144                         return -1;
2145                 return 0;
2146         }
2147         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2148                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2149                         return -1;
2150         }
2151
2152         return 0;
2153 }
2154
2155 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2156 {
2157         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2158         struct net *net = sock_net(skb->sk);
2159         struct netdev_queue *dev_queue;
2160         struct net_device *dev;
2161         int t, s_t;
2162
2163         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2164                 return 0;
2165         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2166         if (!dev)
2167                 return 0;
2168
2169         s_t = cb->args[0];
2170         t = 0;
2171
2172         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2173                 goto done;
2174
2175         dev_queue = dev_ingress_queue(dev);
2176         if (dev_queue &&
2177             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2178                                 &t, s_t) < 0)
2179                 goto done;
2180
2181 done:
2182         cb->args[0] = t;
2183
2184         dev_put(dev);
2185         return skb->len;
2186 }
2187
2188 #ifdef CONFIG_PROC_FS
2189 static int psched_show(struct seq_file *seq, void *v)
2190 {
2191         seq_printf(seq, "%08x %08x %08x %08x\n",
2192                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2193                    1000000,
2194                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2195
2196         return 0;
2197 }
2198
2199 static int __net_init psched_net_init(struct net *net)
2200 {
2201         struct proc_dir_entry *e;
2202
2203         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2204         if (e == NULL)
2205                 return -ENOMEM;
2206
2207         return 0;
2208 }
2209
2210 static void __net_exit psched_net_exit(struct net *net)
2211 {
2212         remove_proc_entry("psched", net->proc_net);
2213 }
2214 #else
/*
 * Variant compiled when CONFIG_PROC_FS is not defined: there is no proc
 * entry to create, so per-namespace init trivially succeeds.
 */
2215 static int __net_init psched_net_init(struct net *net)
2216 {
2217         return 0;
2218 }
2219
/*
 * Variant compiled when CONFIG_PROC_FS is not defined: nothing was created
 * at init time, so there is nothing to tear down.
 */
2220 static void __net_exit psched_net_exit(struct net *net)
2221 {
2222 }
2223 #endif
2224
2225 static struct pernet_operations psched_net_ops = {
2226         .init = psched_net_init,
2227         .exit = psched_net_exit,
2228 };
2229
2230 static int __init pktsched_init(void)
2231 {
2232         int err;
2233
2234         err = register_pernet_subsys(&psched_net_ops);
2235         if (err) {
2236                 pr_err("pktsched_init: "
2237                        "cannot initialize per netns operations\n");
2238                 return err;
2239         }
2240
2241         register_qdisc(&pfifo_fast_ops);
2242         register_qdisc(&pfifo_qdisc_ops);
2243         register_qdisc(&bfifo_qdisc_ops);
2244         register_qdisc(&pfifo_head_drop_qdisc_ops);
2245         register_qdisc(&mq_qdisc_ops);
2246         register_qdisc(&noqueue_qdisc_ops);
2247
2248         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2249         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2250         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2251                       0);
2252         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2253         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2254         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2255                       0);
2256
2257         return 0;
2258 }
2259
2260 subsys_initcall(pktsched_init);