Documentation: embargoed-hardware-issues.rst: Add myself for Power
[sfrench/cifs-2.6.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).
57
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81
82    ---enqueue
83
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
87    NET_XMIT_DROP        - this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
116 /* Protects list of registered TC modules. It is pure SMP lock. */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
125 /* The list of all installed queueing disciplines. */
126
127 static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module(NET_SCH_ALIAS_PREFIX "%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by CONFIG_DEFAULT_NET_SCH. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332                                           handle);
333 out:
334         return q;
335 }
336
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339         unsigned long cl;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345
346         if (cl == 0)
347                 return NULL;
348         return cops->leaf(p, cl);
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 ||
419             r->cell_log == 0 || r->cell_log >= 32 ||
420             nla_len(tab) != TC_RTAB_SIZE) {
421                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422                 return NULL;
423         }
424
425         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
428                         rtab->refcnt++;
429                         return rtab;
430                 }
431         }
432
433         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434         if (rtab) {
435                 rtab->rate = *r;
436                 rtab->refcnt = 1;
437                 memcpy(rtab->data, nla_data(tab), 1024);
438                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
439                         r->linklayer = __detect_linklayer(r, rtab->data);
440                 rtab->next = qdisc_rtab_list;
441                 qdisc_rtab_list = rtab;
442         } else {
443                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444         }
445         return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451         struct qdisc_rate_table *rtab, **rtabp;
452
453         if (!tab || --tab->refcnt)
454                 return;
455
456         for (rtabp = &qdisc_rtab_list;
457              (rtab = *rtabp) != NULL;
458              rtabp = &rtab->next) {
459                 if (rtab == tab) {
460                         *rtabp = rtab->next;
461                         kfree(rtab);
462                         return;
463                 }
464         }
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467
468 static LIST_HEAD(qdisc_stab_list);
469
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
472         [TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476                                                struct netlink_ext_ack *extack)
477 {
478         struct nlattr *tb[TCA_STAB_MAX + 1];
479         struct qdisc_size_table *stab;
480         struct tc_sizespec *s;
481         unsigned int tsize = 0;
482         u16 *tab = NULL;
483         int err;
484
485         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486                                           extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493
494         s = nla_data(tb[TCA_STAB_BASE]);
495
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 &&
514                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515                         continue;
516                 stab->refcnt++;
517                 return stab;
518         }
519
520         if (s->size_log > STAB_SIZE_LOG_MAX ||
521             s->cell_log > STAB_SIZE_LOG_MAX) {
522                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523                 return ERR_PTR(-EINVAL);
524         }
525
526         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527         if (!stab)
528                 return ERR_PTR(-ENOMEM);
529
530         stab->refcnt = 1;
531         stab->szopts = *s;
532         if (tsize > 0)
533                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534
535         list_add_tail(&stab->list, &qdisc_stab_list);
536
537         return stab;
538 }
539
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542         if (!tab)
543                 return;
544
545         if (--tab->refcnt == 0) {
546                 list_del(&tab->list);
547                 kfree_rcu(tab, rcu);
548         }
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554         struct nlattr *nest;
555
556         nest = nla_nest_start_noflag(skb, TCA_STAB);
557         if (nest == NULL)
558                 goto nla_put_failure;
559         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560                 goto nla_put_failure;
561         nla_nest_end(skb, nest);
562
563         return skb->len;
564
565 nla_put_failure:
566         return -1;
567 }
568
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570                                const struct qdisc_size_table *stab)
571 {
572         int pkt_len, slot;
573
574         pkt_len = skb->len + stab->szopts.overhead;
575         if (unlikely(!stab->szopts.tsize))
576                 goto out;
577
578         slot = pkt_len + stab->szopts.cell_align;
579         if (unlikely(slot < 0))
580                 slot = 0;
581
582         slot >>= stab->szopts.cell_log;
583         if (likely(slot < stab->szopts.tsize))
584                 pkt_len = stab->data[slot];
585         else
586                 pkt_len = stab->data[stab->szopts.tsize - 1] *
587                                 (slot / stab->szopts.tsize) +
588                                 stab->data[slot % stab->szopts.tsize];
589
590         pkt_len <<= stab->szopts.size_log;
591 out:
592         if (unlikely(pkt_len < 1))
593                 pkt_len = 1;
594         qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
597
598 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
599 {
600         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
601                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
602                         txt, qdisc->ops->id, qdisc->handle >> 16);
603                 qdisc->flags |= TCQ_F_WARN_NONWC;
604         }
605 }
606 EXPORT_SYMBOL(qdisc_warn_nonwc);
607
608 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
609 {
610         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
611                                                  timer);
612
613         rcu_read_lock();
614         __netif_schedule(qdisc_root(wd->qdisc));
615         rcu_read_unlock();
616
617         return HRTIMER_NORESTART;
618 }
619
620 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
621                                  clockid_t clockid)
622 {
623         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
624         wd->timer.function = qdisc_watchdog;
625         wd->qdisc = qdisc;
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
628
629 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
630 {
631         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
632 }
633 EXPORT_SYMBOL(qdisc_watchdog_init);
634
635 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
636                                       u64 delta_ns)
637 {
638         bool deactivated;
639
640         rcu_read_lock();
641         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
642                                &qdisc_root_sleeping(wd->qdisc)->state);
643         rcu_read_unlock();
644         if (deactivated)
645                 return;
646
647         if (hrtimer_is_queued(&wd->timer)) {
648                 u64 softexpires;
649
650                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
651                 /* If timer is already set in [expires, expires + delta_ns],
652                  * do not reprogram it.
653                  */
654                 if (softexpires - expires <= delta_ns)
655                         return;
656         }
657
658         hrtimer_start_range_ns(&wd->timer,
659                                ns_to_ktime(expires),
660                                delta_ns,
661                                HRTIMER_MODE_ABS_PINNED);
662 }
663 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
664
665 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
666 {
667         hrtimer_cancel(&wd->timer);
668 }
669 EXPORT_SYMBOL(qdisc_watchdog_cancel);
670
671 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
672 {
673         struct hlist_head *h;
674         unsigned int i;
675
676         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
677
678         if (h != NULL) {
679                 for (i = 0; i < n; i++)
680                         INIT_HLIST_HEAD(&h[i]);
681         }
682         return h;
683 }
684
685 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
686 {
687         struct Qdisc_class_common *cl;
688         struct hlist_node *next;
689         struct hlist_head *nhash, *ohash;
690         unsigned int nsize, nmask, osize;
691         unsigned int i, h;
692
693         /* Rehash when load factor exceeds 0.75 */
694         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
695                 return;
696         nsize = clhash->hashsize * 2;
697         nmask = nsize - 1;
698         nhash = qdisc_class_hash_alloc(nsize);
699         if (nhash == NULL)
700                 return;
701
702         ohash = clhash->hash;
703         osize = clhash->hashsize;
704
705         sch_tree_lock(sch);
706         for (i = 0; i < osize; i++) {
707                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
708                         h = qdisc_class_hash(cl->classid, nmask);
709                         hlist_add_head(&cl->hnode, &nhash[h]);
710                 }
711         }
712         clhash->hash     = nhash;
713         clhash->hashsize = nsize;
714         clhash->hashmask = nmask;
715         sch_tree_unlock(sch);
716
717         kvfree(ohash);
718 }
719 EXPORT_SYMBOL(qdisc_class_hash_grow);
720
721 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
722 {
723         unsigned int size = 4;
724
725         clhash->hash = qdisc_class_hash_alloc(size);
726         if (!clhash->hash)
727                 return -ENOMEM;
728         clhash->hashsize  = size;
729         clhash->hashmask  = size - 1;
730         clhash->hashelems = 0;
731         return 0;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_init);
734
735 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
736 {
737         kvfree(clhash->hash);
738 }
739 EXPORT_SYMBOL(qdisc_class_hash_destroy);
740
741 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
742                              struct Qdisc_class_common *cl)
743 {
744         unsigned int h;
745
746         INIT_HLIST_NODE(&cl->hnode);
747         h = qdisc_class_hash(cl->classid, clhash->hashmask);
748         hlist_add_head(&cl->hnode, &clhash->hash[h]);
749         clhash->hashelems++;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_insert);
752
753 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
754                              struct Qdisc_class_common *cl)
755 {
756         hlist_del(&cl->hnode);
757         clhash->hashelems--;
758 }
759 EXPORT_SYMBOL(qdisc_class_hash_remove);
760
761 /* Allocate an unique handle from space managed by kernel
762  * Possible range is [8000-FFFF]:0000 (0x8000 values)
763  */
764 static u32 qdisc_alloc_handle(struct net_device *dev)
765 {
766         int i = 0x8000;
767         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
768
769         do {
770                 autohandle += TC_H_MAKE(0x10000U, 0);
771                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
772                         autohandle = TC_H_MAKE(0x80000000U, 0);
773                 if (!qdisc_lookup(dev, autohandle))
774                         return autohandle;
775                 cond_resched();
776         } while (--i > 0);
777
778         return 0;
779 }
780
781 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
782 {
783         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
784         const struct Qdisc_class_ops *cops;
785         unsigned long cl;
786         u32 parentid;
787         bool notify;
788         int drops;
789
790         if (n == 0 && len == 0)
791                 return;
792         drops = max_t(int, n, 0);
793         rcu_read_lock();
794         while ((parentid = sch->parent)) {
795                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
796                         break;
797
798                 if (sch->flags & TCQ_F_NOPARENT)
799                         break;
800                 /* Notify parent qdisc only if child qdisc becomes empty.
801                  *
802                  * If child was empty even before update then backlog
803                  * counter is screwed and we skip notification because
804                  * parent class is already passive.
805                  *
806                  * If the original child was offloaded then it is allowed
807                  * to be seem as empty, so the parent is notified anyway.
808                  */
809                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
810                                                        !qdisc_is_offloaded);
811                 /* TODO: perform the search on a per txq basis */
812                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
813                 if (sch == NULL) {
814                         WARN_ON_ONCE(parentid != TC_H_ROOT);
815                         break;
816                 }
817                 cops = sch->ops->cl_ops;
818                 if (notify && cops->qlen_notify) {
819                         cl = cops->find(sch, parentid);
820                         cops->qlen_notify(sch, cl);
821                 }
822                 sch->q.qlen -= n;
823                 sch->qstats.backlog -= len;
824                 __qdisc_qstats_drop(sch, drops);
825         }
826         rcu_read_unlock();
827 }
828 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
829
830 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
831                               void *type_data)
832 {
833         struct net_device *dev = qdisc_dev(sch);
834         int err;
835
836         sch->flags &= ~TCQ_F_OFFLOADED;
837         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
838                 return 0;
839
840         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
841         if (err == -EOPNOTSUPP)
842                 return 0;
843
844         if (!err)
845                 sch->flags |= TCQ_F_OFFLOADED;
846
847         return err;
848 }
849 EXPORT_SYMBOL(qdisc_offload_dump_helper);
850
851 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
852                                 struct Qdisc *new, struct Qdisc *old,
853                                 enum tc_setup_type type, void *type_data,
854                                 struct netlink_ext_ack *extack)
855 {
856         bool any_qdisc_is_offloaded;
857         int err;
858
859         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
860                 return;
861
862         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
863
864         /* Don't report error if the graft is part of destroy operation. */
865         if (!err || !new || new == &noop_qdisc)
866                 return;
867
868         /* Don't report error if the parent, the old child and the new
869          * one are not offloaded.
870          */
871         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
872         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
873         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
874
875         if (any_qdisc_is_offloaded)
876                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
877 }
878 EXPORT_SYMBOL(qdisc_offload_graft_helper);
879
880 void qdisc_offload_query_caps(struct net_device *dev,
881                               enum tc_setup_type type,
882                               void *caps, size_t caps_len)
883 {
884         const struct net_device_ops *ops = dev->netdev_ops;
885         struct tc_query_caps_base base = {
886                 .type = type,
887                 .caps = caps,
888         };
889
890         memset(caps, 0, caps_len);
891
892         if (ops->ndo_setup_tc)
893                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
894 }
895 EXPORT_SYMBOL(qdisc_offload_query_caps);
896
897 static void qdisc_offload_graft_root(struct net_device *dev,
898                                      struct Qdisc *new, struct Qdisc *old,
899                                      struct netlink_ext_ack *extack)
900 {
901         struct tc_root_qopt_offload graft_offload = {
902                 .command        = TC_ROOT_GRAFT,
903                 .handle         = new ? new->handle : 0,
904                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
905                                   (old && old->flags & TCQ_F_INGRESS),
906         };
907
908         qdisc_offload_graft_helper(dev, NULL, new, old,
909                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
910 }
911
912 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
913                          u32 portid, u32 seq, u16 flags, int event,
914                          struct netlink_ext_ack *extack)
915 {
916         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
917         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
918         struct tcmsg *tcm;
919         struct nlmsghdr  *nlh;
920         unsigned char *b = skb_tail_pointer(skb);
921         struct gnet_dump d;
922         struct qdisc_size_table *stab;
923         u32 block_index;
924         __u32 qlen;
925
926         cond_resched();
927         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
928         if (!nlh)
929                 goto out_nlmsg_trim;
930         tcm = nlmsg_data(nlh);
931         tcm->tcm_family = AF_UNSPEC;
932         tcm->tcm__pad1 = 0;
933         tcm->tcm__pad2 = 0;
934         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
935         tcm->tcm_parent = clid;
936         tcm->tcm_handle = q->handle;
937         tcm->tcm_info = refcount_read(&q->refcnt);
938         if (nla_put_string(skb, TCA_KIND, q->ops->id))
939                 goto nla_put_failure;
940         if (q->ops->ingress_block_get) {
941                 block_index = q->ops->ingress_block_get(q);
942                 if (block_index &&
943                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
944                         goto nla_put_failure;
945         }
946         if (q->ops->egress_block_get) {
947                 block_index = q->ops->egress_block_get(q);
948                 if (block_index &&
949                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
950                         goto nla_put_failure;
951         }
952         if (q->ops->dump && q->ops->dump(q, skb) < 0)
953                 goto nla_put_failure;
954         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
955                 goto nla_put_failure;
956         qlen = qdisc_qlen_sum(q);
957
958         stab = rtnl_dereference(q->stab);
959         if (stab && qdisc_dump_stab(skb, stab) < 0)
960                 goto nla_put_failure;
961
962         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
963                                          NULL, &d, TCA_PAD) < 0)
964                 goto nla_put_failure;
965
966         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
967                 goto nla_put_failure;
968
969         if (qdisc_is_percpu_stats(q)) {
970                 cpu_bstats = q->cpu_bstats;
971                 cpu_qstats = q->cpu_qstats;
972         }
973
974         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
975             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
976             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
977                 goto nla_put_failure;
978
979         if (gnet_stats_finish_copy(&d) < 0)
980                 goto nla_put_failure;
981
982         if (extack && extack->_msg &&
983             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
984                 goto out_nlmsg_trim;
985
986         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
987
988         return skb->len;
989
990 out_nlmsg_trim:
991 nla_put_failure:
992         nlmsg_trim(skb, b);
993         return -1;
994 }
995
996 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
997 {
998         if (q->flags & TCQ_F_BUILTIN)
999                 return true;
1000         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1001                 return true;
1002
1003         return false;
1004 }
1005
1006 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
1007                             struct nlmsghdr *n, u32 clid, struct Qdisc *q,
1008                             struct netlink_ext_ack *extack)
1009 {
1010         struct sk_buff *skb;
1011         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1012
1013         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1014         if (!skb)
1015                 return -ENOBUFS;
1016
1017         if (!tc_qdisc_dump_ignore(q, false)) {
1018                 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
1019                                   RTM_NEWQDISC, extack) < 0)
1020                         goto err_out;
1021         }
1022
1023         if (skb->len)
1024                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1025                                       n->nlmsg_flags & NLM_F_ECHO);
1026
1027 err_out:
1028         kfree_skb(skb);
1029         return -EINVAL;
1030 }
1031
1032 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1033                         struct nlmsghdr *n, u32 clid,
1034                         struct Qdisc *old, struct Qdisc *new,
1035                         struct netlink_ext_ack *extack)
1036 {
1037         struct sk_buff *skb;
1038         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1039
1040         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1041                 return 0;
1042
1043         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1044         if (!skb)
1045                 return -ENOBUFS;
1046
1047         if (old && !tc_qdisc_dump_ignore(old, false)) {
1048                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1049                                   0, RTM_DELQDISC, extack) < 0)
1050                         goto err_out;
1051         }
1052         if (new && !tc_qdisc_dump_ignore(new, false)) {
1053                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1054                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1055                         goto err_out;
1056         }
1057
1058         if (skb->len)
1059                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1060                                       n->nlmsg_flags & NLM_F_ECHO);
1061
1062 err_out:
1063         kfree_skb(skb);
1064         return -EINVAL;
1065 }
1066
1067 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1068                                struct nlmsghdr *n, u32 clid,
1069                                struct Qdisc *old, struct Qdisc *new,
1070                                struct netlink_ext_ack *extack)
1071 {
1072         if (new || old)
1073                 qdisc_notify(net, skb, n, clid, old, new, extack);
1074
1075         if (old)
1076                 qdisc_put(old);
1077 }
1078
1079 static void qdisc_clear_nolock(struct Qdisc *sch)
1080 {
1081         sch->flags &= ~TCQ_F_NOLOCK;
1082         if (!(sch->flags & TCQ_F_CPUSTATS))
1083                 return;
1084
1085         free_percpu(sch->cpu_bstats);
1086         free_percpu(sch->cpu_qstats);
1087         sch->cpu_bstats = NULL;
1088         sch->cpu_qstats = NULL;
1089         sch->flags &= ~TCQ_F_CPUSTATS;
1090 }
1091
1092 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1093  * to device "dev".
1094  *
1095  * When appropriate send a netlink notification using 'skb'
1096  * and "n".
1097  *
1098  * On success, destroy old qdisc.
1099  */
1100
1101 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1102                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1103                        struct Qdisc *new, struct Qdisc *old,
1104                        struct netlink_ext_ack *extack)
1105 {
1106         struct Qdisc *q = old;
1107         struct net *net = dev_net(dev);
1108
1109         if (parent == NULL) {
1110                 unsigned int i, num_q, ingress;
1111                 struct netdev_queue *dev_queue;
1112
1113                 ingress = 0;
1114                 num_q = dev->num_tx_queues;
1115                 if ((q && q->flags & TCQ_F_INGRESS) ||
1116                     (new && new->flags & TCQ_F_INGRESS)) {
1117                         ingress = 1;
1118                         dev_queue = dev_ingress_queue(dev);
1119                         if (!dev_queue) {
1120                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1121                                 return -ENOENT;
1122                         }
1123
1124                         q = rtnl_dereference(dev_queue->qdisc_sleeping);
1125
1126                         /* This is the counterpart of that qdisc_refcount_inc_nz() call in
1127                          * __tcf_qdisc_find() for filter requests.
1128                          */
1129                         if (!qdisc_refcount_dec_if_one(q)) {
1130                                 NL_SET_ERR_MSG(extack,
1131                                                "Current ingress or clsact Qdisc has ongoing filter requests");
1132                                 return -EBUSY;
1133                         }
1134                 }
1135
1136                 if (dev->flags & IFF_UP)
1137                         dev_deactivate(dev);
1138
1139                 qdisc_offload_graft_root(dev, new, old, extack);
1140
1141                 if (new && new->ops->attach && !ingress)
1142                         goto skip;
1143
1144                 if (!ingress) {
1145                         for (i = 0; i < num_q; i++) {
1146                                 dev_queue = netdev_get_tx_queue(dev, i);
1147                                 old = dev_graft_qdisc(dev_queue, new);
1148
1149                                 if (new && i > 0)
1150                                         qdisc_refcount_inc(new);
1151                                 qdisc_put(old);
1152                         }
1153                 } else {
1154                         old = dev_graft_qdisc(dev_queue, NULL);
1155
1156                         /* {ingress,clsact}_destroy() @old before grafting @new to avoid
1157                          * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1158                          * pointer(s) in mini_qdisc_pair_swap().
1159                          */
1160                         qdisc_notify(net, skb, n, classid, old, new, extack);
1161                         qdisc_destroy(old);
1162
1163                         dev_graft_qdisc(dev_queue, new);
1164                 }
1165
1166 skip:
1167                 if (!ingress) {
1168                         old = rtnl_dereference(dev->qdisc);
1169                         if (new && !new->ops->attach)
1170                                 qdisc_refcount_inc(new);
1171                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1172
1173                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1174
1175                         if (new && new->ops->attach)
1176                                 new->ops->attach(new);
1177                 }
1178
1179                 if (dev->flags & IFF_UP)
1180                         dev_activate(dev);
1181         } else {
1182                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1183                 unsigned long cl;
1184                 int err;
1185
1186                 /* Only support running class lockless if parent is lockless */
1187                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1188                         qdisc_clear_nolock(new);
1189
1190                 if (!cops || !cops->graft)
1191                         return -EOPNOTSUPP;
1192
1193                 cl = cops->find(parent, classid);
1194                 if (!cl) {
1195                         NL_SET_ERR_MSG(extack, "Specified class not found");
1196                         return -ENOENT;
1197                 }
1198
1199                 if (new && new->ops == &noqueue_qdisc_ops) {
1200                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1201                         return -EINVAL;
1202                 }
1203
1204                 err = cops->graft(parent, cl, new, &old, extack);
1205                 if (err)
1206                         return err;
1207                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1208         }
1209         return 0;
1210 }
1211
1212 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1213                                    struct netlink_ext_ack *extack)
1214 {
1215         u32 block_index;
1216
1217         if (tca[TCA_INGRESS_BLOCK]) {
1218                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1219
1220                 if (!block_index) {
1221                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1222                         return -EINVAL;
1223                 }
1224                 if (!sch->ops->ingress_block_set) {
1225                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1226                         return -EOPNOTSUPP;
1227                 }
1228                 sch->ops->ingress_block_set(sch, block_index);
1229         }
1230         if (tca[TCA_EGRESS_BLOCK]) {
1231                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1232
1233                 if (!block_index) {
1234                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1235                         return -EINVAL;
1236                 }
1237                 if (!sch->ops->egress_block_set) {
1238                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1239                         return -EOPNOTSUPP;
1240                 }
1241                 sch->ops->egress_block_set(sch, block_index);
1242         }
1243         return 0;
1244 }
1245
1246 /*
1247    Allocate and initialize new qdisc.
1248
1249    Parameters are passed via opt.
1250  */
1251
1252 static struct Qdisc *qdisc_create(struct net_device *dev,
1253                                   struct netdev_queue *dev_queue,
1254                                   u32 parent, u32 handle,
1255                                   struct nlattr **tca, int *errp,
1256                                   struct netlink_ext_ack *extack)
1257 {
1258         int err;
1259         struct nlattr *kind = tca[TCA_KIND];
1260         struct Qdisc *sch;
1261         struct Qdisc_ops *ops;
1262         struct qdisc_size_table *stab;
1263
1264         ops = qdisc_lookup_ops(kind);
1265 #ifdef CONFIG_MODULES
1266         if (ops == NULL && kind != NULL) {
1267                 char name[IFNAMSIZ];
1268                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1269                         /* We dropped the RTNL semaphore in order to
1270                          * perform the module load.  So, even if we
1271                          * succeeded in loading the module we have to
1272                          * tell the caller to replay the request.  We
1273                          * indicate this using -EAGAIN.
1274                          * We replay the request because the device may
1275                          * go away in the mean time.
1276                          */
1277                         rtnl_unlock();
1278                         request_module(NET_SCH_ALIAS_PREFIX "%s", name);
1279                         rtnl_lock();
1280                         ops = qdisc_lookup_ops(kind);
1281                         if (ops != NULL) {
1282                                 /* We will try again qdisc_lookup_ops,
1283                                  * so don't keep a reference.
1284                                  */
1285                                 module_put(ops->owner);
1286                                 err = -EAGAIN;
1287                                 goto err_out;
1288                         }
1289                 }
1290         }
1291 #endif
1292
1293         err = -ENOENT;
1294         if (!ops) {
1295                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1296                 goto err_out;
1297         }
1298
1299         sch = qdisc_alloc(dev_queue, ops, extack);
1300         if (IS_ERR(sch)) {
1301                 err = PTR_ERR(sch);
1302                 goto err_out2;
1303         }
1304
1305         sch->parent = parent;
1306
1307         if (handle == TC_H_INGRESS) {
1308                 if (!(sch->flags & TCQ_F_INGRESS)) {
1309                         NL_SET_ERR_MSG(extack,
1310                                        "Specified parent ID is reserved for ingress and clsact Qdiscs");
1311                         err = -EINVAL;
1312                         goto err_out3;
1313                 }
1314                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1315         } else {
1316                 if (handle == 0) {
1317                         handle = qdisc_alloc_handle(dev);
1318                         if (handle == 0) {
1319                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1320                                 err = -ENOSPC;
1321                                 goto err_out3;
1322                         }
1323                 }
1324                 if (!netif_is_multiqueue(dev))
1325                         sch->flags |= TCQ_F_ONETXQUEUE;
1326         }
1327
1328         sch->handle = handle;
1329
1330         /* This exist to keep backward compatible with a userspace
1331          * loophole, what allowed userspace to get IFF_NO_QUEUE
1332          * facility on older kernels by setting tx_queue_len=0 (prior
1333          * to qdisc init), and then forgot to reinit tx_queue_len
1334          * before again attaching a qdisc.
1335          */
1336         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1337                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1338                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1339         }
1340
1341         err = qdisc_block_indexes_set(sch, tca, extack);
1342         if (err)
1343                 goto err_out3;
1344
1345         if (tca[TCA_STAB]) {
1346                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1347                 if (IS_ERR(stab)) {
1348                         err = PTR_ERR(stab);
1349                         goto err_out3;
1350                 }
1351                 rcu_assign_pointer(sch->stab, stab);
1352         }
1353
1354         if (ops->init) {
1355                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1356                 if (err != 0)
1357                         goto err_out4;
1358         }
1359
1360         if (tca[TCA_RATE]) {
1361                 err = -EOPNOTSUPP;
1362                 if (sch->flags & TCQ_F_MQROOT) {
1363                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1364                         goto err_out4;
1365                 }
1366
1367                 err = gen_new_estimator(&sch->bstats,
1368                                         sch->cpu_bstats,
1369                                         &sch->rate_est,
1370                                         NULL,
1371                                         true,
1372                                         tca[TCA_RATE]);
1373                 if (err) {
1374                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1375                         goto err_out4;
1376                 }
1377         }
1378
1379         qdisc_hash_add(sch, false);
1380         trace_qdisc_create(ops, dev, parent);
1381
1382         return sch;
1383
1384 err_out4:
1385         /* Even if ops->init() failed, we call ops->destroy()
1386          * like qdisc_create_dflt().
1387          */
1388         if (ops->destroy)
1389                 ops->destroy(sch);
1390         qdisc_put_stab(rtnl_dereference(sch->stab));
1391 err_out3:
1392         netdev_put(dev, &sch->dev_tracker);
1393         qdisc_free(sch);
1394 err_out2:
1395         module_put(ops->owner);
1396 err_out:
1397         *errp = err;
1398         return NULL;
1399 }
1400
1401 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1402                         struct netlink_ext_ack *extack)
1403 {
1404         struct qdisc_size_table *ostab, *stab = NULL;
1405         int err = 0;
1406
1407         if (tca[TCA_OPTIONS]) {
1408                 if (!sch->ops->change) {
1409                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1410                         return -EINVAL;
1411                 }
1412                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1413                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1414                         return -EOPNOTSUPP;
1415                 }
1416                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1417                 if (err)
1418                         return err;
1419         }
1420
1421         if (tca[TCA_STAB]) {
1422                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1423                 if (IS_ERR(stab))
1424                         return PTR_ERR(stab);
1425         }
1426
1427         ostab = rtnl_dereference(sch->stab);
1428         rcu_assign_pointer(sch->stab, stab);
1429         qdisc_put_stab(ostab);
1430
1431         if (tca[TCA_RATE]) {
1432                 /* NB: ignores errors from replace_estimator
1433                    because change can't be undone. */
1434                 if (sch->flags & TCQ_F_MQROOT)
1435                         goto out;
1436                 gen_replace_estimator(&sch->bstats,
1437                                       sch->cpu_bstats,
1438                                       &sch->rate_est,
1439                                       NULL,
1440                                       true,
1441                                       tca[TCA_RATE]);
1442         }
1443 out:
1444         return 0;
1445 }
1446
1447 struct check_loop_arg {
1448         struct qdisc_walker     w;
1449         struct Qdisc            *p;
1450         int                     depth;
1451 };
1452
1453 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1454                          struct qdisc_walker *w);
1455
1456 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1457 {
1458         struct check_loop_arg   arg;
1459
1460         if (q->ops->cl_ops == NULL)
1461                 return 0;
1462
1463         arg.w.stop = arg.w.skip = arg.w.count = 0;
1464         arg.w.fn = check_loop_fn;
1465         arg.depth = depth;
1466         arg.p = p;
1467         q->ops->cl_ops->walk(q, &arg.w);
1468         return arg.w.stop ? -ELOOP : 0;
1469 }
1470
1471 static int
1472 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1473 {
1474         struct Qdisc *leaf;
1475         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1476         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1477
1478         leaf = cops->leaf(q, cl);
1479         if (leaf) {
1480                 if (leaf == arg->p || arg->depth > 7)
1481                         return -ELOOP;
1482                 return check_loop(leaf, arg->p, arg->depth + 1);
1483         }
1484         return 0;
1485 }
1486
1487 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1488         [TCA_KIND]              = { .type = NLA_STRING },
1489         [TCA_RATE]              = { .type = NLA_BINARY,
1490                                     .len = sizeof(struct tc_estimator) },
1491         [TCA_STAB]              = { .type = NLA_NESTED },
1492         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1493         [TCA_CHAIN]             = { .type = NLA_U32 },
1494         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1495         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1496 };
1497
1498 /*
1499  * Delete/get qdisc.
1500  */
1501
1502 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1503                         struct netlink_ext_ack *extack)
1504 {
1505         struct net *net = sock_net(skb->sk);
1506         struct tcmsg *tcm = nlmsg_data(n);
1507         struct nlattr *tca[TCA_MAX + 1];
1508         struct net_device *dev;
1509         u32 clid;
1510         struct Qdisc *q = NULL;
1511         struct Qdisc *p = NULL;
1512         int err;
1513
1514         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1515                                      rtm_tca_policy, extack);
1516         if (err < 0)
1517                 return err;
1518
1519         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1520         if (!dev)
1521                 return -ENODEV;
1522
1523         clid = tcm->tcm_parent;
1524         if (clid) {
1525                 if (clid != TC_H_ROOT) {
1526                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1527                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1528                                 if (!p) {
1529                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1530                                         return -ENOENT;
1531                                 }
1532                                 q = qdisc_leaf(p, clid);
1533                         } else if (dev_ingress_queue(dev)) {
1534                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1535                         }
1536                 } else {
1537                         q = rtnl_dereference(dev->qdisc);
1538                 }
1539                 if (!q) {
1540                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1541                         return -ENOENT;
1542                 }
1543
1544                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1545                         NL_SET_ERR_MSG(extack, "Invalid handle");
1546                         return -EINVAL;
1547                 }
1548         } else {
1549                 q = qdisc_lookup(dev, tcm->tcm_handle);
1550                 if (!q) {
1551                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1552                         return -ENOENT;
1553                 }
1554         }
1555
1556         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1557                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1558                 return -EINVAL;
1559         }
1560
1561         if (n->nlmsg_type == RTM_DELQDISC) {
1562                 if (!clid) {
1563                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1564                         return -EINVAL;
1565                 }
1566                 if (q->handle == 0) {
1567                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1568                         return -ENOENT;
1569                 }
1570                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1571                 if (err != 0)
1572                         return err;
1573         } else {
1574                 qdisc_get_notify(net, skb, n, clid, q, NULL);
1575         }
1576         return 0;
1577 }
1578
1579 static bool req_create_or_replace(struct nlmsghdr *n)
1580 {
1581         return (n->nlmsg_flags & NLM_F_CREATE &&
1582                 n->nlmsg_flags & NLM_F_REPLACE);
1583 }
1584
1585 static bool req_create_exclusive(struct nlmsghdr *n)
1586 {
1587         return (n->nlmsg_flags & NLM_F_CREATE &&
1588                 n->nlmsg_flags & NLM_F_EXCL);
1589 }
1590
1591 static bool req_change(struct nlmsghdr *n)
1592 {
1593         return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1594                 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1595                 !(n->nlmsg_flags & NLM_F_EXCL));
1596 }
1597
1598 /*
1599  * Create/change qdisc.
1600  */
1601 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1602                            struct netlink_ext_ack *extack)
1603 {
1604         struct net *net = sock_net(skb->sk);
1605         struct tcmsg *tcm;
1606         struct nlattr *tca[TCA_MAX + 1];
1607         struct net_device *dev;
1608         u32 clid;
1609         struct Qdisc *q, *p;
1610         int err;
1611
1612 replay:
1613         /* Reinit, just in case something touches this. */
1614         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1615                                      rtm_tca_policy, extack);
1616         if (err < 0)
1617                 return err;
1618
1619         tcm = nlmsg_data(n);
1620         clid = tcm->tcm_parent;
1621         q = p = NULL;
1622
1623         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1624         if (!dev)
1625                 return -ENODEV;
1626
1627
1628         if (clid) {
1629                 if (clid != TC_H_ROOT) {
1630                         if (clid != TC_H_INGRESS) {
1631                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1632                                 if (!p) {
1633                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1634                                         return -ENOENT;
1635                                 }
1636                                 q = qdisc_leaf(p, clid);
1637                         } else if (dev_ingress_queue_create(dev)) {
1638                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1639                         }
1640                 } else {
1641                         q = rtnl_dereference(dev->qdisc);
1642                 }
1643
1644                 /* It may be default qdisc, ignore it */
1645                 if (q && q->handle == 0)
1646                         q = NULL;
1647
1648                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1649                         if (tcm->tcm_handle) {
1650                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1651                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1652                                         return -EEXIST;
1653                                 }
1654                                 if (TC_H_MIN(tcm->tcm_handle)) {
1655                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1656                                         return -EINVAL;
1657                                 }
1658                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1659                                 if (!q)
1660                                         goto create_n_graft;
1661                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1662                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1663                                         return -EEXIST;
1664                                 }
1665                                 if (tca[TCA_KIND] &&
1666                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1667                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1668                                         return -EINVAL;
1669                                 }
1670                                 if (q->flags & TCQ_F_INGRESS) {
1671                                         NL_SET_ERR_MSG(extack,
1672                                                        "Cannot regraft ingress or clsact Qdiscs");
1673                                         return -EINVAL;
1674                                 }
1675                                 if (q == p ||
1676                                     (p && check_loop(q, p, 0))) {
1677                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1678                                         return -ELOOP;
1679                                 }
1680                                 if (clid == TC_H_INGRESS) {
1681                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1682                                         return -EINVAL;
1683                                 }
1684                                 qdisc_refcount_inc(q);
1685                                 goto graft;
1686                         } else {
1687                                 if (!q)
1688                                         goto create_n_graft;
1689
1690                                 /* This magic test requires explanation.
1691                                  *
1692                                  *   We know, that some child q is already
1693                                  *   attached to this parent and have choice:
1694                                  *   1) change it or 2) create/graft new one.
1695                                  *   If the requested qdisc kind is different
1696                                  *   than the existing one, then we choose graft.
1697                                  *   If they are the same then this is "change"
1698                                  *   operation - just let it fallthrough..
1699                                  *
1700                                  *   1. We are allowed to create/graft only
1701                                  *   if the request is explicitly stating
1702                                  *   "please create if it doesn't exist".
1703                                  *
1704                                  *   2. If the request is to exclusive create
1705                                  *   then the qdisc tcm_handle is not expected
1706                                  *   to exist, so that we choose create/graft too.
1707                                  *
1708                                  *   3. The last case is when no flags are set.
1709                                  *   This will happen when for example tc
1710                                  *   utility issues a "change" command.
1711                                  *   Alas, it is sort of hole in API, we
1712                                  *   cannot decide what to do unambiguously.
1713                                  *   For now we select create/graft.
1714                                  */
1715                                 if (tca[TCA_KIND] &&
1716                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1717                                         if (req_create_or_replace(n) ||
1718                                             req_create_exclusive(n))
1719                                                 goto create_n_graft;
1720                                         else if (req_change(n))
1721                                                 goto create_n_graft2;
1722                                 }
1723                         }
1724                 }
1725         } else {
1726                 if (!tcm->tcm_handle) {
1727                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1728                         return -EINVAL;
1729                 }
1730                 q = qdisc_lookup(dev, tcm->tcm_handle);
1731         }
1732
1733         /* Change qdisc parameters */
1734         if (!q) {
1735                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1736                 return -ENOENT;
1737         }
1738         if (n->nlmsg_flags & NLM_F_EXCL) {
1739                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1740                 return -EEXIST;
1741         }
1742         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1743                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1744                 return -EINVAL;
1745         }
1746         err = qdisc_change(q, tca, extack);
1747         if (err == 0)
1748                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1749         return err;
1750
1751 create_n_graft:
1752         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1753                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1754                 return -ENOENT;
1755         }
1756 create_n_graft2:
1757         if (clid == TC_H_INGRESS) {
1758                 if (dev_ingress_queue(dev)) {
1759                         q = qdisc_create(dev, dev_ingress_queue(dev),
1760                                          tcm->tcm_parent, tcm->tcm_parent,
1761                                          tca, &err, extack);
1762                 } else {
1763                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1764                         err = -ENOENT;
1765                 }
1766         } else {
1767                 struct netdev_queue *dev_queue;
1768
1769                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1770                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1771                 else if (p)
1772                         dev_queue = p->dev_queue;
1773                 else
1774                         dev_queue = netdev_get_tx_queue(dev, 0);
1775
1776                 q = qdisc_create(dev, dev_queue,
1777                                  tcm->tcm_parent, tcm->tcm_handle,
1778                                  tca, &err, extack);
1779         }
1780         if (q == NULL) {
1781                 if (err == -EAGAIN)
1782                         goto replay;
1783                 return err;
1784         }
1785
1786 graft:
1787         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1788         if (err) {
1789                 if (q)
1790                         qdisc_put(q);
1791                 return err;
1792         }
1793
1794         return 0;
1795 }
1796
1797 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1798                               struct netlink_callback *cb,
1799                               int *q_idx_p, int s_q_idx, bool recur,
1800                               bool dump_invisible)
1801 {
1802         int ret = 0, q_idx = *q_idx_p;
1803         struct Qdisc *q;
1804         int b;
1805
1806         if (!root)
1807                 return 0;
1808
1809         q = root;
1810         if (q_idx < s_q_idx) {
1811                 q_idx++;
1812         } else {
1813                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1814                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1815                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1816                                   RTM_NEWQDISC, NULL) <= 0)
1817                         goto done;
1818                 q_idx++;
1819         }
1820
1821         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1822          * itself has already been dumped.
1823          *
1824          * If we've already dumped the top-level (ingress) qdisc above and the global
1825          * qdisc hashtable, we don't want to hit it again
1826          */
1827         if (!qdisc_dev(root) || !recur)
1828                 goto out;
1829
1830         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1831                 if (q_idx < s_q_idx) {
1832                         q_idx++;
1833                         continue;
1834                 }
1835                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1836                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1837                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1838                                   RTM_NEWQDISC, NULL) <= 0)
1839                         goto done;
1840                 q_idx++;
1841         }
1842
1843 out:
1844         *q_idx_p = q_idx;
1845         return ret;
1846 done:
1847         ret = -1;
1848         goto out;
1849 }
1850
1851 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1852 {
1853         struct net *net = sock_net(skb->sk);
1854         int idx, q_idx;
1855         int s_idx, s_q_idx;
1856         struct net_device *dev;
1857         const struct nlmsghdr *nlh = cb->nlh;
1858         struct nlattr *tca[TCA_MAX + 1];
1859         int err;
1860
1861         s_idx = cb->args[0];
1862         s_q_idx = q_idx = cb->args[1];
1863
1864         idx = 0;
1865         ASSERT_RTNL();
1866
1867         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1868                                      rtm_tca_policy, cb->extack);
1869         if (err < 0)
1870                 return err;
1871
1872         for_each_netdev(net, dev) {
1873                 struct netdev_queue *dev_queue;
1874
1875                 if (idx < s_idx)
1876                         goto cont;
1877                 if (idx > s_idx)
1878                         s_q_idx = 0;
1879                 q_idx = 0;
1880
1881                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1882                                        skb, cb, &q_idx, s_q_idx,
1883                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1884                         goto done;
1885
1886                 dev_queue = dev_ingress_queue(dev);
1887                 if (dev_queue &&
1888                     tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1889                                        skb, cb, &q_idx, s_q_idx, false,
1890                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1891                         goto done;
1892
1893 cont:
1894                 idx++;
1895         }
1896
1897 done:
1898         cb->args[0] = idx;
1899         cb->args[1] = q_idx;
1900
1901         return skb->len;
1902 }
1903
1904
1905
1906 /************************************************
1907  *      Traffic classes manipulation.           *
1908  ************************************************/
1909
1910 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1911                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1912                           int event, struct netlink_ext_ack *extack)
1913 {
1914         struct tcmsg *tcm;
1915         struct nlmsghdr  *nlh;
1916         unsigned char *b = skb_tail_pointer(skb);
1917         struct gnet_dump d;
1918         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1919
1920         cond_resched();
1921         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1922         if (!nlh)
1923                 goto out_nlmsg_trim;
1924         tcm = nlmsg_data(nlh);
1925         tcm->tcm_family = AF_UNSPEC;
1926         tcm->tcm__pad1 = 0;
1927         tcm->tcm__pad2 = 0;
1928         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1929         tcm->tcm_parent = q->handle;
1930         tcm->tcm_handle = q->handle;
1931         tcm->tcm_info = 0;
1932         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1933                 goto nla_put_failure;
1934         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1935                 goto nla_put_failure;
1936
1937         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1938                                          NULL, &d, TCA_PAD) < 0)
1939                 goto nla_put_failure;
1940
1941         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1942                 goto nla_put_failure;
1943
1944         if (gnet_stats_finish_copy(&d) < 0)
1945                 goto nla_put_failure;
1946
1947         if (extack && extack->_msg &&
1948             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1949                 goto out_nlmsg_trim;
1950
1951         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1952
1953         return skb->len;
1954
1955 out_nlmsg_trim:
1956 nla_put_failure:
1957         nlmsg_trim(skb, b);
1958         return -1;
1959 }
1960
1961 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1962                          struct nlmsghdr *n, struct Qdisc *q,
1963                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1964 {
1965         struct sk_buff *skb;
1966         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1967
1968         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1969                 return 0;
1970
1971         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1972         if (!skb)
1973                 return -ENOBUFS;
1974
1975         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1976                 kfree_skb(skb);
1977                 return -EINVAL;
1978         }
1979
1980         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1981                               n->nlmsg_flags & NLM_F_ECHO);
1982 }
1983
1984 static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
1985                              struct nlmsghdr *n, struct Qdisc *q,
1986                              unsigned long cl, struct netlink_ext_ack *extack)
1987 {
1988         struct sk_buff *skb;
1989         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1990
1991         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1992         if (!skb)
1993                 return -ENOBUFS;
1994
1995         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
1996                            extack) < 0) {
1997                 kfree_skb(skb);
1998                 return -EINVAL;
1999         }
2000
2001         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
2002                               n->nlmsg_flags & NLM_F_ECHO);
2003 }
2004
2005 static int tclass_del_notify(struct net *net,
2006                              const struct Qdisc_class_ops *cops,
2007                              struct sk_buff *oskb, struct nlmsghdr *n,
2008                              struct Qdisc *q, unsigned long cl,
2009                              struct netlink_ext_ack *extack)
2010 {
2011         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
2012         struct sk_buff *skb;
2013         int err = 0;
2014
2015         if (!cops->delete)
2016                 return -EOPNOTSUPP;
2017
2018         if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
2019                 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2020                 if (!skb)
2021                         return -ENOBUFS;
2022
2023                 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
2024                                    RTM_DELTCLASS, extack) < 0) {
2025                         kfree_skb(skb);
2026                         return -EINVAL;
2027                 }
2028         } else {
2029                 skb = NULL;
2030         }
2031
2032         err = cops->delete(q, cl, extack);
2033         if (err) {
2034                 kfree_skb(skb);
2035                 return err;
2036         }
2037
2038         err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
2039                                    n->nlmsg_flags & NLM_F_ECHO);
2040         return err;
2041 }
2042
2043 #ifdef CONFIG_NET_CLS
2044
2045 struct tcf_bind_args {
2046         struct tcf_walker w;
2047         unsigned long base;
2048         unsigned long cl;
2049         u32 classid;
2050 };
2051
2052 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
2053 {
2054         struct tcf_bind_args *a = (void *)arg;
2055
2056         if (n && tp->ops->bind_class) {
2057                 struct Qdisc *q = tcf_block_q(tp->chain->block);
2058
2059                 sch_tree_lock(q);
2060                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2061                 sch_tree_unlock(q);
2062         }
2063         return 0;
2064 }
2065
2066 struct tc_bind_class_args {
2067         struct qdisc_walker w;
2068         unsigned long new_cl;
2069         u32 portid;
2070         u32 clid;
2071 };
2072
2073 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2074                                 struct qdisc_walker *w)
2075 {
2076         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2077         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2078         struct tcf_block *block;
2079         struct tcf_chain *chain;
2080
2081         block = cops->tcf_block(q, cl, NULL);
2082         if (!block)
2083                 return 0;
2084         for (chain = tcf_get_next_chain(block, NULL);
2085              chain;
2086              chain = tcf_get_next_chain(block, chain)) {
2087                 struct tcf_proto *tp;
2088
2089                 for (tp = tcf_get_next_proto(chain, NULL);
2090                      tp; tp = tcf_get_next_proto(chain, tp)) {
2091                         struct tcf_bind_args arg = {};
2092
2093                         arg.w.fn = tcf_node_bind;
2094                         arg.classid = a->clid;
2095                         arg.base = cl;
2096                         arg.cl = a->new_cl;
2097                         tp->ops->walk(tp, &arg.w, true);
2098                 }
2099         }
2100
2101         return 0;
2102 }
2103
2104 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2105                            unsigned long new_cl)
2106 {
2107         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2108         struct tc_bind_class_args args = {};
2109
2110         if (!cops->tcf_block)
2111                 return;
2112         args.portid = portid;
2113         args.clid = clid;
2114         args.new_cl = new_cl;
2115         args.w.fn = tc_bind_class_walker;
2116         q->ops->cl_ops->walk(q, &args.w);
2117 }
2118
2119 #else
2120
2121 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2122                            unsigned long new_cl)
2123 {
2124 }
2125
2126 #endif
2127
2128 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2129                          struct netlink_ext_ack *extack)
2130 {
2131         struct net *net = sock_net(skb->sk);
2132         struct tcmsg *tcm = nlmsg_data(n);
2133         struct nlattr *tca[TCA_MAX + 1];
2134         struct net_device *dev;
2135         struct Qdisc *q = NULL;
2136         const struct Qdisc_class_ops *cops;
2137         unsigned long cl = 0;
2138         unsigned long new_cl;
2139         u32 portid;
2140         u32 clid;
2141         u32 qid;
2142         int err;
2143
2144         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2145                                      rtm_tca_policy, extack);
2146         if (err < 0)
2147                 return err;
2148
2149         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2150         if (!dev)
2151                 return -ENODEV;
2152
2153         /*
2154            parent == TC_H_UNSPEC - unspecified parent.
2155            parent == TC_H_ROOT   - class is root, which has no parent.
2156            parent == X:0         - parent is root class.
2157            parent == X:Y         - parent is a node in hierarchy.
2158            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2159
2160            handle == 0:0         - generate handle from kernel pool.
2161            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2162            handle == X:Y         - clear.
2163            handle == X:0         - root class.
2164          */
2165
2166         /* Step 1. Determine qdisc handle X:0 */
2167
2168         portid = tcm->tcm_parent;
2169         clid = tcm->tcm_handle;
2170         qid = TC_H_MAJ(clid);
2171
2172         if (portid != TC_H_ROOT) {
2173                 u32 qid1 = TC_H_MAJ(portid);
2174
2175                 if (qid && qid1) {
2176                         /* If both majors are known, they must be identical. */
2177                         if (qid != qid1)
2178                                 return -EINVAL;
2179                 } else if (qid1) {
2180                         qid = qid1;
2181                 } else if (qid == 0)
2182                         qid = rtnl_dereference(dev->qdisc)->handle;
2183
2184                 /* Now qid is genuine qdisc handle consistent
2185                  * both with parent and child.
2186                  *
2187                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2188                  */
2189                 if (portid)
2190                         portid = TC_H_MAKE(qid, portid);
2191         } else {
2192                 if (qid == 0)
2193                         qid = rtnl_dereference(dev->qdisc)->handle;
2194         }
2195
2196         /* OK. Locate qdisc */
2197         q = qdisc_lookup(dev, qid);
2198         if (!q)
2199                 return -ENOENT;
2200
2201         /* An check that it supports classes */
2202         cops = q->ops->cl_ops;
2203         if (cops == NULL)
2204                 return -EINVAL;
2205
2206         /* Now try to get class */
2207         if (clid == 0) {
2208                 if (portid == TC_H_ROOT)
2209                         clid = qid;
2210         } else
2211                 clid = TC_H_MAKE(qid, clid);
2212
2213         if (clid)
2214                 cl = cops->find(q, clid);
2215
2216         if (cl == 0) {
2217                 err = -ENOENT;
2218                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2219                     !(n->nlmsg_flags & NLM_F_CREATE))
2220                         goto out;
2221         } else {
2222                 switch (n->nlmsg_type) {
2223                 case RTM_NEWTCLASS:
2224                         err = -EEXIST;
2225                         if (n->nlmsg_flags & NLM_F_EXCL)
2226                                 goto out;
2227                         break;
2228                 case RTM_DELTCLASS:
2229                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2230                         /* Unbind the class with flilters with 0 */
2231                         tc_bind_tclass(q, portid, clid, 0);
2232                         goto out;
2233                 case RTM_GETTCLASS:
2234                         err = tclass_get_notify(net, skb, n, q, cl, extack);
2235                         goto out;
2236                 default:
2237                         err = -EINVAL;
2238                         goto out;
2239                 }
2240         }
2241
2242         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2243                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2244                 return -EOPNOTSUPP;
2245         }
2246
2247         new_cl = cl;
2248         err = -EOPNOTSUPP;
2249         if (cops->change)
2250                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2251         if (err == 0) {
2252                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2253                 /* We just create a new class, need to do reverse binding. */
2254                 if (cl != new_cl)
2255                         tc_bind_tclass(q, portid, clid, new_cl);
2256         }
2257 out:
2258         return err;
2259 }
2260
2261 struct qdisc_dump_args {
2262         struct qdisc_walker     w;
2263         struct sk_buff          *skb;
2264         struct netlink_callback *cb;
2265 };
2266
2267 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2268                             struct qdisc_walker *arg)
2269 {
2270         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2271
2272         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2273                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2274                               RTM_NEWTCLASS, NULL);
2275 }
2276
2277 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2278                                 struct tcmsg *tcm, struct netlink_callback *cb,
2279                                 int *t_p, int s_t)
2280 {
2281         struct qdisc_dump_args arg;
2282
2283         if (tc_qdisc_dump_ignore(q, false) ||
2284             *t_p < s_t || !q->ops->cl_ops ||
2285             (tcm->tcm_parent &&
2286              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2287                 (*t_p)++;
2288                 return 0;
2289         }
2290         if (*t_p > s_t)
2291                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2292         arg.w.fn = qdisc_class_dump;
2293         arg.skb = skb;
2294         arg.cb = cb;
2295         arg.w.stop  = 0;
2296         arg.w.skip = cb->args[1];
2297         arg.w.count = 0;
2298         q->ops->cl_ops->walk(q, &arg.w);
2299         cb->args[1] = arg.w.count;
2300         if (arg.w.stop)
2301                 return -1;
2302         (*t_p)++;
2303         return 0;
2304 }
2305
2306 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2307                                struct tcmsg *tcm, struct netlink_callback *cb,
2308                                int *t_p, int s_t, bool recur)
2309 {
2310         struct Qdisc *q;
2311         int b;
2312
2313         if (!root)
2314                 return 0;
2315
2316         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2317                 return -1;
2318
2319         if (!qdisc_dev(root) || !recur)
2320                 return 0;
2321
2322         if (tcm->tcm_parent) {
2323                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2324                 if (q && q != root &&
2325                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2326                         return -1;
2327                 return 0;
2328         }
2329         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2330                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2331                         return -1;
2332         }
2333
2334         return 0;
2335 }
2336
2337 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2338 {
2339         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2340         struct net *net = sock_net(skb->sk);
2341         struct netdev_queue *dev_queue;
2342         struct net_device *dev;
2343         int t, s_t;
2344
2345         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2346                 return 0;
2347         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2348         if (!dev)
2349                 return 0;
2350
2351         s_t = cb->args[0];
2352         t = 0;
2353
2354         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2355                                 skb, tcm, cb, &t, s_t, true) < 0)
2356                 goto done;
2357
2358         dev_queue = dev_ingress_queue(dev);
2359         if (dev_queue &&
2360             tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2361                                 skb, tcm, cb, &t, s_t, false) < 0)
2362                 goto done;
2363
2364 done:
2365         cb->args[0] = t;
2366
2367         dev_put(dev);
2368         return skb->len;
2369 }
2370
2371 #ifdef CONFIG_PROC_FS
2372 static int psched_show(struct seq_file *seq, void *v)
2373 {
2374         seq_printf(seq, "%08x %08x %08x %08x\n",
2375                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2376                    1000000,
2377                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2378
2379         return 0;
2380 }
2381
2382 static int __net_init psched_net_init(struct net *net)
2383 {
2384         struct proc_dir_entry *e;
2385
2386         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2387         if (e == NULL)
2388                 return -ENOMEM;
2389
2390         return 0;
2391 }
2392
2393 static void __net_exit psched_net_exit(struct net *net)
2394 {
2395         remove_proc_entry("psched", net->proc_net);
2396 }
2397 #else
2398 static int __net_init psched_net_init(struct net *net)
2399 {
2400         return 0;
2401 }
2402
2403 static void __net_exit psched_net_exit(struct net *net)
2404 {
2405 }
2406 #endif
2407
2408 static struct pernet_operations psched_net_ops = {
2409         .init = psched_net_init,
2410         .exit = psched_net_exit,
2411 };
2412
2413 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
2414 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2415 #endif
2416
2417 static int __init pktsched_init(void)
2418 {
2419         int err;
2420
2421         err = register_pernet_subsys(&psched_net_ops);
2422         if (err) {
2423                 pr_err("pktsched_init: "
2424                        "cannot initialize per netns operations\n");
2425                 return err;
2426         }
2427
2428         register_qdisc(&pfifo_fast_ops);
2429         register_qdisc(&pfifo_qdisc_ops);
2430         register_qdisc(&bfifo_qdisc_ops);
2431         register_qdisc(&pfifo_head_drop_qdisc_ops);
2432         register_qdisc(&mq_qdisc_ops);
2433         register_qdisc(&noqueue_qdisc_ops);
2434
2435         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2436         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2437         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2438                       0);
2439         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2440         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2441         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2442                       0);
2443
2444         tc_wrapper_init();
2445
2446         return 0;
2447 }
2448
2449 subsys_initcall(pktsched_init);