2 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
29 /* Stochastic Fairness Queuing algorithm.
30 =======================================
33 Paul E. McKenney "Stochastic Fairness Queuing",
34 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 Paul E. McKenney "Stochastic Fairness Queuing",
37 "Interworking: Research and Experience", v.2, 1991, p.113-131.
41 M. Shreedhar and George Varghese "Efficient Fair
42 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
45 This is not the thing that is usually called (W)FQ nowadays.
46 It does not use any timestamp mechanism, but instead
47 processes queues in round-robin order.
51 - It is very cheap. Both CPU and memory requirements are minimal.
55 - "Stochastic" -> It is not 100% fair.
56 When hash collisions occur, several flows are considered as one.
58 - "Round-robin" -> It introduces larger delays than virtual clock
59 based schemes, and should not be used for isolating interactive
60 traffic from non-interactive. It means, that this scheduler
61 should be used as leaf of CBQ or P3, which put interactive traffic
62 to higher priority band.
64 We still need true WFQ for top level CSZ, but using WFQ
65 for the best effort traffic is absolutely pointless:
66 SFQ is superior for this purpose.
69 This implementation limits :
70 - maximal queue length per flow to 127 packets.
73 - number of hash buckets to 65536.
75 It is easy to increase these values, but not in flight. */
/* Compile-time limits and scaling constants for SFQ.
 * sfq_index must be able to address both the slots[] array
 * (SFQ_MAX_FLOWS entries) and the dep[] array (SFQ_MAX_DEPTH + 1 entries),
 * hence the 0x10000 budget split below.
 */
77 #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
78 #define SFQ_DEFAULT_FLOWS 128
79 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
80 #define SFQ_EMPTY_SLOT 0xffff
81 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 /* We use 16 bits to store allot, and want to handle packets up to 64K
84 * Scale allot by 8 (1<<3) so that no overflow occurs.
/* SFQ_ALLOT_SIZE rounds a byte length up to "scaled quantum" units. */
86 #define SFQ_ALLOT_SHIFT 3
87 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
89 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
90 typedef u16 sfq_index;
93 * We dont use pointers to save space.
94 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
95 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
96 * are 'pointers' to dep[] array
/* Per-flow slot state.
 * NOTE(review): the "struct sfq_slot {" opening line is missing from this
 * listing; the fields below are presumably its body — confirm against the
 * original file. skblist_next/skblist_prev form an intrusive skb list whose
 * head is the slot itself (see the slot_queue_* helpers casting slot to
 * struct sk_buff *).
 */
104 struct sk_buff *skblist_next;
105 struct sk_buff *skblist_prev;
106 sfq_index qlen; /* number of skbs in skblist */
107 sfq_index next; /* next slot in sfq RR chain */
108 struct sfq_head dep; /* anchor in dep[] chains */
109 unsigned short hash; /* hash value (index in ht[]) */
110 short allot; /* credit for this slot */
/* Qdisc private state.
 * NOTE(review): several fields referenced elsewhere in this file
 * (perturbation, perturb_period, headdrop, cur_depth neighbors, etc.)
 * are missing from this listing — this struct is incomplete here.
 */
113 struct sfq_sched_data {
114 /* frequently used fields */
115 int limit; /* limit of total number of packets in this qdisc */
116 unsigned int divisor; /* number of slots in hash table */
117 unsigned int maxflows; /* number of flows in flows array */
119 int maxdepth; /* limit of packets per flow */
122 struct tcf_proto *filter_list;
123 sfq_index cur_depth; /* depth of longest slot */
124 unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
125 struct sfq_slot *tail; /* current slot in round */
126 sfq_index *ht; /* Hash table ('divisor' slots) */
127 struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
129 struct sfq_head dep[SFQ_MAX_DEPTH + 1];
130 /* Linked lists of slots, indexed by depth
131 * dep[0] : list of unused flows
132 * dep[1] : list of flows with 1 packet
133 * dep[X] : list of flows with X packets
137 unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
138 struct timer_list perturb_timer;
142 * sfq_head are either in a sfq_slot or in dep[] array
/* Translate an sfq_index 'pointer' into the sfq_head it names:
 * indexes below SFQ_MAX_FLOWS address slots[], the rest address dep[].
 */
144 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
146 if (val < SFQ_MAX_FLOWS)
147 return &q->slots[val].dep;
148 return &q->dep[val - SFQ_MAX_FLOWS];
152 * In order to be able to quickly rehash our queue when timer changes
153 * q->perturbation, we store flow_keys in skb->cb[]
156 struct flow_keys keys;
/* Return SFQ's private control block stored after qdisc_skb_cb in
 * skb->cb[]; the BUILD_BUG_ON proves the combined size fits.
 */
159 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
161 BUILD_BUG_ON(sizeof(skb->cb) <
162 sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
163 return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
/* Hash the previously-dissected flow keys, salted with q->perturbation,
 * and mask into the hash-table range (q->divisor is a power of two —
 * enforced in sfq_change).
 */
166 static unsigned int sfq_hash(const struct sfq_sched_data *q,
167 const struct sk_buff *skb)
169 const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
172 hash = jhash_3words((__force u32)keys->dst,
173 (__force u32)keys->src ^ keys->ip_proto,
174 (__force u32)keys->ports, q->perturbation);
175 return hash & (q->divisor - 1);
/* Map an skb to a 1-based hash bucket.
 * Order of precedence: explicit skb->priority class-id targeting this
 * qdisc, then the attached filter chain, then the built-in flow hash.
 * Returns 0 on filter drop/miss with *qerr set (missing lines; the
 * trailing return is not visible in this listing — confirm upstream).
 */
178 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
181 struct sfq_sched_data *q = qdisc_priv(sch);
182 struct tcf_result res;
185 if (TC_H_MAJ(skb->priority) == sch->handle &&
186 TC_H_MIN(skb->priority) > 0 &&
187 TC_H_MIN(skb->priority) <= q->divisor)
188 return TC_H_MIN(skb->priority);
/* No filters attached: dissect the flow and use the perturbed hash. */
190 if (!q->filter_list) {
191 skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
192 return sfq_hash(q, skb) + 1;
195 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
196 result = tc_classify(skb, q->filter_list, &res);
198 #ifdef CONFIG_NET_CLS_ACT
/* Packet consumed by an action: report STOLEN, not a local drop. */
202 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
207 if (TC_H_MIN(res.classid) <= q->divisor)
208 return TC_H_MIN(res.classid);
214 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
/* Insert slot x at the head of the dep[qlen] list so that slots are
 * always findable by their current queue depth.
 * NOTE(review): intermediate lines (declarations of p/n, two link
 * updates) are missing from this listing.
 */
216 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
219 struct sfq_slot *slot = &q->slots[x];
220 int qlen = slot->qlen;
222 p = qlen + SFQ_MAX_FLOWS;
223 n = q->dep[qlen].next;
228 q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
229 sfq_dep_head(q, n)->prev = x;
/* Unlink slot x from its dep[] chain; leaves the indexes of its former
 * neighbours in n and p so callers can test for list emptiness.
 * Macro (not inline fn) because it writes through its n/p arguments.
 */
232 #define sfq_unlink(q, x, n, p) \
233 n = q->slots[x].dep.next; \
234 p = q->slots[x].dep.prev; \
235 sfq_dep_head(q, p)->next = n; \
236 sfq_dep_head(q, n)->prev = p
/* Account one packet leaving slot x: unlink, decrement qlen, and drop
 * cur_depth if x was the last slot at the maximum depth (n == p means
 * its old dep list became empty). Relinking at the new depth happens on
 * lines missing from this listing — confirm upstream.
 */
239 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
244 sfq_unlink(q, x, n, p);
246 d = q->slots[x].qlen--;
247 if (n == p && q->cur_depth == d)
/* Account one packet arriving at slot x: unlink, increment qlen, and
 * raise cur_depth when this slot becomes the deepest. The relink at the
 * new depth is on lines missing from this listing.
 */
252 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
257 sfq_unlink(q, x, n, p);
259 d = ++q->slots[x].qlen;
260 if (q->cur_depth < d)
265 /* helper functions : might be changed when/if skb use a standard list_head */
267 /* remove one skb from tail of slot queue */
/* Remove and return the skb at the tail of the slot's intrusive list.
 * The slot itself acts as the list head (cast to struct sk_buff *).
 */
268 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
270 struct sk_buff *skb = slot->skblist_prev;
272 slot->skblist_prev = skb->prev;
273 skb->prev->next = (struct sk_buff *)slot;
274 skb->next = skb->prev = NULL;
278 /* remove one skb from head of slot queue */
/* Remove and return the skb at the head of the slot's intrusive list. */
279 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
281 struct sk_buff *skb = slot->skblist_next;
283 slot->skblist_next = skb->next;
284 skb->next->prev = (struct sk_buff *)slot;
285 skb->next = skb->prev = NULL;
/* Zero the slot and make its skb list empty (both pointers loop back to
 * the slot-as-list-head sentinel).
 */
289 static inline void slot_queue_init(struct sfq_slot *slot)
291 memset(slot, 0, sizeof(*slot));
292 slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
295 /* add skb to slot queue (tail add) */
/* Append skb at the tail of the slot's intrusive skb list. */
296 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
298 skb->prev = slot->skblist_prev;
299 skb->next = (struct sk_buff *)slot;
300 slot->skblist_prev->next = skb;
301 slot->skblist_prev = skb;
/* Iterate skbs in a slot without removing them; the loop-advance clause
 * (presumably skb = skb->next) is on a line missing from this listing.
 */
304 #define slot_queue_walk(slot, skb) \
305 for (skb = slot->skblist_next; \
306 skb != (struct sk_buff *)slot; \
/* Drop one packet when the qdisc is full: pick a slot at the current
 * maximum depth and drop from its head (headdrop mode) or tail.
 * Returns the dropped packet's length (return statement missing from
 * this listing). The d == 1 path also removes the now-singleton slot
 * from the round-robin ring and hash table.
 */
309 static unsigned int sfq_drop(struct Qdisc *sch)
311 struct sfq_sched_data *q = qdisc_priv(sch);
312 sfq_index x, d = q->cur_depth;
315 struct sfq_slot *slot;
317 /* Queue is full! Find the longest slot and drop tail packet from it */
322 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
323 len = qdisc_pkt_len(skb);
328 sch->qstats.backlog -= len;
333 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
336 q->tail->next = slot->next;
337 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
/* Enqueue path: classify to a bucket, allocate a free slot on first use
 * of a bucket, enforce per-flow depth (dropping head or the new skb
 * depending on headdrop), link new flows into the RR ring, and trigger
 * sfq_drop when over the global limit. Many connective lines are
 * missing from this listing; do not assume exact control flow.
 */
345 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
347 struct sfq_sched_data *q = qdisc_priv(sch);
350 struct sfq_slot *slot;
351 int uninitialized_var(ret);
353 hash = sfq_classify(skb, sch, &ret);
355 if (ret & __NET_XMIT_BYPASS)
/* Bucket empty: take a slot from the free list (dep[0]). */
364 if (x == SFQ_EMPTY_SLOT) {
365 x = q->dep[0].next; /* get a free slot */
366 if (x >= SFQ_MAX_FLOWS)
367 return qdisc_drop(skb, sch);
/* Flow at per-flow depth limit: in headdrop mode evict the oldest
 * packet to make room; otherwise drop the incoming skb. */
373 if (slot->qlen >= q->maxdepth) {
374 struct sk_buff *head;
377 return qdisc_drop(skb, sch);
379 head = slot_dequeue_head(slot);
380 sch->qstats.backlog -= qdisc_pkt_len(head);
381 qdisc_drop(head, sch);
383 sch->qstats.backlog += qdisc_pkt_len(skb);
384 slot_queue_add(slot, skb);
388 sch->qstats.backlog += qdisc_pkt_len(skb);
389 slot_queue_add(slot, skb);
391 if (slot->qlen == 1) { /* The flow is new */
392 if (q->tail == NULL) { /* It is the first flow */
396 slot->next = q->tail->next;
399 slot->allot = q->scaled_quantum;
401 if (++sch->q.qlen <= q->limit)
402 return NET_XMIT_SUCCESS;
406 /* Return Congestion Notification only if we dropped a packet
409 if (qlen != slot->qlen)
412 /* As we dropped a packet, better let upper stack know this */
413 qdisc_tree_decrease_qlen(sch, 1);
414 return NET_XMIT_SUCCESS;
/* Dequeue path (deficit round robin): if the current slot has exhausted
 * its allot, refill it and advance to the next slot; otherwise take the
 * head packet, charge its scaled size against the allot, and retire the
 * slot from the ring when it empties. Connective lines are missing from
 * this listing.
 */
417 static struct sk_buff *
418 sfq_dequeue(struct Qdisc *sch)
420 struct sfq_sched_data *q = qdisc_priv(sch);
423 struct sfq_slot *slot;
425 /* No active slots */
/* Out of credit: replenish and move on (DRR round boundary). */
432 if (slot->allot <= 0) {
434 slot->allot += q->scaled_quantum;
437 skb = slot_dequeue_head(slot);
439 qdisc_bstats_update(sch, skb);
441 sch->qstats.backlog -= qdisc_pkt_len(skb);
443 /* Is the slot empty? */
444 if (slot->qlen == 0) {
445 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
448 q->tail = NULL; /* no more active slots */
451 q->tail->next = next_a;
453 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
/* Flush the qdisc by repeatedly dequeuing and freeing every packet. */
459 sfq_reset(struct Qdisc *sch)
463 while ((skb = sfq_dequeue(sch)) != NULL)
468 * When q->perturbation is changed, we rehash all queued skbs
469 * to avoid OOO (Out Of Order) effects.
470 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
/* Rehash: drain every occupied slot into a temporary list (preserving
 * intra-flow order), then re-insert each skb under the new hash,
 * dropping when no free slot exists or a flow hits maxdepth. Several
 * connective lines are missing from this listing.
 */
473 static void sfq_rehash(struct Qdisc *sch)
475 struct sfq_sched_data *q = qdisc_priv(sch);
478 struct sfq_slot *slot;
479 struct sk_buff_head list;
482 __skb_queue_head_init(&list);
484 for (i = 0; i < q->maxflows; i++) {
489 skb = slot_dequeue_head(slot);
491 __skb_queue_tail(&list, skb);
493 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
497 while ((skb = __skb_dequeue(&list)) != NULL) {
498 unsigned int hash = sfq_hash(q, skb);
499 sfq_index x = q->ht[hash];
502 if (x == SFQ_EMPTY_SLOT) {
503 x = q->dep[0].next; /* get a free slot */
504 if (x >= SFQ_MAX_FLOWS) {
/* No free slot: count the drop and release the skb. */
505 drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
514 if (slot->qlen >= q->maxdepth)
516 slot_queue_add(slot, skb);
518 if (slot->qlen == 1) { /* The flow is new */
519 if (q->tail == NULL) { /* It is the first flow */
522 slot->next = q->tail->next;
526 slot->allot = q->scaled_quantum;
529 sch->q.qlen -= dropped;
530 qdisc_tree_decrease_qlen(sch, dropped);
/* Timer callback: pick a new perturbation seed under the root qdisc
 * lock and rehash queued packets (only when the built-in hash is in
 * use, i.e. no filter list), then re-arm if perturb_period is set.
 * NOTE(review): perturb_period is read after the lock is released —
 * verify against the original that this is intended.
 */
533 static void sfq_perturbation(unsigned long arg)
535 struct Qdisc *sch = (struct Qdisc *)arg;
536 struct sfq_sched_data *q = qdisc_priv(sch);
537 spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
539 spin_lock(root_lock);
540 q->perturbation = net_random();
541 if (!q->filter_list && q->tail)
543 spin_unlock(root_lock);
545 if (q->perturb_period)
546 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
/* Apply netlink configuration. Accepts the legacy tc_sfq_qopt or the
 * extended v1 form (detected by attribute length). divisor must be a
 * power of two <= 65536; limit/maxflows/maxdepth are clamped against
 * each other. Excess packets are dropped under the tree lock and the
 * perturbation timer re-armed. Connective lines missing from listing.
 */
549 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
551 struct sfq_sched_data *q = qdisc_priv(sch);
552 struct tc_sfq_qopt *ctl = nla_data(opt);
553 struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
556 if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
558 if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
559 ctl_v1 = nla_data(opt);
561 (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
566 q->quantum = ctl->quantum;
567 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
569 q->perturb_period = ctl->perturb_period * HZ;
571 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
573 q->divisor = ctl->divisor;
574 q->maxflows = min_t(u32, q->maxflows, q->divisor);
578 q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
579 q->headdrop = ctl_v1->headdrop;
582 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
583 q->maxflows = min_t(u32, q->maxflows, q->limit);
/* Shrink the queue to the (possibly reduced) limit. */
587 while (sch->q.qlen > q->limit)
589 qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
591 del_timer(&q->perturb_timer);
592 if (q->perturb_period) {
593 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
594 q->perturbation = net_random();
596 sch_tree_unlock(sch);
/* Allocate sz bytes, trying kmalloc first without warning on failure.
 * NOTE(review): the missing lines presumably fall back to vmalloc — the
 * is_vmalloc_addr() test in sfq_free supports this; confirm upstream.
 */
600 static void *sfq_alloc(size_t sz)
602 void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
/* Free memory from sfq_alloc, dispatching on whether it came from
 * vmalloc or kmalloc (kfree branch missing from this listing).
 */
609 static void sfq_free(void *addr)
612 if (is_vmalloc_addr(addr))
/* Qdisc teardown: release the filter chain and stop the perturbation
 * timer (perturb_period is cleared first so the timer callback cannot
 * re-arm itself before del_timer_sync completes).
 */
619 static void sfq_destroy(struct Qdisc *sch)
621 struct sfq_sched_data *q = qdisc_priv(sch);
623 tcf_destroy_chain(&q->filter_list);
624 q->perturb_period = 0;
625 del_timer_sync(&q->perturb_timer);
/* Qdisc initialization: set defaults, apply optional netlink config via
 * sfq_change, allocate the hash table and slots arrays, and mark every
 * bucket empty / every slot free. Error-path lines (sfq_destroy on
 * allocation failure) are missing from this listing.
 */
630 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
632 struct sfq_sched_data *q = qdisc_priv(sch);
635 q->perturb_timer.function = sfq_perturbation;
636 q->perturb_timer.data = (unsigned long)sch;
637 init_timer_deferrable(&q->perturb_timer);
/* dep[i] lists start empty: each points to itself via the
 * index-encoding described above sfq_dep_head(). */
639 for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
640 q->dep[i].next = i + SFQ_MAX_FLOWS;
641 q->dep[i].prev = i + SFQ_MAX_FLOWS;
644 q->limit = SFQ_MAX_DEPTH;
645 q->maxdepth = SFQ_MAX_DEPTH;
648 q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
649 q->maxflows = SFQ_DEFAULT_FLOWS;
650 q->quantum = psched_mtu(qdisc_dev(sch));
651 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
652 q->perturb_period = 0;
653 q->perturbation = net_random();
656 int err = sfq_change(sch, opt);
661 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
662 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
663 if (!q->ht || !q->slots) {
667 for (i = 0; i < q->divisor; i++)
668 q->ht[i] = SFQ_EMPTY_SLOT;
670 for (i = 0; i < q->maxflows; i++) {
671 slot_queue_init(&q->slots[i]);
675 sch->flags |= TCQ_F_CAN_BYPASS;
677 sch->flags &= ~TCQ_F_CAN_BYPASS;
/* Dump current configuration to netlink as a tc_sfq_qopt_v1 (which
 * embeds the legacy v0 struct, keeping old userspace compatible).
 * NLA_PUT jumps to an nla_put_failure label missing from this listing.
 */
681 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
683 struct sfq_sched_data *q = qdisc_priv(sch);
684 unsigned char *b = skb_tail_pointer(skb);
685 struct tc_sfq_qopt_v1 opt;
687 memset(&opt, 0, sizeof(opt));
688 opt.v0.quantum = q->quantum;
689 opt.v0.perturb_period = q->perturb_period / HZ;
690 opt.v0.limit = q->limit;
691 opt.v0.divisor = q->divisor;
692 opt.v0.flows = q->maxflows;
693 opt.depth = q->maxdepth;
694 opt.headdrop = q->headdrop;
696 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
/* Minimal Qdisc_class_ops hooks: SFQ has no real child classes, so
 * leaf/get/put are stubs; bind disables the fast-path bypass so bound
 * filters always see packets. Bodies are largely missing from this
 * listing (sampled lines only).
 */
705 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
710 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
715 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
718 /* we cannot bypass queue discipline anymore */
719 sch->flags &= ~TCQ_F_CAN_BYPASS;
723 static void sfq_put(struct Qdisc *q, unsigned long cl)
/* Expose the filter list head so tc can attach classifiers. */
727 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
729 struct sfq_sched_data *q = qdisc_priv(sch);
733 return &q->filter_list;
736 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
737 struct sk_buff *skb, struct tcmsg *tcm)
739 tcm->tcm_handle |= TC_H_MIN(cl);
/* Per-"class" (hash bucket) stats: cl is a 1-based bucket number, so
 * index ht[] with cl - 1. Walks the slot's skb list to compute backlog.
 */
743 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
746 struct sfq_sched_data *q = qdisc_priv(sch);
747 sfq_index idx = q->ht[cl - 1];
748 struct gnet_stats_queue qs = { 0 };
749 struct tc_sfq_xstats xstats = { 0 };
752 if (idx != SFQ_EMPTY_SLOT) {
753 const struct sfq_slot *slot = &q->slots[idx];
/* Un-scale allot for userspace reporting. */
755 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
756 qs.qlen = slot->qlen;
757 slot_queue_walk(slot, skb)
758 qs.backlog += qdisc_pkt_len(skb);
760 if (gnet_stats_copy_queue(d, &qs) < 0)
762 return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
/* Iterate occupied hash buckets for class enumeration, honoring the
 * walker's skip/count protocol (count-increment lines missing from this
 * listing).
 */
765 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
767 struct sfq_sched_data *q = qdisc_priv(sch);
773 for (i = 0; i < q->divisor; i++) {
774 if (q->ht[i] == SFQ_EMPTY_SLOT ||
775 arg->count < arg->skip) {
779 if (arg->fn(sch, i + 1, arg) < 0) {
/* Ops tables and module registration glue. Several initializer lines
 * (.id, .init, .reset, .change, .dump, etc.) are missing from this
 * listing — the tables below are incomplete as shown.
 */
787 static const struct Qdisc_class_ops sfq_class_ops = {
791 .tcf_chain = sfq_find_tcf,
792 .bind_tcf = sfq_bind,
793 .unbind_tcf = sfq_put,
794 .dump = sfq_dump_class,
795 .dump_stats = sfq_dump_class_stats,
799 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
800 .cl_ops = &sfq_class_ops,
802 .priv_size = sizeof(struct sfq_sched_data),
803 .enqueue = sfq_enqueue,
804 .dequeue = sfq_dequeue,
805 .peek = qdisc_peek_dequeued,
809 .destroy = sfq_destroy,
812 .owner = THIS_MODULE,
/* Register/unregister the "sfq" qdisc with the packet scheduler core. */
815 static int __init sfq_module_init(void)
817 return register_qdisc(&sfq_qdisc_ops);
819 static void __exit sfq_module_exit(void)
821 unregister_qdisc(&sfq_qdisc_ops);
823 module_init(sfq_module_init)
824 module_exit(sfq_module_exit)
825 MODULE_LICENSE("GPL");