2 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
29 /* Stochastic Fairness Queuing algorithm.
30 =======================================
33 Paul E. McKenney "Stochastic Fairness Queuing",
34 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 Paul E. McKenney "Stochastic Fairness Queuing",
37 "Interworking: Research and Experience", v.2, 1991, p.113-131.
41 M. Shreedhar and George Varghese "Efficient Fair
42 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
45 This is not the thing that is usually called (W)FQ nowadays.
46 It does not use any timestamp mechanism, but instead
47 processes queues in round-robin order.
51 - It is very cheap. Both CPU and memory requirements are minimal.
55 - "Stochastic" -> It is not 100% fair.
56 When hash collisions occur, several flows are considered as one.
58 - "Round-robin" -> It introduces larger delays than virtual clock
59 based schemes, and should not be used for isolating interactive
60 traffic from non-interactive. It means, that this scheduler
61 should be used as leaf of CBQ or P3, which put interactive traffic
62 to higher priority band.
64 We still need true WFQ for top level CSZ, but using WFQ
65 for the best effort traffic is absolutely pointless:
66 SFQ is superior for this purpose.
69 This implementation limits :
70 - maximal queue length per flow to 127 packets.
73 - number of hash buckets to 65536.
75 It is easy to increase these values, but not in flight. */
/* Compile-time limits and scaling constants for SFQ.
 * sfq_index must be able to address both the slots[] array
 * (SFQ_MAX_FLOWS entries) and the dep[] array (SFQ_MAX_DEPTH + 1 entries),
 * hence the 0x10000 budget split below.
 */
77 #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
78 #define SFQ_DEFAULT_FLOWS 128
79 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
80 #define SFQ_EMPTY_SLOT 0xffff
81 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 /* We use 16 bits to store allot, and want to handle packets up to 64K
84 * Scale allot by 8 (1<<3) so that no overflow occurs.
/* SFQ_ALLOT_SIZE rounds a byte length up to "scaled quantum" units. */
86 #define SFQ_ALLOT_SHIFT 3
87 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
89 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
90 typedef u16 sfq_index;
93 * We dont use pointers to save space.
94 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
95 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
96 * are 'pointers' to dep[] array
/* Per-flow slot state.
 * NOTE(review): the "struct sfq_slot {" opening line is missing from this
 * listing; the fields below are presumably its body — confirm against the
 * original file. skblist_next/skblist_prev form an intrusive skb list whose
 * head is the slot itself (see the slot_queue_* helpers casting slot to
 * struct sk_buff *).
 */
104 struct sk_buff *skblist_next;
105 struct sk_buff *skblist_prev;
106 sfq_index qlen; /* number of skbs in skblist */
107 sfq_index next; /* next slot in sfq RR chain */
108 struct sfq_head dep; /* anchor in dep[] chains */
109 unsigned short hash; /* hash value (index in ht[]) */
110 short allot; /* credit for this slot */
/* Qdisc private state.
 * NOTE(review): several fields referenced elsewhere in this file
 * (perturbation, perturb_period, headdrop, cur_depth neighbors, etc.)
 * are missing from this listing — this struct is incomplete here.
 */
113 struct sfq_sched_data {
114 /* frequently used fields */
115 int limit; /* limit of total number of packets in this qdisc */
116 unsigned int divisor; /* number of slots in hash table */
117 unsigned int maxflows; /* number of flows in flows array */
119 int maxdepth; /* limit of packets per flow */
122 struct tcf_proto *filter_list;
123 sfq_index cur_depth; /* depth of longest slot */
124 unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
125 struct sfq_slot *tail; /* current slot in round */
126 sfq_index *ht; /* Hash table ('divisor' slots) */
127 struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
129 struct sfq_head dep[SFQ_MAX_DEPTH + 1];
130 /* Linked lists of slots, indexed by depth
131 * dep[0] : list of unused flows
132 * dep[1] : list of flows with 1 packet
133 * dep[X] : list of flows with X packets
137 unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
138 struct timer_list perturb_timer;
142 * sfq_head are either in a sfq_slot or in dep[] array
/* Translate an sfq_index 'pointer' into the sfq_head it names:
 * indexes below SFQ_MAX_FLOWS address slots[], the rest address dep[].
 */
144 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
146 if (val < SFQ_MAX_FLOWS)
147 return &q->slots[val].dep;
148 return &q->dep[val - SFQ_MAX_FLOWS];
152 * In order to be able to quickly rehash our queue when timer changes
153 * q->perturbation, we store flow_keys in skb->cb[]
156 struct flow_keys keys;
/* Return SFQ's private control block stored after qdisc_skb_cb in
 * skb->cb[]; the BUILD_BUG_ON proves the combined size fits.
 */
159 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
161 BUILD_BUG_ON(sizeof(skb->cb) <
162 sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
163 return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
/* Hash the previously-dissected flow keys, salted with q->perturbation,
 * and mask into the hash-table range (q->divisor is a power of two —
 * enforced in sfq_change).
 */
166 static unsigned int sfq_hash(const struct sfq_sched_data *q,
167 const struct sk_buff *skb)
169 const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
172 hash = jhash_3words((__force u32)keys->dst,
173 (__force u32)keys->src ^ keys->ip_proto,
174 (__force u32)keys->ports, q->perturbation);
175 return hash & (q->divisor - 1);
/* Map an skb to a 1-based hash bucket.
 * Order of precedence: explicit skb->priority class-id targeting this
 * qdisc, then the attached filter chain, then the built-in flow hash.
 * Returns 0 on filter drop/miss with *qerr set (missing lines; the
 * trailing return is not visible in this listing — confirm upstream).
 */
178 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
181 struct sfq_sched_data *q = qdisc_priv(sch);
182 struct tcf_result res;
185 if (TC_H_MAJ(skb->priority) == sch->handle &&
186 TC_H_MIN(skb->priority) > 0 &&
187 TC_H_MIN(skb->priority) <= q->divisor)
188 return TC_H_MIN(skb->priority);
/* No filters attached: dissect the flow and use the perturbed hash. */
190 if (!q->filter_list) {
191 skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
192 return sfq_hash(q, skb) + 1;
195 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
196 result = tc_classify(skb, q->filter_list, &res);
198 #ifdef CONFIG_NET_CLS_ACT
/* Packet consumed by an action: report STOLEN, not a local drop. */
202 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
207 if (TC_H_MIN(res.classid) <= q->divisor)
208 return TC_H_MIN(res.classid);
214 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
/* Insert slot x at the head of the dep[qlen] list so that slots are
 * always findable by their current queue depth.
 * NOTE(review): intermediate lines (declarations of p/n, two link
 * updates) are missing from this listing.
 */
216 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
219 struct sfq_slot *slot = &q->slots[x];
220 int qlen = slot->qlen;
222 p = qlen + SFQ_MAX_FLOWS;
223 n = q->dep[qlen].next;
228 q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
229 sfq_dep_head(q, n)->prev = x;
/* Unlink slot x from its dep[] chain; leaves the indexes of its former
 * neighbours in n and p so callers can test for list emptiness.
 * Macro (not inline fn) because it writes through its n/p arguments.
 */
232 #define sfq_unlink(q, x, n, p) \
233 n = q->slots[x].dep.next; \
234 p = q->slots[x].dep.prev; \
235 sfq_dep_head(q, p)->next = n; \
236 sfq_dep_head(q, n)->prev = p
/* Account one packet leaving slot x: unlink, decrement qlen, and drop
 * cur_depth if x was the last slot at the maximum depth (n == p means
 * its old dep list became empty). Relinking at the new depth happens on
 * lines missing from this listing — confirm upstream.
 */
239 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
244 sfq_unlink(q, x, n, p);
246 d = q->slots[x].qlen--;
247 if (n == p && q->cur_depth == d)
/* Account one packet arriving at slot x: unlink, increment qlen, and
 * raise cur_depth when this slot becomes the deepest. The relink at the
 * new depth is on lines missing from this listing.
 */
252 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
257 sfq_unlink(q, x, n, p);
259 d = ++q->slots[x].qlen;
260 if (q->cur_depth < d)
265 /* helper functions : might be changed when/if skb use a standard list_head */
267 /* remove one skb from tail of slot queue */
/* Remove and return the skb at the tail of the slot's intrusive list.
 * The slot itself acts as the list head (cast to struct sk_buff *).
 */
268 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
270 struct sk_buff *skb = slot->skblist_prev;
272 slot->skblist_prev = skb->prev;
273 skb->prev->next = (struct sk_buff *)slot;
274 skb->next = skb->prev = NULL;
278 /* remove one skb from head of slot queue */
/* Remove and return the skb at the head of the slot's intrusive list. */
279 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
281 struct sk_buff *skb = slot->skblist_next;
283 slot->skblist_next = skb->next;
284 skb->next->prev = (struct sk_buff *)slot;
285 skb->next = skb->prev = NULL;
/* Zero the slot and make its skb list empty (both pointers loop back to
 * the slot-as-list-head sentinel).
 */
289 static inline void slot_queue_init(struct sfq_slot *slot)
291 memset(slot, 0, sizeof(*slot));
292 slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
295 /* add skb to slot queue (tail add) */
/* Append skb at the tail of the slot's intrusive skb list. */
296 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
298 skb->prev = slot->skblist_prev;
299 skb->next = (struct sk_buff *)slot;
300 slot->skblist_prev->next = skb;
301 slot->skblist_prev = skb;
/* Iterate skbs in a slot without removing them; the loop-advance clause
 * (presumably skb = skb->next) is on a line missing from this listing.
 */
304 #define slot_queue_walk(slot, skb) \
305 for (skb = slot->skblist_next; \
306 skb != (struct sk_buff *)slot; \
/* Drop one packet when the qdisc is full: pick a slot at the current
 * maximum depth and drop from its head (headdrop mode) or tail.
 * Returns the dropped packet's length (return statement missing from
 * this listing). The d == 1 path also removes the now-singleton slot
 * from the round-robin ring and hash table.
 */
309 static unsigned int sfq_drop(struct Qdisc *sch)
311 struct sfq_sched_data *q = qdisc_priv(sch);
312 sfq_index x, d = q->cur_depth;
315 struct sfq_slot *slot;
317 /* Queue is full! Find the longest slot and drop tail packet from it */
322 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
323 len = qdisc_pkt_len(skb);
328 sch->qstats.backlog -= len;
333 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
336 q->tail->next = slot->next;
337 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
/* Enqueue path: classify to a bucket, allocate a free slot on first use
 * of a bucket, enforce per-flow depth (dropping head or the new skb
 * depending on headdrop), link new flows into the RR ring, and trigger
 * sfq_drop when over the global limit. Many connective lines are
 * missing from this listing; do not assume exact control flow.
 */
345 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
347 struct sfq_sched_data *q = qdisc_priv(sch);
350 struct sfq_slot *slot;
351 int uninitialized_var(ret);
353 hash = sfq_classify(skb, sch, &ret);
355 if (ret & __NET_XMIT_BYPASS)
/* Bucket empty: take a slot from the free list (dep[0]). */
364 if (x == SFQ_EMPTY_SLOT) {
365 x = q->dep[0].next; /* get a free slot */
366 if (x >= SFQ_MAX_FLOWS)
367 return qdisc_drop(skb, sch);
/* Flow at per-flow depth limit: in headdrop mode evict the oldest
 * packet to make room; otherwise drop the incoming skb. */
373 if (slot->qlen >= q->maxdepth) {
374 struct sk_buff *head;
377 return qdisc_drop(skb, sch);
379 head = slot_dequeue_head(slot);
380 sch->qstats.backlog -= qdisc_pkt_len(head);
381 qdisc_drop(head, sch);
383 sch->qstats.backlog += qdisc_pkt_len(skb);
384 slot_queue_add(slot, skb);
388 sch->qstats.backlog += qdisc_pkt_len(skb);
389 slot_queue_add(slot, skb);
391 if (slot->qlen == 1) { /* The flow is new */
392 if (q->tail == NULL) { /* It is the first flow */
396 slot->next = q->tail->next;
399 slot->allot = q->scaled_quantum;
401 if (++sch->q.qlen <= q->limit)
402 return NET_XMIT_SUCCESS;
406 /* Return Congestion Notification only if we dropped a packet
409 if (qlen != slot->qlen)
412 /* As we dropped a packet, better let upper stack know this */
413 qdisc_tree_decrease_qlen(sch, 1);
414 return NET_XMIT_SUCCESS;
/* Dequeue path (deficit round robin): if the current slot has exhausted
 * its allot, refill it and advance to the next slot; otherwise take the
 * head packet, charge its scaled size against the allot, and retire the
 * slot from the ring when it empties. Connective lines are missing from
 * this listing.
 */
417 static struct sk_buff *
418 sfq_dequeue(struct Qdisc *sch)
420 struct sfq_sched_data *q = qdisc_priv(sch);
423 struct sfq_slot *slot;
425 /* No active slots */
/* Out of credit: replenish and move on (DRR round boundary). */
432 if (slot->allot <= 0) {
434 slot->allot += q->scaled_quantum;
437 skb = slot_dequeue_head(slot);
439 qdisc_bstats_update(sch, skb);
441 sch->qstats.backlog -= qdisc_pkt_len(skb);
443 /* Is the slot empty? */
444 if (slot->qlen == 0) {
445 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
448 q->tail = NULL; /* no more active slots */
451 q->tail->next = next_a;
453 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
/* Flush the qdisc by repeatedly dequeuing and freeing every packet. */
459 sfq_reset(struct Qdisc *sch)
463 while ((skb = sfq_dequeue(sch)) != NULL)
468 * When q->perturbation is changed, we rehash all queued skbs
469 * to avoid OOO (Out Of Order) effects.
470 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
/* Rehash: drain every occupied slot into a temporary list (preserving
 * intra-flow order), then re-insert each skb under the new hash,
 * dropping when no free slot exists or a flow hits maxdepth. Several
 * connective lines are missing from this listing.
 */
473 static void sfq_rehash(struct Qdisc *sch)
475 struct sfq_sched_data *q = qdisc_priv(sch);
478 struct sfq_slot *slot;
479 struct sk_buff_head list;
482 __skb_queue_head_init(&list);
484 for (i = 0; i < q->maxflows; i++) {
489 skb = slot_dequeue_head(slot);
491 __skb_queue_tail(&list, skb);
493 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
497 while ((skb = __skb_dequeue(&list)) != NULL) {
498 unsigned int hash = sfq_hash(q, skb);
499 sfq_index x = q->ht[hash];
502 if (x == SFQ_EMPTY_SLOT) {
503 x = q->dep[0].next; /* get a free slot */
504 if (x >= SFQ_MAX_FLOWS) {
/* No free slot: count the drop and release the skb. */
505 drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
514 if (slot->qlen >= q->maxdepth)
516 slot_queue_add(slot, skb);
518 if (slot->qlen == 1) { /* The flow is new */
519 if (q->tail == NULL) { /* It is the first flow */
522 slot->next = q->tail->next;
526 slot->allot = q->scaled_quantum;
529 sch->q.qlen -= dropped;
530 qdisc_tree_decrease_qlen(sch, dropped);
/* Timer callback: pick a new perturbation seed under the root qdisc
 * lock and rehash queued packets (only when the built-in hash is in
 * use, i.e. no filter list), then re-arm if perturb_period is set.
 * NOTE(review): perturb_period is read after the lock is released —
 * verify against the original that this is intended.
 */
533 static void sfq_perturbation(unsigned long arg)
535 struct Qdisc *sch = (struct Qdisc *)arg;
536 struct sfq_sched_data *q = qdisc_priv(sch);
537 spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
539 spin_lock(root_lock);
540 q->perturbation = net_random();
541 if (!q->filter_list && q->tail)
543 spin_unlock(root_lock);
545 if (q->perturb_period)
546 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
/* Apply netlink configuration. Accepts the legacy tc_sfq_qopt or the
 * extended v1 form (detected by attribute length). divisor must be a
 * power of two <= 65536; limit/maxflows/maxdepth are clamped against
 * each other. Excess packets are dropped under the tree lock and the
 * perturbation timer re-armed. Connective lines missing from listing.
 */
549 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
551 struct sfq_sched_data *q = qdisc_priv(sch);
552 struct tc_sfq_qopt *ctl = nla_data(opt);
553 struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
556 if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
558 if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
559 ctl_v1 = nla_data(opt);
561 (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
566 q->quantum = ctl->quantum;
567 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
569 q->perturb_period = ctl->perturb_period * HZ;
571 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
573 q->divisor = ctl->divisor;
574 q->maxflows = min_t(u32, q->maxflows, q->divisor);
578 q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
579 q->headdrop = ctl_v1->headdrop;
582 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
583 q->maxflows = min_t(u32, q->maxflows, q->limit);
/* Shrink the queue to the (possibly reduced) limit. */
587 while (sch->q.qlen > q->limit)
589 qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
591 del_timer(&q->perturb_timer);
592 if (q->perturb_period) {
593 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
594 q->perturbation = net_random();
596 sch_tree_unlock(sch);
/* Allocate sz bytes, trying kmalloc first without warning on failure.
 * NOTE(review): the missing lines presumably fall back to vmalloc — the
 * is_vmalloc_addr() test in sfq_free supports this; confirm upstream.
 */
600 static void *sfq_alloc(size_t sz)
602 void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
/* Free memory from sfq_alloc, dispatching on whether it came from
 * vmalloc or kmalloc (kfree branch missing from this listing).
 */
609 static void sfq_free(void *addr)
612 if (is_vmalloc_addr(addr))
/* Qdisc teardown: release the filter chain and stop the perturbation
 * timer (perturb_period is cleared first so the timer callback cannot
 * re-arm itself before del_timer_sync completes).
 */
619 static void sfq_destroy(struct Qdisc *sch)
621 struct sfq_sched_data *q = qdisc_priv(sch);
623 tcf_destroy_chain(&q->filter_list);
624 q->perturb_period = 0;
625 del_timer_sync(&q->perturb_timer);
/* Qdisc initialization: set defaults, apply optional netlink config via
 * sfq_change, allocate the hash table and slots arrays, and mark every
 * bucket empty / every slot free. Error-path lines (sfq_destroy on
 * allocation failure) are missing from this listing.
 */
630 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
632 struct sfq_sched_data *q = qdisc_priv(sch);
635 q->perturb_timer.function = sfq_perturbation;
636 q->perturb_timer.data = (unsigned long)sch;
637 init_timer_deferrable(&q->perturb_timer);
/* dep[i] lists start empty: each points to itself via the
 * index-encoding described above sfq_dep_head(). */
639 for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
640 q->dep[i].next = i + SFQ_MAX_FLOWS;
641 q->dep[i].prev = i + SFQ_MAX_FLOWS;
644 q->limit = SFQ_MAX_DEPTH;
645 q->maxdepth = SFQ_MAX_DEPTH;
648 q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
649 q->maxflows = SFQ_DEFAULT_FLOWS;
650 q->quantum = psched_mtu(qdisc_dev(sch));
651 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
652 q->perturb_period = 0;
653 q->perturbation = net_random();
656 int err = sfq_change(sch, opt);
661 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
662 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
663 if (!q->ht || !q->slots) {
667 for (i = 0; i < q->divisor; i++)
668 q->ht[i] = SFQ_EMPTY_SLOT;
670 for (i = 0; i < q->maxflows; i++) {
671 slot_queue_init(&q->slots[i]);
675 sch->flags |= TCQ_F_CAN_BYPASS;
677 sch->flags &= ~TCQ_F_CAN_BYPASS;
/* Dump current configuration to netlink as a tc_sfq_qopt_v1 (which
 * embeds the legacy v0 struct, keeping old userspace compatible).
 * NLA_PUT jumps to an nla_put_failure label missing from this listing.
 */
681 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
683 struct sfq_sched_data *q = qdisc_priv(sch);
684 unsigned char *b = skb_tail_pointer(skb);
685 struct tc_sfq_qopt_v1 opt;
687 memset(&opt, 0, sizeof(opt));
688 opt.v0.quantum = q->quantum;
689 opt.v0.perturb_period = q->perturb_period / HZ;
690 opt.v0.limit = q->limit;
691 opt.v0.divisor = q->divisor;
692 opt.v0.flows = q->maxflows;
693 opt.depth = q->maxdepth;
694 opt.headdrop = q->headdrop;
696 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
/* Minimal Qdisc_class_ops hooks: SFQ has no real child classes, so
 * leaf/get/put are stubs; bind disables the fast-path bypass so bound
 * filters always see packets. Bodies are largely missing from this
 * listing (sampled lines only).
 */
705 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
710 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
715 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
718 /* we cannot bypass queue discipline anymore */
719 sch->flags &= ~TCQ_F_CAN_BYPASS;
723 static void sfq_put(struct Qdisc *q, unsigned long cl)
/* Expose the filter list head so tc can attach classifiers. */
727 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
729 struct sfq_sched_data *q = qdisc_priv(sch);
733 return &q->filter_list;
736 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
737 struct sk_buff *skb, struct tcmsg *tcm)
739 tcm->tcm_handle |= TC_H_MIN(cl);
/* Per-"class" (hash bucket) stats: cl is a 1-based bucket number, so
 * index ht[] with cl - 1. Walks the slot's skb list to compute backlog.
 */
743 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
746 struct sfq_sched_data *q = qdisc_priv(sch);
747 sfq_index idx = q->ht[cl - 1];
748 struct gnet_stats_queue qs = { 0 };
749 struct tc_sfq_xstats xstats = { 0 };
752 if (idx != SFQ_EMPTY_SLOT) {
753 const struct sfq_slot *slot = &q->slots[idx];
/* Un-scale allot for userspace reporting. */
755 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
756 qs.qlen = slot->qlen;
757 slot_queue_walk(slot, skb)
758 qs.backlog += qdisc_pkt_len(skb);
760 if (gnet_stats_copy_queue(d, &qs) < 0)
762 return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
/* Iterate occupied hash buckets for class enumeration, honoring the
 * walker's skip/count protocol (count-increment lines missing from this
 * listing).
 */
765 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
767 struct sfq_sched_data *q = qdisc_priv(sch);
773 for (i = 0; i < q->divisor; i++) {
774 if (q->ht[i] == SFQ_EMPTY_SLOT ||
775 arg->count < arg->skip) {
779 if (arg->fn(sch, i + 1, arg) < 0) {
/* Ops tables and module registration glue. Several initializer lines
 * (.id, .init, .reset, .change, .dump, etc.) are missing from this
 * listing — the tables below are incomplete as shown.
 */
787 static const struct Qdisc_class_ops sfq_class_ops = {
791 .tcf_chain = sfq_find_tcf,
792 .bind_tcf = sfq_bind,
793 .unbind_tcf = sfq_put,
794 .dump = sfq_dump_class,
795 .dump_stats = sfq_dump_class_stats,
799 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
800 .cl_ops = &sfq_class_ops,
802 .priv_size = sizeof(struct sfq_sched_data),
803 .enqueue = sfq_enqueue,
804 .dequeue = sfq_dequeue,
805 .peek = qdisc_peek_dequeued,
809 .destroy = sfq_destroy,
812 .owner = THIS_MODULE,
/* Register/unregister the "sfq" qdisc with the packet scheduler core. */
815 static int __init sfq_module_init(void)
817 return register_qdisc(&sfq_qdisc_ops);
819 static void __exit sfq_module_exit(void)
821 unregister_qdisc(&sfq_qdisc_ops);
823 module_init(sfq_module_init)
824 module_exit(sfq_module_exit)
825 MODULE_LICENSE("GPL");