net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/pkt_cls.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto __rcu *filter_list;
 129         struct tcf_block *block;
 130         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 131         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 132
 133         struct red_parms *red_parms;
 134         struct tc_sfqred_stats stats;
 135         struct sfq_slot *tail;          /* current slot in round */
 136
 137         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 138                                         /* Linked lists of slots, indexed by depth
 139                                          * dep[0] : list of unused flows
 140                                          * dep[1] : list of flows with 1 packet
 141                                          * dep[X] : list of flows with X packets
 142                                          */
 143
 144         unsigned int    maxflows;       /* number of flows in flows array */
 145         int             perturb_period;
 146         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 147         struct timer_list perturb_timer;
 148         struct Qdisc    *sch;
 149 };
 150
 151 /*
 152  * sfq_head are either in a sfq_slot or in dep[] array
 153  */
 154 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 155 {
 156         if (val < SFQ_MAX_FLOWS)
 157                 return &q->slots[val].dep;
 158         return &q->dep[val - SFQ_MAX_FLOWS];
 159 }
 160
 161 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 162                              const struct sk_buff *skb)
 163 {
 164         return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
 165 }
 166
 167 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 168                                  int *qerr)
 169 {
 170         struct sfq_sched_data *q = qdisc_priv(sch);
 171         struct tcf_result res;
 172         struct tcf_proto *fl;
 173         int result;
 174
 175         if (TC_H_MAJ(skb->priority) == sch->handle &&
 176             TC_H_MIN(skb->priority) > 0 &&
 177             TC_H_MIN(skb->priority) <= q->divisor)
 178                 return TC_H_MIN(skb->priority);
 179
 180         fl = rcu_dereference_bh(q->filter_list);
 181         if (!fl)
 182                 return sfq_hash(q, skb) + 1;
 183
 184         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 185         result = tcf_classify(skb, fl, &res, false);
 186         if (result >= 0) {
 187 #ifdef CONFIG_NET_CLS_ACT
 188                 switch (result) {
 189                 case TC_ACT_STOLEN:
 190                 case TC_ACT_QUEUED:
 191                 case TC_ACT_TRAP:
 192                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 193                         /* fall through */
 194                 case TC_ACT_SHOT:
 195                         return 0;
 196                 }
 197 #endif
 198                 if (TC_H_MIN(res.classid) <= q->divisor)
 199                         return TC_H_MIN(res.classid);
 200         }
 201         return 0;
 202 }
 203
 204 /*
 205  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 206  */
 207 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 208 {
 209         sfq_index p, n;
 210         struct sfq_slot *slot = &q->slots[x];
 211         int qlen = slot->qlen;
 212
 213         p = qlen + SFQ_MAX_FLOWS;
 214         n = q->dep[qlen].next;
 215
 216         slot->dep.next = n;
 217         slot->dep.prev = p;
 218
 219         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 220         sfq_dep_head(q, n)->prev = x;
 221 }
 222
 223 #define sfq_unlink(q, x, n, p)                  \
 224         do {                                    \
 225                 n = q->slots[x].dep.next;       \
 226                 p = q->slots[x].dep.prev;       \
 227                 sfq_dep_head(q, p)->next = n;   \
 228                 sfq_dep_head(q, n)->prev = p;   \
 229         } while (0)
 230
 231
 232 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 233 {
 234         sfq_index p, n;
 235         int d;
 236
 237         sfq_unlink(q, x, n, p);
 238
 239         d = q->slots[x].qlen--;
 240         if (n == p && q->cur_depth == d)
 241                 q->cur_depth--;
 242         sfq_link(q, x);
 243 }
 244
 245 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 246 {
 247         sfq_index p, n;
 248         int d;
 249
 250         sfq_unlink(q, x, n, p);
 251
 252         d = ++q->slots[x].qlen;
 253         if (q->cur_depth < d)
 254                 q->cur_depth = d;
 255         sfq_link(q, x);
 256 }
 257
 258 /* helper functions : might be changed when/if skb use a standard list_head */
 259
 260 /* remove one skb from tail of slot queue */
 261 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 262 {
 263         struct sk_buff *skb = slot->skblist_prev;
 264
 265         slot->skblist_prev = skb->prev;
 266         skb->prev->next = (struct sk_buff *)slot;
 267         skb->next = skb->prev = NULL;
 268         return skb;
 269 }
 270
 271 /* remove one skb from head of slot queue */
 272 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 273 {
 274         struct sk_buff *skb = slot->skblist_next;
 275
 276         slot->skblist_next = skb->next;
 277         skb->next->prev = (struct sk_buff *)slot;
 278         skb->next = skb->prev = NULL;
 279         return skb;
 280 }
 281
 282 static inline void slot_queue_init(struct sfq_slot *slot)
 283 {
 284         memset(slot, 0, sizeof(*slot));
 285         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 286 }
 287
 288 /* add skb to slot queue (tail add) */
 289 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 290 {
 291         skb->prev = slot->skblist_prev;
 292         skb->next = (struct sk_buff *)slot;
 293         slot->skblist_prev->next = skb;
 294         slot->skblist_prev = skb;
 295 }
 296
 297 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
 298 {
 299         struct sfq_sched_data *q = qdisc_priv(sch);
 300         sfq_index x, d = q->cur_depth;
 301         struct sk_buff *skb;
 302         unsigned int len;
 303         struct sfq_slot *slot;
 304
 305         /* Queue is full! Find the longest slot and drop tail packet from it */
 306         if (d > 1) {
 307                 x = q->dep[d].next;
 308                 slot = &q->slots[x];
 309 drop:
 310                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 311                 len = qdisc_pkt_len(skb);
 312                 slot->backlog -= len;
 313                 sfq_dec(q, x);
 314                 sch->q.qlen--;
 315                 qdisc_qstats_backlog_dec(sch, skb);
 316                 qdisc_drop(skb, sch, to_free);
 317                 return len;
 318         }
 319
 320         if (d == 1) {
 321                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 322                 x = q->tail->next;
 323                 slot = &q->slots[x];
 324                 q->tail->next = slot->next;
 325                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 326                 goto drop;
 327         }
 328
 329         return 0;
 330 }
 331
 332 /* Is ECN parameter configured */
 333 static int sfq_prob_mark(const struct sfq_sched_data *q)
 334 {
 335         return q->flags & TC_RED_ECN;
 336 }
 337
 338 /* Should packets over max threshold just be marked */
 339 static int sfq_hard_mark(const struct sfq_sched_data *q)
 340 {
 341         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 342 }
 343
 344 static int sfq_headdrop(const struct sfq_sched_data *q)
 345 {
 346         return q->headdrop;
 347 }
 348
 349 static int
 350 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 351 {
 352         struct sfq_sched_data *q = qdisc_priv(sch);
 353         unsigned int hash, dropped;
 354         sfq_index x, qlen;
 355         struct sfq_slot *slot;
 356         int uninitialized_var(ret);
 357         struct sk_buff *head;
 358         int delta;
 359
 360         hash = sfq_classify(skb, sch, &ret);
 361         if (hash == 0) {
 362                 if (ret & __NET_XMIT_BYPASS)
 363                         qdisc_qstats_drop(sch);
 364                 __qdisc_drop(skb, to_free);
 365                 return ret;
 366         }
 367         hash--;
 368
 369         x = q->ht[hash];
 370         slot = &q->slots[x];
 371         if (x == SFQ_EMPTY_SLOT) {
 372                 x = q->dep[0].next; /* get a free slot */
 373                 if (x >= SFQ_MAX_FLOWS)
 374                         return qdisc_drop(skb, sch, to_free);
 375                 q->ht[hash] = x;
 376                 slot = &q->slots[x];
 377                 slot->hash = hash;
 378                 slot->backlog = 0; /* should already be 0 anyway... */
 379                 red_set_vars(&slot->vars);
 380                 goto enqueue;
 381         }
 382         if (q->red_parms) {
 383                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 384                                                         &slot->vars,
 385                                                         slot->backlog);
 386                 switch (red_action(q->red_parms,
 387                                    &slot->vars,
 388                                    slot->vars.qavg)) {
 389                 case RED_DONT_MARK:
 390                         break;
 391
 392                 case RED_PROB_MARK:
 393                         qdisc_qstats_overlimit(sch);
 394                         if (sfq_prob_mark(q)) {
 395                                 /* We know we have at least one packet in queue */
 396                                 if (sfq_headdrop(q) &&
 397                                     INET_ECN_set_ce(slot->skblist_next)) {
 398                                         q->stats.prob_mark_head++;
 399                                         break;
 400                                 }
 401                                 if (INET_ECN_set_ce(skb)) {
 402                                         q->stats.prob_mark++;
 403                                         break;
 404                                 }
 405                         }
 406                         q->stats.prob_drop++;
 407                         goto congestion_drop;
 408
 409                 case RED_HARD_MARK:
 410                         qdisc_qstats_overlimit(sch);
 411                         if (sfq_hard_mark(q)) {
 412                                 /* We know we have at least one packet in queue */
 413                                 if (sfq_headdrop(q) &&
 414                                     INET_ECN_set_ce(slot->skblist_next)) {
 415                                         q->stats.forced_mark_head++;
 416                                         break;
 417                                 }
 418                                 if (INET_ECN_set_ce(skb)) {
 419                                         q->stats.forced_mark++;
 420                                         break;
 421                                 }
 422                         }
 423                         q->stats.forced_drop++;
 424                         goto congestion_drop;
 425                 }
 426         }
 427
 428         if (slot->qlen >= q->maxdepth) {
 429 congestion_drop:
 430                 if (!sfq_headdrop(q))
 431                         return qdisc_drop(skb, sch, to_free);
 432
 433                 /* We know we have at least one packet in queue */
 434                 head = slot_dequeue_head(slot);
 435                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 436                 sch->qstats.backlog -= delta;
 437                 slot->backlog -= delta;
 438                 qdisc_drop(head, sch, to_free);
 439
 440                 slot_queue_add(slot, skb);
 441                 qdisc_tree_reduce_backlog(sch, 0, delta);
 442                 return NET_XMIT_CN;
 443         }
 444
 445 enqueue:
 446         qdisc_qstats_backlog_inc(sch, skb);
 447         slot->backlog += qdisc_pkt_len(skb);
 448         slot_queue_add(slot, skb);
 449         sfq_inc(q, x);
 450         if (slot->qlen == 1) {          /* The flow is new */
 451                 if (q->tail == NULL) {  /* It is the first flow */
 452                         slot->next = x;
 453                 } else {
 454                         slot->next = q->tail->next;
 455                         q->tail->next = x;
 456                 }
 457                 /* We put this flow at the end of our flow list.
 458                  * This might sound unfair for a new flow to wait after old ones,
 459                  * but we could endup servicing new flows only, and freeze old ones.
 460                  */
 461                 q->tail = slot;
 462                 /* We could use a bigger initial quantum for new flows */
 463                 slot->allot = q->scaled_quantum;
 464         }
 465         if (++sch->q.qlen <= q->limit)
 466                 return NET_XMIT_SUCCESS;
 467
 468         qlen = slot->qlen;
 469         dropped = sfq_drop(sch, to_free);
 470         /* Return Congestion Notification only if we dropped a packet
 471          * from this flow.
 472          */
 473         if (qlen != slot->qlen) {
 474                 qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
 475                 return NET_XMIT_CN;
 476         }
 477
 478         /* As we dropped a packet, better let upper stack know this */
 479         qdisc_tree_reduce_backlog(sch, 1, dropped);
 480         return NET_XMIT_SUCCESS;
 481 }
 482
 483 static struct sk_buff *
 484 sfq_dequeue(struct Qdisc *sch)
 485 {
 486         struct sfq_sched_data *q = qdisc_priv(sch);
 487         struct sk_buff *skb;
 488         sfq_index a, next_a;
 489         struct sfq_slot *slot;
 490
 491         /* No active slots */
 492         if (q->tail == NULL)
 493                 return NULL;
 494
 495 next_slot:
 496         a = q->tail->next;
 497         slot = &q->slots[a];
 498         if (slot->allot <= 0) {
 499                 q->tail = slot;
 500                 slot->allot += q->scaled_quantum;
 501                 goto next_slot;
 502         }
 503         skb = slot_dequeue_head(slot);
 504         sfq_dec(q, a);
 505         qdisc_bstats_update(sch, skb);
 506         sch->q.qlen--;
 507         qdisc_qstats_backlog_dec(sch, skb);
 508         slot->backlog -= qdisc_pkt_len(skb);
 509         /* Is the slot empty? */
 510         if (slot->qlen == 0) {
 511                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 512                 next_a = slot->next;
 513                 if (a == next_a) {
 514                         q->tail = NULL; /* no more active slots */
 515                         return skb;
 516                 }
 517                 q->tail->next = next_a;
 518         } else {
 519                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 520         }
 521         return skb;
 522 }
 523
 524 static void
 525 sfq_reset(struct Qdisc *sch)
 526 {
 527         struct sk_buff *skb;
 528
 529         while ((skb = sfq_dequeue(sch)) != NULL)
 530                 rtnl_kfree_skbs(skb, skb);
 531 }
 532
 533 /*
 534  * When q->perturbation is changed, we rehash all queued skbs
 535  * to avoid OOO (Out Of Order) effects.
 536  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 537  * counters.
 538  */
 539 static void sfq_rehash(struct Qdisc *sch)
 540 {
 541         struct sfq_sched_data *q = qdisc_priv(sch);
 542         struct sk_buff *skb;
 543         int i;
 544         struct sfq_slot *slot;
 545         struct sk_buff_head list;
 546         int dropped = 0;
 547         unsigned int drop_len = 0;
 548
 549         __skb_queue_head_init(&list);
 550
 551         for (i = 0; i < q->maxflows; i++) {
 552                 slot = &q->slots[i];
 553                 if (!slot->qlen)
 554                         continue;
 555                 while (slot->qlen) {
 556                         skb = slot_dequeue_head(slot);
 557                         sfq_dec(q, i);
 558                         __skb_queue_tail(&list, skb);
 559                 }
 560                 slot->backlog = 0;
 561                 red_set_vars(&slot->vars);
 562                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 563         }
 564         q->tail = NULL;
 565
 566         while ((skb = __skb_dequeue(&list)) != NULL) {
 567                 unsigned int hash = sfq_hash(q, skb);
 568                 sfq_index x = q->ht[hash];
 569
 570                 slot = &q->slots[x];
 571                 if (x == SFQ_EMPTY_SLOT) {
 572                         x = q->dep[0].next; /* get a free slot */
 573                         if (x >= SFQ_MAX_FLOWS) {
 574 drop:
 575                                 qdisc_qstats_backlog_dec(sch, skb);
 576                                 drop_len += qdisc_pkt_len(skb);
 577                                 kfree_skb(skb);
 578                                 dropped++;
 579                                 continue;
 580                         }
 581                         q->ht[hash] = x;
 582                         slot = &q->slots[x];
 583                         slot->hash = hash;
 584                 }
 585                 if (slot->qlen >= q->maxdepth)
 586                         goto drop;
 587                 slot_queue_add(slot, skb);
 588                 if (q->red_parms)
 589                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 590                                                         &slot->vars,
 591                                                         slot->backlog);
 592                 slot->backlog += qdisc_pkt_len(skb);
 593                 sfq_inc(q, x);
 594                 if (slot->qlen == 1) {          /* The flow is new */
 595                         if (q->tail == NULL) {  /* It is the first flow */
 596                                 slot->next = x;
 597                         } else {
 598                                 slot->next = q->tail->next;
 599                                 q->tail->next = x;
 600                         }
 601                         q->tail = slot;
 602                         slot->allot = q->scaled_quantum;
 603                 }
 604         }
 605         sch->q.qlen -= dropped;
 606         qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 607 }
 608
 609 static void sfq_perturbation(struct timer_list *t)
 610 {
 611         struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
 612         struct Qdisc *sch = q->sch;
 613         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 614
 615         spin_lock(root_lock);
 616         q->perturbation = prandom_u32();
 617         if (!q->filter_list && q->tail)
 618                 sfq_rehash(sch);
 619         spin_unlock(root_lock);
 620
 621         if (q->perturb_period)
 622                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 623 }
 624
 625 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 626 {
 627         struct sfq_sched_data *q = qdisc_priv(sch);
 628         struct tc_sfq_qopt *ctl = nla_data(opt);
 629         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 630         unsigned int qlen, dropped = 0;
 631         struct red_parms *p = NULL;
 632         struct sk_buff *to_free = NULL;
 633         struct sk_buff *tail = NULL;
 634
 635         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 636                 return -EINVAL;
 637         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 638                 ctl_v1 = nla_data(opt);
 639         if (ctl->divisor &&
 640             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 641                 return -EINVAL;
 642         if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
 643                                         ctl_v1->Wlog))
 644                 return -EINVAL;
 645         if (ctl_v1 && ctl_v1->qth_min) {
 646                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 647                 if (!p)
 648                         return -ENOMEM;
 649         }
 650         sch_tree_lock(sch);
 651         if (ctl->quantum) {
 652                 q->quantum = ctl->quantum;
 653                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 654         }
 655         q->perturb_period = ctl->perturb_period * HZ;
 656         if (ctl->flows)
 657                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 658         if (ctl->divisor) {
 659                 q->divisor = ctl->divisor;
 660                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 661         }
 662         if (ctl_v1) {
 663                 if (ctl_v1->depth)
 664                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 665                 if (p) {
 666                         swap(q->red_parms, p);
 667                         red_set_parms(q->red_parms,
 668                                       ctl_v1->qth_min, ctl_v1->qth_max,
 669                                       ctl_v1->Wlog,
 670                                       ctl_v1->Plog, ctl_v1->Scell_log,
 671                                       NULL,
 672                                       ctl_v1->max_P);
 673                 }
 674                 q->flags = ctl_v1->flags;
 675                 q->headdrop = ctl_v1->headdrop;
 676         }
 677         if (ctl->limit) {
 678                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 679                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 680         }
 681
 682         qlen = sch->q.qlen;
 683         while (sch->q.qlen > q->limit) {
 684                 dropped += sfq_drop(sch, &to_free);
 685                 if (!tail)
 686                         tail = to_free;
 687         }
 688
 689         rtnl_kfree_skbs(to_free, tail);
 690         qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 691
 692         del_timer(&q->perturb_timer);
 693         if (q->perturb_period) {
 694                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 695                 q->perturbation = prandom_u32();
 696         }
 697         sch_tree_unlock(sch);
 698         kfree(p);
 699         return 0;
 700 }
 701
 702 static void *sfq_alloc(size_t sz)
 703 {
 704         return  kvmalloc(sz, GFP_KERNEL);
 705 }
 706
 707 static void sfq_free(void *addr)
 708 {
 709         kvfree(addr);
 710 }
 711
 712 static void sfq_destroy(struct Qdisc *sch)
 713 {
 714         struct sfq_sched_data *q = qdisc_priv(sch);
 715
 716         tcf_block_put(q->block);
 717         q->perturb_period = 0;
 718         del_timer_sync(&q->perturb_timer);
 719         sfq_free(q->ht);
 720         sfq_free(q->slots);
 721         kfree(q->red_parms);
 722 }
 723
 724 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
 725                     struct netlink_ext_ack *extack)
 726 {
 727         struct sfq_sched_data *q = qdisc_priv(sch);
 728         int i;
 729         int err;
 730
 731         q->sch = sch;
 732         timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 733
 734         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 735         if (err)
 736                 return err;
 737
 738         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 739                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 740                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 741         }
 742
 743         q->limit = SFQ_MAX_DEPTH;
 744         q->maxdepth = SFQ_MAX_DEPTH;
 745         q->cur_depth = 0;
 746         q->tail = NULL;
 747         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 748         q->maxflows = SFQ_DEFAULT_FLOWS;
 749         q->quantum = psched_mtu(qdisc_dev(sch));
 750         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 751         q->perturb_period = 0;
 752         q->perturbation = prandom_u32();
 753
 754         if (opt) {
 755                 int err = sfq_change(sch, opt);
 756                 if (err)
 757                         return err;
 758         }
 759
 760         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 761         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 762         if (!q->ht || !q->slots) {
 763                 /* Note: sfq_destroy() will be called by our caller */
 764                 return -ENOMEM;
 765         }
 766
 767         for (i = 0; i < q->divisor; i++)
 768                 q->ht[i] = SFQ_EMPTY_SLOT;
 769
 770         for (i = 0; i < q->maxflows; i++) {
 771                 slot_queue_init(&q->slots[i]);
 772                 sfq_link(q, i);
 773         }
 774         if (q->limit >= 1)
 775                 sch->flags |= TCQ_F_CAN_BYPASS;
 776         else
 777                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 778         return 0;
 779 }
 780
 781 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 782 {
 783         struct sfq_sched_data *q = qdisc_priv(sch);
 784         unsigned char *b = skb_tail_pointer(skb);
 785         struct tc_sfq_qopt_v1 opt;
 786         struct red_parms *p = q->red_parms;
 787
 788         memset(&opt, 0, sizeof(opt));
 789         opt.v0.quantum  = q->quantum;
 790         opt.v0.perturb_period = q->perturb_period / HZ;
 791         opt.v0.limit    = q->limit;
 792         opt.v0.divisor  = q->divisor;
 793         opt.v0.flows    = q->maxflows;
 794         opt.depth       = q->maxdepth;
 795         opt.headdrop    = q->headdrop;
 796
 797         if (p) {
 798                 opt.qth_min     = p->qth_min >> p->Wlog;
 799                 opt.qth_max     = p->qth_max >> p->Wlog;
 800                 opt.Wlog        = p->Wlog;
 801                 opt.Plog        = p->Plog;
 802                 opt.Scell_log   = p->Scell_log;
 803                 opt.max_P       = p->max_P;
 804         }
 805         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 806         opt.flags       = q->flags;
 807
 808         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 809                 goto nla_put_failure;
 810
 811         return skb->len;
 812
 813 nla_put_failure:
 814         nlmsg_trim(skb, b);
 815         return -1;
 816 }
 817
 818 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 819 {
 820         return NULL;
 821 }
 822
 823 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
 824 {
 825         return 0;
 826 }
 827
 828 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 829                               u32 classid)
 830 {
 831         /* we cannot bypass queue discipline anymore */
 832         sch->flags &= ~TCQ_F_CAN_BYPASS;
 833         return 0;
 834 }
 835
 836 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
 837 {
 838 }
 839
 840 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
 841                                        struct netlink_ext_ack *extack)
 842 {
 843         struct sfq_sched_data *q = qdisc_priv(sch);
 844
 845         if (cl)
 846                 return NULL;
 847         return q->block;
 848 }
 849
 850 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 851                           struct sk_buff *skb, struct tcmsg *tcm)
 852 {
 853         tcm->tcm_handle |= TC_H_MIN(cl);
 854         return 0;
 855 }
 856
 857 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 858                                 struct gnet_dump *d)
 859 {
 860         struct sfq_sched_data *q = qdisc_priv(sch);
 861         sfq_index idx = q->ht[cl - 1];
 862         struct gnet_stats_queue qs = { 0 };
 863         struct tc_sfq_xstats xstats = { 0 };
 864
 865         if (idx != SFQ_EMPTY_SLOT) {
 866                 const struct sfq_slot *slot = &q->slots[idx];
 867
 868                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 869                 qs.qlen = slot->qlen;
 870                 qs.backlog = slot->backlog;
 871         }
 872         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 873                 return -1;
 874         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 875 }
 876
 877 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 878 {
 879         struct sfq_sched_data *q = qdisc_priv(sch);
 880         unsigned int i;
 881
 882         if (arg->stop)
 883                 return;
 884
 885         for (i = 0; i < q->divisor; i++) {
 886                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 887                     arg->count < arg->skip) {
 888                         arg->count++;
 889                         continue;
 890                 }
 891                 if (arg->fn(sch, i + 1, arg) < 0) {
 892                         arg->stop = 1;
 893                         break;
 894                 }
 895                 arg->count++;
 896         }
 897 }
 898
 899 static const struct Qdisc_class_ops sfq_class_ops = {
 900         .leaf           =       sfq_leaf,
 901         .find           =       sfq_find,
 902         .tcf_block      =       sfq_tcf_block,
 903         .bind_tcf       =       sfq_bind,
 904         .unbind_tcf     =       sfq_unbind,
 905         .dump           =       sfq_dump_class,
 906         .dump_stats     =       sfq_dump_class_stats,
 907         .walk           =       sfq_walk,
 908 };
 909
 910 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 911         .cl_ops         =       &sfq_class_ops,
 912         .id             =       "sfq",
 913         .priv_size      =       sizeof(struct sfq_sched_data),
 914         .enqueue        =       sfq_enqueue,
 915         .dequeue        =       sfq_dequeue,
 916         .peek           =       qdisc_peek_dequeued,
 917         .init           =       sfq_init,
 918         .reset          =       sfq_reset,
 919         .destroy        =       sfq_destroy,
 920         .change         =       NULL,
 921         .dump           =       sfq_dump,
 922         .owner          =       THIS_MODULE,
 923 };
 924
 925 static int __init sfq_module_init(void)
 926 {
 927         return register_qdisc(&sfq_qdisc_ops);
 928 }
 929 static void __exit sfq_module_exit(void)
 930 {
 931         unregister_qdisc(&sfq_qdisc_ops);
 932 }
 933 module_init(sfq_module_init)
 934 module_exit(sfq_module_exit)
 935 MODULE_LICENSE("GPL");