/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128 packets,
	the max mtu to 2^18-1, the number of flows to 128, and the number of
	hash buckets to 1024.
	The only goal of these restrictions is that all data
	fit into one 4K page on 32-bit arches.

	It is easy to increase these values, but not in flight.  */

#define SFQ_DEPTH		128 /* max number of packets per flow */
#define SFQ_SLOTS		128 /* max number of flows */
#define SFQ_EMPTY_SLOT		255
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)

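/* For example, with SFQ_ALLOT_SHIFT == 3 a 1500 byte packet costs
 * SFQ_ALLOT_SIZE(1500) == DIV_ROUND_UP(1500, 8) == 188 scaled units,
 * and even a 64KB packet scales to DIV_ROUND_UP(65535, 8) == 8192,
 * which fits comfortably in the 16-bit 'allot' field of struct sfq_slot.
 */
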
/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
typedef unsigned char sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
 * are 'pointers' to dep[] array
 */
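/* For example, with SFQ_SLOTS == 128, index 5 denotes slots[5].dep,
 * while index 130 denotes dep[130 - SFQ_SLOTS] == dep[2]
 * (see sfq_dep_head() below).
 */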
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq chain */
	struct sfq_head	dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */
};

struct sfq_sched_data {
/* Parameters */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;
	unsigned int	divisor;	/* number of slots in hash table */
/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	cur_depth;	/* depth of longest slot */
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct sfq_slot	*tail;		/* current slot in round */
	sfq_index	*ht;		/* Hash table (divisor slots) */
	struct sfq_slot	slots[SFQ_SLOTS];
	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_SLOTS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_SLOTS];
}

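/* Per-flow hash: mix addresses, ports and protocol with the current
 * perturbation key.  q->divisor is kept a power of two (enforced in
 * sfq_change()), so masking with (divisor - 1) selects one of the
 * divisor hash buckets.
 */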
static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	struct flow_keys keys;
	unsigned int hash;

	skb_flow_dissect(skb, &keys);
	hash = jhash_3words((__force u32)keys.dst,
			    (__force u32)keys.src ^ keys.ip_proto,
			    (__force u32)keys.ports, q->perturbation);
	return hash & (q->divisor - 1);
}

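/* Classification returns 0 on failure (the caller drops the packet),
 * otherwise a value in [1 .. q->divisor]; sfq_enqueue() subtracts 1
 * to obtain the actual hash bucket index.
 */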
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_SLOTS - 1]
 */
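/* dep[d] heads the doubly linked list of all slots currently holding
 * d packets; sfq_link() inserts slot x at the head of dep[qlen], and
 * q->cur_depth tracks the depth of the longest slot so sfq_drop() can
 * locate a longest slot in O(1).
 */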
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int qlen = q->slots[x].qlen;

	p = qlen + SFQ_SLOTS;
	n = q->dep[qlen].next;

	q->slots[x].dep.next = n;
	q->slots[x].dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	n = q->slots[x].dep.next;		\
	p = q->slots[x].dep.prev;		\
	sfq_dep_head(q, p)->next = n;		\
	sfq_dep_head(q, n)->prev = p


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skb uses a standard list_head */
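/* The skblist_next/skblist_prev fields sit at the start of struct sfq_slot
 * so that, mirroring the first two pointers of struct sk_buff, the slot
 * itself can be cast to (struct sk_buff *) and act as the head/sentinel of
 * its circular packet list, avoiding a separate list head structure.
 */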

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

#define	slot_queue_walk(slot, skb)		\
	for (skb = slot->skblist_next;		\
	     skb != (struct sk_buff *)slot;	\
	     skb = skb->next)

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		sfq_dec(q, x);
		kfree_skb(skb);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

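/* Enqueue: hash the packet to a slot (grabbing a free slot from dep[0]
 * for a new flow).  A flow whose first packet arrives is spliced into
 * the circular list of active slots right after the current tail,
 * becomes the new tail, and starts with a full scaled_quantum of credit.
 */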
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
	}

	/* If selected queue has length q->limit, do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (slot->qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		q->tail = slot;
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

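/* Dequeue: deficit round robin over the active slots.  The slot after
 * q->tail is served while its allot stays positive; each dequeued packet
 * subtracts its scaled size from allot, and an exhausted slot becomes the
 * new tail and gets scaled_quantum added back for its next round.
 */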
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

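/* Timer callback: picking a fresh random perturbation key re-shuffles
 * flows across hash buckets over time, so two flows that collide under
 * one key are unlikely to stay merged for the next perturb_period.
 */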
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
	if (ctl->divisor)
		q->divisor = ctl->divisor;
	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	size_t sz;
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_DEPTH; i++) {
		q->dep[i].next = i + SFQ_SLOTS;
		q->dep[i].prev = i + SFQ_SLOTS;
	}

	q->limit = SFQ_DEPTH - 1;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	sz = sizeof(q->ht[0]) * q->divisor;
	q->ht = kmalloc(sz, GFP_KERNEL);
	if (!q->ht && sz > PAGE_SIZE)
		q->ht = vmalloc(sz);
	if (!q->ht)
		return -ENOMEM;
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < SFQ_SLOTS; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	if (is_vmalloc_addr(q->ht))
		vfree(q->ht);
	else
		kfree(q->ht);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = q->divisor;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };
	struct sk_buff *skb;

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		slot_queue_walk(slot, skb)
			qs.backlog += qdisc_pkt_len(skb);
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");