bridge: Fix IPv6 multicast snooping by correcting offset in MLDv2 report
[deliverable/linux.git] / net / sched / sch_cbq.c
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22
23
24 /* Class-Based Queueing (CBQ) algorithm.
25 =======================================
26
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
35
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
38
39 -----------------------------------------------------------------------
40
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed,
44 the implementation is different. Particularly:
45
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real time classes
49 have small rates. Note, that the statement of [3] is
50 incomplete, delay may actually be estimated even if class
51 per-round allotment is less than MTU. Namely, if per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
53
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
58
59
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me, why I am wrong 8).
64
65 --- Linux has no EOI event, so that we cannot estimate true class
66 idle time. Workaround is to consider the next dequeue event
67 as sign that previous packet is finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
71
72 struct cbq_sched_data;
73
74
75 struct cbq_class
76 {
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79
80 /* Parameters */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
84 unsigned char ovl_strategy;
85 #ifdef CONFIG_NET_CLS_ACT
86 unsigned char police;
87 #endif
88
89 u32 defmap;
90
91 /* Link-sharing scheduler parameters */
92 long maxidle; /* Class parameters: see below. */
93 long offtime;
94 long minidle;
95 u32 avpkt;
96 struct qdisc_rate_table *R_tab;
97
98 /* Overlimit strategy parameters */
99 void (*overlimit)(struct cbq_class *cl);
100 psched_tdiff_t penalty;
101
102 /* General scheduler (WRR) parameters */
103 long allot;
104 long quantum; /* Allotment per WRR round */
105 long weight; /* Relative allotment: see below */
106
107 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
108 struct cbq_class *split; /* Ptr to split node */
109 struct cbq_class *share; /* Ptr to LS parent in the class tree */
110 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
111 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
112 parent otherwise */
113 struct cbq_class *sibling; /* Sibling chain */
114 struct cbq_class *children; /* Pointer to children chain */
115
116 struct Qdisc *q; /* Elementary queueing discipline */
117
118
119 /* Variables */
120 unsigned char cpriority; /* Effective priority */
121 unsigned char delayed;
122 unsigned char level; /* level of the class in hierarchy:
123 0 for leaf classes, and maximal
124 level of children + 1 for nodes.
125 */
126
127 psched_time_t last; /* Last end of service */
128 psched_time_t undertime;
129 long avgidle;
130 long deficit; /* Saved deficit for WRR */
131 psched_time_t penalized;
132 struct gnet_stats_basic_packed bstats;
133 struct gnet_stats_queue qstats;
134 struct gnet_stats_rate_est rate_est;
135 struct tc_cbq_xstats xstats;
136
137 struct tcf_proto *filter_list;
138
139 int refcnt;
140 int filters;
141
142 struct cbq_class *defaults[TC_PRIO_MAX+1];
143 };
144
145 struct cbq_sched_data
146 {
147 struct Qdisc_class_hash clhash; /* Hash table of all classes */
148 int nclasses[TC_CBQ_MAXPRIO+1];
149 unsigned quanta[TC_CBQ_MAXPRIO+1];
150
151 struct cbq_class link;
152
153 unsigned activemask;
154 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
155 with backlog */
156
157 #ifdef CONFIG_NET_CLS_ACT
158 struct cbq_class *rx_class;
159 #endif
160 struct cbq_class *tx_class;
161 struct cbq_class *tx_borrowed;
162 int tx_len;
163 psched_time_t now; /* Cached timestamp */
164 psched_time_t now_rt; /* Cached real time */
165 unsigned pmask;
166
167 struct hrtimer delay_timer;
168 struct qdisc_watchdog watchdog; /* Watchdog timer,
169 started when CBQ has
170 backlog, but cannot
171 transmit just now */
172 psched_tdiff_t wd_expires;
173 int toplevel;
174 u32 hgenerator;
175 };
176
177
178 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
179
180 static __inline__ struct cbq_class *
181 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
182 {
183 struct Qdisc_class_common *clc;
184
185 clc = qdisc_class_find(&q->clhash, classid);
186 if (clc == NULL)
187 return NULL;
188 return container_of(clc, struct cbq_class, common);
189 }
190
191 #ifdef CONFIG_NET_CLS_ACT
192
193 static struct cbq_class *
194 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
195 {
196 struct cbq_class *cl, *new;
197
198 for (cl = this->tparent; cl; cl = cl->tparent)
199 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
200 return new;
201
202 return NULL;
203 }
204
205 #endif
206
207 /* Classify packet. The procedure is pretty complicated, but
208 it allows us to combine link sharing and priority scheduling
209 transparently.
210
211 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
212 so that it resolves to split nodes. Then packets are classified
213 by logical priority, or a more specific classifier may be attached
214 to the split node.
215 */
216
217 static struct cbq_class *
218 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 {
220 struct cbq_sched_data *q = qdisc_priv(sch);
221 struct cbq_class *head = &q->link;
222 struct cbq_class **defmap;
223 struct cbq_class *cl = NULL;
224 u32 prio = skb->priority;
225 struct tcf_result res;
226
227 /*
228 * Step 1. If skb->priority points to one of our classes, use it.
229 */
230 if (TC_H_MAJ(prio^sch->handle) == 0 &&
231 (cl = cbq_class_lookup(q, prio)) != NULL)
232 return cl;
233
234 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235 for (;;) {
236 int result = 0;
237 defmap = head->defaults;
238
239 /*
240 * Step 2+n. Apply classifier.
241 */
242 if (!head->filter_list ||
243 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244 goto fallback;
245
246 if ((cl = (void*)res.class) == NULL) {
247 if (TC_H_MAJ(res.classid))
248 cl = cbq_class_lookup(q, res.classid);
249 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
250 cl = defmap[TC_PRIO_BESTEFFORT];
251
252 if (cl == NULL || cl->level >= head->level)
253 goto fallback;
254 }
255
256 #ifdef CONFIG_NET_CLS_ACT
257 switch (result) {
258 case TC_ACT_QUEUED:
259 case TC_ACT_STOLEN:
260 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
261 case TC_ACT_SHOT:
262 return NULL;
263 case TC_ACT_RECLASSIFY:
264 return cbq_reclassify(skb, cl);
265 }
266 #endif
267 if (cl->level == 0)
268 return cl;
269
270 /*
271 * Step 3+n. If classifier selected a link sharing class,
272 * apply agency specific classifier.
273 * Repeat this procdure until we hit a leaf node.
274 */
275 head = cl;
276 }
277
278 fallback:
279 cl = head;
280
281 /*
282 * Step 4. No success...
283 */
284 if (TC_H_MAJ(prio) == 0 &&
285 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
286 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
287 return head;
288
289 return cl;
290 }
291
292 /*
293 A packet has just been enqueued on the empty class.
294 cbq_activate_class adds it to the tail of active class list
295 of its priority band.
296 */
297
298 static __inline__ void cbq_activate_class(struct cbq_class *cl)
299 {
300 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
301 int prio = cl->cpriority;
302 struct cbq_class *cl_tail;
303
304 cl_tail = q->active[prio];
305 q->active[prio] = cl;
306
307 if (cl_tail != NULL) {
308 cl->next_alive = cl_tail->next_alive;
309 cl_tail->next_alive = cl;
310 } else {
311 cl->next_alive = cl;
312 q->activemask |= (1<<prio);
313 }
314 }
315
316 /*
317 Unlink class from active chain.
318 Note that this same procedure is done directly in cbq_dequeue*
319 during round-robin procedure.
320 */
321
322 static void cbq_deactivate_class(struct cbq_class *this)
323 {
324 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
325 int prio = this->cpriority;
326 struct cbq_class *cl;
327 struct cbq_class *cl_prev = q->active[prio];
328
329 do {
330 cl = cl_prev->next_alive;
331 if (cl == this) {
332 cl_prev->next_alive = cl->next_alive;
333 cl->next_alive = NULL;
334
335 if (cl == q->active[prio]) {
336 q->active[prio] = cl_prev;
337 if (cl == q->active[prio]) {
338 q->active[prio] = NULL;
339 q->activemask &= ~(1<<prio);
340 return;
341 }
342 }
343 return;
344 }
345 } while ((cl_prev = cl) != q->active[prio]);
346 }
347
348 static void
349 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
350 {
351 int toplevel = q->toplevel;
352
353 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
354 psched_time_t now;
355 psched_tdiff_t incr;
356
357 now = psched_get_time();
358 incr = now - q->now_rt;
359 now = q->now + incr;
360
361 do {
362 if (cl->undertime < now) {
363 q->toplevel = cl->level;
364 return;
365 }
366 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
367 }
368 }
369
370 static int
371 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
372 {
373 struct cbq_sched_data *q = qdisc_priv(sch);
374 int uninitialized_var(ret);
375 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
376
377 #ifdef CONFIG_NET_CLS_ACT
378 q->rx_class = cl;
379 #endif
380 if (cl == NULL) {
381 if (ret & __NET_XMIT_BYPASS)
382 sch->qstats.drops++;
383 kfree_skb(skb);
384 return ret;
385 }
386
387 #ifdef CONFIG_NET_CLS_ACT
388 cl->q->__parent = sch;
389 #endif
390 ret = qdisc_enqueue(skb, cl->q);
391 if (ret == NET_XMIT_SUCCESS) {
392 sch->q.qlen++;
393 cbq_mark_toplevel(q, cl);
394 if (!cl->next_alive)
395 cbq_activate_class(cl);
396 return ret;
397 }
398
399 if (net_xmit_drop_count(ret)) {
400 sch->qstats.drops++;
401 cbq_mark_toplevel(q, cl);
402 cl->qstats.drops++;
403 }
404 return ret;
405 }
406
407 /* Overlimit actions */
408
409 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
410
411 static void cbq_ovl_classic(struct cbq_class *cl)
412 {
413 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
414 psched_tdiff_t delay = cl->undertime - q->now;
415
416 if (!cl->delayed) {
417 delay += cl->offtime;
418
419 /*
420 Class goes to sleep, so that it will have no
421 chance to work avgidle. Let's forgive it 8)
422
423 BTW cbq-2.0 has a crap in this
424 place, apparently they forgot to shift it by cl->ewma_log.
425 */
426 if (cl->avgidle < 0)
427 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
428 if (cl->avgidle < cl->minidle)
429 cl->avgidle = cl->minidle;
430 if (delay <= 0)
431 delay = 1;
432 cl->undertime = q->now + delay;
433
434 cl->xstats.overactions++;
435 cl->delayed = 1;
436 }
437 if (q->wd_expires == 0 || q->wd_expires > delay)
438 q->wd_expires = delay;
439
440 /* Dirty work! We must schedule wakeups based on
441 real available rate, rather than leaf rate,
442 which may be tiny (even zero).
443 */
444 if (q->toplevel == TC_CBQ_MAXLEVEL) {
445 struct cbq_class *b;
446 psched_tdiff_t base_delay = q->wd_expires;
447
448 for (b = cl->borrow; b; b = b->borrow) {
449 delay = b->undertime - q->now;
450 if (delay < base_delay) {
451 if (delay <= 0)
452 delay = 1;
453 base_delay = delay;
454 }
455 }
456
457 q->wd_expires = base_delay;
458 }
459 }
460
461 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
462 they go overlimit
463 */
464
465 static void cbq_ovl_rclassic(struct cbq_class *cl)
466 {
467 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
468 struct cbq_class *this = cl;
469
470 do {
471 if (cl->level > q->toplevel) {
472 cl = NULL;
473 break;
474 }
475 } while ((cl = cl->borrow) != NULL);
476
477 if (cl == NULL)
478 cl = this;
479 cbq_ovl_classic(cl);
480 }
481
482 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
483
484 static void cbq_ovl_delay(struct cbq_class *cl)
485 {
486 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
487 psched_tdiff_t delay = cl->undertime - q->now;
488
489 if (test_bit(__QDISC_STATE_DEACTIVATED,
490 &qdisc_root_sleeping(cl->qdisc)->state))
491 return;
492
493 if (!cl->delayed) {
494 psched_time_t sched = q->now;
495 ktime_t expires;
496
497 delay += cl->offtime;
498 if (cl->avgidle < 0)
499 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
500 if (cl->avgidle < cl->minidle)
501 cl->avgidle = cl->minidle;
502 cl->undertime = q->now + delay;
503
504 if (delay > 0) {
505 sched += delay + cl->penalty;
506 cl->penalized = sched;
507 cl->cpriority = TC_CBQ_MAXPRIO;
508 q->pmask |= (1<<TC_CBQ_MAXPRIO);
509
510 expires = ktime_set(0, 0);
511 expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
512 if (hrtimer_try_to_cancel(&q->delay_timer) &&
513 ktime_to_ns(ktime_sub(
514 hrtimer_get_expires(&q->delay_timer),
515 expires)) > 0)
516 hrtimer_set_expires(&q->delay_timer, expires);
517 hrtimer_restart(&q->delay_timer);
518 cl->delayed = 1;
519 cl->xstats.overactions++;
520 return;
521 }
522 delay = 1;
523 }
524 if (q->wd_expires == 0 || q->wd_expires > delay)
525 q->wd_expires = delay;
526 }
527
528 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
529
530 static void cbq_ovl_lowprio(struct cbq_class *cl)
531 {
532 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
533
534 cl->penalized = q->now + cl->penalty;
535
536 if (cl->cpriority != cl->priority2) {
537 cl->cpriority = cl->priority2;
538 q->pmask |= (1<<cl->cpriority);
539 cl->xstats.overactions++;
540 }
541 cbq_ovl_classic(cl);
542 }
543
544 /* TC_CBQ_OVL_DROP: penalize class by dropping */
545
546 static void cbq_ovl_drop(struct cbq_class *cl)
547 {
548 if (cl->q->ops->drop)
549 if (cl->q->ops->drop(cl->q))
550 cl->qdisc->q.qlen--;
551 cl->xstats.overactions++;
552 cbq_ovl_classic(cl);
553 }
554
555 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
556 psched_time_t now)
557 {
558 struct cbq_class *cl;
559 struct cbq_class *cl_prev = q->active[prio];
560 psched_time_t sched = now;
561
562 if (cl_prev == NULL)
563 return 0;
564
565 do {
566 cl = cl_prev->next_alive;
567 if (now - cl->penalized > 0) {
568 cl_prev->next_alive = cl->next_alive;
569 cl->next_alive = NULL;
570 cl->cpriority = cl->priority;
571 cl->delayed = 0;
572 cbq_activate_class(cl);
573
574 if (cl == q->active[prio]) {
575 q->active[prio] = cl_prev;
576 if (cl == q->active[prio]) {
577 q->active[prio] = NULL;
578 return 0;
579 }
580 }
581
582 cl = cl_prev->next_alive;
583 } else if (sched - cl->penalized > 0)
584 sched = cl->penalized;
585 } while ((cl_prev = cl) != q->active[prio]);
586
587 return sched - now;
588 }
589
590 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
591 {
592 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
593 delay_timer);
594 struct Qdisc *sch = q->watchdog.qdisc;
595 psched_time_t now;
596 psched_tdiff_t delay = 0;
597 unsigned pmask;
598
599 now = psched_get_time();
600
601 pmask = q->pmask;
602 q->pmask = 0;
603
604 while (pmask) {
605 int prio = ffz(~pmask);
606 psched_tdiff_t tmp;
607
608 pmask &= ~(1<<prio);
609
610 tmp = cbq_undelay_prio(q, prio, now);
611 if (tmp > 0) {
612 q->pmask |= 1<<prio;
613 if (tmp < delay || delay == 0)
614 delay = tmp;
615 }
616 }
617
618 if (delay) {
619 ktime_t time;
620
621 time = ktime_set(0, 0);
622 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
623 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
624 }
625
626 sch->flags &= ~TCQ_F_THROTTLED;
627 __netif_schedule(qdisc_root(sch));
628 return HRTIMER_NORESTART;
629 }
630
631 #ifdef CONFIG_NET_CLS_ACT
632 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
633 {
634 struct Qdisc *sch = child->__parent;
635 struct cbq_sched_data *q = qdisc_priv(sch);
636 struct cbq_class *cl = q->rx_class;
637
638 q->rx_class = NULL;
639
640 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
641 int ret;
642
643 cbq_mark_toplevel(q, cl);
644
645 q->rx_class = cl;
646 cl->q->__parent = sch;
647
648 ret = qdisc_enqueue(skb, cl->q);
649 if (ret == NET_XMIT_SUCCESS) {
650 sch->q.qlen++;
651 if (!cl->next_alive)
652 cbq_activate_class(cl);
653 return 0;
654 }
655 if (net_xmit_drop_count(ret))
656 sch->qstats.drops++;
657 return 0;
658 }
659
660 sch->qstats.drops++;
661 return -1;
662 }
663 #endif
664
665 /*
666 It is mission critical procedure.
667
668 We "regenerate" toplevel cutoff, if transmitting class
669 has backlog and it is not regulated. It is not part of
670 original CBQ description, but looks more reasonable.
671 Probably, it is wrong. This question needs further investigation.
672 */
673
674 static __inline__ void
675 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
676 struct cbq_class *borrowed)
677 {
678 if (cl && q->toplevel >= borrowed->level) {
679 if (cl->q->q.qlen > 1) {
680 do {
681 if (borrowed->undertime == PSCHED_PASTPERFECT) {
682 q->toplevel = borrowed->level;
683 return;
684 }
685 } while ((borrowed=borrowed->borrow) != NULL);
686 }
687 #if 0
688 /* It is not necessary now. Uncommenting it
689 will save CPU cycles, but decrease fairness.
690 */
691 q->toplevel = TC_CBQ_MAXLEVEL;
692 #endif
693 }
694 }
695
696 static void
697 cbq_update(struct cbq_sched_data *q)
698 {
699 struct cbq_class *this = q->tx_class;
700 struct cbq_class *cl = this;
701 int len = q->tx_len;
702
703 q->tx_class = NULL;
704
705 for ( ; cl; cl = cl->share) {
706 long avgidle = cl->avgidle;
707 long idle;
708
709 cl->bstats.packets++;
710 cl->bstats.bytes += len;
711
712 /*
713 (now - last) is total time between packet right edges.
714 (last_pktlen/rate) is "virtual" busy time, so that
715
716 idle = (now - last) - last_pktlen/rate
717 */
718
719 idle = q->now - cl->last;
720 if ((unsigned long)idle > 128*1024*1024) {
721 avgidle = cl->maxidle;
722 } else {
723 idle -= L2T(cl, len);
724
725 /* true_avgidle := (1-W)*true_avgidle + W*idle,
726 where W=2^{-ewma_log}. But cl->avgidle is scaled:
727 cl->avgidle == true_avgidle/W,
728 hence:
729 */
730 avgidle += idle - (avgidle>>cl->ewma_log);
731 }
732
733 if (avgidle <= 0) {
734 /* Overlimit or at-limit */
735
736 if (avgidle < cl->minidle)
737 avgidle = cl->minidle;
738
739 cl->avgidle = avgidle;
740
741 /* Calculate expected time, when this class
742 will be allowed to send.
743 It will occur, when:
744 (1-W)*true_avgidle + W*delay = 0, i.e.
745 idle = (1/W - 1)*(-true_avgidle)
746 or
747 idle = (1 - W)*(-cl->avgidle);
748 */
749 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
750
751 /*
752 That is not all.
753 To maintain the rate allocated to the class,
754 we add to undertime virtual clock,
755 necessary to complete transmitted packet.
756 (len/phys_bandwidth has been already passed
757 to the moment of cbq_update)
758 */
759
760 idle -= L2T(&q->link, len);
761 idle += L2T(cl, len);
762
763 cl->undertime = q->now + idle;
764 } else {
765 /* Underlimit */
766
767 cl->undertime = PSCHED_PASTPERFECT;
768 if (avgidle > cl->maxidle)
769 cl->avgidle = cl->maxidle;
770 else
771 cl->avgidle = avgidle;
772 }
773 cl->last = q->now;
774 }
775
776 cbq_update_toplevel(q, this, q->tx_borrowed);
777 }
778
779 static __inline__ struct cbq_class *
780 cbq_under_limit(struct cbq_class *cl)
781 {
782 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
783 struct cbq_class *this_cl = cl;
784
785 if (cl->tparent == NULL)
786 return cl;
787
788 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
789 cl->delayed = 0;
790 return cl;
791 }
792
793 do {
794 /* It is very suspicious place. Now overlimit
795 action is generated for not bounded classes
796 only if link is completely congested.
797 Though it is in agree with ancestor-only paradigm,
798 it looks very stupid. Particularly,
799 it means that this chunk of code will either
800 never be called or result in strong amplification
801 of burstiness. Dangerous, silly, and, however,
802 no another solution exists.
803 */
804 if ((cl = cl->borrow) == NULL) {
805 this_cl->qstats.overlimits++;
806 this_cl->overlimit(this_cl);
807 return NULL;
808 }
809 if (cl->level > q->toplevel)
810 return NULL;
811 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
812
813 cl->delayed = 0;
814 return cl;
815 }
816
817 static __inline__ struct sk_buff *
818 cbq_dequeue_prio(struct Qdisc *sch, int prio)
819 {
820 struct cbq_sched_data *q = qdisc_priv(sch);
821 struct cbq_class *cl_tail, *cl_prev, *cl;
822 struct sk_buff *skb;
823 int deficit;
824
825 cl_tail = cl_prev = q->active[prio];
826 cl = cl_prev->next_alive;
827
828 do {
829 deficit = 0;
830
831 /* Start round */
832 do {
833 struct cbq_class *borrow = cl;
834
835 if (cl->q->q.qlen &&
836 (borrow = cbq_under_limit(cl)) == NULL)
837 goto skip_class;
838
839 if (cl->deficit <= 0) {
840 /* Class exhausted its allotment per
841 this round. Switch to the next one.
842 */
843 deficit = 1;
844 cl->deficit += cl->quantum;
845 goto next_class;
846 }
847
848 skb = cl->q->dequeue(cl->q);
849
850 /* Class did not give us any skb :-(
851 It could occur even if cl->q->q.qlen != 0
852 f.e. if cl->q == "tbf"
853 */
854 if (skb == NULL)
855 goto skip_class;
856
857 cl->deficit -= qdisc_pkt_len(skb);
858 q->tx_class = cl;
859 q->tx_borrowed = borrow;
860 if (borrow != cl) {
861 #ifndef CBQ_XSTATS_BORROWS_BYTES
862 borrow->xstats.borrows++;
863 cl->xstats.borrows++;
864 #else
865 borrow->xstats.borrows += qdisc_pkt_len(skb);
866 cl->xstats.borrows += qdisc_pkt_len(skb);
867 #endif
868 }
869 q->tx_len = qdisc_pkt_len(skb);
870
871 if (cl->deficit <= 0) {
872 q->active[prio] = cl;
873 cl = cl->next_alive;
874 cl->deficit += cl->quantum;
875 }
876 return skb;
877
878 skip_class:
879 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
880 /* Class is empty or penalized.
881 Unlink it from active chain.
882 */
883 cl_prev->next_alive = cl->next_alive;
884 cl->next_alive = NULL;
885
886 /* Did cl_tail point to it? */
887 if (cl == cl_tail) {
888 /* Repair it! */
889 cl_tail = cl_prev;
890
891 /* Was it the last class in this band? */
892 if (cl == cl_tail) {
893 /* Kill the band! */
894 q->active[prio] = NULL;
895 q->activemask &= ~(1<<prio);
896 if (cl->q->q.qlen)
897 cbq_activate_class(cl);
898 return NULL;
899 }
900
901 q->active[prio] = cl_tail;
902 }
903 if (cl->q->q.qlen)
904 cbq_activate_class(cl);
905
906 cl = cl_prev;
907 }
908
909 next_class:
910 cl_prev = cl;
911 cl = cl->next_alive;
912 } while (cl_prev != cl_tail);
913 } while (deficit);
914
915 q->active[prio] = cl_prev;
916
917 return NULL;
918 }
919
920 static __inline__ struct sk_buff *
921 cbq_dequeue_1(struct Qdisc *sch)
922 {
923 struct cbq_sched_data *q = qdisc_priv(sch);
924 struct sk_buff *skb;
925 unsigned activemask;
926
927 activemask = q->activemask&0xFF;
928 while (activemask) {
929 int prio = ffz(~activemask);
930 activemask &= ~(1<<prio);
931 skb = cbq_dequeue_prio(sch, prio);
932 if (skb)
933 return skb;
934 }
935 return NULL;
936 }
937
938 static struct sk_buff *
939 cbq_dequeue(struct Qdisc *sch)
940 {
941 struct sk_buff *skb;
942 struct cbq_sched_data *q = qdisc_priv(sch);
943 psched_time_t now;
944 psched_tdiff_t incr;
945
946 now = psched_get_time();
947 incr = now - q->now_rt;
948
949 if (q->tx_class) {
950 psched_tdiff_t incr2;
951 /* Time integrator. We calculate EOS time
952 by adding expected packet transmission time.
953 If real time is greater, we warp artificial clock,
954 so that:
955
956 cbq_time = max(real_time, work);
957 */
958 incr2 = L2T(&q->link, q->tx_len);
959 q->now += incr2;
960 cbq_update(q);
961 if ((incr -= incr2) < 0)
962 incr = 0;
963 }
964 q->now += incr;
965 q->now_rt = now;
966
967 for (;;) {
968 q->wd_expires = 0;
969
970 skb = cbq_dequeue_1(sch);
971 if (skb) {
972 qdisc_bstats_update(sch, skb);
973 sch->q.qlen--;
974 sch->flags &= ~TCQ_F_THROTTLED;
975 return skb;
976 }
977
978 /* All the classes are overlimit.
979
980 It is possible, if:
981
982 1. Scheduler is empty.
983 2. Toplevel cutoff inhibited borrowing.
984 3. Root class is overlimit.
985
986 Reset 2d and 3d conditions and retry.
987
988 Note, that NS and cbq-2.0 are buggy, peeking
989 an arbitrary class is appropriate for ancestor-only
990 sharing, but not for toplevel algorithm.
991
992 Our version is better, but slower, because it requires
993 two passes, but it is unavoidable with top-level sharing.
994 */
995
996 if (q->toplevel == TC_CBQ_MAXLEVEL &&
997 q->link.undertime == PSCHED_PASTPERFECT)
998 break;
999
1000 q->toplevel = TC_CBQ_MAXLEVEL;
1001 q->link.undertime = PSCHED_PASTPERFECT;
1002 }
1003
1004 /* No packets in scheduler or nobody wants to give them to us :-(
1005 Sigh... start watchdog timer in the last case. */
1006
1007 if (sch->q.qlen) {
1008 sch->qstats.overlimits++;
1009 if (q->wd_expires)
1010 qdisc_watchdog_schedule(&q->watchdog,
1011 now + q->wd_expires);
1012 }
1013 return NULL;
1014 }
1015
1016 /* CBQ class maintanance routines */
1017
1018 static void cbq_adjust_levels(struct cbq_class *this)
1019 {
1020 if (this == NULL)
1021 return;
1022
1023 do {
1024 int level = 0;
1025 struct cbq_class *cl;
1026
1027 if ((cl = this->children) != NULL) {
1028 do {
1029 if (cl->level > level)
1030 level = cl->level;
1031 } while ((cl = cl->sibling) != this->children);
1032 }
1033 this->level = level+1;
1034 } while ((this = this->tparent) != NULL);
1035 }
1036
1037 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1038 {
1039 struct cbq_class *cl;
1040 struct hlist_node *n;
1041 unsigned int h;
1042
1043 if (q->quanta[prio] == 0)
1044 return;
1045
1046 for (h = 0; h < q->clhash.hashsize; h++) {
1047 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1048 /* BUGGGG... Beware! This expression suffer of
1049 arithmetic overflows!
1050 */
1051 if (cl->priority == prio) {
1052 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1053 q->quanta[prio];
1054 }
1055 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1056 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1057 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1058 }
1059 }
1060 }
1061 }
1062
1063 static void cbq_sync_defmap(struct cbq_class *cl)
1064 {
1065 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1066 struct cbq_class *split = cl->split;
1067 unsigned h;
1068 int i;
1069
1070 if (split == NULL)
1071 return;
1072
1073 for (i=0; i<=TC_PRIO_MAX; i++) {
1074 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1075 split->defaults[i] = NULL;
1076 }
1077
1078 for (i=0; i<=TC_PRIO_MAX; i++) {
1079 int level = split->level;
1080
1081 if (split->defaults[i])
1082 continue;
1083
1084 for (h = 0; h < q->clhash.hashsize; h++) {
1085 struct hlist_node *n;
1086 struct cbq_class *c;
1087
1088 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1089 common.hnode) {
1090 if (c->split == split && c->level < level &&
1091 c->defmap&(1<<i)) {
1092 split->defaults[i] = c;
1093 level = c->level;
1094 }
1095 }
1096 }
1097 }
1098 }
1099
1100 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1101 {
1102 struct cbq_class *split = NULL;
1103
1104 if (splitid == 0) {
1105 if ((split = cl->split) == NULL)
1106 return;
1107 splitid = split->common.classid;
1108 }
1109
1110 if (split == NULL || split->common.classid != splitid) {
1111 for (split = cl->tparent; split; split = split->tparent)
1112 if (split->common.classid == splitid)
1113 break;
1114 }
1115
1116 if (split == NULL)
1117 return;
1118
1119 if (cl->split != split) {
1120 cl->defmap = 0;
1121 cbq_sync_defmap(cl);
1122 cl->split = split;
1123 cl->defmap = def&mask;
1124 } else
1125 cl->defmap = (cl->defmap&~mask)|(def&mask);
1126
1127 cbq_sync_defmap(cl);
1128 }
1129
1130 static void cbq_unlink_class(struct cbq_class *this)
1131 {
1132 struct cbq_class *cl, **clp;
1133 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1134
1135 qdisc_class_hash_remove(&q->clhash, &this->common);
1136
1137 if (this->tparent) {
1138 clp=&this->sibling;
1139 cl = *clp;
1140 do {
1141 if (cl == this) {
1142 *clp = cl->sibling;
1143 break;
1144 }
1145 clp = &cl->sibling;
1146 } while ((cl = *clp) != this->sibling);
1147
1148 if (this->tparent->children == this) {
1149 this->tparent->children = this->sibling;
1150 if (this->sibling == this)
1151 this->tparent->children = NULL;
1152 }
1153 } else {
1154 WARN_ON(this->sibling != this);
1155 }
1156 }
1157
1158 static void cbq_link_class(struct cbq_class *this)
1159 {
1160 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1161 struct cbq_class *parent = this->tparent;
1162
1163 this->sibling = this;
1164 qdisc_class_hash_insert(&q->clhash, &this->common);
1165
1166 if (parent == NULL)
1167 return;
1168
1169 if (parent->children == NULL) {
1170 parent->children = this;
1171 } else {
1172 this->sibling = parent->children->sibling;
1173 parent->children->sibling = this;
1174 }
1175 }
1176
1177 static unsigned int cbq_drop(struct Qdisc* sch)
1178 {
1179 struct cbq_sched_data *q = qdisc_priv(sch);
1180 struct cbq_class *cl, *cl_head;
1181 int prio;
1182 unsigned int len;
1183
1184 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1185 if ((cl_head = q->active[prio]) == NULL)
1186 continue;
1187
1188 cl = cl_head;
1189 do {
1190 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1191 sch->q.qlen--;
1192 if (!cl->q->q.qlen)
1193 cbq_deactivate_class(cl);
1194 return len;
1195 }
1196 } while ((cl = cl->next_alive) != cl_head);
1197 }
1198 return 0;
1199 }
1200
1201 static void
1202 cbq_reset(struct Qdisc* sch)
1203 {
1204 struct cbq_sched_data *q = qdisc_priv(sch);
1205 struct cbq_class *cl;
1206 struct hlist_node *n;
1207 int prio;
1208 unsigned h;
1209
1210 q->activemask = 0;
1211 q->pmask = 0;
1212 q->tx_class = NULL;
1213 q->tx_borrowed = NULL;
1214 qdisc_watchdog_cancel(&q->watchdog);
1215 hrtimer_cancel(&q->delay_timer);
1216 q->toplevel = TC_CBQ_MAXLEVEL;
1217 q->now = psched_get_time();
1218 q->now_rt = q->now;
1219
1220 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1221 q->active[prio] = NULL;
1222
1223 for (h = 0; h < q->clhash.hashsize; h++) {
1224 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1225 qdisc_reset(cl->q);
1226
1227 cl->next_alive = NULL;
1228 cl->undertime = PSCHED_PASTPERFECT;
1229 cl->avgidle = cl->maxidle;
1230 cl->deficit = cl->quantum;
1231 cl->cpriority = cl->priority;
1232 }
1233 }
1234 sch->q.qlen = 0;
1235 }
1236
1237
1238 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1239 {
1240 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1241 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1242 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1243 }
1244 if (lss->change&TCF_CBQ_LSS_EWMA)
1245 cl->ewma_log = lss->ewma_log;
1246 if (lss->change&TCF_CBQ_LSS_AVPKT)
1247 cl->avpkt = lss->avpkt;
1248 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1249 cl->minidle = -(long)lss->minidle;
1250 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1251 cl->maxidle = lss->maxidle;
1252 cl->avgidle = lss->maxidle;
1253 }
1254 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1255 cl->offtime = lss->offtime;
1256 return 0;
1257 }
1258
1259 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1260 {
1261 q->nclasses[cl->priority]--;
1262 q->quanta[cl->priority] -= cl->weight;
1263 cbq_normalize_quanta(q, cl->priority);
1264 }
1265
1266 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1267 {
1268 q->nclasses[cl->priority]++;
1269 q->quanta[cl->priority] += cl->weight;
1270 cbq_normalize_quanta(q, cl->priority);
1271 }
1272
1273 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1274 {
1275 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1276
1277 if (wrr->allot)
1278 cl->allot = wrr->allot;
1279 if (wrr->weight)
1280 cl->weight = wrr->weight;
1281 if (wrr->priority) {
1282 cl->priority = wrr->priority-1;
1283 cl->cpriority = cl->priority;
1284 if (cl->priority >= cl->priority2)
1285 cl->priority2 = TC_CBQ_MAXPRIO-1;
1286 }
1287
1288 cbq_addprio(q, cl);
1289 return 0;
1290 }
1291
1292 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1293 {
1294 switch (ovl->strategy) {
1295 case TC_CBQ_OVL_CLASSIC:
1296 cl->overlimit = cbq_ovl_classic;
1297 break;
1298 case TC_CBQ_OVL_DELAY:
1299 cl->overlimit = cbq_ovl_delay;
1300 break;
1301 case TC_CBQ_OVL_LOWPRIO:
1302 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1303 ovl->priority2-1 <= cl->priority)
1304 return -EINVAL;
1305 cl->priority2 = ovl->priority2-1;
1306 cl->overlimit = cbq_ovl_lowprio;
1307 break;
1308 case TC_CBQ_OVL_DROP:
1309 cl->overlimit = cbq_ovl_drop;
1310 break;
1311 case TC_CBQ_OVL_RCLASSIC:
1312 cl->overlimit = cbq_ovl_rclassic;
1313 break;
1314 default:
1315 return -EINVAL;
1316 }
1317 cl->penalty = ovl->penalty;
1318 return 0;
1319 }
1320
1321 #ifdef CONFIG_NET_CLS_ACT
1322 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1323 {
1324 cl->police = p->police;
1325
1326 if (cl->q->handle) {
1327 if (p->police == TC_POLICE_RECLASSIFY)
1328 cl->q->reshape_fail = cbq_reshape_fail;
1329 else
1330 cl->q->reshape_fail = NULL;
1331 }
1332 return 0;
1333 }
1334 #endif
1335
1336 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1337 {
1338 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1339 return 0;
1340 }
1341
1342 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1343 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1344 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1345 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1346 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1347 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1348 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1349 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1350 };
1351
1352 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1353 {
1354 struct cbq_sched_data *q = qdisc_priv(sch);
1355 struct nlattr *tb[TCA_CBQ_MAX + 1];
1356 struct tc_ratespec *r;
1357 int err;
1358
1359 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1360 if (err < 0)
1361 return err;
1362
1363 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1364 return -EINVAL;
1365
1366 r = nla_data(tb[TCA_CBQ_RATE]);
1367
1368 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1369 return -EINVAL;
1370
1371 err = qdisc_class_hash_init(&q->clhash);
1372 if (err < 0)
1373 goto put_rtab;
1374
1375 q->link.refcnt = 1;
1376 q->link.sibling = &q->link;
1377 q->link.common.classid = sch->handle;
1378 q->link.qdisc = sch;
1379 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1380 sch->handle);
1381 if (!q->link.q)
1382 q->link.q = &noop_qdisc;
1383
1384 q->link.priority = TC_CBQ_MAXPRIO-1;
1385 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1386 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1387 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1388 q->link.overlimit = cbq_ovl_classic;
1389 q->link.allot = psched_mtu(qdisc_dev(sch));
1390 q->link.quantum = q->link.allot;
1391 q->link.weight = q->link.R_tab->rate.rate;
1392
1393 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1394 q->link.avpkt = q->link.allot/2;
1395 q->link.minidle = -0x7FFFFFFF;
1396
1397 qdisc_watchdog_init(&q->watchdog, sch);
1398 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1399 q->delay_timer.function = cbq_undelay;
1400 q->toplevel = TC_CBQ_MAXLEVEL;
1401 q->now = psched_get_time();
1402 q->now_rt = q->now;
1403
1404 cbq_link_class(&q->link);
1405
1406 if (tb[TCA_CBQ_LSSOPT])
1407 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1408
1409 cbq_addprio(q, &q->link);
1410 return 0;
1411
1412 put_rtab:
1413 qdisc_put_rtab(q->link.R_tab);
1414 return err;
1415 }
1416
1417 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1418 {
1419 unsigned char *b = skb_tail_pointer(skb);
1420
1421 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1422 return skb->len;
1423
1424 nla_put_failure:
1425 nlmsg_trim(skb, b);
1426 return -1;
1427 }
1428
1429 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1430 {
1431 unsigned char *b = skb_tail_pointer(skb);
1432 struct tc_cbq_lssopt opt;
1433
1434 opt.flags = 0;
1435 if (cl->borrow == NULL)
1436 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1437 if (cl->share == NULL)
1438 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1439 opt.ewma_log = cl->ewma_log;
1440 opt.level = cl->level;
1441 opt.avpkt = cl->avpkt;
1442 opt.maxidle = cl->maxidle;
1443 opt.minidle = (u32)(-cl->minidle);
1444 opt.offtime = cl->offtime;
1445 opt.change = ~0;
1446 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1447 return skb->len;
1448
1449 nla_put_failure:
1450 nlmsg_trim(skb, b);
1451 return -1;
1452 }
1453
1454 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1455 {
1456 unsigned char *b = skb_tail_pointer(skb);
1457 struct tc_cbq_wrropt opt;
1458
1459 opt.flags = 0;
1460 opt.allot = cl->allot;
1461 opt.priority = cl->priority+1;
1462 opt.cpriority = cl->cpriority+1;
1463 opt.weight = cl->weight;
1464 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1465 return skb->len;
1466
1467 nla_put_failure:
1468 nlmsg_trim(skb, b);
1469 return -1;
1470 }
1471
1472 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1473 {
1474 unsigned char *b = skb_tail_pointer(skb);
1475 struct tc_cbq_ovl opt;
1476
1477 opt.strategy = cl->ovl_strategy;
1478 opt.priority2 = cl->priority2+1;
1479 opt.pad = 0;
1480 opt.penalty = cl->penalty;
1481 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1482 return skb->len;
1483
1484 nla_put_failure:
1485 nlmsg_trim(skb, b);
1486 return -1;
1487 }
1488
1489 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1490 {
1491 unsigned char *b = skb_tail_pointer(skb);
1492 struct tc_cbq_fopt opt;
1493
1494 if (cl->split || cl->defmap) {
1495 opt.split = cl->split ? cl->split->common.classid : 0;
1496 opt.defmap = cl->defmap;
1497 opt.defchange = ~0;
1498 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1499 }
1500 return skb->len;
1501
1502 nla_put_failure:
1503 nlmsg_trim(skb, b);
1504 return -1;
1505 }
1506
1507 #ifdef CONFIG_NET_CLS_ACT
1508 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1509 {
1510 unsigned char *b = skb_tail_pointer(skb);
1511 struct tc_cbq_police opt;
1512
1513 if (cl->police) {
1514 opt.police = cl->police;
1515 opt.__res1 = 0;
1516 opt.__res2 = 0;
1517 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1518 }
1519 return skb->len;
1520
1521 nla_put_failure:
1522 nlmsg_trim(skb, b);
1523 return -1;
1524 }
1525 #endif
1526
1527 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1528 {
1529 if (cbq_dump_lss(skb, cl) < 0 ||
1530 cbq_dump_rate(skb, cl) < 0 ||
1531 cbq_dump_wrr(skb, cl) < 0 ||
1532 cbq_dump_ovl(skb, cl) < 0 ||
1533 #ifdef CONFIG_NET_CLS_ACT
1534 cbq_dump_police(skb, cl) < 0 ||
1535 #endif
1536 cbq_dump_fopt(skb, cl) < 0)
1537 return -1;
1538 return 0;
1539 }
1540
1541 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1542 {
1543 struct cbq_sched_data *q = qdisc_priv(sch);
1544 struct nlattr *nest;
1545
1546 nest = nla_nest_start(skb, TCA_OPTIONS);
1547 if (nest == NULL)
1548 goto nla_put_failure;
1549 if (cbq_dump_attr(skb, &q->link) < 0)
1550 goto nla_put_failure;
1551 nla_nest_end(skb, nest);
1552 return skb->len;
1553
1554 nla_put_failure:
1555 nla_nest_cancel(skb, nest);
1556 return -1;
1557 }
1558
1559 static int
1560 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1561 {
1562 struct cbq_sched_data *q = qdisc_priv(sch);
1563
1564 q->link.xstats.avgidle = q->link.avgidle;
1565 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1566 }
1567
1568 static int
1569 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1570 struct sk_buff *skb, struct tcmsg *tcm)
1571 {
1572 struct cbq_class *cl = (struct cbq_class*)arg;
1573 struct nlattr *nest;
1574
1575 if (cl->tparent)
1576 tcm->tcm_parent = cl->tparent->common.classid;
1577 else
1578 tcm->tcm_parent = TC_H_ROOT;
1579 tcm->tcm_handle = cl->common.classid;
1580 tcm->tcm_info = cl->q->handle;
1581
1582 nest = nla_nest_start(skb, TCA_OPTIONS);
1583 if (nest == NULL)
1584 goto nla_put_failure;
1585 if (cbq_dump_attr(skb, cl) < 0)
1586 goto nla_put_failure;
1587 nla_nest_end(skb, nest);
1588 return skb->len;
1589
1590 nla_put_failure:
1591 nla_nest_cancel(skb, nest);
1592 return -1;
1593 }
1594
1595 static int
1596 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1597 struct gnet_dump *d)
1598 {
1599 struct cbq_sched_data *q = qdisc_priv(sch);
1600 struct cbq_class *cl = (struct cbq_class*)arg;
1601
1602 cl->qstats.qlen = cl->q->q.qlen;
1603 cl->xstats.avgidle = cl->avgidle;
1604 cl->xstats.undertime = 0;
1605
1606 if (cl->undertime != PSCHED_PASTPERFECT)
1607 cl->xstats.undertime = cl->undertime - q->now;
1608
1609 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1610 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1611 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1612 return -1;
1613
1614 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1615 }
1616
1617 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1618 struct Qdisc **old)
1619 {
1620 struct cbq_class *cl = (struct cbq_class*)arg;
1621
1622 if (new == NULL) {
1623 new = qdisc_create_dflt(sch->dev_queue,
1624 &pfifo_qdisc_ops, cl->common.classid);
1625 if (new == NULL)
1626 return -ENOBUFS;
1627 } else {
1628 #ifdef CONFIG_NET_CLS_ACT
1629 if (cl->police == TC_POLICE_RECLASSIFY)
1630 new->reshape_fail = cbq_reshape_fail;
1631 #endif
1632 }
1633 sch_tree_lock(sch);
1634 *old = cl->q;
1635 cl->q = new;
1636 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1637 qdisc_reset(*old);
1638 sch_tree_unlock(sch);
1639
1640 return 0;
1641 }
1642
1643 static struct Qdisc *
1644 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1645 {
1646 struct cbq_class *cl = (struct cbq_class*)arg;
1647
1648 return cl->q;
1649 }
1650
1651 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1652 {
1653 struct cbq_class *cl = (struct cbq_class *)arg;
1654
1655 if (cl->q->q.qlen == 0)
1656 cbq_deactivate_class(cl);
1657 }
1658
1659 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1660 {
1661 struct cbq_sched_data *q = qdisc_priv(sch);
1662 struct cbq_class *cl = cbq_class_lookup(q, classid);
1663
1664 if (cl) {
1665 cl->refcnt++;
1666 return (unsigned long)cl;
1667 }
1668 return 0;
1669 }
1670
1671 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1672 {
1673 struct cbq_sched_data *q = qdisc_priv(sch);
1674
1675 WARN_ON(cl->filters);
1676
1677 tcf_destroy_chain(&cl->filter_list);
1678 qdisc_destroy(cl->q);
1679 qdisc_put_rtab(cl->R_tab);
1680 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1681 if (cl != &q->link)
1682 kfree(cl);
1683 }
1684
1685 static void
1686 cbq_destroy(struct Qdisc* sch)
1687 {
1688 struct cbq_sched_data *q = qdisc_priv(sch);
1689 struct hlist_node *n, *next;
1690 struct cbq_class *cl;
1691 unsigned h;
1692
1693 #ifdef CONFIG_NET_CLS_ACT
1694 q->rx_class = NULL;
1695 #endif
1696 /*
1697 * Filters must be destroyed first because we don't destroy the
1698 * classes from root to leafs which means that filters can still
1699 * be bound to classes which have been destroyed already. --TGR '04
1700 */
1701 for (h = 0; h < q->clhash.hashsize; h++) {
1702 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1703 tcf_destroy_chain(&cl->filter_list);
1704 }
1705 for (h = 0; h < q->clhash.hashsize; h++) {
1706 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1707 common.hnode)
1708 cbq_destroy_class(sch, cl);
1709 }
1710 qdisc_class_hash_destroy(&q->clhash);
1711 }
1712
1713 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1714 {
1715 struct cbq_class *cl = (struct cbq_class*)arg;
1716
1717 if (--cl->refcnt == 0) {
1718 #ifdef CONFIG_NET_CLS_ACT
1719 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1720 struct cbq_sched_data *q = qdisc_priv(sch);
1721
1722 spin_lock_bh(root_lock);
1723 if (q->rx_class == cl)
1724 q->rx_class = NULL;
1725 spin_unlock_bh(root_lock);
1726 #endif
1727
1728 cbq_destroy_class(sch, cl);
1729 }
1730 }
1731
1732 static int
1733 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1734 unsigned long *arg)
1735 {
1736 int err;
1737 struct cbq_sched_data *q = qdisc_priv(sch);
1738 struct cbq_class *cl = (struct cbq_class*)*arg;
1739 struct nlattr *opt = tca[TCA_OPTIONS];
1740 struct nlattr *tb[TCA_CBQ_MAX + 1];
1741 struct cbq_class *parent;
1742 struct qdisc_rate_table *rtab = NULL;
1743
1744 if (opt == NULL)
1745 return -EINVAL;
1746
1747 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1748 if (err < 0)
1749 return err;
1750
1751 if (cl) {
1752 /* Check parent */
1753 if (parentid) {
1754 if (cl->tparent &&
1755 cl->tparent->common.classid != parentid)
1756 return -EINVAL;
1757 if (!cl->tparent && parentid != TC_H_ROOT)
1758 return -EINVAL;
1759 }
1760
1761 if (tb[TCA_CBQ_RATE]) {
1762 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1763 tb[TCA_CBQ_RTAB]);
1764 if (rtab == NULL)
1765 return -EINVAL;
1766 }
1767
1768 if (tca[TCA_RATE]) {
1769 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1770 qdisc_root_sleeping_lock(sch),
1771 tca[TCA_RATE]);
1772 if (err) {
1773 if (rtab)
1774 qdisc_put_rtab(rtab);
1775 return err;
1776 }
1777 }
1778
1779 /* Change class parameters */
1780 sch_tree_lock(sch);
1781
1782 if (cl->next_alive != NULL)
1783 cbq_deactivate_class(cl);
1784
1785 if (rtab) {
1786 qdisc_put_rtab(cl->R_tab);
1787 cl->R_tab = rtab;
1788 }
1789
1790 if (tb[TCA_CBQ_LSSOPT])
1791 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1792
1793 if (tb[TCA_CBQ_WRROPT]) {
1794 cbq_rmprio(q, cl);
1795 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1796 }
1797
1798 if (tb[TCA_CBQ_OVL_STRATEGY])
1799 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1800
1801 #ifdef CONFIG_NET_CLS_ACT
1802 if (tb[TCA_CBQ_POLICE])
1803 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1804 #endif
1805
1806 if (tb[TCA_CBQ_FOPT])
1807 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1808
1809 if (cl->q->q.qlen)
1810 cbq_activate_class(cl);
1811
1812 sch_tree_unlock(sch);
1813
1814 return 0;
1815 }
1816
1817 if (parentid == TC_H_ROOT)
1818 return -EINVAL;
1819
1820 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1821 tb[TCA_CBQ_LSSOPT] == NULL)
1822 return -EINVAL;
1823
1824 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1825 if (rtab == NULL)
1826 return -EINVAL;
1827
1828 if (classid) {
1829 err = -EINVAL;
1830 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1831 goto failure;
1832 } else {
1833 int i;
1834 classid = TC_H_MAKE(sch->handle,0x8000);
1835
1836 for (i=0; i<0x8000; i++) {
1837 if (++q->hgenerator >= 0x8000)
1838 q->hgenerator = 1;
1839 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1840 break;
1841 }
1842 err = -ENOSR;
1843 if (i >= 0x8000)
1844 goto failure;
1845 classid = classid|q->hgenerator;
1846 }
1847
1848 parent = &q->link;
1849 if (parentid) {
1850 parent = cbq_class_lookup(q, parentid);
1851 err = -EINVAL;
1852 if (parent == NULL)
1853 goto failure;
1854 }
1855
1856 err = -ENOBUFS;
1857 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1858 if (cl == NULL)
1859 goto failure;
1860
1861 if (tca[TCA_RATE]) {
1862 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1863 qdisc_root_sleeping_lock(sch),
1864 tca[TCA_RATE]);
1865 if (err) {
1866 kfree(cl);
1867 goto failure;
1868 }
1869 }
1870
1871 cl->R_tab = rtab;
1872 rtab = NULL;
1873 cl->refcnt = 1;
1874 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1875 if (!cl->q)
1876 cl->q = &noop_qdisc;
1877 cl->common.classid = classid;
1878 cl->tparent = parent;
1879 cl->qdisc = sch;
1880 cl->allot = parent->allot;
1881 cl->quantum = cl->allot;
1882 cl->weight = cl->R_tab->rate.rate;
1883
1884 sch_tree_lock(sch);
1885 cbq_link_class(cl);
1886 cl->borrow = cl->tparent;
1887 if (cl->tparent != &q->link)
1888 cl->share = cl->tparent;
1889 cbq_adjust_levels(parent);
1890 cl->minidle = -0x7FFFFFFF;
1891 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1892 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1893 if (cl->ewma_log==0)
1894 cl->ewma_log = q->link.ewma_log;
1895 if (cl->maxidle==0)
1896 cl->maxidle = q->link.maxidle;
1897 if (cl->avpkt==0)
1898 cl->avpkt = q->link.avpkt;
1899 cl->overlimit = cbq_ovl_classic;
1900 if (tb[TCA_CBQ_OVL_STRATEGY])
1901 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1902 #ifdef CONFIG_NET_CLS_ACT
1903 if (tb[TCA_CBQ_POLICE])
1904 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1905 #endif
1906 if (tb[TCA_CBQ_FOPT])
1907 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1908 sch_tree_unlock(sch);
1909
1910 qdisc_class_hash_grow(sch, &q->clhash);
1911
1912 *arg = (unsigned long)cl;
1913 return 0;
1914
1915 failure:
1916 qdisc_put_rtab(rtab);
1917 return err;
1918 }
1919
1920 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1921 {
1922 struct cbq_sched_data *q = qdisc_priv(sch);
1923 struct cbq_class *cl = (struct cbq_class*)arg;
1924 unsigned int qlen;
1925
1926 if (cl->filters || cl->children || cl == &q->link)
1927 return -EBUSY;
1928
1929 sch_tree_lock(sch);
1930
1931 qlen = cl->q->q.qlen;
1932 qdisc_reset(cl->q);
1933 qdisc_tree_decrease_qlen(cl->q, qlen);
1934
1935 if (cl->next_alive)
1936 cbq_deactivate_class(cl);
1937
1938 if (q->tx_borrowed == cl)
1939 q->tx_borrowed = q->tx_class;
1940 if (q->tx_class == cl) {
1941 q->tx_class = NULL;
1942 q->tx_borrowed = NULL;
1943 }
1944 #ifdef CONFIG_NET_CLS_ACT
1945 if (q->rx_class == cl)
1946 q->rx_class = NULL;
1947 #endif
1948
1949 cbq_unlink_class(cl);
1950 cbq_adjust_levels(cl->tparent);
1951 cl->defmap = 0;
1952 cbq_sync_defmap(cl);
1953
1954 cbq_rmprio(q, cl);
1955 sch_tree_unlock(sch);
1956
1957 BUG_ON(--cl->refcnt == 0);
1958 /*
1959 * This shouldn't happen: we "hold" one cops->get() when called
1960 * from tc_ctl_tclass; the destroy method is done from cops->put().
1961 */
1962
1963 return 0;
1964 }
1965
1966 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1967 {
1968 struct cbq_sched_data *q = qdisc_priv(sch);
1969 struct cbq_class *cl = (struct cbq_class *)arg;
1970
1971 if (cl == NULL)
1972 cl = &q->link;
1973
1974 return &cl->filter_list;
1975 }
1976
1977 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1978 u32 classid)
1979 {
1980 struct cbq_sched_data *q = qdisc_priv(sch);
1981 struct cbq_class *p = (struct cbq_class*)parent;
1982 struct cbq_class *cl = cbq_class_lookup(q, classid);
1983
1984 if (cl) {
1985 if (p && p->level <= cl->level)
1986 return 0;
1987 cl->filters++;
1988 return (unsigned long)cl;
1989 }
1990 return 0;
1991 }
1992
1993 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1994 {
1995 struct cbq_class *cl = (struct cbq_class*)arg;
1996
1997 cl->filters--;
1998 }
1999
2000 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2001 {
2002 struct cbq_sched_data *q = qdisc_priv(sch);
2003 struct cbq_class *cl;
2004 struct hlist_node *n;
2005 unsigned h;
2006
2007 if (arg->stop)
2008 return;
2009
2010 for (h = 0; h < q->clhash.hashsize; h++) {
2011 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2012 if (arg->count < arg->skip) {
2013 arg->count++;
2014 continue;
2015 }
2016 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2017 arg->stop = 1;
2018 return;
2019 }
2020 arg->count++;
2021 }
2022 }
2023 }
2024
2025 static const struct Qdisc_class_ops cbq_class_ops = {
2026 .graft = cbq_graft,
2027 .leaf = cbq_leaf,
2028 .qlen_notify = cbq_qlen_notify,
2029 .get = cbq_get,
2030 .put = cbq_put,
2031 .change = cbq_change_class,
2032 .delete = cbq_delete,
2033 .walk = cbq_walk,
2034 .tcf_chain = cbq_find_tcf,
2035 .bind_tcf = cbq_bind_filter,
2036 .unbind_tcf = cbq_unbind_filter,
2037 .dump = cbq_dump_class,
2038 .dump_stats = cbq_dump_class_stats,
2039 };
2040
2041 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2042 .next = NULL,
2043 .cl_ops = &cbq_class_ops,
2044 .id = "cbq",
2045 .priv_size = sizeof(struct cbq_sched_data),
2046 .enqueue = cbq_enqueue,
2047 .dequeue = cbq_dequeue,
2048 .peek = qdisc_peek_dequeued,
2049 .drop = cbq_drop,
2050 .init = cbq_init,
2051 .reset = cbq_reset,
2052 .destroy = cbq_destroy,
2053 .change = NULL,
2054 .dump = cbq_dump,
2055 .dump_stats = cbq_dump_stats,
2056 .owner = THIS_MODULE,
2057 };
2058
2059 static int __init cbq_module_init(void)
2060 {
2061 return register_qdisc(&cbq_qdisc_ops);
2062 }
2063 static void __exit cbq_module_exit(void)
2064 {
2065 unregister_qdisc(&cbq_qdisc_ops);
2066 }
2067 module_init(cbq_module_init)
2068 module_exit(cbq_module_exit)
2069 MODULE_LICENSE("GPL");
This page took 0.101214 seconds and 5 git commands to generate.