pkt_sched: Fix locking of qdisc_root with qdisc_root_sleeping_lock()
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/string.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <net/netlink.h>
20 #include <net/pkt_sched.h>
21
22
23 /* Class-Based Queueing (CBQ) algorithm.
24 =======================================
25
26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
27 Management Models for Packet Networks",
28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
29
30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
31
32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
33 Parameters", 1996
34
35 [4] Sally Floyd and Michael Speer, "Experimental Results
36 for Class-Based Queueing", 1998, not published.
37
38 -----------------------------------------------------------------------
39
40	The algorithm skeleton was taken from the NS simulator's cbq.cc.
41	If someone wants to check this code against the LBL version,
42	they should take into account that ONLY the skeleton was borrowed;
43	the implementation is different. In particular:
44
45 --- The WRR algorithm is different. Our version looks more
46 reasonable (I hope) and works when quanta are allowed to be
47	less than MTU, which is always the case when real-time classes
48	have small rates. Note that the statement of [3] is
49	incomplete: delay may actually be estimated even if the class
50 per-round allotment is less than MTU. Namely, if per-round
51 allotment is W*r_i, and r_1+...+r_k = r < 1
52
53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
54
55 In the worst case we have IntServ estimate with D = W*r+k*MTU
56 and C = MTU*r. The proof (if correct at all) is trivial.
57
58
59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
60 interpret some places, which look like wrong translations
61	from NS. Anyone is welcome to find these differences
62	and explain to me why I am wrong 8).
63
64	--- Linux has no EOI event, so we cannot estimate the true class
65	idle time. The workaround is to consider the next dequeue event
66	as a sign that the previous packet has finished. This is wrong because of
67	internal device queueing, but on a permanently loaded link it holds.
68	Moreover, combined with the clock integrator, this scheme looks
69	very close to an ideal solution. */
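/* A worked instance of the delay bound above (numbers purely illustrative,
   taking [.] as the integer ceiling): with MTU = 1500 bytes, k = 2 classes,
   r_1 = r_2 = 0.25 (so r = 0.5) and W chosen so that W*r_i = 500 bytes
   (i.e. W = 2000 bytes), we get [MTU/(W*r_i)] = 3 and W*r = 1000 bytes, hence

	delay_i <= (3*1000 + 1000 + 2*1500)/B = 7000 bytes / B,

   which is roughly 5.6 ms on a 10 Mbit/s link. */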
70
71 struct cbq_sched_data;
72
73
74 struct cbq_class
75 {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
78
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
87
88 u32 defmap;
89
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
96
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
100
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
105
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
114
115 struct Qdisc *q; /* Elementary queueing discipline */
116
117
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
124 */
125
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
135
136 struct tcf_proto *filter_list;
137
138 int refcnt;
139 int filters;
140
141 struct cbq_class *defaults[TC_PRIO_MAX+1];
142 };
143
144 struct cbq_sched_data
145 {
146 struct Qdisc_class_hash clhash; /* Hash table of all classes */
147 int nclasses[TC_CBQ_MAXPRIO+1];
148 unsigned quanta[TC_CBQ_MAXPRIO+1];
149
150 struct cbq_class link;
151
152 unsigned activemask;
153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
154 with backlog */
155
156 #ifdef CONFIG_NET_CLS_ACT
157 struct cbq_class *rx_class;
158 #endif
159 struct cbq_class *tx_class;
160 struct cbq_class *tx_borrowed;
161 int tx_len;
162 psched_time_t now; /* Cached timestamp */
163 psched_time_t now_rt; /* Cached real time */
164 unsigned pmask;
165
166 struct hrtimer delay_timer;
167 struct qdisc_watchdog watchdog; /* Watchdog timer,
168 started when CBQ has
169 backlog, but cannot
170 transmit just now */
171 psched_tdiff_t wd_expires;
172 int toplevel;
173 u32 hgenerator;
174 };
175
176
177 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
178
179 static __inline__ struct cbq_class *
180 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
181 {
182 struct Qdisc_class_common *clc;
183
184 clc = qdisc_class_find(&q->clhash, classid);
185 if (clc == NULL)
186 return NULL;
187 return container_of(clc, struct cbq_class, common);
188 }
189
190 #ifdef CONFIG_NET_CLS_ACT
191
192 static struct cbq_class *
193 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
194 {
195 struct cbq_class *cl, *new;
196
197 for (cl = this->tparent; cl; cl = cl->tparent)
198 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
199 return new;
200
201 return NULL;
202 }
203
204 #endif
205
206 /* Classify packet. The procedure is pretty complicated, but
207 it allows us to combine link sharing and priority scheduling
208 transparently.
209
210	Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
211	so that classification resolves to split nodes. Then packets are classified
212	by logical priority, or a more specific classifier may be attached
213	to the split node.
214 */
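/* An illustrative (and purely hypothetical) configuration of the kind this
   classifier is meant to handle; option names are those of the iproute2 cbq
   syntax, all numbers are arbitrary:

	tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
		rate 10Mbit allot 1514 avpkt 1000 bounded
	tc class add dev eth0 parent 1:1 classid 1:10 cbq bandwidth 10Mbit \
		rate 1Mbit allot 1514 avpkt 1000 split 1:0 defmap c0

   Packets whose skb->priority does not name one of our classes directly
   then fall through the filters and the defmap of the split node, as
   implemented in cbq_classify() below. */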
215
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
218 {
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
225
226 /*
227 * Step 1. If skb->priority points to one of our classes, use it.
228 */
229 if (TC_H_MAJ(prio^sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
232
233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
237
238 /*
239 * Step 2+n. Apply classifier.
240 */
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
244
245 if ((cl = (void*)res.class) == NULL) {
246 if (TC_H_MAJ(res.classid))
247 cl = cbq_class_lookup(q, res.classid);
248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
249 cl = defmap[TC_PRIO_BESTEFFORT];
250
251 if (cl == NULL || cl->level >= head->level)
252 goto fallback;
253 }
254
255 #ifdef CONFIG_NET_CLS_ACT
256 switch (result) {
257 case TC_ACT_QUEUED:
258 case TC_ACT_STOLEN:
259 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_SHOT:
261 return NULL;
262 case TC_ACT_RECLASSIFY:
263 return cbq_reclassify(skb, cl);
264 }
265 #endif
266 if (cl->level == 0)
267 return cl;
268
269 /*
270 * Step 3+n. If classifier selected a link sharing class,
271	 * apply its agency-specific classifier.
272	 * Repeat this procedure until we hit a leaf node.
273 */
274 head = cl;
275 }
276
277 fallback:
278 cl = head;
279
280 /*
281 * Step 4. No success...
282 */
283 if (TC_H_MAJ(prio) == 0 &&
284 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
285 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
286 return head;
287
288 return cl;
289 }
290
291 /*
292	A packet has just been enqueued on an empty class.
293	cbq_activate_class adds the class to the tail of the active class list
294	of its priority band.
295 */
296
297 static __inline__ void cbq_activate_class(struct cbq_class *cl)
298 {
299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
300 int prio = cl->cpriority;
301 struct cbq_class *cl_tail;
302
303 cl_tail = q->active[prio];
304 q->active[prio] = cl;
305
306 if (cl_tail != NULL) {
307 cl->next_alive = cl_tail->next_alive;
308 cl_tail->next_alive = cl;
309 } else {
310 cl->next_alive = cl;
311 q->activemask |= (1<<prio);
312 }
313 }
314
315 /*
316	Unlink the class from the active chain.
317	Note that the same thing is also done directly in cbq_dequeue*
318	during the round-robin procedure.
319 */
320
321 static void cbq_deactivate_class(struct cbq_class *this)
322 {
323 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
324 int prio = this->cpriority;
325 struct cbq_class *cl;
326 struct cbq_class *cl_prev = q->active[prio];
327
328 do {
329 cl = cl_prev->next_alive;
330 if (cl == this) {
331 cl_prev->next_alive = cl->next_alive;
332 cl->next_alive = NULL;
333
334 if (cl == q->active[prio]) {
335 q->active[prio] = cl_prev;
336 if (cl == q->active[prio]) {
337 q->active[prio] = NULL;
338 q->activemask &= ~(1<<prio);
339 return;
340 }
341 }
342 return;
343 }
344 } while ((cl_prev = cl) != q->active[prio]);
345 }
346
347 static void
348 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
349 {
350 int toplevel = q->toplevel;
351
352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
353 psched_time_t now;
354 psched_tdiff_t incr;
355
356 now = psched_get_time();
357 incr = now - q->now_rt;
358 now = q->now + incr;
359
360 do {
361 if (cl->undertime < now) {
362 q->toplevel = cl->level;
363 return;
364 }
365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
366 }
367 }
368
369 static int
370 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
371 {
372 struct cbq_sched_data *q = qdisc_priv(sch);
373 int uninitialized_var(ret);
374 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
375
376 #ifdef CONFIG_NET_CLS_ACT
377 q->rx_class = cl;
378 #endif
379 if (cl == NULL) {
380 if (ret & __NET_XMIT_BYPASS)
381 sch->qstats.drops++;
382 kfree_skb(skb);
383 return ret;
384 }
385
386 #ifdef CONFIG_NET_CLS_ACT
387 cl->q->__parent = sch;
388 #endif
389 ret = qdisc_enqueue(skb, cl->q);
390 if (ret == NET_XMIT_SUCCESS) {
391 sch->q.qlen++;
392 sch->bstats.packets++;
393 sch->bstats.bytes += qdisc_pkt_len(skb);
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
398 }
399
400 if (net_xmit_drop_count(ret)) {
401 sch->qstats.drops++;
402 cbq_mark_toplevel(q, cl);
403 cl->qstats.drops++;
404 }
405 return ret;
406 }
407
408 static int
409 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
410 {
411 struct cbq_sched_data *q = qdisc_priv(sch);
412 struct cbq_class *cl;
413 int ret;
414
415 if ((cl = q->tx_class) == NULL) {
416 kfree_skb(skb);
417 sch->qstats.drops++;
418 return NET_XMIT_CN;
419 }
420 q->tx_class = NULL;
421
422 cbq_mark_toplevel(q, cl);
423
424 #ifdef CONFIG_NET_CLS_ACT
425 q->rx_class = cl;
426 cl->q->__parent = sch;
427 #endif
428 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
429 sch->q.qlen++;
430 sch->qstats.requeues++;
431 if (!cl->next_alive)
432 cbq_activate_class(cl);
433 return 0;
434 }
435 if (net_xmit_drop_count(ret)) {
436 sch->qstats.drops++;
437 cl->qstats.drops++;
438 }
439 return ret;
440 }
441
442 /* Overlimit actions */
443
444 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
445
446 static void cbq_ovl_classic(struct cbq_class *cl)
447 {
448 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
449 psched_tdiff_t delay = cl->undertime - q->now;
450
451 if (!cl->delayed) {
452 delay += cl->offtime;
453
454 /*
455	The class goes to sleep, so it will have no
456	chance to work off its avgidle. Let's forgive it 8)
457	
458	BTW cbq-2.0 has a bug in this
459	place: apparently they forgot to shift it by cl->ewma_log.
460 */
461 if (cl->avgidle < 0)
462 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
463 if (cl->avgidle < cl->minidle)
464 cl->avgidle = cl->minidle;
465 if (delay <= 0)
466 delay = 1;
467 cl->undertime = q->now + delay;
468
469 cl->xstats.overactions++;
470 cl->delayed = 1;
471 }
472 if (q->wd_expires == 0 || q->wd_expires > delay)
473 q->wd_expires = delay;
474
475 /* Dirty work! We must schedule wakeups based on
476 real available rate, rather than leaf rate,
477 which may be tiny (even zero).
478 */
479 if (q->toplevel == TC_CBQ_MAXLEVEL) {
480 struct cbq_class *b;
481 psched_tdiff_t base_delay = q->wd_expires;
482
483 for (b = cl->borrow; b; b = b->borrow) {
484 delay = b->undertime - q->now;
485 if (delay < base_delay) {
486 if (delay <= 0)
487 delay = 1;
488 base_delay = delay;
489 }
490 }
491
492 q->wd_expires = base_delay;
493 }
494 }
495
496	/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
497	they go overlimit
498 */
499
500 static void cbq_ovl_rclassic(struct cbq_class *cl)
501 {
502 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
503 struct cbq_class *this = cl;
504
505 do {
506 if (cl->level > q->toplevel) {
507 cl = NULL;
508 break;
509 }
510 } while ((cl = cl->borrow) != NULL);
511
512 if (cl == NULL)
513 cl = this;
514 cbq_ovl_classic(cl);
515 }
516
517	/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
518
519 static void cbq_ovl_delay(struct cbq_class *cl)
520 {
521 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
522 psched_tdiff_t delay = cl->undertime - q->now;
523
524 if (test_bit(__QDISC_STATE_DEACTIVATED,
525 &qdisc_root_sleeping(cl->qdisc)->state))
526 return;
527
528 if (!cl->delayed) {
529 psched_time_t sched = q->now;
530 ktime_t expires;
531
532 delay += cl->offtime;
533 if (cl->avgidle < 0)
534 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
535 if (cl->avgidle < cl->minidle)
536 cl->avgidle = cl->minidle;
537 cl->undertime = q->now + delay;
538
539 if (delay > 0) {
540 sched += delay + cl->penalty;
541 cl->penalized = sched;
542 cl->cpriority = TC_CBQ_MAXPRIO;
543 q->pmask |= (1<<TC_CBQ_MAXPRIO);
544
545 expires = ktime_set(0, 0);
546 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
547 if (hrtimer_try_to_cancel(&q->delay_timer) &&
548 ktime_to_ns(ktime_sub(q->delay_timer.expires,
549 expires)) > 0)
550 q->delay_timer.expires = expires;
551 hrtimer_restart(&q->delay_timer);
552 cl->delayed = 1;
553 cl->xstats.overactions++;
554 return;
555 }
556 delay = 1;
557 }
558 if (q->wd_expires == 0 || q->wd_expires > delay)
559 q->wd_expires = delay;
560 }
561
562 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
563
564 static void cbq_ovl_lowprio(struct cbq_class *cl)
565 {
566 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
567
568 cl->penalized = q->now + cl->penalty;
569
570 if (cl->cpriority != cl->priority2) {
571 cl->cpriority = cl->priority2;
572 q->pmask |= (1<<cl->cpriority);
573 cl->xstats.overactions++;
574 }
575 cbq_ovl_classic(cl);
576 }
577
578 /* TC_CBQ_OVL_DROP: penalize class by dropping */
579
580 static void cbq_ovl_drop(struct cbq_class *cl)
581 {
582 if (cl->q->ops->drop)
583 if (cl->q->ops->drop(cl->q))
584 cl->qdisc->q.qlen--;
585 cl->xstats.overactions++;
586 cbq_ovl_classic(cl);
587 }
588
589 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
590 psched_time_t now)
591 {
592 struct cbq_class *cl;
593 struct cbq_class *cl_prev = q->active[prio];
594 psched_time_t sched = now;
595
596 if (cl_prev == NULL)
597 return 0;
598
599 do {
600 cl = cl_prev->next_alive;
601 if (now - cl->penalized > 0) {
602 cl_prev->next_alive = cl->next_alive;
603 cl->next_alive = NULL;
604 cl->cpriority = cl->priority;
605 cl->delayed = 0;
606 cbq_activate_class(cl);
607
608 if (cl == q->active[prio]) {
609 q->active[prio] = cl_prev;
610 if (cl == q->active[prio]) {
611 q->active[prio] = NULL;
612 return 0;
613 }
614 }
615
616 cl = cl_prev->next_alive;
617 } else if (sched - cl->penalized > 0)
618 sched = cl->penalized;
619 } while ((cl_prev = cl) != q->active[prio]);
620
621 return sched - now;
622 }
623
624 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
625 {
626 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
627 delay_timer);
628 struct Qdisc *sch = q->watchdog.qdisc;
629 psched_time_t now;
630 psched_tdiff_t delay = 0;
631 unsigned pmask;
632
633 now = psched_get_time();
634
635 pmask = q->pmask;
636 q->pmask = 0;
637
638 while (pmask) {
639 int prio = ffz(~pmask);
640 psched_tdiff_t tmp;
641
642 pmask &= ~(1<<prio);
643
644 tmp = cbq_undelay_prio(q, prio, now);
645 if (tmp > 0) {
646 q->pmask |= 1<<prio;
647 if (tmp < delay || delay == 0)
648 delay = tmp;
649 }
650 }
651
652 if (delay) {
653 ktime_t time;
654
655 time = ktime_set(0, 0);
656 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
657 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
658 }
659
660 sch->flags &= ~TCQ_F_THROTTLED;
661 __netif_schedule(qdisc_root(sch));
662 return HRTIMER_NORESTART;
663 }
664
665 #ifdef CONFIG_NET_CLS_ACT
666 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
667 {
668 struct Qdisc *sch = child->__parent;
669 struct cbq_sched_data *q = qdisc_priv(sch);
670 struct cbq_class *cl = q->rx_class;
671
672 q->rx_class = NULL;
673
674 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
675 int ret;
676
677 cbq_mark_toplevel(q, cl);
678
679 q->rx_class = cl;
680 cl->q->__parent = sch;
681
682 ret = qdisc_enqueue(skb, cl->q);
683 if (ret == NET_XMIT_SUCCESS) {
684 sch->q.qlen++;
685 sch->bstats.packets++;
686 sch->bstats.bytes += qdisc_pkt_len(skb);
687 if (!cl->next_alive)
688 cbq_activate_class(cl);
689 return 0;
690 }
691 if (net_xmit_drop_count(ret))
692 sch->qstats.drops++;
693 return 0;
694 }
695
696 sch->qstats.drops++;
697 return -1;
698 }
699 #endif
700
701 /*
702	This is a mission-critical procedure.
703	
704	We "regenerate" the toplevel cutoff if the transmitting class
705	has backlog and is not regulated. This is not part of the
706	original CBQ description, but it looks more reasonable.
707	It may well be wrong; this question needs further investigation.
708 */
709
710 static __inline__ void
711 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
712 struct cbq_class *borrowed)
713 {
714 if (cl && q->toplevel >= borrowed->level) {
715 if (cl->q->q.qlen > 1) {
716 do {
717 if (borrowed->undertime == PSCHED_PASTPERFECT) {
718 q->toplevel = borrowed->level;
719 return;
720 }
721 } while ((borrowed=borrowed->borrow) != NULL);
722 }
723 #if 0
724 /* It is not necessary now. Uncommenting it
725 will save CPU cycles, but decrease fairness.
726 */
727 q->toplevel = TC_CBQ_MAXLEVEL;
728 #endif
729 }
730 }
731
732 static void
733 cbq_update(struct cbq_sched_data *q)
734 {
735 struct cbq_class *this = q->tx_class;
736 struct cbq_class *cl = this;
737 int len = q->tx_len;
738
739 q->tx_class = NULL;
740
741 for ( ; cl; cl = cl->share) {
742 long avgidle = cl->avgidle;
743 long idle;
744
745 cl->bstats.packets++;
746 cl->bstats.bytes += len;
747
748 /*
749	(now - last) is the total time between packet right edges.
750	(last_pktlen/rate) is the "virtual" busy time, so that
751
752 idle = (now - last) - last_pktlen/rate
753 */
754
755 idle = q->now - cl->last;
756 if ((unsigned long)idle > 128*1024*1024) {
757 avgidle = cl->maxidle;
758 } else {
759 idle -= L2T(cl, len);
760
761 /* true_avgidle := (1-W)*true_avgidle + W*idle,
762 where W=2^{-ewma_log}. But cl->avgidle is scaled:
763 cl->avgidle == true_avgidle/W,
764 hence:
765 */
766 avgidle += idle - (avgidle>>cl->ewma_log);
767 }
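/* For illustration (parameter value chosen arbitrarily): with
   cl->ewma_log == 5, i.e. W = 1/32, the scaled update above,

	avgidle += idle - (avgidle >> 5);

   is exactly equivalent to true_avgidle = (31/32)*true_avgidle + (1/32)*idle,
   because cl->avgidle stores true_avgidle/W. */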
768
769 if (avgidle <= 0) {
770 /* Overlimit or at-limit */
771
772 if (avgidle < cl->minidle)
773 avgidle = cl->minidle;
774
775 cl->avgidle = avgidle;
776
777	/* Calculate the expected time when this class
778	will be allowed to send again.
779	This occurs when:
780 (1-W)*true_avgidle + W*delay = 0, i.e.
781 idle = (1/W - 1)*(-true_avgidle)
782 or
783 idle = (1 - W)*(-cl->avgidle);
784 */
785 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
786
787 /*
788	That is not all.
789	To maintain the rate allocated to the class,
790	we add to the undertime the virtual clock time
791	necessary to complete the transmitted packet.
792	(len/phys_bandwidth has already elapsed by
793	the moment cbq_update runs.)
794 */
795
796 idle -= L2T(&q->link, len);
797 idle += L2T(cl, len);
798
799 cl->undertime = q->now + idle;
800 } else {
801 /* Underlimit */
802
803 cl->undertime = PSCHED_PASTPERFECT;
804 if (avgidle > cl->maxidle)
805 cl->avgidle = cl->maxidle;
806 else
807 cl->avgidle = avgidle;
808 }
809 cl->last = q->now;
810 }
811
812 cbq_update_toplevel(q, this, q->tx_borrowed);
813 }
814
815 static __inline__ struct cbq_class *
816 cbq_under_limit(struct cbq_class *cl)
817 {
818 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
819 struct cbq_class *this_cl = cl;
820
821 if (cl->tparent == NULL)
822 return cl;
823
824 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
825 cl->delayed = 0;
826 return cl;
827 }
828
829 do {
830	/* This is a very suspicious place. Currently the overlimit
831	action is generated for non-bounded classes
832	only if the link is completely congested.
833	Though this agrees with the ancestor-only paradigm,
834	it looks very stupid. In particular,
835	it means that this chunk of code will either
836	never be called or will result in strong amplification
837	of burstiness. Dangerous, silly, and yet
838	no other solution exists.
839 */
840 if ((cl = cl->borrow) == NULL) {
841 this_cl->qstats.overlimits++;
842 this_cl->overlimit(this_cl);
843 return NULL;
844 }
845 if (cl->level > q->toplevel)
846 return NULL;
847 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
848
849 cl->delayed = 0;
850 return cl;
851 }
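/* A sketch of what the loop above does, under a hypothetical hierarchy
   root -> agency -> leaf: if the leaf is overlimit, we follow its ->borrow
   pointer to the agency; if the agency is also overlimit we continue to the
   root. The walk stops (and the packet may be sent on borrowed bandwidth)
   at the first underlimit ancestor, or fails either when a bounded class
   ends the chain (->borrow == NULL, so the overlimit action fires) or when
   the toplevel cutoff forbids borrowing from that level. */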
852
853 static __inline__ struct sk_buff *
854 cbq_dequeue_prio(struct Qdisc *sch, int prio)
855 {
856 struct cbq_sched_data *q = qdisc_priv(sch);
857 struct cbq_class *cl_tail, *cl_prev, *cl;
858 struct sk_buff *skb;
859 int deficit;
860
861 cl_tail = cl_prev = q->active[prio];
862 cl = cl_prev->next_alive;
863
864 do {
865 deficit = 0;
866
867 /* Start round */
868 do {
869 struct cbq_class *borrow = cl;
870
871 if (cl->q->q.qlen &&
872 (borrow = cbq_under_limit(cl)) == NULL)
873 goto skip_class;
874
875 if (cl->deficit <= 0) {
876 /* Class exhausted its allotment per
877 this round. Switch to the next one.
878 */
879 deficit = 1;
880 cl->deficit += cl->quantum;
881 goto next_class;
882 }
883
884 skb = cl->q->dequeue(cl->q);
885
886	/* The class did not give us any skb :-(
887	This can occur even if cl->q->q.qlen != 0,
888	e.g. if cl->q == "tbf"
889 */
890 if (skb == NULL)
891 goto skip_class;
892
893 cl->deficit -= qdisc_pkt_len(skb);
894 q->tx_class = cl;
895 q->tx_borrowed = borrow;
896 if (borrow != cl) {
897 #ifndef CBQ_XSTATS_BORROWS_BYTES
898 borrow->xstats.borrows++;
899 cl->xstats.borrows++;
900 #else
901 borrow->xstats.borrows += qdisc_pkt_len(skb);
902 cl->xstats.borrows += qdisc_pkt_len(skb);
903 #endif
904 }
905 q->tx_len = qdisc_pkt_len(skb);
906
907 if (cl->deficit <= 0) {
908 q->active[prio] = cl;
909 cl = cl->next_alive;
910 cl->deficit += cl->quantum;
911 }
912 return skb;
913
914 skip_class:
915 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
916 /* Class is empty or penalized.
917 Unlink it from active chain.
918 */
919 cl_prev->next_alive = cl->next_alive;
920 cl->next_alive = NULL;
921
922 /* Did cl_tail point to it? */
923 if (cl == cl_tail) {
924 /* Repair it! */
925 cl_tail = cl_prev;
926
927 /* Was it the last class in this band? */
928 if (cl == cl_tail) {
929 /* Kill the band! */
930 q->active[prio] = NULL;
931 q->activemask &= ~(1<<prio);
932 if (cl->q->q.qlen)
933 cbq_activate_class(cl);
934 return NULL;
935 }
936
937 q->active[prio] = cl_tail;
938 }
939 if (cl->q->q.qlen)
940 cbq_activate_class(cl);
941
942 cl = cl_prev;
943 }
944
945 next_class:
946 cl_prev = cl;
947 cl = cl->next_alive;
948 } while (cl_prev != cl_tail);
949 } while (deficit);
950
951 q->active[prio] = cl_prev;
952
953 return NULL;
954 }
955
956 static __inline__ struct sk_buff *
957 cbq_dequeue_1(struct Qdisc *sch)
958 {
959 struct cbq_sched_data *q = qdisc_priv(sch);
960 struct sk_buff *skb;
961 unsigned activemask;
962
963 activemask = q->activemask&0xFF;
964 while (activemask) {
965 int prio = ffz(~activemask);
966 activemask &= ~(1<<prio);
967 skb = cbq_dequeue_prio(sch, prio);
968 if (skb)
969 return skb;
970 }
971 return NULL;
972 }
973
974 static struct sk_buff *
975 cbq_dequeue(struct Qdisc *sch)
976 {
977 struct sk_buff *skb;
978 struct cbq_sched_data *q = qdisc_priv(sch);
979 psched_time_t now;
980 psched_tdiff_t incr;
981
982 now = psched_get_time();
983 incr = now - q->now_rt;
984
985 if (q->tx_class) {
986 psched_tdiff_t incr2;
987 /* Time integrator. We calculate EOS time
988 by adding expected packet transmission time.
989	If real time is greater, we warp the artificial clock,
990 so that:
991
992 cbq_time = max(real_time, work);
993 */
994 incr2 = L2T(&q->link, q->tx_len);
995 q->now += incr2;
996 cbq_update(q);
997 if ((incr -= incr2) < 0)
998 incr = 0;
999 }
1000 q->now += incr;
1001 q->now_rt = now;
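/* Rough numeric illustration of the integrator above (numbers arbitrary):
   if only 100 usec of real time elapsed (incr) but the previous packet
   needed 150 usec at the link rate (incr2), q->now advances by 150 usec;
   if 200 usec elapsed, it advances by the full 200 usec. In other words,
   q->now advances by max(real elapsed time, virtual transmission time). */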
1002
1003 for (;;) {
1004 q->wd_expires = 0;
1005
1006 skb = cbq_dequeue_1(sch);
1007 if (skb) {
1008 sch->q.qlen--;
1009 sch->flags &= ~TCQ_F_THROTTLED;
1010 return skb;
1011 }
1012
1013 /* All the classes are overlimit.
1014
1015 It is possible, if:
1016
1017 1. Scheduler is empty.
1018 2. Toplevel cutoff inhibited borrowing.
1019 3. Root class is overlimit.
1020
1021	Reset the 2nd and 3rd conditions and retry.
1022	
1023	Note that NS and cbq-2.0 are buggy here: peeking
1024	an arbitrary class is appropriate for ancestor-only
1025	sharing, but not for the toplevel algorithm.
1026	
1027	Our version is better but slower, because it requires
1028	two passes; that is unavoidable with top-level sharing.
1029 */
1030
1031 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1032 q->link.undertime == PSCHED_PASTPERFECT)
1033 break;
1034
1035 q->toplevel = TC_CBQ_MAXLEVEL;
1036 q->link.undertime = PSCHED_PASTPERFECT;
1037 }
1038
1039 /* No packets in scheduler or nobody wants to give them to us :-(
1040 Sigh... start watchdog timer in the last case. */
1041
1042 if (sch->q.qlen) {
1043 sch->qstats.overlimits++;
1044 if (q->wd_expires)
1045 qdisc_watchdog_schedule(&q->watchdog,
1046 now + q->wd_expires);
1047 }
1048 return NULL;
1049 }
1050
1051	/* CBQ class maintenance routines */
1052
1053 static void cbq_adjust_levels(struct cbq_class *this)
1054 {
1055 if (this == NULL)
1056 return;
1057
1058 do {
1059 int level = 0;
1060 struct cbq_class *cl;
1061
1062 if ((cl = this->children) != NULL) {
1063 do {
1064 if (cl->level > level)
1065 level = cl->level;
1066 } while ((cl = cl->sibling) != this->children);
1067 }
1068 this->level = level+1;
1069 } while ((this = this->tparent) != NULL);
1070 }
1071
1072 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1073 {
1074 struct cbq_class *cl;
1075 struct hlist_node *n;
1076 unsigned int h;
1077
1078 if (q->quanta[prio] == 0)
1079 return;
1080
1081 for (h = 0; h < q->clhash.hashsize; h++) {
1082 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1083	/* BUGGGG... Beware! This expression suffers from
1084	arithmetic overflow!
1085 */
1086 if (cl->priority == prio) {
1087 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1088 q->quanta[prio];
1089 }
1090 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1091 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1092 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1093 }
1094 }
1095 }
1096 }
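/* A quick illustration of the normalization above (weights arbitrary):
   quantum = weight * allot * nclasses / sum_of_weights_in_band. With two
   classes of equal weight, each quantum equals its allot; with weights 2w
   and w, the classes get 4/3 and 2/3 of their allot respectively.
   Note the overflow warning above: weight defaults to the class rate in
   bytes/sec, so the product can exceed the range of a long. */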
1097
1098 static void cbq_sync_defmap(struct cbq_class *cl)
1099 {
1100 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1101 struct cbq_class *split = cl->split;
1102 unsigned h;
1103 int i;
1104
1105 if (split == NULL)
1106 return;
1107
1108 for (i=0; i<=TC_PRIO_MAX; i++) {
1109 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1110 split->defaults[i] = NULL;
1111 }
1112
1113 for (i=0; i<=TC_PRIO_MAX; i++) {
1114 int level = split->level;
1115
1116 if (split->defaults[i])
1117 continue;
1118
1119 for (h = 0; h < q->clhash.hashsize; h++) {
1120 struct hlist_node *n;
1121 struct cbq_class *c;
1122
1123 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1124 common.hnode) {
1125 if (c->split == split && c->level < level &&
1126 c->defmap&(1<<i)) {
1127 split->defaults[i] = c;
1128 level = c->level;
1129 }
1130 }
1131 }
1132 }
1133 }
1134
1135 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1136 {
1137 struct cbq_class *split = NULL;
1138
1139 if (splitid == 0) {
1140 if ((split = cl->split) == NULL)
1141 return;
1142 splitid = split->common.classid;
1143 }
1144
1145 if (split == NULL || split->common.classid != splitid) {
1146 for (split = cl->tparent; split; split = split->tparent)
1147 if (split->common.classid == splitid)
1148 break;
1149 }
1150
1151 if (split == NULL)
1152 return;
1153
1154 if (cl->split != split) {
1155 cl->defmap = 0;
1156 cbq_sync_defmap(cl);
1157 cl->split = split;
1158 cl->defmap = def&mask;
1159 } else
1160 cl->defmap = (cl->defmap&~mask)|(def&mask);
1161
1162 cbq_sync_defmap(cl);
1163 }
1164
1165 static void cbq_unlink_class(struct cbq_class *this)
1166 {
1167 struct cbq_class *cl, **clp;
1168 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1169
1170 qdisc_class_hash_remove(&q->clhash, &this->common);
1171
1172 if (this->tparent) {
1173 clp=&this->sibling;
1174 cl = *clp;
1175 do {
1176 if (cl == this) {
1177 *clp = cl->sibling;
1178 break;
1179 }
1180 clp = &cl->sibling;
1181 } while ((cl = *clp) != this->sibling);
1182
1183 if (this->tparent->children == this) {
1184 this->tparent->children = this->sibling;
1185 if (this->sibling == this)
1186 this->tparent->children = NULL;
1187 }
1188 } else {
1189 WARN_ON(this->sibling != this);
1190 }
1191 }
1192
1193 static void cbq_link_class(struct cbq_class *this)
1194 {
1195 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196 struct cbq_class *parent = this->tparent;
1197
1198 this->sibling = this;
1199 qdisc_class_hash_insert(&q->clhash, &this->common);
1200
1201 if (parent == NULL)
1202 return;
1203
1204 if (parent->children == NULL) {
1205 parent->children = this;
1206 } else {
1207 this->sibling = parent->children->sibling;
1208 parent->children->sibling = this;
1209 }
1210 }
1211
1212 static unsigned int cbq_drop(struct Qdisc* sch)
1213 {
1214 struct cbq_sched_data *q = qdisc_priv(sch);
1215 struct cbq_class *cl, *cl_head;
1216 int prio;
1217 unsigned int len;
1218
1219 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1220 if ((cl_head = q->active[prio]) == NULL)
1221 continue;
1222
1223 cl = cl_head;
1224 do {
1225 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1226 sch->q.qlen--;
1227 if (!cl->q->q.qlen)
1228 cbq_deactivate_class(cl);
1229 return len;
1230 }
1231 } while ((cl = cl->next_alive) != cl_head);
1232 }
1233 return 0;
1234 }
1235
1236 static void
1237 cbq_reset(struct Qdisc* sch)
1238 {
1239 struct cbq_sched_data *q = qdisc_priv(sch);
1240 struct cbq_class *cl;
1241 struct hlist_node *n;
1242 int prio;
1243 unsigned h;
1244
1245 q->activemask = 0;
1246 q->pmask = 0;
1247 q->tx_class = NULL;
1248 q->tx_borrowed = NULL;
1249 qdisc_watchdog_cancel(&q->watchdog);
1250 hrtimer_cancel(&q->delay_timer);
1251 q->toplevel = TC_CBQ_MAXLEVEL;
1252 q->now = psched_get_time();
1253 q->now_rt = q->now;
1254
1255 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1256 q->active[prio] = NULL;
1257
1258 for (h = 0; h < q->clhash.hashsize; h++) {
1259 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1260 qdisc_reset(cl->q);
1261
1262 cl->next_alive = NULL;
1263 cl->undertime = PSCHED_PASTPERFECT;
1264 cl->avgidle = cl->maxidle;
1265 cl->deficit = cl->quantum;
1266 cl->cpriority = cl->priority;
1267 }
1268 }
1269 sch->q.qlen = 0;
1270 }
1271
1272
1273 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1274 {
1275 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1276 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1277 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1278 }
1279 if (lss->change&TCF_CBQ_LSS_EWMA)
1280 cl->ewma_log = lss->ewma_log;
1281 if (lss->change&TCF_CBQ_LSS_AVPKT)
1282 cl->avpkt = lss->avpkt;
1283 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1284 cl->minidle = -(long)lss->minidle;
1285 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1286 cl->maxidle = lss->maxidle;
1287 cl->avgidle = lss->maxidle;
1288 }
1289 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1290 cl->offtime = lss->offtime;
1291 return 0;
1292 }
1293
1294 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1295 {
1296 q->nclasses[cl->priority]--;
1297 q->quanta[cl->priority] -= cl->weight;
1298 cbq_normalize_quanta(q, cl->priority);
1299 }
1300
1301 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1302 {
1303 q->nclasses[cl->priority]++;
1304 q->quanta[cl->priority] += cl->weight;
1305 cbq_normalize_quanta(q, cl->priority);
1306 }
1307
1308 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1309 {
1310 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1311
1312 if (wrr->allot)
1313 cl->allot = wrr->allot;
1314 if (wrr->weight)
1315 cl->weight = wrr->weight;
1316 if (wrr->priority) {
1317 cl->priority = wrr->priority-1;
1318 cl->cpriority = cl->priority;
1319 if (cl->priority >= cl->priority2)
1320 cl->priority2 = TC_CBQ_MAXPRIO-1;
1321 }
1322
1323 cbq_addprio(q, cl);
1324 return 0;
1325 }
1326
1327 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1328 {
1329 switch (ovl->strategy) {
1330 case TC_CBQ_OVL_CLASSIC:
1331 cl->overlimit = cbq_ovl_classic;
1332 break;
1333 case TC_CBQ_OVL_DELAY:
1334 cl->overlimit = cbq_ovl_delay;
1335 break;
1336 case TC_CBQ_OVL_LOWPRIO:
1337 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1338 ovl->priority2-1 <= cl->priority)
1339 return -EINVAL;
1340 cl->priority2 = ovl->priority2-1;
1341 cl->overlimit = cbq_ovl_lowprio;
1342 break;
1343 case TC_CBQ_OVL_DROP:
1344 cl->overlimit = cbq_ovl_drop;
1345 break;
1346 case TC_CBQ_OVL_RCLASSIC:
1347 cl->overlimit = cbq_ovl_rclassic;
1348 break;
1349 default:
1350 return -EINVAL;
1351 }
1352 cl->penalty = ovl->penalty;
1353 return 0;
1354 }
1355
1356 #ifdef CONFIG_NET_CLS_ACT
1357 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1358 {
1359 cl->police = p->police;
1360
1361 if (cl->q->handle) {
1362 if (p->police == TC_POLICE_RECLASSIFY)
1363 cl->q->reshape_fail = cbq_reshape_fail;
1364 else
1365 cl->q->reshape_fail = NULL;
1366 }
1367 return 0;
1368 }
1369 #endif
1370
1371 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1372 {
1373 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1374 return 0;
1375 }
1376
1377 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1378 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1379 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1380 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1381 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1382 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1383 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1384 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1385 };
1386
1387 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1388 {
1389 struct cbq_sched_data *q = qdisc_priv(sch);
1390 struct nlattr *tb[TCA_CBQ_MAX + 1];
1391 struct tc_ratespec *r;
1392 int err;
1393
1394 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1395 if (err < 0)
1396 return err;
1397
1398 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1399 return -EINVAL;
1400
1401 r = nla_data(tb[TCA_CBQ_RATE]);
1402
1403 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1404 return -EINVAL;
1405
1406 err = qdisc_class_hash_init(&q->clhash);
1407 if (err < 0)
1408 goto put_rtab;
1409
1410 q->link.refcnt = 1;
1411 q->link.sibling = &q->link;
1412 q->link.common.classid = sch->handle;
1413 q->link.qdisc = sch;
1414 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1415 &pfifo_qdisc_ops,
1416 sch->handle)))
1417 q->link.q = &noop_qdisc;
1418
1419 q->link.priority = TC_CBQ_MAXPRIO-1;
1420 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1421 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1422 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1423 q->link.overlimit = cbq_ovl_classic;
1424 q->link.allot = psched_mtu(qdisc_dev(sch));
1425 q->link.quantum = q->link.allot;
1426 q->link.weight = q->link.R_tab->rate.rate;
1427
1428 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1429 q->link.avpkt = q->link.allot/2;
1430 q->link.minidle = -0x7FFFFFFF;
1431
1432 qdisc_watchdog_init(&q->watchdog, sch);
1433 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1434 q->delay_timer.function = cbq_undelay;
1435 q->toplevel = TC_CBQ_MAXLEVEL;
1436 q->now = psched_get_time();
1437 q->now_rt = q->now;
1438
1439 cbq_link_class(&q->link);
1440
1441 if (tb[TCA_CBQ_LSSOPT])
1442 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1443
1444 cbq_addprio(q, &q->link);
1445 return 0;
1446
1447 put_rtab:
1448 qdisc_put_rtab(q->link.R_tab);
1449 return err;
1450 }
1451
1452 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1453 {
1454 unsigned char *b = skb_tail_pointer(skb);
1455
1456 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1457 return skb->len;
1458
1459 nla_put_failure:
1460 nlmsg_trim(skb, b);
1461 return -1;
1462 }
1463
1464 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1465 {
1466 unsigned char *b = skb_tail_pointer(skb);
1467 struct tc_cbq_lssopt opt;
1468
1469 opt.flags = 0;
1470 if (cl->borrow == NULL)
1471 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1472 if (cl->share == NULL)
1473 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1474 opt.ewma_log = cl->ewma_log;
1475 opt.level = cl->level;
1476 opt.avpkt = cl->avpkt;
1477 opt.maxidle = cl->maxidle;
1478 opt.minidle = (u32)(-cl->minidle);
1479 opt.offtime = cl->offtime;
1480 opt.change = ~0;
1481 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1482 return skb->len;
1483
1484 nla_put_failure:
1485 nlmsg_trim(skb, b);
1486 return -1;
1487 }
1488
1489 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1490 {
1491 unsigned char *b = skb_tail_pointer(skb);
1492 struct tc_cbq_wrropt opt;
1493
1494 opt.flags = 0;
1495 opt.allot = cl->allot;
1496 opt.priority = cl->priority+1;
1497 opt.cpriority = cl->cpriority+1;
1498 opt.weight = cl->weight;
1499 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1500 return skb->len;
1501
1502 nla_put_failure:
1503 nlmsg_trim(skb, b);
1504 return -1;
1505 }
1506
1507 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1508 {
1509 unsigned char *b = skb_tail_pointer(skb);
1510 struct tc_cbq_ovl opt;
1511
1512 opt.strategy = cl->ovl_strategy;
1513 opt.priority2 = cl->priority2+1;
1514 opt.pad = 0;
1515 opt.penalty = cl->penalty;
1516 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1517 return skb->len;
1518
1519 nla_put_failure:
1520 nlmsg_trim(skb, b);
1521 return -1;
1522 }
1523
1524 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1525 {
1526 unsigned char *b = skb_tail_pointer(skb);
1527 struct tc_cbq_fopt opt;
1528
1529 if (cl->split || cl->defmap) {
1530 opt.split = cl->split ? cl->split->common.classid : 0;
1531 opt.defmap = cl->defmap;
1532 opt.defchange = ~0;
1533 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1534 }
1535 return skb->len;
1536
1537 nla_put_failure:
1538 nlmsg_trim(skb, b);
1539 return -1;
1540 }
1541
1542 #ifdef CONFIG_NET_CLS_ACT
1543 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1544 {
1545 unsigned char *b = skb_tail_pointer(skb);
1546 struct tc_cbq_police opt;
1547
1548 if (cl->police) {
1549 opt.police = cl->police;
1550 opt.__res1 = 0;
1551 opt.__res2 = 0;
1552 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1553 }
1554 return skb->len;
1555
1556 nla_put_failure:
1557 nlmsg_trim(skb, b);
1558 return -1;
1559 }
1560 #endif
1561
1562 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1563 {
1564 if (cbq_dump_lss(skb, cl) < 0 ||
1565 cbq_dump_rate(skb, cl) < 0 ||
1566 cbq_dump_wrr(skb, cl) < 0 ||
1567 cbq_dump_ovl(skb, cl) < 0 ||
1568 #ifdef CONFIG_NET_CLS_ACT
1569 cbq_dump_police(skb, cl) < 0 ||
1570 #endif
1571 cbq_dump_fopt(skb, cl) < 0)
1572 return -1;
1573 return 0;
1574 }
1575
1576 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1577 {
1578 struct cbq_sched_data *q = qdisc_priv(sch);
1579 struct nlattr *nest;
1580
1581 nest = nla_nest_start(skb, TCA_OPTIONS);
1582 if (nest == NULL)
1583 goto nla_put_failure;
1584 if (cbq_dump_attr(skb, &q->link) < 0)
1585 goto nla_put_failure;
1586 nla_nest_end(skb, nest);
1587 return skb->len;
1588
1589 nla_put_failure:
1590 nla_nest_cancel(skb, nest);
1591 return -1;
1592 }
1593
1594 static int
1595 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1596 {
1597 struct cbq_sched_data *q = qdisc_priv(sch);
1598
1599 q->link.xstats.avgidle = q->link.avgidle;
1600 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1601 }
1602
1603 static int
1604 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1605 struct sk_buff *skb, struct tcmsg *tcm)
1606 {
1607 struct cbq_class *cl = (struct cbq_class*)arg;
1608 struct nlattr *nest;
1609
1610 if (cl->tparent)
1611 tcm->tcm_parent = cl->tparent->common.classid;
1612 else
1613 tcm->tcm_parent = TC_H_ROOT;
1614 tcm->tcm_handle = cl->common.classid;
1615 tcm->tcm_info = cl->q->handle;
1616
1617 nest = nla_nest_start(skb, TCA_OPTIONS);
1618 if (nest == NULL)
1619 goto nla_put_failure;
1620 if (cbq_dump_attr(skb, cl) < 0)
1621 goto nla_put_failure;
1622 nla_nest_end(skb, nest);
1623 return skb->len;
1624
1625 nla_put_failure:
1626 nla_nest_cancel(skb, nest);
1627 return -1;
1628 }
1629
1630 static int
1631 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1632 struct gnet_dump *d)
1633 {
1634 struct cbq_sched_data *q = qdisc_priv(sch);
1635 struct cbq_class *cl = (struct cbq_class*)arg;
1636
1637 cl->qstats.qlen = cl->q->q.qlen;
1638 cl->xstats.avgidle = cl->avgidle;
1639 cl->xstats.undertime = 0;
1640
1641 if (cl->undertime != PSCHED_PASTPERFECT)
1642 cl->xstats.undertime = cl->undertime - q->now;
1643
1644 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1645 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1646 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1647 return -1;
1648
1649 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1650 }
1651
1652 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1653 struct Qdisc **old)
1654 {
1655 struct cbq_class *cl = (struct cbq_class*)arg;
1656
1657 if (cl) {
1658 if (new == NULL) {
1659 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1660 &pfifo_qdisc_ops,
1661 cl->common.classid);
1662 if (new == NULL)
1663 return -ENOBUFS;
1664 } else {
1665 #ifdef CONFIG_NET_CLS_ACT
1666 if (cl->police == TC_POLICE_RECLASSIFY)
1667 new->reshape_fail = cbq_reshape_fail;
1668 #endif
1669 }
1670 sch_tree_lock(sch);
1671 *old = xchg(&cl->q, new);
1672 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1673 qdisc_reset(*old);
1674 sch_tree_unlock(sch);
1675
1676 return 0;
1677 }
1678 return -ENOENT;
1679 }
1680
1681 static struct Qdisc *
1682 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1683 {
1684 struct cbq_class *cl = (struct cbq_class*)arg;
1685
1686 return cl ? cl->q : NULL;
1687 }
1688
1689 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1690 {
1691 struct cbq_class *cl = (struct cbq_class *)arg;
1692
1693 if (cl->q->q.qlen == 0)
1694 cbq_deactivate_class(cl);
1695 }
1696
1697 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1698 {
1699 struct cbq_sched_data *q = qdisc_priv(sch);
1700 struct cbq_class *cl = cbq_class_lookup(q, classid);
1701
1702 if (cl) {
1703 cl->refcnt++;
1704 return (unsigned long)cl;
1705 }
1706 return 0;
1707 }
1708
1709 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1710 {
1711 struct cbq_sched_data *q = qdisc_priv(sch);
1712
1713 WARN_ON(cl->filters);
1714
1715 tcf_destroy_chain(&cl->filter_list);
1716 qdisc_destroy(cl->q);
1717 qdisc_put_rtab(cl->R_tab);
1718 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1719 if (cl != &q->link)
1720 kfree(cl);
1721 }
1722
1723 static void
1724 cbq_destroy(struct Qdisc* sch)
1725 {
1726 struct cbq_sched_data *q = qdisc_priv(sch);
1727 struct hlist_node *n, *next;
1728 struct cbq_class *cl;
1729 unsigned h;
1730
1731 #ifdef CONFIG_NET_CLS_ACT
1732 q->rx_class = NULL;
1733 #endif
1734 /*
1735 * Filters must be destroyed first because we don't destroy the
1736	 * classes from root to leaves, which means that filters can still
1737 * be bound to classes which have been destroyed already. --TGR '04
1738 */
1739 for (h = 0; h < q->clhash.hashsize; h++) {
1740 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1741 tcf_destroy_chain(&cl->filter_list);
1742 }
1743 for (h = 0; h < q->clhash.hashsize; h++) {
1744 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1745 common.hnode)
1746 cbq_destroy_class(sch, cl);
1747 }
1748 qdisc_class_hash_destroy(&q->clhash);
1749 }
1750
1751 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1752 {
1753 struct cbq_class *cl = (struct cbq_class*)arg;
1754
1755 if (--cl->refcnt == 0) {
1756 #ifdef CONFIG_NET_CLS_ACT
1757 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1758 struct cbq_sched_data *q = qdisc_priv(sch);
1759
1760 spin_lock_bh(root_lock);
1761 if (q->rx_class == cl)
1762 q->rx_class = NULL;
1763 spin_unlock_bh(root_lock);
1764 #endif
1765
1766 cbq_destroy_class(sch, cl);
1767 }
1768 }
1769
1770 static int
1771 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1772 unsigned long *arg)
1773 {
1774 int err;
1775 struct cbq_sched_data *q = qdisc_priv(sch);
1776 struct cbq_class *cl = (struct cbq_class*)*arg;
1777 struct nlattr *opt = tca[TCA_OPTIONS];
1778 struct nlattr *tb[TCA_CBQ_MAX + 1];
1779 struct cbq_class *parent;
1780 struct qdisc_rate_table *rtab = NULL;
1781
1782 if (opt == NULL)
1783 return -EINVAL;
1784
1785 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1786 if (err < 0)
1787 return err;
1788
1789 if (cl) {
1790 /* Check parent */
1791 if (parentid) {
1792 if (cl->tparent &&
1793 cl->tparent->common.classid != parentid)
1794 return -EINVAL;
1795 if (!cl->tparent && parentid != TC_H_ROOT)
1796 return -EINVAL;
1797 }
1798
1799 if (tb[TCA_CBQ_RATE]) {
1800 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1801 if (rtab == NULL)
1802 return -EINVAL;
1803 }
1804
1805 /* Change class parameters */
1806 sch_tree_lock(sch);
1807
1808 if (cl->next_alive != NULL)
1809 cbq_deactivate_class(cl);
1810
1811 if (rtab) {
1812 rtab = xchg(&cl->R_tab, rtab);
1813 qdisc_put_rtab(rtab);
1814 }
1815
1816 if (tb[TCA_CBQ_LSSOPT])
1817 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1818
1819 if (tb[TCA_CBQ_WRROPT]) {
1820 cbq_rmprio(q, cl);
1821 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1822 }
1823
1824 if (tb[TCA_CBQ_OVL_STRATEGY])
1825 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1826
1827 #ifdef CONFIG_NET_CLS_ACT
1828 if (tb[TCA_CBQ_POLICE])
1829 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1830 #endif
1831
1832 if (tb[TCA_CBQ_FOPT])
1833 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1834
1835 if (cl->q->q.qlen)
1836 cbq_activate_class(cl);
1837
1838 sch_tree_unlock(sch);
1839
1840 if (tca[TCA_RATE])
1841 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1842 qdisc_root_sleeping_lock(sch),
1843 tca[TCA_RATE]);
1844 return 0;
1845 }
1846
1847 if (parentid == TC_H_ROOT)
1848 return -EINVAL;
1849
1850 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1851 tb[TCA_CBQ_LSSOPT] == NULL)
1852 return -EINVAL;
1853
1854 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1855 if (rtab == NULL)
1856 return -EINVAL;
1857
1858 if (classid) {
1859 err = -EINVAL;
1860 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1861 goto failure;
1862 } else {
1863 int i;
1864 classid = TC_H_MAKE(sch->handle,0x8000);
1865
1866 for (i=0; i<0x8000; i++) {
1867 if (++q->hgenerator >= 0x8000)
1868 q->hgenerator = 1;
1869 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1870 break;
1871 }
1872 err = -ENOSR;
1873 if (i >= 0x8000)
1874 goto failure;
1875 classid = classid|q->hgenerator;
1876 }
1877
1878 parent = &q->link;
1879 if (parentid) {
1880 parent = cbq_class_lookup(q, parentid);
1881 err = -EINVAL;
1882 if (parent == NULL)
1883 goto failure;
1884 }
1885
1886 err = -ENOBUFS;
1887 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1888 if (cl == NULL)
1889 goto failure;
1890 cl->R_tab = rtab;
1891 rtab = NULL;
1892 cl->refcnt = 1;
1893 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1894 &pfifo_qdisc_ops, classid)))
1895 cl->q = &noop_qdisc;
1896 cl->common.classid = classid;
1897 cl->tparent = parent;
1898 cl->qdisc = sch;
1899 cl->allot = parent->allot;
1900 cl->quantum = cl->allot;
1901 cl->weight = cl->R_tab->rate.rate;
1902
1903 sch_tree_lock(sch);
1904 cbq_link_class(cl);
1905 cl->borrow = cl->tparent;
1906 if (cl->tparent != &q->link)
1907 cl->share = cl->tparent;
1908 cbq_adjust_levels(parent);
1909 cl->minidle = -0x7FFFFFFF;
1910 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1911 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1912 if (cl->ewma_log==0)
1913 cl->ewma_log = q->link.ewma_log;
1914 if (cl->maxidle==0)
1915 cl->maxidle = q->link.maxidle;
1916 if (cl->avpkt==0)
1917 cl->avpkt = q->link.avpkt;
1918 cl->overlimit = cbq_ovl_classic;
1919 if (tb[TCA_CBQ_OVL_STRATEGY])
1920 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1921 #ifdef CONFIG_NET_CLS_ACT
1922 if (tb[TCA_CBQ_POLICE])
1923 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1924 #endif
1925 if (tb[TCA_CBQ_FOPT])
1926 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1927 sch_tree_unlock(sch);
1928
1929 qdisc_class_hash_grow(sch, &q->clhash);
1930
1931 if (tca[TCA_RATE])
1932 gen_new_estimator(&cl->bstats, &cl->rate_est,
1933 qdisc_root_sleeping_lock(sch), tca[TCA_RATE]);
1934
1935 *arg = (unsigned long)cl;
1936 return 0;
1937
1938 failure:
1939 qdisc_put_rtab(rtab);
1940 return err;
1941 }
1942
1943 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1944 {
1945 struct cbq_sched_data *q = qdisc_priv(sch);
1946 struct cbq_class *cl = (struct cbq_class*)arg;
1947 unsigned int qlen;
1948
1949 if (cl->filters || cl->children || cl == &q->link)
1950 return -EBUSY;
1951
1952 sch_tree_lock(sch);
1953
1954 qlen = cl->q->q.qlen;
1955 qdisc_reset(cl->q);
1956 qdisc_tree_decrease_qlen(cl->q, qlen);
1957
1958 if (cl->next_alive)
1959 cbq_deactivate_class(cl);
1960
1961 if (q->tx_borrowed == cl)
1962 q->tx_borrowed = q->tx_class;
1963 if (q->tx_class == cl) {
1964 q->tx_class = NULL;
1965 q->tx_borrowed = NULL;
1966 }
1967 #ifdef CONFIG_NET_CLS_ACT
1968 if (q->rx_class == cl)
1969 q->rx_class = NULL;
1970 #endif
1971
1972 cbq_unlink_class(cl);
1973 cbq_adjust_levels(cl->tparent);
1974 cl->defmap = 0;
1975 cbq_sync_defmap(cl);
1976
1977 cbq_rmprio(q, cl);
1978 sch_tree_unlock(sch);
1979
1980 if (--cl->refcnt == 0)
1981 cbq_destroy_class(sch, cl);
1982
1983 return 0;
1984 }
1985
1986 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1987 {
1988 struct cbq_sched_data *q = qdisc_priv(sch);
1989 struct cbq_class *cl = (struct cbq_class *)arg;
1990
1991 if (cl == NULL)
1992 cl = &q->link;
1993
1994 return &cl->filter_list;
1995 }
1996
1997 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1998 u32 classid)
1999 {
2000 struct cbq_sched_data *q = qdisc_priv(sch);
2001 struct cbq_class *p = (struct cbq_class*)parent;
2002 struct cbq_class *cl = cbq_class_lookup(q, classid);
2003
2004 if (cl) {
2005 if (p && p->level <= cl->level)
2006 return 0;
2007 cl->filters++;
2008 return (unsigned long)cl;
2009 }
2010 return 0;
2011 }
2012
2013 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2014 {
2015 struct cbq_class *cl = (struct cbq_class*)arg;
2016
2017 cl->filters--;
2018 }
2019
2020 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2021 {
2022 struct cbq_sched_data *q = qdisc_priv(sch);
2023 struct cbq_class *cl;
2024 struct hlist_node *n;
2025 unsigned h;
2026
2027 if (arg->stop)
2028 return;
2029
2030 for (h = 0; h < q->clhash.hashsize; h++) {
2031 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2032 if (arg->count < arg->skip) {
2033 arg->count++;
2034 continue;
2035 }
2036 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2037 arg->stop = 1;
2038 return;
2039 }
2040 arg->count++;
2041 }
2042 }
2043 }
2044
2045 static const struct Qdisc_class_ops cbq_class_ops = {
2046 .graft = cbq_graft,
2047 .leaf = cbq_leaf,
2048 .qlen_notify = cbq_qlen_notify,
2049 .get = cbq_get,
2050 .put = cbq_put,
2051 .change = cbq_change_class,
2052 .delete = cbq_delete,
2053 .walk = cbq_walk,
2054 .tcf_chain = cbq_find_tcf,
2055 .bind_tcf = cbq_bind_filter,
2056 .unbind_tcf = cbq_unbind_filter,
2057 .dump = cbq_dump_class,
2058 .dump_stats = cbq_dump_class_stats,
2059 };
2060
2061 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2062 .next = NULL,
2063 .cl_ops = &cbq_class_ops,
2064 .id = "cbq",
2065 .priv_size = sizeof(struct cbq_sched_data),
2066 .enqueue = cbq_enqueue,
2067 .dequeue = cbq_dequeue,
2068 .requeue = cbq_requeue,
2069 .drop = cbq_drop,
2070 .init = cbq_init,
2071 .reset = cbq_reset,
2072 .destroy = cbq_destroy,
2073 .change = NULL,
2074 .dump = cbq_dump,
2075 .dump_stats = cbq_dump_stats,
2076 .owner = THIS_MODULE,
2077 };
2078
2079 static int __init cbq_module_init(void)
2080 {
2081 return register_qdisc(&cbq_qdisc_ops);
2082 }
2083 static void __exit cbq_module_exit(void)
2084 {
2085 unregister_qdisc(&cbq_qdisc_ops);
2086 }
2087 module_init(cbq_module_init)
2088 module_exit(cbq_module_exit)
2089 MODULE_LICENSE("GPL");