ext3: Flush disk caches on fsync when needed
[deliverable/linux.git] / net / sched / sch_api.c
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 struct Qdisc *q, unsigned long cl, int event);
41
42 /*
43
44 Short review.
45 -------------
46
47 This file consists of two interrelated parts:
48
49 1. queueing disciplines manager frontend.
50 2. traffic classes manager frontend.
51
52 Generally, queueing discipline ("qdisc") is a black box,
53 which is able to enqueue packets and to dequeue them (when
54 device is ready to send something) in order and at times
55 determined by algorithm hidden in it.
56
57 qdiscs are divided into two categories:
58 - "queues", which have no internal structure visible from outside.
59 - "schedulers", which split all the packets to "traffic classes",
60 using "packet classifiers" (look at cls_api.c)
61
62 In turn, classes may have child qdiscs (as a rule, queues)
63 attached to them etc. etc. etc.
64
65 The goal of the routines in this file is to translate the
66 information supplied by the user in the form of handles
67 into a form more intelligible to the kernel, to perform sanity
68 checks and the part of the work that is common to all qdiscs,
69 and to provide rtnetlink notifications.
70
71 All real intelligent work is done inside qdisc modules.
72
73
74
75 Every discipline has two major routines: enqueue and dequeue.
76
77 ---dequeue
78
79 dequeue usually returns a skb to send. It is allowed to return NULL,
80 but it does not mean that queue is empty, it just means that
81 discipline does not want to send anything this time.
82 Queue is really empty if q->q.qlen == 0.
83 For complicated disciplines with multiple queues q->q is not
84 real packet queue, but however q->q.qlen must be valid.
85
86 ---enqueue
87
88 enqueue returns 0 if the packet was enqueued successfully.
89 If a packet (this one or another one) was dropped, it returns
90 a non-zero error code.
91 NET_XMIT_DROP - this packet dropped
92 Expected action: do not backoff, but wait until queue will clear.
93 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
94 Expected action: backoff or ignore
95 NET_XMIT_POLICED - dropped by police.
96 Expected action: backoff or error to real-time apps.
97
98 Auxiliary routines:
99
100 ---peek
101
102 like dequeue but without removing a packet from the queue
103
104 ---reset
105
106 returns qdisc to initial state: purge all buffers, clear all
107 timers, counters (except for statistics) etc.
108
109 ---init
110
111 initializes newly created qdisc.
112
113 ---destroy
114
115 destroys resources allocated by init and during lifetime of qdisc.
116
117 ---change
118
119 changes qdisc parameters.
120 */
121
/*
 * Pure SMP lock protecting the list of registered TC modules
 * (the chain rooted at qdisc_base).
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
124
125
126 /************************************************
127 * Queueing disciplines manipulation. *
128 ************************************************/
129
130
/*
 * Head of the singly linked list (chained through ->next) of all
 * installed queueing disciplines.
 */
static struct Qdisc_ops *qdisc_base;
134
135 /* Register/uregister queueing discipline */
136
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 struct Qdisc_ops *q, **qp;
140 int rc = -EEXIST;
141
142 write_lock(&qdisc_mod_lock);
143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 if (!strcmp(qops->id, q->id))
145 goto out;
146
147 if (qops->enqueue == NULL)
148 qops->enqueue = noop_qdisc_ops.enqueue;
149 if (qops->peek == NULL) {
150 if (qops->dequeue == NULL) {
151 qops->peek = noop_qdisc_ops.peek;
152 } else {
153 rc = -EINVAL;
154 goto out;
155 }
156 }
157 if (qops->dequeue == NULL)
158 qops->dequeue = noop_qdisc_ops.dequeue;
159
160 qops->next = NULL;
161 *qp = qops;
162 rc = 0;
163 out:
164 write_unlock(&qdisc_mod_lock);
165 return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
168
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 struct Qdisc_ops *q, **qp;
172 int err = -ENOENT;
173
174 write_lock(&qdisc_mod_lock);
175 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
176 if (q == qops)
177 break;
178 if (q) {
179 *qp = q->next;
180 q->next = NULL;
181 err = 0;
182 }
183 write_unlock(&qdisc_mod_lock);
184 return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
187
188 /* We know handle. Find qdisc among all qdisc's attached to device
189 (root qdisc, all its children, children of children etc.)
190 */
191
192 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 struct Qdisc *q;
195
196 if (!(root->flags & TCQ_F_BUILTIN) &&
197 root->handle == handle)
198 return root;
199
200 list_for_each_entry(q, &root->list, list) {
201 if (q->handle == handle)
202 return q;
203 }
204 return NULL;
205 }
206
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
211 }
212
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 struct Qdisc *q;
223
224 q = qdisc_match_from_root(dev->qdisc, handle);
225 if (q)
226 goto out;
227
228 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
229 out:
230 return q;
231 }
232
233 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
234 {
235 unsigned long cl;
236 struct Qdisc *leaf;
237 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
238
239 if (cops == NULL)
240 return NULL;
241 cl = cops->get(p, classid);
242
243 if (cl == 0)
244 return NULL;
245 leaf = cops->leaf(p, cl);
246 cops->put(p, cl);
247 return leaf;
248 }
249
250 /* Find queueing discipline by name */
251
252 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
253 {
254 struct Qdisc_ops *q = NULL;
255
256 if (kind) {
257 read_lock(&qdisc_mod_lock);
258 for (q = qdisc_base; q; q = q->next) {
259 if (nla_strcmp(kind, q->id) == 0) {
260 if (!try_module_get(q->owner))
261 q = NULL;
262 break;
263 }
264 }
265 read_unlock(&qdisc_mod_lock);
266 }
267 return q;
268 }
269
/*
 * Singly linked list of the rate tables handed out by qdisc_get_rtab()
 * and released through qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
271
272 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
273 {
274 struct qdisc_rate_table *rtab;
275
276 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
277 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
278 rtab->refcnt++;
279 return rtab;
280 }
281 }
282
283 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
284 nla_len(tab) != TC_RTAB_SIZE)
285 return NULL;
286
287 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
288 if (rtab) {
289 rtab->rate = *r;
290 rtab->refcnt = 1;
291 memcpy(rtab->data, nla_data(tab), 1024);
292 rtab->next = qdisc_rtab_list;
293 qdisc_rtab_list = rtab;
294 }
295 return rtab;
296 }
297 EXPORT_SYMBOL(qdisc_get_rtab);
298
299 void qdisc_put_rtab(struct qdisc_rate_table *tab)
300 {
301 struct qdisc_rate_table *rtab, **rtabp;
302
303 if (!tab || --tab->refcnt)
304 return;
305
306 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
307 if (rtab == tab) {
308 *rtabp = rtab->next;
309 kfree(rtab);
310 return;
311 }
312 }
313 }
314 EXPORT_SYMBOL(qdisc_put_rtab);
315
316 static LIST_HEAD(qdisc_stab_list);
317 static DEFINE_SPINLOCK(qdisc_stab_lock);
318
319 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
320 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
321 [TCA_STAB_DATA] = { .type = NLA_BINARY },
322 };
323
324 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
325 {
326 struct nlattr *tb[TCA_STAB_MAX + 1];
327 struct qdisc_size_table *stab;
328 struct tc_sizespec *s;
329 unsigned int tsize = 0;
330 u16 *tab = NULL;
331 int err;
332
333 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
334 if (err < 0)
335 return ERR_PTR(err);
336 if (!tb[TCA_STAB_BASE])
337 return ERR_PTR(-EINVAL);
338
339 s = nla_data(tb[TCA_STAB_BASE]);
340
341 if (s->tsize > 0) {
342 if (!tb[TCA_STAB_DATA])
343 return ERR_PTR(-EINVAL);
344 tab = nla_data(tb[TCA_STAB_DATA]);
345 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
346 }
347
348 if (!s || tsize != s->tsize || (!tab && tsize > 0))
349 return ERR_PTR(-EINVAL);
350
351 spin_lock(&qdisc_stab_lock);
352
353 list_for_each_entry(stab, &qdisc_stab_list, list) {
354 if (memcmp(&stab->szopts, s, sizeof(*s)))
355 continue;
356 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
357 continue;
358 stab->refcnt++;
359 spin_unlock(&qdisc_stab_lock);
360 return stab;
361 }
362
363 spin_unlock(&qdisc_stab_lock);
364
365 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
366 if (!stab)
367 return ERR_PTR(-ENOMEM);
368
369 stab->refcnt = 1;
370 stab->szopts = *s;
371 if (tsize > 0)
372 memcpy(stab->data, tab, tsize * sizeof(u16));
373
374 spin_lock(&qdisc_stab_lock);
375 list_add_tail(&stab->list, &qdisc_stab_list);
376 spin_unlock(&qdisc_stab_lock);
377
378 return stab;
379 }
380
381 void qdisc_put_stab(struct qdisc_size_table *tab)
382 {
383 if (!tab)
384 return;
385
386 spin_lock(&qdisc_stab_lock);
387
388 if (--tab->refcnt == 0) {
389 list_del(&tab->list);
390 kfree(tab);
391 }
392
393 spin_unlock(&qdisc_stab_lock);
394 }
395 EXPORT_SYMBOL(qdisc_put_stab);
396
397 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
398 {
399 struct nlattr *nest;
400
401 nest = nla_nest_start(skb, TCA_STAB);
402 if (nest == NULL)
403 goto nla_put_failure;
404 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
405 nla_nest_end(skb, nest);
406
407 return skb->len;
408
409 nla_put_failure:
410 return -1;
411 }
412
413 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
414 {
415 int pkt_len, slot;
416
417 pkt_len = skb->len + stab->szopts.overhead;
418 if (unlikely(!stab->szopts.tsize))
419 goto out;
420
421 slot = pkt_len + stab->szopts.cell_align;
422 if (unlikely(slot < 0))
423 slot = 0;
424
425 slot >>= stab->szopts.cell_log;
426 if (likely(slot < stab->szopts.tsize))
427 pkt_len = stab->data[slot];
428 else
429 pkt_len = stab->data[stab->szopts.tsize - 1] *
430 (slot / stab->szopts.tsize) +
431 stab->data[slot % stab->szopts.tsize];
432
433 pkt_len <<= stab->szopts.size_log;
434 out:
435 if (unlikely(pkt_len < 1))
436 pkt_len = 1;
437 qdisc_skb_cb(skb)->pkt_len = pkt_len;
438 }
439 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
440
441 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
442 {
443 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
444 printk(KERN_WARNING
445 "%s: %s qdisc %X: is non-work-conserving?\n",
446 txt, qdisc->ops->id, qdisc->handle >> 16);
447 qdisc->flags |= TCQ_F_WARN_NONWC;
448 }
449 }
450 EXPORT_SYMBOL(qdisc_warn_nonwc);
451
452 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
453 {
454 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
455 timer);
456
457 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
458 __netif_schedule(qdisc_root(wd->qdisc));
459
460 return HRTIMER_NORESTART;
461 }
462
463 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
464 {
465 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
466 wd->timer.function = qdisc_watchdog;
467 wd->qdisc = qdisc;
468 }
469 EXPORT_SYMBOL(qdisc_watchdog_init);
470
471 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
472 {
473 ktime_t time;
474
475 if (test_bit(__QDISC_STATE_DEACTIVATED,
476 &qdisc_root_sleeping(wd->qdisc)->state))
477 return;
478
479 wd->qdisc->flags |= TCQ_F_THROTTLED;
480 time = ktime_set(0, 0);
481 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
482 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
483 }
484 EXPORT_SYMBOL(qdisc_watchdog_schedule);
485
486 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
487 {
488 hrtimer_cancel(&wd->timer);
489 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
490 }
491 EXPORT_SYMBOL(qdisc_watchdog_cancel);
492
493 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
494 {
495 unsigned int size = n * sizeof(struct hlist_head), i;
496 struct hlist_head *h;
497
498 if (size <= PAGE_SIZE)
499 h = kmalloc(size, GFP_KERNEL);
500 else
501 h = (struct hlist_head *)
502 __get_free_pages(GFP_KERNEL, get_order(size));
503
504 if (h != NULL) {
505 for (i = 0; i < n; i++)
506 INIT_HLIST_HEAD(&h[i]);
507 }
508 return h;
509 }
510
511 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
512 {
513 unsigned int size = n * sizeof(struct hlist_head);
514
515 if (size <= PAGE_SIZE)
516 kfree(h);
517 else
518 free_pages((unsigned long)h, get_order(size));
519 }
520
521 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
522 {
523 struct Qdisc_class_common *cl;
524 struct hlist_node *n, *next;
525 struct hlist_head *nhash, *ohash;
526 unsigned int nsize, nmask, osize;
527 unsigned int i, h;
528
529 /* Rehash when load factor exceeds 0.75 */
530 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
531 return;
532 nsize = clhash->hashsize * 2;
533 nmask = nsize - 1;
534 nhash = qdisc_class_hash_alloc(nsize);
535 if (nhash == NULL)
536 return;
537
538 ohash = clhash->hash;
539 osize = clhash->hashsize;
540
541 sch_tree_lock(sch);
542 for (i = 0; i < osize; i++) {
543 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
544 h = qdisc_class_hash(cl->classid, nmask);
545 hlist_add_head(&cl->hnode, &nhash[h]);
546 }
547 }
548 clhash->hash = nhash;
549 clhash->hashsize = nsize;
550 clhash->hashmask = nmask;
551 sch_tree_unlock(sch);
552
553 qdisc_class_hash_free(ohash, osize);
554 }
555 EXPORT_SYMBOL(qdisc_class_hash_grow);
556
557 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
558 {
559 unsigned int size = 4;
560
561 clhash->hash = qdisc_class_hash_alloc(size);
562 if (clhash->hash == NULL)
563 return -ENOMEM;
564 clhash->hashsize = size;
565 clhash->hashmask = size - 1;
566 clhash->hashelems = 0;
567 return 0;
568 }
569 EXPORT_SYMBOL(qdisc_class_hash_init);
570
571 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
572 {
573 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
574 }
575 EXPORT_SYMBOL(qdisc_class_hash_destroy);
576
577 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
578 struct Qdisc_class_common *cl)
579 {
580 unsigned int h;
581
582 INIT_HLIST_NODE(&cl->hnode);
583 h = qdisc_class_hash(cl->classid, clhash->hashmask);
584 hlist_add_head(&cl->hnode, &clhash->hash[h]);
585 clhash->hashelems++;
586 }
587 EXPORT_SYMBOL(qdisc_class_hash_insert);
588
589 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
590 struct Qdisc_class_common *cl)
591 {
592 hlist_del(&cl->hnode);
593 clhash->hashelems--;
594 }
595 EXPORT_SYMBOL(qdisc_class_hash_remove);
596
597 /* Allocate an unique handle from space managed by kernel */
598
599 static u32 qdisc_alloc_handle(struct net_device *dev)
600 {
601 int i = 0x10000;
602 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
603
604 do {
605 autohandle += TC_H_MAKE(0x10000U, 0);
606 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
607 autohandle = TC_H_MAKE(0x80000000U, 0);
608 } while (qdisc_lookup(dev, autohandle) && --i > 0);
609
610 return i>0 ? autohandle : 0;
611 }
612
613 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
614 {
615 const struct Qdisc_class_ops *cops;
616 unsigned long cl;
617 u32 parentid;
618
619 if (n == 0)
620 return;
621 while ((parentid = sch->parent)) {
622 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
623 return;
624
625 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
626 if (sch == NULL) {
627 WARN_ON(parentid != TC_H_ROOT);
628 return;
629 }
630 cops = sch->ops->cl_ops;
631 if (cops->qlen_notify) {
632 cl = cops->get(sch, parentid);
633 cops->qlen_notify(sch, cl);
634 cops->put(sch, cl);
635 }
636 sch->q.qlen -= n;
637 }
638 }
639 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
640
641 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
642 struct Qdisc *old, struct Qdisc *new)
643 {
644 if (new || old)
645 qdisc_notify(skb, n, clid, old, new);
646
647 if (old)
648 qdisc_destroy(old);
649 }
650
651 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
652 * to device "dev".
653 *
654 * When appropriate send a netlink notification using 'skb'
655 * and "n".
656 *
657 * On success, destroy old qdisc.
658 */
659
660 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
661 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
662 struct Qdisc *new, struct Qdisc *old)
663 {
664 struct Qdisc *q = old;
665 int err = 0;
666
667 if (parent == NULL) {
668 unsigned int i, num_q, ingress;
669
670 ingress = 0;
671 num_q = dev->num_tx_queues;
672 if ((q && q->flags & TCQ_F_INGRESS) ||
673 (new && new->flags & TCQ_F_INGRESS)) {
674 num_q = 1;
675 ingress = 1;
676 }
677
678 if (dev->flags & IFF_UP)
679 dev_deactivate(dev);
680
681 if (new && new->ops->attach) {
682 new->ops->attach(new);
683 num_q = 0;
684 }
685
686 for (i = 0; i < num_q; i++) {
687 struct netdev_queue *dev_queue = &dev->rx_queue;
688
689 if (!ingress)
690 dev_queue = netdev_get_tx_queue(dev, i);
691
692 old = dev_graft_qdisc(dev_queue, new);
693 if (new && i > 0)
694 atomic_inc(&new->refcnt);
695
696 qdisc_destroy(old);
697 }
698
699 notify_and_destroy(skb, n, classid, dev->qdisc, new);
700 if (new && !new->ops->attach)
701 atomic_inc(&new->refcnt);
702 dev->qdisc = new ? : &noop_qdisc;
703
704 if (dev->flags & IFF_UP)
705 dev_activate(dev);
706 } else {
707 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
708
709 err = -EOPNOTSUPP;
710 if (cops && cops->graft) {
711 unsigned long cl = cops->get(parent, classid);
712 if (cl) {
713 err = cops->graft(parent, cl, new, &old);
714 cops->put(parent, cl);
715 } else
716 err = -ENOENT;
717 }
718 if (!err)
719 notify_and_destroy(skb, n, classid, old, new);
720 }
721 return err;
722 }
723
724 /* lockdep annotation is needed for ingress; egress gets it only for name */
725 static struct lock_class_key qdisc_tx_lock;
726 static struct lock_class_key qdisc_rx_lock;
727
728 /*
729 Allocate and initialize new qdisc.
730
731 Parameters are passed via opt.
732 */
733
734 static struct Qdisc *
735 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
736 struct Qdisc *p, u32 parent, u32 handle,
737 struct nlattr **tca, int *errp)
738 {
739 int err;
740 struct nlattr *kind = tca[TCA_KIND];
741 struct Qdisc *sch;
742 struct Qdisc_ops *ops;
743 struct qdisc_size_table *stab;
744
745 ops = qdisc_lookup_ops(kind);
746 #ifdef CONFIG_MODULES
747 if (ops == NULL && kind != NULL) {
748 char name[IFNAMSIZ];
749 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
750 /* We dropped the RTNL semaphore in order to
751 * perform the module load. So, even if we
752 * succeeded in loading the module we have to
753 * tell the caller to replay the request. We
754 * indicate this using -EAGAIN.
755 * We replay the request because the device may
756 * go away in the mean time.
757 */
758 rtnl_unlock();
759 request_module("sch_%s", name);
760 rtnl_lock();
761 ops = qdisc_lookup_ops(kind);
762 if (ops != NULL) {
763 /* We will try again qdisc_lookup_ops,
764 * so don't keep a reference.
765 */
766 module_put(ops->owner);
767 err = -EAGAIN;
768 goto err_out;
769 }
770 }
771 }
772 #endif
773
774 err = -ENOENT;
775 if (ops == NULL)
776 goto err_out;
777
778 sch = qdisc_alloc(dev_queue, ops);
779 if (IS_ERR(sch)) {
780 err = PTR_ERR(sch);
781 goto err_out2;
782 }
783
784 sch->parent = parent;
785
786 if (handle == TC_H_INGRESS) {
787 sch->flags |= TCQ_F_INGRESS;
788 handle = TC_H_MAKE(TC_H_INGRESS, 0);
789 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
790 } else {
791 if (handle == 0) {
792 handle = qdisc_alloc_handle(dev);
793 err = -ENOMEM;
794 if (handle == 0)
795 goto err_out3;
796 }
797 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
798 }
799
800 sch->handle = handle;
801
802 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
803 if (tca[TCA_STAB]) {
804 stab = qdisc_get_stab(tca[TCA_STAB]);
805 if (IS_ERR(stab)) {
806 err = PTR_ERR(stab);
807 goto err_out3;
808 }
809 sch->stab = stab;
810 }
811 if (tca[TCA_RATE]) {
812 spinlock_t *root_lock;
813
814 err = -EOPNOTSUPP;
815 if (sch->flags & TCQ_F_MQROOT)
816 goto err_out4;
817
818 if ((sch->parent != TC_H_ROOT) &&
819 !(sch->flags & TCQ_F_INGRESS) &&
820 (!p || !(p->flags & TCQ_F_MQROOT)))
821 root_lock = qdisc_root_sleeping_lock(sch);
822 else
823 root_lock = qdisc_lock(sch);
824
825 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
826 root_lock, tca[TCA_RATE]);
827 if (err)
828 goto err_out4;
829 }
830
831 qdisc_list_add(sch);
832
833 return sch;
834 }
835 err_out3:
836 qdisc_put_stab(sch->stab);
837 dev_put(dev);
838 kfree((char *) sch - sch->padded);
839 err_out2:
840 module_put(ops->owner);
841 err_out:
842 *errp = err;
843 return NULL;
844
845 err_out4:
846 /*
847 * Any broken qdiscs that would require a ops->reset() here?
848 * The qdisc was never in action so it shouldn't be necessary.
849 */
850 if (ops->destroy)
851 ops->destroy(sch);
852 goto err_out3;
853 }
854
855 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
856 {
857 struct qdisc_size_table *stab = NULL;
858 int err = 0;
859
860 if (tca[TCA_OPTIONS]) {
861 if (sch->ops->change == NULL)
862 return -EINVAL;
863 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
864 if (err)
865 return err;
866 }
867
868 if (tca[TCA_STAB]) {
869 stab = qdisc_get_stab(tca[TCA_STAB]);
870 if (IS_ERR(stab))
871 return PTR_ERR(stab);
872 }
873
874 qdisc_put_stab(sch->stab);
875 sch->stab = stab;
876
877 if (tca[TCA_RATE]) {
878 /* NB: ignores errors from replace_estimator
879 because change can't be undone. */
880 if (sch->flags & TCQ_F_MQROOT)
881 goto out;
882 gen_replace_estimator(&sch->bstats, &sch->rate_est,
883 qdisc_root_sleeping_lock(sch),
884 tca[TCA_RATE]);
885 }
886 out:
887 return 0;
888 }
889
890 struct check_loop_arg
891 {
892 struct qdisc_walker w;
893 struct Qdisc *p;
894 int depth;
895 };
896
897 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
898
899 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
900 {
901 struct check_loop_arg arg;
902
903 if (q->ops->cl_ops == NULL)
904 return 0;
905
906 arg.w.stop = arg.w.skip = arg.w.count = 0;
907 arg.w.fn = check_loop_fn;
908 arg.depth = depth;
909 arg.p = p;
910 q->ops->cl_ops->walk(q, &arg.w);
911 return arg.w.stop ? -ELOOP : 0;
912 }
913
914 static int
915 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
916 {
917 struct Qdisc *leaf;
918 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
919 struct check_loop_arg *arg = (struct check_loop_arg *)w;
920
921 leaf = cops->leaf(q, cl);
922 if (leaf) {
923 if (leaf == arg->p || arg->depth > 7)
924 return -ELOOP;
925 return check_loop(leaf, arg->p, arg->depth + 1);
926 }
927 return 0;
928 }
929
930 /*
931 * Delete/get qdisc.
932 */
933
934 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
935 {
936 struct net *net = sock_net(skb->sk);
937 struct tcmsg *tcm = NLMSG_DATA(n);
938 struct nlattr *tca[TCA_MAX + 1];
939 struct net_device *dev;
940 u32 clid = tcm->tcm_parent;
941 struct Qdisc *q = NULL;
942 struct Qdisc *p = NULL;
943 int err;
944
945 if (net != &init_net)
946 return -EINVAL;
947
948 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
949 return -ENODEV;
950
951 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
952 if (err < 0)
953 return err;
954
955 if (clid) {
956 if (clid != TC_H_ROOT) {
957 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
958 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
959 return -ENOENT;
960 q = qdisc_leaf(p, clid);
961 } else { /* ingress */
962 q = dev->rx_queue.qdisc_sleeping;
963 }
964 } else {
965 q = dev->qdisc;
966 }
967 if (!q)
968 return -ENOENT;
969
970 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
971 return -EINVAL;
972 } else {
973 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
974 return -ENOENT;
975 }
976
977 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
978 return -EINVAL;
979
980 if (n->nlmsg_type == RTM_DELQDISC) {
981 if (!clid)
982 return -EINVAL;
983 if (q->handle == 0)
984 return -ENOENT;
985 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
986 return err;
987 } else {
988 qdisc_notify(skb, n, clid, NULL, q);
989 }
990 return 0;
991 }
992
993 /*
994 Create/change qdisc.
995 */
996
997 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
998 {
999 struct net *net = sock_net(skb->sk);
1000 struct tcmsg *tcm;
1001 struct nlattr *tca[TCA_MAX + 1];
1002 struct net_device *dev;
1003 u32 clid;
1004 struct Qdisc *q, *p;
1005 int err;
1006
1007 if (net != &init_net)
1008 return -EINVAL;
1009
1010 replay:
1011 /* Reinit, just in case something touches this. */
1012 tcm = NLMSG_DATA(n);
1013 clid = tcm->tcm_parent;
1014 q = p = NULL;
1015
1016 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1017 return -ENODEV;
1018
1019 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1020 if (err < 0)
1021 return err;
1022
1023 if (clid) {
1024 if (clid != TC_H_ROOT) {
1025 if (clid != TC_H_INGRESS) {
1026 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1027 return -ENOENT;
1028 q = qdisc_leaf(p, clid);
1029 } else { /*ingress */
1030 q = dev->rx_queue.qdisc_sleeping;
1031 }
1032 } else {
1033 q = dev->qdisc;
1034 }
1035
1036 /* It may be default qdisc, ignore it */
1037 if (q && q->handle == 0)
1038 q = NULL;
1039
1040 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1041 if (tcm->tcm_handle) {
1042 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1043 return -EEXIST;
1044 if (TC_H_MIN(tcm->tcm_handle))
1045 return -EINVAL;
1046 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1047 goto create_n_graft;
1048 if (n->nlmsg_flags&NLM_F_EXCL)
1049 return -EEXIST;
1050 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1051 return -EINVAL;
1052 if (q == p ||
1053 (p && check_loop(q, p, 0)))
1054 return -ELOOP;
1055 atomic_inc(&q->refcnt);
1056 goto graft;
1057 } else {
1058 if (q == NULL)
1059 goto create_n_graft;
1060
1061 /* This magic test requires explanation.
1062 *
1063 * We know, that some child q is already
1064 * attached to this parent and have choice:
1065 * either to change it or to create/graft new one.
1066 *
1067 * 1. We are allowed to create/graft only
1068 * if CREATE and REPLACE flags are set.
1069 *
1070 * 2. If EXCL is set, requestor wanted to say,
1071 * that qdisc tcm_handle is not expected
1072 * to exist, so that we choose create/graft too.
1073 *
1074 * 3. The last case is when no flags are set.
1075 * Alas, it is sort of hole in API, we
1076 * cannot decide what to do unambiguously.
1077 * For now we select create/graft, if
1078 * user gave KIND, which does not match existing.
1079 */
1080 if ((n->nlmsg_flags&NLM_F_CREATE) &&
1081 (n->nlmsg_flags&NLM_F_REPLACE) &&
1082 ((n->nlmsg_flags&NLM_F_EXCL) ||
1083 (tca[TCA_KIND] &&
1084 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1085 goto create_n_graft;
1086 }
1087 }
1088 } else {
1089 if (!tcm->tcm_handle)
1090 return -EINVAL;
1091 q = qdisc_lookup(dev, tcm->tcm_handle);
1092 }
1093
1094 /* Change qdisc parameters */
1095 if (q == NULL)
1096 return -ENOENT;
1097 if (n->nlmsg_flags&NLM_F_EXCL)
1098 return -EEXIST;
1099 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1100 return -EINVAL;
1101 err = qdisc_change(q, tca);
1102 if (err == 0)
1103 qdisc_notify(skb, n, clid, NULL, q);
1104 return err;
1105
1106 create_n_graft:
1107 if (!(n->nlmsg_flags&NLM_F_CREATE))
1108 return -ENOENT;
1109 if (clid == TC_H_INGRESS)
1110 q = qdisc_create(dev, &dev->rx_queue, p,
1111 tcm->tcm_parent, tcm->tcm_parent,
1112 tca, &err);
1113 else {
1114 unsigned int ntx = 0;
1115
1116 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1117 ntx = p->ops->cl_ops->select_queue(p, tcm);
1118
1119 q = qdisc_create(dev, netdev_get_tx_queue(dev, ntx), p,
1120 tcm->tcm_parent, tcm->tcm_handle,
1121 tca, &err);
1122 }
1123 if (q == NULL) {
1124 if (err == -EAGAIN)
1125 goto replay;
1126 return err;
1127 }
1128
1129 graft:
1130 err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1131 if (err) {
1132 if (q)
1133 qdisc_destroy(q);
1134 return err;
1135 }
1136
1137 return 0;
1138 }
1139
1140 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1141 u32 pid, u32 seq, u16 flags, int event)
1142 {
1143 struct tcmsg *tcm;
1144 struct nlmsghdr *nlh;
1145 unsigned char *b = skb_tail_pointer(skb);
1146 struct gnet_dump d;
1147
1148 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1149 tcm = NLMSG_DATA(nlh);
1150 tcm->tcm_family = AF_UNSPEC;
1151 tcm->tcm__pad1 = 0;
1152 tcm->tcm__pad2 = 0;
1153 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1154 tcm->tcm_parent = clid;
1155 tcm->tcm_handle = q->handle;
1156 tcm->tcm_info = atomic_read(&q->refcnt);
1157 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1158 if (q->ops->dump && q->ops->dump(q, skb) < 0)
1159 goto nla_put_failure;
1160 q->qstats.qlen = q->q.qlen;
1161
1162 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1163 goto nla_put_failure;
1164
1165 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1166 qdisc_root_sleeping_lock(q), &d) < 0)
1167 goto nla_put_failure;
1168
1169 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1170 goto nla_put_failure;
1171
1172 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1173 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1174 gnet_stats_copy_queue(&d, &q->qstats) < 0)
1175 goto nla_put_failure;
1176
1177 if (gnet_stats_finish_copy(&d) < 0)
1178 goto nla_put_failure;
1179
1180 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1181 return skb->len;
1182
1183 nlmsg_failure:
1184 nla_put_failure:
1185 nlmsg_trim(skb, b);
1186 return -1;
1187 }
1188
1189 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1190 u32 clid, struct Qdisc *old, struct Qdisc *new)
1191 {
1192 struct sk_buff *skb;
1193 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1194
1195 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1196 if (!skb)
1197 return -ENOBUFS;
1198
1199 if (old && old->handle) {
1200 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1201 goto err_out;
1202 }
1203 if (new) {
1204 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1205 goto err_out;
1206 }
1207
1208 if (skb->len)
1209 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1210
1211 err_out:
1212 kfree_skb(skb);
1213 return -EINVAL;
1214 }
1215
1216 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1217 {
1218 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1219 }
1220
1221 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1222 struct netlink_callback *cb,
1223 int *q_idx_p, int s_q_idx)
1224 {
1225 int ret = 0, q_idx = *q_idx_p;
1226 struct Qdisc *q;
1227
1228 if (!root)
1229 return 0;
1230
1231 q = root;
1232 if (q_idx < s_q_idx) {
1233 q_idx++;
1234 } else {
1235 if (!tc_qdisc_dump_ignore(q) &&
1236 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1237 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1238 goto done;
1239 q_idx++;
1240 }
1241 list_for_each_entry(q, &root->list, list) {
1242 if (q_idx < s_q_idx) {
1243 q_idx++;
1244 continue;
1245 }
1246 if (!tc_qdisc_dump_ignore(q) &&
1247 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1248 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1249 goto done;
1250 q_idx++;
1251 }
1252
1253 out:
1254 *q_idx_p = q_idx;
1255 return ret;
1256 done:
1257 ret = -1;
1258 goto out;
1259 }
1260
1261 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1262 {
1263 struct net *net = sock_net(skb->sk);
1264 int idx, q_idx;
1265 int s_idx, s_q_idx;
1266 struct net_device *dev;
1267
1268 if (net != &init_net)
1269 return 0;
1270
1271 s_idx = cb->args[0];
1272 s_q_idx = q_idx = cb->args[1];
1273 read_lock(&dev_base_lock);
1274 idx = 0;
1275 for_each_netdev(&init_net, dev) {
1276 struct netdev_queue *dev_queue;
1277
1278 if (idx < s_idx)
1279 goto cont;
1280 if (idx > s_idx)
1281 s_q_idx = 0;
1282 q_idx = 0;
1283
1284 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1285 goto done;
1286
1287 dev_queue = &dev->rx_queue;
1288 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1289 goto done;
1290
1291 cont:
1292 idx++;
1293 }
1294
1295 done:
1296 read_unlock(&dev_base_lock);
1297
1298 cb->args[0] = idx;
1299 cb->args[1] = q_idx;
1300
1301 return skb->len;
1302 }
1303
1304
1305
1306 /************************************************
1307 * Traffic classes manipulation. *
1308 ************************************************/
1309
1310
1311
1312 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1313 {
1314 struct net *net = sock_net(skb->sk);
1315 struct tcmsg *tcm = NLMSG_DATA(n);
1316 struct nlattr *tca[TCA_MAX + 1];
1317 struct net_device *dev;
1318 struct Qdisc *q = NULL;
1319 const struct Qdisc_class_ops *cops;
1320 unsigned long cl = 0;
1321 unsigned long new_cl;
1322 u32 pid = tcm->tcm_parent;
1323 u32 clid = tcm->tcm_handle;
1324 u32 qid = TC_H_MAJ(clid);
1325 int err;
1326
1327 if (net != &init_net)
1328 return -EINVAL;
1329
1330 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1331 return -ENODEV;
1332
1333 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1334 if (err < 0)
1335 return err;
1336
1337 /*
1338 parent == TC_H_UNSPEC - unspecified parent.
1339 parent == TC_H_ROOT - class is root, which has no parent.
1340 parent == X:0 - parent is root class.
1341 parent == X:Y - parent is a node in hierarchy.
1342 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1343
1344 handle == 0:0 - generate handle from kernel pool.
1345 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1346 handle == X:Y - clear.
1347 handle == X:0 - root class.
1348 */
1349
1350 /* Step 1. Determine qdisc handle X:0 */
1351
1352 if (pid != TC_H_ROOT) {
1353 u32 qid1 = TC_H_MAJ(pid);
1354
1355 if (qid && qid1) {
1356 /* If both majors are known, they must be identical. */
1357 if (qid != qid1)
1358 return -EINVAL;
1359 } else if (qid1) {
1360 qid = qid1;
1361 } else if (qid == 0)
1362 qid = dev->qdisc->handle;
1363
1364 /* Now qid is genuine qdisc handle consistent
1365 both with parent and child.
1366
1367 TC_H_MAJ(pid) still may be unspecified, complete it now.
1368 */
1369 if (pid)
1370 pid = TC_H_MAKE(qid, pid);
1371 } else {
1372 if (qid == 0)
1373 qid = dev->qdisc->handle;
1374 }
1375
1376 /* OK. Locate qdisc */
1377 if ((q = qdisc_lookup(dev, qid)) == NULL)
1378 return -ENOENT;
1379
1380 /* An check that it supports classes */
1381 cops = q->ops->cl_ops;
1382 if (cops == NULL)
1383 return -EINVAL;
1384
1385 /* Now try to get class */
1386 if (clid == 0) {
1387 if (pid == TC_H_ROOT)
1388 clid = qid;
1389 } else
1390 clid = TC_H_MAKE(qid, clid);
1391
1392 if (clid)
1393 cl = cops->get(q, clid);
1394
1395 if (cl == 0) {
1396 err = -ENOENT;
1397 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1398 goto out;
1399 } else {
1400 switch (n->nlmsg_type) {
1401 case RTM_NEWTCLASS:
1402 err = -EEXIST;
1403 if (n->nlmsg_flags&NLM_F_EXCL)
1404 goto out;
1405 break;
1406 case RTM_DELTCLASS:
1407 err = -EOPNOTSUPP;
1408 if (cops->delete)
1409 err = cops->delete(q, cl);
1410 if (err == 0)
1411 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1412 goto out;
1413 case RTM_GETTCLASS:
1414 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1415 goto out;
1416 default:
1417 err = -EINVAL;
1418 goto out;
1419 }
1420 }
1421
1422 new_cl = cl;
1423 err = -EOPNOTSUPP;
1424 if (cops->change)
1425 err = cops->change(q, clid, pid, tca, &new_cl);
1426 if (err == 0)
1427 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1428
1429 out:
1430 if (cl)
1431 cops->put(q, cl);
1432
1433 return err;
1434 }
1435
1436
1437 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1438 unsigned long cl,
1439 u32 pid, u32 seq, u16 flags, int event)
1440 {
1441 struct tcmsg *tcm;
1442 struct nlmsghdr *nlh;
1443 unsigned char *b = skb_tail_pointer(skb);
1444 struct gnet_dump d;
1445 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1446
1447 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1448 tcm = NLMSG_DATA(nlh);
1449 tcm->tcm_family = AF_UNSPEC;
1450 tcm->tcm__pad1 = 0;
1451 tcm->tcm__pad2 = 0;
1452 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1453 tcm->tcm_parent = q->handle;
1454 tcm->tcm_handle = q->handle;
1455 tcm->tcm_info = 0;
1456 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1457 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1458 goto nla_put_failure;
1459
1460 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1461 qdisc_root_sleeping_lock(q), &d) < 0)
1462 goto nla_put_failure;
1463
1464 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1465 goto nla_put_failure;
1466
1467 if (gnet_stats_finish_copy(&d) < 0)
1468 goto nla_put_failure;
1469
1470 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1471 return skb->len;
1472
1473 nlmsg_failure:
1474 nla_put_failure:
1475 nlmsg_trim(skb, b);
1476 return -1;
1477 }
1478
1479 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1480 struct Qdisc *q, unsigned long cl, int event)
1481 {
1482 struct sk_buff *skb;
1483 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1484
1485 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1486 if (!skb)
1487 return -ENOBUFS;
1488
1489 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1490 kfree_skb(skb);
1491 return -EINVAL;
1492 }
1493
1494 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1495 }
1496
1497 struct qdisc_dump_args
1498 {
1499 struct qdisc_walker w;
1500 struct sk_buff *skb;
1501 struct netlink_callback *cb;
1502 };
1503
1504 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1505 {
1506 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1507
1508 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1509 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1510 }
1511
1512 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1513 struct tcmsg *tcm, struct netlink_callback *cb,
1514 int *t_p, int s_t)
1515 {
1516 struct qdisc_dump_args arg;
1517
1518 if (tc_qdisc_dump_ignore(q) ||
1519 *t_p < s_t || !q->ops->cl_ops ||
1520 (tcm->tcm_parent &&
1521 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1522 (*t_p)++;
1523 return 0;
1524 }
1525 if (*t_p > s_t)
1526 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1527 arg.w.fn = qdisc_class_dump;
1528 arg.skb = skb;
1529 arg.cb = cb;
1530 arg.w.stop = 0;
1531 arg.w.skip = cb->args[1];
1532 arg.w.count = 0;
1533 q->ops->cl_ops->walk(q, &arg.w);
1534 cb->args[1] = arg.w.count;
1535 if (arg.w.stop)
1536 return -1;
1537 (*t_p)++;
1538 return 0;
1539 }
1540
1541 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1542 struct tcmsg *tcm, struct netlink_callback *cb,
1543 int *t_p, int s_t)
1544 {
1545 struct Qdisc *q;
1546
1547 if (!root)
1548 return 0;
1549
1550 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1551 return -1;
1552
1553 list_for_each_entry(q, &root->list, list) {
1554 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1555 return -1;
1556 }
1557
1558 return 0;
1559 }
1560
1561 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1562 {
1563 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1564 struct net *net = sock_net(skb->sk);
1565 struct netdev_queue *dev_queue;
1566 struct net_device *dev;
1567 int t, s_t;
1568
1569 if (net != &init_net)
1570 return 0;
1571
1572 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1573 return 0;
1574 if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1575 return 0;
1576
1577 s_t = cb->args[0];
1578 t = 0;
1579
1580 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1581 goto done;
1582
1583 dev_queue = &dev->rx_queue;
1584 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1585 goto done;
1586
1587 done:
1588 cb->args[0] = t;
1589
1590 dev_put(dev);
1591 return skb->len;
1592 }
1593
1594 /* Main classifier routine: scans classifier chain attached
1595 to this qdisc, (optionally) tests for protocol and asks
1596 specific classifiers.
1597 */
1598 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1599 struct tcf_result *res)
1600 {
1601 __be16 protocol = skb->protocol;
1602 int err = 0;
1603
1604 for (; tp; tp = tp->next) {
1605 if ((tp->protocol == protocol ||
1606 tp->protocol == htons(ETH_P_ALL)) &&
1607 (err = tp->classify(skb, tp, res)) >= 0) {
1608 #ifdef CONFIG_NET_CLS_ACT
1609 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1610 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1611 #endif
1612 return err;
1613 }
1614 }
1615 return -1;
1616 }
1617 EXPORT_SYMBOL(tc_classify_compat);
1618
/*
 * Classify @skb using the classifier chain @tp, storing the result in
 * @res.
 *
 * This wraps tc_classify_compat(). With CONFIG_NET_CLS_ACT, a
 * TC_ACT_RECLASSIFY verdict restarts classification from the head of the
 * chain; the restart count is kept in skb->tc_verd. Once it reaches
 * MAX_REC_LOOP the loop is reported and TC_ACT_SHOT is returned.
 *
 * Returns the verdict of tc_classify_compat(), or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);

		/* Every restart begins at the head of the chain again. */
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			/*
			 * This path is reached once per offending packet, so
			 * rate-limit the message and give it an explicit
			 * log level.
			 */
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "rule prio %u protocol %02x reclassify loop, "
				       "packet dropped\n",
				       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1649
1650 void tcf_destroy(struct tcf_proto *tp)
1651 {
1652 tp->ops->destroy(tp);
1653 module_put(tp->ops->owner);
1654 kfree(tp);
1655 }
1656
1657 void tcf_destroy_chain(struct tcf_proto **fl)
1658 {
1659 struct tcf_proto *tp;
1660
1661 while ((tp = *fl) != NULL) {
1662 *fl = tp->next;
1663 tcf_destroy(tp);
1664 }
1665 }
1666 EXPORT_SYMBOL(tcf_destroy_chain);
1667
1668 #ifdef CONFIG_PROC_FS
1669 static int psched_show(struct seq_file *seq, void *v)
1670 {
1671 struct timespec ts;
1672
1673 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1674 seq_printf(seq, "%08x %08x %08x %08x\n",
1675 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1676 1000000,
1677 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1678
1679 return 0;
1680 }
1681
1682 static int psched_open(struct inode *inode, struct file *file)
1683 {
1684 return single_open(file, psched_show, PDE(inode)->data);
1685 }
1686
1687 static const struct file_operations psched_fops = {
1688 .owner = THIS_MODULE,
1689 .open = psched_open,
1690 .read = seq_read,
1691 .llseek = seq_lseek,
1692 .release = single_release,
1693 };
1694 #endif
1695
1696 static int __init pktsched_init(void)
1697 {
1698 register_qdisc(&pfifo_qdisc_ops);
1699 register_qdisc(&bfifo_qdisc_ops);
1700 register_qdisc(&mq_qdisc_ops);
1701 proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1702
1703 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1704 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1705 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1706 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1707 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1708 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1709
1710 return 0;
1711 }
1712
1713 subsys_initcall(pktsched_init);
This page took 0.066792 seconds and 5 git commands to generate.