[NETLINK]: don't reinitialize callback mutex
[deliverable/linux.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4
LT
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
1da177e4
LT
30#include <linux/init.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/kmod.h>
34#include <linux/list.h>
35#include <linux/bitops.h>
4179477f 36#include <linux/hrtimer.h>
1da177e4 37
dc5fc579 38#include <net/netlink.h>
1da177e4
LT
39#include <net/sock.h>
40#include <net/pkt_sched.h>
41
42#include <asm/processor.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
50
51/*
52
53 Short review.
54 -------------
55
56 This file consists of two interrelated parts:
57
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
60
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
65
66 qdisc's are divided to two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
70
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
73
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
79
80 All real intelligent work is done inside qdisc modules.
81
82
83
84 Every discipline has two major routines: enqueue and dequeue.
85
86 ---dequeue
87
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
94
95 ---enqueue
96
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
106
107 Auxiliary routines:
108
109 ---requeue
110
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
113
114 ---reset
115
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
118
119 ---init
120
121 initializes newly created qdisc.
122
123 ---destroy
124
125 destroys resources allocated by init and during lifetime of qdisc.
126
127 ---change
128
129 changes qdisc parameters.
130 */
131
132/* Protects list of registered TC modules. It is pure SMP lock. */
133static DEFINE_RWLOCK(qdisc_mod_lock);
134
135
136/************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
139
140
141/* The list of all installed queueing disciplines. */
142
143static struct Qdisc_ops *qdisc_base;
144
145/* Register/uregister queueing discipline */
146
147int register_qdisc(struct Qdisc_ops *qops)
148{
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
151
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
156
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
163
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
170}
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
43effa1e 194static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4
LT
195{
196 struct Qdisc *q;
197
1da177e4 198 list_for_each_entry(q, &dev->qdisc_list, list) {
43effa1e 199 if (q->handle == handle)
1da177e4 200 return q;
1da177e4 201 }
1da177e4
LT
202 return NULL;
203}
204
43effa1e
PM
205struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
206{
207 struct Qdisc *q;
208
209 read_lock(&qdisc_tree_lock);
210 q = __qdisc_lookup(dev, handle);
211 read_unlock(&qdisc_tree_lock);
212 return q;
213}
214
1da177e4
LT
215static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
216{
217 unsigned long cl;
218 struct Qdisc *leaf;
219 struct Qdisc_class_ops *cops = p->ops->cl_ops;
220
221 if (cops == NULL)
222 return NULL;
223 cl = cops->get(p, classid);
224
225 if (cl == 0)
226 return NULL;
227 leaf = cops->leaf(p, cl);
228 cops->put(p, cl);
229 return leaf;
230}
231
232/* Find queueing discipline by name */
233
234static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
235{
236 struct Qdisc_ops *q = NULL;
237
238 if (kind) {
239 read_lock(&qdisc_mod_lock);
240 for (q = qdisc_base; q; q = q->next) {
241 if (rtattr_strcmp(kind, q->id) == 0) {
242 if (!try_module_get(q->owner))
243 q = NULL;
244 break;
245 }
246 }
247 read_unlock(&qdisc_mod_lock);
248 }
249 return q;
250}
251
252static struct qdisc_rate_table *qdisc_rtab_list;
253
/* Look up (sharing an existing entry) or create a rate table for the
 * given rate spec.  @tab must carry exactly 1024 bytes of user-supplied
 * table data.  Tables are reference-counted and kept on the global
 * qdisc_rtab_list; release with qdisc_put_rtab().
 * NOTE(review): list access appears to rely on an outer lock (RTNL,
 * presumably) — confirm against callers.
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	/* Share an existing table whose rate spec matches exactly. */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	/* Reject a malformed spec or a payload that is not the expected
	 * 1024 bytes of table data. */
	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		/* Push onto the head of the global list. */
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
278
279void qdisc_put_rtab(struct qdisc_rate_table *tab)
280{
281 struct qdisc_rate_table *rtab, **rtabp;
282
283 if (!tab || --tab->refcnt)
284 return;
285
286 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
287 if (rtab == tab) {
288 *rtabp = rtab->next;
289 kfree(rtab);
290 return;
291 }
292 }
293}
294
4179477f
PM
/* hrtimer callback for a throttled qdisc: clear the throttle flag and
 * kick the device's transmit path so queued packets can flow again.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);
	struct net_device *dev = wd->qdisc->dev;

	/* Clear THROTTLED before restarting transmission; the write
	 * barrier orders the flag update ahead of the queue run. */
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	if (spin_trylock(&dev->queue_lock)) {
		qdisc_run(dev);
		spin_unlock(&dev->queue_lock);
	} else
		/* Lock contended: defer the queue run to softirq context. */
		netif_schedule(dev);

	return HRTIMER_NORESTART;
}
311
312void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
313{
314 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
315 wd->timer.function = qdisc_watchdog;
316 wd->qdisc = qdisc;
317}
318EXPORT_SYMBOL(qdisc_watchdog_init);
319
320void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
321{
322 ktime_t time;
323
324 wd->qdisc->flags |= TCQ_F_THROTTLED;
325 time = ktime_set(0, 0);
326 time = ktime_add_ns(time, PSCHED_US2NS(expires));
327 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
328}
329EXPORT_SYMBOL(qdisc_watchdog_schedule);
330
/* Stop a pending watchdog and un-throttle its qdisc.  The timer is
 * cancelled first so the flag cannot be re-cleared concurrently by the
 * callback. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4
LT
337
338/* Allocate an unique handle from space managed by kernel */
339
340static u32 qdisc_alloc_handle(struct net_device *dev)
341{
342 int i = 0x10000;
343 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
344
345 do {
346 autohandle += TC_H_MAKE(0x10000U, 0);
347 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
348 autohandle = TC_H_MAKE(0x80000000U, 0);
349 } while (qdisc_lookup(dev, autohandle) && --i > 0);
350
351 return i>0 ? autohandle : 0;
352}
353
354/* Attach toplevel qdisc to device dev */
355
/* Attach @qdisc as the toplevel (or ingress) discipline of @dev and
 * return the qdisc it displaced.  The device is deactivated around the
 * switch so no packets are in flight while the pointers are rewritten.
 */
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	/* Quiesce the transmit path before touching qdisc pointers. */
	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {	/* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		/* dev_activate() below installs qdisc_sleeping as the
		 * active qdisc; until then transmit through the no-op. */
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}
398
43effa1e
PM
399void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
400{
401 struct Qdisc_class_ops *cops;
402 unsigned long cl;
403 u32 parentid;
404
405 if (n == 0)
406 return;
407 while ((parentid = sch->parent)) {
408 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
409 cops = sch->ops->cl_ops;
410 if (cops->qlen_notify) {
411 cl = cops->get(sch, parentid);
412 cops->qlen_notify(sch, cl);
413 cops->put(sch, cl);
414 }
415 sch->q.qlen -= n;
416 }
417}
418EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4
LT
419
420/* Graft qdisc "new" to class "classid" of qdisc "parent" or
421 to device "dev".
422
423 Old qdisc is not destroyed but returned in *old.
424 */
425
426static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
427 u32 classid,
428 struct Qdisc *new, struct Qdisc **old)
429{
430 int err = 0;
431 struct Qdisc *q = *old;
432
433
10297b99 434 if (parent == NULL) {
1da177e4
LT
435 if (q && q->flags&TCQ_F_INGRESS) {
436 *old = dev_graft_qdisc(dev, q);
437 } else {
438 *old = dev_graft_qdisc(dev, new);
439 }
440 } else {
441 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
442
443 err = -EINVAL;
444
445 if (cops) {
446 unsigned long cl = cops->get(parent, classid);
447 if (cl) {
448 err = cops->graft(parent, cl, new, old);
449 if (new)
450 new->parent = classid;
451 cops->put(parent, cl);
452 }
453 }
454 }
455 return err;
456}
457
458/*
459 Allocate and initialize new qdisc.
460
461 Parameters are passed via opt.
462 */
463
/* Allocate and initialize a new qdisc of the kind named in @tca,
 * attached to @dev with the given @handle (0 = allocate one,
 * TC_H_INGRESS = ingress qdisc).  On failure returns NULL with the
 * error code in *@errp; -EAGAIN means a module was autoloaded while
 * RTNL was dropped and the caller must replay the whole request.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	/* qdisc_alloc() holds a device reference on success. */
	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		/* Publish the new qdisc on the device list. */
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	/* Undo qdisc_alloc(): drop the device ref, free the padded
	 * allocation. */
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
556
557static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
558{
559 if (tca[TCA_OPTIONS-1]) {
560 int err;
561
562 if (sch->ops->change == NULL)
563 return -EINVAL;
564 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
565 if (err)
566 return err;
567 }
568#ifdef CONFIG_NET_ESTIMATOR
569 if (tca[TCA_RATE-1])
570 gen_replace_estimator(&sch->bstats, &sch->rate_est,
571 sch->stats_lock, tca[TCA_RATE-1]);
572#endif
573 return 0;
574}
575
/* Walker state for loop detection: @p is the qdisc that must not be
 * reachable below itself, @depth bounds the recursion. */
struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};
582
583static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
584
585static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
586{
587 struct check_loop_arg arg;
588
589 if (q->ops->cl_ops == NULL)
590 return 0;
591
592 arg.w.stop = arg.w.skip = arg.w.count = 0;
593 arg.w.fn = check_loop_fn;
594 arg.depth = depth;
595 arg.p = p;
596 q->ops->cl_ops->walk(q, &arg.w);
597 return arg.w.stop ? -ELOOP : 0;
598}
599
600static int
601check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
602{
603 struct Qdisc *leaf;
604 struct Qdisc_class_ops *cops = q->ops->cl_ops;
605 struct check_loop_arg *arg = (struct check_loop_arg *)w;
606
607 leaf = cops->leaf(q, cl);
608 if (leaf) {
609 if (leaf == arg->p || arg->depth > 7)
610 return -ELOOP;
611 return check_loop(leaf, arg->p, arg->depth + 1);
612 }
613 return 0;
614}
615
616/*
617 * Delete/get qdisc.
618 */
619
/* RTM_DELQDISC / RTM_GETQDISC handler: locate the qdisc named by
 * tcm_parent/tcm_handle and either unlink+destroy it (DEL) or just
 * send a notification describing it (GET).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;		/* parent qdisc, when clid names one */
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		/* When a handle is also given it must match the qdisc
		 * found via the parent. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* Builtin default qdiscs (handle 0) cannot be deleted. */
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			/* qdisc_destroy() must run under queue_lock. */
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
676
677/*
678 Create/change qdisc.
679 */
680
/* RTM_NEWQDISC handler: create a new qdisc, graft one, or change an
 * existing one, honouring NLM_F_CREATE/REPLACE/EXCL.  qdisc_create()
 * may report -EAGAIN after a module autoload dropped RTNL, in which
 * case the whole request is replayed from scratch.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				/* Grafting q under p must not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			/* Graft failed: destroy the qdisc we created. */
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		/* Destroy whatever the graft displaced. */
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
817
/* Serialize qdisc @q into a netlink message: tcmsg header, TCA_KIND,
 * the qdisc's private dump and the (compat) statistics blocks.
 * Returns skb->len on success, -1 when the skb ran out of room; any
 * partially-written data is trimmed back off the skb.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	/* Refresh the queue length snapshot before dumping stats. */
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Roll back everything added for this partial message. */
	nlmsg_trim(skb, b);
	return -1;
}
865
866static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
867 u32 clid, struct Qdisc *old, struct Qdisc *new)
868{
869 struct sk_buff *skb;
870 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
871
872 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
873 if (!skb)
874 return -ENOBUFS;
875
876 if (old && old->handle) {
877 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
878 goto err_out;
879 }
880 if (new) {
881 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
882 goto err_out;
883 }
884
885 if (skb->len)
ac6d439d 886 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
887
888err_out:
889 kfree_skb(skb);
890 return -EINVAL;
891}
892
/* Netlink dump callback: enumerate every qdisc on every device.
 * cb->args[0] (device index) and cb->args[1] (qdisc index) carry the
 * resume position between successive invocations.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: start its list over */
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			/* Stop when the skb fills up; position is saved
			 * below for the next invocation. */
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
933
934
935
936/************************************************
937 * Traffic classes manipulation. *
938 ************************************************/
939
940
941
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from the
 * parent/handle majors, locate the class, and dispatch on the message
 * type (create/change, delete, or report).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* An check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* Only NEWTCLASS with NLM_F_CREATE may proceed without
		 * an existing class. */
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change: delegate to the qdisc's class ops. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
1053
1054
/* Serialize one traffic class into a netlink message: tcmsg header,
 * TCA_KIND, the class dump from the qdisc's class ops, and the compat
 * statistics blocks.  Returns skb->len on success, -1 on overflow
 * (partially-written data is trimmed back off the skb).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Roll back the partially-built message. */
	nlmsg_trim(skb, b);
	return -1;
}
1094
1095static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1096 struct Qdisc *q, unsigned long cl, int event)
1097{
1098 struct sk_buff *skb;
1099 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1100
1101 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1102 if (!skb)
1103 return -ENOBUFS;
1104
1105 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1106 kfree_skb(skb);
1107 return -EINVAL;
1108 }
1109
ac6d439d 1110 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1111}
1112
/* Walker state for class dumping: the destination skb plus the netlink
 * callback that supplies pid/seq and the resume counters. */
struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1119
1120static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1121{
1122 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1123
1124 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1125 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1126}
1127
/* Netlink dump callback: enumerate the classes of the qdiscs on one
 * device.  cb->args[0] resumes at the right qdisc, cb->args[1] at the
 * right class within it.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		/* Skip qdiscs already dumped, classless ones, and those
		 * not matching an explicitly requested parent major. */
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		/* Entering a new qdisc: reset the per-qdisc resume state. */
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1174
1175/* Main classifier routine: scans classifier chain attached
1176 to this qdisc, (optionally) tests for protocol and asks
1177 specific classifiers.
1178 */
/* Run @skb through the classifier chain @tp: each filter matching the
 * packet's protocol (or ETH_P_ALL) is asked to classify; the first
 * non-negative verdict wins.  Returns the classifier's verdict, or -1
 * when no filter matched.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;	/* chain head, for reclassification */
reclassify:
#endif
	/* NOTE(review): redundant with the initializer above; kept as-is. */
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			/* An action requested reclassification: restart
			 * from the chain head, bounded by MAX_REC_LOOP
			 * via the verdict counter in skb->tc_verd. */
			if ( TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
						tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
				goto reclassify;
			} else {
				/* Final verdict: clear the loop counter. */
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
				return err;
			}
#else

			return err;
#endif
		}

	}
	return -1;
}
1220
a48b5a61
PM
/* Free a single classifier: tear down its private state, drop the
 * module reference pinning its ops, then free the tcf_proto itself.
 * Order matters — tp->ops is dereferenced before tp is freed. */
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}
1227
1228void tcf_destroy_chain(struct tcf_proto *fl)
1229{
1230 struct tcf_proto *tp;
1231
1232 while ((tp = fl) != NULL) {
1233 fl = tp->next;
1234 tcf_destroy(tp);
1235 }
1236}
1237EXPORT_SYMBOL(tcf_destroy_chain);
1238
1da177e4
LT
1239#ifdef CONFIG_PROC_FS
/* /proc/net/psched: emit four hex words describing the scheduler's
 * clock parameters (us→ns scaling, PSCHED tick size in ns, a constant
 * 1000000, and the monotonic hrtimer resolution expressed in Hz).
 * NOTE(review): the exact field semantics userspace tc expects should
 * be confirmed against iproute2. */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));

	return 0;
}
1249
/* Open /proc/net/psched as a single-record seq file. */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
1254
da7071d7 1255static const struct file_operations psched_fops = {
1da177e4
LT
1256 .owner = THIS_MODULE,
1257 .open = psched_open,
1258 .read = seq_read,
1259 .llseek = seq_lseek,
1260 .release = single_release,
10297b99 1261};
1da177e4
LT
1262#endif
1263
1da177e4
LT
/* Subsystem init: register the built-in fifo disciplines, create the
 * /proc/net/psched clock-info file and hook the traffic-control
 * message types into rtnetlink. */
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
1281
1282EXPORT_SYMBOL(qdisc_get_rtab);
1283EXPORT_SYMBOL(qdisc_put_rtab);
1284EXPORT_SYMBOL(register_qdisc);
1285EXPORT_SYMBOL(unregister_qdisc);
1286EXPORT_SYMBOL(tc_classify);
This page took 0.275708 seconds and 5 git commands to generate.