pkt_sched: Remove qdisc->ops->requeue() etc.
[deliverable/linux.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4 21#include <linux/string.h>
1da177e4 22#include <linux/errno.h>
1da177e4 23#include <linux/skbuff.h>
1da177e4
LT
24#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
4179477f 29#include <linux/hrtimer.h>
25bfcd5a 30#include <linux/lockdep.h>
1da177e4 31
457c4cbc 32#include <net/net_namespace.h>
b854272b 33#include <net/sock.h>
dc5fc579 34#include <net/netlink.h>
1da177e4
LT
35#include <net/pkt_sched.h>
36
1da177e4
LT
37static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 struct Qdisc *old, struct Qdisc *new);
39static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 struct Qdisc *q, unsigned long cl, int event);
41
42/*
43
44 Short review.
45 -------------
46
47 This file consists of two interrelated parts:
48
49 1. queueing disciplines manager frontend.
50 2. traffic classes manager frontend.
51
52 Generally, queueing discipline ("qdisc") is a black box,
53 which is able to enqueue packets and to dequeue them (when
54 device is ready to send something) in order and at times
55 determined by algorithm hidden in it.
56
 qdiscs are divided into two categories:
58 - "queues", which have no internal structure visible from outside.
59 - "schedulers", which split all the packets to "traffic classes",
60 using "packet classifiers" (look at cls_api.c)
61
62 In turn, classes may have child qdiscs (as rule, queues)
63 attached to them etc. etc. etc.
64
65 The goal of the routines in this file is to translate
66 information supplied by user in the form of handles
67 to more intelligible for kernel form, to make some sanity
68 checks and part of work, which is common to all qdiscs
69 and to provide rtnetlink notifications.
70
71 All real intelligent work is done inside qdisc modules.
72
73
74
75 Every discipline has two major routines: enqueue and dequeue.
76
77 ---dequeue
78
79 dequeue usually returns a skb to send. It is allowed to return NULL,
80 but it does not mean that queue is empty, it just means that
81 discipline does not want to send anything this time.
82 Queue is really empty if q->q.qlen == 0.
83 For complicated disciplines with multiple queues q->q is not
84 real packet queue, but however q->q.qlen must be valid.
85
86 ---enqueue
87
88 enqueue returns 0, if packet was enqueued successfully.
89 If packet (this one or another one) was dropped, it returns
90 not zero error code.
91 NET_XMIT_DROP - this packet dropped
92 Expected action: do not backoff, but wait until queue will clear.
93 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
94 Expected action: backoff or ignore
95 NET_XMIT_POLICED - dropped by police.
96 Expected action: backoff or error to real-time apps.
97
98 Auxiliary routines:
99
99c0db26
JP
100 ---peek
101
102 like dequeue but without removing a packet from the queue
103
1da177e4
LT
104 ---reset
105
106 returns qdisc to initial state: purge all buffers, clear all
107 timers, counters (except for statistics) etc.
108
109 ---init
110
111 initializes newly created qdisc.
112
113 ---destroy
114
115 destroys resources allocated by init and during lifetime of qdisc.
116
117 ---change
118
119 changes qdisc parameters.
120 */
121
122/* Protects list of registered TC modules. It is pure SMP lock. */
123static DEFINE_RWLOCK(qdisc_mod_lock);
124
125
126/************************************************
127 * Queueing disciplines manipulation. *
128 ************************************************/
129
130
131/* The list of all installed queueing disciplines. */
132
133static struct Qdisc_ops *qdisc_base;
134
/* Register/unregister queueing discipline */
136
137int register_qdisc(struct Qdisc_ops *qops)
138{
139 struct Qdisc_ops *q, **qp;
140 int rc = -EEXIST;
141
142 write_lock(&qdisc_mod_lock);
143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 if (!strcmp(qops->id, q->id))
145 goto out;
146
147 if (qops->enqueue == NULL)
148 qops->enqueue = noop_qdisc_ops.enqueue;
99c0db26
JP
149 if (qops->peek == NULL) {
150 if (qops->dequeue == NULL) {
151 qops->peek = noop_qdisc_ops.peek;
152 } else {
153 rc = -EINVAL;
154 goto out;
155 }
156 }
1da177e4
LT
157 if (qops->dequeue == NULL)
158 qops->dequeue = noop_qdisc_ops.dequeue;
159
160 qops->next = NULL;
161 *qp = qops;
162 rc = 0;
163out:
164 write_unlock(&qdisc_mod_lock);
165 return rc;
166}
62e3ba1b 167EXPORT_SYMBOL(register_qdisc);
1da177e4
LT
168
169int unregister_qdisc(struct Qdisc_ops *qops)
170{
171 struct Qdisc_ops *q, **qp;
172 int err = -ENOENT;
173
174 write_lock(&qdisc_mod_lock);
175 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
176 if (q == qops)
177 break;
178 if (q) {
179 *qp = q->next;
180 q->next = NULL;
181 err = 0;
182 }
183 write_unlock(&qdisc_mod_lock);
184 return err;
185}
62e3ba1b 186EXPORT_SYMBOL(unregister_qdisc);
1da177e4
LT
187
188/* We know handle. Find qdisc among all qdisc's attached to device
189 (root qdisc, all its children, children of children etc.)
190 */
191
8123b421
DM
192struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193{
194 struct Qdisc *q;
195
196 if (!(root->flags & TCQ_F_BUILTIN) &&
197 root->handle == handle)
198 return root;
199
200 list_for_each_entry(q, &root->list, list) {
201 if (q->handle == handle)
202 return q;
203 }
204 return NULL;
205}
206
f6e0b239
JP
207/*
208 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
209 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
210 */
211static DEFINE_SPINLOCK(qdisc_list_lock);
212
213static void qdisc_list_add(struct Qdisc *q)
214{
215 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
216 spin_lock_bh(&qdisc_list_lock);
217 list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
218 spin_unlock_bh(&qdisc_list_lock);
219 }
220}
221
222void qdisc_list_del(struct Qdisc *q)
223{
224 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
225 spin_lock_bh(&qdisc_list_lock);
226 list_del(&q->list);
227 spin_unlock_bh(&qdisc_list_lock);
228 }
229}
230EXPORT_SYMBOL(qdisc_list_del);
231
ead81cc5 232struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4 233{
30723673 234 unsigned int i;
f6e0b239
JP
235 struct Qdisc *q;
236
237 spin_lock_bh(&qdisc_list_lock);
30723673
DM
238
239 for (i = 0; i < dev->num_tx_queues; i++) {
240 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
f6e0b239 241 struct Qdisc *txq_root = txq->qdisc_sleeping;
1da177e4 242
8123b421
DM
243 q = qdisc_match_from_root(txq_root, handle);
244 if (q)
f6e0b239 245 goto unlock;
1da177e4 246 }
f6e0b239
JP
247
248 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
249
250unlock:
251 spin_unlock_bh(&qdisc_list_lock);
252
253 return q;
1da177e4
LT
254}
255
256static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
257{
258 unsigned long cl;
259 struct Qdisc *leaf;
20fea08b 260 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
1da177e4
LT
261
262 if (cops == NULL)
263 return NULL;
264 cl = cops->get(p, classid);
265
266 if (cl == 0)
267 return NULL;
268 leaf = cops->leaf(p, cl);
269 cops->put(p, cl);
270 return leaf;
271}
272
273/* Find queueing discipline by name */
274
1e90474c 275static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
1da177e4
LT
276{
277 struct Qdisc_ops *q = NULL;
278
279 if (kind) {
280 read_lock(&qdisc_mod_lock);
281 for (q = qdisc_base; q; q = q->next) {
1e90474c 282 if (nla_strcmp(kind, q->id) == 0) {
1da177e4
LT
283 if (!try_module_get(q->owner))
284 q = NULL;
285 break;
286 }
287 }
288 read_unlock(&qdisc_mod_lock);
289 }
290 return q;
291}
292
293static struct qdisc_rate_table *qdisc_rtab_list;
294
1e90474c 295struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
1da177e4
LT
296{
297 struct qdisc_rate_table *rtab;
298
299 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
300 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
301 rtab->refcnt++;
302 return rtab;
303 }
304 }
305
5feb5e1a
PM
306 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
307 nla_len(tab) != TC_RTAB_SIZE)
1da177e4
LT
308 return NULL;
309
310 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
311 if (rtab) {
312 rtab->rate = *r;
313 rtab->refcnt = 1;
1e90474c 314 memcpy(rtab->data, nla_data(tab), 1024);
1da177e4
LT
315 rtab->next = qdisc_rtab_list;
316 qdisc_rtab_list = rtab;
317 }
318 return rtab;
319}
62e3ba1b 320EXPORT_SYMBOL(qdisc_get_rtab);
1da177e4
LT
321
322void qdisc_put_rtab(struct qdisc_rate_table *tab)
323{
324 struct qdisc_rate_table *rtab, **rtabp;
325
326 if (!tab || --tab->refcnt)
327 return;
328
329 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
330 if (rtab == tab) {
331 *rtabp = rtab->next;
332 kfree(rtab);
333 return;
334 }
335 }
336}
62e3ba1b 337EXPORT_SYMBOL(qdisc_put_rtab);
1da177e4 338
175f9c1b
JK
339static LIST_HEAD(qdisc_stab_list);
340static DEFINE_SPINLOCK(qdisc_stab_lock);
341
342static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
343 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
344 [TCA_STAB_DATA] = { .type = NLA_BINARY },
345};
346
347static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
348{
349 struct nlattr *tb[TCA_STAB_MAX + 1];
350 struct qdisc_size_table *stab;
351 struct tc_sizespec *s;
352 unsigned int tsize = 0;
353 u16 *tab = NULL;
354 int err;
355
356 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
357 if (err < 0)
358 return ERR_PTR(err);
359 if (!tb[TCA_STAB_BASE])
360 return ERR_PTR(-EINVAL);
361
362 s = nla_data(tb[TCA_STAB_BASE]);
363
364 if (s->tsize > 0) {
365 if (!tb[TCA_STAB_DATA])
366 return ERR_PTR(-EINVAL);
367 tab = nla_data(tb[TCA_STAB_DATA]);
368 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
369 }
370
371 if (!s || tsize != s->tsize || (!tab && tsize > 0))
372 return ERR_PTR(-EINVAL);
373
f3b9605d 374 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
375
376 list_for_each_entry(stab, &qdisc_stab_list, list) {
377 if (memcmp(&stab->szopts, s, sizeof(*s)))
378 continue;
379 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
380 continue;
381 stab->refcnt++;
f3b9605d 382 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
383 return stab;
384 }
385
f3b9605d 386 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
387
388 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
389 if (!stab)
390 return ERR_PTR(-ENOMEM);
391
392 stab->refcnt = 1;
393 stab->szopts = *s;
394 if (tsize > 0)
395 memcpy(stab->data, tab, tsize * sizeof(u16));
396
f3b9605d 397 spin_lock(&qdisc_stab_lock);
175f9c1b 398 list_add_tail(&stab->list, &qdisc_stab_list);
f3b9605d 399 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
400
401 return stab;
402}
403
404void qdisc_put_stab(struct qdisc_size_table *tab)
405{
406 if (!tab)
407 return;
408
f3b9605d 409 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
410
411 if (--tab->refcnt == 0) {
412 list_del(&tab->list);
413 kfree(tab);
414 }
415
f3b9605d 416 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
417}
418EXPORT_SYMBOL(qdisc_put_stab);
419
420static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
421{
422 struct nlattr *nest;
423
424 nest = nla_nest_start(skb, TCA_STAB);
425 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
426 nla_nest_end(skb, nest);
427
428 return skb->len;
429
430nla_put_failure:
431 return -1;
432}
433
/*
 * Compute the scheduling length of @skb from size table @stab and store
 * it in qdisc_skb_cb(skb)->pkt_len.  The raw length plus fixed overhead
 * is mapped through stab->data; lengths beyond the table are
 * extrapolated linearly from the last slot.
 */
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* Empty table: only the overhead (and min clamp) applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* cell_align may be negative; clamp before shifting. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Off the end of the table: extrapolate using whole-table
		 * multiples of the last entry plus the wrapped remainder. */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	/* Never report a non-positive length to the scheduler. */
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
461
4179477f
PM
462static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
463{
464 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
465 timer);
466
467 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
11274e5a 468 smp_wmb();
8608db03 469 __netif_schedule(qdisc_root(wd->qdisc));
1936502d 470
4179477f
PM
471 return HRTIMER_NORESTART;
472}
473
474void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
475{
476 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
477 wd->timer.function = qdisc_watchdog;
478 wd->qdisc = qdisc;
479}
480EXPORT_SYMBOL(qdisc_watchdog_init);
481
482void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
483{
484 ktime_t time;
485
2540e051
JP
486 if (test_bit(__QDISC_STATE_DEACTIVATED,
487 &qdisc_root_sleeping(wd->qdisc)->state))
488 return;
489
4179477f
PM
490 wd->qdisc->flags |= TCQ_F_THROTTLED;
491 time = ktime_set(0, 0);
492 time = ktime_add_ns(time, PSCHED_US2NS(expires));
493 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
494}
495EXPORT_SYMBOL(qdisc_watchdog_schedule);
496
497void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
498{
499 hrtimer_cancel(&wd->timer);
500 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
501}
502EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4 503
a94f779f 504static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
6fe1c7a5
PM
505{
506 unsigned int size = n * sizeof(struct hlist_head), i;
507 struct hlist_head *h;
508
509 if (size <= PAGE_SIZE)
510 h = kmalloc(size, GFP_KERNEL);
511 else
512 h = (struct hlist_head *)
513 __get_free_pages(GFP_KERNEL, get_order(size));
514
515 if (h != NULL) {
516 for (i = 0; i < n; i++)
517 INIT_HLIST_HEAD(&h[i]);
518 }
519 return h;
520}
521
522static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
523{
524 unsigned int size = n * sizeof(struct hlist_head);
525
526 if (size <= PAGE_SIZE)
527 kfree(h);
528 else
529 free_pages((unsigned long)h, get_order(size));
530}
531
/*
 * Double the class hash of @clhash when its load factor exceeds 0.75,
 * rehashing every class into the new table.  The swap happens under
 * sch_tree_lock() so readers never observe a half-built table; the new
 * table is allocated (and the old one freed) outside the lock.
 * Allocation failure simply leaves the old table in place.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class to its bucket under the new mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
567
568int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
569{
570 unsigned int size = 4;
571
572 clhash->hash = qdisc_class_hash_alloc(size);
573 if (clhash->hash == NULL)
574 return -ENOMEM;
575 clhash->hashsize = size;
576 clhash->hashmask = size - 1;
577 clhash->hashelems = 0;
578 return 0;
579}
580EXPORT_SYMBOL(qdisc_class_hash_init);
581
582void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
583{
584 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
585}
586EXPORT_SYMBOL(qdisc_class_hash_destroy);
587
588void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
589 struct Qdisc_class_common *cl)
590{
591 unsigned int h;
592
593 INIT_HLIST_NODE(&cl->hnode);
594 h = qdisc_class_hash(cl->classid, clhash->hashmask);
595 hlist_add_head(&cl->hnode, &clhash->hash[h]);
596 clhash->hashelems++;
597}
598EXPORT_SYMBOL(qdisc_class_hash_insert);
599
600void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
601 struct Qdisc_class_common *cl)
602{
603 hlist_del(&cl->hnode);
604 clhash->hashelems--;
605}
606EXPORT_SYMBOL(qdisc_class_hash_remove);
607
1da177e4
LT
608/* Allocate an unique handle from space managed by kernel */
609
610static u32 qdisc_alloc_handle(struct net_device *dev)
611{
612 int i = 0x10000;
613 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
614
615 do {
616 autohandle += TC_H_MAKE(0x10000U, 0);
617 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
618 autohandle = TC_H_MAKE(0x80000000U, 0);
619 } while (qdisc_lookup(dev, autohandle) && --i > 0);
620
621 return i>0 ? autohandle : 0;
622}
623
99194cff 624/* Attach toplevel qdisc to device queue. */
1da177e4 625
99194cff
DM
/*
 * Attach @qdisc as the sleeping qdisc of @dev_queue (NULL grafts the
 * noop qdisc) and return the previous one.  The old qdisc is reset if
 * we hold its last reference; the active pointer is parked on the noop
 * qdisc until the device is (re)activated.  Runs under the old root's
 * qdisc lock.
 */
static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	/* Keep the active pointer safe (noop) until dev_activate(). */
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
649
43effa1e
PM
/*
 * Propagate a drop of @n packets from qdisc @sch up through all of its
 * ancestors: each parent's qlen is reduced and, if supported, the
 * parent class is notified so it can deactivate an empty child.
 * Stops at the root or at an ingress parent.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	/* Walk upward; parent == 0 means we just handled the root. */
	while ((parentid = sch->parent)) {
		/* Ingress qdiscs have no meaningful qlen accounting. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only the root may legitimately have no qdisc. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		/* Let the parent class react (e.g. deactivate) if it can. */
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4 677
99194cff
DM
678static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
679 struct Qdisc *old, struct Qdisc *new)
680{
681 if (new || old)
682 qdisc_notify(skb, n, clid, old, new);
1da177e4 683
4d8863a2 684 if (old)
99194cff 685 qdisc_destroy(old);
99194cff
DM
686}
687
688/* Graft qdisc "new" to class "classid" of qdisc "parent" or
689 * to device "dev".
690 *
691 * When appropriate send a netlink notification using 'skb'
692 * and "n".
693 *
694 * On success, destroy old qdisc.
1da177e4
LT
695 */
696
/* Graft qdisc "new" to class "classid" of qdisc "parent" or to device
 * "dev" (when parent is NULL).  A device graft replaces the root qdisc
 * of every TX queue (or the single ingress queue), quiescing the device
 * around the swap; a class graft is delegated to the parent's class ops.
 * On success the old qdisc is notified to userspace and destroyed.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress is a single queue regardless of TX queue count. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		/* Quiesce the device while roots are being swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per queue sharing the new qdisc;
			 * the caller's reference covers queue 0. */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
750
25bfcd5a
JP
751/* lockdep annotation is needed for ingress; egress gets it only for name */
752static struct lock_class_key qdisc_tx_lock;
753static struct lock_class_key qdisc_rx_lock;
754
1da177e4
LT
755/*
756 Allocate and initialize new qdisc.
757
758 Parameters are passed via opt.
759 */
760
/*
 * Allocate and initialize a new qdisc of the kind named in tca[TCA_KIND]
 * for @dev_queue, attached under @parent with the given @handle (0 means
 * auto-allocate; TC_H_INGRESS marks an ingress qdisc).  Optional size
 * table (TCA_STAB) and rate estimator (TCA_RATE) attributes are set up
 * as well.  Returns the new qdisc, or NULL with *errp set; -EAGAIN asks
 * the caller to replay the request after a module autoload dropped RTNL.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		/* Separate lockdep class: ingress nests inside egress. */
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			/* Estimator runs under the root lock for attached
			 * qdiscs, under our own lock otherwise. */
			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	/* Unwind: stab ref, device ref taken by qdisc_alloc, then the
	 * padded allocation itself. */
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
874
/*
 * Apply a netlink change request to an existing qdisc: forward
 * TCA_OPTIONS to the qdisc's change() op, replace the size table, and
 * replace the rate estimator.  Note the stab is replaced unconditionally
 * — omitting TCA_STAB drops any previously attached size table.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		/* Qdiscs without a change() op cannot be reconfigured. */
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Swap in the new (possibly NULL) size table. */
	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	return 0;
}
903
/* Walker state for loop detection: @p is the qdisc we intend to graft
 * somewhere below the walked tree, @depth bounds the recursion.  The
 * generic qdisc_walker must be the first member so check_loop_fn() can
 * cast back from the walker pointer. */
struct check_loop_arg
{
	struct qdisc_walker w;
	struct Qdisc *p;
	int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
912
913static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
914{
915 struct check_loop_arg arg;
916
917 if (q->ops->cl_ops == NULL)
918 return 0;
919
920 arg.w.stop = arg.w.skip = arg.w.count = 0;
921 arg.w.fn = check_loop_fn;
922 arg.depth = depth;
923 arg.p = p;
924 q->ops->cl_ops->walk(q, &arg.w);
925 return arg.w.stop ? -ELOOP : 0;
926}
927
928static int
929check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
930{
931 struct Qdisc *leaf;
20fea08b 932 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1da177e4
LT
933 struct check_loop_arg *arg = (struct check_loop_arg *)w;
934
935 leaf = cops->leaf(q, cl);
936 if (leaf) {
937 if (leaf == arg->p || arg->depth > 7)
938 return -ELOOP;
939 return check_loop(leaf, arg->p, arg->depth + 1);
940 }
941 return 0;
942}
943
944/*
945 * Delete/get qdisc.
946 */
947
/*
 * Handle RTM_DELQDISC / RTM_GETQDISC: resolve the target qdisc from the
 * tcmsg parent/handle fields, then either graft NULL in its place
 * (delete) or send a notification describing it (get).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Child qdisc: resolve via the parent. */
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		/* An explicit handle must agree with what we found. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	/* An explicit kind must match the qdisc's kind. */
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* The anonymous default qdisc cannot be deleted. */
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
1008
1009/*
1010 Create/change qdisc.
1011 */
1012
/*
 * Handle RTM_NEWQDISC: depending on the netlink flags and the current
 * state of the parent slot, either modify the existing qdisc in place
 * or create a new one and graft it.  A module autoload inside
 * qdisc_create() can return -EAGAIN, in which case the whole request is
 * replayed from scratch (the device may have gone away meanwhile).
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				/* User-supplied handles use the major part only. */
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Refuse a graft that would create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * either to change it or to create/graft new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, requestor wanted to say,
				 * that qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft, if
				 * user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		/* Module was loaded after RTNL was dropped: replay. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		/* Grafting failed: the freshly created qdisc is ours to free. */
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1151
/*
 * tc_fill_qdisc - build one RTM_NEWQDISC/RTM_DELQDISC message for @q
 * into @skb, addressed to netlink @pid with sequence @seq.
 *
 * Returns skb->len on success, -1 on failure (message trimmed back to
 * its state on entry, so the caller's skb stays consistent).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	/* Remember the tail so a partial message can be trimmed on error. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	/* NLMSG_NEW jumps to nlmsg_failure if the header does not fit. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the refcount, mirroring what user space expects. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Refresh cached queue length before the stats are copied out. */
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1200
1201static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1202 u32 clid, struct Qdisc *old, struct Qdisc *new)
1203{
1204 struct sk_buff *skb;
1205 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1206
1207 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1208 if (!skb)
1209 return -ENOBUFS;
1210
1211 if (old && old->handle) {
1212 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1213 goto err_out;
1214 }
1215 if (new) {
1216 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1217 goto err_out;
1218 }
1219
1220 if (skb->len)
97c53cac 1221 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1222
1223err_out:
1224 kfree_skb(skb);
1225 return -EINVAL;
1226}
1227
30723673
DM
1228static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1229{
1230 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1231}
1232
1233static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1234 struct netlink_callback *cb,
1235 int *q_idx_p, int s_q_idx)
1236{
1237 int ret = 0, q_idx = *q_idx_p;
1238 struct Qdisc *q;
1239
1240 if (!root)
1241 return 0;
1242
1243 q = root;
1244 if (q_idx < s_q_idx) {
1245 q_idx++;
1246 } else {
1247 if (!tc_qdisc_dump_ignore(q) &&
1248 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1249 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1250 goto done;
1251 q_idx++;
1252 }
1253 list_for_each_entry(q, &root->list, list) {
1254 if (q_idx < s_q_idx) {
1255 q_idx++;
1256 continue;
1257 }
1258 if (!tc_qdisc_dump_ignore(q) &&
1259 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1260 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1261 goto done;
1262 q_idx++;
1263 }
1264
1265out:
1266 *q_idx_p = q_idx;
1267 return ret;
1268done:
1269 ret = -1;
1270 goto out;
1271}
1272
1da177e4
LT
/*
 * tc_dump_qdisc - RTM_GETQDISC dump handler: walk all netdevs and dump
 * their TX root and ingress (rx_queue) qdiscs into @skb.
 *
 * cb->args[0]/[1] hold the device index and per-device qdisc index at
 * which a previously truncated dump should resume.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume device: restart qdisc index from zero. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		/* Ingress qdisc hangs off the device's rx_queue. */
		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	/* Record where to resume on the next dump call. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1316
1317
1318
1319/************************************************
1320 * Traffic classes manipulation. *
1321 ************************************************/
1322
1323
1324
/*
 * tc_ctl_tclass - RTM_{NEW,DEL,GET}TCLASS handler: create, change,
 * delete or query a traffic class of a classful qdisc.
 *
 * Resolves the (possibly partial) qdisc/class handles from the tcmsg,
 * looks up the owning qdisc and dispatches to its class ops.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* Check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	/* cops->get takes a class reference; released via cops->put at out. */
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* Only RTM_NEWTCLASS with NLM_F_CREATE may create a class. */
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			/* Class exists; NLM_F_EXCL forbids touching it. */
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change: the qdisc's class ops do the real work. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
1446
1447
/*
 * tc_fill_tclass - build one traffic-class netlink message for class
 * @cl of qdisc @q into @skb.
 *
 * Returns skb->len on success, -1 on failure (message trimmed back to
 * its state on entry).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	/* Remember the tail so a partial message can be trimmed on error. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* NLMSG_NEW jumps to nlmsg_failure if the header does not fit. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1487
1488static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1489 struct Qdisc *q, unsigned long cl, int event)
1490{
1491 struct sk_buff *skb;
1492 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1493
1494 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1495 if (!skb)
1496 return -ENOBUFS;
1497
1498 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1499 kfree_skb(skb);
1500 return -EINVAL;
1501 }
1502
97c53cac 1503 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1504}
1505
/* Context threaded through a class walk while dumping classes:
 * the walker must come first so the callback can cast back from
 * the qdisc_walker pointer it receives. */
struct qdisc_dump_args
{
	struct qdisc_walker w;		/* embedded walker; fn/skip/count state */
	struct sk_buff *skb;		/* dump message under construction */
	struct netlink_callback *cb;	/* netlink dump context (pid, seq) */
};
1512
1513static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1514{
1515 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1516
1517 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1518 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1519}
1520
30723673
DM
/*
 * tc_dump_tclass_qdisc - dump all classes of one qdisc @q during an
 * RTM_GETTCLASS dump.
 *
 * *t_p counts qdiscs considered so far; qdiscs below the resume index
 * @s_t are skipped.  cb->args[1] holds the per-qdisc class resume
 * position.  Returns 0 to continue, -1 when @skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtins, already-dumped qdiscs, classless qdiscs, and
	 * qdiscs not matching an explicit parent filter. */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Moving past the resume qdisc: clear stale per-qdisc state. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];	/* resume after this many classes */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Record progress so a truncated dump resumes here. */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1549
1550static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1551 struct tcmsg *tcm, struct netlink_callback *cb,
1552 int *t_p, int s_t)
1553{
1554 struct Qdisc *q;
1555
1556 if (!root)
1557 return 0;
1558
1559 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1560 return -1;
1561
1562 list_for_each_entry(q, &root->list, list) {
1563 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1564 return -1;
1565 }
1566
1567 return 0;
1568}
1569
1da177e4
LT
/*
 * tc_dump_tclass - RTM_GETTCLASS dump handler: dump the classes of the
 * TX root and ingress qdiscs of the device named in the tcmsg.
 *
 * cb->args[0] holds the qdisc resume index for truncated dumps.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	/* dev_get_by_index takes a reference; dropped via dev_put below. */
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	/* Ingress qdisc hangs off the device's rx_queue. */
	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	/* Record where to resume on the next dump call. */
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1603
1604/* Main classifier routine: scans classifier chain attached
1605 to this qdisc, (optionally) tests for protocol and asks
1606 specific classifiers.
1607 */
73ca4918
PM
1608int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1609 struct tcf_result *res)
1610{
1611 __be16 protocol = skb->protocol;
1612 int err = 0;
1613
1614 for (; tp; tp = tp->next) {
1615 if ((tp->protocol == protocol ||
1616 tp->protocol == htons(ETH_P_ALL)) &&
1617 (err = tp->classify(skb, tp, res)) >= 0) {
1618#ifdef CONFIG_NET_CLS_ACT
1619 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1620 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1621#endif
1622 return err;
1623 }
1624 }
1625 return -1;
1626}
1627EXPORT_SYMBOL(tc_classify_compat);
1628
1da177e4 1629int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
73ca4918 1630 struct tcf_result *res)
1da177e4
LT
1631{
1632 int err = 0;
73ca4918 1633 __be16 protocol;
1da177e4
LT
1634#ifdef CONFIG_NET_CLS_ACT
1635 struct tcf_proto *otp = tp;
1636reclassify:
1637#endif
1638 protocol = skb->protocol;
1639
73ca4918 1640 err = tc_classify_compat(skb, tp, res);
1da177e4 1641#ifdef CONFIG_NET_CLS_ACT
73ca4918
PM
1642 if (err == TC_ACT_RECLASSIFY) {
1643 u32 verd = G_TC_VERD(skb->tc_verd);
1644 tp = otp;
1645
1646 if (verd++ >= MAX_REC_LOOP) {
1647 printk("rule prio %u protocol %02x reclassify loop, "
1648 "packet dropped\n",
1649 tp->prio&0xffff, ntohs(tp->protocol));
1650 return TC_ACT_SHOT;
1da177e4 1651 }
73ca4918
PM
1652 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1653 goto reclassify;
1da177e4 1654 }
73ca4918
PM
1655#endif
1656 return err;
1da177e4 1657}
73ca4918 1658EXPORT_SYMBOL(tc_classify);
1da177e4 1659
a48b5a61
PM
1660void tcf_destroy(struct tcf_proto *tp)
1661{
1662 tp->ops->destroy(tp);
1663 module_put(tp->ops->owner);
1664 kfree(tp);
1665}
1666
ff31ab56 1667void tcf_destroy_chain(struct tcf_proto **fl)
a48b5a61
PM
1668{
1669 struct tcf_proto *tp;
1670
ff31ab56
PM
1671 while ((tp = *fl) != NULL) {
1672 *fl = tp->next;
a48b5a61
PM
1673 tcf_destroy(tp);
1674 }
1675}
1676EXPORT_SYMBOL(tcf_destroy_chain);
1677
1da177e4
LT
1678#ifdef CONFIG_PROC_FS
1679static int psched_show(struct seq_file *seq, void *v)
1680{
3c0cfc13
PM
1681 struct timespec ts;
1682
1683 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1da177e4 1684 seq_printf(seq, "%08x %08x %08x %08x\n",
641b9e0e 1685 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
514bca32 1686 1000000,
3c0cfc13 1687 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1da177e4
LT
1688
1689 return 0;
1690}
1691
/* Open handler for /proc/net/psched: single-shot seq_file output. */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
1696
/* File operations for /proc/net/psched (read-only seq_file). */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1da177e4
LT
1704#endif
1705
1da177e4
LT
/*
 * pktsched_init - packet scheduler subsystem bring-up: register the
 * built-in fifo qdiscs, create /proc/net/psched, and hook the qdisc
 * and class rtnetlink message handlers.
 */
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	/* GET handlers double as dump callbacks where one is supplied. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
This page took 0.518538 seconds and 5 git commands to generate.