pkt_sched: sch_api: Remove qdisc_list_lock
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 struct Qdisc *q, unsigned long cl, int event);
41
42 /*
43
44 Short review.
45 -------------
46
47 This file consists of two interrelated parts:
48
49 1. queueing disciplines manager frontend.
50 2. traffic classes manager frontend.
51
 52   Generally, a queueing discipline ("qdisc") is a black box,
 53   which is able to enqueue packets and to dequeue them (when
 54   the device is ready to send something) in an order and at times
 55   determined by the algorithm hidden inside it.
 56
 57   qdiscs are divided into two categories:
 58   - "queues", which have no internal structure visible from outside.
 59   - "schedulers", which split packets into "traffic classes",
 60     using "packet classifiers" (see cls_api.c).
 61
 62   In turn, classes may have child qdiscs (as a rule, queues)
 63   attached to them, and so on.
 64
 65   The goal of the routines in this file is to translate the
 66   handle-based information supplied by the user into a form more
 67   intelligible to the kernel, to perform sanity checks and the parts
 68   of the work common to all qdiscs, and to provide rtnetlink
 69   notifications.
70
71 All real intelligent work is done inside qdisc modules.
72
73
74
75 Every discipline has two major routines: enqueue and dequeue.
76
77 ---dequeue
78
 79   dequeue usually returns an skb to send. It is allowed to return NULL,
 80   but that does not mean the queue is empty; it just means that the
 81   discipline does not want to send anything this time.
 82   The queue is really empty only if q->q.qlen == 0.
 83   For complicated disciplines with multiple queues, q->q is not the
 84   real packet queue, but q->q.qlen must nevertheless be valid.
 85
 86   ---enqueue
 87
 88   enqueue returns 0 if the packet was enqueued successfully.
 89   If a packet (this one or another one) was dropped, it returns
 90   a non-zero error code.
 91   NET_XMIT_DROP      - this packet was dropped.
 92     Expected action: do not back off, but wait until the queue clears.
 93   NET_XMIT_CN        - this packet was probably enqueued, but another one was dropped.
 94     Expected action: back off or ignore.
 95   NET_XMIT_POLICED   - dropped by a policer.
 96     Expected action: back off or report an error to real-time apps.
97
98 Auxiliary routines:
99
100 ---peek
101
102 like dequeue but without removing a packet from the queue
103
104 ---reset
105
106   returns the qdisc to its initial state: purges all buffers, clears
107   all timers and counters (except statistics), etc.
108
109 ---init
110
111 initializes newly created qdisc.
112
113 ---destroy
114
115   destroys resources allocated by init and during the lifetime of the qdisc.
116
117 ---change
118
119 changes qdisc parameters.
120 */
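
/*
 * A minimal illustrative sketch (not part of the original file): how a
 * caller might act on the enqueue return codes documented above.  The
 * helper name is hypothetical; the NET_XMIT_* codes are the real ones.
 */
static inline int example_enqueue_verdict(int ret)
{
	switch (ret) {
	case NET_XMIT_SUCCESS:	/* packet queued successfully */
	case NET_XMIT_CN:	/* congestion: a packet was dropped, maybe not this one */
		return 0;	/* soft success; a careful caller backs off on CN */
	case NET_XMIT_DROP:	/* this packet was dropped */
	case NET_XMIT_POLICED:	/* dropped by a policer */
	default:
		return -1;	/* report the drop to the sender */
	}
}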
121
122 /* Protects the list of registered TC modules. It is a pure SMP lock. */
123 static DEFINE_RWLOCK(qdisc_mod_lock);
124
125
126 /************************************************
127 * Queueing disciplines manipulation. *
128 ************************************************/
129
130
131 /* The list of all installed queueing disciplines. */
132
133 static struct Qdisc_ops *qdisc_base;
134
135 /* Register/unregister a queueing discipline. */
136
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 struct Qdisc_ops *q, **qp;
140 int rc = -EEXIST;
141
142 write_lock(&qdisc_mod_lock);
143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 if (!strcmp(qops->id, q->id))
145 goto out;
146
147 if (qops->enqueue == NULL)
148 qops->enqueue = noop_qdisc_ops.enqueue;
149 if (qops->peek == NULL) {
150 if (qops->dequeue == NULL) {
151 qops->peek = noop_qdisc_ops.peek;
152 } else {
153 rc = -EINVAL;
154 goto out;
155 }
156 }
157 if (qops->dequeue == NULL)
158 qops->dequeue = noop_qdisc_ops.dequeue;
159
160 qops->next = NULL;
161 *qp = qops;
162 rc = 0;
163 out:
164 write_unlock(&qdisc_mod_lock);
165 return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
168
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 struct Qdisc_ops *q, **qp;
172 int err = -ENOENT;
173
174 write_lock(&qdisc_mod_lock);
175 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
176 if (q == qops)
177 break;
178 if (q) {
179 *qp = q->next;
180 q->next = NULL;
181 err = 0;
182 }
183 write_unlock(&qdisc_mod_lock);
184 return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
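
/*
 * Illustrative sketch (hypothetical, not from this file): the usual way a
 * scheduler module pairs register_qdisc()/unregister_qdisc() in its module
 * init/exit.  "example_qdisc_ops" is an assumed, fully filled-in
 * struct Qdisc_ops defined elsewhere, so the block is kept compiled out.
 */
#if 0	/* example only */
static int __init example_sch_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_sch_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_sch_module_init);
module_exit(example_sch_module_exit);
#endif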
187
188 /* We know the handle. Find the qdisc among all qdiscs attached to the
189    device (the root qdisc, all its children, children of children, etc.).
190  */
191
192 struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 struct Qdisc *q;
195
196 if (!(root->flags & TCQ_F_BUILTIN) &&
197 root->handle == handle)
198 return root;
199
200 list_for_each_entry(q, &root->list, list) {
201 if (q->handle == handle)
202 return q;
203 }
204 return NULL;
205 }
206
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
211 }
212
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 unsigned int i;
223 struct Qdisc *q;
224
225 for (i = 0; i < dev->num_tx_queues; i++) {
226 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
227 struct Qdisc *txq_root = txq->qdisc_sleeping;
228
229 q = qdisc_match_from_root(txq_root, handle);
230 if (q)
231 goto out;
232 }
233
234 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
235 out:
236 return q;
237 }
238
239 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
240 {
241 unsigned long cl;
242 struct Qdisc *leaf;
243 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
244
245 if (cops == NULL)
246 return NULL;
247 cl = cops->get(p, classid);
248
249 if (cl == 0)
250 return NULL;
251 leaf = cops->leaf(p, cl);
252 cops->put(p, cl);
253 return leaf;
254 }
255
256 /* Find queueing discipline by name */
257
258 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
259 {
260 struct Qdisc_ops *q = NULL;
261
262 if (kind) {
263 read_lock(&qdisc_mod_lock);
264 for (q = qdisc_base; q; q = q->next) {
265 if (nla_strcmp(kind, q->id) == 0) {
266 if (!try_module_get(q->owner))
267 q = NULL;
268 break;
269 }
270 }
271 read_unlock(&qdisc_mod_lock);
272 }
273 return q;
274 }
275
276 static struct qdisc_rate_table *qdisc_rtab_list;
277
278 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
279 {
280 struct qdisc_rate_table *rtab;
281
282 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
283 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
284 rtab->refcnt++;
285 return rtab;
286 }
287 }
288
289 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
290 nla_len(tab) != TC_RTAB_SIZE)
291 return NULL;
292
293 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
294 if (rtab) {
295 rtab->rate = *r;
296 rtab->refcnt = 1;
297 		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
298 rtab->next = qdisc_rtab_list;
299 qdisc_rtab_list = rtab;
300 }
301 return rtab;
302 }
303 EXPORT_SYMBOL(qdisc_get_rtab);
304
305 void qdisc_put_rtab(struct qdisc_rate_table *tab)
306 {
307 struct qdisc_rate_table *rtab, **rtabp;
308
309 if (!tab || --tab->refcnt)
310 return;
311
312 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
313 if (rtab == tab) {
314 *rtabp = rtab->next;
315 kfree(rtab);
316 return;
317 }
318 }
319 }
320 EXPORT_SYMBOL(qdisc_put_rtab);
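
/*
 * Illustrative sketch (hypothetical): how a rate-limiting qdisc's ->change()
 * handler typically pairs qdisc_get_rtab() with qdisc_put_rtab().  The
 * attribute index TCA_EXAMPLE_RTAB, qopt and q->rtab are assumed names,
 * hence the block is compiled out.
 */
#if 0	/* example only */
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB]);
	if (rtab == NULL)
		return -EINVAL;
	qdisc_put_rtab(q->rtab);	/* drop the reference held so far */
	q->rtab = rtab;			/* keep the new one until ->destroy() */
#endif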
321
322 static LIST_HEAD(qdisc_stab_list);
323 static DEFINE_SPINLOCK(qdisc_stab_lock);
324
325 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
326 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
327 [TCA_STAB_DATA] = { .type = NLA_BINARY },
328 };
329
330 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
331 {
332 struct nlattr *tb[TCA_STAB_MAX + 1];
333 struct qdisc_size_table *stab;
334 struct tc_sizespec *s;
335 unsigned int tsize = 0;
336 u16 *tab = NULL;
337 int err;
338
339 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
340 if (err < 0)
341 return ERR_PTR(err);
342 if (!tb[TCA_STAB_BASE])
343 return ERR_PTR(-EINVAL);
344
345 s = nla_data(tb[TCA_STAB_BASE]);
346
347 if (s->tsize > 0) {
348 if (!tb[TCA_STAB_DATA])
349 return ERR_PTR(-EINVAL);
350 tab = nla_data(tb[TCA_STAB_DATA]);
351 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
352 }
353
354 if (!s || tsize != s->tsize || (!tab && tsize > 0))
355 return ERR_PTR(-EINVAL);
356
357 spin_lock(&qdisc_stab_lock);
358
359 list_for_each_entry(stab, &qdisc_stab_list, list) {
360 if (memcmp(&stab->szopts, s, sizeof(*s)))
361 continue;
362 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
363 continue;
364 stab->refcnt++;
365 spin_unlock(&qdisc_stab_lock);
366 return stab;
367 }
368
369 spin_unlock(&qdisc_stab_lock);
370
371 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
372 if (!stab)
373 return ERR_PTR(-ENOMEM);
374
375 stab->refcnt = 1;
376 stab->szopts = *s;
377 if (tsize > 0)
378 memcpy(stab->data, tab, tsize * sizeof(u16));
379
380 spin_lock(&qdisc_stab_lock);
381 list_add_tail(&stab->list, &qdisc_stab_list);
382 spin_unlock(&qdisc_stab_lock);
383
384 return stab;
385 }
386
387 void qdisc_put_stab(struct qdisc_size_table *tab)
388 {
389 if (!tab)
390 return;
391
392 spin_lock(&qdisc_stab_lock);
393
394 if (--tab->refcnt == 0) {
395 list_del(&tab->list);
396 kfree(tab);
397 }
398
399 spin_unlock(&qdisc_stab_lock);
400 }
401 EXPORT_SYMBOL(qdisc_put_stab);
402
403 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
404 {
405 struct nlattr *nest;
406
407 nest = nla_nest_start(skb, TCA_STAB);
408 if (nest == NULL)
409 goto nla_put_failure;
410 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
411 nla_nest_end(skb, nest);
412
413 return skb->len;
414
415 nla_put_failure:
416 return -1;
417 }
418
419 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
420 {
421 int pkt_len, slot;
422
423 pkt_len = skb->len + stab->szopts.overhead;
424 if (unlikely(!stab->szopts.tsize))
425 goto out;
426
427 slot = pkt_len + stab->szopts.cell_align;
428 if (unlikely(slot < 0))
429 slot = 0;
430
431 slot >>= stab->szopts.cell_log;
432 if (likely(slot < stab->szopts.tsize))
433 pkt_len = stab->data[slot];
434 else
435 pkt_len = stab->data[stab->szopts.tsize - 1] *
436 (slot / stab->szopts.tsize) +
437 stab->data[slot % stab->szopts.tsize];
438
439 pkt_len <<= stab->szopts.size_log;
440 out:
441 if (unlikely(pkt_len < 1))
442 pkt_len = 1;
443 qdisc_skb_cb(skb)->pkt_len = pkt_len;
444 }
445 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
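
/*
 * Worked example for qdisc_calculate_pkt_len() with assumed parameters:
 * overhead = 0, cell_align = -1, cell_log = 6, size_log = 0 and a 1500
 * byte packet give slot = (1500 - 1) >> 6 = 23, so pkt_len becomes
 * stab->data[23].  Packets beyond the table are extrapolated from the
 * last entry, as the else branch above shows.
 */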
446
447 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
448 {
449 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
450 timer);
451
452 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
453 smp_wmb();
454 __netif_schedule(qdisc_root(wd->qdisc));
455
456 return HRTIMER_NORESTART;
457 }
458
459 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
460 {
461 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
462 wd->timer.function = qdisc_watchdog;
463 wd->qdisc = qdisc;
464 }
465 EXPORT_SYMBOL(qdisc_watchdog_init);
466
467 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
468 {
469 ktime_t time;
470
471 if (test_bit(__QDISC_STATE_DEACTIVATED,
472 &qdisc_root_sleeping(wd->qdisc)->state))
473 return;
474
475 wd->qdisc->flags |= TCQ_F_THROTTLED;
476 time = ktime_set(0, 0);
477 time = ktime_add_ns(time, PSCHED_US2NS(expires));
478 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
479 }
480 EXPORT_SYMBOL(qdisc_watchdog_schedule);
481
482 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
483 {
484 hrtimer_cancel(&wd->timer);
485 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
486 }
487 EXPORT_SYMBOL(qdisc_watchdog_cancel);
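
/*
 * Illustrative sketch (hypothetical): a shaping qdisc's ->dequeue() arms
 * the watchdog when the head packet is not yet due, so the timer handler
 * above clears TCQ_F_THROTTLED and reschedules the root qdisc.  q and
 * next_send_time are assumed names, hence the block is compiled out.
 */
#if 0	/* example only */
	if (next_send_time > now) {
		qdisc_watchdog_schedule(&q->watchdog, next_send_time);
		return NULL;	/* throttled, not empty: q->q.qlen != 0 */
	}
#endif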
488
489 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
490 {
491 unsigned int size = n * sizeof(struct hlist_head), i;
492 struct hlist_head *h;
493
494 if (size <= PAGE_SIZE)
495 h = kmalloc(size, GFP_KERNEL);
496 else
497 h = (struct hlist_head *)
498 __get_free_pages(GFP_KERNEL, get_order(size));
499
500 if (h != NULL) {
501 for (i = 0; i < n; i++)
502 INIT_HLIST_HEAD(&h[i]);
503 }
504 return h;
505 }
506
507 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
508 {
509 unsigned int size = n * sizeof(struct hlist_head);
510
511 if (size <= PAGE_SIZE)
512 kfree(h);
513 else
514 free_pages((unsigned long)h, get_order(size));
515 }
516
517 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
518 {
519 struct Qdisc_class_common *cl;
520 struct hlist_node *n, *next;
521 struct hlist_head *nhash, *ohash;
522 unsigned int nsize, nmask, osize;
523 unsigned int i, h;
524
525 /* Rehash when load factor exceeds 0.75 */
526 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
527 return;
528 nsize = clhash->hashsize * 2;
529 nmask = nsize - 1;
530 nhash = qdisc_class_hash_alloc(nsize);
531 if (nhash == NULL)
532 return;
533
534 ohash = clhash->hash;
535 osize = clhash->hashsize;
536
537 sch_tree_lock(sch);
538 for (i = 0; i < osize; i++) {
539 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
540 h = qdisc_class_hash(cl->classid, nmask);
541 hlist_add_head(&cl->hnode, &nhash[h]);
542 }
543 }
544 clhash->hash = nhash;
545 clhash->hashsize = nsize;
546 clhash->hashmask = nmask;
547 sch_tree_unlock(sch);
548
549 qdisc_class_hash_free(ohash, osize);
550 }
551 EXPORT_SYMBOL(qdisc_class_hash_grow);
552
553 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
554 {
555 unsigned int size = 4;
556
557 clhash->hash = qdisc_class_hash_alloc(size);
558 if (clhash->hash == NULL)
559 return -ENOMEM;
560 clhash->hashsize = size;
561 clhash->hashmask = size - 1;
562 clhash->hashelems = 0;
563 return 0;
564 }
565 EXPORT_SYMBOL(qdisc_class_hash_init);
566
567 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
568 {
569 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
570 }
571 EXPORT_SYMBOL(qdisc_class_hash_destroy);
572
573 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
574 struct Qdisc_class_common *cl)
575 {
576 unsigned int h;
577
578 INIT_HLIST_NODE(&cl->hnode);
579 h = qdisc_class_hash(cl->classid, clhash->hashmask);
580 hlist_add_head(&cl->hnode, &clhash->hash[h]);
581 clhash->hashelems++;
582 }
583 EXPORT_SYMBOL(qdisc_class_hash_insert);
584
585 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
586 struct Qdisc_class_common *cl)
587 {
588 hlist_del(&cl->hnode);
589 clhash->hashelems--;
590 }
591 EXPORT_SYMBOL(qdisc_class_hash_remove);
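
/*
 * Illustrative sketch (hypothetical): how a classful qdisc typically drives
 * the hash helpers above.  q, cl and their types are assumed names, so the
 * block is compiled out.  Note that insert/remove run under the tree lock,
 * while qdisc_class_hash_grow() takes it by itself when rehashing.
 */
#if 0	/* example only */
	err = qdisc_class_hash_init(&q->clhash);	/* in ->init() */
	if (err < 0)
		return err;

	sch_tree_lock(sch);				/* when adding a class */
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);
	qdisc_class_hash_grow(sch, &q->clhash);
#endif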
592
593 /* Allocate a unique handle from the space managed by the kernel. */
594
595 static u32 qdisc_alloc_handle(struct net_device *dev)
596 {
597 int i = 0x10000;
598 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
599
600 do {
601 autohandle += TC_H_MAKE(0x10000U, 0);
602 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
603 autohandle = TC_H_MAKE(0x80000000U, 0);
604 } while (qdisc_lookup(dev, autohandle) && --i > 0);
605
606 return i>0 ? autohandle : 0;
607 }
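
/*
 * Handle layout, for illustration: a handle is a 32 bit "major:minor"
 * pair with the major in the upper 16 bits, so TC_H_MAKE(0x10000U, 0) is
 * one major step.  The allocator above therefore hands out 8001:0,
 * 8002:0, ... and wraps around before reaching the reserved TC_H_ROOT
 * value.
 */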
608
609 /* Attach a top-level qdisc to a device queue. */
610
611 static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
612 struct Qdisc *qdisc)
613 {
614 struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
615 spinlock_t *root_lock;
616
617 root_lock = qdisc_lock(oqdisc);
618 spin_lock_bh(root_lock);
619
620 /* Prune old scheduler */
621 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
622 qdisc_reset(oqdisc);
623
624 /* ... and graft new one */
625 if (qdisc == NULL)
626 qdisc = &noop_qdisc;
627 dev_queue->qdisc_sleeping = qdisc;
628 rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
629
630 spin_unlock_bh(root_lock);
631
632 return oqdisc;
633 }
634
635 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
636 {
637 const struct Qdisc_class_ops *cops;
638 unsigned long cl;
639 u32 parentid;
640
641 if (n == 0)
642 return;
643 while ((parentid = sch->parent)) {
644 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
645 return;
646
647 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
648 if (sch == NULL) {
649 WARN_ON(parentid != TC_H_ROOT);
650 return;
651 }
652 cops = sch->ops->cl_ops;
653 if (cops->qlen_notify) {
654 cl = cops->get(sch, parentid);
655 cops->qlen_notify(sch, cl);
656 cops->put(sch, cl);
657 }
658 sch->q.qlen -= n;
659 }
660 }
661 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
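
/*
 * Illustrative sketch (hypothetical): a qdisc that silently drops packets
 * from an inner queue must propagate the count upwards, otherwise the
 * q.qlen counters cached in its ancestors go stale.  prune_stale_packets()
 * is an assumed helper, so the block is compiled out.
 */
#if 0	/* example only */
	unsigned int dropped = prune_stale_packets(q);

	qdisc_tree_decrease_qlen(sch, dropped);
#endif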
662
663 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
664 struct Qdisc *old, struct Qdisc *new)
665 {
666 if (new || old)
667 qdisc_notify(skb, n, clid, old, new);
668
669 if (old)
670 qdisc_destroy(old);
671 }
672
673 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
674 * to device "dev".
675 *
676  * When appropriate, send a netlink notification using "skb"
677  * and "n".
678  *
679  * On success, destroy the old qdisc.
680 */
681
682 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
683 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
684 struct Qdisc *new, struct Qdisc *old)
685 {
686 struct Qdisc *q = old;
687 int err = 0;
688
689 if (parent == NULL) {
690 unsigned int i, num_q, ingress;
691
692 ingress = 0;
693 num_q = dev->num_tx_queues;
694 if ((q && q->flags & TCQ_F_INGRESS) ||
695 (new && new->flags & TCQ_F_INGRESS)) {
696 num_q = 1;
697 ingress = 1;
698 }
699
700 if (dev->flags & IFF_UP)
701 dev_deactivate(dev);
702
703 for (i = 0; i < num_q; i++) {
704 struct netdev_queue *dev_queue = &dev->rx_queue;
705
706 if (!ingress)
707 dev_queue = netdev_get_tx_queue(dev, i);
708
709 old = dev_graft_qdisc(dev_queue, new);
710 if (new && i > 0)
711 atomic_inc(&new->refcnt);
712
713 notify_and_destroy(skb, n, classid, old, new);
714 }
715
716 if (dev->flags & IFF_UP)
717 dev_activate(dev);
718 } else {
719 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
720
721 err = -EINVAL;
722
723 if (cops) {
724 unsigned long cl = cops->get(parent, classid);
725 if (cl) {
726 err = cops->graft(parent, cl, new, &old);
727 cops->put(parent, cl);
728 }
729 }
730 if (!err)
731 notify_and_destroy(skb, n, classid, old, new);
732 }
733 return err;
734 }
735
736 /* lockdep annotation is needed for ingress; egress gets it only for name */
737 static struct lock_class_key qdisc_tx_lock;
738 static struct lock_class_key qdisc_rx_lock;
739
740 /*
741    Allocate and initialize a new qdisc.
742
743    Parameters are passed via the tca netlink attributes.
744 */
745
746 static struct Qdisc *
747 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
748 u32 parent, u32 handle, struct nlattr **tca, int *errp)
749 {
750 int err;
751 struct nlattr *kind = tca[TCA_KIND];
752 struct Qdisc *sch;
753 struct Qdisc_ops *ops;
754 struct qdisc_size_table *stab;
755
756 ops = qdisc_lookup_ops(kind);
757 #ifdef CONFIG_MODULES
758 if (ops == NULL && kind != NULL) {
759 char name[IFNAMSIZ];
760 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
761 /* We dropped the RTNL semaphore in order to
762 * perform the module load. So, even if we
763 * succeeded in loading the module we have to
764 * tell the caller to replay the request. We
765 * indicate this using -EAGAIN.
766 * We replay the request because the device may
767 * go away in the mean time.
768 */
769 rtnl_unlock();
770 request_module("sch_%s", name);
771 rtnl_lock();
772 ops = qdisc_lookup_ops(kind);
773 if (ops != NULL) {
774 			/* We will try qdisc_lookup_ops again,
775 * so don't keep a reference.
776 */
777 module_put(ops->owner);
778 err = -EAGAIN;
779 goto err_out;
780 }
781 }
782 }
783 #endif
784
785 err = -ENOENT;
786 if (ops == NULL)
787 goto err_out;
788
789 sch = qdisc_alloc(dev_queue, ops);
790 if (IS_ERR(sch)) {
791 err = PTR_ERR(sch);
792 goto err_out2;
793 }
794
795 sch->parent = parent;
796
797 if (handle == TC_H_INGRESS) {
798 sch->flags |= TCQ_F_INGRESS;
799 handle = TC_H_MAKE(TC_H_INGRESS, 0);
800 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
801 } else {
802 if (handle == 0) {
803 handle = qdisc_alloc_handle(dev);
804 err = -ENOMEM;
805 if (handle == 0)
806 goto err_out3;
807 }
808 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
809 }
810
811 sch->handle = handle;
812
813 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
814 if (tca[TCA_STAB]) {
815 stab = qdisc_get_stab(tca[TCA_STAB]);
816 if (IS_ERR(stab)) {
817 err = PTR_ERR(stab);
818 goto err_out3;
819 }
820 sch->stab = stab;
821 }
822 if (tca[TCA_RATE]) {
823 spinlock_t *root_lock;
824
825 if ((sch->parent != TC_H_ROOT) &&
826 !(sch->flags & TCQ_F_INGRESS))
827 root_lock = qdisc_root_sleeping_lock(sch);
828 else
829 root_lock = qdisc_lock(sch);
830
831 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
832 root_lock, tca[TCA_RATE]);
833 if (err) {
834 /*
835 * Any broken qdiscs that would require
836 				 * an ops->reset() here? The qdisc was never
837 * in action so it shouldn't be necessary.
838 */
839 if (ops->destroy)
840 ops->destroy(sch);
841 goto err_out3;
842 }
843 }
844
845 qdisc_list_add(sch);
846
847 return sch;
848 }
849 err_out3:
850 qdisc_put_stab(sch->stab);
851 dev_put(dev);
852 kfree((char *) sch - sch->padded);
853 err_out2:
854 module_put(ops->owner);
855 err_out:
856 *errp = err;
857 return NULL;
858 }
859
860 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
861 {
862 struct qdisc_size_table *stab = NULL;
863 int err = 0;
864
865 if (tca[TCA_OPTIONS]) {
866 if (sch->ops->change == NULL)
867 return -EINVAL;
868 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
869 if (err)
870 return err;
871 }
872
873 if (tca[TCA_STAB]) {
874 stab = qdisc_get_stab(tca[TCA_STAB]);
875 if (IS_ERR(stab))
876 return PTR_ERR(stab);
877 }
878
879 qdisc_put_stab(sch->stab);
880 sch->stab = stab;
881
882 if (tca[TCA_RATE])
883 gen_replace_estimator(&sch->bstats, &sch->rate_est,
884 qdisc_root_sleeping_lock(sch),
885 tca[TCA_RATE]);
886 return 0;
887 }
888
889 struct check_loop_arg
890 {
891 struct qdisc_walker w;
892 struct Qdisc *p;
893 int depth;
894 };
895
896 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
897
898 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
899 {
900 struct check_loop_arg arg;
901
902 if (q->ops->cl_ops == NULL)
903 return 0;
904
905 arg.w.stop = arg.w.skip = arg.w.count = 0;
906 arg.w.fn = check_loop_fn;
907 arg.depth = depth;
908 arg.p = p;
909 q->ops->cl_ops->walk(q, &arg.w);
910 return arg.w.stop ? -ELOOP : 0;
911 }
912
913 static int
914 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
915 {
916 struct Qdisc *leaf;
917 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
918 struct check_loop_arg *arg = (struct check_loop_arg *)w;
919
920 leaf = cops->leaf(q, cl);
921 if (leaf) {
922 if (leaf == arg->p || arg->depth > 7)
923 return -ELOOP;
924 return check_loop(leaf, arg->p, arg->depth + 1);
925 }
926 return 0;
927 }
928
929 /*
930 * Delete/get qdisc.
931 */
932
933 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
934 {
935 struct net *net = sock_net(skb->sk);
936 struct tcmsg *tcm = NLMSG_DATA(n);
937 struct nlattr *tca[TCA_MAX + 1];
938 struct net_device *dev;
939 u32 clid = tcm->tcm_parent;
940 struct Qdisc *q = NULL;
941 struct Qdisc *p = NULL;
942 int err;
943
944 if (net != &init_net)
945 return -EINVAL;
946
947 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
948 return -ENODEV;
949
950 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
951 if (err < 0)
952 return err;
953
954 if (clid) {
955 if (clid != TC_H_ROOT) {
956 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
957 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
958 return -ENOENT;
959 q = qdisc_leaf(p, clid);
960 } else { /* ingress */
961 q = dev->rx_queue.qdisc_sleeping;
962 }
963 } else {
964 struct netdev_queue *dev_queue;
965 dev_queue = netdev_get_tx_queue(dev, 0);
966 q = dev_queue->qdisc_sleeping;
967 }
968 if (!q)
969 return -ENOENT;
970
971 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
972 return -EINVAL;
973 } else {
974 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
975 return -ENOENT;
976 }
977
978 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
979 return -EINVAL;
980
981 if (n->nlmsg_type == RTM_DELQDISC) {
982 if (!clid)
983 return -EINVAL;
984 if (q->handle == 0)
985 return -ENOENT;
986 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
987 return err;
988 } else {
989 qdisc_notify(skb, n, clid, NULL, q);
990 }
991 return 0;
992 }
993
994 /*
995 Create/change qdisc.
996 */
997
998 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
999 {
1000 struct net *net = sock_net(skb->sk);
1001 struct tcmsg *tcm;
1002 struct nlattr *tca[TCA_MAX + 1];
1003 struct net_device *dev;
1004 u32 clid;
1005 struct Qdisc *q, *p;
1006 int err;
1007
1008 if (net != &init_net)
1009 return -EINVAL;
1010
1011 replay:
1012 /* Reinit, just in case something touches this. */
1013 tcm = NLMSG_DATA(n);
1014 clid = tcm->tcm_parent;
1015 q = p = NULL;
1016
1017 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1018 return -ENODEV;
1019
1020 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1021 if (err < 0)
1022 return err;
1023
1024 if (clid) {
1025 if (clid != TC_H_ROOT) {
1026 if (clid != TC_H_INGRESS) {
1027 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1028 return -ENOENT;
1029 q = qdisc_leaf(p, clid);
1030 			} else { /* ingress */
1031 q = dev->rx_queue.qdisc_sleeping;
1032 }
1033 } else {
1034 struct netdev_queue *dev_queue;
1035 dev_queue = netdev_get_tx_queue(dev, 0);
1036 q = dev_queue->qdisc_sleeping;
1037 }
1038
1039 		/* It may be the default qdisc; ignore it. */
1040 if (q && q->handle == 0)
1041 q = NULL;
1042
1043 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1044 if (tcm->tcm_handle) {
1045 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1046 return -EEXIST;
1047 if (TC_H_MIN(tcm->tcm_handle))
1048 return -EINVAL;
1049 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1050 goto create_n_graft;
1051 if (n->nlmsg_flags&NLM_F_EXCL)
1052 return -EEXIST;
1053 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1054 return -EINVAL;
1055 if (q == p ||
1056 (p && check_loop(q, p, 0)))
1057 return -ELOOP;
1058 atomic_inc(&q->refcnt);
1059 goto graft;
1060 } else {
1061 if (q == NULL)
1062 goto create_n_graft;
1063
1064 			/* This magic test requires explanation.
1065 			 *
1066 			 *   We know that some child q is already
1067 			 *   attached to this parent and have a choice:
1068 			 *   either to change it or to create/graft a new one.
1069 			 *
1070 			 *   1. We are allowed to create/graft only
1071 			 *   if the CREATE and REPLACE flags are set.
1072 			 *
1073 			 *   2. If EXCL is set, the requestor meant that
1074 			 *   the qdisc tcm_handle is not expected
1075 			 *   to exist, so we choose create/graft too.
1076 			 *
1077 			 *   3. The last case is when no flags are set.
1078 			 *   Alas, it is sort of a hole in the API; we
1079 			 *   cannot decide what to do unambiguously.
1080 			 *   For now we select create/graft if the
1081 			 *   user gave a KIND which does not match the existing one.
1082 			 */
1083 if ((n->nlmsg_flags&NLM_F_CREATE) &&
1084 (n->nlmsg_flags&NLM_F_REPLACE) &&
1085 ((n->nlmsg_flags&NLM_F_EXCL) ||
1086 (tca[TCA_KIND] &&
1087 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1088 goto create_n_graft;
1089 }
1090 }
1091 } else {
1092 if (!tcm->tcm_handle)
1093 return -EINVAL;
1094 q = qdisc_lookup(dev, tcm->tcm_handle);
1095 }
1096
1097 /* Change qdisc parameters */
1098 if (q == NULL)
1099 return -ENOENT;
1100 if (n->nlmsg_flags&NLM_F_EXCL)
1101 return -EEXIST;
1102 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1103 return -EINVAL;
1104 err = qdisc_change(q, tca);
1105 if (err == 0)
1106 qdisc_notify(skb, n, clid, NULL, q);
1107 return err;
1108
1109 create_n_graft:
1110 if (!(n->nlmsg_flags&NLM_F_CREATE))
1111 return -ENOENT;
1112 if (clid == TC_H_INGRESS)
1113 q = qdisc_create(dev, &dev->rx_queue,
1114 tcm->tcm_parent, tcm->tcm_parent,
1115 tca, &err);
1116 else
1117 q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
1118 tcm->tcm_parent, tcm->tcm_handle,
1119 tca, &err);
1120 if (q == NULL) {
1121 if (err == -EAGAIN)
1122 goto replay;
1123 return err;
1124 }
1125
1126 graft:
1127 err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1128 if (err) {
1129 if (q)
1130 qdisc_destroy(q);
1131 return err;
1132 }
1133
1134 return 0;
1135 }
1136
1137 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1138 u32 pid, u32 seq, u16 flags, int event)
1139 {
1140 struct tcmsg *tcm;
1141 struct nlmsghdr *nlh;
1142 unsigned char *b = skb_tail_pointer(skb);
1143 struct gnet_dump d;
1144
1145 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1146 tcm = NLMSG_DATA(nlh);
1147 tcm->tcm_family = AF_UNSPEC;
1148 tcm->tcm__pad1 = 0;
1149 tcm->tcm__pad2 = 0;
1150 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1151 tcm->tcm_parent = clid;
1152 tcm->tcm_handle = q->handle;
1153 tcm->tcm_info = atomic_read(&q->refcnt);
1154 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1155 if (q->ops->dump && q->ops->dump(q, skb) < 0)
1156 goto nla_put_failure;
1157 q->qstats.qlen = q->q.qlen;
1158
1159 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1160 goto nla_put_failure;
1161
1162 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1163 qdisc_root_sleeping_lock(q), &d) < 0)
1164 goto nla_put_failure;
1165
1166 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1167 goto nla_put_failure;
1168
1169 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1170 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1171 gnet_stats_copy_queue(&d, &q->qstats) < 0)
1172 goto nla_put_failure;
1173
1174 if (gnet_stats_finish_copy(&d) < 0)
1175 goto nla_put_failure;
1176
1177 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1178 return skb->len;
1179
1180 nlmsg_failure:
1181 nla_put_failure:
1182 nlmsg_trim(skb, b);
1183 return -1;
1184 }
1185
1186 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1187 u32 clid, struct Qdisc *old, struct Qdisc *new)
1188 {
1189 struct sk_buff *skb;
1190 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1191
1192 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1193 if (!skb)
1194 return -ENOBUFS;
1195
1196 if (old && old->handle) {
1197 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1198 goto err_out;
1199 }
1200 if (new) {
1201 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1202 goto err_out;
1203 }
1204
1205 if (skb->len)
1206 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1207
1208 err_out:
1209 kfree_skb(skb);
1210 return -EINVAL;
1211 }
1212
1213 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1214 {
1215 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1216 }
1217
1218 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1219 struct netlink_callback *cb,
1220 int *q_idx_p, int s_q_idx)
1221 {
1222 int ret = 0, q_idx = *q_idx_p;
1223 struct Qdisc *q;
1224
1225 if (!root)
1226 return 0;
1227
1228 q = root;
1229 if (q_idx < s_q_idx) {
1230 q_idx++;
1231 } else {
1232 if (!tc_qdisc_dump_ignore(q) &&
1233 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1234 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1235 goto done;
1236 q_idx++;
1237 }
1238 list_for_each_entry(q, &root->list, list) {
1239 if (q_idx < s_q_idx) {
1240 q_idx++;
1241 continue;
1242 }
1243 if (!tc_qdisc_dump_ignore(q) &&
1244 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1245 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1246 goto done;
1247 q_idx++;
1248 }
1249
1250 out:
1251 *q_idx_p = q_idx;
1252 return ret;
1253 done:
1254 ret = -1;
1255 goto out;
1256 }
1257
1258 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1259 {
1260 struct net *net = sock_net(skb->sk);
1261 int idx, q_idx;
1262 int s_idx, s_q_idx;
1263 struct net_device *dev;
1264
1265 if (net != &init_net)
1266 return 0;
1267
1268 s_idx = cb->args[0];
1269 s_q_idx = q_idx = cb->args[1];
1270 read_lock(&dev_base_lock);
1271 idx = 0;
1272 for_each_netdev(&init_net, dev) {
1273 struct netdev_queue *dev_queue;
1274
1275 if (idx < s_idx)
1276 goto cont;
1277 if (idx > s_idx)
1278 s_q_idx = 0;
1279 q_idx = 0;
1280
1281 dev_queue = netdev_get_tx_queue(dev, 0);
1282 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1283 goto done;
1284
1285 dev_queue = &dev->rx_queue;
1286 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1287 goto done;
1288
1289 cont:
1290 idx++;
1291 }
1292
1293 done:
1294 read_unlock(&dev_base_lock);
1295
1296 cb->args[0] = idx;
1297 cb->args[1] = q_idx;
1298
1299 return skb->len;
1300 }
1301
1302
1303
1304 /************************************************
1305 * Traffic classes manipulation. *
1306 ************************************************/
1307
1308
1309
1310 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1311 {
1312 struct net *net = sock_net(skb->sk);
1313 struct netdev_queue *dev_queue;
1314 struct tcmsg *tcm = NLMSG_DATA(n);
1315 struct nlattr *tca[TCA_MAX + 1];
1316 struct net_device *dev;
1317 struct Qdisc *q = NULL;
1318 const struct Qdisc_class_ops *cops;
1319 unsigned long cl = 0;
1320 unsigned long new_cl;
1321 u32 pid = tcm->tcm_parent;
1322 u32 clid = tcm->tcm_handle;
1323 u32 qid = TC_H_MAJ(clid);
1324 int err;
1325
1326 if (net != &init_net)
1327 return -EINVAL;
1328
1329 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1330 return -ENODEV;
1331
1332 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1333 if (err < 0)
1334 return err;
1335
1336 /*
1337 parent == TC_H_UNSPEC - unspecified parent.
1338 parent == TC_H_ROOT - class is root, which has no parent.
1339 parent == X:0 - parent is root class.
1340 parent == X:Y - parent is a node in hierarchy.
1341 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1342
1343 handle == 0:0 - generate handle from kernel pool.
1344 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1345 	   handle == X:Y   - fully specified.
1346 handle == X:0 - root class.
1347 */
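	/*
	 * Worked example (values assumed): "tc class add dev eth0 parent 1:
	 * classid 1:10" arrives with tcm_parent == 1:0 and tcm_handle == 1:10,
	 * so qid resolves to 1:0 below and the class being addressed is 1:10.
	 */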
1348
1349 /* Step 1. Determine qdisc handle X:0 */
1350
1351 dev_queue = netdev_get_tx_queue(dev, 0);
1352 if (pid != TC_H_ROOT) {
1353 u32 qid1 = TC_H_MAJ(pid);
1354
1355 if (qid && qid1) {
1356 /* If both majors are known, they must be identical. */
1357 if (qid != qid1)
1358 return -EINVAL;
1359 } else if (qid1) {
1360 qid = qid1;
1361 } else if (qid == 0)
1362 qid = dev_queue->qdisc_sleeping->handle;
1363
1364 		/* Now qid is a genuine qdisc handle, consistent
1365 		   with both parent and child.
1366
1367 		   TC_H_MAJ(pid) may still be unspecified; complete it now.
1368 */
1369 if (pid)
1370 pid = TC_H_MAKE(qid, pid);
1371 } else {
1372 if (qid == 0)
1373 qid = dev_queue->qdisc_sleeping->handle;
1374 }
1375
1376 /* OK. Locate qdisc */
1377 if ((q = qdisc_lookup(dev, qid)) == NULL)
1378 return -ENOENT;
1379
1380 	/* And check that it supports classes. */
1381 cops = q->ops->cl_ops;
1382 if (cops == NULL)
1383 return -EINVAL;
1384
1385 /* Now try to get class */
1386 if (clid == 0) {
1387 if (pid == TC_H_ROOT)
1388 clid = qid;
1389 } else
1390 clid = TC_H_MAKE(qid, clid);
1391
1392 if (clid)
1393 cl = cops->get(q, clid);
1394
1395 if (cl == 0) {
1396 err = -ENOENT;
1397 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1398 goto out;
1399 } else {
1400 switch (n->nlmsg_type) {
1401 case RTM_NEWTCLASS:
1402 err = -EEXIST;
1403 if (n->nlmsg_flags&NLM_F_EXCL)
1404 goto out;
1405 break;
1406 case RTM_DELTCLASS:
1407 err = cops->delete(q, cl);
1408 if (err == 0)
1409 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1410 goto out;
1411 case RTM_GETTCLASS:
1412 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1413 goto out;
1414 default:
1415 err = -EINVAL;
1416 goto out;
1417 }
1418 }
1419
1420 new_cl = cl;
1421 err = cops->change(q, clid, pid, tca, &new_cl);
1422 if (err == 0)
1423 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1424
1425 out:
1426 if (cl)
1427 cops->put(q, cl);
1428
1429 return err;
1430 }
1431
1432
1433 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1434 unsigned long cl,
1435 u32 pid, u32 seq, u16 flags, int event)
1436 {
1437 struct tcmsg *tcm;
1438 struct nlmsghdr *nlh;
1439 unsigned char *b = skb_tail_pointer(skb);
1440 struct gnet_dump d;
1441 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1442
1443 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1444 tcm = NLMSG_DATA(nlh);
1445 tcm->tcm_family = AF_UNSPEC;
1446 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1447 tcm->tcm_parent = q->handle;
1448 tcm->tcm_handle = q->handle;
1449 tcm->tcm_info = 0;
1450 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1451 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1452 goto nla_put_failure;
1453
1454 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1455 qdisc_root_sleeping_lock(q), &d) < 0)
1456 goto nla_put_failure;
1457
1458 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1459 goto nla_put_failure;
1460
1461 if (gnet_stats_finish_copy(&d) < 0)
1462 goto nla_put_failure;
1463
1464 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1465 return skb->len;
1466
1467 nlmsg_failure:
1468 nla_put_failure:
1469 nlmsg_trim(skb, b);
1470 return -1;
1471 }
1472
1473 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1474 struct Qdisc *q, unsigned long cl, int event)
1475 {
1476 struct sk_buff *skb;
1477 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1478
1479 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1480 if (!skb)
1481 return -ENOBUFS;
1482
1483 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1484 kfree_skb(skb);
1485 return -EINVAL;
1486 }
1487
1488 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1489 }
1490
1491 struct qdisc_dump_args
1492 {
1493 struct qdisc_walker w;
1494 struct sk_buff *skb;
1495 struct netlink_callback *cb;
1496 };
1497
1498 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1499 {
1500 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1501
1502 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1503 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1504 }
1505
1506 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1507 struct tcmsg *tcm, struct netlink_callback *cb,
1508 int *t_p, int s_t)
1509 {
1510 struct qdisc_dump_args arg;
1511
1512 if (tc_qdisc_dump_ignore(q) ||
1513 *t_p < s_t || !q->ops->cl_ops ||
1514 (tcm->tcm_parent &&
1515 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1516 (*t_p)++;
1517 return 0;
1518 }
1519 if (*t_p > s_t)
1520 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1521 arg.w.fn = qdisc_class_dump;
1522 arg.skb = skb;
1523 arg.cb = cb;
1524 arg.w.stop = 0;
1525 arg.w.skip = cb->args[1];
1526 arg.w.count = 0;
1527 q->ops->cl_ops->walk(q, &arg.w);
1528 cb->args[1] = arg.w.count;
1529 if (arg.w.stop)
1530 return -1;
1531 (*t_p)++;
1532 return 0;
1533 }
1534
1535 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1536 struct tcmsg *tcm, struct netlink_callback *cb,
1537 int *t_p, int s_t)
1538 {
1539 struct Qdisc *q;
1540
1541 if (!root)
1542 return 0;
1543
1544 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1545 return -1;
1546
1547 list_for_each_entry(q, &root->list, list) {
1548 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1549 return -1;
1550 }
1551
1552 return 0;
1553 }
1554
1555 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1556 {
1557 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1558 struct net *net = sock_net(skb->sk);
1559 struct netdev_queue *dev_queue;
1560 struct net_device *dev;
1561 int t, s_t;
1562
1563 if (net != &init_net)
1564 return 0;
1565
1566 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1567 return 0;
1568 if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1569 return 0;
1570
1571 s_t = cb->args[0];
1572 t = 0;
1573
1574 dev_queue = netdev_get_tx_queue(dev, 0);
1575 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1576 goto done;
1577
1578 dev_queue = &dev->rx_queue;
1579 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1580 goto done;
1581
1582 done:
1583 cb->args[0] = t;
1584
1585 dev_put(dev);
1586 return skb->len;
1587 }
1588
1589 /* Main classifier routine: scans the classifier chain attached
1590    to this qdisc, (optionally) tests for the protocol, and asks the
1591    specific classifiers.
1592 */
1593 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1594 struct tcf_result *res)
1595 {
1596 __be16 protocol = skb->protocol;
1597 int err = 0;
1598
1599 for (; tp; tp = tp->next) {
1600 if ((tp->protocol == protocol ||
1601 tp->protocol == htons(ETH_P_ALL)) &&
1602 (err = tp->classify(skb, tp, res)) >= 0) {
1603 #ifdef CONFIG_NET_CLS_ACT
1604 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1605 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1606 #endif
1607 return err;
1608 }
1609 }
1610 return -1;
1611 }
1612 EXPORT_SYMBOL(tc_classify_compat);
1613
1614 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1615 struct tcf_result *res)
1616 {
1617 int err = 0;
1618 __be16 protocol;
1619 #ifdef CONFIG_NET_CLS_ACT
1620 struct tcf_proto *otp = tp;
1621 reclassify:
1622 #endif
1623 protocol = skb->protocol;
1624
1625 err = tc_classify_compat(skb, tp, res);
1626 #ifdef CONFIG_NET_CLS_ACT
1627 if (err == TC_ACT_RECLASSIFY) {
1628 u32 verd = G_TC_VERD(skb->tc_verd);
1629 tp = otp;
1630
1631 if (verd++ >= MAX_REC_LOOP) {
1632 printk("rule prio %u protocol %02x reclassify loop, "
1633 "packet dropped\n",
1634 tp->prio&0xffff, ntohs(tp->protocol));
1635 return TC_ACT_SHOT;
1636 }
1637 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1638 goto reclassify;
1639 }
1640 #endif
1641 return err;
1642 }
1643 EXPORT_SYMBOL(tc_classify);
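
/*
 * Illustrative sketch (hypothetical): a classful qdisc's ->enqueue() path
 * usually runs tc_classify() over its filter list and maps the result back
 * to one of its classes.  q->filter_list and the class type are assumed
 * names, so the block is compiled out.
 */
#if 0	/* example only */
	struct tcf_result res;
	int result = tc_classify(skb, q->filter_list, &res);

	if (result >= 0)		/* a filter matched */
		cl = (struct example_class *)res.class;
#endif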
1644
1645 void tcf_destroy(struct tcf_proto *tp)
1646 {
1647 tp->ops->destroy(tp);
1648 module_put(tp->ops->owner);
1649 kfree(tp);
1650 }
1651
1652 void tcf_destroy_chain(struct tcf_proto **fl)
1653 {
1654 struct tcf_proto *tp;
1655
1656 while ((tp = *fl) != NULL) {
1657 *fl = tp->next;
1658 tcf_destroy(tp);
1659 }
1660 }
1661 EXPORT_SYMBOL(tcf_destroy_chain);
1662
1663 #ifdef CONFIG_PROC_FS
1664 static int psched_show(struct seq_file *seq, void *v)
1665 {
1666 struct timespec ts;
1667
1668 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1669 seq_printf(seq, "%08x %08x %08x %08x\n",
1670 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1671 1000000,
1672 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1673
1674 return 0;
1675 }
1676
1677 static int psched_open(struct inode *inode, struct file *file)
1678 {
1679 return single_open(file, psched_show, PDE(inode)->data);
1680 }
1681
1682 static const struct file_operations psched_fops = {
1683 .owner = THIS_MODULE,
1684 .open = psched_open,
1685 .read = seq_read,
1686 .llseek = seq_lseek,
1687 .release = single_release,
1688 };
1689 #endif
1690
1691 static int __init pktsched_init(void)
1692 {
1693 register_qdisc(&pfifo_qdisc_ops);
1694 register_qdisc(&bfifo_qdisc_ops);
1695 proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1696
1697 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1698 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1699 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1700 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1701 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1702 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1703
1704 return 0;
1705 }
1706
1707 subsys_initcall(pktsched_init);