[PKT_SCHED] netem: use only inner qdisc -- no private skbuff queue
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>
/* Network Emulation Queuing algorithm.
   ====================================

   Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
	    Network Emulation Tool"
	    [2] Luigi Rizzo, DummyNet for FreeBSD

   ----------------------------------------------------------------

   This started out as a simple way to delay outgoing packets to
   test TCP but has grown to include most of the functionality
   of a full blown network emulator like NISTnet. It can delay
   packets and add random jitter (and correlation). The random
   distribution can be loaded from a table as well to provide
   normal, Pareto, or experimental curves. Packet loss,
   duplication, and reordering can also be emulated.

   This qdisc does not do classification; that can be handled by
   layering other disciplines. It does not need to do bandwidth
   control either, since that can be handled by using token
   bucket or other rate control.

   The simulator is limited by the Linux timer resolution
   and will create packet bursts on the HZ boundary (1ms).
*/
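
/* Example configuration, via the iproute2 "tc" front-end (values are
 * illustrative; the exact option syntax depends on the iproute2 version):
 *
 *	# 100ms delay, 10ms jitter with 25% correlation, 0.1% loss
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.1%
 */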

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
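
/*
 * The blend in get_crandom() is a 32-bit fixed-point convex combination.
 * Reading rho as a fraction of 2^32,
 *
 *	answer = (1 - rho) * value + rho * last
 *
 * so rho == 0 degenerates to plain net_random(), while values of rho
 * close to 2^32 make each output track the previous one closely,
 * producing an exponentially-correlated sequence.
 */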

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
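
/*
 * The table branch above computes, with rounding,
 *
 *	mu + (sigma * t) / NETEM_DIST_SCALE
 *
 * but first splits sigma into quotient and remainder modulo
 * NETEM_DIST_SCALE, so the intermediate products fit in a long
 * even for large sigma.
 */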

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	/*
	 * If we need to duplicate the packet, re-insert the clone at the
	 * top of the qdisc tree, since the parent queuer expects that
	 * only one skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = sch->dev->qdisc;
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		rootq->enqueue(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Do re-ordering by putting one out of N packets at the front
	 * of the queue.
	 * gap == 0 is the special case for no reordering.
	 */
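	/*
	 * E.g. with gap == 5: five packets in a row take the delayed
	 * path below, then the sixth (q->counter == q->gap) skips the
	 * delay, is stamped with the current time and requeued at the
	 * head, overtaking the delayed packets in front of it.
	 */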
	if (q->gap == 0 || q->counter != q->gap) {
		psched_time_t now;
		PSCHED_GET_TIME(now);
		PSCHED_TADD2(now,
			     tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist),
			     cb->time_to_send);

		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		q->counter = 0;
		PSCHED_GET_TIME(cb->time_to_send);
		ret = q->qdisc->ops->requeue(skb, q->qdisc);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		psched_time_t now;
		long delay;

		/* is more time remaining? */
		PSCHED_GET_TIME(now);
		delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
		if (delay <= 0) {
			pr_debug("netem_dequeue: return skb=%p\n", skb);
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		mod_timer(&q->timer, jiffies + delay);
		sch->flags |= TCQ_F_THROTTLED;

		if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
			sch->qstats.drops++;
	}

	return NULL;
}
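
/*
 * Note on the dequeue/requeue dance above: the inner qdisc has no
 * "peek" operation, so the head skb is pulled out just to inspect
 * the time stamp stored in its cb[].  If the packet is not yet due,
 * it is pushed back to the head of the inner queue with ->requeue()
 * (the stamp travels with the skb) and the watchdog timer is armed
 * for the remaining delay.  This is what lets netem run entirely off
 * the inner qdisc, with no private skbuff queue.
 */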

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}

static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}
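
/*
 * set_fifo_limit() fakes a netlink request in kernel memory so the
 * inner fifo can be resized through its normal ->change() interface,
 * just as if the request had come from user space; netem_change()
 * uses it to propagate the configured limit to the child qdisc.
 */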

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
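
/*
 * The xchg() above runs under dev->queue_lock because the enqueue
 * path reads q->delay_dist while holding that lock; the swap is
 * therefore atomic with respect to tabledist(), and d ends up
 * pointing at the old table (possibly NULL), which is then safe
 * to kfree().
 */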

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}
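
/*
 * Grafting is how a different inner discipline is attached, e.g. to
 * add the rate control mentioned in the header comment.  Illustrative
 * tc usage (handles and parameters chosen arbitrarily):
 *
 *	tc qdisc add dev eth0 root handle 1: netem delay 100ms
 *	tc qdisc add dev eth0 parent 1:1 handle 10: tbf rate 1mbit \
 *		burst 10kb latency 50ms
 */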

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");