1 /*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25
26 #include <net/netlink.h>
27 #include <net/pkt_sched.h>
28
29 #define VERSION "1.3"
30
31 /* Network Emulation Queuing algorithm.
32 ====================================
33
34 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
35 Network Emulation Tool"
36 [2] Luigi Rizzo, DummyNet for FreeBSD
37
38 ----------------------------------------------------------------
39
40 This started out as a simple way to delay outgoing packets to
41 test TCP but has grown to include most of the functionality
42 of a full blown network emulator like NISTnet. It can delay
43 packets and add random jitter (and correlation). The random
44 distribution can be loaded from a table as well to provide
45 normal, Pareto, or experimental curves. Packet loss,
46 duplication, and reordering can also be emulated.
47
48 This qdisc does not do classification; that can be handled by
49 layering other disciplines. It does not need to do bandwidth
50 control either, since that can be handled by using token
51 bucket or other rate control.
52
53 Correlated Loss Generator models
54
55 Added generation of correlated loss according to a 4-state
56 Markov chain (the GI model) and the "Gilbert-Elliot" model.
57
58 References:
59 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
60 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
61 and intuitive loss model for packet networks and its implementation
62 in the Netem module in the Linux kernel", available in [1]
63
64 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
65 Fabio Ludovici <fabio.ludovici at yahoo.it>
66 */
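/*
 * Illustrative userspace usage (a sketch only, assuming the iproute2 "tc"
 * front end of roughly this era; the device name and numbers are made up):
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *       loss 0.3% 25% duplicate 1% corrupt 0.1% reorder 25% 50%
 *
 * i.e. 100ms mean delay with 10ms jitter and 25% correlation, plus
 * correlated random loss, duplication, corruption and reordering --
 * the knobs implemented by the qdisc below.
 */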
67
68 struct netem_sched_data {
69 struct Qdisc *qdisc;
70 struct qdisc_watchdog watchdog;
71
72 psched_tdiff_t latency;
73 psched_tdiff_t jitter;
74
75 u32 loss;
76 u32 limit;
77 u32 counter;
78 u32 gap;
79 u32 duplicate;
80 u32 reorder;
81 u32 corrupt;
82 u32 rate;
83
84 struct crndstate {
85 u32 last;
86 u32 rho;
87 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
88
89 struct disttable {
90 u32 size;
91 s16 table[0];
92 } *delay_dist;
93
94 enum {
95 CLG_RANDOM,
96 CLG_4_STATES,
97 CLG_GILB_ELL,
98 } loss_model;
99
100 /* Correlated Loss Generation models */
101 struct clgstate {
102 /* state of the Markov chain */
103 u8 state;
104
105 /* 4-states and Gilbert-Elliot models */
106 u32 a1; /* p13 for 4-states or p for GE */
107 u32 a2; /* p31 for 4-states or r for GE */
108 u32 a3; /* p32 for 4-states or h for GE */
109 u32 a4; /* p14 for 4-states or 1-k for GE */
110 u32 a5; /* p23 used only in 4-states */
111 } clg;
112
113 };
114
115 /* Time stamp put into socket buffer control block */
116 struct netem_skb_cb {
117 psched_time_t time_to_send;
118 };
119
120 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
121 {
122 BUILD_BUG_ON(sizeof(skb->cb) <
123 sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
124 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
125 }
126
127 /* init_crandom - initialize correlated random number generator
128 * Use entropy source for initial seed.
129 */
130 static void init_crandom(struct crndstate *state, unsigned long rho)
131 {
132 state->rho = rho;
133 state->last = net_random();
134 }
135
136 /* get_crandom - correlated random number generator
137 * Next number depends on last value.
138 * rho is scaled to avoid floating point.
139 */
140 static u32 get_crandom(struct crndstate *state)
141 {
142 u64 value, rho;
143 unsigned long answer;
144
145 if (state->rho == 0) /* no correlation */
146 return net_random();
147
148 value = net_random();
149 rho = (u64)state->rho + 1;
150 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
151 state->last = answer;
152 return answer;
153 }
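/*
 * For reference, in fixed point the code above computes
 *
 *   answer = (U * (2^32 - rho) + last * rho) >> 32,  rho = state->rho + 1
 *
 * i.e. a weighted blend of a fresh uniform value U and the previous
 * output.  Purely as an illustration, rho ~= 0.25 * 2^32 gives
 * answer ~= 0.75 * U + 0.25 * last: consecutive values are positively
 * correlated while the long-run mean stays that of the uniform source.
 */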
154
155 /* loss_4state - 4-state model loss generator
156 * Generates losses according to the 4-state Markov chain adopted in
157 * the GI (General and Intuitive) loss model.
158 */
159 static bool loss_4state(struct netem_sched_data *q)
160 {
161 struct clgstate *clg = &q->clg;
162 u32 rnd = net_random();
163
164 /*
165 * Compares rnd with the transition probabilities out of the
166 * current state, then decides the next state and whether the
167 * next packet has to be transmitted or lost.
168 * The four states correspond to:
169 * 1 => successfully transmitted packets within a gap period
170 * 4 => isolated losses within a gap period
171 * 3 => lost packets within a burst period
172 * 2 => successfully transmitted packets within a burst period
173 */
174 switch (clg->state) {
175 case 1:
176 if (rnd < clg->a4) {
177 clg->state = 4;
178 return true;
179 } else if (clg->a4 < rnd && rnd < clg->a1) {
180 clg->state = 3;
181 return true;
182 } else if (clg->a1 < rnd)
183 clg->state = 1;
184
185 break;
186 case 2:
187 if (rnd < clg->a5) {
188 clg->state = 3;
189 return true;
190 } else
191 clg->state = 2;
192
193 break;
194 case 3:
195 if (rnd < clg->a3)
196 clg->state = 2;
197 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
198 clg->state = 1;
199 return true;
200 } else if (clg->a2 + clg->a3 < rnd) {
201 clg->state = 3;
202 return true;
203 }
204 break;
205 case 4:
206 clg->state = 1;
207 break;
208 }
209
210 return false;
211 }
212
213 /* loss_gilb_ell - Gilbert-Elliot model loss generator
214 * Generates losses according to the Gilbert-Elliot loss model or
215 * its special cases (Gilbert or Simple Gilbert)
216 *
217 * Makes a comparison between random number and the transition
218 * probabilities outgoing from the current state, then decides the
219 * next state. A second random number is extracted and the comparison
220 * with the loss probability of the current state decides if the next
221 * packet will be transmitted or lost.
222 */
223 static bool loss_gilb_ell(struct netem_sched_data *q)
224 {
225 struct clgstate *clg = &q->clg;
226
227 switch (clg->state) {
228 case 1:
229 if (net_random() < clg->a1)
230 clg->state = 2;
231 if (net_random() < clg->a4)
232 return true;
break;
233 case 2:
234 if (net_random() < clg->a2)
235 clg->state = 1;
236 if (clg->a3 > net_random())
237 return true;
238 }
239
240 return false;
241 }
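/*
 * In Gilbert-Elliot terms (see the clgstate comments above) a1 = p is the
 * good->bad transition probability and a2 = r the bad->good one, so the
 * mean loss-burst (bad-state) length is 1/r packets; a3 (h) and a4 (1-k)
 * set how lossy the bad and good states themselves are.  The "Gilbert"
 * and "Simple Gilbert" special cases correspond to fixing k = 1 (and,
 * for Simple Gilbert, h = 0).
 */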
242
243 static bool loss_event(struct netem_sched_data *q)
244 {
245 switch (q->loss_model) {
246 case CLG_RANDOM:
247 /* Random packet drop 0 => none, ~0 => all */
248 return q->loss && q->loss >= get_crandom(&q->loss_cor);
249
250 case CLG_4_STATES:
251 /* 4-state loss model algorithm (also used for the GI model):
252 * ask the 4-state Markov loss generator whether the next
253 * packet should be dropped.
255 */
256 return loss_4state(q);
257
258 case CLG_GILB_ELL:
259 /* Gilbert-Elliot loss model algorithm:
260 * ask the Gilbert-Elliot loss generator whether the next
261 * packet should be dropped.
263 */
264 return loss_gilb_ell(q);
265 }
266
267 return false; /* not reached */
268 }
269
270
271 /* tabledist - return a pseudo-randomly distributed value with mean mu and
272 * std deviation sigma. Uses table lookup to approximate the desired
273 * distribution, and a uniformly-distributed pseudo-random source.
274 */
275 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
276 struct crndstate *state,
277 const struct disttable *dist)
278 {
279 psched_tdiff_t x;
280 long t;
281 u32 rnd;
282
283 if (sigma == 0)
284 return mu;
285
286 rnd = get_crandom(state);
287
288 /* default uniform distribution */
289 if (dist == NULL)
290 return (rnd % (2*sigma)) - sigma + mu;
291
292 t = dist->table[rnd % dist->size];
293 x = (sigma % NETEM_DIST_SCALE) * t;
294 if (x >= 0)
295 x += NETEM_DIST_SCALE/2;
296 else
297 x -= NETEM_DIST_SCALE/2;
298
299 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
300 }
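/*
 * The split arithmetic above effectively evaluates
 *
 *   mu + t * sigma / NETEM_DIST_SCALE       (rounded to nearest)
 *
 * while keeping the intermediate products small, where t is a signed
 * table entry pre-scaled by NETEM_DIST_SCALE.  The normal, pareto and
 * paretonormal tables shipped with iproute2 (generated by its maketable
 * helper) use this format.
 */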
301
302 static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
303 {
304 u64 ticks = (u64)len * NSEC_PER_SEC;
305
306 do_div(ticks, rate);
307 return PSCHED_NS2TICKS(ticks);
308 }
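/*
 * The NSEC_PER_SEC scaling above assumes rate is in bytes per second.
 * A purely illustrative example: len = 1500 bytes at rate = 125000 B/s
 * (~1 Mbit/s) gives 1500 * 10^9 / 125000 = 12,000,000 ns, i.e. 12 ms of
 * serialization delay, which is then converted to scheduler ticks.
 */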
309
310 /*
311 * Insert one skb into qdisc.
312 * Note: parent depends on return value to account for queue length.
313 * NET_XMIT_DROP: queue length didn't change.
314 * NET_XMIT_SUCCESS: one skb was queued.
315 */
316 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
317 {
318 struct netem_sched_data *q = qdisc_priv(sch);
319 /* We don't fill cb now as skb_unshare() may invalidate it */
320 struct netem_skb_cb *cb;
321 struct sk_buff *skb2;
322 int ret;
323 int count = 1;
324
325 /* Random duplication */
326 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
327 ++count;
328
329 /* Drop packet? */
330 if (loss_event(q))
331 --count;
332
333 if (count == 0) {
334 sch->qstats.drops++;
335 kfree_skb(skb);
336 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
337 }
338
339 skb_orphan(skb);
340
341 /*
342 * If we need to duplicate the packet, then re-insert it at the top
343 * of the qdisc tree, since the parent queuer expects that only one
344 * skb will be queued.
345 */
346 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
347 struct Qdisc *rootq = qdisc_root(sch);
348 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
349 q->duplicate = 0;
350
351 qdisc_enqueue_root(skb2, rootq);
352 q->duplicate = dupsave;
353 }
354
355 /*
356 * Randomized packet corruption.
357 * Make a copy if needed since we are modifying the data.
358 * If the packet is going to be hardware checksummed, then
359 * do it now in software before we mangle it.
360 */
361 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
362 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
363 (skb->ip_summed == CHECKSUM_PARTIAL &&
364 skb_checksum_help(skb))) {
365 sch->qstats.drops++;
366 return NET_XMIT_DROP;
367 }
368
369 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
370 }
371
372 cb = netem_skb_cb(skb);
373 if (q->gap == 0 || /* not doing reordering */
374 q->counter < q->gap || /* inside last reordering gap */
375 q->reorder < get_crandom(&q->reorder_cor)) {
376 psched_time_t now;
377 psched_tdiff_t delay;
378
379 delay = tabledist(q->latency, q->jitter,
380 &q->delay_cor, q->delay_dist);
381
382 now = psched_get_time();
383
384 if (q->rate) {
385 struct sk_buff_head *list = &q->qdisc->q;
386
387 delay += packet_len_2_sched_time(skb->len, q->rate);
388
389 if (!skb_queue_empty(list)) {
390 /*
391 * Last packet in queue is reference point (now).
392 * First packet in queue is already in flight,
393 * calculate this time bonus and subtract
394 * from delay.
395 */
396 delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
397 now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
398 }
399 }
400
401 cb->time_to_send = now + delay;
402 ++q->counter;
403 ret = qdisc_enqueue(skb, q->qdisc);
404 } else {
405 /*
406 * Do re-ordering by putting one out of N packets at the front
407 * of the queue.
408 */
409 cb->time_to_send = psched_get_time();
410 q->counter = 0;
411
412 __skb_queue_head(&q->qdisc->q, skb);
413 q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
414 q->qdisc->qstats.requeues++;
415 ret = NET_XMIT_SUCCESS;
416 }
417
418 if (ret != NET_XMIT_SUCCESS) {
419 if (net_xmit_drop_count(ret)) {
420 sch->qstats.drops++;
421 return ret;
422 }
423 }
424
425 sch->q.qlen++;
426 return NET_XMIT_SUCCESS;
427 }
428
429 static unsigned int netem_drop(struct Qdisc *sch)
430 {
431 struct netem_sched_data *q = qdisc_priv(sch);
432 unsigned int len = 0;
433
434 if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
435 sch->q.qlen--;
436 sch->qstats.drops++;
437 }
438 return len;
439 }
440
441 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
442 {
443 struct netem_sched_data *q = qdisc_priv(sch);
444 struct sk_buff *skb;
445
446 if (qdisc_is_throttled(sch))
447 return NULL;
448
449 skb = q->qdisc->ops->peek(q->qdisc);
450 if (skb) {
451 const struct netem_skb_cb *cb = netem_skb_cb(skb);
452 psched_time_t now = psched_get_time();
453
454 /* is it time to send this packet? */
455 if (cb->time_to_send <= now) {
456 skb = qdisc_dequeue_peeked(q->qdisc);
457 if (unlikely(!skb))
458 return NULL;
459
460 #ifdef CONFIG_NET_CLS_ACT
461 /*
462 * If it's at ingress let's pretend the delay is
463 * from the network (tstamp will be updated).
464 */
465 if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
466 skb->tstamp.tv64 = 0;
467 #endif
468
469 sch->q.qlen--;
470 qdisc_unthrottled(sch);
471 qdisc_bstats_update(sch, skb);
472 return skb;
473 }
474
475 qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
476 }
477
478 return NULL;
479 }
480
481 static void netem_reset(struct Qdisc *sch)
482 {
483 struct netem_sched_data *q = qdisc_priv(sch);
484
485 qdisc_reset(q->qdisc);
486 sch->q.qlen = 0;
487 qdisc_watchdog_cancel(&q->watchdog);
488 }
489
490 static void dist_free(struct disttable *d)
491 {
492 if (d) {
493 if (is_vmalloc_addr(d))
494 vfree(d);
495 else
496 kfree(d);
497 }
498 }
499
500 /*
501 * Distribution data is a variable size payload containing
502 * signed 16 bit values.
503 */
504 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
505 {
506 struct netem_sched_data *q = qdisc_priv(sch);
507 size_t n = nla_len(attr)/sizeof(__s16);
508 const __s16 *data = nla_data(attr);
509 spinlock_t *root_lock;
510 struct disttable *d;
511 int i;
512 size_t s;
513
514 if (n > NETEM_DIST_MAX)
515 return -EINVAL;
516
517 s = sizeof(struct disttable) + n * sizeof(s16);
518 d = kmalloc(s, GFP_KERNEL);
519 if (!d)
520 d = vmalloc(s);
521 if (!d)
522 return -ENOMEM;
523
524 d->size = n;
525 for (i = 0; i < n; i++)
526 d->table[i] = data[i];
527
528 root_lock = qdisc_root_sleeping_lock(sch);
529
530 spin_lock_bh(root_lock);
531 dist_free(q->delay_dist);
532 q->delay_dist = d;
533 spin_unlock_bh(root_lock);
534 return 0;
535 }
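/*
 * A sketch of how this table normally arrives (assuming iproute2; the
 * exact file path is distribution-dependent):
 *
 *   tc qdisc change dev eth0 root netem delay 100ms 20ms distribution normal
 *
 * makes tc read a file such as /usr/lib/tc/normal.dist (or pareto,
 * paretonormal, or a user-generated table) and send the raw s16 samples
 * in TCA_NETEM_DELAY_DIST, which lands here.
 */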
536
537 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
538 {
539 struct netem_sched_data *q = qdisc_priv(sch);
540 const struct tc_netem_corr *c = nla_data(attr);
541
542 init_crandom(&q->delay_cor, c->delay_corr);
543 init_crandom(&q->loss_cor, c->loss_corr);
544 init_crandom(&q->dup_cor, c->dup_corr);
545 }
546
547 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
548 {
549 struct netem_sched_data *q = qdisc_priv(sch);
550 const struct tc_netem_reorder *r = nla_data(attr);
551
552 q->reorder = r->probability;
553 init_crandom(&q->reorder_cor, r->correlation);
554 }
555
556 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
557 {
558 struct netem_sched_data *q = qdisc_priv(sch);
559 const struct tc_netem_corrupt *r = nla_data(attr);
560
561 q->corrupt = r->probability;
562 init_crandom(&q->corrupt_cor, r->correlation);
563 }
564
565 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
566 {
567 struct netem_sched_data *q = qdisc_priv(sch);
568 const struct tc_netem_rate *r = nla_data(attr);
569
570 q->rate = r->rate;
571 }
572
573 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
574 {
575 struct netem_sched_data *q = qdisc_priv(sch);
576 const struct nlattr *la;
577 int rem;
578
579 nla_for_each_nested(la, attr, rem) {
580 u16 type = nla_type(la);
581
582 switch(type) {
583 case NETEM_LOSS_GI: {
584 const struct tc_netem_gimodel *gi = nla_data(la);
585
586 if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
587 pr_info("netem: incorrect gi model size\n");
588 return -EINVAL;
589 }
590
591 q->loss_model = CLG_4_STATES;
592
593 q->clg.state = 1;
594 q->clg.a1 = gi->p13;
595 q->clg.a2 = gi->p31;
596 q->clg.a3 = gi->p32;
597 q->clg.a4 = gi->p14;
598 q->clg.a5 = gi->p23;
599 break;
600 }
601
602 case NETEM_LOSS_GE: {
603 const struct tc_netem_gemodel *ge = nla_data(la);
604
605 if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
606 pr_info("netem: incorrect gi model size\n");
607 return -EINVAL;
608 }
609
610 q->loss_model = CLG_GILB_ELL;
611 q->clg.state = 1;
612 q->clg.a1 = ge->p;
613 q->clg.a2 = ge->r;
614 q->clg.a3 = ge->h;
615 q->clg.a4 = ge->k1;
616 break;
617 }
618
619 default:
620 pr_info("netem: unknown loss type %u\n", type);
621 return -EINVAL;
622 }
623 }
624
625 return 0;
626 }
627
628 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
629 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
630 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
631 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
632 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
633 [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
634 };
635
636 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
637 const struct nla_policy *policy, int len)
638 {
639 int nested_len = nla_len(nla) - NLA_ALIGN(len);
640
641 if (nested_len < 0) {
642 pr_info("netem: invalid attributes len %d\n", nested_len);
643 return -EINVAL;
644 }
645
646 if (nested_len >= nla_attr_size(0))
647 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
648 nested_len, policy);
649
650 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
651 return 0;
652 }
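/*
 * Note on the offset handling above: netem's TCA_OPTIONS payload starts
 * with a fixed struct tc_netem_qopt and only then carries the nested
 * attributes, so netem_change() passes len = sizeof(struct tc_netem_qopt)
 * and parsing starts NLA_ALIGN(len) bytes into the attribute.
 */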
653
654 /* Parse netlink message to set options */
655 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
656 {
657 struct netem_sched_data *q = qdisc_priv(sch);
658 struct nlattr *tb[TCA_NETEM_MAX + 1];
659 struct tc_netem_qopt *qopt;
660 int ret;
661
662 if (opt == NULL)
663 return -EINVAL;
664
665 qopt = nla_data(opt);
666 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
667 if (ret < 0)
668 return ret;
669
670 ret = fifo_set_limit(q->qdisc, qopt->limit);
671 if (ret) {
672 pr_info("netem: can't set fifo limit\n");
673 return ret;
674 }
675
676 q->latency = qopt->latency;
677 q->jitter = qopt->jitter;
678 q->limit = qopt->limit;
679 q->gap = qopt->gap;
680 q->counter = 0;
681 q->loss = qopt->loss;
682 q->duplicate = qopt->duplicate;
683
684 /* for compatibility with earlier versions.
685 * if gap is set, need to assume 100% probability
686 */
687 if (q->gap)
688 q->reorder = ~0;
689
690 if (tb[TCA_NETEM_CORR])
691 get_correlation(sch, tb[TCA_NETEM_CORR]);
692
693 if (tb[TCA_NETEM_DELAY_DIST]) {
694 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
695 if (ret)
696 return ret;
697 }
698
699 if (tb[TCA_NETEM_REORDER])
700 get_reorder(sch, tb[TCA_NETEM_REORDER]);
701
702 if (tb[TCA_NETEM_CORRUPT])
703 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
704
705 if (tb[TCA_NETEM_RATE])
706 get_rate(sch, tb[TCA_NETEM_RATE]);
707
708 q->loss_model = CLG_RANDOM;
709 if (tb[TCA_NETEM_LOSS])
710 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
711
712 return ret;
713 }
714
715 /*
716 * Special case version of FIFO queue for use by netem.
717 * It queues packets in order based on the timestamps in the skbs.
718 */
719 struct fifo_sched_data {
720 u32 limit;
721 psched_time_t oldest;
722 };
723
724 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
725 {
726 struct fifo_sched_data *q = qdisc_priv(sch);
727 struct sk_buff_head *list = &sch->q;
728 psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
729 struct sk_buff *skb;
730
731 if (likely(skb_queue_len(list) < q->limit)) {
732 /* Optimize for add at tail */
733 if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
734 q->oldest = tnext;
735 return qdisc_enqueue_tail(nskb, sch);
736 }
737
738 skb_queue_reverse_walk(list, skb) {
739 const struct netem_skb_cb *cb = netem_skb_cb(skb);
740
741 if (tnext >= cb->time_to_send)
742 break;
743 }
744
745 __skb_queue_after(list, skb, nskb);
746
747 sch->qstats.backlog += qdisc_pkt_len(nskb);
748
749 return NET_XMIT_SUCCESS;
750 }
751
752 return qdisc_reshape_fail(nskb, sch);
753 }
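/*
 * Packets are kept sorted by cb->time_to_send.  Most arrivals carry a send
 * time no earlier than anything already queued and take the add-at-tail
 * fast path; when jitter produces an earlier timestamp the reverse walk
 * finds the slot.  Illustration: with queued send times 10, 20, 30 a new
 * packet stamped 25 is linked after the 20 entry and before the 30 one.
 */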
754
755 static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
756 {
757 struct fifo_sched_data *q = qdisc_priv(sch);
758
759 if (opt) {
760 struct tc_fifo_qopt *ctl = nla_data(opt);
761 if (nla_len(opt) < sizeof(*ctl))
762 return -EINVAL;
763
764 q->limit = ctl->limit;
765 } else
766 q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
767
768 q->oldest = PSCHED_PASTPERFECT;
769 return 0;
770 }
771
772 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
773 {
774 struct fifo_sched_data *q = qdisc_priv(sch);
775 struct tc_fifo_qopt opt = { .limit = q->limit };
776
777 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
778 return skb->len;
779
780 nla_put_failure:
781 return -1;
782 }
783
784 static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
785 .id = "tfifo",
786 .priv_size = sizeof(struct fifo_sched_data),
787 .enqueue = tfifo_enqueue,
788 .dequeue = qdisc_dequeue_head,
789 .peek = qdisc_peek_head,
790 .drop = qdisc_queue_drop,
791 .init = tfifo_init,
792 .reset = qdisc_reset_queue,
793 .change = tfifo_init,
794 .dump = tfifo_dump,
795 };
796
797 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
798 {
799 struct netem_sched_data *q = qdisc_priv(sch);
800 int ret;
801
802 if (!opt)
803 return -EINVAL;
804
805 qdisc_watchdog_init(&q->watchdog, sch);
806
807 q->loss_model = CLG_RANDOM;
808 q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
809 TC_H_MAKE(sch->handle, 1));
810 if (!q->qdisc) {
811 pr_notice("netem: qdisc create tfifo qdisc failed\n");
812 return -ENOMEM;
813 }
814
815 ret = netem_change(sch, opt);
816 if (ret) {
817 pr_info("netem: change failed\n");
818 qdisc_destroy(q->qdisc);
819 }
820 return ret;
821 }
822
823 static void netem_destroy(struct Qdisc *sch)
824 {
825 struct netem_sched_data *q = qdisc_priv(sch);
826
827 qdisc_watchdog_cancel(&q->watchdog);
828 qdisc_destroy(q->qdisc);
829 dist_free(q->delay_dist);
830 }
831
832 static int dump_loss_model(const struct netem_sched_data *q,
833 struct sk_buff *skb)
834 {
835 struct nlattr *nest;
836
837 nest = nla_nest_start(skb, TCA_NETEM_LOSS);
838 if (nest == NULL)
839 goto nla_put_failure;
840
841 switch (q->loss_model) {
842 case CLG_RANDOM:
843 /* legacy loss model */
844 nla_nest_cancel(skb, nest);
845 return 0; /* no data */
846
847 case CLG_4_STATES: {
848 struct tc_netem_gimodel gi = {
849 .p13 = q->clg.a1,
850 .p31 = q->clg.a2,
851 .p32 = q->clg.a3,
852 .p14 = q->clg.a4,
853 .p23 = q->clg.a5,
854 };
855
856 NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
857 break;
858 }
859 case CLG_GILB_ELL: {
860 struct tc_netem_gemodel ge = {
861 .p = q->clg.a1,
862 .r = q->clg.a2,
863 .h = q->clg.a3,
864 .k1 = q->clg.a4,
865 };
866
867 NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
868 break;
869 }
870 }
871
872 nla_nest_end(skb, nest);
873 return 0;
874
875 nla_put_failure:
876 nla_nest_cancel(skb, nest);
877 return -1;
878 }
879
880 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
881 {
882 const struct netem_sched_data *q = qdisc_priv(sch);
883 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
884 struct tc_netem_qopt qopt;
885 struct tc_netem_corr cor;
886 struct tc_netem_reorder reorder;
887 struct tc_netem_corrupt corrupt;
888 struct tc_netem_rate rate;
889
890 qopt.latency = q->latency;
891 qopt.jitter = q->jitter;
892 qopt.limit = q->limit;
893 qopt.loss = q->loss;
894 qopt.gap = q->gap;
895 qopt.duplicate = q->duplicate;
896 NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
897
898 cor.delay_corr = q->delay_cor.rho;
899 cor.loss_corr = q->loss_cor.rho;
900 cor.dup_corr = q->dup_cor.rho;
901 NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
902
903 reorder.probability = q->reorder;
904 reorder.correlation = q->reorder_cor.rho;
905 NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
906
907 corrupt.probability = q->corrupt;
908 corrupt.correlation = q->corrupt_cor.rho;
909 NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
910
911 rate.rate = q->rate;
912 NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
913
914 if (dump_loss_model(q, skb) != 0)
915 goto nla_put_failure;
916
917 return nla_nest_end(skb, nla);
918
919 nla_put_failure:
920 nlmsg_trim(skb, nla);
921 return -1;
922 }
923
924 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
925 struct sk_buff *skb, struct tcmsg *tcm)
926 {
927 struct netem_sched_data *q = qdisc_priv(sch);
928
929 if (cl != 1) /* only one class */
930 return -ENOENT;
931
932 tcm->tcm_handle |= TC_H_MIN(1);
933 tcm->tcm_info = q->qdisc->handle;
934
935 return 0;
936 }
937
938 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
939 struct Qdisc **old)
940 {
941 struct netem_sched_data *q = qdisc_priv(sch);
942
943 if (new == NULL)
944 new = &noop_qdisc;
945
946 sch_tree_lock(sch);
947 *old = q->qdisc;
948 q->qdisc = new;
949 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
950 qdisc_reset(*old);
951 sch_tree_unlock(sch);
952
953 return 0;
954 }
955
956 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
957 {
958 struct netem_sched_data *q = qdisc_priv(sch);
959 return q->qdisc;
960 }
961
962 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
963 {
964 return 1;
965 }
966
967 static void netem_put(struct Qdisc *sch, unsigned long arg)
968 {
969 }
970
971 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
972 {
973 if (!walker->stop) {
974 if (walker->count >= walker->skip)
975 if (walker->fn(sch, 1, walker) < 0) {
976 walker->stop = 1;
977 return;
978 }
979 walker->count++;
980 }
981 }
982
983 static const struct Qdisc_class_ops netem_class_ops = {
984 .graft = netem_graft,
985 .leaf = netem_leaf,
986 .get = netem_get,
987 .put = netem_put,
988 .walk = netem_walk,
989 .dump = netem_dump_class,
990 };
991
992 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
993 .id = "netem",
994 .cl_ops = &netem_class_ops,
995 .priv_size = sizeof(struct netem_sched_data),
996 .enqueue = netem_enqueue,
997 .dequeue = netem_dequeue,
998 .peek = qdisc_peek_dequeued,
999 .drop = netem_drop,
1000 .init = netem_init,
1001 .reset = netem_reset,
1002 .destroy = netem_destroy,
1003 .change = netem_change,
1004 .dump = netem_dump,
1005 .owner = THIS_MODULE,
1006 };
1007
1008
1009 static int __init netem_module_init(void)
1010 {
1011 pr_info("netem: version " VERSION "\n");
1012 return register_qdisc(&netem_qdisc_ops);
1013 }
1014 static void __exit netem_module_exit(void)
1015 {
1016 unregister_qdisc(&netem_qdisc_ops);
1017 }
1018 module_init(netem_module_init)
1019 module_exit(netem_module_exit)
1020 MODULE_LICENSE("GPL");