1 /*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26
27 #include <net/netlink.h>
28 #include <net/pkt_sched.h>
29
30 #define VERSION "1.3"
31
32 /* Network Emulation Queuing algorithm.
33 ====================================
34
	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
37 [2] Luigi Rizzo, DummyNet for FreeBSD
38
39 ----------------------------------------------------------------
40
41 This started out as a simple way to delay outgoing packets to
42 test TCP but has grown to include most of the functionality
43 of a full blown network emulator like NISTnet. It can delay
44 packets and add random jitter (and correlation). The random
45 distribution can be loaded from a table as well to provide
46 normal, Pareto, or experimental curves. Packet loss,
47 duplication, and reordering can also be emulated.
48
	This qdisc does not do classification; that can be handled by
	layering other disciplines.  It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.
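
	Example (illustrative only; "eth0" and the numbers below are
	arbitrary, iproute2's tc is the usual frontend for this qdisc):

		tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.1%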
53
	Correlated Loss Generator models

	Added generation of correlated loss according to a 4-state Markov
	chain (the GI model) and to the Gilbert-Elliott model.
58
59 References:
60 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
61 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
62 and intuitive loss model for packet networks and its implementation
63 in the Netem module in the Linux kernel", available in [1]
64
	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
66 Fabio Ludovici <fabio.ludovici at yahoo.it>
67 */
68
69 struct netem_sched_data {
70 struct Qdisc *qdisc;
71 struct qdisc_watchdog watchdog;
72
73 psched_tdiff_t latency;
74 psched_tdiff_t jitter;
75
76 u32 loss;
77 u32 limit;
78 u32 counter;
79 u32 gap;
80 u32 duplicate;
81 u32 reorder;
82 u32 corrupt;
83 u32 rate;
84 s32 packet_overhead;
85 u32 cell_size;
86 u32 cell_size_reciprocal;
87 s32 cell_overhead;
88
89 struct crndstate {
90 u32 last;
91 u32 rho;
92 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
93
94 struct disttable {
95 u32 size;
96 s16 table[0];
97 } *delay_dist;
98
	enum {
		CLG_RANDOM,	/* independent (uncorrelated) random losses */
		CLG_4_STATES,	/* 4-state Markov chain (GI model) */
		CLG_GILB_ELL,	/* Gilbert-Elliott model */
	} loss_model;
104
105 /* Correlated Loss Generation models */
106 struct clgstate {
107 /* state of the Markov chain */
108 u8 state;
109
110 /* 4-states and Gilbert-Elliot models */
111 u32 a1; /* p13 for 4-states or p for GE */
112 u32 a2; /* p31 for 4-states or r for GE */
113 u32 a3; /* p32 for 4-states or h for GE */
114 u32 a4; /* p14 for 4-states or 1-k for GE */
115 u32 a5; /* p23 used only in 4-states */
116 } clg;
117
118 };
119
120 /* Time stamp put into socket buffer control block */
121 struct netem_skb_cb {
122 psched_time_t time_to_send;
123 };
124
125 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
126 {
127 BUILD_BUG_ON(sizeof(skb->cb) <
128 sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
129 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
130 }
131
132 /* init_crandom - initialize correlated random number generator
133 * Use entropy source for initial seed.
134 */
135 static void init_crandom(struct crndstate *state, unsigned long rho)
136 {
137 state->rho = rho;
138 state->last = net_random();
139 }
140
141 /* get_crandom - correlated random number generator
142 * Next number depends on last value.
143 * rho is scaled to avoid floating point.
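 *
 * A sketch of the recurrence implemented below, with rho read as a
 * fixed-point fraction of 2^32:
 *
 *	next = ((2^32 - rho) * uniform + rho * last) >> 32
 *
 * so rho == 0 yields independent values, while values of rho close to
 * 2^32 track the previous output ever more closely.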
144 */
145 static u32 get_crandom(struct crndstate *state)
146 {
147 u64 value, rho;
148 unsigned long answer;
149
150 if (state->rho == 0) /* no correlation */
151 return net_random();
152
153 value = net_random();
154 rho = (u64)state->rho + 1;
155 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
156 state->last = answer;
157 return answer;
158 }
159
160 /* loss_4state - 4-state model loss generator
161 * Generates losses according to the 4-state Markov chain adopted in
162 * the GI (General and Intuitive) loss model.
163 */
164 static bool loss_4state(struct netem_sched_data *q)
165 {
166 struct clgstate *clg = &q->clg;
167 u32 rnd = net_random();
168
169 /*
	 * Compare rnd with the transition probabilities out of the
	 * current state, then decide the next state and whether the
	 * next packet has to be transmitted or lost.
173 * The four states correspond to:
174 * 1 => successfully transmitted packets within a gap period
175 * 4 => isolated losses within a gap period
176 * 3 => lost packets within a burst period
177 * 2 => successfully transmitted packets within a burst period
178 */
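	/*
	 * Note: the a1..a5 thresholds are the GI transition probabilities
	 * (p13, p31, p32, p14, p23) expressed as u32 values where ~0U
	 * corresponds to probability 1.0, hence the direct comparisons
	 * with rnd.
	 */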
179 switch (clg->state) {
180 case 1:
181 if (rnd < clg->a4) {
182 clg->state = 4;
183 return true;
184 } else if (clg->a4 < rnd && rnd < clg->a1) {
185 clg->state = 3;
186 return true;
187 } else if (clg->a1 < rnd)
188 clg->state = 1;
189
190 break;
191 case 2:
192 if (rnd < clg->a5) {
193 clg->state = 3;
194 return true;
195 } else
196 clg->state = 2;
197
198 break;
199 case 3:
200 if (rnd < clg->a3)
201 clg->state = 2;
202 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
203 clg->state = 1;
204 return true;
205 } else if (clg->a2 + clg->a3 < rnd) {
206 clg->state = 3;
207 return true;
208 }
209 break;
210 case 4:
211 clg->state = 1;
212 break;
213 }
214
215 return false;
216 }
217
/* loss_gilb_ell - Gilbert-Elliott model loss generator
 * Generates losses according to the Gilbert-Elliott loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between a random number and the transition
223 * probabilities outgoing from the current state, then decides the
224 * next state. A second random number is extracted and the comparison
225 * with the loss probability of the current state decides if the next
226 * packet will be transmitted or lost.
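 *
 * Here a1 = p (good -> bad), a2 = r (bad -> good), a3 = h and
 * a4 = 1-k, all u32 probabilities where ~0U corresponds to 1.0
 * (see the clgstate field comments above).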
227 */
228 static bool loss_gilb_ell(struct netem_sched_data *q)
229 {
230 struct clgstate *clg = &q->clg;
231
	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
		break;
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		if (clg->a3 > net_random())
			return true;
	}
244
245 return false;
246 }
247
248 static bool loss_event(struct netem_sched_data *q)
249 {
250 switch (q->loss_model) {
251 case CLG_RANDOM:
252 /* Random packet drop 0 => none, ~0 => all */
253 return q->loss && q->loss >= get_crandom(&q->loss_cor);
254
255 case CLG_4_STATES:
		/* 4-state loss model algorithm (also used for the GI model):
		 * ask the Markov 4-state loss generator whether the next
		 * packet should be dropped.
		 */
261 return loss_4state(q);
262
263 case CLG_GILB_ELL:
		/* Gilbert-Elliott loss model algorithm:
		 * ask the Gilbert-Elliott loss generator whether the next
		 * packet should be dropped.
		 */
269 return loss_gilb_ell(q);
270 }
271
272 return false; /* not reached */
273 }
274
275
276 /* tabledist - return a pseudo-randomly distributed value with mean mu and
277 * std deviation sigma. Uses table lookup to approximate the desired
278 * distribution, and a uniformly-distributed pseudo-random source.
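 *
 * A sketch of the fixed-point math below: the table holds samples t of
 * the target distribution scaled by NETEM_DIST_SCALE, so the result is
 * roughly mu + t * sigma / NETEM_DIST_SCALE, split into two terms to
 * avoid overflow and rounded half away from zero.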
279 */
280 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
281 struct crndstate *state,
282 const struct disttable *dist)
283 {
284 psched_tdiff_t x;
285 long t;
286 u32 rnd;
287
288 if (sigma == 0)
289 return mu;
290
291 rnd = get_crandom(state);
292
293 /* default uniform distribution */
294 if (dist == NULL)
295 return (rnd % (2*sigma)) - sigma + mu;
296
297 t = dist->table[rnd % dist->size];
298 x = (sigma % NETEM_DIST_SCALE) * t;
299 if (x >= 0)
300 x += NETEM_DIST_SCALE/2;
301 else
302 x -= NETEM_DIST_SCALE/2;
303
304 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
305 }
306
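/* Transmission time of a packet on the emulated link: length plus
 * per-packet overhead, optionally rounded up to whole cells, divided
 * by the configured rate in bytes per second, converted to scheduler
 * ticks.
 */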
307 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
308 {
309 u64 ticks;
310
311 len += q->packet_overhead;
312
313 if (q->cell_size) {
314 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
315
316 if (len > cells * q->cell_size) /* extra cell needed for remainder */
317 cells++;
318 len = cells * (q->cell_size + q->cell_overhead);
319 }
320
321 ticks = (u64)len * NSEC_PER_SEC;
322
323 do_div(ticks, q->rate);
324 return PSCHED_NS2TICKS(ticks);
325 }
326
327 /*
328 * Insert one skb into qdisc.
329 * Note: parent depends on return value to account for queue length.
330 * NET_XMIT_DROP: queue length didn't change.
331 * NET_XMIT_SUCCESS: one skb was queued.
332 */
333 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
334 {
335 struct netem_sched_data *q = qdisc_priv(sch);
336 /* We don't fill cb now as skb_unshare() may invalidate it */
337 struct netem_skb_cb *cb;
338 struct sk_buff *skb2;
339 int ret;
340 int count = 1;
341
342 /* Random duplication */
343 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
344 ++count;
345
346 /* Drop packet? */
347 if (loss_event(q))
348 --count;
349
350 if (count == 0) {
351 sch->qstats.drops++;
352 kfree_skb(skb);
353 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
354 }
355
356 skb_orphan(skb);
357
358 /*
359 * If we need to duplicate packet, then re-insert at top of the
360 * qdisc tree, since parent queuer expects that only one
361 * skb will be queued.
362 */
363 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
364 struct Qdisc *rootq = qdisc_root(sch);
365 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
366 q->duplicate = 0;
367
368 qdisc_enqueue_root(skb2, rootq);
369 q->duplicate = dupsave;
370 }
371
372 /*
373 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do the checksum now in software before we mangle it.
377 */
378 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
379 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
380 (skb->ip_summed == CHECKSUM_PARTIAL &&
381 skb_checksum_help(skb))) {
382 sch->qstats.drops++;
383 return NET_XMIT_DROP;
384 }
385
386 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
387 }
388
389 cb = netem_skb_cb(skb);
390 if (q->gap == 0 || /* not doing reordering */
391 q->counter < q->gap || /* inside last reordering gap */
392 q->reorder < get_crandom(&q->reorder_cor)) {
393 psched_time_t now;
394 psched_tdiff_t delay;
395
396 delay = tabledist(q->latency, q->jitter,
397 &q->delay_cor, q->delay_dist);
398
399 now = psched_get_time();
400
401 if (q->rate) {
402 struct sk_buff_head *list = &q->qdisc->q;
403
404 delay += packet_len_2_sched_time(skb->len, q);
405
406 if (!skb_queue_empty(list)) {
407 /*
408 * Last packet in queue is reference point (now).
409 * First packet in queue is already in flight,
				 * calculate this time bonus and subtract
411 * from delay.
412 */
413 delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
414 now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
415 }
416 }
417
418 cb->time_to_send = now + delay;
419 ++q->counter;
420 ret = qdisc_enqueue(skb, q->qdisc);
421 } else {
422 /*
423 * Do re-ordering by putting one out of N packets at the front
424 * of the queue.
425 */
426 cb->time_to_send = psched_get_time();
427 q->counter = 0;
428
429 __skb_queue_head(&q->qdisc->q, skb);
430 q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
431 q->qdisc->qstats.requeues++;
432 ret = NET_XMIT_SUCCESS;
433 }
434
435 if (ret != NET_XMIT_SUCCESS) {
436 if (net_xmit_drop_count(ret)) {
437 sch->qstats.drops++;
438 return ret;
439 }
440 }
441
442 sch->q.qlen++;
443 return NET_XMIT_SUCCESS;
444 }
445
446 static unsigned int netem_drop(struct Qdisc *sch)
447 {
448 struct netem_sched_data *q = qdisc_priv(sch);
449 unsigned int len = 0;
450
451 if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
452 sch->q.qlen--;
453 sch->qstats.drops++;
454 }
455 return len;
456 }
457
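/* Hand the next packet to the device once its time_to_send has been
 * reached; otherwise leave it queued and (re)arm the qdisc watchdog to
 * retry the dequeue at that time.
 */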
458 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
459 {
460 struct netem_sched_data *q = qdisc_priv(sch);
461 struct sk_buff *skb;
462
463 if (qdisc_is_throttled(sch))
464 return NULL;
465
466 skb = q->qdisc->ops->peek(q->qdisc);
467 if (skb) {
468 const struct netem_skb_cb *cb = netem_skb_cb(skb);
469 psched_time_t now = psched_get_time();
470
		/* if the scheduled send time has been reached, dequeue now */
472 if (cb->time_to_send <= now) {
473 skb = qdisc_dequeue_peeked(q->qdisc);
474 if (unlikely(!skb))
475 return NULL;
476
477 #ifdef CONFIG_NET_CLS_ACT
478 /*
479 * If it's at ingress let's pretend the delay is
480 * from the network (tstamp will be updated).
481 */
482 if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
483 skb->tstamp.tv64 = 0;
484 #endif
485
486 sch->q.qlen--;
487 qdisc_unthrottled(sch);
488 qdisc_bstats_update(sch, skb);
489 return skb;
490 }
491
492 qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
493 }
494
495 return NULL;
496 }
497
498 static void netem_reset(struct Qdisc *sch)
499 {
500 struct netem_sched_data *q = qdisc_priv(sch);
501
502 qdisc_reset(q->qdisc);
503 sch->q.qlen = 0;
504 qdisc_watchdog_cancel(&q->watchdog);
505 }
506
507 static void dist_free(struct disttable *d)
508 {
509 if (d) {
510 if (is_vmalloc_addr(d))
511 vfree(d);
512 else
513 kfree(d);
514 }
515 }
516
517 /*
518 * Distribution data is a variable size payload containing
519 * signed 16 bit values.
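 * These are samples of the desired delay distribution, scaled by
 * NETEM_DIST_SCALE and typically generated by iproute2's maketable;
 * tabledist() indexes the table with a (correlated) random value.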
520 */
521 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
522 {
523 struct netem_sched_data *q = qdisc_priv(sch);
524 size_t n = nla_len(attr)/sizeof(__s16);
525 const __s16 *data = nla_data(attr);
526 spinlock_t *root_lock;
527 struct disttable *d;
528 int i;
529 size_t s;
530
531 if (n > NETEM_DIST_MAX)
532 return -EINVAL;
533
534 s = sizeof(struct disttable) + n * sizeof(s16);
535 d = kmalloc(s, GFP_KERNEL);
536 if (!d)
537 d = vmalloc(s);
538 if (!d)
539 return -ENOMEM;
540
541 d->size = n;
542 for (i = 0; i < n; i++)
543 d->table[i] = data[i];
544
545 root_lock = qdisc_root_sleeping_lock(sch);
546
547 spin_lock_bh(root_lock);
548 dist_free(q->delay_dist);
549 q->delay_dist = d;
550 spin_unlock_bh(root_lock);
551 return 0;
552 }
553
554 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
555 {
556 struct netem_sched_data *q = qdisc_priv(sch);
557 const struct tc_netem_corr *c = nla_data(attr);
558
559 init_crandom(&q->delay_cor, c->delay_corr);
560 init_crandom(&q->loss_cor, c->loss_corr);
561 init_crandom(&q->dup_cor, c->dup_corr);
562 }
563
564 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
565 {
566 struct netem_sched_data *q = qdisc_priv(sch);
567 const struct tc_netem_reorder *r = nla_data(attr);
568
569 q->reorder = r->probability;
570 init_crandom(&q->reorder_cor, r->correlation);
571 }
572
573 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
574 {
575 struct netem_sched_data *q = qdisc_priv(sch);
576 const struct tc_netem_corrupt *r = nla_data(attr);
577
578 q->corrupt = r->probability;
579 init_crandom(&q->corrupt_cor, r->correlation);
580 }
581
582 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
583 {
584 struct netem_sched_data *q = qdisc_priv(sch);
585 const struct tc_netem_rate *r = nla_data(attr);
586
587 q->rate = r->rate;
588 q->packet_overhead = r->packet_overhead;
589 q->cell_size = r->cell_size;
590 if (q->cell_size)
591 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
592 q->cell_overhead = r->cell_overhead;
593 }
594
595 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
596 {
597 struct netem_sched_data *q = qdisc_priv(sch);
598 const struct nlattr *la;
599 int rem;
600
601 nla_for_each_nested(la, attr, rem) {
602 u16 type = nla_type(la);
603
604 switch(type) {
605 case NETEM_LOSS_GI: {
606 const struct tc_netem_gimodel *gi = nla_data(la);
607
608 if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
609 pr_info("netem: incorrect gi model size\n");
610 return -EINVAL;
611 }
612
613 q->loss_model = CLG_4_STATES;
614
615 q->clg.state = 1;
616 q->clg.a1 = gi->p13;
617 q->clg.a2 = gi->p31;
618 q->clg.a3 = gi->p32;
619 q->clg.a4 = gi->p14;
620 q->clg.a5 = gi->p23;
621 break;
622 }
623
624 case NETEM_LOSS_GE: {
625 const struct tc_netem_gemodel *ge = nla_data(la);
626
627 if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
628 pr_info("netem: incorrect gi model size\n");
629 return -EINVAL;
630 }
631
632 q->loss_model = CLG_GILB_ELL;
633 q->clg.state = 1;
634 q->clg.a1 = ge->p;
635 q->clg.a2 = ge->r;
636 q->clg.a3 = ge->h;
637 q->clg.a4 = ge->k1;
638 break;
639 }
640
641 default:
642 pr_info("netem: unknown loss type %u\n", type);
643 return -EINVAL;
644 }
645 }
646
647 return 0;
648 }
649
650 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
651 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
652 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
653 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
654 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
655 [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
656 };
657
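/* netem options start with a struct tc_netem_qopt header for
 * compatibility with older userspace; optional netlink attributes
 * follow it, so parsing must first skip the aligned header.
 */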
658 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
659 const struct nla_policy *policy, int len)
660 {
661 int nested_len = nla_len(nla) - NLA_ALIGN(len);
662
663 if (nested_len < 0) {
664 pr_info("netem: invalid attributes len %d\n", nested_len);
665 return -EINVAL;
666 }
667
668 if (nested_len >= nla_attr_size(0))
669 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
670 nested_len, policy);
671
672 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
673 return 0;
674 }
675
676 /* Parse netlink message to set options */
677 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
678 {
679 struct netem_sched_data *q = qdisc_priv(sch);
680 struct nlattr *tb[TCA_NETEM_MAX + 1];
681 struct tc_netem_qopt *qopt;
682 int ret;
683
684 if (opt == NULL)
685 return -EINVAL;
686
687 qopt = nla_data(opt);
688 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
689 if (ret < 0)
690 return ret;
691
692 ret = fifo_set_limit(q->qdisc, qopt->limit);
693 if (ret) {
694 pr_info("netem: can't set fifo limit\n");
695 return ret;
696 }
697
698 q->latency = qopt->latency;
699 q->jitter = qopt->jitter;
700 q->limit = qopt->limit;
701 q->gap = qopt->gap;
702 q->counter = 0;
703 q->loss = qopt->loss;
704 q->duplicate = qopt->duplicate;
705
706 /* for compatibility with earlier versions.
707 * if gap is set, need to assume 100% probability
708 */
709 if (q->gap)
710 q->reorder = ~0;
711
712 if (tb[TCA_NETEM_CORR])
713 get_correlation(sch, tb[TCA_NETEM_CORR]);
714
715 if (tb[TCA_NETEM_DELAY_DIST]) {
716 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
717 if (ret)
718 return ret;
719 }
720
721 if (tb[TCA_NETEM_REORDER])
722 get_reorder(sch, tb[TCA_NETEM_REORDER]);
723
724 if (tb[TCA_NETEM_CORRUPT])
725 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
726
727 if (tb[TCA_NETEM_RATE])
728 get_rate(sch, tb[TCA_NETEM_RATE]);
729
730 q->loss_model = CLG_RANDOM;
731 if (tb[TCA_NETEM_LOSS])
732 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
733
734 return ret;
735 }
736
737 /*
 * Special case version of FIFO queue for use by netem.
 * It keeps packets ordered by the time_to_send stamp in the skb
 * control block, inserting from the tail since most packets are
 * scheduled after those already queued.
740 */
741 struct fifo_sched_data {
742 u32 limit;
743 psched_time_t oldest;
744 };
745
746 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
747 {
748 struct fifo_sched_data *q = qdisc_priv(sch);
749 struct sk_buff_head *list = &sch->q;
750 psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
751 struct sk_buff *skb;
752
753 if (likely(skb_queue_len(list) < q->limit)) {
754 /* Optimize for add at tail */
755 if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
756 q->oldest = tnext;
757 return qdisc_enqueue_tail(nskb, sch);
758 }
759
760 skb_queue_reverse_walk(list, skb) {
761 const struct netem_skb_cb *cb = netem_skb_cb(skb);
762
763 if (tnext >= cb->time_to_send)
764 break;
765 }
766
767 __skb_queue_after(list, skb, nskb);
768
769 sch->qstats.backlog += qdisc_pkt_len(nskb);
770
771 return NET_XMIT_SUCCESS;
772 }
773
774 return qdisc_reshape_fail(nskb, sch);
775 }
776
777 static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
778 {
779 struct fifo_sched_data *q = qdisc_priv(sch);
780
781 if (opt) {
782 struct tc_fifo_qopt *ctl = nla_data(opt);
783 if (nla_len(opt) < sizeof(*ctl))
784 return -EINVAL;
785
786 q->limit = ctl->limit;
787 } else
788 q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
789
790 q->oldest = PSCHED_PASTPERFECT;
791 return 0;
792 }
793
794 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
795 {
796 struct fifo_sched_data *q = qdisc_priv(sch);
797 struct tc_fifo_qopt opt = { .limit = q->limit };
798
799 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
800 return skb->len;
801
802 nla_put_failure:
803 return -1;
804 }
805
806 static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
807 .id = "tfifo",
808 .priv_size = sizeof(struct fifo_sched_data),
809 .enqueue = tfifo_enqueue,
810 .dequeue = qdisc_dequeue_head,
811 .peek = qdisc_peek_head,
812 .drop = qdisc_queue_drop,
813 .init = tfifo_init,
814 .reset = qdisc_reset_queue,
815 .change = tfifo_init,
816 .dump = tfifo_dump,
817 };
818
819 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
820 {
821 struct netem_sched_data *q = qdisc_priv(sch);
822 int ret;
823
824 if (!opt)
825 return -EINVAL;
826
827 qdisc_watchdog_init(&q->watchdog, sch);
828
829 q->loss_model = CLG_RANDOM;
830 q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
831 TC_H_MAKE(sch->handle, 1));
832 if (!q->qdisc) {
833 pr_notice("netem: qdisc create tfifo qdisc failed\n");
834 return -ENOMEM;
835 }
836
837 ret = netem_change(sch, opt);
838 if (ret) {
839 pr_info("netem: change failed\n");
840 qdisc_destroy(q->qdisc);
841 }
842 return ret;
843 }
844
845 static void netem_destroy(struct Qdisc *sch)
846 {
847 struct netem_sched_data *q = qdisc_priv(sch);
848
849 qdisc_watchdog_cancel(&q->watchdog);
850 qdisc_destroy(q->qdisc);
851 dist_free(q->delay_dist);
852 }
853
854 static int dump_loss_model(const struct netem_sched_data *q,
855 struct sk_buff *skb)
856 {
857 struct nlattr *nest;
858
859 nest = nla_nest_start(skb, TCA_NETEM_LOSS);
860 if (nest == NULL)
861 goto nla_put_failure;
862
863 switch (q->loss_model) {
864 case CLG_RANDOM:
865 /* legacy loss model */
866 nla_nest_cancel(skb, nest);
867 return 0; /* no data */
868
869 case CLG_4_STATES: {
870 struct tc_netem_gimodel gi = {
871 .p13 = q->clg.a1,
872 .p31 = q->clg.a2,
873 .p32 = q->clg.a3,
874 .p14 = q->clg.a4,
875 .p23 = q->clg.a5,
876 };
877
878 NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
879 break;
880 }
881 case CLG_GILB_ELL: {
882 struct tc_netem_gemodel ge = {
883 .p = q->clg.a1,
884 .r = q->clg.a2,
885 .h = q->clg.a3,
886 .k1 = q->clg.a4,
887 };
888
889 NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
890 break;
891 }
892 }
893
894 nla_nest_end(skb, nest);
895 return 0;
896
897 nla_put_failure:
898 nla_nest_cancel(skb, nest);
899 return -1;
900 }
901
902 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
903 {
904 const struct netem_sched_data *q = qdisc_priv(sch);
905 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
906 struct tc_netem_qopt qopt;
907 struct tc_netem_corr cor;
908 struct tc_netem_reorder reorder;
909 struct tc_netem_corrupt corrupt;
910 struct tc_netem_rate rate;
911
912 qopt.latency = q->latency;
913 qopt.jitter = q->jitter;
914 qopt.limit = q->limit;
915 qopt.loss = q->loss;
916 qopt.gap = q->gap;
917 qopt.duplicate = q->duplicate;
918 NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
919
920 cor.delay_corr = q->delay_cor.rho;
921 cor.loss_corr = q->loss_cor.rho;
922 cor.dup_corr = q->dup_cor.rho;
923 NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
924
925 reorder.probability = q->reorder;
926 reorder.correlation = q->reorder_cor.rho;
927 NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
928
929 corrupt.probability = q->corrupt;
930 corrupt.correlation = q->corrupt_cor.rho;
931 NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
932
933 rate.rate = q->rate;
934 rate.packet_overhead = q->packet_overhead;
935 rate.cell_size = q->cell_size;
936 rate.cell_overhead = q->cell_overhead;
937 NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
938
939 if (dump_loss_model(q, skb) != 0)
940 goto nla_put_failure;
941
942 return nla_nest_end(skb, nla);
943
944 nla_put_failure:
945 nlmsg_trim(skb, nla);
946 return -1;
947 }
948
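/* netem exposes a single fixed class (minor 1) so the inner tfifo
 * qdisc can be inspected or replaced (grafted) from userspace.
 */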
949 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
950 struct sk_buff *skb, struct tcmsg *tcm)
951 {
952 struct netem_sched_data *q = qdisc_priv(sch);
953
954 if (cl != 1) /* only one class */
955 return -ENOENT;
956
957 tcm->tcm_handle |= TC_H_MIN(1);
958 tcm->tcm_info = q->qdisc->handle;
959
960 return 0;
961 }
962
963 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
964 struct Qdisc **old)
965 {
966 struct netem_sched_data *q = qdisc_priv(sch);
967
968 if (new == NULL)
969 new = &noop_qdisc;
970
971 sch_tree_lock(sch);
972 *old = q->qdisc;
973 q->qdisc = new;
974 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
975 qdisc_reset(*old);
976 sch_tree_unlock(sch);
977
978 return 0;
979 }
980
981 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
982 {
983 struct netem_sched_data *q = qdisc_priv(sch);
984 return q->qdisc;
985 }
986
987 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
988 {
989 return 1;
990 }
991
992 static void netem_put(struct Qdisc *sch, unsigned long arg)
993 {
994 }
995
996 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
997 {
998 if (!walker->stop) {
999 if (walker->count >= walker->skip)
1000 if (walker->fn(sch, 1, walker) < 0) {
1001 walker->stop = 1;
1002 return;
1003 }
1004 walker->count++;
1005 }
1006 }
1007
1008 static const struct Qdisc_class_ops netem_class_ops = {
1009 .graft = netem_graft,
1010 .leaf = netem_leaf,
1011 .get = netem_get,
1012 .put = netem_put,
1013 .walk = netem_walk,
1014 .dump = netem_dump_class,
1015 };
1016
1017 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1018 .id = "netem",
1019 .cl_ops = &netem_class_ops,
1020 .priv_size = sizeof(struct netem_sched_data),
1021 .enqueue = netem_enqueue,
1022 .dequeue = netem_dequeue,
1023 .peek = qdisc_peek_dequeued,
1024 .drop = netem_drop,
1025 .init = netem_init,
1026 .reset = netem_reset,
1027 .destroy = netem_destroy,
1028 .change = netem_change,
1029 .dump = netem_dump,
1030 .owner = THIS_MODULE,
1031 };
1032
1033
1034 static int __init netem_module_init(void)
1035 {
1036 pr_info("netem: version " VERSION "\n");
1037 return register_qdisc(&netem_qdisc_ops);
1038 }
1039 static void __exit netem_module_exit(void)
1040 {
1041 unregister_qdisc(&netem_qdisc_ops);
1042 }
1043 module_init(netem_module_init)
1044 module_exit(netem_module_exit)
1045 MODULE_LICENSE("GPL");