/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.3"

/* Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
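
/* Typical configurations are set up from user space with the tc tool from
 * iproute2, for example:
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem loss 0.3% duplicate 1%
 *	tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50%
 *
 * The exact option syntax depends on the installed tc/iproute2 version;
 * see the tc-netem(8) documentation shipped with it.
 */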

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u32 rate;
	s32 packet_overhead;
	u32 cell_size;
	u32 cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
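/* Rough sketch of the arithmetic below: with rho treated as a fraction of
 * 2^32, the update computes approximately
 *
 *	answer = ((2^32 - rho) * new_random + rho * last) >> 32
 *
 * so rho == 0 gives an uncorrelated stream, and rho close to 2^32 makes
 * each value track the previous one.
 */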
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = net_random();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
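	/* a1..a5 hold the transition probabilities scaled to the u32 range
	 * used by net_random() (0 = never, ~0U = practically always); they
	 * are filled in from the tc_netem_gimodel attribute in
	 * get_loss_clg() below.
	 */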
	switch (clg->state) {
	case 1:
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1) {
			clg->state = 3;
			return true;
		} else if (clg->a1 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}
		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
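/* With the tc front end this model is typically configured as
 * "loss gemodel p [ r [ 1-h [ 1-k ]]]"; the four values end up in
 * a1..a4 via get_loss_clg(). The exact syntax and units depend on the
 * installed iproute2, so check its tc-netem documentation.
 */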
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
		break;
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		if (clg->a3 > net_random())
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (also used for the GI model):
		 * the Markov 4-state loss generator decides whether the
		 * next packet is transmitted or dropped.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * the Gilbert-Elliot loss generator decides whether the
		 * next packet is transmitted or dropped.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma. Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
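/* Table entries t are samples of the target distribution scaled by
 * NETEM_DIST_SCALE, so the value returned below works out to roughly
 *
 *	mu + (t * sigma) / NETEM_DIST_SCALE
 *
 * computed as separate quotient and remainder parts to limit overflow.
 */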
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

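/* Convert a packet length into the time needed to transmit it at the
 * configured rate: add the per-packet overhead, optionally round up to
 * whole cells (e.g. ATM) plus per-cell overhead, then divide by the rate.
 */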
static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
{
	u64 ticks;

	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	ticks = (u64)len * NSEC_PER_SEC;

	do_div(ticks, q->rate);
	return PSCHED_NS2TICKS(ticks);
}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();

		if (q->rate) {
			struct sk_buff_head *list = &q->qdisc->q;

			delay += packet_len_2_sched_time(skb->len, q);

			if (!skb_queue_empty(list)) {
				/*
				 * Last packet in queue is reference point (now).
				 * First packet in queue is already in flight,
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
			}
		}

		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret)) {
			sch->qstats.drops++;
			return ret;
		}
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

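/* Dequeue hands back the head packet once its time_to_send has been
 * reached; otherwise the qdisc watchdog is armed so dequeue is retried
 * when the packet is due.
 */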
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (qdisc_is_throttled(sch))
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* is it time to send this packet? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (d) {
		if (is_vmalloc_addr(d))
			vfree(d);
		else
			kfree(d);
	}
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
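/* In practice these tables are generated from iproute2's distribution
 * files (e.g. normal, pareto, paretonormal) and selected with something
 * like "tc qdisc ... netem delay 100ms 20ms distribution normal"; the
 * available names depend on the installed iproute2 package.
 */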
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(q->delay_dist, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	q->cell_overhead = r->cell_overhead;
}

static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
};

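/* netem's TCA_OPTIONS payload starts with a fixed struct tc_netem_qopt,
 * optionally followed by further netlink attributes, so parsing has to
 * skip past the aligned qopt header first.
 */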
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_info("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(sch, tb[TCA_NETEM_RATE]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}

/*
 * Special case version of FIFO queue for use by netem.
 * It queues packets in order of their time_to_send timestamps.
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};

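/* tfifo_enqueue keeps the internal queue sorted by time_to_send, scanning
 * from the tail since most packets are due last.
 */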
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);
		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_notice("netem: qdisc create tfifo qdisc failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_info("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	rate.rate = q->rate;
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");