2 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
4 * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
11 * Meant to be mostly used for localy generated traffic :
12 * Fast classification depends on skb->sk being set before reaching us.
13 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
14 * All packets belonging to a socket are considered as a 'flow'.
16 * Flows are dynamically allocated and stored in a hash table of RB trees
17 * They are also part of one Round Robin 'queues' (new or old flows)
19 * Burst avoidance (aka pacing) capability :
21 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
22 * bunch of packets, and this packet scheduler adds delay between
23 * packets to respect rate limitation.
26 * - lookup one RB tree (out of 1024 or more) to find the flow.
27 * If non existent flow, create it, add it to the tree.
28 * Add skb to the per flow list of skb (fifo).
29 * - Use a special fifo for high prio packets
31 * dequeue() : serves flows in Round Robin
32 * Note : When a flow becomes empty, we do not immediately remove it from
33 * rb trees, for performance reasons (its expected to send additional packets,
34 * or SLAB cache will reuse socket for another flow)
37 #include <linux/module.h>
38 #include <linux/types.h>
39 #include <linux/kernel.h>
40 #include <linux/jiffies.h>
41 #include <linux/string.h>
43 #include <linux/errno.h>
44 #include <linux/init.h>
45 #include <linux/skbuff.h>
46 #include <linux/slab.h>
47 #include <linux/rbtree.h>
48 #include <linux/hash.h>
49 #include <linux/prefetch.h>
50 #include <net/netlink.h>
51 #include <net/pkt_sched.h>
53 #include <net/tcp_states.h>
56 * Per flow structure, dynamically allocated
59 struct sk_buff
*head
; /* list of skbs for this flow : first skb */
61 struct sk_buff
*tail
; /* last skb in the list */
62 unsigned long age
; /* jiffies when flow was emptied, for gc */
64 struct rb_node fq_node
; /* anchor in fq_root[] trees */
66 int qlen
; /* number of packets in flow queue */
68 u32 socket_hash
; /* sk_hash */
69 struct fq_flow
*next
; /* next pointer in RR lists, or &detached */
71 struct rb_node rate_node
; /* anchor in q->delayed tree */
76 struct fq_flow
*first
;
80 struct fq_sched_data
{
81 struct fq_flow_head new_flows
;
83 struct fq_flow_head old_flows
;
85 struct rb_root delayed
; /* for rate limited flows */
86 u64 time_next_delayed_flow
;
88 struct fq_flow internal
; /* for non classified or high prio packets */
91 u32 flow_default_rate
;/* rate per flow : bytes per second */
92 u32 flow_max_rate
; /* optional max rate per flow */
93 u32 flow_plimit
; /* max packets per flow */
94 struct rb_root
*fq_root
;
103 u64 stat_internal_packets
;
104 u64 stat_tcp_retrans
;
106 u64 stat_flows_plimit
;
107 u64 stat_pkts_too_long
;
108 u64 stat_allocation_errors
;
109 struct qdisc_watchdog watchdog
;
112 /* special value to mark a detached flow (not on old/new list) */
113 static struct fq_flow detached
, throttled
;
115 static void fq_flow_set_detached(struct fq_flow
*f
)
120 static bool fq_flow_is_detached(const struct fq_flow
*f
)
122 return f
->next
== &detached
;
125 static void fq_flow_set_throttled(struct fq_sched_data
*q
, struct fq_flow
*f
)
127 struct rb_node
**p
= &q
->delayed
.rb_node
, *parent
= NULL
;
133 aux
= container_of(parent
, struct fq_flow
, rate_node
);
134 if (f
->time_next_packet
>= aux
->time_next_packet
)
135 p
= &parent
->rb_right
;
137 p
= &parent
->rb_left
;
139 rb_link_node(&f
->rate_node
, parent
, p
);
140 rb_insert_color(&f
->rate_node
, &q
->delayed
);
141 q
->throttled_flows
++;
144 f
->next
= &throttled
;
145 if (q
->time_next_delayed_flow
> f
->time_next_packet
)
146 q
->time_next_delayed_flow
= f
->time_next_packet
;
150 static struct kmem_cache
*fq_flow_cachep __read_mostly
;
152 static void fq_flow_add_tail(struct fq_flow_head
*head
, struct fq_flow
*flow
)
155 head
->last
->next
= flow
;
162 /* limit number of collected flows per round */
164 #define FQ_GC_AGE (3*HZ)
166 static bool fq_gc_candidate(const struct fq_flow
*f
)
168 return fq_flow_is_detached(f
) &&
169 time_after(jiffies
, f
->age
+ FQ_GC_AGE
);
172 static void fq_gc(struct fq_sched_data
*q
,
173 struct rb_root
*root
,
176 struct fq_flow
*f
, *tofree
[FQ_GC_MAX
];
177 struct rb_node
**p
, *parent
;
185 f
= container_of(parent
, struct fq_flow
, fq_node
);
189 if (fq_gc_candidate(f
)) {
191 if (fcnt
== FQ_GC_MAX
)
196 p
= &parent
->rb_right
;
198 p
= &parent
->rb_left
;
202 q
->inactive_flows
-= fcnt
;
203 q
->stat_gc_flows
+= fcnt
;
205 struct fq_flow
*f
= tofree
[--fcnt
];
207 rb_erase(&f
->fq_node
, root
);
208 kmem_cache_free(fq_flow_cachep
, f
);
212 static const u8 prio2band
[TC_PRIO_MAX
+ 1] = {
213 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
216 static struct fq_flow
*fq_classify(struct sk_buff
*skb
, struct fq_sched_data
*q
)
218 struct rb_node
**p
, *parent
;
219 struct sock
*sk
= skb
->sk
;
220 struct rb_root
*root
;
224 /* warning: no starvation prevention... */
225 band
= prio2band
[skb
->priority
& TC_PRIO_MAX
];
226 if (unlikely(band
== 0))
230 /* By forcing low order bit to 1, we make sure to not
231 * collide with a local flow (socket pointers are word aligned)
233 sk
= (struct sock
*)(skb_get_rxhash(skb
) | 1L);
236 root
= &q
->fq_root
[hash_32((u32
)(long)sk
, q
->fq_trees_log
)];
238 if (q
->flows
>= (2U << q
->fq_trees_log
) &&
239 q
->inactive_flows
> q
->flows
/2)
247 f
= container_of(parent
, struct fq_flow
, fq_node
);
249 /* socket might have been reallocated, so check
250 * if its sk_hash is the same.
251 * It not, we need to refill credit with
254 if (unlikely(skb
->sk
&&
255 f
->socket_hash
!= sk
->sk_hash
)) {
256 f
->credit
= q
->initial_quantum
;
257 f
->socket_hash
= sk
->sk_hash
;
262 p
= &parent
->rb_right
;
264 p
= &parent
->rb_left
;
267 f
= kmem_cache_zalloc(fq_flow_cachep
, GFP_ATOMIC
| __GFP_NOWARN
);
269 q
->stat_allocation_errors
++;
272 fq_flow_set_detached(f
);
275 f
->socket_hash
= sk
->sk_hash
;
276 f
->credit
= q
->initial_quantum
;
278 rb_link_node(&f
->fq_node
, parent
, p
);
279 rb_insert_color(&f
->fq_node
, root
);
287 /* remove one skb from head of flow queue */
288 static struct sk_buff
*fq_dequeue_head(struct Qdisc
*sch
, struct fq_flow
*flow
)
290 struct sk_buff
*skb
= flow
->head
;
293 flow
->head
= skb
->next
;
296 sch
->qstats
.backlog
-= qdisc_pkt_len(skb
);
302 /* We might add in the future detection of retransmits
303 * For the time being, just return false
305 static bool skb_is_retransmit(struct sk_buff
*skb
)
310 /* add skb to flow queue
311 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
312 * We special case tcp retransmits to be transmitted before other packets.
313 * We rely on fact that TCP retransmits are unlikely, so we do not waste
314 * a separate queue or a pointer.
315 * head-> [retrans pkt 1]
320 * tail-> [ normal pkt 4]
322 static void flow_queue_add(struct fq_flow
*flow
, struct sk_buff
*skb
)
324 struct sk_buff
*prev
, *head
= flow
->head
;
332 if (likely(!skb_is_retransmit(skb
))) {
333 flow
->tail
->next
= skb
;
338 /* This skb is a tcp retransmit,
339 * find the last retrans packet in the queue
342 while (skb_is_retransmit(head
)) {
348 if (!prev
) { /* no rtx packet in queue, become the new head */
349 skb
->next
= flow
->head
;
352 if (prev
== flow
->tail
)
355 skb
->next
= prev
->next
;
360 static int fq_enqueue(struct sk_buff
*skb
, struct Qdisc
*sch
)
362 struct fq_sched_data
*q
= qdisc_priv(sch
);
365 if (unlikely(sch
->q
.qlen
>= sch
->limit
))
366 return qdisc_drop(skb
, sch
);
368 f
= fq_classify(skb
, q
);
369 if (unlikely(f
->qlen
>= q
->flow_plimit
&& f
!= &q
->internal
)) {
370 q
->stat_flows_plimit
++;
371 return qdisc_drop(skb
, sch
);
375 flow_queue_add(f
, skb
);
376 if (skb_is_retransmit(skb
))
377 q
->stat_tcp_retrans
++;
378 sch
->qstats
.backlog
+= qdisc_pkt_len(skb
);
379 if (fq_flow_is_detached(f
)) {
380 fq_flow_add_tail(&q
->new_flows
, f
);
381 if (q
->quantum
> f
->credit
)
382 f
->credit
= q
->quantum
;
384 qdisc_unthrottled(sch
);
386 if (unlikely(f
== &q
->internal
)) {
387 q
->stat_internal_packets
++;
388 qdisc_unthrottled(sch
);
392 return NET_XMIT_SUCCESS
;
395 static void fq_check_throttled(struct fq_sched_data
*q
, u64 now
)
399 if (q
->time_next_delayed_flow
> now
)
402 q
->time_next_delayed_flow
= ~0ULL;
403 while ((p
= rb_first(&q
->delayed
)) != NULL
) {
404 struct fq_flow
*f
= container_of(p
, struct fq_flow
, rate_node
);
406 if (f
->time_next_packet
> now
) {
407 q
->time_next_delayed_flow
= f
->time_next_packet
;
410 rb_erase(p
, &q
->delayed
);
411 q
->throttled_flows
--;
412 fq_flow_add_tail(&q
->old_flows
, f
);
416 static struct sk_buff
*fq_dequeue(struct Qdisc
*sch
)
418 struct fq_sched_data
*q
= qdisc_priv(sch
);
419 u64 now
= ktime_to_ns(ktime_get());
420 struct fq_flow_head
*head
;
425 skb
= fq_dequeue_head(sch
, &q
->internal
);
428 fq_check_throttled(q
, now
);
430 head
= &q
->new_flows
;
432 head
= &q
->old_flows
;
434 if (q
->time_next_delayed_flow
!= ~0ULL)
435 qdisc_watchdog_schedule_ns(&q
->watchdog
,
436 q
->time_next_delayed_flow
);
442 if (f
->credit
<= 0) {
443 f
->credit
+= q
->quantum
;
444 head
->first
= f
->next
;
445 fq_flow_add_tail(&q
->old_flows
, f
);
449 if (unlikely(f
->head
&& now
< f
->time_next_packet
)) {
450 head
->first
= f
->next
;
451 fq_flow_set_throttled(q
, f
);
455 skb
= fq_dequeue_head(sch
, f
);
457 head
->first
= f
->next
;
458 /* force a pass through old_flows to prevent starvation */
459 if ((head
== &q
->new_flows
) && q
->old_flows
.first
) {
460 fq_flow_add_tail(&q
->old_flows
, f
);
462 fq_flow_set_detached(f
);
469 f
->time_next_packet
= now
;
470 f
->credit
-= qdisc_pkt_len(skb
);
472 if (f
->credit
> 0 || !q
->rate_enable
)
475 rate
= q
->flow_max_rate
;
476 if (skb
->sk
&& skb
->sk
->sk_state
!= TCP_TIME_WAIT
)
477 rate
= min(skb
->sk
->sk_pacing_rate
, rate
);
480 u32 plen
= max(qdisc_pkt_len(skb
), q
->quantum
);
481 u64 len
= (u64
)plen
* NSEC_PER_SEC
;
485 /* Since socket rate can change later,
486 * clamp the delay to 125 ms.
487 * TODO: maybe segment the too big skb, as in commit
488 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
490 if (unlikely(len
> 125 * NSEC_PER_MSEC
)) {
491 len
= 125 * NSEC_PER_MSEC
;
492 q
->stat_pkts_too_long
++;
495 f
->time_next_packet
= now
+ len
;
498 qdisc_bstats_update(sch
, skb
);
499 qdisc_unthrottled(sch
);
503 static void fq_reset(struct Qdisc
*sch
)
505 struct fq_sched_data
*q
= qdisc_priv(sch
);
506 struct rb_root
*root
;
512 while ((skb
= fq_dequeue_head(sch
, &q
->internal
)) != NULL
)
518 for (idx
= 0; idx
< (1U << q
->fq_trees_log
); idx
++) {
519 root
= &q
->fq_root
[idx
];
520 while ((p
= rb_first(root
)) != NULL
) {
521 f
= container_of(p
, struct fq_flow
, fq_node
);
524 while ((skb
= fq_dequeue_head(sch
, f
)) != NULL
)
527 kmem_cache_free(fq_flow_cachep
, f
);
530 q
->new_flows
.first
= NULL
;
531 q
->old_flows
.first
= NULL
;
532 q
->delayed
= RB_ROOT
;
534 q
->inactive_flows
= 0;
535 q
->throttled_flows
= 0;
538 static void fq_rehash(struct fq_sched_data
*q
,
539 struct rb_root
*old_array
, u32 old_log
,
540 struct rb_root
*new_array
, u32 new_log
)
542 struct rb_node
*op
, **np
, *parent
;
543 struct rb_root
*oroot
, *nroot
;
544 struct fq_flow
*of
, *nf
;
548 for (idx
= 0; idx
< (1U << old_log
); idx
++) {
549 oroot
= &old_array
[idx
];
550 while ((op
= rb_first(oroot
)) != NULL
) {
552 of
= container_of(op
, struct fq_flow
, fq_node
);
553 if (fq_gc_candidate(of
)) {
555 kmem_cache_free(fq_flow_cachep
, of
);
558 nroot
= &new_array
[hash_32((u32
)(long)of
->sk
, new_log
)];
560 np
= &nroot
->rb_node
;
565 nf
= container_of(parent
, struct fq_flow
, fq_node
);
566 BUG_ON(nf
->sk
== of
->sk
);
569 np
= &parent
->rb_right
;
571 np
= &parent
->rb_left
;
574 rb_link_node(&of
->fq_node
, parent
, np
);
575 rb_insert_color(&of
->fq_node
, nroot
);
579 q
->inactive_flows
-= fcnt
;
580 q
->stat_gc_flows
+= fcnt
;
583 static int fq_resize(struct fq_sched_data
*q
, u32 log
)
585 struct rb_root
*array
;
588 if (q
->fq_root
&& log
== q
->fq_trees_log
)
591 array
= kmalloc(sizeof(struct rb_root
) << log
, GFP_KERNEL
);
595 for (idx
= 0; idx
< (1U << log
); idx
++)
596 array
[idx
] = RB_ROOT
;
599 fq_rehash(q
, q
->fq_root
, q
->fq_trees_log
, array
, log
);
603 q
->fq_trees_log
= log
;
608 static const struct nla_policy fq_policy
[TCA_FQ_MAX
+ 1] = {
609 [TCA_FQ_PLIMIT
] = { .type
= NLA_U32
},
610 [TCA_FQ_FLOW_PLIMIT
] = { .type
= NLA_U32
},
611 [TCA_FQ_QUANTUM
] = { .type
= NLA_U32
},
612 [TCA_FQ_INITIAL_QUANTUM
] = { .type
= NLA_U32
},
613 [TCA_FQ_RATE_ENABLE
] = { .type
= NLA_U32
},
614 [TCA_FQ_FLOW_DEFAULT_RATE
] = { .type
= NLA_U32
},
615 [TCA_FQ_FLOW_MAX_RATE
] = { .type
= NLA_U32
},
616 [TCA_FQ_BUCKETS_LOG
] = { .type
= NLA_U32
},
619 static int fq_change(struct Qdisc
*sch
, struct nlattr
*opt
)
621 struct fq_sched_data
*q
= qdisc_priv(sch
);
622 struct nlattr
*tb
[TCA_FQ_MAX
+ 1];
623 int err
, drop_count
= 0;
629 err
= nla_parse_nested(tb
, TCA_FQ_MAX
, opt
, fq_policy
);
635 fq_log
= q
->fq_trees_log
;
637 if (tb
[TCA_FQ_BUCKETS_LOG
]) {
638 u32 nval
= nla_get_u32(tb
[TCA_FQ_BUCKETS_LOG
]);
640 if (nval
>= 1 && nval
<= ilog2(256*1024))
645 if (tb
[TCA_FQ_PLIMIT
])
646 sch
->limit
= nla_get_u32(tb
[TCA_FQ_PLIMIT
]);
648 if (tb
[TCA_FQ_FLOW_PLIMIT
])
649 q
->flow_plimit
= nla_get_u32(tb
[TCA_FQ_FLOW_PLIMIT
]);
651 if (tb
[TCA_FQ_QUANTUM
])
652 q
->quantum
= nla_get_u32(tb
[TCA_FQ_QUANTUM
]);
654 if (tb
[TCA_FQ_INITIAL_QUANTUM
])
655 q
->initial_quantum
= nla_get_u32(tb
[TCA_FQ_INITIAL_QUANTUM
]);
657 if (tb
[TCA_FQ_FLOW_DEFAULT_RATE
])
658 q
->flow_default_rate
= nla_get_u32(tb
[TCA_FQ_FLOW_DEFAULT_RATE
]);
660 if (tb
[TCA_FQ_FLOW_MAX_RATE
])
661 q
->flow_max_rate
= nla_get_u32(tb
[TCA_FQ_FLOW_MAX_RATE
]);
663 if (tb
[TCA_FQ_RATE_ENABLE
]) {
664 u32 enable
= nla_get_u32(tb
[TCA_FQ_RATE_ENABLE
]);
667 q
->rate_enable
= enable
;
673 err
= fq_resize(q
, fq_log
);
675 while (sch
->q
.qlen
> sch
->limit
) {
676 struct sk_buff
*skb
= fq_dequeue(sch
);
683 qdisc_tree_decrease_qlen(sch
, drop_count
);
685 sch_tree_unlock(sch
);
689 static void fq_destroy(struct Qdisc
*sch
)
691 struct fq_sched_data
*q
= qdisc_priv(sch
);
695 qdisc_watchdog_cancel(&q
->watchdog
);
698 static int fq_init(struct Qdisc
*sch
, struct nlattr
*opt
)
700 struct fq_sched_data
*q
= qdisc_priv(sch
);
704 q
->flow_plimit
= 100;
705 q
->quantum
= 2 * psched_mtu(qdisc_dev(sch
));
706 q
->initial_quantum
= 10 * psched_mtu(qdisc_dev(sch
));
707 q
->flow_default_rate
= 0;
708 q
->flow_max_rate
= ~0U;
710 q
->new_flows
.first
= NULL
;
711 q
->old_flows
.first
= NULL
;
712 q
->delayed
= RB_ROOT
;
714 q
->fq_trees_log
= ilog2(1024);
715 qdisc_watchdog_init(&q
->watchdog
, sch
);
718 err
= fq_change(sch
, opt
);
720 err
= fq_resize(q
, q
->fq_trees_log
);
725 static int fq_dump(struct Qdisc
*sch
, struct sk_buff
*skb
)
727 struct fq_sched_data
*q
= qdisc_priv(sch
);
730 opts
= nla_nest_start(skb
, TCA_OPTIONS
);
732 goto nla_put_failure
;
734 /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore,
735 * do not bother giving its value
737 if (nla_put_u32(skb
, TCA_FQ_PLIMIT
, sch
->limit
) ||
738 nla_put_u32(skb
, TCA_FQ_FLOW_PLIMIT
, q
->flow_plimit
) ||
739 nla_put_u32(skb
, TCA_FQ_QUANTUM
, q
->quantum
) ||
740 nla_put_u32(skb
, TCA_FQ_INITIAL_QUANTUM
, q
->initial_quantum
) ||
741 nla_put_u32(skb
, TCA_FQ_RATE_ENABLE
, q
->rate_enable
) ||
742 nla_put_u32(skb
, TCA_FQ_FLOW_MAX_RATE
, q
->flow_max_rate
) ||
743 nla_put_u32(skb
, TCA_FQ_BUCKETS_LOG
, q
->fq_trees_log
))
744 goto nla_put_failure
;
746 nla_nest_end(skb
, opts
);
753 static int fq_dump_stats(struct Qdisc
*sch
, struct gnet_dump
*d
)
755 struct fq_sched_data
*q
= qdisc_priv(sch
);
756 u64 now
= ktime_to_ns(ktime_get());
757 struct tc_fq_qd_stats st
= {
758 .gc_flows
= q
->stat_gc_flows
,
759 .highprio_packets
= q
->stat_internal_packets
,
760 .tcp_retrans
= q
->stat_tcp_retrans
,
761 .throttled
= q
->stat_throttled
,
762 .flows_plimit
= q
->stat_flows_plimit
,
763 .pkts_too_long
= q
->stat_pkts_too_long
,
764 .allocation_errors
= q
->stat_allocation_errors
,
766 .inactive_flows
= q
->inactive_flows
,
767 .throttled_flows
= q
->throttled_flows
,
768 .time_next_delayed_flow
= q
->time_next_delayed_flow
- now
,
771 return gnet_stats_copy_app(d
, &st
, sizeof(st
));
774 static struct Qdisc_ops fq_qdisc_ops __read_mostly
= {
776 .priv_size
= sizeof(struct fq_sched_data
),
778 .enqueue
= fq_enqueue
,
779 .dequeue
= fq_dequeue
,
780 .peek
= qdisc_peek_dequeued
,
783 .destroy
= fq_destroy
,
786 .dump_stats
= fq_dump_stats
,
787 .owner
= THIS_MODULE
,
790 static int __init
fq_module_init(void)
794 fq_flow_cachep
= kmem_cache_create("fq_flow_cache",
795 sizeof(struct fq_flow
),
800 ret
= register_qdisc(&fq_qdisc_ops
);
802 kmem_cache_destroy(fq_flow_cachep
);
806 static void __exit
fq_module_exit(void)
808 unregister_qdisc(&fq_qdisc_ops
);
809 kmem_cache_destroy(fq_flow_cachep
);
812 module_init(fq_module_init
)
813 module_exit(fq_module_exit
)
814 MODULE_AUTHOR("Eric Dumazet");
815 MODULE_LICENSE("GPL");