fib_trie: Merge leaf into tnode
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
bc3b2d7f 74#include <linux/export.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
64c9b6fb
AD
90#define IS_TNODE(n) ((n)->bits)
91#define IS_LEAF(n) (!(n)->bits)
2373ce1c 92
64c9b6fb
AD
93struct tnode {
94 t_key key;
95 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
96 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
97 struct tnode __rcu *parent;
37fd30f2 98 struct rcu_head rcu;
adaf9816
AD
99 union {
100 /* The fields in this struct are valid if bits > 0 (TNODE) */
101 struct {
102 unsigned int full_children; /* KEYLENGTH bits needed */
103 unsigned int empty_children; /* KEYLENGTH bits needed */
104 struct tnode __rcu *child[0];
105 };
106 /* This list pointer if valid if bits == 0 (LEAF) */
107 struct hlist_head list;
108 };
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
113 int plen;
5c74501f 114 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 115 struct list_head falh;
5c74501f 116 struct rcu_head rcu;
19baf839
RO
117};
118
19baf839
RO
119#ifdef CONFIG_IP_FIB_TRIE_STATS
120struct trie_use_stats {
121 unsigned int gets;
122 unsigned int backtrack;
123 unsigned int semantic_match_passed;
124 unsigned int semantic_match_miss;
125 unsigned int null_node_hit;
2f36895a 126 unsigned int resize_node_skipped;
19baf839
RO
127};
128#endif
129
130struct trie_stat {
131 unsigned int totdepth;
132 unsigned int maxdepth;
133 unsigned int tnodes;
134 unsigned int leaves;
135 unsigned int nullpointers;
93672292 136 unsigned int prefixes;
06ef921d 137 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 138};
19baf839
RO
139
140struct trie {
adaf9816 141 struct tnode __rcu *trie;
19baf839 142#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 143 struct trie_use_stats __percpu *stats;
19baf839 144#endif
19baf839
RO
145};
146
adaf9816 147static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 148 int wasfull);
adaf9816 149static struct tnode *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
150static struct tnode *inflate(struct trie *t, struct tnode *tn);
151static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c 152/* tnodes to free after resize(); protected by RTNL */
37fd30f2 153static struct callback_head *tnode_free_head;
c3059477
JP
154static size_t tnode_free_size;
155
156/*
157 * synchronize_rcu after call_rcu for that many pages; it should be especially
158 * useful before resizing the root node with PREEMPT_NONE configs; the value was
159 * obtained experimentally, aiming to avoid visible slowdown.
160 */
161static const int sync_pages = 128;
19baf839 162
e18b890b 163static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 164static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 165
64c9b6fb
AD
166/* caller must hold RTNL */
167#define node_parent(n) rtnl_dereference((n)->parent)
0a5c0475 168
64c9b6fb
AD
169/* caller must hold RCU read lock or RTNL */
170#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
0a5c0475 171
64c9b6fb 172/* wrapper for rcu_assign_pointer */
adaf9816 173static inline void node_set_parent(struct tnode *n, struct tnode *tp)
b59cfbf7 174{
adaf9816
AD
175 if (n)
176 rcu_assign_pointer(n->parent, tp);
06801916
SH
177}
178
64c9b6fb
AD
179#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
180
181/* This provides us with the number of children in this node, in the case of a
182 * leaf this will return 0 meaning none of the children are accessible.
6440cc9e 183 */
64c9b6fb 184static inline int tnode_child_length(const struct tnode *tn)
06801916 185{
64c9b6fb 186 return (1ul << tn->bits) & ~(1ul);
06801916 187}
2373ce1c 188
0a5c0475
ED
189/*
190 * caller must hold RTNL
191 */
adaf9816 192static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7 193{
64c9b6fb 194 BUG_ON(i >= tnode_child_length(tn));
2373ce1c 195
0a5c0475 196 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
197}
198
0a5c0475
ED
199/*
200 * caller must hold RCU read lock or RTNL
201 */
adaf9816 202static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 203{
64c9b6fb 204 BUG_ON(i >= tnode_child_length(tn));
19baf839 205
0a5c0475 206 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
207}
208
3b004569 209static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
210{
211 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
212}
213
3b004569 214static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 215{
91b9a277 216 if (offset < KEYLENGTH)
19baf839 217 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 218 else
19baf839
RO
219 return 0;
220}
221
222static inline int tkey_equals(t_key a, t_key b)
223{
c877efb2 224 return a == b;
19baf839
RO
225}
226
227static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
228{
c877efb2
SH
229 if (bits == 0 || offset >= KEYLENGTH)
230 return 1;
91b9a277
OJ
231 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
232 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 233}
19baf839
RO
234
235static inline int tkey_mismatch(t_key a, int offset, t_key b)
236{
237 t_key diff = a ^ b;
238 int i = offset;
239
c877efb2
SH
240 if (!diff)
241 return 0;
242 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
243 i++;
244 return i;
245}
246
19baf839 247/*
e905a9ed
YH
248 To understand this stuff, an understanding of keys and all their bits is
249 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
250 all of the bits in that key are significant.
251
252 Consider a node 'n' and its parent 'tp'.
253
e905a9ed
YH
254 If n is a leaf, every bit in its key is significant. Its presence is
255 necessitated by path compression, since during a tree traversal (when
256 searching for a leaf - unless we are doing an insertion) we will completely
257 ignore all skipped bits we encounter. Thus we need to verify, at the end of
258 a potentially successful search, that we have indeed been walking the
19baf839
RO
259 correct key path.
260
e905a9ed
YH
261 Note that we can never "miss" the correct key in the tree if present by
262 following the wrong path. Path compression ensures that segments of the key
263 that are the same for all keys with a given prefix are skipped, but the
264 skipped part *is* identical for each node in the subtrie below the skipped
265 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
266 call to tkey_sub_equals() in trie_insert().
267
e905a9ed 268 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
269 have many different meanings.
270
e905a9ed 271 Example:
19baf839
RO
272 _________________________________________________________________
273 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
274 -----------------------------------------------------------------
e905a9ed 275 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
276
277 _________________________________________________________________
278 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
279 -----------------------------------------------------------------
280 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
281
282 tp->pos = 7
283 tp->bits = 3
284 n->pos = 15
91b9a277 285 n->bits = 4
19baf839 286
e905a9ed
YH
287 First, let's just ignore the bits that come before the parent tp, that is
288 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
289 not use them for anything.
290
291 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 292 index into the parent's child array. That is, they will be used to find
19baf839
RO
293 'n' among tp's children.
294
295 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
296 for the node n.
297
e905a9ed 298 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
299 of the bits are really not needed or indeed known in n->key.
300
e905a9ed 301 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 302 n's child array, and will of course be different for each child.
e905a9ed 303
c877efb2 304
19baf839
RO
305 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
306 at this point.
307
308*/
309
f5026fab
DL
310static const int halve_threshold = 25;
311static const int inflate_threshold = 50;
345aa031 312static const int halve_threshold_root = 15;
80b71b80 313static const int inflate_threshold_root = 30;
2373ce1c
RO
314
315static void __alias_free_mem(struct rcu_head *head)
19baf839 316{
2373ce1c
RO
317 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
318 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
319}
320
2373ce1c 321static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 322{
2373ce1c
RO
323 call_rcu(&fa->rcu, __alias_free_mem);
324}
91b9a277 325
37fd30f2 326#define TNODE_KMALLOC_MAX \
adaf9816 327 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
91b9a277 328
37fd30f2 329static void __node_free_rcu(struct rcu_head *head)
387a5487 330{
adaf9816 331 struct tnode *n = container_of(head, struct tnode, rcu);
37fd30f2
AD
332
333 if (IS_LEAF(n))
334 kmem_cache_free(trie_leaf_kmem, n);
335 else if (n->bits <= TNODE_KMALLOC_MAX)
336 kfree(n);
337 else
338 vfree(n);
387a5487
SH
339}
340
37fd30f2
AD
341#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
342
2373ce1c 343static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 344{
bceb0f45 345 kfree_rcu(leaf, rcu);
19baf839
RO
346}
347
8d965444 348static struct tnode *tnode_alloc(size_t size)
f0e36f8c 349{
2373ce1c 350 if (size <= PAGE_SIZE)
8d965444 351 return kzalloc(size, GFP_KERNEL);
15be75cd 352 else
7a1c8e5a 353 return vzalloc(size);
15be75cd 354}
2373ce1c 355
e0f7cb8c
JP
356static void tnode_free_safe(struct tnode *tn)
357{
358 BUG_ON(IS_LEAF(tn));
37fd30f2
AD
359 tn->rcu.next = tnode_free_head;
360 tnode_free_head = &tn->rcu;
e0f7cb8c
JP
361}
362
363static void tnode_free_flush(void)
364{
37fd30f2
AD
365 struct callback_head *head;
366
367 while ((head = tnode_free_head)) {
368 struct tnode *tn = container_of(head, struct tnode, rcu);
369
370 tnode_free_head = head->next;
371 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
e0f7cb8c 372
37fd30f2 373 node_free(tn);
e0f7cb8c 374 }
c3059477
JP
375
376 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
377 tnode_free_size = 0;
378 synchronize_rcu();
379 }
e0f7cb8c
JP
380}
381
adaf9816 382static struct tnode *leaf_new(t_key key)
2373ce1c 383{
adaf9816 384 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c 385 if (l) {
64c9b6fb
AD
386 l->parent = NULL;
387 /* set key and pos to reflect full key value
388 * any trailing zeros in the key should be ignored
389 * as the nodes are searched
390 */
391 l->key = key;
392 l->pos = KEYLENGTH;
393 /* set bits to 0 indicating we are not a tnode */
394 l->bits = 0;
395
2373ce1c
RO
396 INIT_HLIST_HEAD(&l->list);
397 }
398 return l;
399}
400
401static struct leaf_info *leaf_info_new(int plen)
402{
403 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
404 if (li) {
405 li->plen = plen;
5c74501f 406 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
407 INIT_LIST_HEAD(&li->falh);
408 }
409 return li;
410}
411
a07f5f50 412static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 413{
37fd30f2 414 size_t sz = offsetof(struct tnode, child[1 << bits]);
f0e36f8c 415 struct tnode *tn = tnode_alloc(sz);
64c9b6fb
AD
416 unsigned int shift = pos + bits;
417
418 /* verify bits and pos their msb bits clear and values are valid */
419 BUG_ON(!bits || (shift > KEYLENGTH));
19baf839 420
91b9a277 421 if (tn) {
64c9b6fb 422 tn->parent = NULL;
19baf839
RO
423 tn->pos = pos;
424 tn->bits = bits;
64c9b6fb 425 tn->key = mask_pfx(key, pos);
19baf839
RO
426 tn->full_children = 0;
427 tn->empty_children = 1<<bits;
428 }
c877efb2 429
a034ee3c 430 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
adaf9816 431 sizeof(struct tnode *) << bits);
19baf839
RO
432 return tn;
433}
434
19baf839
RO
435/*
436 * Check whether a tnode 'n' is "full", i.e. it is an internal node
437 * and no bits are skipped. See discussion in dyntree paper p. 6
438 */
439
adaf9816 440static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
19baf839 441{
64c9b6fb 442 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
19baf839
RO
443}
444
61648d91 445static inline void put_child(struct tnode *tn, int i,
adaf9816 446 struct tnode *n)
19baf839
RO
447{
448 tnode_put_child_reorg(tn, i, n, -1);
449}
450
c877efb2 451 /*
19baf839
RO
452 * Add a child at position i overwriting the old value.
453 * Update the value of full_children and empty_children.
454 */
455
adaf9816 456static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 457 int wasfull)
19baf839 458{
adaf9816 459 struct tnode *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
460 int isfull;
461
0c7770c7
SH
462 BUG_ON(i >= 1<<tn->bits);
463
19baf839
RO
464 /* update emptyChildren */
465 if (n == NULL && chi != NULL)
466 tn->empty_children++;
467 else if (n != NULL && chi == NULL)
468 tn->empty_children--;
c877efb2 469
19baf839 470 /* update fullChildren */
91b9a277 471 if (wasfull == -1)
19baf839
RO
472 wasfull = tnode_full(tn, chi);
473
474 isfull = tnode_full(tn, n);
c877efb2 475 if (wasfull && !isfull)
19baf839 476 tn->full_children--;
c877efb2 477 else if (!wasfull && isfull)
19baf839 478 tn->full_children++;
91b9a277 479
64c9b6fb 480 node_set_parent(n, tn);
19baf839 481
cf778b00 482 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
483}
484
80b71b80 485#define MAX_WORK 10
adaf9816 486static struct tnode *resize(struct trie *t, struct tnode *tn)
19baf839 487{
adaf9816 488 struct tnode *old_tn, *n = NULL;
e6308be8
RO
489 int inflate_threshold_use;
490 int halve_threshold_use;
80b71b80 491 int max_work;
19baf839 492
e905a9ed 493 if (!tn)
19baf839
RO
494 return NULL;
495
0c7770c7
SH
496 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
497 tn, inflate_threshold, halve_threshold);
19baf839
RO
498
499 /* No children */
64c9b6fb
AD
500 if (tn->empty_children > (tnode_child_length(tn) - 1))
501 goto no_children;
502
19baf839 503 /* One child */
64c9b6fb 504 if (tn->empty_children == (tnode_child_length(tn) - 1))
80b71b80 505 goto one_child;
c877efb2 506 /*
19baf839
RO
507 * Double as long as the resulting node has a number of
508 * nonempty nodes that are above the threshold.
509 */
510
511 /*
c877efb2
SH
512 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
513 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 514 * Telecommunications, page 6:
c877efb2 515 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
516 * children in the *doubled* node is at least 'high'."
517 *
c877efb2
SH
518 * 'high' in this instance is the variable 'inflate_threshold'. It
519 * is expressed as a percentage, so we multiply it with
520 * tnode_child_length() and instead of multiplying by 2 (since the
521 * child array will be doubled by inflate()) and multiplying
522 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 523 * multiply the left-hand side by 50.
c877efb2
SH
524 *
525 * The left-hand side may look a bit weird: tnode_child_length(tn)
526 * - tn->empty_children is of course the number of non-null children
527 * in the current node. tn->full_children is the number of "full"
19baf839 528 * children, that is non-null tnodes with a skip value of 0.
c877efb2 529 * All of those will be doubled in the resulting inflated tnode, so
19baf839 530 * we just count them one extra time here.
c877efb2 531 *
19baf839 532 * A clearer way to write this would be:
c877efb2 533 *
19baf839 534 * to_be_doubled = tn->full_children;
c877efb2 535 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
536 * tn->full_children;
537 *
538 * new_child_length = tnode_child_length(tn) * 2;
539 *
c877efb2 540 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
541 * new_child_length;
542 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
543 *
544 * ...and so on, tho it would mess up the while () loop.
545 *
19baf839
RO
546 * anyway,
547 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
548 * inflate_threshold
c877efb2 549 *
19baf839
RO
550 * avoid a division:
551 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
552 * inflate_threshold * new_child_length
c877efb2 553 *
19baf839 554 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 555 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 556 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 557 *
19baf839 558 * expand new_child_length:
c877efb2 559 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 560 * tn->full_children) >=
19baf839 561 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 562 *
19baf839 563 * shorten again:
c877efb2 564 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 565 * tn->empty_children) >= inflate_threshold *
19baf839 566 * tnode_child_length(tn)
c877efb2 567 *
19baf839
RO
568 */
569
e6308be8
RO
570 /* Keep root node larger */
571
64c9b6fb 572 if (!node_parent(tn)) {
80b71b80
JL
573 inflate_threshold_use = inflate_threshold_root;
574 halve_threshold_use = halve_threshold_root;
a034ee3c 575 } else {
e6308be8 576 inflate_threshold_use = inflate_threshold;
80b71b80
JL
577 halve_threshold_use = halve_threshold;
578 }
e6308be8 579
80b71b80
JL
580 max_work = MAX_WORK;
581 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
582 50 * (tn->full_children + tnode_child_length(tn)
583 - tn->empty_children)
584 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 585
2f80b3c8
RO
586 old_tn = tn;
587 tn = inflate(t, tn);
a07f5f50 588
2f80b3c8
RO
589 if (IS_ERR(tn)) {
590 tn = old_tn;
2f36895a 591#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 592 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
593#endif
594 break;
595 }
19baf839
RO
596 }
597
80b71b80 598 /* Return if at least one inflate is run */
a034ee3c 599 if (max_work != MAX_WORK)
adaf9816 600 return tn;
80b71b80 601
19baf839
RO
602 /*
603 * Halve as long as the number of empty children in this
604 * node is above threshold.
605 */
2f36895a 606
80b71b80
JL
607 max_work = MAX_WORK;
608 while (tn->bits > 1 && max_work-- &&
19baf839 609 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 610 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 611
2f80b3c8
RO
612 old_tn = tn;
613 tn = halve(t, tn);
614 if (IS_ERR(tn)) {
615 tn = old_tn;
2f36895a 616#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 617 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
618#endif
619 break;
620 }
621 }
19baf839 622
c877efb2 623
19baf839 624 /* Only one child remains */
64c9b6fb
AD
625 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
626 unsigned long i;
80b71b80 627one_child:
64c9b6fb
AD
628 for (i = tnode_child_length(tn); !n && i;)
629 n = tnode_get_child(tn, --i);
630no_children:
631 /* compress one level */
632 node_set_parent(n, NULL);
633 tnode_free_safe(tn);
634 return n;
80b71b80 635 }
adaf9816 636 return tn;
19baf839
RO
637}
638
0a5c0475
ED
639
640static void tnode_clean_free(struct tnode *tn)
641{
adaf9816 642 struct tnode *tofree;
0a5c0475 643 int i;
0a5c0475
ED
644
645 for (i = 0; i < tnode_child_length(tn); i++) {
37fd30f2 646 tofree = rtnl_dereference(tn->child[i]);
0a5c0475 647 if (tofree)
37fd30f2 648 node_free(tofree);
0a5c0475 649 }
37fd30f2 650 node_free(tn);
0a5c0475
ED
651}
652
adaf9816 653static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
19baf839 654{
adaf9816
AD
655 int olen = tnode_child_length(oldtnode);
656 struct tnode *tn;
19baf839
RO
657 int i;
658
0c7770c7 659 pr_debug("In inflate\n");
19baf839
RO
660
661 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
662
0c7770c7 663 if (!tn)
2f80b3c8 664 return ERR_PTR(-ENOMEM);
2f36895a
RO
665
666 /*
c877efb2
SH
667 * Preallocate and store tnodes before the actual work so we
668 * don't get into an inconsistent state if memory allocation
669 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
670 * of tnode is ignored.
671 */
91b9a277
OJ
672
673 for (i = 0; i < olen; i++) {
a07f5f50 674 struct tnode *inode;
2f36895a 675
adaf9816
AD
676 inode = tnode_get_child(oldtnode, i);
677 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
2f36895a 678 struct tnode *left, *right;
ab66b4a7 679 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 680
2f36895a
RO
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
2f80b3c8
RO
683 if (!left)
684 goto nomem;
91b9a277 685
2f36895a
RO
686 right = tnode_new(inode->key|m, inode->pos + 1,
687 inode->bits - 1);
688
e905a9ed 689 if (!right) {
37fd30f2 690 node_free(left);
2f80b3c8 691 goto nomem;
e905a9ed 692 }
2f36895a 693
adaf9816
AD
694 put_child(tn, 2*i, left);
695 put_child(tn, 2*i+1, right);
2f36895a
RO
696 }
697 }
698
91b9a277 699 for (i = 0; i < olen; i++) {
adaf9816 700 struct tnode *inode = tnode_get_child(oldtnode, i);
91b9a277
OJ
701 struct tnode *left, *right;
702 int size, j;
c877efb2 703
19baf839 704 /* An empty child */
adaf9816 705 if (inode == NULL)
19baf839
RO
706 continue;
707
708 /* A leaf or an internal node with skipped bits */
adaf9816 709 if (!tnode_full(oldtnode, inode)) {
bbe34cf8 710 put_child(tn,
adaf9816
AD
711 tkey_extract_bits(inode->key, tn->pos, tn->bits),
712 inode);
19baf839
RO
713 continue;
714 }
715
716 /* An internal node with two children */
19baf839 717 if (inode->bits == 1) {
61648d91
LM
718 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
719 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 720
e0f7cb8c 721 tnode_free_safe(inode);
91b9a277 722 continue;
19baf839
RO
723 }
724
91b9a277
OJ
725 /* An internal node with more than two children */
726
727 /* We will replace this node 'inode' with two new
728 * ones, 'left' and 'right', each with half of the
729 * original children. The two new nodes will have
730 * a position one bit further down the key and this
731 * means that the "significant" part of their keys
732 * (see the discussion near the top of this file)
733 * will differ by one bit, which will be "0" in
734 * left's key and "1" in right's key. Since we are
735 * moving the key position by one step, the bit that
736 * we are moving away from - the bit at position
737 * (inode->pos) - is the one that will differ between
738 * left and right. So... we synthesize that bit in the
739 * two new keys.
740 * The mask 'm' below will be a single "one" bit at
741 * the position (inode->pos)
742 */
19baf839 743
91b9a277
OJ
744 /* Use the old key, but set the new significant
745 * bit to zero.
746 */
2f36895a 747
adaf9816 748 left = tnode_get_child(tn, 2*i);
61648d91 749 put_child(tn, 2*i, NULL);
2f36895a 750
91b9a277 751 BUG_ON(!left);
2f36895a 752
adaf9816 753 right = tnode_get_child(tn, 2*i+1);
61648d91 754 put_child(tn, 2*i+1, NULL);
19baf839 755
91b9a277 756 BUG_ON(!right);
19baf839 757
91b9a277
OJ
758 size = tnode_child_length(left);
759 for (j = 0; j < size; j++) {
61648d91
LM
760 put_child(left, j, rtnl_dereference(inode->child[j]));
761 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 762 }
61648d91
LM
763 put_child(tn, 2*i, resize(t, left));
764 put_child(tn, 2*i+1, resize(t, right));
91b9a277 765
e0f7cb8c 766 tnode_free_safe(inode);
19baf839 767 }
e0f7cb8c 768 tnode_free_safe(oldtnode);
19baf839 769 return tn;
2f80b3c8 770nomem:
0a5c0475
ED
771 tnode_clean_free(tn);
772 return ERR_PTR(-ENOMEM);
19baf839
RO
773}
774
adaf9816 775static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
19baf839 776{
adaf9816
AD
777 int olen = tnode_child_length(oldtnode);
778 struct tnode *tn, *left, *right;
19baf839 779 int i;
19baf839 780
0c7770c7 781 pr_debug("In halve\n");
c877efb2
SH
782
783 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 784
2f80b3c8
RO
785 if (!tn)
786 return ERR_PTR(-ENOMEM);
2f36895a
RO
787
788 /*
c877efb2
SH
789 * Preallocate and store tnodes before the actual work so we
790 * don't get into an inconsistent state if memory allocation
791 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
792 * of tnode is ignored.
793 */
794
91b9a277 795 for (i = 0; i < olen; i += 2) {
2f36895a
RO
796 left = tnode_get_child(oldtnode, i);
797 right = tnode_get_child(oldtnode, i+1);
c877efb2 798
2f36895a 799 /* Two nonempty children */
0c7770c7 800 if (left && right) {
2f80b3c8 801 struct tnode *newn;
0c7770c7 802
2f80b3c8 803 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
804
805 if (!newn)
2f80b3c8 806 goto nomem;
0c7770c7 807
adaf9816 808 put_child(tn, i/2, newn);
2f36895a 809 }
2f36895a 810
2f36895a 811 }
19baf839 812
91b9a277
OJ
813 for (i = 0; i < olen; i += 2) {
814 struct tnode *newBinNode;
815
19baf839
RO
816 left = tnode_get_child(oldtnode, i);
817 right = tnode_get_child(oldtnode, i+1);
c877efb2 818
19baf839
RO
819 /* At least one of the children is empty */
820 if (left == NULL) {
821 if (right == NULL) /* Both are empty */
822 continue;
61648d91 823 put_child(tn, i/2, right);
91b9a277 824 continue;
0c7770c7 825 }
91b9a277
OJ
826
827 if (right == NULL) {
61648d91 828 put_child(tn, i/2, left);
91b9a277
OJ
829 continue;
830 }
c877efb2 831
19baf839 832 /* Two nonempty children */
adaf9816 833 newBinNode = tnode_get_child(tn, i/2);
61648d91
LM
834 put_child(tn, i/2, NULL);
835 put_child(newBinNode, 0, left);
836 put_child(newBinNode, 1, right);
837 put_child(tn, i/2, resize(t, newBinNode));
19baf839 838 }
e0f7cb8c 839 tnode_free_safe(oldtnode);
19baf839 840 return tn;
2f80b3c8 841nomem:
0a5c0475
ED
842 tnode_clean_free(tn);
843 return ERR_PTR(-ENOMEM);
19baf839
RO
844}
845
772cb712 846/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
847 via get_fa_head and dump */
848
adaf9816 849static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
19baf839 850{
772cb712 851 struct hlist_head *head = &l->list;
19baf839
RO
852 struct leaf_info *li;
853
b67bfe0d 854 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 855 if (li->plen == plen)
19baf839 856 return li;
91b9a277 857
19baf839
RO
858 return NULL;
859}
860
adaf9816 861static inline struct list_head *get_fa_head(struct tnode *l, int plen)
19baf839 862{
772cb712 863 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 864
91b9a277
OJ
865 if (!li)
866 return NULL;
c877efb2 867
91b9a277 868 return &li->falh;
19baf839
RO
869}
870
871static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
872{
e905a9ed 873 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
874
875 if (hlist_empty(head)) {
876 hlist_add_head_rcu(&new->hlist, head);
877 } else {
b67bfe0d 878 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
879 if (new->plen > li->plen)
880 break;
881
882 last = li;
883 }
884 if (last)
1d023284 885 hlist_add_behind_rcu(&new->hlist, &last->hlist);
e905a9ed
YH
886 else
887 hlist_add_before_rcu(&new->hlist, &li->hlist);
888 }
19baf839
RO
889}
890
2373ce1c
RO
891/* rcu_read_lock needs to be hold by caller from readside */
892
adaf9816 893static struct tnode *fib_find_node(struct trie *t, u32 key)
19baf839 894{
adaf9816
AD
895 struct tnode *n = rcu_dereference_rtnl(t->trie);
896 int pos = 0;
19baf839 897
64c9b6fb 898 while (n && IS_TNODE(n)) {
adaf9816
AD
899 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
900 pos = n->pos + n->bits;
901 n = tnode_get_child_rcu(n,
a07f5f50 902 tkey_extract_bits(key,
adaf9816
AD
903 n->pos,
904 n->bits));
91b9a277 905 } else
19baf839
RO
906 break;
907 }
908 /* Case we have found a leaf. Compare prefixes */
909
91b9a277 910 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
adaf9816 911 return n;
91b9a277 912
19baf839
RO
913 return NULL;
914}
915
7b85576d 916static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 917{
19baf839 918 int wasfull;
3ed18d76 919 t_key cindex, key;
06801916 920 struct tnode *tp;
19baf839 921
3ed18d76
RO
922 key = tn->key;
923
64c9b6fb 924 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
19baf839
RO
925 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
926 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
adaf9816 927 tn = resize(t, tn);
a07f5f50 928
adaf9816 929 tnode_put_child_reorg(tp, cindex, tn, wasfull);
91b9a277 930
64c9b6fb 931 tp = node_parent(tn);
008440e3 932 if (!tp)
adaf9816 933 rcu_assign_pointer(t->trie, tn);
008440e3 934
e0f7cb8c 935 tnode_free_flush();
06801916 936 if (!tp)
19baf839 937 break;
06801916 938 tn = tp;
19baf839 939 }
06801916 940
19baf839 941 /* Handle last (top) tnode */
7b85576d 942 if (IS_TNODE(tn))
adaf9816 943 tn = resize(t, tn);
19baf839 944
adaf9816 945 rcu_assign_pointer(t->trie, tn);
7b85576d 946 tnode_free_flush();
19baf839
RO
947}
948
2373ce1c
RO
949/* only used from updater-side */
950
fea86ad8 951static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
952{
953 int pos, newpos;
954 struct tnode *tp = NULL, *tn = NULL;
adaf9816
AD
955 struct tnode *n;
956 struct tnode *l;
19baf839 957 int missbit;
c877efb2 958 struct list_head *fa_head = NULL;
19baf839
RO
959 struct leaf_info *li;
960 t_key cindex;
961
962 pos = 0;
0a5c0475 963 n = rtnl_dereference(t->trie);
19baf839 964
c877efb2
SH
965 /* If we point to NULL, stop. Either the tree is empty and we should
966 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 967 * and we should just put our new leaf in that.
c877efb2
SH
968 * If we point to a T_TNODE, check if it matches our key. Note that
969 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
970 * not be the parent's 'pos'+'bits'!
971 *
c877efb2 972 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
973 * the index from our key, push the T_TNODE and walk the tree.
974 *
975 * If it doesn't, we have to replace it with a new T_TNODE.
976 *
c877efb2
SH
977 * If we point to a T_LEAF, it might or might not have the same key
978 * as we do. If it does, just change the value, update the T_LEAF's
979 * value, and return it.
19baf839
RO
980 * If it doesn't, we need to replace it with a T_TNODE.
981 */
982
64c9b6fb 983 while (n && IS_TNODE(n)) {
adaf9816
AD
984 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
985 tp = n;
986 pos = n->pos + n->bits;
987 n = tnode_get_child(n,
a07f5f50 988 tkey_extract_bits(key,
adaf9816
AD
989 n->pos,
990 n->bits));
19baf839 991
adaf9816 992 BUG_ON(n && node_parent(n) != tp);
91b9a277 993 } else
19baf839
RO
994 break;
995 }
996
997 /*
998 * n ----> NULL, LEAF or TNODE
999 *
c877efb2 1000 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1001 */
1002
91b9a277 1003 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1004
1005 /* Case 1: n is a leaf. Compare prefixes */
1006
c877efb2 1007 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
19baf839 1008 li = leaf_info_new(plen);
91b9a277 1009
fea86ad8
SH
1010 if (!li)
1011 return NULL;
19baf839
RO
1012
1013 fa_head = &li->falh;
adaf9816 1014 insert_leaf_info(&n->list, li);
19baf839
RO
1015 goto done;
1016 }
64c9b6fb 1017 l = leaf_new(key);
19baf839 1018
fea86ad8
SH
1019 if (!l)
1020 return NULL;
19baf839 1021
19baf839
RO
1022 li = leaf_info_new(plen);
1023
c877efb2 1024 if (!li) {
37fd30f2 1025 node_free(l);
fea86ad8 1026 return NULL;
f835e471 1027 }
19baf839
RO
1028
1029 fa_head = &li->falh;
1030 insert_leaf_info(&l->list, li);
1031
19baf839 1032 if (t->trie && n == NULL) {
91b9a277 1033 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1034
adaf9816 1035 node_set_parent(l, tp);
19baf839 1036
91b9a277 1037 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1038 put_child(tp, cindex, l);
91b9a277
OJ
1039 } else {
1040 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1041 /*
1042 * Add a new tnode here
19baf839
RO
1043 * first tnode need some special handling
1044 */
1045
c877efb2 1046 if (n) {
4c60f1d6 1047 pos = tp ? tp->pos+tp->bits : 0;
19baf839
RO
1048 newpos = tkey_mismatch(key, pos, n->key);
1049 tn = tnode_new(n->key, newpos, 1);
91b9a277 1050 } else {
19baf839 1051 newpos = 0;
c877efb2 1052 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1053 }
19baf839 1054
c877efb2 1055 if (!tn) {
f835e471 1056 free_leaf_info(li);
37fd30f2 1057 node_free(l);
fea86ad8 1058 return NULL;
91b9a277
OJ
1059 }
1060
adaf9816 1061 node_set_parent(tn, tp);
19baf839 1062
91b9a277 1063 missbit = tkey_extract_bits(key, newpos, 1);
adaf9816 1064 put_child(tn, missbit, l);
61648d91 1065 put_child(tn, 1-missbit, n);
19baf839 1066
c877efb2 1067 if (tp) {
19baf839 1068 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1069 put_child(tp, cindex, tn);
91b9a277 1070 } else {
adaf9816 1071 rcu_assign_pointer(t->trie, tn);
19baf839 1072 }
e962f302
AD
1073
1074 tp = tn;
19baf839 1075 }
91b9a277
OJ
1076
1077 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1078 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1079 tp, tp->pos, tp->bits, key, plen);
91b9a277 1080
19baf839 1081 /* Rebalance the trie */
2373ce1c 1082
7b85576d 1083 trie_rebalance(t, tp);
f835e471 1084done:
19baf839
RO
1085 return fa_head;
1086}
1087
d562f1f8
RO
1088/*
1089 * Caller must hold RTNL.
1090 */
16c6cf8b 1091int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1092{
1093 struct trie *t = (struct trie *) tb->tb_data;
1094 struct fib_alias *fa, *new_fa;
c877efb2 1095 struct list_head *fa_head = NULL;
19baf839 1096 struct fib_info *fi;
4e902c57
TG
1097 int plen = cfg->fc_dst_len;
1098 u8 tos = cfg->fc_tos;
19baf839
RO
1099 u32 key, mask;
1100 int err;
adaf9816 1101 struct tnode *l;
19baf839
RO
1102
1103 if (plen > 32)
1104 return -EINVAL;
1105
4e902c57 1106 key = ntohl(cfg->fc_dst);
19baf839 1107
2dfe55b4 1108 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1109
91b9a277 1110 mask = ntohl(inet_make_mask(plen));
19baf839 1111
c877efb2 1112 if (key & ~mask)
19baf839
RO
1113 return -EINVAL;
1114
1115 key = key & mask;
1116
4e902c57
TG
1117 fi = fib_create_info(cfg);
1118 if (IS_ERR(fi)) {
1119 err = PTR_ERR(fi);
19baf839 1120 goto err;
4e902c57 1121 }
19baf839
RO
1122
1123 l = fib_find_node(t, key);
c877efb2 1124 fa = NULL;
19baf839 1125
c877efb2 1126 if (l) {
19baf839
RO
1127 fa_head = get_fa_head(l, plen);
1128 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1129 }
1130
1131 /* Now fa, if non-NULL, points to the first fib alias
1132 * with the same keys [prefix,tos,priority], if such key already
1133 * exists or to the node before which we will insert new one.
1134 *
1135 * If fa is NULL, we will need to allocate a new one and
1136 * insert to the head of f.
1137 *
1138 * If f is NULL, no fib node matched the destination key
1139 * and we need to allocate a new one of those as well.
1140 */
1141
936f6f8e
JA
1142 if (fa && fa->fa_tos == tos &&
1143 fa->fa_info->fib_priority == fi->fib_priority) {
1144 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1145
1146 err = -EEXIST;
4e902c57 1147 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1148 goto out;
1149
936f6f8e
JA
1150 /* We have 2 goals:
1151 * 1. Find exact match for type, scope, fib_info to avoid
1152 * duplicate routes
1153 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1154 */
1155 fa_match = NULL;
1156 fa_first = fa;
1157 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1158 list_for_each_entry_continue(fa, fa_head, fa_list) {
1159 if (fa->fa_tos != tos)
1160 break;
1161 if (fa->fa_info->fib_priority != fi->fib_priority)
1162 break;
1163 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1164 fa->fa_info == fi) {
1165 fa_match = fa;
1166 break;
1167 }
1168 }
1169
4e902c57 1170 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1171 struct fib_info *fi_drop;
1172 u8 state;
1173
936f6f8e
JA
1174 fa = fa_first;
1175 if (fa_match) {
1176 if (fa == fa_match)
1177 err = 0;
6725033f 1178 goto out;
936f6f8e 1179 }
2373ce1c 1180 err = -ENOBUFS;
e94b1766 1181 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1182 if (new_fa == NULL)
1183 goto out;
19baf839
RO
1184
1185 fi_drop = fa->fa_info;
2373ce1c
RO
1186 new_fa->fa_tos = fa->fa_tos;
1187 new_fa->fa_info = fi;
4e902c57 1188 new_fa->fa_type = cfg->fc_type;
19baf839 1189 state = fa->fa_state;
936f6f8e 1190 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1191
2373ce1c
RO
1192 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1193 alias_free_mem_rcu(fa);
19baf839
RO
1194
1195 fib_release_info(fi_drop);
1196 if (state & FA_S_ACCESSED)
4ccfe6d4 1197 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1198 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1199 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1200
91b9a277 1201 goto succeeded;
19baf839
RO
1202 }
1203 /* Error if we find a perfect match which
1204 * uses the same scope, type, and nexthop
1205 * information.
1206 */
936f6f8e
JA
1207 if (fa_match)
1208 goto out;
a07f5f50 1209
4e902c57 1210 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1211 fa = fa_first;
19baf839
RO
1212 }
1213 err = -ENOENT;
4e902c57 1214 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1215 goto out;
1216
1217 err = -ENOBUFS;
e94b1766 1218 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1219 if (new_fa == NULL)
1220 goto out;
1221
1222 new_fa->fa_info = fi;
1223 new_fa->fa_tos = tos;
4e902c57 1224 new_fa->fa_type = cfg->fc_type;
19baf839 1225 new_fa->fa_state = 0;
19baf839
RO
1226 /*
1227 * Insert new entry to the list.
1228 */
1229
c877efb2 1230 if (!fa_head) {
fea86ad8
SH
1231 fa_head = fib_insert_node(t, key, plen);
1232 if (unlikely(!fa_head)) {
1233 err = -ENOMEM;
f835e471 1234 goto out_free_new_fa;
fea86ad8 1235 }
f835e471 1236 }
19baf839 1237
21d8c49e
DM
1238 if (!plen)
1239 tb->tb_num_default++;
1240
2373ce1c
RO
1241 list_add_tail_rcu(&new_fa->fa_list,
1242 (fa ? &fa->fa_list : fa_head));
19baf839 1243
4ccfe6d4 1244 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1245 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1246 &cfg->fc_nlinfo, 0);
19baf839
RO
1247succeeded:
1248 return 0;
f835e471
RO
1249
1250out_free_new_fa:
1251 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1252out:
1253 fib_release_info(fi);
91b9a277 1254err:
19baf839
RO
1255 return err;
1256}
1257
772cb712 1258/* should be called with rcu_read_lock */
adaf9816 1259static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
22bd5b9b 1260 t_key key, const struct flowi4 *flp,
ebc0ffae 1261 struct fib_result *res, int fib_flags)
19baf839 1262{
19baf839
RO
1263 struct leaf_info *li;
1264 struct hlist_head *hhead = &l->list;
c877efb2 1265
b67bfe0d 1266 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1267 struct fib_alias *fa;
a07f5f50 1268
5c74501f 1269 if (l->key != (key & li->mask_plen))
19baf839
RO
1270 continue;
1271
3be0686b
DM
1272 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1273 struct fib_info *fi = fa->fa_info;
1274 int nhsel, err;
a07f5f50 1275
22bd5b9b 1276 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1277 continue;
dccd9ecc
DM
1278 if (fi->fib_dead)
1279 continue;
37e826c5 1280 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1281 continue;
1282 fib_alias_accessed(fa);
1283 err = fib_props[fa->fa_type].error;
1284 if (err) {
19baf839 1285#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1286 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1287#endif
1fbc7843 1288 return err;
3be0686b
DM
1289 }
1290 if (fi->fib_flags & RTNH_F_DEAD)
1291 continue;
1292 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1293 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1294
1295 if (nh->nh_flags & RTNH_F_DEAD)
1296 continue;
22bd5b9b 1297 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1298 continue;
1299
1300#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1301 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1302#endif
5c74501f 1303 res->prefixlen = li->plen;
3be0686b
DM
1304 res->nh_sel = nhsel;
1305 res->type = fa->fa_type;
37e826c5 1306 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1307 res->fi = fi;
1308 res->table = tb;
1309 res->fa_head = &li->falh;
1310 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1311 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1312 return 0;
1313 }
1314 }
1315
1316#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1317 this_cpu_inc(t->stats->semantic_match_miss);
19baf839 1318#endif
19baf839 1319 }
a07f5f50 1320
2e655571 1321 return 1;
19baf839
RO
1322}
1323
22bd5b9b 1324int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1325 struct fib_result *res, int fib_flags)
19baf839
RO
1326{
1327 struct trie *t = (struct trie *) tb->tb_data;
8274a97a
AD
1328#ifdef CONFIG_IP_FIB_TRIE_STATS
1329 struct trie_use_stats __percpu *stats = t->stats;
1330#endif
2e655571 1331 int ret;
adaf9816 1332 struct tnode *n;
19baf839 1333 struct tnode *pn;
3b004569 1334 unsigned int pos, bits;
22bd5b9b 1335 t_key key = ntohl(flp->daddr);
3b004569 1336 unsigned int chopped_off;
19baf839 1337 t_key cindex = 0;
3b004569 1338 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1339 struct tnode *cn;
874ffa8f 1340 t_key pref_mismatch;
91b9a277 1341
2373ce1c 1342 rcu_read_lock();
91b9a277 1343
2373ce1c 1344 n = rcu_dereference(t->trie);
c877efb2 1345 if (!n)
19baf839
RO
1346 goto failed;
1347
1348#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1349 this_cpu_inc(stats->gets);
19baf839
RO
1350#endif
1351
1352 /* Just a leaf? */
1353 if (IS_LEAF(n)) {
adaf9816 1354 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
a07f5f50 1355 goto found;
19baf839 1356 }
a07f5f50 1357
adaf9816 1358 pn = n;
19baf839 1359 chopped_off = 0;
c877efb2 1360
91b9a277 1361 while (pn) {
19baf839
RO
1362 pos = pn->pos;
1363 bits = pn->bits;
1364
c877efb2 1365 if (!chopped_off)
ab66b4a7
SH
1366 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1367 pos, bits);
19baf839 1368
b902e573 1369 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1370
1371 if (n == NULL) {
1372#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1373 this_cpu_inc(stats->null_node_hit);
19baf839
RO
1374#endif
1375 goto backtrace;
1376 }
1377
91b9a277 1378 if (IS_LEAF(n)) {
adaf9816 1379 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
2e655571 1380 if (ret > 0)
91b9a277 1381 goto backtrace;
a07f5f50 1382 goto found;
91b9a277
OJ
1383 }
1384
adaf9816 1385 cn = n;
19baf839 1386
91b9a277
OJ
1387 /*
1388 * It's a tnode, and we can do some extra checks here if we
1389 * like, to avoid descending into a dead-end branch.
1390 * This tnode is in the parent's child array at index
1391 * key[p_pos..p_pos+p_bits] but potentially with some bits
1392 * chopped off, so in reality the index may be just a
1393 * subprefix, padded with zero at the end.
1394 * We can also take a look at any skipped bits in this
1395 * tnode - everything up to p_pos is supposed to be ok,
1396 * and the non-chopped bits of the index (se previous
1397 * paragraph) are also guaranteed ok, but the rest is
1398 * considered unknown.
1399 *
1400 * The skipped bits are key[pos+bits..cn->pos].
1401 */
19baf839 1402
91b9a277
OJ
1403 /* If current_prefix_length < pos+bits, we are already doing
1404 * actual prefix matching, which means everything from
1405 * pos+(bits-chopped_off) onward must be zero along some
1406 * branch of this subtree - otherwise there is *no* valid
1407 * prefix present. Here we can only check the skipped
1408 * bits. Remember, since we have already indexed into the
1409 * parent's child array, we know that the bits we chopped of
1410 * *are* zero.
1411 */
19baf839 1412
a07f5f50
SH
1413 /* NOTA BENE: Checking only skipped bits
1414 for the new node here */
19baf839 1415
91b9a277
OJ
1416 if (current_prefix_length < pos+bits) {
1417 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1418 cn->pos - current_prefix_length)
1419 || !(cn->child[0]))
91b9a277
OJ
1420 goto backtrace;
1421 }
19baf839 1422
91b9a277
OJ
1423 /*
1424 * If chopped_off=0, the index is fully validated and we
1425 * only need to look at the skipped bits for this, the new,
1426 * tnode. What we actually want to do is to find out if
1427 * these skipped bits match our key perfectly, or if we will
1428 * have to count on finding a matching prefix further down,
1429 * because if we do, we would like to have some way of
1430 * verifying the existence of such a prefix at this point.
1431 */
19baf839 1432
91b9a277
OJ
1433 /* The only thing we can do at this point is to verify that
1434 * any such matching prefix can indeed be a prefix to our
1435 * key, and if the bits in the node we are inspecting that
1436 * do not match our key are not ZERO, this cannot be true.
1437 * Thus, find out where there is a mismatch (before cn->pos)
1438 * and verify that all the mismatching bits are zero in the
1439 * new tnode's key.
1440 */
19baf839 1441
a07f5f50
SH
1442 /*
1443 * Note: We aren't very concerned about the piece of
1444 * the key that precede pn->pos+pn->bits, since these
1445 * have already been checked. The bits after cn->pos
1446 * aren't checked since these are by definition
1447 * "unknown" at this point. Thus, what we want to see
1448 * is if we are about to enter the "prefix matching"
1449 * state, and in that case verify that the skipped
1450 * bits that will prevail throughout this subtree are
1451 * zero, as they have to be if we are to find a
1452 * matching prefix.
91b9a277
OJ
1453 */
1454
874ffa8f 1455 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1456
a07f5f50
SH
1457 /*
1458 * In short: If skipped bits in this node do not match
1459 * the search key, enter the "prefix matching"
1460 * state.directly.
91b9a277
OJ
1461 */
1462 if (pref_mismatch) {
79cda75a
ED
1463 /* fls(x) = __fls(x) + 1 */
1464 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
91b9a277 1465
874ffa8f 1466 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1467 goto backtrace;
1468
1469 if (current_prefix_length >= cn->pos)
1470 current_prefix_length = mp;
c877efb2 1471 }
a07f5f50 1472
adaf9816 1473 pn = n; /* Descend */
91b9a277
OJ
1474 chopped_off = 0;
1475 continue;
1476
19baf839
RO
1477backtrace:
1478 chopped_off++;
1479
1480 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1481 while ((chopped_off <= pn->bits)
1482 && !(cindex & (1<<(chopped_off-1))))
19baf839 1483 chopped_off++;
19baf839
RO
1484
1485 /* Decrease current_... with bits chopped off */
1486 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1487 current_prefix_length = pn->pos + pn->bits
1488 - chopped_off;
91b9a277 1489
19baf839 1490 /*
c877efb2 1491 * Either we do the actual chop off according or if we have
19baf839
RO
1492 * chopped off all bits in this tnode walk up to our parent.
1493 */
1494
91b9a277 1495 if (chopped_off <= pn->bits) {
19baf839 1496 cindex &= ~(1 << (chopped_off-1));
91b9a277 1497 } else {
64c9b6fb 1498 struct tnode *parent = node_parent_rcu(pn);
06801916 1499 if (!parent)
19baf839 1500 goto failed;
91b9a277 1501
19baf839 1502 /* Get Child's index */
06801916
SH
1503 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1504 pn = parent;
19baf839
RO
1505 chopped_off = 0;
1506
1507#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1508 this_cpu_inc(stats->backtrack);
19baf839
RO
1509#endif
1510 goto backtrace;
c877efb2 1511 }
19baf839
RO
1512 }
1513failed:
c877efb2 1514 ret = 1;
19baf839 1515found:
2373ce1c 1516 rcu_read_unlock();
19baf839
RO
1517 return ret;
1518}
6fc01438 1519EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1520
9195bef7
SH
1521/*
1522 * Remove the leaf and return parent.
1523 */
adaf9816 1524static void trie_leaf_remove(struct trie *t, struct tnode *l)
19baf839 1525{
64c9b6fb 1526 struct tnode *tp = node_parent(l);
c877efb2 1527
9195bef7 1528 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1529
c877efb2 1530 if (tp) {
9195bef7 1531 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1532 put_child(tp, cindex, NULL);
7b85576d 1533 trie_rebalance(t, tp);
91b9a277 1534 } else
a9b3cd7f 1535 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1536
37fd30f2 1537 node_free(l);
19baf839
RO
1538}
1539
d562f1f8
RO
1540/*
1541 * Caller must hold RTNL.
1542 */
16c6cf8b 1543int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1544{
1545 struct trie *t = (struct trie *) tb->tb_data;
1546 u32 key, mask;
4e902c57
TG
1547 int plen = cfg->fc_dst_len;
1548 u8 tos = cfg->fc_tos;
19baf839
RO
1549 struct fib_alias *fa, *fa_to_delete;
1550 struct list_head *fa_head;
adaf9816 1551 struct tnode *l;
91b9a277
OJ
1552 struct leaf_info *li;
1553
c877efb2 1554 if (plen > 32)
19baf839
RO
1555 return -EINVAL;
1556
4e902c57 1557 key = ntohl(cfg->fc_dst);
91b9a277 1558 mask = ntohl(inet_make_mask(plen));
19baf839 1559
c877efb2 1560 if (key & ~mask)
19baf839
RO
1561 return -EINVAL;
1562
1563 key = key & mask;
1564 l = fib_find_node(t, key);
1565
c877efb2 1566 if (!l)
19baf839
RO
1567 return -ESRCH;
1568
ad5b3102
IM
1569 li = find_leaf_info(l, plen);
1570
1571 if (!li)
1572 return -ESRCH;
1573
1574 fa_head = &li->falh;
19baf839
RO
1575 fa = fib_find_alias(fa_head, tos, 0);
1576
1577 if (!fa)
1578 return -ESRCH;
1579
0c7770c7 1580 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1581
1582 fa_to_delete = NULL;
936f6f8e
JA
1583 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1584 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1585 struct fib_info *fi = fa->fa_info;
1586
1587 if (fa->fa_tos != tos)
1588 break;
1589
4e902c57
TG
1590 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1591 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1592 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1593 (!cfg->fc_prefsrc ||
1594 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1595 (!cfg->fc_protocol ||
1596 fi->fib_protocol == cfg->fc_protocol) &&
1597 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1598 fa_to_delete = fa;
1599 break;
1600 }
1601 }
1602
91b9a277
OJ
1603 if (!fa_to_delete)
1604 return -ESRCH;
19baf839 1605
91b9a277 1606 fa = fa_to_delete;
4e902c57 1607 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1608 &cfg->fc_nlinfo, 0);
91b9a277 1609
2373ce1c 1610 list_del_rcu(&fa->fa_list);
19baf839 1611
21d8c49e
DM
1612 if (!plen)
1613 tb->tb_num_default--;
1614
91b9a277 1615 if (list_empty(fa_head)) {
2373ce1c 1616 hlist_del_rcu(&li->hlist);
91b9a277 1617 free_leaf_info(li);
2373ce1c 1618 }
19baf839 1619
91b9a277 1620 if (hlist_empty(&l->list))
9195bef7 1621 trie_leaf_remove(t, l);
19baf839 1622
91b9a277 1623 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1624 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1625
2373ce1c
RO
1626 fib_release_info(fa->fa_info);
1627 alias_free_mem_rcu(fa);
91b9a277 1628 return 0;
19baf839
RO
1629}
1630
ef3660ce 1631static int trie_flush_list(struct list_head *head)
19baf839
RO
1632{
1633 struct fib_alias *fa, *fa_node;
1634 int found = 0;
1635
1636 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1637 struct fib_info *fi = fa->fa_info;
19baf839 1638
2373ce1c
RO
1639 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1640 list_del_rcu(&fa->fa_list);
1641 fib_release_info(fa->fa_info);
1642 alias_free_mem_rcu(fa);
19baf839
RO
1643 found++;
1644 }
1645 }
1646 return found;
1647}
1648
adaf9816 1649static int trie_flush_leaf(struct tnode *l)
19baf839
RO
1650{
1651 int found = 0;
1652 struct hlist_head *lih = &l->list;
b67bfe0d 1653 struct hlist_node *tmp;
19baf839
RO
1654 struct leaf_info *li = NULL;
1655
b67bfe0d 1656 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1657 found += trie_flush_list(&li->falh);
19baf839
RO
1658
1659 if (list_empty(&li->falh)) {
2373ce1c 1660 hlist_del_rcu(&li->hlist);
19baf839
RO
1661 free_leaf_info(li);
1662 }
1663 }
1664 return found;
1665}
1666
82cfbb00
SH
1667/*
1668 * Scan for the next right leaf starting at node p->child[idx]
1669 * Since we have back pointer, no recursion necessary.
1670 */
adaf9816 1671static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
19baf839 1672{
82cfbb00
SH
1673 do {
1674 t_key idx;
c877efb2 1675
c877efb2 1676 if (c)
82cfbb00 1677 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1678 else
82cfbb00 1679 idx = 0;
2373ce1c 1680
82cfbb00
SH
1681 while (idx < 1u << p->bits) {
1682 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1683 if (!c)
91b9a277
OJ
1684 continue;
1685
aab515d7 1686 if (IS_LEAF(c))
adaf9816 1687 return c;
82cfbb00
SH
1688
1689 /* Rescan start scanning in new node */
adaf9816 1690 p = c;
82cfbb00 1691 idx = 0;
19baf839 1692 }
82cfbb00
SH
1693
1694 /* Node empty, walk back up to parent */
adaf9816 1695 c = p;
a034ee3c 1696 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1697
1698 return NULL; /* Root of trie */
1699}
1700
adaf9816 1701static struct tnode *trie_firstleaf(struct trie *t)
82cfbb00 1702{
adaf9816 1703 struct tnode *n = rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1704
1705 if (!n)
1706 return NULL;
1707
1708 if (IS_LEAF(n)) /* trie is just a leaf */
adaf9816 1709 return n;
82cfbb00
SH
1710
1711 return leaf_walk_rcu(n, NULL);
1712}
1713
adaf9816 1714static struct tnode *trie_nextleaf(struct tnode *l)
82cfbb00 1715{
adaf9816 1716 struct tnode *p = node_parent_rcu(l);
82cfbb00
SH
1717
1718 if (!p)
1719 return NULL; /* trie with just one leaf */
1720
adaf9816 1721 return leaf_walk_rcu(p, l);
19baf839
RO
1722}
1723
adaf9816 1724static struct tnode *trie_leafindex(struct trie *t, int index)
71d67e66 1725{
adaf9816 1726 struct tnode *l = trie_firstleaf(t);
71d67e66 1727
ec28cf73 1728 while (l && index-- > 0)
71d67e66 1729 l = trie_nextleaf(l);
ec28cf73 1730
71d67e66
SH
1731 return l;
1732}
1733
1734
d562f1f8
RO
1735/*
1736 * Caller must hold RTNL.
1737 */
16c6cf8b 1738int fib_table_flush(struct fib_table *tb)
19baf839
RO
1739{
1740 struct trie *t = (struct trie *) tb->tb_data;
adaf9816 1741 struct tnode *l, *ll = NULL;
82cfbb00 1742 int found = 0;
19baf839 1743
82cfbb00 1744 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1745 found += trie_flush_leaf(l);
19baf839
RO
1746
1747 if (ll && hlist_empty(&ll->list))
9195bef7 1748 trie_leaf_remove(t, ll);
19baf839
RO
1749 ll = l;
1750 }
1751
1752 if (ll && hlist_empty(&ll->list))
9195bef7 1753 trie_leaf_remove(t, ll);
19baf839 1754
0c7770c7 1755 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1756 return found;
1757}
1758
4aa2c466
PE
1759void fib_free_table(struct fib_table *tb)
1760{
8274a97a
AD
1761#ifdef CONFIG_IP_FIB_TRIE_STATS
1762 struct trie *t = (struct trie *)tb->tb_data;
1763
1764 free_percpu(t->stats);
1765#endif /* CONFIG_IP_FIB_TRIE_STATS */
4aa2c466
PE
1766 kfree(tb);
1767}
1768
a07f5f50
SH
1769static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1770 struct fib_table *tb,
19baf839
RO
1771 struct sk_buff *skb, struct netlink_callback *cb)
1772{
1773 int i, s_i;
1774 struct fib_alias *fa;
32ab5f80 1775 __be32 xkey = htonl(key);
19baf839 1776
71d67e66 1777 s_i = cb->args[5];
19baf839
RO
1778 i = 0;
1779
2373ce1c
RO
1780 /* rcu_read_lock is hold by caller */
1781
1782 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1783 if (i < s_i) {
1784 i++;
1785 continue;
1786 }
19baf839 1787
15e47304 1788 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1789 cb->nlh->nlmsg_seq,
1790 RTM_NEWROUTE,
1791 tb->tb_id,
1792 fa->fa_type,
be403ea1 1793 xkey,
19baf839
RO
1794 plen,
1795 fa->fa_tos,
64347f78 1796 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1797 cb->args[5] = i;
19baf839 1798 return -1;
91b9a277 1799 }
19baf839
RO
1800 i++;
1801 }
71d67e66 1802 cb->args[5] = i;
19baf839
RO
1803 return skb->len;
1804}
1805
adaf9816 1806static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
a88ee229 1807 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1808{
a88ee229 1809 struct leaf_info *li;
a88ee229 1810 int i, s_i;
19baf839 1811
71d67e66 1812 s_i = cb->args[4];
a88ee229 1813 i = 0;
19baf839 1814
a88ee229 1815 /* rcu_read_lock is hold by caller */
b67bfe0d 1816 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1817 if (i < s_i) {
1818 i++;
19baf839 1819 continue;
a88ee229 1820 }
91b9a277 1821
a88ee229 1822 if (i > s_i)
71d67e66 1823 cb->args[5] = 0;
19baf839 1824
a88ee229 1825 if (list_empty(&li->falh))
19baf839
RO
1826 continue;
1827
a88ee229 1828 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1829 cb->args[4] = i;
19baf839
RO
1830 return -1;
1831 }
a88ee229 1832 i++;
19baf839 1833 }
a88ee229 1834
71d67e66 1835 cb->args[4] = i;
19baf839
RO
1836 return skb->len;
1837}
1838
16c6cf8b
SH
1839int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1840 struct netlink_callback *cb)
19baf839 1841{
adaf9816 1842 struct tnode *l;
19baf839 1843 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1844 t_key key = cb->args[2];
71d67e66 1845 int count = cb->args[3];
19baf839 1846
2373ce1c 1847 rcu_read_lock();
d5ce8a0e
SH
1848 /* Dump starting at last key.
1849 * Note: 0.0.0.0/0 (ie default) is first key.
1850 */
71d67e66 1851 if (count == 0)
d5ce8a0e
SH
1852 l = trie_firstleaf(t);
1853 else {
71d67e66
SH
1854 /* Normally, continue from last key, but if that is missing
1855 * fallback to using slow rescan
1856 */
d5ce8a0e 1857 l = fib_find_node(t, key);
71d67e66
SH
1858 if (!l)
1859 l = trie_leafindex(t, count);
d5ce8a0e 1860 }
a88ee229 1861
d5ce8a0e
SH
1862 while (l) {
1863 cb->args[2] = l->key;
a88ee229 1864 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1865 cb->args[3] = count;
a88ee229 1866 rcu_read_unlock();
a88ee229 1867 return -1;
19baf839 1868 }
d5ce8a0e 1869
71d67e66 1870 ++count;
d5ce8a0e 1871 l = trie_nextleaf(l);
71d67e66
SH
1872 memset(&cb->args[4], 0,
1873 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1874 }
71d67e66 1875 cb->args[3] = count;
2373ce1c 1876 rcu_read_unlock();
a88ee229 1877
19baf839 1878 return skb->len;
19baf839
RO
1879}
1880
5348ba85 1881void __init fib_trie_init(void)
7f9b8052 1882{
a07f5f50
SH
1883 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1884 sizeof(struct fib_alias),
bc3c8c1e
SH
1885 0, SLAB_PANIC, NULL);
1886
1887 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
adaf9816 1888 max(sizeof(struct tnode),
bc3c8c1e
SH
1889 sizeof(struct leaf_info)),
1890 0, SLAB_PANIC, NULL);
7f9b8052 1891}
19baf839 1892
7f9b8052 1893
5348ba85 1894struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1895{
1896 struct fib_table *tb;
1897 struct trie *t;
1898
19baf839
RO
1899 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1900 GFP_KERNEL);
1901 if (tb == NULL)
1902 return NULL;
1903
1904 tb->tb_id = id;
971b893e 1905 tb->tb_default = -1;
21d8c49e 1906 tb->tb_num_default = 0;
19baf839
RO
1907
1908 t = (struct trie *) tb->tb_data;
8274a97a
AD
1909 RCU_INIT_POINTER(t->trie, NULL);
1910#ifdef CONFIG_IP_FIB_TRIE_STATS
1911 t->stats = alloc_percpu(struct trie_use_stats);
1912 if (!t->stats) {
1913 kfree(tb);
1914 tb = NULL;
1915 }
1916#endif
19baf839 1917
19baf839
RO
1918 return tb;
1919}
1920
cb7b593c
SH
1921#ifdef CONFIG_PROC_FS
1922/* Depth first Trie walk iterator */
1923struct fib_trie_iter {
1c340b2f 1924 struct seq_net_private p;
3d3b2d25 1925 struct fib_table *tb;
cb7b593c 1926 struct tnode *tnode;
a034ee3c
ED
1927 unsigned int index;
1928 unsigned int depth;
cb7b593c 1929};
19baf839 1930
adaf9816 1931static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1932{
cb7b593c 1933 struct tnode *tn = iter->tnode;
a034ee3c 1934 unsigned int cindex = iter->index;
cb7b593c 1935 struct tnode *p;
19baf839 1936
6640e697
EB
1937 /* A single entry routing table */
1938 if (!tn)
1939 return NULL;
1940
cb7b593c
SH
1941 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1942 iter->tnode, iter->index, iter->depth);
1943rescan:
1944 while (cindex < (1<<tn->bits)) {
adaf9816 1945 struct tnode *n = tnode_get_child_rcu(tn, cindex);
19baf839 1946
cb7b593c
SH
1947 if (n) {
1948 if (IS_LEAF(n)) {
1949 iter->tnode = tn;
1950 iter->index = cindex + 1;
1951 } else {
1952 /* push down one level */
adaf9816 1953 iter->tnode = n;
cb7b593c
SH
1954 iter->index = 0;
1955 ++iter->depth;
1956 }
1957 return n;
1958 }
19baf839 1959
cb7b593c
SH
1960 ++cindex;
1961 }
91b9a277 1962
cb7b593c 1963 /* Current node exhausted, pop back up */
adaf9816 1964 p = node_parent_rcu(tn);
cb7b593c
SH
1965 if (p) {
1966 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1967 tn = p;
1968 --iter->depth;
1969 goto rescan;
19baf839 1970 }
cb7b593c
SH
1971
1972 /* got root? */
1973 return NULL;
19baf839
RO
1974}
1975
adaf9816 1976static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 1977 struct trie *t)
19baf839 1978{
adaf9816 1979 struct tnode *n;
5ddf0eb2 1980
132adf54 1981 if (!t)
5ddf0eb2
RO
1982 return NULL;
1983
1984 n = rcu_dereference(t->trie);
3d3b2d25 1985 if (!n)
5ddf0eb2 1986 return NULL;
19baf839 1987
3d3b2d25 1988 if (IS_TNODE(n)) {
adaf9816 1989 iter->tnode = n;
3d3b2d25
SH
1990 iter->index = 0;
1991 iter->depth = 1;
1992 } else {
1993 iter->tnode = NULL;
1994 iter->index = 0;
1995 iter->depth = 0;
91b9a277 1996 }
3d3b2d25
SH
1997
1998 return n;
cb7b593c 1999}
91b9a277 2000
cb7b593c
SH
2001static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2002{
adaf9816 2003 struct tnode *n;
cb7b593c 2004 struct fib_trie_iter iter;
91b9a277 2005
cb7b593c 2006 memset(s, 0, sizeof(*s));
91b9a277 2007
cb7b593c 2008 rcu_read_lock();
3d3b2d25 2009 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2010 if (IS_LEAF(n)) {
93672292 2011 struct leaf_info *li;
93672292 2012
cb7b593c
SH
2013 s->leaves++;
2014 s->totdepth += iter.depth;
2015 if (iter.depth > s->maxdepth)
2016 s->maxdepth = iter.depth;
93672292 2017
adaf9816 2018 hlist_for_each_entry_rcu(li, &n->list, hlist)
93672292 2019 ++s->prefixes;
cb7b593c 2020 } else {
cb7b593c
SH
2021 int i;
2022
2023 s->tnodes++;
adaf9816
AD
2024 if (n->bits < MAX_STAT_DEPTH)
2025 s->nodesizes[n->bits]++;
06ef921d 2026
adaf9816
AD
2027 for (i = 0; i < tnode_child_length(n); i++)
2028 if (!rcu_access_pointer(n->child[i]))
cb7b593c 2029 s->nullpointers++;
19baf839 2030 }
19baf839 2031 }
2373ce1c 2032 rcu_read_unlock();
19baf839
RO
2033}
2034
cb7b593c
SH
2035/*
2036 * This outputs /proc/net/fib_triestats
2037 */
2038static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2039{
a034ee3c 2040 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2041
cb7b593c
SH
2042 if (stat->leaves)
2043 avdepth = stat->totdepth*100 / stat->leaves;
2044 else
2045 avdepth = 0;
91b9a277 2046
a07f5f50
SH
2047 seq_printf(seq, "\tAver depth: %u.%02d\n",
2048 avdepth / 100, avdepth % 100);
cb7b593c 2049 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2050
cb7b593c 2051 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
adaf9816 2052 bytes = sizeof(struct tnode) * stat->leaves;
93672292
SH
2053
2054 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2055 bytes += sizeof(struct leaf_info) * stat->prefixes;
2056
187b5188 2057 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2058 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2059
06ef921d
RO
2060 max = MAX_STAT_DEPTH;
2061 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2062 max--;
19baf839 2063
cb7b593c 2064 pointers = 0;
f585a991 2065 for (i = 1; i < max; i++)
cb7b593c 2066 if (stat->nodesizes[i] != 0) {
187b5188 2067 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2068 pointers += (1<<i) * stat->nodesizes[i];
2069 }
2070 seq_putc(seq, '\n');
187b5188 2071 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2072
adaf9816 2073 bytes += sizeof(struct tnode *) * pointers;
187b5188
SH
2074 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2075 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2076}
2373ce1c 2077
cb7b593c 2078#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd 2079static void trie_show_usage(struct seq_file *seq,
8274a97a 2080 const struct trie_use_stats __percpu *stats)
66a2f7fd 2081{
8274a97a
AD
2082 struct trie_use_stats s = { 0 };
2083 int cpu;
2084
2085 /* loop through all of the CPUs and gather up the stats */
2086 for_each_possible_cpu(cpu) {
2087 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2088
2089 s.gets += pcpu->gets;
2090 s.backtrack += pcpu->backtrack;
2091 s.semantic_match_passed += pcpu->semantic_match_passed;
2092 s.semantic_match_miss += pcpu->semantic_match_miss;
2093 s.null_node_hit += pcpu->null_node_hit;
2094 s.resize_node_skipped += pcpu->resize_node_skipped;
2095 }
2096
66a2f7fd 2097 seq_printf(seq, "\nCounters:\n---------\n");
8274a97a
AD
2098 seq_printf(seq, "gets = %u\n", s.gets);
2099 seq_printf(seq, "backtracks = %u\n", s.backtrack);
a07f5f50 2100 seq_printf(seq, "semantic match passed = %u\n",
8274a97a
AD
2101 s.semantic_match_passed);
2102 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2103 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2104 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
cb7b593c 2105}
66a2f7fd
SH
2106#endif /* CONFIG_IP_FIB_TRIE_STATS */
2107
3d3b2d25 2108static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2109{
3d3b2d25
SH
2110 if (tb->tb_id == RT_TABLE_LOCAL)
2111 seq_puts(seq, "Local:\n");
2112 else if (tb->tb_id == RT_TABLE_MAIN)
2113 seq_puts(seq, "Main:\n");
2114 else
2115 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2116}
19baf839 2117
3d3b2d25 2118
cb7b593c
SH
2119static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2120{
1c340b2f 2121 struct net *net = (struct net *)seq->private;
3d3b2d25 2122 unsigned int h;
877a9bff 2123
d717a9a6 2124 seq_printf(seq,
a07f5f50
SH
2125 "Basic info: size of leaf:"
2126 " %Zd bytes, size of tnode: %Zd bytes.\n",
adaf9816 2127 sizeof(struct tnode), sizeof(struct tnode));
d717a9a6 2128
3d3b2d25
SH
2129 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2130 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2131 struct fib_table *tb;
2132
b67bfe0d 2133 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2134 struct trie *t = (struct trie *) tb->tb_data;
2135 struct trie_stat stat;
877a9bff 2136
3d3b2d25
SH
2137 if (!t)
2138 continue;
2139
2140 fib_table_print(seq, tb);
2141
2142 trie_collect_stats(t, &stat);
2143 trie_show_stats(seq, &stat);
2144#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 2145 trie_show_usage(seq, t->stats);
3d3b2d25
SH
2146#endif
2147 }
2148 }
19baf839 2149
cb7b593c 2150 return 0;
19baf839
RO
2151}
2152
cb7b593c 2153static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2154{
de05c557 2155 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2156}
2157
9a32144e 2158static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2159 .owner = THIS_MODULE,
2160 .open = fib_triestat_seq_open,
2161 .read = seq_read,
2162 .llseek = seq_lseek,
b6fcbdb4 2163 .release = single_release_net,
cb7b593c
SH
2164};
2165
adaf9816 2166static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2167{
1218854a
YH
2168 struct fib_trie_iter *iter = seq->private;
2169 struct net *net = seq_file_net(seq);
cb7b593c 2170 loff_t idx = 0;
3d3b2d25 2171 unsigned int h;
cb7b593c 2172
3d3b2d25
SH
2173 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2174 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2175 struct fib_table *tb;
cb7b593c 2176
b67bfe0d 2177 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
adaf9816 2178 struct tnode *n;
3d3b2d25
SH
2179
2180 for (n = fib_trie_get_first(iter,
2181 (struct trie *) tb->tb_data);
2182 n; n = fib_trie_get_next(iter))
2183 if (pos == idx++) {
2184 iter->tb = tb;
2185 return n;
2186 }
2187 }
cb7b593c 2188 }
3d3b2d25 2189
19baf839
RO
2190 return NULL;
2191}
2192
cb7b593c 2193static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2194 __acquires(RCU)
19baf839 2195{
cb7b593c 2196 rcu_read_lock();
1218854a 2197 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2198}
2199
cb7b593c 2200static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2201{
cb7b593c 2202 struct fib_trie_iter *iter = seq->private;
1218854a 2203 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2204 struct fib_table *tb = iter->tb;
2205 struct hlist_node *tb_node;
2206 unsigned int h;
adaf9816 2207 struct tnode *n;
cb7b593c 2208
19baf839 2209 ++*pos;
3d3b2d25
SH
2210 /* next node in same table */
2211 n = fib_trie_get_next(iter);
2212 if (n)
2213 return n;
19baf839 2214
3d3b2d25
SH
2215 /* walk rest of this hash chain */
2216 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2217 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2218 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2219 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2220 if (n)
2221 goto found;
2222 }
19baf839 2223
3d3b2d25
SH
2224 /* new hash chain */
2225 while (++h < FIB_TABLE_HASHSZ) {
2226 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2227 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2228 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2229 if (n)
2230 goto found;
2231 }
2232 }
cb7b593c 2233 return NULL;
3d3b2d25
SH
2234
2235found:
2236 iter->tb = tb;
2237 return n;
cb7b593c 2238}
19baf839 2239
cb7b593c 2240static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2241 __releases(RCU)
19baf839 2242{
cb7b593c
SH
2243 rcu_read_unlock();
2244}
91b9a277 2245
cb7b593c
SH
2246static void seq_indent(struct seq_file *seq, int n)
2247{
a034ee3c
ED
2248 while (n-- > 0)
2249 seq_puts(seq, " ");
cb7b593c 2250}
19baf839 2251
28d36e37 2252static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2253{
132adf54 2254 switch (s) {
cb7b593c
SH
2255 case RT_SCOPE_UNIVERSE: return "universe";
2256 case RT_SCOPE_SITE: return "site";
2257 case RT_SCOPE_LINK: return "link";
2258 case RT_SCOPE_HOST: return "host";
2259 case RT_SCOPE_NOWHERE: return "nowhere";
2260 default:
28d36e37 2261 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2262 return buf;
2263 }
2264}
19baf839 2265
36cbd3dc 2266static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2267 [RTN_UNSPEC] = "UNSPEC",
2268 [RTN_UNICAST] = "UNICAST",
2269 [RTN_LOCAL] = "LOCAL",
2270 [RTN_BROADCAST] = "BROADCAST",
2271 [RTN_ANYCAST] = "ANYCAST",
2272 [RTN_MULTICAST] = "MULTICAST",
2273 [RTN_BLACKHOLE] = "BLACKHOLE",
2274 [RTN_UNREACHABLE] = "UNREACHABLE",
2275 [RTN_PROHIBIT] = "PROHIBIT",
2276 [RTN_THROW] = "THROW",
2277 [RTN_NAT] = "NAT",
2278 [RTN_XRESOLVE] = "XRESOLVE",
2279};
19baf839 2280
a034ee3c 2281static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2282{
cb7b593c
SH
2283 if (t < __RTN_MAX && rtn_type_names[t])
2284 return rtn_type_names[t];
28d36e37 2285 snprintf(buf, len, "type %u", t);
cb7b593c 2286 return buf;
19baf839
RO
2287}
2288
cb7b593c
SH
2289/* Pretty print the trie */
2290static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2291{
cb7b593c 2292 const struct fib_trie_iter *iter = seq->private;
adaf9816 2293 struct tnode *n = v;
c877efb2 2294
3d3b2d25
SH
2295 if (!node_parent_rcu(n))
2296 fib_table_print(seq, iter->tb);
095b8501 2297
cb7b593c 2298 if (IS_TNODE(n)) {
adaf9816 2299 __be32 prf = htonl(n->key);
91b9a277 2300
adaf9816 2301 seq_indent(seq, iter->depth - 1);
673d57e7 2302 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
adaf9816
AD
2303 &prf, n->pos, n->bits, n->full_children,
2304 n->empty_children);
cb7b593c 2305 } else {
1328042e 2306 struct leaf_info *li;
adaf9816 2307 __be32 val = htonl(n->key);
cb7b593c
SH
2308
2309 seq_indent(seq, iter->depth);
673d57e7 2310 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2311
adaf9816 2312 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1328042e
SH
2313 struct fib_alias *fa;
2314
2315 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2316 char buf1[32], buf2[32];
2317
2318 seq_indent(seq, iter->depth+1);
2319 seq_printf(seq, " /%d %s %s", li->plen,
2320 rtn_scope(buf1, sizeof(buf1),
37e826c5 2321 fa->fa_info->fib_scope),
1328042e
SH
2322 rtn_type(buf2, sizeof(buf2),
2323 fa->fa_type));
2324 if (fa->fa_tos)
b9c4d82a 2325 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2326 seq_putc(seq, '\n');
cb7b593c
SH
2327 }
2328 }
19baf839 2329 }
cb7b593c 2330
19baf839
RO
2331 return 0;
2332}
2333
f690808e 2334static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2335 .start = fib_trie_seq_start,
2336 .next = fib_trie_seq_next,
2337 .stop = fib_trie_seq_stop,
2338 .show = fib_trie_seq_show,
19baf839
RO
2339};
2340
cb7b593c 2341static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2342{
1c340b2f
DL
2343 return seq_open_net(inode, file, &fib_trie_seq_ops,
2344 sizeof(struct fib_trie_iter));
19baf839
RO
2345}
2346
9a32144e 2347static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2348 .owner = THIS_MODULE,
2349 .open = fib_trie_seq_open,
2350 .read = seq_read,
2351 .llseek = seq_lseek,
1c340b2f 2352 .release = seq_release_net,
19baf839
RO
2353};
2354
8315f5d8
SH
2355struct fib_route_iter {
2356 struct seq_net_private p;
2357 struct trie *main_trie;
2358 loff_t pos;
2359 t_key key;
2360};
2361
adaf9816 2362static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
8315f5d8 2363{
adaf9816 2364 struct tnode *l = NULL;
8315f5d8
SH
2365 struct trie *t = iter->main_trie;
2366
2367 /* use cache location of last found key */
2368 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2369 pos -= iter->pos;
2370 else {
2371 iter->pos = 0;
2372 l = trie_firstleaf(t);
2373 }
2374
2375 while (l && pos-- > 0) {
2376 iter->pos++;
2377 l = trie_nextleaf(l);
2378 }
2379
2380 if (l)
2381 iter->key = pos; /* remember it */
2382 else
2383 iter->pos = 0; /* forget it */
2384
2385 return l;
2386}
2387
2388static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2389 __acquires(RCU)
2390{
2391 struct fib_route_iter *iter = seq->private;
2392 struct fib_table *tb;
2393
2394 rcu_read_lock();
1218854a 2395 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2396 if (!tb)
2397 return NULL;
2398
2399 iter->main_trie = (struct trie *) tb->tb_data;
2400 if (*pos == 0)
2401 return SEQ_START_TOKEN;
2402 else
2403 return fib_route_get_idx(iter, *pos - 1);
2404}
2405
2406static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2407{
2408 struct fib_route_iter *iter = seq->private;
adaf9816 2409 struct tnode *l = v;
8315f5d8
SH
2410
2411 ++*pos;
2412 if (v == SEQ_START_TOKEN) {
2413 iter->pos = 0;
2414 l = trie_firstleaf(iter->main_trie);
2415 } else {
2416 iter->pos++;
2417 l = trie_nextleaf(l);
2418 }
2419
2420 if (l)
2421 iter->key = l->key;
2422 else
2423 iter->pos = 0;
2424 return l;
2425}
2426
2427static void fib_route_seq_stop(struct seq_file *seq, void *v)
2428 __releases(RCU)
2429{
2430 rcu_read_unlock();
2431}
2432
a034ee3c 2433static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2434{
a034ee3c 2435 unsigned int flags = 0;
19baf839 2436
a034ee3c
ED
2437 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2438 flags = RTF_REJECT;
cb7b593c
SH
2439 if (fi && fi->fib_nh->nh_gw)
2440 flags |= RTF_GATEWAY;
32ab5f80 2441 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2442 flags |= RTF_HOST;
2443 flags |= RTF_UP;
2444 return flags;
19baf839
RO
2445}
2446
cb7b593c
SH
2447/*
2448 * This outputs /proc/net/route.
2449 * The format of the file is not supposed to be changed
a034ee3c 2450 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2451 * legacy utilities
2452 */
2453static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2454{
adaf9816 2455 struct tnode *l = v;
1328042e 2456 struct leaf_info *li;
19baf839 2457
cb7b593c
SH
2458 if (v == SEQ_START_TOKEN) {
2459 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2460 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2461 "\tWindow\tIRTT");
2462 return 0;
2463 }
19baf839 2464
b67bfe0d 2465 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2466 struct fib_alias *fa;
32ab5f80 2467 __be32 mask, prefix;
91b9a277 2468
cb7b593c
SH
2469 mask = inet_make_mask(li->plen);
2470 prefix = htonl(l->key);
19baf839 2471
cb7b593c 2472 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2473 const struct fib_info *fi = fa->fa_info;
a034ee3c 2474 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2475
cb7b593c
SH
2476 if (fa->fa_type == RTN_BROADCAST
2477 || fa->fa_type == RTN_MULTICAST)
2478 continue;
19baf839 2479
652586df
TH
2480 seq_setwidth(seq, 127);
2481
cb7b593c 2482 if (fi)
5e659e4c
PE
2483 seq_printf(seq,
2484 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2485 "%d\t%08X\t%d\t%u\t%u",
cb7b593c
SH
2486 fi->fib_dev ? fi->fib_dev->name : "*",
2487 prefix,
2488 fi->fib_nh->nh_gw, flags, 0, 0,
2489 fi->fib_priority,
2490 mask,
a07f5f50
SH
2491 (fi->fib_advmss ?
2492 fi->fib_advmss + 40 : 0),
cb7b593c 2493 fi->fib_window,
652586df 2494 fi->fib_rtt >> 3);
cb7b593c 2495 else
5e659e4c
PE
2496 seq_printf(seq,
2497 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2498 "%d\t%08X\t%d\t%u\t%u",
cb7b593c 2499 prefix, 0, flags, 0, 0, 0,
652586df 2500 mask, 0, 0, 0);
19baf839 2501
652586df 2502 seq_pad(seq, '\n');
cb7b593c 2503 }
19baf839
RO
2504 }
2505
2506 return 0;
2507}
2508
f690808e 2509static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2510 .start = fib_route_seq_start,
2511 .next = fib_route_seq_next,
2512 .stop = fib_route_seq_stop,
cb7b593c 2513 .show = fib_route_seq_show,
19baf839
RO
2514};
2515
cb7b593c 2516static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2517{
1c340b2f 2518 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2519 sizeof(struct fib_route_iter));
19baf839
RO
2520}
2521
9a32144e 2522static const struct file_operations fib_route_fops = {
cb7b593c
SH
2523 .owner = THIS_MODULE,
2524 .open = fib_route_seq_open,
2525 .read = seq_read,
2526 .llseek = seq_lseek,
1c340b2f 2527 .release = seq_release_net,
19baf839
RO
2528};
2529
61a02653 2530int __net_init fib_proc_init(struct net *net)
19baf839 2531{
d4beaa66 2532 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2533 goto out1;
2534
d4beaa66
G
2535 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2536 &fib_triestat_fops))
cb7b593c
SH
2537 goto out2;
2538
d4beaa66 2539 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2540 goto out3;
2541
19baf839 2542 return 0;
cb7b593c
SH
2543
2544out3:
ece31ffd 2545 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2546out2:
ece31ffd 2547 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2548out1:
2549 return -ENOMEM;
19baf839
RO
2550}
2551
61a02653 2552void __net_exit fib_proc_exit(struct net *net)
19baf839 2553{
ece31ffd
G
2554 remove_proc_entry("fib_trie", net->proc_net);
2555 remove_proc_entry("fib_triestat", net->proc_net);
2556 remove_proc_entry("route", net->proc_net);
19baf839
RO
2557}
2558
2559#endif /* CONFIG_PROC_FS */
This page took 1.222391 seconds and 5 git commands to generate.