fib_trie: Optimize fib_table_lookup to avoid wasting time on loops/variables
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
bc3b2d7f 74#include <linux/export.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
64c9b6fb
AD
90#define IS_TNODE(n) ((n)->bits)
91#define IS_LEAF(n) (!(n)->bits)
2373ce1c 92
9f9e636d
AD
93#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
94#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
64c9b6fb
AD
96struct tnode {
97 t_key key;
98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 struct tnode __rcu *parent;
37fd30f2 101 struct rcu_head rcu;
adaf9816
AD
102 union {
103 /* The fields in this struct are valid if bits > 0 (TNODE) */
104 struct {
105 unsigned int full_children; /* KEYLENGTH bits needed */
106 unsigned int empty_children; /* KEYLENGTH bits needed */
107 struct tnode __rcu *child[0];
108 };
109 /* This list pointer if valid if bits == 0 (LEAF) */
110 struct hlist_head list;
111 };
19baf839
RO
112};
113
114struct leaf_info {
115 struct hlist_node hlist;
116 int plen;
5c74501f 117 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 118 struct list_head falh;
5c74501f 119 struct rcu_head rcu;
19baf839
RO
120};
121
19baf839
RO
122#ifdef CONFIG_IP_FIB_TRIE_STATS
123struct trie_use_stats {
124 unsigned int gets;
125 unsigned int backtrack;
126 unsigned int semantic_match_passed;
127 unsigned int semantic_match_miss;
128 unsigned int null_node_hit;
2f36895a 129 unsigned int resize_node_skipped;
19baf839
RO
130};
131#endif
132
133struct trie_stat {
134 unsigned int totdepth;
135 unsigned int maxdepth;
136 unsigned int tnodes;
137 unsigned int leaves;
138 unsigned int nullpointers;
93672292 139 unsigned int prefixes;
06ef921d 140 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 141};
19baf839
RO
142
143struct trie {
adaf9816 144 struct tnode __rcu *trie;
19baf839 145#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 146 struct trie_use_stats __percpu *stats;
19baf839 147#endif
19baf839
RO
148};
149
adaf9816 150static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 151 int wasfull);
adaf9816 152static struct tnode *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
153static struct tnode *inflate(struct trie *t, struct tnode *tn);
154static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c 155/* tnodes to free after resize(); protected by RTNL */
37fd30f2 156static struct callback_head *tnode_free_head;
c3059477
JP
157static size_t tnode_free_size;
158
159/*
160 * synchronize_rcu after call_rcu for that many pages; it should be especially
161 * useful before resizing the root node with PREEMPT_NONE configs; the value was
162 * obtained experimentally, aiming to avoid visible slowdown.
163 */
164static const int sync_pages = 128;
19baf839 165
e18b890b 166static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 167static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 168
64c9b6fb
AD
169/* caller must hold RTNL */
170#define node_parent(n) rtnl_dereference((n)->parent)
0a5c0475 171
64c9b6fb
AD
172/* caller must hold RCU read lock or RTNL */
173#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
0a5c0475 174
64c9b6fb 175/* wrapper for rcu_assign_pointer */
adaf9816 176static inline void node_set_parent(struct tnode *n, struct tnode *tp)
b59cfbf7 177{
adaf9816
AD
178 if (n)
179 rcu_assign_pointer(n->parent, tp);
06801916
SH
180}
181
64c9b6fb
AD
182#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
184/* This provides us with the number of children in this node, in the case of a
185 * leaf this will return 0 meaning none of the children are accessible.
6440cc9e 186 */
64c9b6fb 187static inline int tnode_child_length(const struct tnode *tn)
06801916 188{
64c9b6fb 189 return (1ul << tn->bits) & ~(1ul);
06801916 190}
2373ce1c 191
0a5c0475
ED
192/*
193 * caller must hold RTNL
194 */
adaf9816 195static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7 196{
64c9b6fb 197 BUG_ON(i >= tnode_child_length(tn));
2373ce1c 198
0a5c0475 199 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
200}
201
0a5c0475
ED
202/*
203 * caller must hold RCU read lock or RTNL
204 */
adaf9816 205static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 206{
64c9b6fb 207 BUG_ON(i >= tnode_child_length(tn));
19baf839 208
0a5c0475 209 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
210}
211
3b004569 212static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
213{
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215}
216
3b004569 217static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 218{
91b9a277 219 if (offset < KEYLENGTH)
19baf839 220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 221 else
19baf839
RO
222 return 0;
223}
224
225static inline int tkey_equals(t_key a, t_key b)
226{
c877efb2 227 return a == b;
19baf839
RO
228}
229
230static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231{
c877efb2
SH
232 if (bits == 0 || offset >= KEYLENGTH)
233 return 1;
91b9a277
OJ
234 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
235 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 236}
19baf839
RO
237
238static inline int tkey_mismatch(t_key a, int offset, t_key b)
239{
240 t_key diff = a ^ b;
241 int i = offset;
242
c877efb2
SH
243 if (!diff)
244 return 0;
245 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
246 i++;
247 return i;
248}
249
19baf839 250/*
e905a9ed
YH
251 To understand this stuff, an understanding of keys and all their bits is
252 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
253 all of the bits in that key are significant.
254
255 Consider a node 'n' and its parent 'tp'.
256
e905a9ed
YH
257 If n is a leaf, every bit in its key is significant. Its presence is
258 necessitated by path compression, since during a tree traversal (when
259 searching for a leaf - unless we are doing an insertion) we will completely
260 ignore all skipped bits we encounter. Thus we need to verify, at the end of
261 a potentially successful search, that we have indeed been walking the
19baf839
RO
262 correct key path.
263
e905a9ed
YH
264 Note that we can never "miss" the correct key in the tree if present by
265 following the wrong path. Path compression ensures that segments of the key
266 that are the same for all keys with a given prefix are skipped, but the
267 skipped part *is* identical for each node in the subtrie below the skipped
268 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
269 call to tkey_sub_equals() in trie_insert().
270
e905a9ed 271 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
272 have many different meanings.
273
e905a9ed 274 Example:
19baf839
RO
275 _________________________________________________________________
276 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
277 -----------------------------------------------------------------
e905a9ed 278 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
279
280 _________________________________________________________________
281 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
282 -----------------------------------------------------------------
283 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
284
285 tp->pos = 7
286 tp->bits = 3
287 n->pos = 15
91b9a277 288 n->bits = 4
19baf839 289
e905a9ed
YH
290 First, let's just ignore the bits that come before the parent tp, that is
291 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
292 not use them for anything.
293
294 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 295 index into the parent's child array. That is, they will be used to find
19baf839
RO
296 'n' among tp's children.
297
298 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
299 for the node n.
300
e905a9ed 301 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
302 of the bits are really not needed or indeed known in n->key.
303
e905a9ed 304 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 305 n's child array, and will of course be different for each child.
e905a9ed 306
c877efb2 307
19baf839
RO
308 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
309 at this point.
310
311*/
312
f5026fab
DL
313static const int halve_threshold = 25;
314static const int inflate_threshold = 50;
345aa031 315static const int halve_threshold_root = 15;
80b71b80 316static const int inflate_threshold_root = 30;
2373ce1c
RO
317
318static void __alias_free_mem(struct rcu_head *head)
19baf839 319{
2373ce1c
RO
320 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
321 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
322}
323
2373ce1c 324static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 325{
2373ce1c
RO
326 call_rcu(&fa->rcu, __alias_free_mem);
327}
91b9a277 328
37fd30f2 329#define TNODE_KMALLOC_MAX \
adaf9816 330 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
91b9a277 331
37fd30f2 332static void __node_free_rcu(struct rcu_head *head)
387a5487 333{
adaf9816 334 struct tnode *n = container_of(head, struct tnode, rcu);
37fd30f2
AD
335
336 if (IS_LEAF(n))
337 kmem_cache_free(trie_leaf_kmem, n);
338 else if (n->bits <= TNODE_KMALLOC_MAX)
339 kfree(n);
340 else
341 vfree(n);
387a5487
SH
342}
343
37fd30f2
AD
344#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
345
2373ce1c 346static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 347{
bceb0f45 348 kfree_rcu(leaf, rcu);
19baf839
RO
349}
350
8d965444 351static struct tnode *tnode_alloc(size_t size)
f0e36f8c 352{
2373ce1c 353 if (size <= PAGE_SIZE)
8d965444 354 return kzalloc(size, GFP_KERNEL);
15be75cd 355 else
7a1c8e5a 356 return vzalloc(size);
15be75cd 357}
2373ce1c 358
e0f7cb8c
JP
359static void tnode_free_safe(struct tnode *tn)
360{
361 BUG_ON(IS_LEAF(tn));
37fd30f2
AD
362 tn->rcu.next = tnode_free_head;
363 tnode_free_head = &tn->rcu;
e0f7cb8c
JP
364}
365
366static void tnode_free_flush(void)
367{
37fd30f2
AD
368 struct callback_head *head;
369
370 while ((head = tnode_free_head)) {
371 struct tnode *tn = container_of(head, struct tnode, rcu);
372
373 tnode_free_head = head->next;
374 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
e0f7cb8c 375
37fd30f2 376 node_free(tn);
e0f7cb8c 377 }
c3059477
JP
378
379 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
380 tnode_free_size = 0;
381 synchronize_rcu();
382 }
e0f7cb8c
JP
383}
384
adaf9816 385static struct tnode *leaf_new(t_key key)
2373ce1c 386{
adaf9816 387 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c 388 if (l) {
64c9b6fb
AD
389 l->parent = NULL;
390 /* set key and pos to reflect full key value
391 * any trailing zeros in the key should be ignored
392 * as the nodes are searched
393 */
394 l->key = key;
395 l->pos = KEYLENGTH;
396 /* set bits to 0 indicating we are not a tnode */
397 l->bits = 0;
398
2373ce1c
RO
399 INIT_HLIST_HEAD(&l->list);
400 }
401 return l;
402}
403
404static struct leaf_info *leaf_info_new(int plen)
405{
406 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
407 if (li) {
408 li->plen = plen;
5c74501f 409 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
410 INIT_LIST_HEAD(&li->falh);
411 }
412 return li;
413}
414
a07f5f50 415static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 416{
37fd30f2 417 size_t sz = offsetof(struct tnode, child[1 << bits]);
f0e36f8c 418 struct tnode *tn = tnode_alloc(sz);
64c9b6fb
AD
419 unsigned int shift = pos + bits;
420
421 /* verify bits and pos their msb bits clear and values are valid */
422 BUG_ON(!bits || (shift > KEYLENGTH));
19baf839 423
91b9a277 424 if (tn) {
64c9b6fb 425 tn->parent = NULL;
19baf839
RO
426 tn->pos = pos;
427 tn->bits = bits;
64c9b6fb 428 tn->key = mask_pfx(key, pos);
19baf839
RO
429 tn->full_children = 0;
430 tn->empty_children = 1<<bits;
431 }
c877efb2 432
a034ee3c 433 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
adaf9816 434 sizeof(struct tnode *) << bits);
19baf839
RO
435 return tn;
436}
437
19baf839
RO
438/*
439 * Check whether a tnode 'n' is "full", i.e. it is an internal node
440 * and no bits are skipped. See discussion in dyntree paper p. 6
441 */
442
adaf9816 443static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
19baf839 444{
64c9b6fb 445 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
19baf839
RO
446}
447
61648d91 448static inline void put_child(struct tnode *tn, int i,
adaf9816 449 struct tnode *n)
19baf839
RO
450{
451 tnode_put_child_reorg(tn, i, n, -1);
452}
453
c877efb2 454 /*
19baf839
RO
455 * Add a child at position i overwriting the old value.
456 * Update the value of full_children and empty_children.
457 */
458
adaf9816 459static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 460 int wasfull)
19baf839 461{
adaf9816 462 struct tnode *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
463 int isfull;
464
0c7770c7
SH
465 BUG_ON(i >= 1<<tn->bits);
466
19baf839
RO
467 /* update emptyChildren */
468 if (n == NULL && chi != NULL)
469 tn->empty_children++;
470 else if (n != NULL && chi == NULL)
471 tn->empty_children--;
c877efb2 472
19baf839 473 /* update fullChildren */
91b9a277 474 if (wasfull == -1)
19baf839
RO
475 wasfull = tnode_full(tn, chi);
476
477 isfull = tnode_full(tn, n);
c877efb2 478 if (wasfull && !isfull)
19baf839 479 tn->full_children--;
c877efb2 480 else if (!wasfull && isfull)
19baf839 481 tn->full_children++;
91b9a277 482
64c9b6fb 483 node_set_parent(n, tn);
19baf839 484
cf778b00 485 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
486}
487
80b71b80 488#define MAX_WORK 10
adaf9816 489static struct tnode *resize(struct trie *t, struct tnode *tn)
19baf839 490{
adaf9816 491 struct tnode *old_tn, *n = NULL;
e6308be8
RO
492 int inflate_threshold_use;
493 int halve_threshold_use;
80b71b80 494 int max_work;
19baf839 495
e905a9ed 496 if (!tn)
19baf839
RO
497 return NULL;
498
0c7770c7
SH
499 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
500 tn, inflate_threshold, halve_threshold);
19baf839
RO
501
502 /* No children */
64c9b6fb
AD
503 if (tn->empty_children > (tnode_child_length(tn) - 1))
504 goto no_children;
505
19baf839 506 /* One child */
64c9b6fb 507 if (tn->empty_children == (tnode_child_length(tn) - 1))
80b71b80 508 goto one_child;
c877efb2 509 /*
19baf839
RO
510 * Double as long as the resulting node has a number of
511 * nonempty nodes that are above the threshold.
512 */
513
514 /*
c877efb2
SH
515 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
516 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 517 * Telecommunications, page 6:
c877efb2 518 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
519 * children in the *doubled* node is at least 'high'."
520 *
c877efb2
SH
521 * 'high' in this instance is the variable 'inflate_threshold'. It
522 * is expressed as a percentage, so we multiply it with
523 * tnode_child_length() and instead of multiplying by 2 (since the
524 * child array will be doubled by inflate()) and multiplying
525 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 526 * multiply the left-hand side by 50.
c877efb2
SH
527 *
528 * The left-hand side may look a bit weird: tnode_child_length(tn)
529 * - tn->empty_children is of course the number of non-null children
530 * in the current node. tn->full_children is the number of "full"
19baf839 531 * children, that is non-null tnodes with a skip value of 0.
c877efb2 532 * All of those will be doubled in the resulting inflated tnode, so
19baf839 533 * we just count them one extra time here.
c877efb2 534 *
19baf839 535 * A clearer way to write this would be:
c877efb2 536 *
19baf839 537 * to_be_doubled = tn->full_children;
c877efb2 538 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
539 * tn->full_children;
540 *
541 * new_child_length = tnode_child_length(tn) * 2;
542 *
c877efb2 543 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
544 * new_child_length;
545 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
546 *
547 * ...and so on, tho it would mess up the while () loop.
548 *
19baf839
RO
549 * anyway,
550 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
551 * inflate_threshold
c877efb2 552 *
19baf839
RO
553 * avoid a division:
554 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
555 * inflate_threshold * new_child_length
c877efb2 556 *
19baf839 557 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 558 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 559 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 560 *
19baf839 561 * expand new_child_length:
c877efb2 562 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 563 * tn->full_children) >=
19baf839 564 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 565 *
19baf839 566 * shorten again:
c877efb2 567 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 568 * tn->empty_children) >= inflate_threshold *
19baf839 569 * tnode_child_length(tn)
c877efb2 570 *
19baf839
RO
571 */
572
e6308be8
RO
573 /* Keep root node larger */
574
64c9b6fb 575 if (!node_parent(tn)) {
80b71b80
JL
576 inflate_threshold_use = inflate_threshold_root;
577 halve_threshold_use = halve_threshold_root;
a034ee3c 578 } else {
e6308be8 579 inflate_threshold_use = inflate_threshold;
80b71b80
JL
580 halve_threshold_use = halve_threshold;
581 }
e6308be8 582
80b71b80
JL
583 max_work = MAX_WORK;
584 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
585 50 * (tn->full_children + tnode_child_length(tn)
586 - tn->empty_children)
587 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 588
2f80b3c8
RO
589 old_tn = tn;
590 tn = inflate(t, tn);
a07f5f50 591
2f80b3c8
RO
592 if (IS_ERR(tn)) {
593 tn = old_tn;
2f36895a 594#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 595 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
596#endif
597 break;
598 }
19baf839
RO
599 }
600
80b71b80 601 /* Return if at least one inflate is run */
a034ee3c 602 if (max_work != MAX_WORK)
adaf9816 603 return tn;
80b71b80 604
19baf839
RO
605 /*
606 * Halve as long as the number of empty children in this
607 * node is above threshold.
608 */
2f36895a 609
80b71b80
JL
610 max_work = MAX_WORK;
611 while (tn->bits > 1 && max_work-- &&
19baf839 612 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 613 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 614
2f80b3c8
RO
615 old_tn = tn;
616 tn = halve(t, tn);
617 if (IS_ERR(tn)) {
618 tn = old_tn;
2f36895a 619#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 620 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
621#endif
622 break;
623 }
624 }
19baf839 625
c877efb2 626
19baf839 627 /* Only one child remains */
64c9b6fb
AD
628 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
629 unsigned long i;
80b71b80 630one_child:
64c9b6fb
AD
631 for (i = tnode_child_length(tn); !n && i;)
632 n = tnode_get_child(tn, --i);
633no_children:
634 /* compress one level */
635 node_set_parent(n, NULL);
636 tnode_free_safe(tn);
637 return n;
80b71b80 638 }
adaf9816 639 return tn;
19baf839
RO
640}
641
0a5c0475
ED
642
643static void tnode_clean_free(struct tnode *tn)
644{
adaf9816 645 struct tnode *tofree;
0a5c0475 646 int i;
0a5c0475
ED
647
648 for (i = 0; i < tnode_child_length(tn); i++) {
37fd30f2 649 tofree = rtnl_dereference(tn->child[i]);
0a5c0475 650 if (tofree)
37fd30f2 651 node_free(tofree);
0a5c0475 652 }
37fd30f2 653 node_free(tn);
0a5c0475
ED
654}
655
adaf9816 656static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
19baf839 657{
adaf9816
AD
658 int olen = tnode_child_length(oldtnode);
659 struct tnode *tn;
19baf839
RO
660 int i;
661
0c7770c7 662 pr_debug("In inflate\n");
19baf839
RO
663
664 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
665
0c7770c7 666 if (!tn)
2f80b3c8 667 return ERR_PTR(-ENOMEM);
2f36895a
RO
668
669 /*
c877efb2
SH
670 * Preallocate and store tnodes before the actual work so we
671 * don't get into an inconsistent state if memory allocation
672 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
673 * of tnode is ignored.
674 */
91b9a277
OJ
675
676 for (i = 0; i < olen; i++) {
a07f5f50 677 struct tnode *inode;
2f36895a 678
adaf9816
AD
679 inode = tnode_get_child(oldtnode, i);
680 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
2f36895a 681 struct tnode *left, *right;
ab66b4a7 682 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 683
2f36895a
RO
684 left = tnode_new(inode->key&(~m), inode->pos + 1,
685 inode->bits - 1);
2f80b3c8
RO
686 if (!left)
687 goto nomem;
91b9a277 688
2f36895a
RO
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
e905a9ed 692 if (!right) {
37fd30f2 693 node_free(left);
2f80b3c8 694 goto nomem;
e905a9ed 695 }
2f36895a 696
adaf9816
AD
697 put_child(tn, 2*i, left);
698 put_child(tn, 2*i+1, right);
2f36895a
RO
699 }
700 }
701
91b9a277 702 for (i = 0; i < olen; i++) {
adaf9816 703 struct tnode *inode = tnode_get_child(oldtnode, i);
91b9a277
OJ
704 struct tnode *left, *right;
705 int size, j;
c877efb2 706
19baf839 707 /* An empty child */
adaf9816 708 if (inode == NULL)
19baf839
RO
709 continue;
710
711 /* A leaf or an internal node with skipped bits */
adaf9816 712 if (!tnode_full(oldtnode, inode)) {
bbe34cf8 713 put_child(tn,
adaf9816
AD
714 tkey_extract_bits(inode->key, tn->pos, tn->bits),
715 inode);
19baf839
RO
716 continue;
717 }
718
719 /* An internal node with two children */
19baf839 720 if (inode->bits == 1) {
61648d91
LM
721 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
722 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 723
e0f7cb8c 724 tnode_free_safe(inode);
91b9a277 725 continue;
19baf839
RO
726 }
727
91b9a277
OJ
728 /* An internal node with more than two children */
729
730 /* We will replace this node 'inode' with two new
731 * ones, 'left' and 'right', each with half of the
732 * original children. The two new nodes will have
733 * a position one bit further down the key and this
734 * means that the "significant" part of their keys
735 * (see the discussion near the top of this file)
736 * will differ by one bit, which will be "0" in
737 * left's key and "1" in right's key. Since we are
738 * moving the key position by one step, the bit that
739 * we are moving away from - the bit at position
740 * (inode->pos) - is the one that will differ between
741 * left and right. So... we synthesize that bit in the
742 * two new keys.
743 * The mask 'm' below will be a single "one" bit at
744 * the position (inode->pos)
745 */
19baf839 746
91b9a277
OJ
747 /* Use the old key, but set the new significant
748 * bit to zero.
749 */
2f36895a 750
adaf9816 751 left = tnode_get_child(tn, 2*i);
61648d91 752 put_child(tn, 2*i, NULL);
2f36895a 753
91b9a277 754 BUG_ON(!left);
2f36895a 755
adaf9816 756 right = tnode_get_child(tn, 2*i+1);
61648d91 757 put_child(tn, 2*i+1, NULL);
19baf839 758
91b9a277 759 BUG_ON(!right);
19baf839 760
91b9a277
OJ
761 size = tnode_child_length(left);
762 for (j = 0; j < size; j++) {
61648d91
LM
763 put_child(left, j, rtnl_dereference(inode->child[j]));
764 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 765 }
61648d91
LM
766 put_child(tn, 2*i, resize(t, left));
767 put_child(tn, 2*i+1, resize(t, right));
91b9a277 768
e0f7cb8c 769 tnode_free_safe(inode);
19baf839 770 }
e0f7cb8c 771 tnode_free_safe(oldtnode);
19baf839 772 return tn;
2f80b3c8 773nomem:
0a5c0475
ED
774 tnode_clean_free(tn);
775 return ERR_PTR(-ENOMEM);
19baf839
RO
776}
777
adaf9816 778static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
19baf839 779{
adaf9816
AD
780 int olen = tnode_child_length(oldtnode);
781 struct tnode *tn, *left, *right;
19baf839 782 int i;
19baf839 783
0c7770c7 784 pr_debug("In halve\n");
c877efb2
SH
785
786 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 787
2f80b3c8
RO
788 if (!tn)
789 return ERR_PTR(-ENOMEM);
2f36895a
RO
790
791 /*
c877efb2
SH
792 * Preallocate and store tnodes before the actual work so we
793 * don't get into an inconsistent state if memory allocation
794 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
795 * of tnode is ignored.
796 */
797
91b9a277 798 for (i = 0; i < olen; i += 2) {
2f36895a
RO
799 left = tnode_get_child(oldtnode, i);
800 right = tnode_get_child(oldtnode, i+1);
c877efb2 801
2f36895a 802 /* Two nonempty children */
0c7770c7 803 if (left && right) {
2f80b3c8 804 struct tnode *newn;
0c7770c7 805
2f80b3c8 806 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
807
808 if (!newn)
2f80b3c8 809 goto nomem;
0c7770c7 810
adaf9816 811 put_child(tn, i/2, newn);
2f36895a 812 }
2f36895a 813
2f36895a 814 }
19baf839 815
91b9a277
OJ
816 for (i = 0; i < olen; i += 2) {
817 struct tnode *newBinNode;
818
19baf839
RO
819 left = tnode_get_child(oldtnode, i);
820 right = tnode_get_child(oldtnode, i+1);
c877efb2 821
19baf839
RO
822 /* At least one of the children is empty */
823 if (left == NULL) {
824 if (right == NULL) /* Both are empty */
825 continue;
61648d91 826 put_child(tn, i/2, right);
91b9a277 827 continue;
0c7770c7 828 }
91b9a277
OJ
829
830 if (right == NULL) {
61648d91 831 put_child(tn, i/2, left);
91b9a277
OJ
832 continue;
833 }
c877efb2 834
19baf839 835 /* Two nonempty children */
adaf9816 836 newBinNode = tnode_get_child(tn, i/2);
61648d91
LM
837 put_child(tn, i/2, NULL);
838 put_child(newBinNode, 0, left);
839 put_child(newBinNode, 1, right);
840 put_child(tn, i/2, resize(t, newBinNode));
19baf839 841 }
e0f7cb8c 842 tnode_free_safe(oldtnode);
19baf839 843 return tn;
2f80b3c8 844nomem:
0a5c0475
ED
845 tnode_clean_free(tn);
846 return ERR_PTR(-ENOMEM);
19baf839
RO
847}
848
772cb712 849/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
850 via get_fa_head and dump */
851
adaf9816 852static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
19baf839 853{
772cb712 854 struct hlist_head *head = &l->list;
19baf839
RO
855 struct leaf_info *li;
856
b67bfe0d 857 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 858 if (li->plen == plen)
19baf839 859 return li;
91b9a277 860
19baf839
RO
861 return NULL;
862}
863
adaf9816 864static inline struct list_head *get_fa_head(struct tnode *l, int plen)
19baf839 865{
772cb712 866 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 867
91b9a277
OJ
868 if (!li)
869 return NULL;
c877efb2 870
91b9a277 871 return &li->falh;
19baf839
RO
872}
873
874static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
875{
e905a9ed 876 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
877
878 if (hlist_empty(head)) {
879 hlist_add_head_rcu(&new->hlist, head);
880 } else {
b67bfe0d 881 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
882 if (new->plen > li->plen)
883 break;
884
885 last = li;
886 }
887 if (last)
1d023284 888 hlist_add_behind_rcu(&new->hlist, &last->hlist);
e905a9ed
YH
889 else
890 hlist_add_before_rcu(&new->hlist, &li->hlist);
891 }
19baf839
RO
892}
893
2373ce1c
RO
894/* rcu_read_lock needs to be hold by caller from readside */
895
adaf9816 896static struct tnode *fib_find_node(struct trie *t, u32 key)
19baf839 897{
adaf9816
AD
898 struct tnode *n = rcu_dereference_rtnl(t->trie);
899 int pos = 0;
19baf839 900
64c9b6fb 901 while (n && IS_TNODE(n)) {
adaf9816
AD
902 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
903 pos = n->pos + n->bits;
904 n = tnode_get_child_rcu(n,
a07f5f50 905 tkey_extract_bits(key,
adaf9816
AD
906 n->pos,
907 n->bits));
91b9a277 908 } else
19baf839
RO
909 break;
910 }
911 /* Case we have found a leaf. Compare prefixes */
912
91b9a277 913 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
adaf9816 914 return n;
91b9a277 915
19baf839
RO
916 return NULL;
917}
918
7b85576d 919static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 920{
19baf839 921 int wasfull;
3ed18d76 922 t_key cindex, key;
06801916 923 struct tnode *tp;
19baf839 924
3ed18d76
RO
925 key = tn->key;
926
64c9b6fb 927 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
19baf839
RO
928 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
929 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
adaf9816 930 tn = resize(t, tn);
a07f5f50 931
adaf9816 932 tnode_put_child_reorg(tp, cindex, tn, wasfull);
91b9a277 933
64c9b6fb 934 tp = node_parent(tn);
008440e3 935 if (!tp)
adaf9816 936 rcu_assign_pointer(t->trie, tn);
008440e3 937
e0f7cb8c 938 tnode_free_flush();
06801916 939 if (!tp)
19baf839 940 break;
06801916 941 tn = tp;
19baf839 942 }
06801916 943
19baf839 944 /* Handle last (top) tnode */
7b85576d 945 if (IS_TNODE(tn))
adaf9816 946 tn = resize(t, tn);
19baf839 947
adaf9816 948 rcu_assign_pointer(t->trie, tn);
7b85576d 949 tnode_free_flush();
19baf839
RO
950}
951
2373ce1c
RO
952/* only used from updater-side */
953
fea86ad8 954static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
955{
956 int pos, newpos;
957 struct tnode *tp = NULL, *tn = NULL;
adaf9816
AD
958 struct tnode *n;
959 struct tnode *l;
19baf839 960 int missbit;
c877efb2 961 struct list_head *fa_head = NULL;
19baf839
RO
962 struct leaf_info *li;
963 t_key cindex;
964
965 pos = 0;
0a5c0475 966 n = rtnl_dereference(t->trie);
19baf839 967
c877efb2
SH
968 /* If we point to NULL, stop. Either the tree is empty and we should
969 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 970 * and we should just put our new leaf in that.
c877efb2
SH
971 * If we point to a T_TNODE, check if it matches our key. Note that
972 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
973 * not be the parent's 'pos'+'bits'!
974 *
c877efb2 975 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
976 * the index from our key, push the T_TNODE and walk the tree.
977 *
978 * If it doesn't, we have to replace it with a new T_TNODE.
979 *
c877efb2
SH
980 * If we point to a T_LEAF, it might or might not have the same key
981 * as we do. If it does, just change the value, update the T_LEAF's
982 * value, and return it.
19baf839
RO
983 * If it doesn't, we need to replace it with a T_TNODE.
984 */
985
64c9b6fb 986 while (n && IS_TNODE(n)) {
adaf9816
AD
987 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
988 tp = n;
989 pos = n->pos + n->bits;
990 n = tnode_get_child(n,
a07f5f50 991 tkey_extract_bits(key,
adaf9816
AD
992 n->pos,
993 n->bits));
19baf839 994
adaf9816 995 BUG_ON(n && node_parent(n) != tp);
91b9a277 996 } else
19baf839
RO
997 break;
998 }
999
1000 /*
1001 * n ----> NULL, LEAF or TNODE
1002 *
c877efb2 1003 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1004 */
1005
91b9a277 1006 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1007
1008 /* Case 1: n is a leaf. Compare prefixes */
1009
c877efb2 1010 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
19baf839 1011 li = leaf_info_new(plen);
91b9a277 1012
fea86ad8
SH
1013 if (!li)
1014 return NULL;
19baf839
RO
1015
1016 fa_head = &li->falh;
adaf9816 1017 insert_leaf_info(&n->list, li);
19baf839
RO
1018 goto done;
1019 }
64c9b6fb 1020 l = leaf_new(key);
19baf839 1021
fea86ad8
SH
1022 if (!l)
1023 return NULL;
19baf839 1024
19baf839
RO
1025 li = leaf_info_new(plen);
1026
c877efb2 1027 if (!li) {
37fd30f2 1028 node_free(l);
fea86ad8 1029 return NULL;
f835e471 1030 }
19baf839
RO
1031
1032 fa_head = &li->falh;
1033 insert_leaf_info(&l->list, li);
1034
19baf839 1035 if (t->trie && n == NULL) {
91b9a277 1036 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1037
adaf9816 1038 node_set_parent(l, tp);
19baf839 1039
91b9a277 1040 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1041 put_child(tp, cindex, l);
91b9a277
OJ
1042 } else {
1043 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1044 /*
1045 * Add a new tnode here
19baf839
RO
1046 * first tnode need some special handling
1047 */
1048
c877efb2 1049 if (n) {
4c60f1d6 1050 pos = tp ? tp->pos+tp->bits : 0;
19baf839
RO
1051 newpos = tkey_mismatch(key, pos, n->key);
1052 tn = tnode_new(n->key, newpos, 1);
91b9a277 1053 } else {
19baf839 1054 newpos = 0;
c877efb2 1055 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1056 }
19baf839 1057
c877efb2 1058 if (!tn) {
f835e471 1059 free_leaf_info(li);
37fd30f2 1060 node_free(l);
fea86ad8 1061 return NULL;
91b9a277
OJ
1062 }
1063
adaf9816 1064 node_set_parent(tn, tp);
19baf839 1065
91b9a277 1066 missbit = tkey_extract_bits(key, newpos, 1);
adaf9816 1067 put_child(tn, missbit, l);
61648d91 1068 put_child(tn, 1-missbit, n);
19baf839 1069
c877efb2 1070 if (tp) {
19baf839 1071 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1072 put_child(tp, cindex, tn);
91b9a277 1073 } else {
adaf9816 1074 rcu_assign_pointer(t->trie, tn);
19baf839 1075 }
e962f302
AD
1076
1077 tp = tn;
19baf839 1078 }
91b9a277
OJ
1079
1080 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1081 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1082 tp, tp->pos, tp->bits, key, plen);
91b9a277 1083
19baf839 1084 /* Rebalance the trie */
2373ce1c 1085
7b85576d 1086 trie_rebalance(t, tp);
f835e471 1087done:
19baf839
RO
1088 return fa_head;
1089}
1090
d562f1f8
RO
1091/*
1092 * Caller must hold RTNL.
1093 */
16c6cf8b 1094int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1095{
1096 struct trie *t = (struct trie *) tb->tb_data;
1097 struct fib_alias *fa, *new_fa;
c877efb2 1098 struct list_head *fa_head = NULL;
19baf839 1099 struct fib_info *fi;
4e902c57
TG
1100 int plen = cfg->fc_dst_len;
1101 u8 tos = cfg->fc_tos;
19baf839
RO
1102 u32 key, mask;
1103 int err;
adaf9816 1104 struct tnode *l;
19baf839
RO
1105
1106 if (plen > 32)
1107 return -EINVAL;
1108
4e902c57 1109 key = ntohl(cfg->fc_dst);
19baf839 1110
2dfe55b4 1111 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1112
91b9a277 1113 mask = ntohl(inet_make_mask(plen));
19baf839 1114
c877efb2 1115 if (key & ~mask)
19baf839
RO
1116 return -EINVAL;
1117
1118 key = key & mask;
1119
4e902c57
TG
1120 fi = fib_create_info(cfg);
1121 if (IS_ERR(fi)) {
1122 err = PTR_ERR(fi);
19baf839 1123 goto err;
4e902c57 1124 }
19baf839
RO
1125
1126 l = fib_find_node(t, key);
c877efb2 1127 fa = NULL;
19baf839 1128
c877efb2 1129 if (l) {
19baf839
RO
1130 fa_head = get_fa_head(l, plen);
1131 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1132 }
1133
1134 /* Now fa, if non-NULL, points to the first fib alias
1135 * with the same keys [prefix,tos,priority], if such key already
1136 * exists or to the node before which we will insert new one.
1137 *
1138 * If fa is NULL, we will need to allocate a new one and
1139 * insert to the head of f.
1140 *
1141 * If f is NULL, no fib node matched the destination key
1142 * and we need to allocate a new one of those as well.
1143 */
1144
936f6f8e
JA
1145 if (fa && fa->fa_tos == tos &&
1146 fa->fa_info->fib_priority == fi->fib_priority) {
1147 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1148
1149 err = -EEXIST;
4e902c57 1150 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1151 goto out;
1152
936f6f8e
JA
1153 /* We have 2 goals:
1154 * 1. Find exact match for type, scope, fib_info to avoid
1155 * duplicate routes
1156 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1157 */
1158 fa_match = NULL;
1159 fa_first = fa;
1160 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1161 list_for_each_entry_continue(fa, fa_head, fa_list) {
1162 if (fa->fa_tos != tos)
1163 break;
1164 if (fa->fa_info->fib_priority != fi->fib_priority)
1165 break;
1166 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1167 fa->fa_info == fi) {
1168 fa_match = fa;
1169 break;
1170 }
1171 }
1172
4e902c57 1173 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1174 struct fib_info *fi_drop;
1175 u8 state;
1176
936f6f8e
JA
1177 fa = fa_first;
1178 if (fa_match) {
1179 if (fa == fa_match)
1180 err = 0;
6725033f 1181 goto out;
936f6f8e 1182 }
2373ce1c 1183 err = -ENOBUFS;
e94b1766 1184 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1185 if (new_fa == NULL)
1186 goto out;
19baf839
RO
1187
1188 fi_drop = fa->fa_info;
2373ce1c
RO
1189 new_fa->fa_tos = fa->fa_tos;
1190 new_fa->fa_info = fi;
4e902c57 1191 new_fa->fa_type = cfg->fc_type;
19baf839 1192 state = fa->fa_state;
936f6f8e 1193 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1194
2373ce1c
RO
1195 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1196 alias_free_mem_rcu(fa);
19baf839
RO
1197
1198 fib_release_info(fi_drop);
1199 if (state & FA_S_ACCESSED)
4ccfe6d4 1200 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1201 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1202 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1203
91b9a277 1204 goto succeeded;
19baf839
RO
1205 }
1206 /* Error if we find a perfect match which
1207 * uses the same scope, type, and nexthop
1208 * information.
1209 */
936f6f8e
JA
1210 if (fa_match)
1211 goto out;
a07f5f50 1212
4e902c57 1213 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1214 fa = fa_first;
19baf839
RO
1215 }
1216 err = -ENOENT;
4e902c57 1217 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1218 goto out;
1219
1220 err = -ENOBUFS;
e94b1766 1221 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1222 if (new_fa == NULL)
1223 goto out;
1224
1225 new_fa->fa_info = fi;
1226 new_fa->fa_tos = tos;
4e902c57 1227 new_fa->fa_type = cfg->fc_type;
19baf839 1228 new_fa->fa_state = 0;
19baf839
RO
1229 /*
1230 * Insert new entry to the list.
1231 */
1232
c877efb2 1233 if (!fa_head) {
fea86ad8
SH
1234 fa_head = fib_insert_node(t, key, plen);
1235 if (unlikely(!fa_head)) {
1236 err = -ENOMEM;
f835e471 1237 goto out_free_new_fa;
fea86ad8 1238 }
f835e471 1239 }
19baf839 1240
21d8c49e
DM
1241 if (!plen)
1242 tb->tb_num_default++;
1243
2373ce1c
RO
1244 list_add_tail_rcu(&new_fa->fa_list,
1245 (fa ? &fa->fa_list : fa_head));
19baf839 1246
4ccfe6d4 1247 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1248 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1249 &cfg->fc_nlinfo, 0);
19baf839
RO
1250succeeded:
1251 return 0;
f835e471
RO
1252
1253out_free_new_fa:
1254 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1255out:
1256 fib_release_info(fi);
91b9a277 1257err:
19baf839
RO
1258 return err;
1259}
1260
772cb712 1261/* should be called with rcu_read_lock */
adaf9816 1262static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
22bd5b9b 1263 t_key key, const struct flowi4 *flp,
ebc0ffae 1264 struct fib_result *res, int fib_flags)
19baf839 1265{
19baf839
RO
1266 struct leaf_info *li;
1267 struct hlist_head *hhead = &l->list;
c877efb2 1268
b67bfe0d 1269 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1270 struct fib_alias *fa;
a07f5f50 1271
5c74501f 1272 if (l->key != (key & li->mask_plen))
19baf839
RO
1273 continue;
1274
3be0686b
DM
1275 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1276 struct fib_info *fi = fa->fa_info;
1277 int nhsel, err;
a07f5f50 1278
22bd5b9b 1279 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1280 continue;
dccd9ecc
DM
1281 if (fi->fib_dead)
1282 continue;
37e826c5 1283 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1284 continue;
1285 fib_alias_accessed(fa);
1286 err = fib_props[fa->fa_type].error;
9f9e636d 1287 if (unlikely(err < 0)) {
19baf839 1288#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1289 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1290#endif
1fbc7843 1291 return err;
3be0686b
DM
1292 }
1293 if (fi->fib_flags & RTNH_F_DEAD)
1294 continue;
1295 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1296 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1297
1298 if (nh->nh_flags & RTNH_F_DEAD)
1299 continue;
22bd5b9b 1300 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1301 continue;
1302
1303#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1304 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1305#endif
5c74501f 1306 res->prefixlen = li->plen;
3be0686b
DM
1307 res->nh_sel = nhsel;
1308 res->type = fa->fa_type;
9f9e636d 1309 res->scope = fi->fib_scope;
3be0686b
DM
1310 res->fi = fi;
1311 res->table = tb;
1312 res->fa_head = &li->falh;
1313 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1314 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1315 return 0;
1316 }
1317 }
1318
1319#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1320 this_cpu_inc(t->stats->semantic_match_miss);
19baf839 1321#endif
19baf839 1322 }
a07f5f50 1323
2e655571 1324 return 1;
19baf839
RO
1325}
1326
9f9e636d
AD
1327static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1328{
1329 t_key prefix = n->key;
1330
1331 return (key ^ prefix) & (prefix | -prefix);
1332}
1333
22bd5b9b 1334int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1335 struct fib_result *res, int fib_flags)
19baf839 1336{
9f9e636d 1337 struct trie *t = (struct trie *)tb->tb_data;
8274a97a
AD
1338#ifdef CONFIG_IP_FIB_TRIE_STATS
1339 struct trie_use_stats __percpu *stats = t->stats;
1340#endif
9f9e636d
AD
1341 const t_key key = ntohl(flp->daddr);
1342 struct tnode *n, *pn;
1343 t_key cindex;
1344 int ret = 1;
91b9a277 1345
2373ce1c 1346 rcu_read_lock();
91b9a277 1347
2373ce1c 1348 n = rcu_dereference(t->trie);
c877efb2 1349 if (!n)
19baf839
RO
1350 goto failed;
1351
1352#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1353 this_cpu_inc(stats->gets);
19baf839
RO
1354#endif
1355
adaf9816 1356 pn = n;
9f9e636d
AD
1357 cindex = 0;
1358
1359 /* Step 1: Travel to the longest prefix match in the trie */
1360 for (;;) {
1361 unsigned long index = get_index(key, n);
1362
1363 /* This bit of code is a bit tricky but it combines multiple
1364 * checks into a single check. The prefix consists of the
1365 * prefix plus zeros for the "bits" in the prefix. The index
1366 * is the difference between the key and this value. From
1367 * this we can actually derive several pieces of data.
1368 * if !(index >> bits)
1369 * we know the value is child index
1370 * else
1371 * we have a mismatch in skip bits and failed
1372 */
1373 if (index >> n->bits)
1374 break;
19baf839 1375
9f9e636d
AD
1376 /* we have found a leaf. Prefixes have already been compared */
1377 if (IS_LEAF(n))
a07f5f50 1378 goto found;
19baf839 1379
9f9e636d
AD
1380 /* only record pn and cindex if we are going to be chopping
1381 * bits later. Otherwise we are just wasting cycles.
91b9a277 1382 */
9f9e636d
AD
1383 if (index) {
1384 pn = n;
1385 cindex = index;
91b9a277 1386 }
19baf839 1387
9f9e636d
AD
1388 n = rcu_dereference(n->child[index]);
1389 if (unlikely(!n))
1390 goto backtrace;
1391 }
19baf839 1392
9f9e636d
AD
1393 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1394 for (;;) {
1395 /* record the pointer where our next node pointer is stored */
1396 struct tnode __rcu **cptr = n->child;
19baf839 1397
9f9e636d
AD
1398 /* This test verifies that none of the bits that differ
1399 * between the key and the prefix exist in the region of
1400 * the lsb and higher in the prefix.
91b9a277 1401 */
9f9e636d
AD
1402 if (unlikely(prefix_mismatch(key, n)))
1403 goto backtrace;
91b9a277 1404
9f9e636d
AD
1405 /* exit out and process leaf */
1406 if (unlikely(IS_LEAF(n)))
1407 break;
91b9a277 1408
9f9e636d
AD
1409 /* Don't bother recording parent info. Since we are in
1410 * prefix match mode we will have to come back to wherever
1411 * we started this traversal anyway
91b9a277 1412 */
91b9a277 1413
9f9e636d 1414 while ((n = rcu_dereference(*cptr)) == NULL) {
19baf839 1415backtrace:
19baf839 1416#ifdef CONFIG_IP_FIB_TRIE_STATS
9f9e636d
AD
1417 if (!n)
1418 this_cpu_inc(stats->null_node_hit);
19baf839 1419#endif
9f9e636d
AD
1420 /* If we are at cindex 0 there are no more bits for
1421 * us to strip at this level so we must ascend back
1422 * up one level to see if there are any more bits to
1423 * be stripped there.
1424 */
1425 while (!cindex) {
1426 t_key pkey = pn->key;
1427
1428 pn = node_parent_rcu(pn);
1429 if (unlikely(!pn))
1430 goto failed;
1431#ifdef CONFIG_IP_FIB_TRIE_STATS
1432 this_cpu_inc(stats->backtrack);
1433#endif
1434 /* Get Child's index */
1435 cindex = get_index(pkey, pn);
1436 }
1437
1438 /* strip the least significant bit from the cindex */
1439 cindex &= cindex - 1;
1440
1441 /* grab pointer for next child node */
1442 cptr = &pn->child[cindex];
c877efb2 1443 }
19baf839 1444 }
9f9e636d 1445
19baf839 1446found:
9f9e636d
AD
1447 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1448 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
1449 if (unlikely(ret > 0))
1450 goto backtrace;
1451failed:
2373ce1c 1452 rcu_read_unlock();
19baf839
RO
1453 return ret;
1454}
6fc01438 1455EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1456
9195bef7
SH
1457/*
1458 * Remove the leaf and return parent.
1459 */
adaf9816 1460static void trie_leaf_remove(struct trie *t, struct tnode *l)
19baf839 1461{
64c9b6fb 1462 struct tnode *tp = node_parent(l);
c877efb2 1463
9195bef7 1464 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1465
c877efb2 1466 if (tp) {
9195bef7 1467 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1468 put_child(tp, cindex, NULL);
7b85576d 1469 trie_rebalance(t, tp);
91b9a277 1470 } else
a9b3cd7f 1471 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1472
37fd30f2 1473 node_free(l);
19baf839
RO
1474}
1475
d562f1f8
RO
1476/*
1477 * Caller must hold RTNL.
1478 */
16c6cf8b 1479int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1480{
1481 struct trie *t = (struct trie *) tb->tb_data;
1482 u32 key, mask;
4e902c57
TG
1483 int plen = cfg->fc_dst_len;
1484 u8 tos = cfg->fc_tos;
19baf839
RO
1485 struct fib_alias *fa, *fa_to_delete;
1486 struct list_head *fa_head;
adaf9816 1487 struct tnode *l;
91b9a277
OJ
1488 struct leaf_info *li;
1489
c877efb2 1490 if (plen > 32)
19baf839
RO
1491 return -EINVAL;
1492
4e902c57 1493 key = ntohl(cfg->fc_dst);
91b9a277 1494 mask = ntohl(inet_make_mask(plen));
19baf839 1495
c877efb2 1496 if (key & ~mask)
19baf839
RO
1497 return -EINVAL;
1498
1499 key = key & mask;
1500 l = fib_find_node(t, key);
1501
c877efb2 1502 if (!l)
19baf839
RO
1503 return -ESRCH;
1504
ad5b3102
IM
1505 li = find_leaf_info(l, plen);
1506
1507 if (!li)
1508 return -ESRCH;
1509
1510 fa_head = &li->falh;
19baf839
RO
1511 fa = fib_find_alias(fa_head, tos, 0);
1512
1513 if (!fa)
1514 return -ESRCH;
1515
0c7770c7 1516 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1517
1518 fa_to_delete = NULL;
936f6f8e
JA
1519 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1520 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1521 struct fib_info *fi = fa->fa_info;
1522
1523 if (fa->fa_tos != tos)
1524 break;
1525
4e902c57
TG
1526 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1527 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1528 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1529 (!cfg->fc_prefsrc ||
1530 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1531 (!cfg->fc_protocol ||
1532 fi->fib_protocol == cfg->fc_protocol) &&
1533 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1534 fa_to_delete = fa;
1535 break;
1536 }
1537 }
1538
91b9a277
OJ
1539 if (!fa_to_delete)
1540 return -ESRCH;
19baf839 1541
91b9a277 1542 fa = fa_to_delete;
4e902c57 1543 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1544 &cfg->fc_nlinfo, 0);
91b9a277 1545
2373ce1c 1546 list_del_rcu(&fa->fa_list);
19baf839 1547
21d8c49e
DM
1548 if (!plen)
1549 tb->tb_num_default--;
1550
91b9a277 1551 if (list_empty(fa_head)) {
2373ce1c 1552 hlist_del_rcu(&li->hlist);
91b9a277 1553 free_leaf_info(li);
2373ce1c 1554 }
19baf839 1555
91b9a277 1556 if (hlist_empty(&l->list))
9195bef7 1557 trie_leaf_remove(t, l);
19baf839 1558
91b9a277 1559 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1560 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1561
2373ce1c
RO
1562 fib_release_info(fa->fa_info);
1563 alias_free_mem_rcu(fa);
91b9a277 1564 return 0;
19baf839
RO
1565}
1566
ef3660ce 1567static int trie_flush_list(struct list_head *head)
19baf839
RO
1568{
1569 struct fib_alias *fa, *fa_node;
1570 int found = 0;
1571
1572 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1573 struct fib_info *fi = fa->fa_info;
19baf839 1574
2373ce1c
RO
1575 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1576 list_del_rcu(&fa->fa_list);
1577 fib_release_info(fa->fa_info);
1578 alias_free_mem_rcu(fa);
19baf839
RO
1579 found++;
1580 }
1581 }
1582 return found;
1583}
1584
adaf9816 1585static int trie_flush_leaf(struct tnode *l)
19baf839
RO
1586{
1587 int found = 0;
1588 struct hlist_head *lih = &l->list;
b67bfe0d 1589 struct hlist_node *tmp;
19baf839
RO
1590 struct leaf_info *li = NULL;
1591
b67bfe0d 1592 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1593 found += trie_flush_list(&li->falh);
19baf839
RO
1594
1595 if (list_empty(&li->falh)) {
2373ce1c 1596 hlist_del_rcu(&li->hlist);
19baf839
RO
1597 free_leaf_info(li);
1598 }
1599 }
1600 return found;
1601}
1602
82cfbb00
SH
1603/*
1604 * Scan for the next right leaf starting at node p->child[idx]
1605 * Since we have back pointer, no recursion necessary.
1606 */
adaf9816 1607static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
19baf839 1608{
82cfbb00
SH
1609 do {
1610 t_key idx;
c877efb2 1611
c877efb2 1612 if (c)
82cfbb00 1613 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1614 else
82cfbb00 1615 idx = 0;
2373ce1c 1616
82cfbb00
SH
1617 while (idx < 1u << p->bits) {
1618 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1619 if (!c)
91b9a277
OJ
1620 continue;
1621
aab515d7 1622 if (IS_LEAF(c))
adaf9816 1623 return c;
82cfbb00
SH
1624
1625 /* Rescan start scanning in new node */
adaf9816 1626 p = c;
82cfbb00 1627 idx = 0;
19baf839 1628 }
82cfbb00
SH
1629
1630 /* Node empty, walk back up to parent */
adaf9816 1631 c = p;
a034ee3c 1632 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1633
1634 return NULL; /* Root of trie */
1635}
1636
adaf9816 1637static struct tnode *trie_firstleaf(struct trie *t)
82cfbb00 1638{
adaf9816 1639 struct tnode *n = rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1640
1641 if (!n)
1642 return NULL;
1643
1644 if (IS_LEAF(n)) /* trie is just a leaf */
adaf9816 1645 return n;
82cfbb00
SH
1646
1647 return leaf_walk_rcu(n, NULL);
1648}
1649
adaf9816 1650static struct tnode *trie_nextleaf(struct tnode *l)
82cfbb00 1651{
adaf9816 1652 struct tnode *p = node_parent_rcu(l);
82cfbb00
SH
1653
1654 if (!p)
1655 return NULL; /* trie with just one leaf */
1656
adaf9816 1657 return leaf_walk_rcu(p, l);
19baf839
RO
1658}
1659
adaf9816 1660static struct tnode *trie_leafindex(struct trie *t, int index)
71d67e66 1661{
adaf9816 1662 struct tnode *l = trie_firstleaf(t);
71d67e66 1663
ec28cf73 1664 while (l && index-- > 0)
71d67e66 1665 l = trie_nextleaf(l);
ec28cf73 1666
71d67e66
SH
1667 return l;
1668}
1669
1670
d562f1f8
RO
1671/*
1672 * Caller must hold RTNL.
1673 */
16c6cf8b 1674int fib_table_flush(struct fib_table *tb)
19baf839
RO
1675{
1676 struct trie *t = (struct trie *) tb->tb_data;
adaf9816 1677 struct tnode *l, *ll = NULL;
82cfbb00 1678 int found = 0;
19baf839 1679
82cfbb00 1680 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1681 found += trie_flush_leaf(l);
19baf839
RO
1682
1683 if (ll && hlist_empty(&ll->list))
9195bef7 1684 trie_leaf_remove(t, ll);
19baf839
RO
1685 ll = l;
1686 }
1687
1688 if (ll && hlist_empty(&ll->list))
9195bef7 1689 trie_leaf_remove(t, ll);
19baf839 1690
0c7770c7 1691 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1692 return found;
1693}
1694
4aa2c466
PE
1695void fib_free_table(struct fib_table *tb)
1696{
8274a97a
AD
1697#ifdef CONFIG_IP_FIB_TRIE_STATS
1698 struct trie *t = (struct trie *)tb->tb_data;
1699
1700 free_percpu(t->stats);
1701#endif /* CONFIG_IP_FIB_TRIE_STATS */
4aa2c466
PE
1702 kfree(tb);
1703}
1704
a07f5f50
SH
1705static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1706 struct fib_table *tb,
19baf839
RO
1707 struct sk_buff *skb, struct netlink_callback *cb)
1708{
1709 int i, s_i;
1710 struct fib_alias *fa;
32ab5f80 1711 __be32 xkey = htonl(key);
19baf839 1712
71d67e66 1713 s_i = cb->args[5];
19baf839
RO
1714 i = 0;
1715
2373ce1c
RO
1716 /* rcu_read_lock is hold by caller */
1717
1718 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1719 if (i < s_i) {
1720 i++;
1721 continue;
1722 }
19baf839 1723
15e47304 1724 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1725 cb->nlh->nlmsg_seq,
1726 RTM_NEWROUTE,
1727 tb->tb_id,
1728 fa->fa_type,
be403ea1 1729 xkey,
19baf839
RO
1730 plen,
1731 fa->fa_tos,
64347f78 1732 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1733 cb->args[5] = i;
19baf839 1734 return -1;
91b9a277 1735 }
19baf839
RO
1736 i++;
1737 }
71d67e66 1738 cb->args[5] = i;
19baf839
RO
1739 return skb->len;
1740}
1741
adaf9816 1742static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
a88ee229 1743 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1744{
a88ee229 1745 struct leaf_info *li;
a88ee229 1746 int i, s_i;
19baf839 1747
71d67e66 1748 s_i = cb->args[4];
a88ee229 1749 i = 0;
19baf839 1750
a88ee229 1751 /* rcu_read_lock is hold by caller */
b67bfe0d 1752 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1753 if (i < s_i) {
1754 i++;
19baf839 1755 continue;
a88ee229 1756 }
91b9a277 1757
a88ee229 1758 if (i > s_i)
71d67e66 1759 cb->args[5] = 0;
19baf839 1760
a88ee229 1761 if (list_empty(&li->falh))
19baf839
RO
1762 continue;
1763
a88ee229 1764 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1765 cb->args[4] = i;
19baf839
RO
1766 return -1;
1767 }
a88ee229 1768 i++;
19baf839 1769 }
a88ee229 1770
71d67e66 1771 cb->args[4] = i;
19baf839
RO
1772 return skb->len;
1773}
1774
16c6cf8b
SH
1775int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1776 struct netlink_callback *cb)
19baf839 1777{
adaf9816 1778 struct tnode *l;
19baf839 1779 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1780 t_key key = cb->args[2];
71d67e66 1781 int count = cb->args[3];
19baf839 1782
2373ce1c 1783 rcu_read_lock();
d5ce8a0e
SH
1784 /* Dump starting at last key.
1785 * Note: 0.0.0.0/0 (ie default) is first key.
1786 */
71d67e66 1787 if (count == 0)
d5ce8a0e
SH
1788 l = trie_firstleaf(t);
1789 else {
71d67e66
SH
1790 /* Normally, continue from last key, but if that is missing
1791 * fallback to using slow rescan
1792 */
d5ce8a0e 1793 l = fib_find_node(t, key);
71d67e66
SH
1794 if (!l)
1795 l = trie_leafindex(t, count);
d5ce8a0e 1796 }
a88ee229 1797
d5ce8a0e
SH
1798 while (l) {
1799 cb->args[2] = l->key;
a88ee229 1800 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1801 cb->args[3] = count;
a88ee229 1802 rcu_read_unlock();
a88ee229 1803 return -1;
19baf839 1804 }
d5ce8a0e 1805
71d67e66 1806 ++count;
d5ce8a0e 1807 l = trie_nextleaf(l);
71d67e66
SH
1808 memset(&cb->args[4], 0,
1809 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1810 }
71d67e66 1811 cb->args[3] = count;
2373ce1c 1812 rcu_read_unlock();
a88ee229 1813
19baf839 1814 return skb->len;
19baf839
RO
1815}
1816
5348ba85 1817void __init fib_trie_init(void)
7f9b8052 1818{
a07f5f50
SH
1819 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1820 sizeof(struct fib_alias),
bc3c8c1e
SH
1821 0, SLAB_PANIC, NULL);
1822
1823 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
adaf9816 1824 max(sizeof(struct tnode),
bc3c8c1e
SH
1825 sizeof(struct leaf_info)),
1826 0, SLAB_PANIC, NULL);
7f9b8052 1827}
19baf839 1828
7f9b8052 1829
5348ba85 1830struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1831{
1832 struct fib_table *tb;
1833 struct trie *t;
1834
19baf839
RO
1835 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1836 GFP_KERNEL);
1837 if (tb == NULL)
1838 return NULL;
1839
1840 tb->tb_id = id;
971b893e 1841 tb->tb_default = -1;
21d8c49e 1842 tb->tb_num_default = 0;
19baf839
RO
1843
1844 t = (struct trie *) tb->tb_data;
8274a97a
AD
1845 RCU_INIT_POINTER(t->trie, NULL);
1846#ifdef CONFIG_IP_FIB_TRIE_STATS
1847 t->stats = alloc_percpu(struct trie_use_stats);
1848 if (!t->stats) {
1849 kfree(tb);
1850 tb = NULL;
1851 }
1852#endif
19baf839 1853
19baf839
RO
1854 return tb;
1855}
1856
cb7b593c
SH
1857#ifdef CONFIG_PROC_FS
1858/* Depth first Trie walk iterator */
1859struct fib_trie_iter {
1c340b2f 1860 struct seq_net_private p;
3d3b2d25 1861 struct fib_table *tb;
cb7b593c 1862 struct tnode *tnode;
a034ee3c
ED
1863 unsigned int index;
1864 unsigned int depth;
cb7b593c 1865};
19baf839 1866
adaf9816 1867static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1868{
cb7b593c 1869 struct tnode *tn = iter->tnode;
a034ee3c 1870 unsigned int cindex = iter->index;
cb7b593c 1871 struct tnode *p;
19baf839 1872
6640e697
EB
1873 /* A single entry routing table */
1874 if (!tn)
1875 return NULL;
1876
cb7b593c
SH
1877 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1878 iter->tnode, iter->index, iter->depth);
1879rescan:
1880 while (cindex < (1<<tn->bits)) {
adaf9816 1881 struct tnode *n = tnode_get_child_rcu(tn, cindex);
19baf839 1882
cb7b593c
SH
1883 if (n) {
1884 if (IS_LEAF(n)) {
1885 iter->tnode = tn;
1886 iter->index = cindex + 1;
1887 } else {
1888 /* push down one level */
adaf9816 1889 iter->tnode = n;
cb7b593c
SH
1890 iter->index = 0;
1891 ++iter->depth;
1892 }
1893 return n;
1894 }
19baf839 1895
cb7b593c
SH
1896 ++cindex;
1897 }
91b9a277 1898
cb7b593c 1899 /* Current node exhausted, pop back up */
adaf9816 1900 p = node_parent_rcu(tn);
cb7b593c
SH
1901 if (p) {
1902 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1903 tn = p;
1904 --iter->depth;
1905 goto rescan;
19baf839 1906 }
cb7b593c
SH
1907
1908 /* got root? */
1909 return NULL;
19baf839
RO
1910}
1911
adaf9816 1912static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 1913 struct trie *t)
19baf839 1914{
adaf9816 1915 struct tnode *n;
5ddf0eb2 1916
132adf54 1917 if (!t)
5ddf0eb2
RO
1918 return NULL;
1919
1920 n = rcu_dereference(t->trie);
3d3b2d25 1921 if (!n)
5ddf0eb2 1922 return NULL;
19baf839 1923
3d3b2d25 1924 if (IS_TNODE(n)) {
adaf9816 1925 iter->tnode = n;
3d3b2d25
SH
1926 iter->index = 0;
1927 iter->depth = 1;
1928 } else {
1929 iter->tnode = NULL;
1930 iter->index = 0;
1931 iter->depth = 0;
91b9a277 1932 }
3d3b2d25
SH
1933
1934 return n;
cb7b593c 1935}
91b9a277 1936
cb7b593c
SH
1937static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1938{
adaf9816 1939 struct tnode *n;
cb7b593c 1940 struct fib_trie_iter iter;
91b9a277 1941
cb7b593c 1942 memset(s, 0, sizeof(*s));
91b9a277 1943
cb7b593c 1944 rcu_read_lock();
3d3b2d25 1945 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 1946 if (IS_LEAF(n)) {
93672292 1947 struct leaf_info *li;
93672292 1948
cb7b593c
SH
1949 s->leaves++;
1950 s->totdepth += iter.depth;
1951 if (iter.depth > s->maxdepth)
1952 s->maxdepth = iter.depth;
93672292 1953
adaf9816 1954 hlist_for_each_entry_rcu(li, &n->list, hlist)
93672292 1955 ++s->prefixes;
cb7b593c 1956 } else {
cb7b593c
SH
1957 int i;
1958
1959 s->tnodes++;
adaf9816
AD
1960 if (n->bits < MAX_STAT_DEPTH)
1961 s->nodesizes[n->bits]++;
06ef921d 1962
adaf9816
AD
1963 for (i = 0; i < tnode_child_length(n); i++)
1964 if (!rcu_access_pointer(n->child[i]))
cb7b593c 1965 s->nullpointers++;
19baf839 1966 }
19baf839 1967 }
2373ce1c 1968 rcu_read_unlock();
19baf839
RO
1969}
1970
cb7b593c
SH
1971/*
1972 * This outputs /proc/net/fib_triestats
1973 */
1974static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 1975{
a034ee3c 1976 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 1977
cb7b593c
SH
1978 if (stat->leaves)
1979 avdepth = stat->totdepth*100 / stat->leaves;
1980 else
1981 avdepth = 0;
91b9a277 1982
a07f5f50
SH
1983 seq_printf(seq, "\tAver depth: %u.%02d\n",
1984 avdepth / 100, avdepth % 100);
cb7b593c 1985 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 1986
cb7b593c 1987 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
adaf9816 1988 bytes = sizeof(struct tnode) * stat->leaves;
93672292
SH
1989
1990 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
1991 bytes += sizeof(struct leaf_info) * stat->prefixes;
1992
187b5188 1993 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 1994 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 1995
06ef921d
RO
1996 max = MAX_STAT_DEPTH;
1997 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 1998 max--;
19baf839 1999
cb7b593c 2000 pointers = 0;
f585a991 2001 for (i = 1; i < max; i++)
cb7b593c 2002 if (stat->nodesizes[i] != 0) {
187b5188 2003 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2004 pointers += (1<<i) * stat->nodesizes[i];
2005 }
2006 seq_putc(seq, '\n');
187b5188 2007 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2008
adaf9816 2009 bytes += sizeof(struct tnode *) * pointers;
187b5188
SH
2010 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2011 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2012}
2373ce1c 2013
cb7b593c 2014#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd 2015static void trie_show_usage(struct seq_file *seq,
8274a97a 2016 const struct trie_use_stats __percpu *stats)
66a2f7fd 2017{
8274a97a
AD
2018 struct trie_use_stats s = { 0 };
2019 int cpu;
2020
2021 /* loop through all of the CPUs and gather up the stats */
2022 for_each_possible_cpu(cpu) {
2023 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2024
2025 s.gets += pcpu->gets;
2026 s.backtrack += pcpu->backtrack;
2027 s.semantic_match_passed += pcpu->semantic_match_passed;
2028 s.semantic_match_miss += pcpu->semantic_match_miss;
2029 s.null_node_hit += pcpu->null_node_hit;
2030 s.resize_node_skipped += pcpu->resize_node_skipped;
2031 }
2032
66a2f7fd 2033 seq_printf(seq, "\nCounters:\n---------\n");
8274a97a
AD
2034 seq_printf(seq, "gets = %u\n", s.gets);
2035 seq_printf(seq, "backtracks = %u\n", s.backtrack);
a07f5f50 2036 seq_printf(seq, "semantic match passed = %u\n",
8274a97a
AD
2037 s.semantic_match_passed);
2038 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2039 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2040 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
cb7b593c 2041}
66a2f7fd
SH
2042#endif /* CONFIG_IP_FIB_TRIE_STATS */
2043
3d3b2d25 2044static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2045{
3d3b2d25
SH
2046 if (tb->tb_id == RT_TABLE_LOCAL)
2047 seq_puts(seq, "Local:\n");
2048 else if (tb->tb_id == RT_TABLE_MAIN)
2049 seq_puts(seq, "Main:\n");
2050 else
2051 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2052}
19baf839 2053
3d3b2d25 2054
cb7b593c
SH
2055static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2056{
1c340b2f 2057 struct net *net = (struct net *)seq->private;
3d3b2d25 2058 unsigned int h;
877a9bff 2059
d717a9a6 2060 seq_printf(seq,
a07f5f50
SH
2061 "Basic info: size of leaf:"
2062 " %Zd bytes, size of tnode: %Zd bytes.\n",
adaf9816 2063 sizeof(struct tnode), sizeof(struct tnode));
d717a9a6 2064
3d3b2d25
SH
2065 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2066 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2067 struct fib_table *tb;
2068
b67bfe0d 2069 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2070 struct trie *t = (struct trie *) tb->tb_data;
2071 struct trie_stat stat;
877a9bff 2072
3d3b2d25
SH
2073 if (!t)
2074 continue;
2075
2076 fib_table_print(seq, tb);
2077
2078 trie_collect_stats(t, &stat);
2079 trie_show_stats(seq, &stat);
2080#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 2081 trie_show_usage(seq, t->stats);
3d3b2d25
SH
2082#endif
2083 }
2084 }
19baf839 2085
cb7b593c 2086 return 0;
19baf839
RO
2087}
2088
cb7b593c 2089static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2090{
de05c557 2091 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2092}
2093
9a32144e 2094static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2095 .owner = THIS_MODULE,
2096 .open = fib_triestat_seq_open,
2097 .read = seq_read,
2098 .llseek = seq_lseek,
b6fcbdb4 2099 .release = single_release_net,
cb7b593c
SH
2100};
2101
adaf9816 2102static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2103{
1218854a
YH
2104 struct fib_trie_iter *iter = seq->private;
2105 struct net *net = seq_file_net(seq);
cb7b593c 2106 loff_t idx = 0;
3d3b2d25 2107 unsigned int h;
cb7b593c 2108
3d3b2d25
SH
2109 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2110 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2111 struct fib_table *tb;
cb7b593c 2112
b67bfe0d 2113 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
adaf9816 2114 struct tnode *n;
3d3b2d25
SH
2115
2116 for (n = fib_trie_get_first(iter,
2117 (struct trie *) tb->tb_data);
2118 n; n = fib_trie_get_next(iter))
2119 if (pos == idx++) {
2120 iter->tb = tb;
2121 return n;
2122 }
2123 }
cb7b593c 2124 }
3d3b2d25 2125
19baf839
RO
2126 return NULL;
2127}
2128
cb7b593c 2129static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2130 __acquires(RCU)
19baf839 2131{
cb7b593c 2132 rcu_read_lock();
1218854a 2133 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2134}
2135
cb7b593c 2136static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2137{
cb7b593c 2138 struct fib_trie_iter *iter = seq->private;
1218854a 2139 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2140 struct fib_table *tb = iter->tb;
2141 struct hlist_node *tb_node;
2142 unsigned int h;
adaf9816 2143 struct tnode *n;
cb7b593c 2144
19baf839 2145 ++*pos;
3d3b2d25
SH
2146 /* next node in same table */
2147 n = fib_trie_get_next(iter);
2148 if (n)
2149 return n;
19baf839 2150
3d3b2d25
SH
2151 /* walk rest of this hash chain */
2152 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2153 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2154 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2155 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2156 if (n)
2157 goto found;
2158 }
19baf839 2159
3d3b2d25
SH
2160 /* new hash chain */
2161 while (++h < FIB_TABLE_HASHSZ) {
2162 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2163 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2164 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2165 if (n)
2166 goto found;
2167 }
2168 }
cb7b593c 2169 return NULL;
3d3b2d25
SH
2170
2171found:
2172 iter->tb = tb;
2173 return n;
cb7b593c 2174}
19baf839 2175
cb7b593c 2176static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2177 __releases(RCU)
19baf839 2178{
cb7b593c
SH
2179 rcu_read_unlock();
2180}
91b9a277 2181
cb7b593c
SH
2182static void seq_indent(struct seq_file *seq, int n)
2183{
a034ee3c
ED
2184 while (n-- > 0)
2185 seq_puts(seq, " ");
cb7b593c 2186}
19baf839 2187
28d36e37 2188static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2189{
132adf54 2190 switch (s) {
cb7b593c
SH
2191 case RT_SCOPE_UNIVERSE: return "universe";
2192 case RT_SCOPE_SITE: return "site";
2193 case RT_SCOPE_LINK: return "link";
2194 case RT_SCOPE_HOST: return "host";
2195 case RT_SCOPE_NOWHERE: return "nowhere";
2196 default:
28d36e37 2197 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2198 return buf;
2199 }
2200}
19baf839 2201
36cbd3dc 2202static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2203 [RTN_UNSPEC] = "UNSPEC",
2204 [RTN_UNICAST] = "UNICAST",
2205 [RTN_LOCAL] = "LOCAL",
2206 [RTN_BROADCAST] = "BROADCAST",
2207 [RTN_ANYCAST] = "ANYCAST",
2208 [RTN_MULTICAST] = "MULTICAST",
2209 [RTN_BLACKHOLE] = "BLACKHOLE",
2210 [RTN_UNREACHABLE] = "UNREACHABLE",
2211 [RTN_PROHIBIT] = "PROHIBIT",
2212 [RTN_THROW] = "THROW",
2213 [RTN_NAT] = "NAT",
2214 [RTN_XRESOLVE] = "XRESOLVE",
2215};
19baf839 2216
a034ee3c 2217static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2218{
cb7b593c
SH
2219 if (t < __RTN_MAX && rtn_type_names[t])
2220 return rtn_type_names[t];
28d36e37 2221 snprintf(buf, len, "type %u", t);
cb7b593c 2222 return buf;
19baf839
RO
2223}
2224
cb7b593c
SH
2225/* Pretty print the trie */
2226static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2227{
cb7b593c 2228 const struct fib_trie_iter *iter = seq->private;
adaf9816 2229 struct tnode *n = v;
c877efb2 2230
3d3b2d25
SH
2231 if (!node_parent_rcu(n))
2232 fib_table_print(seq, iter->tb);
095b8501 2233
cb7b593c 2234 if (IS_TNODE(n)) {
adaf9816 2235 __be32 prf = htonl(n->key);
91b9a277 2236
adaf9816 2237 seq_indent(seq, iter->depth - 1);
673d57e7 2238 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
adaf9816
AD
2239 &prf, n->pos, n->bits, n->full_children,
2240 n->empty_children);
cb7b593c 2241 } else {
1328042e 2242 struct leaf_info *li;
adaf9816 2243 __be32 val = htonl(n->key);
cb7b593c
SH
2244
2245 seq_indent(seq, iter->depth);
673d57e7 2246 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2247
adaf9816 2248 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1328042e
SH
2249 struct fib_alias *fa;
2250
2251 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2252 char buf1[32], buf2[32];
2253
2254 seq_indent(seq, iter->depth+1);
2255 seq_printf(seq, " /%d %s %s", li->plen,
2256 rtn_scope(buf1, sizeof(buf1),
37e826c5 2257 fa->fa_info->fib_scope),
1328042e
SH
2258 rtn_type(buf2, sizeof(buf2),
2259 fa->fa_type));
2260 if (fa->fa_tos)
b9c4d82a 2261 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2262 seq_putc(seq, '\n');
cb7b593c
SH
2263 }
2264 }
19baf839 2265 }
cb7b593c 2266
19baf839
RO
2267 return 0;
2268}
2269
f690808e 2270static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2271 .start = fib_trie_seq_start,
2272 .next = fib_trie_seq_next,
2273 .stop = fib_trie_seq_stop,
2274 .show = fib_trie_seq_show,
19baf839
RO
2275};
2276
cb7b593c 2277static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2278{
1c340b2f
DL
2279 return seq_open_net(inode, file, &fib_trie_seq_ops,
2280 sizeof(struct fib_trie_iter));
19baf839
RO
2281}
2282
9a32144e 2283static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2284 .owner = THIS_MODULE,
2285 .open = fib_trie_seq_open,
2286 .read = seq_read,
2287 .llseek = seq_lseek,
1c340b2f 2288 .release = seq_release_net,
19baf839
RO
2289};
2290
8315f5d8
SH
2291struct fib_route_iter {
2292 struct seq_net_private p;
2293 struct trie *main_trie;
2294 loff_t pos;
2295 t_key key;
2296};
2297
adaf9816 2298static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
8315f5d8 2299{
adaf9816 2300 struct tnode *l = NULL;
8315f5d8
SH
2301 struct trie *t = iter->main_trie;
2302
2303 /* use cache location of last found key */
2304 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2305 pos -= iter->pos;
2306 else {
2307 iter->pos = 0;
2308 l = trie_firstleaf(t);
2309 }
2310
2311 while (l && pos-- > 0) {
2312 iter->pos++;
2313 l = trie_nextleaf(l);
2314 }
2315
2316 if (l)
2317 iter->key = pos; /* remember it */
2318 else
2319 iter->pos = 0; /* forget it */
2320
2321 return l;
2322}
2323
2324static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2325 __acquires(RCU)
2326{
2327 struct fib_route_iter *iter = seq->private;
2328 struct fib_table *tb;
2329
2330 rcu_read_lock();
1218854a 2331 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2332 if (!tb)
2333 return NULL;
2334
2335 iter->main_trie = (struct trie *) tb->tb_data;
2336 if (*pos == 0)
2337 return SEQ_START_TOKEN;
2338 else
2339 return fib_route_get_idx(iter, *pos - 1);
2340}
2341
2342static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2343{
2344 struct fib_route_iter *iter = seq->private;
adaf9816 2345 struct tnode *l = v;
8315f5d8
SH
2346
2347 ++*pos;
2348 if (v == SEQ_START_TOKEN) {
2349 iter->pos = 0;
2350 l = trie_firstleaf(iter->main_trie);
2351 } else {
2352 iter->pos++;
2353 l = trie_nextleaf(l);
2354 }
2355
2356 if (l)
2357 iter->key = l->key;
2358 else
2359 iter->pos = 0;
2360 return l;
2361}
2362
2363static void fib_route_seq_stop(struct seq_file *seq, void *v)
2364 __releases(RCU)
2365{
2366 rcu_read_unlock();
2367}
2368
a034ee3c 2369static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2370{
a034ee3c 2371 unsigned int flags = 0;
19baf839 2372
a034ee3c
ED
2373 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2374 flags = RTF_REJECT;
cb7b593c
SH
2375 if (fi && fi->fib_nh->nh_gw)
2376 flags |= RTF_GATEWAY;
32ab5f80 2377 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2378 flags |= RTF_HOST;
2379 flags |= RTF_UP;
2380 return flags;
19baf839
RO
2381}
2382
cb7b593c
SH
2383/*
2384 * This outputs /proc/net/route.
2385 * The format of the file is not supposed to be changed
a034ee3c 2386 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2387 * legacy utilities
2388 */
2389static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2390{
adaf9816 2391 struct tnode *l = v;
1328042e 2392 struct leaf_info *li;
19baf839 2393
cb7b593c
SH
2394 if (v == SEQ_START_TOKEN) {
2395 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2396 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2397 "\tWindow\tIRTT");
2398 return 0;
2399 }
19baf839 2400
b67bfe0d 2401 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2402 struct fib_alias *fa;
32ab5f80 2403 __be32 mask, prefix;
91b9a277 2404
cb7b593c
SH
2405 mask = inet_make_mask(li->plen);
2406 prefix = htonl(l->key);
19baf839 2407
cb7b593c 2408 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2409 const struct fib_info *fi = fa->fa_info;
a034ee3c 2410 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2411
cb7b593c
SH
2412 if (fa->fa_type == RTN_BROADCAST
2413 || fa->fa_type == RTN_MULTICAST)
2414 continue;
19baf839 2415
652586df
TH
2416 seq_setwidth(seq, 127);
2417
cb7b593c 2418 if (fi)
5e659e4c
PE
2419 seq_printf(seq,
2420 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2421 "%d\t%08X\t%d\t%u\t%u",
cb7b593c
SH
2422 fi->fib_dev ? fi->fib_dev->name : "*",
2423 prefix,
2424 fi->fib_nh->nh_gw, flags, 0, 0,
2425 fi->fib_priority,
2426 mask,
a07f5f50
SH
2427 (fi->fib_advmss ?
2428 fi->fib_advmss + 40 : 0),
cb7b593c 2429 fi->fib_window,
652586df 2430 fi->fib_rtt >> 3);
cb7b593c 2431 else
5e659e4c
PE
2432 seq_printf(seq,
2433 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2434 "%d\t%08X\t%d\t%u\t%u",
cb7b593c 2435 prefix, 0, flags, 0, 0, 0,
652586df 2436 mask, 0, 0, 0);
19baf839 2437
652586df 2438 seq_pad(seq, '\n');
cb7b593c 2439 }
19baf839
RO
2440 }
2441
2442 return 0;
2443}
2444
f690808e 2445static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2446 .start = fib_route_seq_start,
2447 .next = fib_route_seq_next,
2448 .stop = fib_route_seq_stop,
cb7b593c 2449 .show = fib_route_seq_show,
19baf839
RO
2450};
2451
cb7b593c 2452static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2453{
1c340b2f 2454 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2455 sizeof(struct fib_route_iter));
19baf839
RO
2456}
2457
9a32144e 2458static const struct file_operations fib_route_fops = {
cb7b593c
SH
2459 .owner = THIS_MODULE,
2460 .open = fib_route_seq_open,
2461 .read = seq_read,
2462 .llseek = seq_lseek,
1c340b2f 2463 .release = seq_release_net,
19baf839
RO
2464};
2465
61a02653 2466int __net_init fib_proc_init(struct net *net)
19baf839 2467{
d4beaa66 2468 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2469 goto out1;
2470
d4beaa66
G
2471 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2472 &fib_triestat_fops))
cb7b593c
SH
2473 goto out2;
2474
d4beaa66 2475 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2476 goto out3;
2477
19baf839 2478 return 0;
cb7b593c
SH
2479
2480out3:
ece31ffd 2481 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2482out2:
ece31ffd 2483 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2484out1:
2485 return -ENOMEM;
19baf839
RO
2486}
2487
61a02653 2488void __net_exit fib_proc_exit(struct net *net)
19baf839 2489{
ece31ffd
G
2490 remove_proc_entry("fib_trie", net->proc_net);
2491 remove_proc_entry("fib_triestat", net->proc_net);
2492 remove_proc_entry("route", net->proc_net);
19baf839
RO
2493}
2494
2495#endif /* CONFIG_PROC_FS */
This page took 0.987796 seconds and 5 git commands to generate.