Give up on pushing CC_OPTIMIZE_FOR_SIZE
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839
RO
53#include <asm/uaccess.h>
54#include <asm/system.h>
1977f032 55#include <linux/bitops.h>
19baf839
RO
56#include <linux/types.h>
57#include <linux/kernel.h>
19baf839
RO
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
cd8787ab 65#include <linux/inetdevice.h>
19baf839
RO
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
2373ce1c 69#include <linux/rcupdate.h>
19baf839
RO
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
5a0e3ad6 74#include <linux/slab.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF 1
92#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
91b9a277
OJ
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839 97
b299e4f0 98struct rt_trie_node {
91b9a277 99 unsigned long parent;
8d965444 100 t_key key;
19baf839
RO
101};
102
103struct leaf {
91b9a277 104 unsigned long parent;
8d965444 105 t_key key;
19baf839 106 struct hlist_head list;
2373ce1c 107 struct rcu_head rcu;
19baf839
RO
108};
109
110struct leaf_info {
111 struct hlist_node hlist;
2373ce1c 112 struct rcu_head rcu;
19baf839
RO
113 int plen;
114 struct list_head falh;
115};
116
117struct tnode {
91b9a277 118 unsigned long parent;
8d965444 119 t_key key;
112d8cfc
ED
120 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
121 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
122 unsigned int full_children; /* KEYLENGTH bits needed */
123 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
124 union {
125 struct rcu_head rcu;
126 struct work_struct work;
e0f7cb8c 127 struct tnode *tnode_free;
15be75cd 128 };
0a5c0475 129 struct rt_trie_node __rcu *child[0];
19baf839
RO
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
2f36895a 139 unsigned int resize_node_skipped;
19baf839
RO
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
93672292 149 unsigned int prefixes;
06ef921d 150 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 151};
19baf839
RO
152
153struct trie {
0a5c0475 154 struct rt_trie_node __rcu *trie;
19baf839
RO
155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats;
157#endif
19baf839
RO
158};
159
b299e4f0
DM
160static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 162 int wasfull);
b299e4f0 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
166/* tnodes to free after resize(); protected by RTNL */
167static struct tnode *tnode_free_head;
c3059477
JP
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
19baf839 176
e18b890b 177static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 179
0a5c0475
ED
180/*
181 * caller must hold RTNL
182 */
183static inline struct tnode *node_parent(const struct rt_trie_node *node)
06801916 184{
0a5c0475
ED
185 unsigned long parent;
186
187 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
188
189 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
b59cfbf7
ED
190}
191
0a5c0475
ED
192/*
193 * caller must hold RCU read lock or RTNL
194 */
195static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
b59cfbf7 196{
0a5c0475
ED
197 unsigned long parent;
198
199 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
200 lockdep_rtnl_is_held());
06801916 201
0a5c0475 202 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
06801916
SH
203}
204
6440cc9e
SH
205/* Same as rcu_assign_pointer
206 * but that macro() assumes that value is a pointer.
207 */
b299e4f0 208static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
06801916 209{
6440cc9e
SH
210 smp_wmb();
211 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 212}
2373ce1c 213
0a5c0475
ED
214/*
215 * caller must hold RTNL
216 */
217static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7
ED
218{
219 BUG_ON(i >= 1U << tn->bits);
2373ce1c 220
0a5c0475 221 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
222}
223
0a5c0475
ED
224/*
225 * caller must hold RCU read lock or RTNL
226 */
227static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 228{
0a5c0475 229 BUG_ON(i >= 1U << tn->bits);
19baf839 230
0a5c0475 231 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
232}
233
bb435b8d 234static inline int tnode_child_length(const struct tnode *tn)
19baf839 235{
91b9a277 236 return 1 << tn->bits;
19baf839
RO
237}
238
3b004569 239static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
240{
241 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
242}
243
3b004569 244static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 245{
91b9a277 246 if (offset < KEYLENGTH)
19baf839 247 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 248 else
19baf839
RO
249 return 0;
250}
251
252static inline int tkey_equals(t_key a, t_key b)
253{
c877efb2 254 return a == b;
19baf839
RO
255}
256
257static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
258{
c877efb2
SH
259 if (bits == 0 || offset >= KEYLENGTH)
260 return 1;
91b9a277
OJ
261 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
262 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 263}
19baf839
RO
264
265static inline int tkey_mismatch(t_key a, int offset, t_key b)
266{
267 t_key diff = a ^ b;
268 int i = offset;
269
c877efb2
SH
270 if (!diff)
271 return 0;
272 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
273 i++;
274 return i;
275}
276
19baf839 277/*
e905a9ed
YH
278 To understand this stuff, an understanding of keys and all their bits is
279 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
280 all of the bits in that key are significant.
281
282 Consider a node 'n' and its parent 'tp'.
283
e905a9ed
YH
284 If n is a leaf, every bit in its key is significant. Its presence is
285 necessitated by path compression, since during a tree traversal (when
286 searching for a leaf - unless we are doing an insertion) we will completely
287 ignore all skipped bits we encounter. Thus we need to verify, at the end of
288 a potentially successful search, that we have indeed been walking the
19baf839
RO
289 correct key path.
290
e905a9ed
YH
291 Note that we can never "miss" the correct key in the tree if present by
292 following the wrong path. Path compression ensures that segments of the key
293 that are the same for all keys with a given prefix are skipped, but the
294 skipped part *is* identical for each node in the subtrie below the skipped
295 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
296 call to tkey_sub_equals() in trie_insert().
297
e905a9ed 298 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
299 have many different meanings.
300
e905a9ed 301 Example:
19baf839
RO
302 _________________________________________________________________
303 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
304 -----------------------------------------------------------------
e905a9ed 305 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
306
307 _________________________________________________________________
308 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
309 -----------------------------------------------------------------
310 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
311
312 tp->pos = 7
313 tp->bits = 3
314 n->pos = 15
91b9a277 315 n->bits = 4
19baf839 316
e905a9ed
YH
317 First, let's just ignore the bits that come before the parent tp, that is
318 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
319 not use them for anything.
320
321 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 322 index into the parent's child array. That is, they will be used to find
19baf839
RO
323 'n' among tp's children.
324
325 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
326 for the node n.
327
e905a9ed 328 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
329 of the bits are really not needed or indeed known in n->key.
330
e905a9ed 331 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 332 n's child array, and will of course be different for each child.
e905a9ed 333
c877efb2 334
19baf839
RO
335 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
336 at this point.
337
338*/
339
0c7770c7 340static inline void check_tnode(const struct tnode *tn)
19baf839 341{
0c7770c7 342 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
343}
344
f5026fab
DL
345static const int halve_threshold = 25;
346static const int inflate_threshold = 50;
345aa031 347static const int halve_threshold_root = 15;
80b71b80 348static const int inflate_threshold_root = 30;
2373ce1c
RO
349
350static void __alias_free_mem(struct rcu_head *head)
19baf839 351{
2373ce1c
RO
352 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
353 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
354}
355
2373ce1c 356static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 357{
2373ce1c
RO
358 call_rcu(&fa->rcu, __alias_free_mem);
359}
91b9a277 360
2373ce1c
RO
361static void __leaf_free_rcu(struct rcu_head *head)
362{
bc3c8c1e
SH
363 struct leaf *l = container_of(head, struct leaf, rcu);
364 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 365}
91b9a277 366
387a5487
SH
367static inline void free_leaf(struct leaf *l)
368{
369 call_rcu_bh(&l->rcu, __leaf_free_rcu);
370}
371
2373ce1c 372static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 373{
bceb0f45 374 kfree_rcu(leaf, rcu);
19baf839
RO
375}
376
8d965444 377static struct tnode *tnode_alloc(size_t size)
f0e36f8c 378{
2373ce1c 379 if (size <= PAGE_SIZE)
8d965444 380 return kzalloc(size, GFP_KERNEL);
15be75cd 381 else
7a1c8e5a 382 return vzalloc(size);
15be75cd 383}
2373ce1c 384
15be75cd
SH
385static void __tnode_vfree(struct work_struct *arg)
386{
387 struct tnode *tn = container_of(arg, struct tnode, work);
388 vfree(tn);
f0e36f8c
PM
389}
390
2373ce1c 391static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 392{
2373ce1c 393 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444 394 size_t size = sizeof(struct tnode) +
b299e4f0 395 (sizeof(struct rt_trie_node *) << tn->bits);
f0e36f8c
PM
396
397 if (size <= PAGE_SIZE)
398 kfree(tn);
15be75cd
SH
399 else {
400 INIT_WORK(&tn->work, __tnode_vfree);
401 schedule_work(&tn->work);
402 }
f0e36f8c
PM
403}
404
2373ce1c
RO
405static inline void tnode_free(struct tnode *tn)
406{
387a5487
SH
407 if (IS_LEAF(tn))
408 free_leaf((struct leaf *) tn);
409 else
550e29bc 410 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
411}
412
e0f7cb8c
JP
413static void tnode_free_safe(struct tnode *tn)
414{
415 BUG_ON(IS_LEAF(tn));
7b85576d
JP
416 tn->tnode_free = tnode_free_head;
417 tnode_free_head = tn;
c3059477 418 tnode_free_size += sizeof(struct tnode) +
b299e4f0 419 (sizeof(struct rt_trie_node *) << tn->bits);
e0f7cb8c
JP
420}
421
422static void tnode_free_flush(void)
423{
424 struct tnode *tn;
425
426 while ((tn = tnode_free_head)) {
427 tnode_free_head = tn->tnode_free;
428 tn->tnode_free = NULL;
429 tnode_free(tn);
430 }
c3059477
JP
431
432 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
433 tnode_free_size = 0;
434 synchronize_rcu();
435 }
e0f7cb8c
JP
436}
437
2373ce1c
RO
438static struct leaf *leaf_new(void)
439{
bc3c8c1e 440 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
441 if (l) {
442 l->parent = T_LEAF;
443 INIT_HLIST_HEAD(&l->list);
444 }
445 return l;
446}
447
448static struct leaf_info *leaf_info_new(int plen)
449{
450 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
451 if (li) {
452 li->plen = plen;
453 INIT_LIST_HEAD(&li->falh);
454 }
455 return li;
456}
457
a07f5f50 458static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 459{
b299e4f0 460 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
f0e36f8c 461 struct tnode *tn = tnode_alloc(sz);
19baf839 462
91b9a277 463 if (tn) {
2373ce1c 464 tn->parent = T_TNODE;
19baf839
RO
465 tn->pos = pos;
466 tn->bits = bits;
467 tn->key = key;
468 tn->full_children = 0;
469 tn->empty_children = 1<<bits;
470 }
c877efb2 471
a034ee3c 472 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
b299e4f0 473 sizeof(struct rt_trie_node) << bits);
19baf839
RO
474 return tn;
475}
476
19baf839
RO
477/*
478 * Check whether a tnode 'n' is "full", i.e. it is an internal node
479 * and no bits are skipped. See discussion in dyntree paper p. 6
480 */
481
b299e4f0 482static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
19baf839 483{
c877efb2 484 if (n == NULL || IS_LEAF(n))
19baf839
RO
485 return 0;
486
487 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
488}
489
a07f5f50 490static inline void put_child(struct trie *t, struct tnode *tn, int i,
b299e4f0 491 struct rt_trie_node *n)
19baf839
RO
492{
493 tnode_put_child_reorg(tn, i, n, -1);
494}
495
c877efb2 496 /*
19baf839
RO
497 * Add a child at position i overwriting the old value.
498 * Update the value of full_children and empty_children.
499 */
500
b299e4f0 501static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 502 int wasfull)
19baf839 503{
0a5c0475 504 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
505 int isfull;
506
0c7770c7
SH
507 BUG_ON(i >= 1<<tn->bits);
508
19baf839
RO
509 /* update emptyChildren */
510 if (n == NULL && chi != NULL)
511 tn->empty_children++;
512 else if (n != NULL && chi == NULL)
513 tn->empty_children--;
c877efb2 514
19baf839 515 /* update fullChildren */
91b9a277 516 if (wasfull == -1)
19baf839
RO
517 wasfull = tnode_full(tn, chi);
518
519 isfull = tnode_full(tn, n);
c877efb2 520 if (wasfull && !isfull)
19baf839 521 tn->full_children--;
c877efb2 522 else if (!wasfull && isfull)
19baf839 523 tn->full_children++;
91b9a277 524
c877efb2 525 if (n)
06801916 526 node_set_parent(n, tn);
19baf839 527
2373ce1c 528 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
529}
530
80b71b80 531#define MAX_WORK 10
b299e4f0 532static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
533{
534 int i;
2f80b3c8 535 struct tnode *old_tn;
e6308be8
RO
536 int inflate_threshold_use;
537 int halve_threshold_use;
80b71b80 538 int max_work;
19baf839 539
e905a9ed 540 if (!tn)
19baf839
RO
541 return NULL;
542
0c7770c7
SH
543 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
544 tn, inflate_threshold, halve_threshold);
19baf839
RO
545
546 /* No children */
547 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 548 tnode_free_safe(tn);
19baf839
RO
549 return NULL;
550 }
551 /* One child */
552 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 553 goto one_child;
c877efb2 554 /*
19baf839
RO
555 * Double as long as the resulting node has a number of
556 * nonempty nodes that are above the threshold.
557 */
558
559 /*
c877efb2
SH
560 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
561 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 562 * Telecommunications, page 6:
c877efb2 563 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
564 * children in the *doubled* node is at least 'high'."
565 *
c877efb2
SH
566 * 'high' in this instance is the variable 'inflate_threshold'. It
567 * is expressed as a percentage, so we multiply it with
568 * tnode_child_length() and instead of multiplying by 2 (since the
569 * child array will be doubled by inflate()) and multiplying
570 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 571 * multiply the left-hand side by 50.
c877efb2
SH
572 *
573 * The left-hand side may look a bit weird: tnode_child_length(tn)
574 * - tn->empty_children is of course the number of non-null children
575 * in the current node. tn->full_children is the number of "full"
19baf839 576 * children, that is non-null tnodes with a skip value of 0.
c877efb2 577 * All of those will be doubled in the resulting inflated tnode, so
19baf839 578 * we just count them one extra time here.
c877efb2 579 *
19baf839 580 * A clearer way to write this would be:
c877efb2 581 *
19baf839 582 * to_be_doubled = tn->full_children;
c877efb2 583 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
584 * tn->full_children;
585 *
586 * new_child_length = tnode_child_length(tn) * 2;
587 *
c877efb2 588 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
589 * new_child_length;
590 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
591 *
592 * ...and so on, tho it would mess up the while () loop.
593 *
19baf839
RO
594 * anyway,
595 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
596 * inflate_threshold
c877efb2 597 *
19baf839
RO
598 * avoid a division:
599 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
600 * inflate_threshold * new_child_length
c877efb2 601 *
19baf839 602 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 603 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 604 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 605 *
19baf839 606 * expand new_child_length:
c877efb2 607 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 608 * tn->full_children) >=
19baf839 609 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 610 *
19baf839 611 * shorten again:
c877efb2 612 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 613 * tn->empty_children) >= inflate_threshold *
19baf839 614 * tnode_child_length(tn)
c877efb2 615 *
19baf839
RO
616 */
617
618 check_tnode(tn);
c877efb2 619
e6308be8
RO
620 /* Keep root node larger */
621
b299e4f0 622 if (!node_parent((struct rt_trie_node *)tn)) {
80b71b80
JL
623 inflate_threshold_use = inflate_threshold_root;
624 halve_threshold_use = halve_threshold_root;
a034ee3c 625 } else {
e6308be8 626 inflate_threshold_use = inflate_threshold;
80b71b80
JL
627 halve_threshold_use = halve_threshold;
628 }
e6308be8 629
80b71b80
JL
630 max_work = MAX_WORK;
631 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
632 50 * (tn->full_children + tnode_child_length(tn)
633 - tn->empty_children)
634 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 635
2f80b3c8
RO
636 old_tn = tn;
637 tn = inflate(t, tn);
a07f5f50 638
2f80b3c8
RO
639 if (IS_ERR(tn)) {
640 tn = old_tn;
2f36895a
RO
641#ifdef CONFIG_IP_FIB_TRIE_STATS
642 t->stats.resize_node_skipped++;
643#endif
644 break;
645 }
19baf839
RO
646 }
647
648 check_tnode(tn);
649
80b71b80 650 /* Return if at least one inflate is run */
a034ee3c 651 if (max_work != MAX_WORK)
b299e4f0 652 return (struct rt_trie_node *) tn;
80b71b80 653
19baf839
RO
654 /*
655 * Halve as long as the number of empty children in this
656 * node is above threshold.
657 */
2f36895a 658
80b71b80
JL
659 max_work = MAX_WORK;
660 while (tn->bits > 1 && max_work-- &&
19baf839 661 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 662 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 663
2f80b3c8
RO
664 old_tn = tn;
665 tn = halve(t, tn);
666 if (IS_ERR(tn)) {
667 tn = old_tn;
2f36895a
RO
668#ifdef CONFIG_IP_FIB_TRIE_STATS
669 t->stats.resize_node_skipped++;
670#endif
671 break;
672 }
673 }
19baf839 674
c877efb2 675
19baf839 676 /* Only one child remains */
80b71b80
JL
677 if (tn->empty_children == tnode_child_length(tn) - 1) {
678one_child:
19baf839 679 for (i = 0; i < tnode_child_length(tn); i++) {
b299e4f0 680 struct rt_trie_node *n;
19baf839 681
0a5c0475 682 n = rtnl_dereference(tn->child[i]);
2373ce1c 683 if (!n)
91b9a277 684 continue;
91b9a277
OJ
685
686 /* compress one level */
687
06801916 688 node_set_parent(n, NULL);
e0f7cb8c 689 tnode_free_safe(tn);
91b9a277 690 return n;
19baf839 691 }
80b71b80 692 }
b299e4f0 693 return (struct rt_trie_node *) tn;
19baf839
RO
694}
695
0a5c0475
ED
696
697static void tnode_clean_free(struct tnode *tn)
698{
699 int i;
700 struct tnode *tofree;
701
702 for (i = 0; i < tnode_child_length(tn); i++) {
703 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
704 if (tofree)
705 tnode_free(tofree);
706 }
707 tnode_free(tn);
708}
709
2f80b3c8 710static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 711{
19baf839
RO
712 struct tnode *oldtnode = tn;
713 int olen = tnode_child_length(tn);
714 int i;
715
0c7770c7 716 pr_debug("In inflate\n");
19baf839
RO
717
718 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
719
0c7770c7 720 if (!tn)
2f80b3c8 721 return ERR_PTR(-ENOMEM);
2f36895a
RO
722
723 /*
c877efb2
SH
724 * Preallocate and store tnodes before the actual work so we
725 * don't get into an inconsistent state if memory allocation
726 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
727 * of tnode is ignored.
728 */
91b9a277
OJ
729
730 for (i = 0; i < olen; i++) {
a07f5f50 731 struct tnode *inode;
2f36895a 732
a07f5f50 733 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
734 if (inode &&
735 IS_TNODE(inode) &&
736 inode->pos == oldtnode->pos + oldtnode->bits &&
737 inode->bits > 1) {
738 struct tnode *left, *right;
ab66b4a7 739 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 740
2f36895a
RO
741 left = tnode_new(inode->key&(~m), inode->pos + 1,
742 inode->bits - 1);
2f80b3c8
RO
743 if (!left)
744 goto nomem;
91b9a277 745
2f36895a
RO
746 right = tnode_new(inode->key|m, inode->pos + 1,
747 inode->bits - 1);
748
e905a9ed 749 if (!right) {
2f80b3c8
RO
750 tnode_free(left);
751 goto nomem;
e905a9ed 752 }
2f36895a 753
b299e4f0
DM
754 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
755 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
2f36895a
RO
756 }
757 }
758
91b9a277 759 for (i = 0; i < olen; i++) {
c95aaf9a 760 struct tnode *inode;
b299e4f0 761 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
762 struct tnode *left, *right;
763 int size, j;
c877efb2 764
19baf839
RO
765 /* An empty child */
766 if (node == NULL)
767 continue;
768
769 /* A leaf or an internal node with skipped bits */
770
c877efb2 771 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 772 tn->pos + tn->bits - 1) {
a07f5f50
SH
773 if (tkey_extract_bits(node->key,
774 oldtnode->pos + oldtnode->bits,
775 1) == 0)
19baf839
RO
776 put_child(t, tn, 2*i, node);
777 else
778 put_child(t, tn, 2*i+1, node);
779 continue;
780 }
781
782 /* An internal node with two children */
783 inode = (struct tnode *) node;
784
785 if (inode->bits == 1) {
0a5c0475
ED
786 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
787 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 788
e0f7cb8c 789 tnode_free_safe(inode);
91b9a277 790 continue;
19baf839
RO
791 }
792
91b9a277
OJ
793 /* An internal node with more than two children */
794
795 /* We will replace this node 'inode' with two new
796 * ones, 'left' and 'right', each with half of the
797 * original children. The two new nodes will have
798 * a position one bit further down the key and this
799 * means that the "significant" part of their keys
800 * (see the discussion near the top of this file)
801 * will differ by one bit, which will be "0" in
802 * left's key and "1" in right's key. Since we are
803 * moving the key position by one step, the bit that
804 * we are moving away from - the bit at position
805 * (inode->pos) - is the one that will differ between
806 * left and right. So... we synthesize that bit in the
807 * two new keys.
808 * The mask 'm' below will be a single "one" bit at
809 * the position (inode->pos)
810 */
19baf839 811
91b9a277
OJ
812 /* Use the old key, but set the new significant
813 * bit to zero.
814 */
2f36895a 815
91b9a277
OJ
816 left = (struct tnode *) tnode_get_child(tn, 2*i);
817 put_child(t, tn, 2*i, NULL);
2f36895a 818
91b9a277 819 BUG_ON(!left);
2f36895a 820
91b9a277
OJ
821 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
822 put_child(t, tn, 2*i+1, NULL);
19baf839 823
91b9a277 824 BUG_ON(!right);
19baf839 825
91b9a277
OJ
826 size = tnode_child_length(left);
827 for (j = 0; j < size; j++) {
0a5c0475
ED
828 put_child(t, left, j, rtnl_dereference(inode->child[j]));
829 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
19baf839 830 }
91b9a277
OJ
831 put_child(t, tn, 2*i, resize(t, left));
832 put_child(t, tn, 2*i+1, resize(t, right));
833
e0f7cb8c 834 tnode_free_safe(inode);
19baf839 835 }
e0f7cb8c 836 tnode_free_safe(oldtnode);
19baf839 837 return tn;
2f80b3c8 838nomem:
0a5c0475
ED
839 tnode_clean_free(tn);
840 return ERR_PTR(-ENOMEM);
19baf839
RO
841}
842
2f80b3c8 843static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
844{
845 struct tnode *oldtnode = tn;
b299e4f0 846 struct rt_trie_node *left, *right;
19baf839
RO
847 int i;
848 int olen = tnode_child_length(tn);
849
0c7770c7 850 pr_debug("In halve\n");
c877efb2
SH
851
852 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 853
2f80b3c8
RO
854 if (!tn)
855 return ERR_PTR(-ENOMEM);
2f36895a
RO
856
857 /*
c877efb2
SH
858 * Preallocate and store tnodes before the actual work so we
859 * don't get into an inconsistent state if memory allocation
860 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
861 * of tnode is ignored.
862 */
863
91b9a277 864 for (i = 0; i < olen; i += 2) {
2f36895a
RO
865 left = tnode_get_child(oldtnode, i);
866 right = tnode_get_child(oldtnode, i+1);
c877efb2 867
2f36895a 868 /* Two nonempty children */
0c7770c7 869 if (left && right) {
2f80b3c8 870 struct tnode *newn;
0c7770c7 871
2f80b3c8 872 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
873
874 if (!newn)
2f80b3c8 875 goto nomem;
0c7770c7 876
b299e4f0 877 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
2f36895a 878 }
2f36895a 879
2f36895a 880 }
19baf839 881
91b9a277
OJ
882 for (i = 0; i < olen; i += 2) {
883 struct tnode *newBinNode;
884
19baf839
RO
885 left = tnode_get_child(oldtnode, i);
886 right = tnode_get_child(oldtnode, i+1);
c877efb2 887
19baf839
RO
888 /* At least one of the children is empty */
889 if (left == NULL) {
890 if (right == NULL) /* Both are empty */
891 continue;
892 put_child(t, tn, i/2, right);
91b9a277 893 continue;
0c7770c7 894 }
91b9a277
OJ
895
896 if (right == NULL) {
19baf839 897 put_child(t, tn, i/2, left);
91b9a277
OJ
898 continue;
899 }
c877efb2 900
19baf839 901 /* Two nonempty children */
91b9a277
OJ
902 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
903 put_child(t, tn, i/2, NULL);
91b9a277
OJ
904 put_child(t, newBinNode, 0, left);
905 put_child(t, newBinNode, 1, right);
906 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 907 }
e0f7cb8c 908 tnode_free_safe(oldtnode);
19baf839 909 return tn;
2f80b3c8 910nomem:
0a5c0475
ED
911 tnode_clean_free(tn);
912 return ERR_PTR(-ENOMEM);
19baf839
RO
913}
914
772cb712 915/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
916 via get_fa_head and dump */
917
772cb712 918static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 919{
772cb712 920 struct hlist_head *head = &l->list;
19baf839
RO
921 struct hlist_node *node;
922 struct leaf_info *li;
923
2373ce1c 924 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 925 if (li->plen == plen)
19baf839 926 return li;
91b9a277 927
19baf839
RO
928 return NULL;
929}
930
a07f5f50 931static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 932{
772cb712 933 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 934
91b9a277
OJ
935 if (!li)
936 return NULL;
c877efb2 937
91b9a277 938 return &li->falh;
19baf839
RO
939}
940
941static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
942{
e905a9ed
YH
943 struct leaf_info *li = NULL, *last = NULL;
944 struct hlist_node *node;
945
946 if (hlist_empty(head)) {
947 hlist_add_head_rcu(&new->hlist, head);
948 } else {
949 hlist_for_each_entry(li, node, head, hlist) {
950 if (new->plen > li->plen)
951 break;
952
953 last = li;
954 }
955 if (last)
956 hlist_add_after_rcu(&last->hlist, &new->hlist);
957 else
958 hlist_add_before_rcu(&new->hlist, &li->hlist);
959 }
19baf839
RO
960}
961
2373ce1c
RO
962/* rcu_read_lock needs to be hold by caller from readside */
963
19baf839
RO
964static struct leaf *
965fib_find_node(struct trie *t, u32 key)
966{
967 int pos;
968 struct tnode *tn;
b299e4f0 969 struct rt_trie_node *n;
19baf839
RO
970
971 pos = 0;
a034ee3c 972 n = rcu_dereference_rtnl(t->trie);
19baf839
RO
973
974 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
975 tn = (struct tnode *) n;
91b9a277 976
19baf839 977 check_tnode(tn);
91b9a277 978
c877efb2 979 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 980 pos = tn->pos + tn->bits;
a07f5f50
SH
981 n = tnode_get_child_rcu(tn,
982 tkey_extract_bits(key,
983 tn->pos,
984 tn->bits));
91b9a277 985 } else
19baf839
RO
986 break;
987 }
988 /* Case we have found a leaf. Compare prefixes */
989
91b9a277
OJ
990 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
991 return (struct leaf *)n;
992
19baf839
RO
993 return NULL;
994}
995
7b85576d 996static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 997{
19baf839 998 int wasfull;
3ed18d76 999 t_key cindex, key;
06801916 1000 struct tnode *tp;
19baf839 1001
3ed18d76
RO
1002 key = tn->key;
1003
b299e4f0 1004 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
19baf839
RO
1005 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1006 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
1007 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1008
1009 tnode_put_child_reorg((struct tnode *)tp, cindex,
b299e4f0 1010 (struct rt_trie_node *)tn, wasfull);
91b9a277 1011
b299e4f0 1012 tp = node_parent((struct rt_trie_node *) tn);
008440e3 1013 if (!tp)
b299e4f0 1014 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
008440e3 1015
e0f7cb8c 1016 tnode_free_flush();
06801916 1017 if (!tp)
19baf839 1018 break;
06801916 1019 tn = tp;
19baf839 1020 }
06801916 1021
19baf839 1022 /* Handle last (top) tnode */
7b85576d 1023 if (IS_TNODE(tn))
a07f5f50 1024 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1025
b299e4f0 1026 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
7b85576d 1027 tnode_free_flush();
19baf839
RO
1028}
1029
2373ce1c
RO
1030/* only used from updater-side */
1031
fea86ad8 1032static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1033{
1034 int pos, newpos;
1035 struct tnode *tp = NULL, *tn = NULL;
b299e4f0 1036 struct rt_trie_node *n;
19baf839
RO
1037 struct leaf *l;
1038 int missbit;
c877efb2 1039 struct list_head *fa_head = NULL;
19baf839
RO
1040 struct leaf_info *li;
1041 t_key cindex;
1042
1043 pos = 0;
0a5c0475 1044 n = rtnl_dereference(t->trie);
19baf839 1045
c877efb2
SH
1046 /* If we point to NULL, stop. Either the tree is empty and we should
1047 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1048 * and we should just put our new leaf in that.
c877efb2
SH
1049 * If we point to a T_TNODE, check if it matches our key. Note that
1050 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1051 * not be the parent's 'pos'+'bits'!
1052 *
c877efb2 1053 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1054 * the index from our key, push the T_TNODE and walk the tree.
1055 *
1056 * If it doesn't, we have to replace it with a new T_TNODE.
1057 *
c877efb2
SH
1058 * If we point to a T_LEAF, it might or might not have the same key
1059 * as we do. If it does, just change the value, update the T_LEAF's
1060 * value, and return it.
19baf839
RO
1061 * If it doesn't, we need to replace it with a T_TNODE.
1062 */
1063
1064 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1065 tn = (struct tnode *) n;
91b9a277 1066
c877efb2 1067 check_tnode(tn);
91b9a277 1068
c877efb2 1069 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1070 tp = tn;
91b9a277 1071 pos = tn->pos + tn->bits;
a07f5f50
SH
1072 n = tnode_get_child(tn,
1073 tkey_extract_bits(key,
1074 tn->pos,
1075 tn->bits));
19baf839 1076
06801916 1077 BUG_ON(n && node_parent(n) != tn);
91b9a277 1078 } else
19baf839
RO
1079 break;
1080 }
1081
1082 /*
1083 * n ----> NULL, LEAF or TNODE
1084 *
c877efb2 1085 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1086 */
1087
91b9a277 1088 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1089
1090 /* Case 1: n is a leaf. Compare prefixes */
1091
c877efb2 1092 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1093 l = (struct leaf *) n;
19baf839 1094 li = leaf_info_new(plen);
91b9a277 1095
fea86ad8
SH
1096 if (!li)
1097 return NULL;
19baf839
RO
1098
1099 fa_head = &li->falh;
1100 insert_leaf_info(&l->list, li);
1101 goto done;
1102 }
19baf839
RO
1103 l = leaf_new();
1104
fea86ad8
SH
1105 if (!l)
1106 return NULL;
19baf839
RO
1107
1108 l->key = key;
1109 li = leaf_info_new(plen);
1110
c877efb2 1111 if (!li) {
387a5487 1112 free_leaf(l);
fea86ad8 1113 return NULL;
f835e471 1114 }
19baf839
RO
1115
1116 fa_head = &li->falh;
1117 insert_leaf_info(&l->list, li);
1118
19baf839 1119 if (t->trie && n == NULL) {
91b9a277 1120 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1121
b299e4f0 1122 node_set_parent((struct rt_trie_node *)l, tp);
19baf839 1123
91b9a277 1124 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
b299e4f0 1125 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
91b9a277
OJ
1126 } else {
1127 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1128 /*
1129 * Add a new tnode here
19baf839
RO
1130 * first tnode need some special handling
1131 */
1132
1133 if (tp)
91b9a277 1134 pos = tp->pos+tp->bits;
19baf839 1135 else
91b9a277
OJ
1136 pos = 0;
1137
c877efb2 1138 if (n) {
19baf839
RO
1139 newpos = tkey_mismatch(key, pos, n->key);
1140 tn = tnode_new(n->key, newpos, 1);
91b9a277 1141 } else {
19baf839 1142 newpos = 0;
c877efb2 1143 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1144 }
19baf839 1145
c877efb2 1146 if (!tn) {
f835e471 1147 free_leaf_info(li);
387a5487 1148 free_leaf(l);
fea86ad8 1149 return NULL;
91b9a277
OJ
1150 }
1151
b299e4f0 1152 node_set_parent((struct rt_trie_node *)tn, tp);
19baf839 1153
91b9a277 1154 missbit = tkey_extract_bits(key, newpos, 1);
b299e4f0 1155 put_child(t, tn, missbit, (struct rt_trie_node *)l);
19baf839
RO
1156 put_child(t, tn, 1-missbit, n);
1157
c877efb2 1158 if (tp) {
19baf839 1159 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50 1160 put_child(t, (struct tnode *)tp, cindex,
b299e4f0 1161 (struct rt_trie_node *)tn);
91b9a277 1162 } else {
b299e4f0 1163 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
19baf839
RO
1164 tp = tn;
1165 }
1166 }
91b9a277
OJ
1167
1168 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1169 pr_warning("fib_trie"
1170 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1171 tp, tp->pos, tp->bits, key, plen);
91b9a277 1172
19baf839 1173 /* Rebalance the trie */
2373ce1c 1174
7b85576d 1175 trie_rebalance(t, tp);
f835e471 1176done:
19baf839
RO
1177 return fa_head;
1178}
1179
d562f1f8
RO
1180/*
1181 * Caller must hold RTNL.
1182 */
16c6cf8b 1183int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1184{
1185 struct trie *t = (struct trie *) tb->tb_data;
1186 struct fib_alias *fa, *new_fa;
c877efb2 1187 struct list_head *fa_head = NULL;
19baf839 1188 struct fib_info *fi;
4e902c57
TG
1189 int plen = cfg->fc_dst_len;
1190 u8 tos = cfg->fc_tos;
19baf839
RO
1191 u32 key, mask;
1192 int err;
1193 struct leaf *l;
1194
1195 if (plen > 32)
1196 return -EINVAL;
1197
4e902c57 1198 key = ntohl(cfg->fc_dst);
19baf839 1199
2dfe55b4 1200 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1201
91b9a277 1202 mask = ntohl(inet_make_mask(plen));
19baf839 1203
c877efb2 1204 if (key & ~mask)
19baf839
RO
1205 return -EINVAL;
1206
1207 key = key & mask;
1208
4e902c57
TG
1209 fi = fib_create_info(cfg);
1210 if (IS_ERR(fi)) {
1211 err = PTR_ERR(fi);
19baf839 1212 goto err;
4e902c57 1213 }
19baf839
RO
1214
1215 l = fib_find_node(t, key);
c877efb2 1216 fa = NULL;
19baf839 1217
c877efb2 1218 if (l) {
19baf839
RO
1219 fa_head = get_fa_head(l, plen);
1220 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1221 }
1222
1223 /* Now fa, if non-NULL, points to the first fib alias
1224 * with the same keys [prefix,tos,priority], if such key already
1225 * exists or to the node before which we will insert new one.
1226 *
1227 * If fa is NULL, we will need to allocate a new one and
1228 * insert to the head of f.
1229 *
1230 * If f is NULL, no fib node matched the destination key
1231 * and we need to allocate a new one of those as well.
1232 */
1233
936f6f8e
JA
1234 if (fa && fa->fa_tos == tos &&
1235 fa->fa_info->fib_priority == fi->fib_priority) {
1236 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1237
1238 err = -EEXIST;
4e902c57 1239 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1240 goto out;
1241
936f6f8e
JA
1242 /* We have 2 goals:
1243 * 1. Find exact match for type, scope, fib_info to avoid
1244 * duplicate routes
1245 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1246 */
1247 fa_match = NULL;
1248 fa_first = fa;
1249 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1250 list_for_each_entry_continue(fa, fa_head, fa_list) {
1251 if (fa->fa_tos != tos)
1252 break;
1253 if (fa->fa_info->fib_priority != fi->fib_priority)
1254 break;
1255 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1256 fa->fa_info == fi) {
1257 fa_match = fa;
1258 break;
1259 }
1260 }
1261
4e902c57 1262 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1263 struct fib_info *fi_drop;
1264 u8 state;
1265
936f6f8e
JA
1266 fa = fa_first;
1267 if (fa_match) {
1268 if (fa == fa_match)
1269 err = 0;
6725033f 1270 goto out;
936f6f8e 1271 }
2373ce1c 1272 err = -ENOBUFS;
e94b1766 1273 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1274 if (new_fa == NULL)
1275 goto out;
19baf839
RO
1276
1277 fi_drop = fa->fa_info;
2373ce1c
RO
1278 new_fa->fa_tos = fa->fa_tos;
1279 new_fa->fa_info = fi;
4e902c57 1280 new_fa->fa_type = cfg->fc_type;
19baf839 1281 state = fa->fa_state;
936f6f8e 1282 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1283
2373ce1c
RO
1284 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1285 alias_free_mem_rcu(fa);
19baf839
RO
1286
1287 fib_release_info(fi_drop);
1288 if (state & FA_S_ACCESSED)
76e6ebfb 1289 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1290 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1291 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1292
91b9a277 1293 goto succeeded;
19baf839
RO
1294 }
1295 /* Error if we find a perfect match which
1296 * uses the same scope, type, and nexthop
1297 * information.
1298 */
936f6f8e
JA
1299 if (fa_match)
1300 goto out;
a07f5f50 1301
4e902c57 1302 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1303 fa = fa_first;
19baf839
RO
1304 }
1305 err = -ENOENT;
4e902c57 1306 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1307 goto out;
1308
1309 err = -ENOBUFS;
e94b1766 1310 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1311 if (new_fa == NULL)
1312 goto out;
1313
1314 new_fa->fa_info = fi;
1315 new_fa->fa_tos = tos;
4e902c57 1316 new_fa->fa_type = cfg->fc_type;
19baf839 1317 new_fa->fa_state = 0;
19baf839
RO
1318 /*
1319 * Insert new entry to the list.
1320 */
1321
c877efb2 1322 if (!fa_head) {
fea86ad8
SH
1323 fa_head = fib_insert_node(t, key, plen);
1324 if (unlikely(!fa_head)) {
1325 err = -ENOMEM;
f835e471 1326 goto out_free_new_fa;
fea86ad8 1327 }
f835e471 1328 }
19baf839 1329
21d8c49e
DM
1330 if (!plen)
1331 tb->tb_num_default++;
1332
2373ce1c
RO
1333 list_add_tail_rcu(&new_fa->fa_list,
1334 (fa ? &fa->fa_list : fa_head));
19baf839 1335
76e6ebfb 1336 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1337 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1338 &cfg->fc_nlinfo, 0);
19baf839
RO
1339succeeded:
1340 return 0;
f835e471
RO
1341
1342out_free_new_fa:
1343 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1344out:
1345 fib_release_info(fi);
91b9a277 1346err:
19baf839
RO
1347 return err;
1348}
1349
772cb712 1350/* should be called with rcu_read_lock */
5b470441 1351static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
22bd5b9b 1352 t_key key, const struct flowi4 *flp,
ebc0ffae 1353 struct fib_result *res, int fib_flags)
19baf839 1354{
19baf839
RO
1355 struct leaf_info *li;
1356 struct hlist_head *hhead = &l->list;
1357 struct hlist_node *node;
c877efb2 1358
2373ce1c 1359 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
3be0686b 1360 struct fib_alias *fa;
a07f5f50
SH
1361 int plen = li->plen;
1362 __be32 mask = inet_make_mask(plen);
1363
888454c5 1364 if (l->key != (key & ntohl(mask)))
19baf839
RO
1365 continue;
1366
3be0686b
DM
1367 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1368 struct fib_info *fi = fa->fa_info;
1369 int nhsel, err;
a07f5f50 1370
22bd5b9b 1371 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1372 continue;
37e826c5 1373 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1374 continue;
1375 fib_alias_accessed(fa);
1376 err = fib_props[fa->fa_type].error;
1377 if (err) {
19baf839 1378#ifdef CONFIG_IP_FIB_TRIE_STATS
1fbc7843 1379 t->stats.semantic_match_passed++;
3be0686b 1380#endif
1fbc7843 1381 return err;
3be0686b
DM
1382 }
1383 if (fi->fib_flags & RTNH_F_DEAD)
1384 continue;
1385 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1386 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1387
1388 if (nh->nh_flags & RTNH_F_DEAD)
1389 continue;
22bd5b9b 1390 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1391 continue;
1392
1393#ifdef CONFIG_IP_FIB_TRIE_STATS
1394 t->stats.semantic_match_passed++;
1395#endif
1396 res->prefixlen = plen;
1397 res->nh_sel = nhsel;
1398 res->type = fa->fa_type;
37e826c5 1399 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1400 res->fi = fi;
1401 res->table = tb;
1402 res->fa_head = &li->falh;
1403 if (!(fib_flags & FIB_LOOKUP_NOREF))
1404 atomic_inc(&res->fi->fib_clntref);
1405 return 0;
1406 }
1407 }
1408
1409#ifdef CONFIG_IP_FIB_TRIE_STATS
1410 t->stats.semantic_match_miss++;
19baf839 1411#endif
19baf839 1412 }
a07f5f50 1413
2e655571 1414 return 1;
19baf839
RO
1415}
1416
22bd5b9b 1417int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1418 struct fib_result *res, int fib_flags)
19baf839
RO
1419{
1420 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1421 int ret;
b299e4f0 1422 struct rt_trie_node *n;
19baf839 1423 struct tnode *pn;
3b004569 1424 unsigned int pos, bits;
22bd5b9b 1425 t_key key = ntohl(flp->daddr);
3b004569 1426 unsigned int chopped_off;
19baf839 1427 t_key cindex = 0;
3b004569 1428 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1429 struct tnode *cn;
874ffa8f 1430 t_key pref_mismatch;
91b9a277 1431
2373ce1c 1432 rcu_read_lock();
91b9a277 1433
2373ce1c 1434 n = rcu_dereference(t->trie);
c877efb2 1435 if (!n)
19baf839
RO
1436 goto failed;
1437
1438#ifdef CONFIG_IP_FIB_TRIE_STATS
1439 t->stats.gets++;
1440#endif
1441
1442 /* Just a leaf? */
1443 if (IS_LEAF(n)) {
5b470441 1444 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
a07f5f50 1445 goto found;
19baf839 1446 }
a07f5f50 1447
19baf839
RO
1448 pn = (struct tnode *) n;
1449 chopped_off = 0;
c877efb2 1450
91b9a277 1451 while (pn) {
19baf839
RO
1452 pos = pn->pos;
1453 bits = pn->bits;
1454
c877efb2 1455 if (!chopped_off)
ab66b4a7
SH
1456 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1457 pos, bits);
19baf839 1458
b902e573 1459 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1460
1461 if (n == NULL) {
1462#ifdef CONFIG_IP_FIB_TRIE_STATS
1463 t->stats.null_node_hit++;
1464#endif
1465 goto backtrace;
1466 }
1467
91b9a277 1468 if (IS_LEAF(n)) {
5b470441 1469 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
2e655571 1470 if (ret > 0)
91b9a277 1471 goto backtrace;
a07f5f50 1472 goto found;
91b9a277
OJ
1473 }
1474
91b9a277 1475 cn = (struct tnode *)n;
19baf839 1476
91b9a277
OJ
1477 /*
1478 * It's a tnode, and we can do some extra checks here if we
1479 * like, to avoid descending into a dead-end branch.
1480 * This tnode is in the parent's child array at index
1481 * key[p_pos..p_pos+p_bits] but potentially with some bits
1482 * chopped off, so in reality the index may be just a
1483 * subprefix, padded with zero at the end.
1484 * We can also take a look at any skipped bits in this
1485 * tnode - everything up to p_pos is supposed to be ok,
1486 * and the non-chopped bits of the index (se previous
1487 * paragraph) are also guaranteed ok, but the rest is
1488 * considered unknown.
1489 *
1490 * The skipped bits are key[pos+bits..cn->pos].
1491 */
19baf839 1492
91b9a277
OJ
1493 /* If current_prefix_length < pos+bits, we are already doing
1494 * actual prefix matching, which means everything from
1495 * pos+(bits-chopped_off) onward must be zero along some
1496 * branch of this subtree - otherwise there is *no* valid
1497 * prefix present. Here we can only check the skipped
1498 * bits. Remember, since we have already indexed into the
1499 * parent's child array, we know that the bits we chopped of
1500 * *are* zero.
1501 */
19baf839 1502
a07f5f50
SH
1503 /* NOTA BENE: Checking only skipped bits
1504 for the new node here */
19baf839 1505
91b9a277
OJ
1506 if (current_prefix_length < pos+bits) {
1507 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1508 cn->pos - current_prefix_length)
1509 || !(cn->child[0]))
91b9a277
OJ
1510 goto backtrace;
1511 }
19baf839 1512
91b9a277
OJ
1513 /*
1514 * If chopped_off=0, the index is fully validated and we
1515 * only need to look at the skipped bits for this, the new,
1516 * tnode. What we actually want to do is to find out if
1517 * these skipped bits match our key perfectly, or if we will
1518 * have to count on finding a matching prefix further down,
1519 * because if we do, we would like to have some way of
1520 * verifying the existence of such a prefix at this point.
1521 */
19baf839 1522
91b9a277
OJ
1523 /* The only thing we can do at this point is to verify that
1524 * any such matching prefix can indeed be a prefix to our
1525 * key, and if the bits in the node we are inspecting that
1526 * do not match our key are not ZERO, this cannot be true.
1527 * Thus, find out where there is a mismatch (before cn->pos)
1528 * and verify that all the mismatching bits are zero in the
1529 * new tnode's key.
1530 */
19baf839 1531
a07f5f50
SH
1532 /*
1533 * Note: We aren't very concerned about the piece of
1534 * the key that precede pn->pos+pn->bits, since these
1535 * have already been checked. The bits after cn->pos
1536 * aren't checked since these are by definition
1537 * "unknown" at this point. Thus, what we want to see
1538 * is if we are about to enter the "prefix matching"
1539 * state, and in that case verify that the skipped
1540 * bits that will prevail throughout this subtree are
1541 * zero, as they have to be if we are to find a
1542 * matching prefix.
91b9a277
OJ
1543 */
1544
874ffa8f 1545 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1546
a07f5f50
SH
1547 /*
1548 * In short: If skipped bits in this node do not match
1549 * the search key, enter the "prefix matching"
1550 * state.directly.
91b9a277
OJ
1551 */
1552 if (pref_mismatch) {
874ffa8f 1553 int mp = KEYLENGTH - fls(pref_mismatch);
91b9a277 1554
874ffa8f 1555 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1556 goto backtrace;
1557
1558 if (current_prefix_length >= cn->pos)
1559 current_prefix_length = mp;
c877efb2 1560 }
a07f5f50 1561
91b9a277
OJ
1562 pn = (struct tnode *)n; /* Descend */
1563 chopped_off = 0;
1564 continue;
1565
19baf839
RO
1566backtrace:
1567 chopped_off++;
1568
1569 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1570 while ((chopped_off <= pn->bits)
1571 && !(cindex & (1<<(chopped_off-1))))
19baf839 1572 chopped_off++;
19baf839
RO
1573
1574 /* Decrease current_... with bits chopped off */
1575 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1576 current_prefix_length = pn->pos + pn->bits
1577 - chopped_off;
91b9a277 1578
19baf839 1579 /*
c877efb2 1580 * Either we do the actual chop off according or if we have
19baf839
RO
1581 * chopped off all bits in this tnode walk up to our parent.
1582 */
1583
91b9a277 1584 if (chopped_off <= pn->bits) {
19baf839 1585 cindex &= ~(1 << (chopped_off-1));
91b9a277 1586 } else {
b299e4f0 1587 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
06801916 1588 if (!parent)
19baf839 1589 goto failed;
91b9a277 1590
19baf839 1591 /* Get Child's index */
06801916
SH
1592 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1593 pn = parent;
19baf839
RO
1594 chopped_off = 0;
1595
1596#ifdef CONFIG_IP_FIB_TRIE_STATS
1597 t->stats.backtrack++;
1598#endif
1599 goto backtrace;
c877efb2 1600 }
19baf839
RO
1601 }
1602failed:
c877efb2 1603 ret = 1;
19baf839 1604found:
2373ce1c 1605 rcu_read_unlock();
19baf839
RO
1606 return ret;
1607}
1608
9195bef7
SH
1609/*
1610 * Remove the leaf and return parent.
1611 */
1612static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1613{
b299e4f0 1614 struct tnode *tp = node_parent((struct rt_trie_node *) l);
c877efb2 1615
9195bef7 1616 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1617
c877efb2 1618 if (tp) {
9195bef7 1619 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1620 put_child(t, (struct tnode *)tp, cindex, NULL);
7b85576d 1621 trie_rebalance(t, tp);
91b9a277 1622 } else
2373ce1c 1623 rcu_assign_pointer(t->trie, NULL);
19baf839 1624
387a5487 1625 free_leaf(l);
19baf839
RO
1626}
1627
d562f1f8
RO
1628/*
1629 * Caller must hold RTNL.
1630 */
16c6cf8b 1631int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1632{
1633 struct trie *t = (struct trie *) tb->tb_data;
1634 u32 key, mask;
4e902c57
TG
1635 int plen = cfg->fc_dst_len;
1636 u8 tos = cfg->fc_tos;
19baf839
RO
1637 struct fib_alias *fa, *fa_to_delete;
1638 struct list_head *fa_head;
1639 struct leaf *l;
91b9a277
OJ
1640 struct leaf_info *li;
1641
c877efb2 1642 if (plen > 32)
19baf839
RO
1643 return -EINVAL;
1644
4e902c57 1645 key = ntohl(cfg->fc_dst);
91b9a277 1646 mask = ntohl(inet_make_mask(plen));
19baf839 1647
c877efb2 1648 if (key & ~mask)
19baf839
RO
1649 return -EINVAL;
1650
1651 key = key & mask;
1652 l = fib_find_node(t, key);
1653
c877efb2 1654 if (!l)
19baf839
RO
1655 return -ESRCH;
1656
1657 fa_head = get_fa_head(l, plen);
1658 fa = fib_find_alias(fa_head, tos, 0);
1659
1660 if (!fa)
1661 return -ESRCH;
1662
0c7770c7 1663 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1664
1665 fa_to_delete = NULL;
936f6f8e
JA
1666 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1667 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1668 struct fib_info *fi = fa->fa_info;
1669
1670 if (fa->fa_tos != tos)
1671 break;
1672
4e902c57
TG
1673 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1674 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1675 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1676 (!cfg->fc_prefsrc ||
1677 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1678 (!cfg->fc_protocol ||
1679 fi->fib_protocol == cfg->fc_protocol) &&
1680 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1681 fa_to_delete = fa;
1682 break;
1683 }
1684 }
1685
91b9a277
OJ
1686 if (!fa_to_delete)
1687 return -ESRCH;
19baf839 1688
91b9a277 1689 fa = fa_to_delete;
4e902c57 1690 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1691 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1692
1693 l = fib_find_node(t, key);
772cb712 1694 li = find_leaf_info(l, plen);
19baf839 1695
2373ce1c 1696 list_del_rcu(&fa->fa_list);
19baf839 1697
21d8c49e
DM
1698 if (!plen)
1699 tb->tb_num_default--;
1700
91b9a277 1701 if (list_empty(fa_head)) {
2373ce1c 1702 hlist_del_rcu(&li->hlist);
91b9a277 1703 free_leaf_info(li);
2373ce1c 1704 }
19baf839 1705
91b9a277 1706 if (hlist_empty(&l->list))
9195bef7 1707 trie_leaf_remove(t, l);
19baf839 1708
91b9a277 1709 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1710 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1711
2373ce1c
RO
1712 fib_release_info(fa->fa_info);
1713 alias_free_mem_rcu(fa);
91b9a277 1714 return 0;
19baf839
RO
1715}
1716
ef3660ce 1717static int trie_flush_list(struct list_head *head)
19baf839
RO
1718{
1719 struct fib_alias *fa, *fa_node;
1720 int found = 0;
1721
1722 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1723 struct fib_info *fi = fa->fa_info;
19baf839 1724
2373ce1c
RO
1725 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1726 list_del_rcu(&fa->fa_list);
1727 fib_release_info(fa->fa_info);
1728 alias_free_mem_rcu(fa);
19baf839
RO
1729 found++;
1730 }
1731 }
1732 return found;
1733}
1734
ef3660ce 1735static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1736{
1737 int found = 0;
1738 struct hlist_head *lih = &l->list;
1739 struct hlist_node *node, *tmp;
1740 struct leaf_info *li = NULL;
1741
1742 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1743 found += trie_flush_list(&li->falh);
19baf839
RO
1744
1745 if (list_empty(&li->falh)) {
2373ce1c 1746 hlist_del_rcu(&li->hlist);
19baf839
RO
1747 free_leaf_info(li);
1748 }
1749 }
1750 return found;
1751}
1752
82cfbb00
SH
1753/*
1754 * Scan for the next right leaf starting at node p->child[idx]
1755 * Since we have back pointer, no recursion necessary.
1756 */
b299e4f0 1757static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
19baf839 1758{
82cfbb00
SH
1759 do {
1760 t_key idx;
c877efb2 1761
c877efb2 1762 if (c)
82cfbb00 1763 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1764 else
82cfbb00 1765 idx = 0;
2373ce1c 1766
82cfbb00
SH
1767 while (idx < 1u << p->bits) {
1768 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1769 if (!c)
91b9a277
OJ
1770 continue;
1771
82cfbb00 1772 if (IS_LEAF(c)) {
0a5c0475 1773 prefetch(rcu_dereference_rtnl(p->child[idx]));
82cfbb00 1774 return (struct leaf *) c;
19baf839 1775 }
82cfbb00
SH
1776
1777 /* Rescan start scanning in new node */
1778 p = (struct tnode *) c;
1779 idx = 0;
19baf839 1780 }
82cfbb00
SH
1781
1782 /* Node empty, walk back up to parent */
b299e4f0 1783 c = (struct rt_trie_node *) p;
a034ee3c 1784 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1785
1786 return NULL; /* Root of trie */
1787}
1788
82cfbb00
SH
1789static struct leaf *trie_firstleaf(struct trie *t)
1790{
a034ee3c 1791 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1792
1793 if (!n)
1794 return NULL;
1795
1796 if (IS_LEAF(n)) /* trie is just a leaf */
1797 return (struct leaf *) n;
1798
1799 return leaf_walk_rcu(n, NULL);
1800}
1801
1802static struct leaf *trie_nextleaf(struct leaf *l)
1803{
b299e4f0 1804 struct rt_trie_node *c = (struct rt_trie_node *) l;
b902e573 1805 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1806
1807 if (!p)
1808 return NULL; /* trie with just one leaf */
1809
1810 return leaf_walk_rcu(p, c);
19baf839
RO
1811}
1812
71d67e66
SH
1813static struct leaf *trie_leafindex(struct trie *t, int index)
1814{
1815 struct leaf *l = trie_firstleaf(t);
1816
ec28cf73 1817 while (l && index-- > 0)
71d67e66 1818 l = trie_nextleaf(l);
ec28cf73 1819
71d67e66
SH
1820 return l;
1821}
1822
1823
d562f1f8
RO
1824/*
1825 * Caller must hold RTNL.
1826 */
16c6cf8b 1827int fib_table_flush(struct fib_table *tb)
19baf839
RO
1828{
1829 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1830 struct leaf *l, *ll = NULL;
82cfbb00 1831 int found = 0;
19baf839 1832
82cfbb00 1833 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1834 found += trie_flush_leaf(l);
19baf839
RO
1835
1836 if (ll && hlist_empty(&ll->list))
9195bef7 1837 trie_leaf_remove(t, ll);
19baf839
RO
1838 ll = l;
1839 }
1840
1841 if (ll && hlist_empty(&ll->list))
9195bef7 1842 trie_leaf_remove(t, ll);
19baf839 1843
0c7770c7 1844 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1845 return found;
1846}
1847
4aa2c466
PE
1848void fib_free_table(struct fib_table *tb)
1849{
1850 kfree(tb);
1851}
1852
a07f5f50
SH
1853static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1854 struct fib_table *tb,
19baf839
RO
1855 struct sk_buff *skb, struct netlink_callback *cb)
1856{
1857 int i, s_i;
1858 struct fib_alias *fa;
32ab5f80 1859 __be32 xkey = htonl(key);
19baf839 1860
71d67e66 1861 s_i = cb->args[5];
19baf839
RO
1862 i = 0;
1863
2373ce1c
RO
1864 /* rcu_read_lock is hold by caller */
1865
1866 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1867 if (i < s_i) {
1868 i++;
1869 continue;
1870 }
19baf839
RO
1871
1872 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1873 cb->nlh->nlmsg_seq,
1874 RTM_NEWROUTE,
1875 tb->tb_id,
1876 fa->fa_type,
be403ea1 1877 xkey,
19baf839
RO
1878 plen,
1879 fa->fa_tos,
64347f78 1880 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1881 cb->args[5] = i;
19baf839 1882 return -1;
91b9a277 1883 }
19baf839
RO
1884 i++;
1885 }
71d67e66 1886 cb->args[5] = i;
19baf839
RO
1887 return skb->len;
1888}
1889
a88ee229
SH
1890static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1891 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1892{
a88ee229
SH
1893 struct leaf_info *li;
1894 struct hlist_node *node;
1895 int i, s_i;
19baf839 1896
71d67e66 1897 s_i = cb->args[4];
a88ee229 1898 i = 0;
19baf839 1899
a88ee229
SH
1900 /* rcu_read_lock is hold by caller */
1901 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1902 if (i < s_i) {
1903 i++;
19baf839 1904 continue;
a88ee229 1905 }
91b9a277 1906
a88ee229 1907 if (i > s_i)
71d67e66 1908 cb->args[5] = 0;
19baf839 1909
a88ee229 1910 if (list_empty(&li->falh))
19baf839
RO
1911 continue;
1912
a88ee229 1913 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1914 cb->args[4] = i;
19baf839
RO
1915 return -1;
1916 }
a88ee229 1917 i++;
19baf839 1918 }
a88ee229 1919
71d67e66 1920 cb->args[4] = i;
19baf839
RO
1921 return skb->len;
1922}
1923
16c6cf8b
SH
1924int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1925 struct netlink_callback *cb)
19baf839 1926{
a88ee229 1927 struct leaf *l;
19baf839 1928 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1929 t_key key = cb->args[2];
71d67e66 1930 int count = cb->args[3];
19baf839 1931
2373ce1c 1932 rcu_read_lock();
d5ce8a0e
SH
1933 /* Dump starting at last key.
1934 * Note: 0.0.0.0/0 (ie default) is first key.
1935 */
71d67e66 1936 if (count == 0)
d5ce8a0e
SH
1937 l = trie_firstleaf(t);
1938 else {
71d67e66
SH
1939 /* Normally, continue from last key, but if that is missing
1940 * fallback to using slow rescan
1941 */
d5ce8a0e 1942 l = fib_find_node(t, key);
71d67e66
SH
1943 if (!l)
1944 l = trie_leafindex(t, count);
d5ce8a0e 1945 }
a88ee229 1946
d5ce8a0e
SH
1947 while (l) {
1948 cb->args[2] = l->key;
a88ee229 1949 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1950 cb->args[3] = count;
a88ee229 1951 rcu_read_unlock();
a88ee229 1952 return -1;
19baf839 1953 }
d5ce8a0e 1954
71d67e66 1955 ++count;
d5ce8a0e 1956 l = trie_nextleaf(l);
71d67e66
SH
1957 memset(&cb->args[4], 0,
1958 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1959 }
71d67e66 1960 cb->args[3] = count;
2373ce1c 1961 rcu_read_unlock();
a88ee229 1962
19baf839 1963 return skb->len;
19baf839
RO
1964}
1965
5348ba85 1966void __init fib_trie_init(void)
7f9b8052 1967{
a07f5f50
SH
1968 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1969 sizeof(struct fib_alias),
bc3c8c1e
SH
1970 0, SLAB_PANIC, NULL);
1971
1972 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1973 max(sizeof(struct leaf),
1974 sizeof(struct leaf_info)),
1975 0, SLAB_PANIC, NULL);
7f9b8052 1976}
19baf839 1977
7f9b8052 1978
5348ba85 1979struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1980{
1981 struct fib_table *tb;
1982 struct trie *t;
1983
19baf839
RO
1984 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985 GFP_KERNEL);
1986 if (tb == NULL)
1987 return NULL;
1988
1989 tb->tb_id = id;
971b893e 1990 tb->tb_default = -1;
21d8c49e 1991 tb->tb_num_default = 0;
19baf839
RO
1992
1993 t = (struct trie *) tb->tb_data;
c28a1cf4 1994 memset(t, 0, sizeof(*t));
19baf839 1995
19baf839
RO
1996 return tb;
1997}
1998
cb7b593c
SH
1999#ifdef CONFIG_PROC_FS
2000/* Depth first Trie walk iterator */
2001struct fib_trie_iter {
1c340b2f 2002 struct seq_net_private p;
3d3b2d25 2003 struct fib_table *tb;
cb7b593c 2004 struct tnode *tnode;
a034ee3c
ED
2005 unsigned int index;
2006 unsigned int depth;
cb7b593c 2007};
19baf839 2008
b299e4f0 2009static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2010{
cb7b593c 2011 struct tnode *tn = iter->tnode;
a034ee3c 2012 unsigned int cindex = iter->index;
cb7b593c 2013 struct tnode *p;
19baf839 2014
6640e697
EB
2015 /* A single entry routing table */
2016 if (!tn)
2017 return NULL;
2018
cb7b593c
SH
2019 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2020 iter->tnode, iter->index, iter->depth);
2021rescan:
2022 while (cindex < (1<<tn->bits)) {
b299e4f0 2023 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2024
cb7b593c
SH
2025 if (n) {
2026 if (IS_LEAF(n)) {
2027 iter->tnode = tn;
2028 iter->index = cindex + 1;
2029 } else {
2030 /* push down one level */
2031 iter->tnode = (struct tnode *) n;
2032 iter->index = 0;
2033 ++iter->depth;
2034 }
2035 return n;
2036 }
19baf839 2037
cb7b593c
SH
2038 ++cindex;
2039 }
91b9a277 2040
cb7b593c 2041 /* Current node exhausted, pop back up */
b299e4f0 2042 p = node_parent_rcu((struct rt_trie_node *)tn);
cb7b593c
SH
2043 if (p) {
2044 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2045 tn = p;
2046 --iter->depth;
2047 goto rescan;
19baf839 2048 }
cb7b593c
SH
2049
2050 /* got root? */
2051 return NULL;
19baf839
RO
2052}
2053
b299e4f0 2054static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 2055 struct trie *t)
19baf839 2056{
b299e4f0 2057 struct rt_trie_node *n;
5ddf0eb2 2058
132adf54 2059 if (!t)
5ddf0eb2
RO
2060 return NULL;
2061
2062 n = rcu_dereference(t->trie);
3d3b2d25 2063 if (!n)
5ddf0eb2 2064 return NULL;
19baf839 2065
3d3b2d25
SH
2066 if (IS_TNODE(n)) {
2067 iter->tnode = (struct tnode *) n;
2068 iter->index = 0;
2069 iter->depth = 1;
2070 } else {
2071 iter->tnode = NULL;
2072 iter->index = 0;
2073 iter->depth = 0;
91b9a277 2074 }
3d3b2d25
SH
2075
2076 return n;
cb7b593c 2077}
91b9a277 2078
cb7b593c
SH
2079static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2080{
b299e4f0 2081 struct rt_trie_node *n;
cb7b593c 2082 struct fib_trie_iter iter;
91b9a277 2083
cb7b593c 2084 memset(s, 0, sizeof(*s));
91b9a277 2085
cb7b593c 2086 rcu_read_lock();
3d3b2d25 2087 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2088 if (IS_LEAF(n)) {
93672292
SH
2089 struct leaf *l = (struct leaf *)n;
2090 struct leaf_info *li;
2091 struct hlist_node *tmp;
2092
cb7b593c
SH
2093 s->leaves++;
2094 s->totdepth += iter.depth;
2095 if (iter.depth > s->maxdepth)
2096 s->maxdepth = iter.depth;
93672292
SH
2097
2098 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2099 ++s->prefixes;
cb7b593c
SH
2100 } else {
2101 const struct tnode *tn = (const struct tnode *) n;
2102 int i;
2103
2104 s->tnodes++;
132adf54 2105 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2106 s->nodesizes[tn->bits]++;
2107
cb7b593c
SH
2108 for (i = 0; i < (1<<tn->bits); i++)
2109 if (!tn->child[i])
2110 s->nullpointers++;
19baf839 2111 }
19baf839 2112 }
2373ce1c 2113 rcu_read_unlock();
19baf839
RO
2114}
2115
cb7b593c
SH
2116/*
2117 * This outputs /proc/net/fib_triestats
2118 */
2119static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2120{
a034ee3c 2121 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2122
cb7b593c
SH
2123 if (stat->leaves)
2124 avdepth = stat->totdepth*100 / stat->leaves;
2125 else
2126 avdepth = 0;
91b9a277 2127
a07f5f50
SH
2128 seq_printf(seq, "\tAver depth: %u.%02d\n",
2129 avdepth / 100, avdepth % 100);
cb7b593c 2130 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2131
cb7b593c 2132 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2133 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2134
2135 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2136 bytes += sizeof(struct leaf_info) * stat->prefixes;
2137
187b5188 2138 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2139 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2140
06ef921d
RO
2141 max = MAX_STAT_DEPTH;
2142 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2143 max--;
19baf839 2144
cb7b593c
SH
2145 pointers = 0;
2146 for (i = 1; i <= max; i++)
2147 if (stat->nodesizes[i] != 0) {
187b5188 2148 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2149 pointers += (1<<i) * stat->nodesizes[i];
2150 }
2151 seq_putc(seq, '\n');
187b5188 2152 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2153
b299e4f0 2154 bytes += sizeof(struct rt_trie_node *) * pointers;
187b5188
SH
2155 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2156 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2157}
2373ce1c 2158
cb7b593c 2159#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2160static void trie_show_usage(struct seq_file *seq,
2161 const struct trie_use_stats *stats)
2162{
2163 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2164 seq_printf(seq, "gets = %u\n", stats->gets);
2165 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2166 seq_printf(seq, "semantic match passed = %u\n",
2167 stats->semantic_match_passed);
2168 seq_printf(seq, "semantic match miss = %u\n",
2169 stats->semantic_match_miss);
2170 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2171 seq_printf(seq, "skipped node resize = %u\n\n",
2172 stats->resize_node_skipped);
cb7b593c 2173}
66a2f7fd
SH
2174#endif /* CONFIG_IP_FIB_TRIE_STATS */
2175
3d3b2d25 2176static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2177{
3d3b2d25
SH
2178 if (tb->tb_id == RT_TABLE_LOCAL)
2179 seq_puts(seq, "Local:\n");
2180 else if (tb->tb_id == RT_TABLE_MAIN)
2181 seq_puts(seq, "Main:\n");
2182 else
2183 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2184}
19baf839 2185
3d3b2d25 2186
cb7b593c
SH
2187static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2188{
1c340b2f 2189 struct net *net = (struct net *)seq->private;
3d3b2d25 2190 unsigned int h;
877a9bff 2191
d717a9a6 2192 seq_printf(seq,
a07f5f50
SH
2193 "Basic info: size of leaf:"
2194 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2195 sizeof(struct leaf), sizeof(struct tnode));
2196
3d3b2d25
SH
2197 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2198 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2199 struct hlist_node *node;
2200 struct fib_table *tb;
2201
2202 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2203 struct trie *t = (struct trie *) tb->tb_data;
2204 struct trie_stat stat;
877a9bff 2205
3d3b2d25
SH
2206 if (!t)
2207 continue;
2208
2209 fib_table_print(seq, tb);
2210
2211 trie_collect_stats(t, &stat);
2212 trie_show_stats(seq, &stat);
2213#ifdef CONFIG_IP_FIB_TRIE_STATS
2214 trie_show_usage(seq, &t->stats);
2215#endif
2216 }
2217 }
19baf839 2218
cb7b593c 2219 return 0;
19baf839
RO
2220}
2221
cb7b593c 2222static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2223{
de05c557 2224 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2225}
2226
9a32144e 2227static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2228 .owner = THIS_MODULE,
2229 .open = fib_triestat_seq_open,
2230 .read = seq_read,
2231 .llseek = seq_lseek,
b6fcbdb4 2232 .release = single_release_net,
cb7b593c
SH
2233};
2234
b299e4f0 2235static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2236{
1218854a
YH
2237 struct fib_trie_iter *iter = seq->private;
2238 struct net *net = seq_file_net(seq);
cb7b593c 2239 loff_t idx = 0;
3d3b2d25 2240 unsigned int h;
cb7b593c 2241
3d3b2d25
SH
2242 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2243 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2244 struct hlist_node *node;
2245 struct fib_table *tb;
cb7b593c 2246
3d3b2d25 2247 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
b299e4f0 2248 struct rt_trie_node *n;
3d3b2d25
SH
2249
2250 for (n = fib_trie_get_first(iter,
2251 (struct trie *) tb->tb_data);
2252 n; n = fib_trie_get_next(iter))
2253 if (pos == idx++) {
2254 iter->tb = tb;
2255 return n;
2256 }
2257 }
cb7b593c 2258 }
3d3b2d25 2259
19baf839
RO
2260 return NULL;
2261}
2262
cb7b593c 2263static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2264 __acquires(RCU)
19baf839 2265{
cb7b593c 2266 rcu_read_lock();
1218854a 2267 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2268}
2269
cb7b593c 2270static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2271{
cb7b593c 2272 struct fib_trie_iter *iter = seq->private;
1218854a 2273 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2274 struct fib_table *tb = iter->tb;
2275 struct hlist_node *tb_node;
2276 unsigned int h;
b299e4f0 2277 struct rt_trie_node *n;
cb7b593c 2278
19baf839 2279 ++*pos;
3d3b2d25
SH
2280 /* next node in same table */
2281 n = fib_trie_get_next(iter);
2282 if (n)
2283 return n;
19baf839 2284
3d3b2d25
SH
2285 /* walk rest of this hash chain */
2286 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2287 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2288 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2289 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2290 if (n)
2291 goto found;
2292 }
19baf839 2293
3d3b2d25
SH
2294 /* new hash chain */
2295 while (++h < FIB_TABLE_HASHSZ) {
2296 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2297 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2298 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2299 if (n)
2300 goto found;
2301 }
2302 }
cb7b593c 2303 return NULL;
3d3b2d25
SH
2304
2305found:
2306 iter->tb = tb;
2307 return n;
cb7b593c 2308}
19baf839 2309
cb7b593c 2310static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2311 __releases(RCU)
19baf839 2312{
cb7b593c
SH
2313 rcu_read_unlock();
2314}
91b9a277 2315
cb7b593c
SH
2316static void seq_indent(struct seq_file *seq, int n)
2317{
a034ee3c
ED
2318 while (n-- > 0)
2319 seq_puts(seq, " ");
cb7b593c 2320}
19baf839 2321
28d36e37 2322static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2323{
132adf54 2324 switch (s) {
cb7b593c
SH
2325 case RT_SCOPE_UNIVERSE: return "universe";
2326 case RT_SCOPE_SITE: return "site";
2327 case RT_SCOPE_LINK: return "link";
2328 case RT_SCOPE_HOST: return "host";
2329 case RT_SCOPE_NOWHERE: return "nowhere";
2330 default:
28d36e37 2331 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2332 return buf;
2333 }
2334}
19baf839 2335
36cbd3dc 2336static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2337 [RTN_UNSPEC] = "UNSPEC",
2338 [RTN_UNICAST] = "UNICAST",
2339 [RTN_LOCAL] = "LOCAL",
2340 [RTN_BROADCAST] = "BROADCAST",
2341 [RTN_ANYCAST] = "ANYCAST",
2342 [RTN_MULTICAST] = "MULTICAST",
2343 [RTN_BLACKHOLE] = "BLACKHOLE",
2344 [RTN_UNREACHABLE] = "UNREACHABLE",
2345 [RTN_PROHIBIT] = "PROHIBIT",
2346 [RTN_THROW] = "THROW",
2347 [RTN_NAT] = "NAT",
2348 [RTN_XRESOLVE] = "XRESOLVE",
2349};
19baf839 2350
a034ee3c 2351static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2352{
cb7b593c
SH
2353 if (t < __RTN_MAX && rtn_type_names[t])
2354 return rtn_type_names[t];
28d36e37 2355 snprintf(buf, len, "type %u", t);
cb7b593c 2356 return buf;
19baf839
RO
2357}
2358
cb7b593c
SH
2359/* Pretty print the trie */
2360static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2361{
cb7b593c 2362 const struct fib_trie_iter *iter = seq->private;
b299e4f0 2363 struct rt_trie_node *n = v;
c877efb2 2364
3d3b2d25
SH
2365 if (!node_parent_rcu(n))
2366 fib_table_print(seq, iter->tb);
095b8501 2367
cb7b593c
SH
2368 if (IS_TNODE(n)) {
2369 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2370 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2371
1d25cd6c 2372 seq_indent(seq, iter->depth-1);
673d57e7
HH
2373 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2374 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2375 tn->empty_children);
e905a9ed 2376
cb7b593c
SH
2377 } else {
2378 struct leaf *l = (struct leaf *) n;
1328042e
SH
2379 struct leaf_info *li;
2380 struct hlist_node *node;
32ab5f80 2381 __be32 val = htonl(l->key);
cb7b593c
SH
2382
2383 seq_indent(seq, iter->depth);
673d57e7 2384 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2385
2386 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2387 struct fib_alias *fa;
2388
2389 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2390 char buf1[32], buf2[32];
2391
2392 seq_indent(seq, iter->depth+1);
2393 seq_printf(seq, " /%d %s %s", li->plen,
2394 rtn_scope(buf1, sizeof(buf1),
37e826c5 2395 fa->fa_info->fib_scope),
1328042e
SH
2396 rtn_type(buf2, sizeof(buf2),
2397 fa->fa_type));
2398 if (fa->fa_tos)
b9c4d82a 2399 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2400 seq_putc(seq, '\n');
cb7b593c
SH
2401 }
2402 }
19baf839 2403 }
cb7b593c 2404
19baf839
RO
2405 return 0;
2406}
2407
f690808e 2408static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2409 .start = fib_trie_seq_start,
2410 .next = fib_trie_seq_next,
2411 .stop = fib_trie_seq_stop,
2412 .show = fib_trie_seq_show,
19baf839
RO
2413};
2414
cb7b593c 2415static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2416{
1c340b2f
DL
2417 return seq_open_net(inode, file, &fib_trie_seq_ops,
2418 sizeof(struct fib_trie_iter));
19baf839
RO
2419}
2420
9a32144e 2421static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2422 .owner = THIS_MODULE,
2423 .open = fib_trie_seq_open,
2424 .read = seq_read,
2425 .llseek = seq_lseek,
1c340b2f 2426 .release = seq_release_net,
19baf839
RO
2427};
2428
8315f5d8
SH
2429struct fib_route_iter {
2430 struct seq_net_private p;
2431 struct trie *main_trie;
2432 loff_t pos;
2433 t_key key;
2434};
2435
2436static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2437{
2438 struct leaf *l = NULL;
2439 struct trie *t = iter->main_trie;
2440
2441 /* use cache location of last found key */
2442 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2443 pos -= iter->pos;
2444 else {
2445 iter->pos = 0;
2446 l = trie_firstleaf(t);
2447 }
2448
2449 while (l && pos-- > 0) {
2450 iter->pos++;
2451 l = trie_nextleaf(l);
2452 }
2453
2454 if (l)
2455 iter->key = pos; /* remember it */
2456 else
2457 iter->pos = 0; /* forget it */
2458
2459 return l;
2460}
2461
2462static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2463 __acquires(RCU)
2464{
2465 struct fib_route_iter *iter = seq->private;
2466 struct fib_table *tb;
2467
2468 rcu_read_lock();
1218854a 2469 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2470 if (!tb)
2471 return NULL;
2472
2473 iter->main_trie = (struct trie *) tb->tb_data;
2474 if (*pos == 0)
2475 return SEQ_START_TOKEN;
2476 else
2477 return fib_route_get_idx(iter, *pos - 1);
2478}
2479
2480static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2481{
2482 struct fib_route_iter *iter = seq->private;
2483 struct leaf *l = v;
2484
2485 ++*pos;
2486 if (v == SEQ_START_TOKEN) {
2487 iter->pos = 0;
2488 l = trie_firstleaf(iter->main_trie);
2489 } else {
2490 iter->pos++;
2491 l = trie_nextleaf(l);
2492 }
2493
2494 if (l)
2495 iter->key = l->key;
2496 else
2497 iter->pos = 0;
2498 return l;
2499}
2500
2501static void fib_route_seq_stop(struct seq_file *seq, void *v)
2502 __releases(RCU)
2503{
2504 rcu_read_unlock();
2505}
2506
a034ee3c 2507static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2508{
a034ee3c 2509 unsigned int flags = 0;
19baf839 2510
a034ee3c
ED
2511 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2512 flags = RTF_REJECT;
cb7b593c
SH
2513 if (fi && fi->fib_nh->nh_gw)
2514 flags |= RTF_GATEWAY;
32ab5f80 2515 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2516 flags |= RTF_HOST;
2517 flags |= RTF_UP;
2518 return flags;
19baf839
RO
2519}
2520
cb7b593c
SH
2521/*
2522 * This outputs /proc/net/route.
2523 * The format of the file is not supposed to be changed
a034ee3c 2524 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2525 * legacy utilities
2526 */
2527static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2528{
cb7b593c 2529 struct leaf *l = v;
1328042e
SH
2530 struct leaf_info *li;
2531 struct hlist_node *node;
19baf839 2532
cb7b593c
SH
2533 if (v == SEQ_START_TOKEN) {
2534 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2535 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2536 "\tWindow\tIRTT");
2537 return 0;
2538 }
19baf839 2539
1328042e 2540 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2541 struct fib_alias *fa;
32ab5f80 2542 __be32 mask, prefix;
91b9a277 2543
cb7b593c
SH
2544 mask = inet_make_mask(li->plen);
2545 prefix = htonl(l->key);
19baf839 2546
cb7b593c 2547 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2548 const struct fib_info *fi = fa->fa_info;
a034ee3c 2549 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2550 int len;
19baf839 2551
cb7b593c
SH
2552 if (fa->fa_type == RTN_BROADCAST
2553 || fa->fa_type == RTN_MULTICAST)
2554 continue;
19baf839 2555
cb7b593c 2556 if (fi)
5e659e4c
PE
2557 seq_printf(seq,
2558 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2559 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2560 fi->fib_dev ? fi->fib_dev->name : "*",
2561 prefix,
2562 fi->fib_nh->nh_gw, flags, 0, 0,
2563 fi->fib_priority,
2564 mask,
a07f5f50
SH
2565 (fi->fib_advmss ?
2566 fi->fib_advmss + 40 : 0),
cb7b593c 2567 fi->fib_window,
5e659e4c 2568 fi->fib_rtt >> 3, &len);
cb7b593c 2569 else
5e659e4c
PE
2570 seq_printf(seq,
2571 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2572 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2573 prefix, 0, flags, 0, 0, 0,
5e659e4c 2574 mask, 0, 0, 0, &len);
19baf839 2575
5e659e4c 2576 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2577 }
19baf839
RO
2578 }
2579
2580 return 0;
2581}
2582
f690808e 2583static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2584 .start = fib_route_seq_start,
2585 .next = fib_route_seq_next,
2586 .stop = fib_route_seq_stop,
cb7b593c 2587 .show = fib_route_seq_show,
19baf839
RO
2588};
2589
cb7b593c 2590static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2591{
1c340b2f 2592 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2593 sizeof(struct fib_route_iter));
19baf839
RO
2594}
2595
9a32144e 2596static const struct file_operations fib_route_fops = {
cb7b593c
SH
2597 .owner = THIS_MODULE,
2598 .open = fib_route_seq_open,
2599 .read = seq_read,
2600 .llseek = seq_lseek,
1c340b2f 2601 .release = seq_release_net,
19baf839
RO
2602};
2603
61a02653 2604int __net_init fib_proc_init(struct net *net)
19baf839 2605{
61a02653 2606 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2607 goto out1;
2608
61a02653
DL
2609 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2610 &fib_triestat_fops))
cb7b593c
SH
2611 goto out2;
2612
61a02653 2613 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2614 goto out3;
2615
19baf839 2616 return 0;
cb7b593c
SH
2617
2618out3:
61a02653 2619 proc_net_remove(net, "fib_triestat");
cb7b593c 2620out2:
61a02653 2621 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2622out1:
2623 return -ENOMEM;
19baf839
RO
2624}
2625
61a02653 2626void __net_exit fib_proc_exit(struct net *net)
19baf839 2627{
61a02653
DL
2628 proc_net_remove(net, "fib_trie");
2629 proc_net_remove(net, "fib_triestat");
2630 proc_net_remove(net, "route");
19baf839
RO
2631}
2632
2633#endif /* CONFIG_PROC_FS */
This page took 0.802013 seconds and 5 git commands to generate.