qlge: Add offload features to vlan interfaces
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
70c71606 74#include <linux/prefetch.h>
bc3b2d7f 75#include <linux/export.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839 98
b299e4f0 99struct rt_trie_node {
91b9a277 100 unsigned long parent;
8d965444 101 t_key key;
19baf839
RO
102};
103
104struct leaf {
91b9a277 105 unsigned long parent;
8d965444 106 t_key key;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
113 int plen;
5c74501f 114 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 115 struct list_head falh;
5c74501f 116 struct rcu_head rcu;
19baf839
RO
117};
118
119struct tnode {
91b9a277 120 unsigned long parent;
8d965444 121 t_key key;
112d8cfc
ED
122 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
123 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
124 unsigned int full_children; /* KEYLENGTH bits needed */
125 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
126 union {
127 struct rcu_head rcu;
128 struct work_struct work;
e0f7cb8c 129 struct tnode *tnode_free;
15be75cd 130 };
0a5c0475 131 struct rt_trie_node __rcu *child[0];
19baf839
RO
132};
133
134#ifdef CONFIG_IP_FIB_TRIE_STATS
135struct trie_use_stats {
136 unsigned int gets;
137 unsigned int backtrack;
138 unsigned int semantic_match_passed;
139 unsigned int semantic_match_miss;
140 unsigned int null_node_hit;
2f36895a 141 unsigned int resize_node_skipped;
19baf839
RO
142};
143#endif
144
145struct trie_stat {
146 unsigned int totdepth;
147 unsigned int maxdepth;
148 unsigned int tnodes;
149 unsigned int leaves;
150 unsigned int nullpointers;
93672292 151 unsigned int prefixes;
06ef921d 152 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 153};
19baf839
RO
154
155struct trie {
0a5c0475 156 struct rt_trie_node __rcu *trie;
19baf839
RO
157#ifdef CONFIG_IP_FIB_TRIE_STATS
158 struct trie_use_stats stats;
159#endif
19baf839
RO
160};
161
b299e4f0
DM
162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 164 int wasfull);
b299e4f0 165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
166static struct tnode *inflate(struct trie *t, struct tnode *tn);
167static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
168/* tnodes to free after resize(); protected by RTNL */
169static struct tnode *tnode_free_head;
c3059477
JP
170static size_t tnode_free_size;
171
172/*
173 * synchronize_rcu after call_rcu for that many pages; it should be especially
174 * useful before resizing the root node with PREEMPT_NONE configs; the value was
175 * obtained experimentally, aiming to avoid visible slowdown.
176 */
177static const int sync_pages = 128;
19baf839 178
e18b890b 179static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 180static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 181
0a5c0475
ED
182/*
183 * caller must hold RTNL
184 */
185static inline struct tnode *node_parent(const struct rt_trie_node *node)
06801916 186{
0a5c0475
ED
187 unsigned long parent;
188
189 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
190
191 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
b59cfbf7
ED
192}
193
0a5c0475
ED
194/*
195 * caller must hold RCU read lock or RTNL
196 */
197static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
b59cfbf7 198{
0a5c0475
ED
199 unsigned long parent;
200
201 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
202 lockdep_rtnl_is_held());
06801916 203
0a5c0475 204 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
06801916
SH
205}
206
cf778b00 207/* Same as rcu_assign_pointer
6440cc9e
SH
208 * but that macro() assumes that value is a pointer.
209 */
b299e4f0 210static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
06801916 211{
6440cc9e
SH
212 smp_wmb();
213 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 214}
2373ce1c 215
0a5c0475
ED
216/*
217 * caller must hold RTNL
218 */
219static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7
ED
220{
221 BUG_ON(i >= 1U << tn->bits);
2373ce1c 222
0a5c0475 223 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
224}
225
0a5c0475
ED
226/*
227 * caller must hold RCU read lock or RTNL
228 */
229static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 230{
0a5c0475 231 BUG_ON(i >= 1U << tn->bits);
19baf839 232
0a5c0475 233 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
234}
235
bb435b8d 236static inline int tnode_child_length(const struct tnode *tn)
19baf839 237{
91b9a277 238 return 1 << tn->bits;
19baf839
RO
239}
240
3b004569 241static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
242{
243 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
244}
245
3b004569 246static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 247{
91b9a277 248 if (offset < KEYLENGTH)
19baf839 249 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 250 else
19baf839
RO
251 return 0;
252}
253
254static inline int tkey_equals(t_key a, t_key b)
255{
c877efb2 256 return a == b;
19baf839
RO
257}
258
259static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
260{
c877efb2
SH
261 if (bits == 0 || offset >= KEYLENGTH)
262 return 1;
91b9a277
OJ
263 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
264 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 265}
19baf839
RO
266
267static inline int tkey_mismatch(t_key a, int offset, t_key b)
268{
269 t_key diff = a ^ b;
270 int i = offset;
271
c877efb2
SH
272 if (!diff)
273 return 0;
274 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
275 i++;
276 return i;
277}
278
19baf839 279/*
e905a9ed
YH
280 To understand this stuff, an understanding of keys and all their bits is
281 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
282 all of the bits in that key are significant.
283
284 Consider a node 'n' and its parent 'tp'.
285
e905a9ed
YH
286 If n is a leaf, every bit in its key is significant. Its presence is
287 necessitated by path compression, since during a tree traversal (when
288 searching for a leaf - unless we are doing an insertion) we will completely
289 ignore all skipped bits we encounter. Thus we need to verify, at the end of
290 a potentially successful search, that we have indeed been walking the
19baf839
RO
291 correct key path.
292
e905a9ed
YH
293 Note that we can never "miss" the correct key in the tree if present by
294 following the wrong path. Path compression ensures that segments of the key
295 that are the same for all keys with a given prefix are skipped, but the
296 skipped part *is* identical for each node in the subtrie below the skipped
297 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
298 call to tkey_sub_equals() in trie_insert().
299
e905a9ed 300 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
301 have many different meanings.
302
e905a9ed 303 Example:
19baf839
RO
304 _________________________________________________________________
305 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
306 -----------------------------------------------------------------
e905a9ed 307 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
308
309 _________________________________________________________________
310 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
311 -----------------------------------------------------------------
312 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
313
314 tp->pos = 7
315 tp->bits = 3
316 n->pos = 15
91b9a277 317 n->bits = 4
19baf839 318
e905a9ed
YH
319 First, let's just ignore the bits that come before the parent tp, that is
320 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
321 not use them for anything.
322
323 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 324 index into the parent's child array. That is, they will be used to find
19baf839
RO
325 'n' among tp's children.
326
327 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
328 for the node n.
329
e905a9ed 330 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
331 of the bits are really not needed or indeed known in n->key.
332
e905a9ed 333 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 334 n's child array, and will of course be different for each child.
e905a9ed 335
c877efb2 336
19baf839
RO
337 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
338 at this point.
339
340*/
341
0c7770c7 342static inline void check_tnode(const struct tnode *tn)
19baf839 343{
0c7770c7 344 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
345}
346
f5026fab
DL
347static const int halve_threshold = 25;
348static const int inflate_threshold = 50;
345aa031 349static const int halve_threshold_root = 15;
80b71b80 350static const int inflate_threshold_root = 30;
2373ce1c
RO
351
352static void __alias_free_mem(struct rcu_head *head)
19baf839 353{
2373ce1c
RO
354 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
355 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
356}
357
2373ce1c 358static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 359{
2373ce1c
RO
360 call_rcu(&fa->rcu, __alias_free_mem);
361}
91b9a277 362
2373ce1c
RO
363static void __leaf_free_rcu(struct rcu_head *head)
364{
bc3c8c1e
SH
365 struct leaf *l = container_of(head, struct leaf, rcu);
366 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 367}
91b9a277 368
387a5487
SH
369static inline void free_leaf(struct leaf *l)
370{
371 call_rcu_bh(&l->rcu, __leaf_free_rcu);
372}
373
2373ce1c 374static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 375{
bceb0f45 376 kfree_rcu(leaf, rcu);
19baf839
RO
377}
378
8d965444 379static struct tnode *tnode_alloc(size_t size)
f0e36f8c 380{
2373ce1c 381 if (size <= PAGE_SIZE)
8d965444 382 return kzalloc(size, GFP_KERNEL);
15be75cd 383 else
7a1c8e5a 384 return vzalloc(size);
15be75cd 385}
2373ce1c 386
15be75cd
SH
387static void __tnode_vfree(struct work_struct *arg)
388{
389 struct tnode *tn = container_of(arg, struct tnode, work);
390 vfree(tn);
f0e36f8c
PM
391}
392
2373ce1c 393static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 394{
2373ce1c 395 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444 396 size_t size = sizeof(struct tnode) +
b299e4f0 397 (sizeof(struct rt_trie_node *) << tn->bits);
f0e36f8c
PM
398
399 if (size <= PAGE_SIZE)
400 kfree(tn);
15be75cd
SH
401 else {
402 INIT_WORK(&tn->work, __tnode_vfree);
403 schedule_work(&tn->work);
404 }
f0e36f8c
PM
405}
406
2373ce1c
RO
407static inline void tnode_free(struct tnode *tn)
408{
387a5487
SH
409 if (IS_LEAF(tn))
410 free_leaf((struct leaf *) tn);
411 else
550e29bc 412 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
413}
414
e0f7cb8c
JP
415static void tnode_free_safe(struct tnode *tn)
416{
417 BUG_ON(IS_LEAF(tn));
7b85576d
JP
418 tn->tnode_free = tnode_free_head;
419 tnode_free_head = tn;
c3059477 420 tnode_free_size += sizeof(struct tnode) +
b299e4f0 421 (sizeof(struct rt_trie_node *) << tn->bits);
e0f7cb8c
JP
422}
423
424static void tnode_free_flush(void)
425{
426 struct tnode *tn;
427
428 while ((tn = tnode_free_head)) {
429 tnode_free_head = tn->tnode_free;
430 tn->tnode_free = NULL;
431 tnode_free(tn);
432 }
c3059477
JP
433
434 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
435 tnode_free_size = 0;
436 synchronize_rcu();
437 }
e0f7cb8c
JP
438}
439
2373ce1c
RO
440static struct leaf *leaf_new(void)
441{
bc3c8c1e 442 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
443 if (l) {
444 l->parent = T_LEAF;
445 INIT_HLIST_HEAD(&l->list);
446 }
447 return l;
448}
449
450static struct leaf_info *leaf_info_new(int plen)
451{
452 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
453 if (li) {
454 li->plen = plen;
5c74501f 455 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
456 INIT_LIST_HEAD(&li->falh);
457 }
458 return li;
459}
460
a07f5f50 461static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 462{
b299e4f0 463 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
f0e36f8c 464 struct tnode *tn = tnode_alloc(sz);
19baf839 465
91b9a277 466 if (tn) {
2373ce1c 467 tn->parent = T_TNODE;
19baf839
RO
468 tn->pos = pos;
469 tn->bits = bits;
470 tn->key = key;
471 tn->full_children = 0;
472 tn->empty_children = 1<<bits;
473 }
c877efb2 474
a034ee3c 475 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
b299e4f0 476 sizeof(struct rt_trie_node) << bits);
19baf839
RO
477 return tn;
478}
479
19baf839
RO
480/*
481 * Check whether a tnode 'n' is "full", i.e. it is an internal node
482 * and no bits are skipped. See discussion in dyntree paper p. 6
483 */
484
b299e4f0 485static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
19baf839 486{
c877efb2 487 if (n == NULL || IS_LEAF(n))
19baf839
RO
488 return 0;
489
490 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
491}
492
a07f5f50 493static inline void put_child(struct trie *t, struct tnode *tn, int i,
b299e4f0 494 struct rt_trie_node *n)
19baf839
RO
495{
496 tnode_put_child_reorg(tn, i, n, -1);
497}
498
c877efb2 499 /*
19baf839
RO
500 * Add a child at position i overwriting the old value.
501 * Update the value of full_children and empty_children.
502 */
503
b299e4f0 504static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 505 int wasfull)
19baf839 506{
0a5c0475 507 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
508 int isfull;
509
0c7770c7
SH
510 BUG_ON(i >= 1<<tn->bits);
511
19baf839
RO
512 /* update emptyChildren */
513 if (n == NULL && chi != NULL)
514 tn->empty_children++;
515 else if (n != NULL && chi == NULL)
516 tn->empty_children--;
c877efb2 517
19baf839 518 /* update fullChildren */
91b9a277 519 if (wasfull == -1)
19baf839
RO
520 wasfull = tnode_full(tn, chi);
521
522 isfull = tnode_full(tn, n);
c877efb2 523 if (wasfull && !isfull)
19baf839 524 tn->full_children--;
c877efb2 525 else if (!wasfull && isfull)
19baf839 526 tn->full_children++;
91b9a277 527
c877efb2 528 if (n)
06801916 529 node_set_parent(n, tn);
19baf839 530
cf778b00 531 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
532}
533
80b71b80 534#define MAX_WORK 10
b299e4f0 535static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
536{
537 int i;
2f80b3c8 538 struct tnode *old_tn;
e6308be8
RO
539 int inflate_threshold_use;
540 int halve_threshold_use;
80b71b80 541 int max_work;
19baf839 542
e905a9ed 543 if (!tn)
19baf839
RO
544 return NULL;
545
0c7770c7
SH
546 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
547 tn, inflate_threshold, halve_threshold);
19baf839
RO
548
549 /* No children */
550 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 551 tnode_free_safe(tn);
19baf839
RO
552 return NULL;
553 }
554 /* One child */
555 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 556 goto one_child;
c877efb2 557 /*
19baf839
RO
558 * Double as long as the resulting node has a number of
559 * nonempty nodes that are above the threshold.
560 */
561
562 /*
c877efb2
SH
563 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
564 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 565 * Telecommunications, page 6:
c877efb2 566 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
567 * children in the *doubled* node is at least 'high'."
568 *
c877efb2
SH
569 * 'high' in this instance is the variable 'inflate_threshold'. It
570 * is expressed as a percentage, so we multiply it with
571 * tnode_child_length() and instead of multiplying by 2 (since the
572 * child array will be doubled by inflate()) and multiplying
573 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 574 * multiply the left-hand side by 50.
c877efb2
SH
575 *
576 * The left-hand side may look a bit weird: tnode_child_length(tn)
577 * - tn->empty_children is of course the number of non-null children
578 * in the current node. tn->full_children is the number of "full"
19baf839 579 * children, that is non-null tnodes with a skip value of 0.
c877efb2 580 * All of those will be doubled in the resulting inflated tnode, so
19baf839 581 * we just count them one extra time here.
c877efb2 582 *
19baf839 583 * A clearer way to write this would be:
c877efb2 584 *
19baf839 585 * to_be_doubled = tn->full_children;
c877efb2 586 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
587 * tn->full_children;
588 *
589 * new_child_length = tnode_child_length(tn) * 2;
590 *
c877efb2 591 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
592 * new_child_length;
593 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
594 *
595 * ...and so on, tho it would mess up the while () loop.
596 *
19baf839
RO
597 * anyway,
598 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
599 * inflate_threshold
c877efb2 600 *
19baf839
RO
601 * avoid a division:
602 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
603 * inflate_threshold * new_child_length
c877efb2 604 *
19baf839 605 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 606 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 607 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 608 *
19baf839 609 * expand new_child_length:
c877efb2 610 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 611 * tn->full_children) >=
19baf839 612 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 613 *
19baf839 614 * shorten again:
c877efb2 615 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 616 * tn->empty_children) >= inflate_threshold *
19baf839 617 * tnode_child_length(tn)
c877efb2 618 *
19baf839
RO
619 */
620
621 check_tnode(tn);
c877efb2 622
e6308be8
RO
623 /* Keep root node larger */
624
b299e4f0 625 if (!node_parent((struct rt_trie_node *)tn)) {
80b71b80
JL
626 inflate_threshold_use = inflate_threshold_root;
627 halve_threshold_use = halve_threshold_root;
a034ee3c 628 } else {
e6308be8 629 inflate_threshold_use = inflate_threshold;
80b71b80
JL
630 halve_threshold_use = halve_threshold;
631 }
e6308be8 632
80b71b80
JL
633 max_work = MAX_WORK;
634 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
635 50 * (tn->full_children + tnode_child_length(tn)
636 - tn->empty_children)
637 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 638
2f80b3c8
RO
639 old_tn = tn;
640 tn = inflate(t, tn);
a07f5f50 641
2f80b3c8
RO
642 if (IS_ERR(tn)) {
643 tn = old_tn;
2f36895a
RO
644#ifdef CONFIG_IP_FIB_TRIE_STATS
645 t->stats.resize_node_skipped++;
646#endif
647 break;
648 }
19baf839
RO
649 }
650
651 check_tnode(tn);
652
80b71b80 653 /* Return if at least one inflate is run */
a034ee3c 654 if (max_work != MAX_WORK)
b299e4f0 655 return (struct rt_trie_node *) tn;
80b71b80 656
19baf839
RO
657 /*
658 * Halve as long as the number of empty children in this
659 * node is above threshold.
660 */
2f36895a 661
80b71b80
JL
662 max_work = MAX_WORK;
663 while (tn->bits > 1 && max_work-- &&
19baf839 664 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 665 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 666
2f80b3c8
RO
667 old_tn = tn;
668 tn = halve(t, tn);
669 if (IS_ERR(tn)) {
670 tn = old_tn;
2f36895a
RO
671#ifdef CONFIG_IP_FIB_TRIE_STATS
672 t->stats.resize_node_skipped++;
673#endif
674 break;
675 }
676 }
19baf839 677
c877efb2 678
19baf839 679 /* Only one child remains */
80b71b80
JL
680 if (tn->empty_children == tnode_child_length(tn) - 1) {
681one_child:
19baf839 682 for (i = 0; i < tnode_child_length(tn); i++) {
b299e4f0 683 struct rt_trie_node *n;
19baf839 684
0a5c0475 685 n = rtnl_dereference(tn->child[i]);
2373ce1c 686 if (!n)
91b9a277 687 continue;
91b9a277
OJ
688
689 /* compress one level */
690
06801916 691 node_set_parent(n, NULL);
e0f7cb8c 692 tnode_free_safe(tn);
91b9a277 693 return n;
19baf839 694 }
80b71b80 695 }
b299e4f0 696 return (struct rt_trie_node *) tn;
19baf839
RO
697}
698
0a5c0475
ED
699
700static void tnode_clean_free(struct tnode *tn)
701{
702 int i;
703 struct tnode *tofree;
704
705 for (i = 0; i < tnode_child_length(tn); i++) {
706 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
707 if (tofree)
708 tnode_free(tofree);
709 }
710 tnode_free(tn);
711}
712
2f80b3c8 713static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 714{
19baf839
RO
715 struct tnode *oldtnode = tn;
716 int olen = tnode_child_length(tn);
717 int i;
718
0c7770c7 719 pr_debug("In inflate\n");
19baf839
RO
720
721 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
722
0c7770c7 723 if (!tn)
2f80b3c8 724 return ERR_PTR(-ENOMEM);
2f36895a
RO
725
726 /*
c877efb2
SH
727 * Preallocate and store tnodes before the actual work so we
728 * don't get into an inconsistent state if memory allocation
729 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
730 * of tnode is ignored.
731 */
91b9a277
OJ
732
733 for (i = 0; i < olen; i++) {
a07f5f50 734 struct tnode *inode;
2f36895a 735
a07f5f50 736 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
737 if (inode &&
738 IS_TNODE(inode) &&
739 inode->pos == oldtnode->pos + oldtnode->bits &&
740 inode->bits > 1) {
741 struct tnode *left, *right;
ab66b4a7 742 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 743
2f36895a
RO
744 left = tnode_new(inode->key&(~m), inode->pos + 1,
745 inode->bits - 1);
2f80b3c8
RO
746 if (!left)
747 goto nomem;
91b9a277 748
2f36895a
RO
749 right = tnode_new(inode->key|m, inode->pos + 1,
750 inode->bits - 1);
751
e905a9ed 752 if (!right) {
2f80b3c8
RO
753 tnode_free(left);
754 goto nomem;
e905a9ed 755 }
2f36895a 756
b299e4f0
DM
757 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
758 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
2f36895a
RO
759 }
760 }
761
91b9a277 762 for (i = 0; i < olen; i++) {
c95aaf9a 763 struct tnode *inode;
b299e4f0 764 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
765 struct tnode *left, *right;
766 int size, j;
c877efb2 767
19baf839
RO
768 /* An empty child */
769 if (node == NULL)
770 continue;
771
772 /* A leaf or an internal node with skipped bits */
773
c877efb2 774 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 775 tn->pos + tn->bits - 1) {
a07f5f50
SH
776 if (tkey_extract_bits(node->key,
777 oldtnode->pos + oldtnode->bits,
778 1) == 0)
19baf839
RO
779 put_child(t, tn, 2*i, node);
780 else
781 put_child(t, tn, 2*i+1, node);
782 continue;
783 }
784
785 /* An internal node with two children */
786 inode = (struct tnode *) node;
787
788 if (inode->bits == 1) {
0a5c0475
ED
789 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
790 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 791
e0f7cb8c 792 tnode_free_safe(inode);
91b9a277 793 continue;
19baf839
RO
794 }
795
91b9a277
OJ
796 /* An internal node with more than two children */
797
798 /* We will replace this node 'inode' with two new
799 * ones, 'left' and 'right', each with half of the
800 * original children. The two new nodes will have
801 * a position one bit further down the key and this
802 * means that the "significant" part of their keys
803 * (see the discussion near the top of this file)
804 * will differ by one bit, which will be "0" in
805 * left's key and "1" in right's key. Since we are
806 * moving the key position by one step, the bit that
807 * we are moving away from - the bit at position
808 * (inode->pos) - is the one that will differ between
809 * left and right. So... we synthesize that bit in the
810 * two new keys.
811 * The mask 'm' below will be a single "one" bit at
812 * the position (inode->pos)
813 */
19baf839 814
91b9a277
OJ
815 /* Use the old key, but set the new significant
816 * bit to zero.
817 */
2f36895a 818
91b9a277
OJ
819 left = (struct tnode *) tnode_get_child(tn, 2*i);
820 put_child(t, tn, 2*i, NULL);
2f36895a 821
91b9a277 822 BUG_ON(!left);
2f36895a 823
91b9a277
OJ
824 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
825 put_child(t, tn, 2*i+1, NULL);
19baf839 826
91b9a277 827 BUG_ON(!right);
19baf839 828
91b9a277
OJ
829 size = tnode_child_length(left);
830 for (j = 0; j < size; j++) {
0a5c0475
ED
831 put_child(t, left, j, rtnl_dereference(inode->child[j]));
832 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
19baf839 833 }
91b9a277
OJ
834 put_child(t, tn, 2*i, resize(t, left));
835 put_child(t, tn, 2*i+1, resize(t, right));
836
e0f7cb8c 837 tnode_free_safe(inode);
19baf839 838 }
e0f7cb8c 839 tnode_free_safe(oldtnode);
19baf839 840 return tn;
2f80b3c8 841nomem:
0a5c0475
ED
842 tnode_clean_free(tn);
843 return ERR_PTR(-ENOMEM);
19baf839
RO
844}
845
2f80b3c8 846static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
847{
848 struct tnode *oldtnode = tn;
b299e4f0 849 struct rt_trie_node *left, *right;
19baf839
RO
850 int i;
851 int olen = tnode_child_length(tn);
852
0c7770c7 853 pr_debug("In halve\n");
c877efb2
SH
854
855 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 856
2f80b3c8
RO
857 if (!tn)
858 return ERR_PTR(-ENOMEM);
2f36895a
RO
859
860 /*
c877efb2
SH
861 * Preallocate and store tnodes before the actual work so we
862 * don't get into an inconsistent state if memory allocation
863 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
864 * of tnode is ignored.
865 */
866
91b9a277 867 for (i = 0; i < olen; i += 2) {
2f36895a
RO
868 left = tnode_get_child(oldtnode, i);
869 right = tnode_get_child(oldtnode, i+1);
c877efb2 870
2f36895a 871 /* Two nonempty children */
0c7770c7 872 if (left && right) {
2f80b3c8 873 struct tnode *newn;
0c7770c7 874
2f80b3c8 875 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
876
877 if (!newn)
2f80b3c8 878 goto nomem;
0c7770c7 879
b299e4f0 880 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
2f36895a 881 }
2f36895a 882
2f36895a 883 }
19baf839 884
91b9a277
OJ
885 for (i = 0; i < olen; i += 2) {
886 struct tnode *newBinNode;
887
19baf839
RO
888 left = tnode_get_child(oldtnode, i);
889 right = tnode_get_child(oldtnode, i+1);
c877efb2 890
19baf839
RO
891 /* At least one of the children is empty */
892 if (left == NULL) {
893 if (right == NULL) /* Both are empty */
894 continue;
895 put_child(t, tn, i/2, right);
91b9a277 896 continue;
0c7770c7 897 }
91b9a277
OJ
898
899 if (right == NULL) {
19baf839 900 put_child(t, tn, i/2, left);
91b9a277
OJ
901 continue;
902 }
c877efb2 903
19baf839 904 /* Two nonempty children */
91b9a277
OJ
905 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
906 put_child(t, tn, i/2, NULL);
91b9a277
OJ
907 put_child(t, newBinNode, 0, left);
908 put_child(t, newBinNode, 1, right);
909 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 910 }
e0f7cb8c 911 tnode_free_safe(oldtnode);
19baf839 912 return tn;
2f80b3c8 913nomem:
0a5c0475
ED
914 tnode_clean_free(tn);
915 return ERR_PTR(-ENOMEM);
19baf839
RO
916}
917
772cb712 918/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
919 via get_fa_head and dump */
920
772cb712 921static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 922{
772cb712 923 struct hlist_head *head = &l->list;
19baf839
RO
924 struct hlist_node *node;
925 struct leaf_info *li;
926
2373ce1c 927 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 928 if (li->plen == plen)
19baf839 929 return li;
91b9a277 930
19baf839
RO
931 return NULL;
932}
933
a07f5f50 934static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 935{
772cb712 936 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 937
91b9a277
OJ
938 if (!li)
939 return NULL;
c877efb2 940
91b9a277 941 return &li->falh;
19baf839
RO
942}
943
944static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
945{
e905a9ed
YH
946 struct leaf_info *li = NULL, *last = NULL;
947 struct hlist_node *node;
948
949 if (hlist_empty(head)) {
950 hlist_add_head_rcu(&new->hlist, head);
951 } else {
952 hlist_for_each_entry(li, node, head, hlist) {
953 if (new->plen > li->plen)
954 break;
955
956 last = li;
957 }
958 if (last)
959 hlist_add_after_rcu(&last->hlist, &new->hlist);
960 else
961 hlist_add_before_rcu(&new->hlist, &li->hlist);
962 }
19baf839
RO
963}
964
2373ce1c
RO
965/* rcu_read_lock needs to be hold by caller from readside */
966
19baf839
RO
967static struct leaf *
968fib_find_node(struct trie *t, u32 key)
969{
970 int pos;
971 struct tnode *tn;
b299e4f0 972 struct rt_trie_node *n;
19baf839
RO
973
974 pos = 0;
a034ee3c 975 n = rcu_dereference_rtnl(t->trie);
19baf839
RO
976
977 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
978 tn = (struct tnode *) n;
91b9a277 979
19baf839 980 check_tnode(tn);
91b9a277 981
c877efb2 982 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 983 pos = tn->pos + tn->bits;
a07f5f50
SH
984 n = tnode_get_child_rcu(tn,
985 tkey_extract_bits(key,
986 tn->pos,
987 tn->bits));
91b9a277 988 } else
19baf839
RO
989 break;
990 }
991 /* Case we have found a leaf. Compare prefixes */
992
91b9a277
OJ
993 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
994 return (struct leaf *)n;
995
19baf839
RO
996 return NULL;
997}
998
7b85576d 999static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 1000{
19baf839 1001 int wasfull;
3ed18d76 1002 t_key cindex, key;
06801916 1003 struct tnode *tp;
19baf839 1004
3ed18d76
RO
1005 key = tn->key;
1006
b299e4f0 1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
19baf839
RO
1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
e3192690 1010 tn = (struct tnode *)resize(t, tn);
a07f5f50 1011
e3192690 1012 tnode_put_child_reorg(tp, cindex,
b299e4f0 1013 (struct rt_trie_node *)tn, wasfull);
91b9a277 1014
b299e4f0 1015 tp = node_parent((struct rt_trie_node *) tn);
008440e3 1016 if (!tp)
cf778b00 1017 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
008440e3 1018
e0f7cb8c 1019 tnode_free_flush();
06801916 1020 if (!tp)
19baf839 1021 break;
06801916 1022 tn = tp;
19baf839 1023 }
06801916 1024
19baf839 1025 /* Handle last (top) tnode */
7b85576d 1026 if (IS_TNODE(tn))
e3192690 1027 tn = (struct tnode *)resize(t, tn);
19baf839 1028
cf778b00 1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
7b85576d 1030 tnode_free_flush();
19baf839
RO
1031}
1032
2373ce1c
RO
1033/* only used from updater-side */
1034
fea86ad8 1035static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1036{
1037 int pos, newpos;
1038 struct tnode *tp = NULL, *tn = NULL;
b299e4f0 1039 struct rt_trie_node *n;
19baf839
RO
1040 struct leaf *l;
1041 int missbit;
c877efb2 1042 struct list_head *fa_head = NULL;
19baf839
RO
1043 struct leaf_info *li;
1044 t_key cindex;
1045
1046 pos = 0;
0a5c0475 1047 n = rtnl_dereference(t->trie);
19baf839 1048
c877efb2
SH
1049 /* If we point to NULL, stop. Either the tree is empty and we should
1050 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1051 * and we should just put our new leaf in that.
c877efb2
SH
1052 * If we point to a T_TNODE, check if it matches our key. Note that
1053 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1054 * not be the parent's 'pos'+'bits'!
1055 *
c877efb2 1056 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1057 * the index from our key, push the T_TNODE and walk the tree.
1058 *
1059 * If it doesn't, we have to replace it with a new T_TNODE.
1060 *
c877efb2
SH
1061 * If we point to a T_LEAF, it might or might not have the same key
1062 * as we do. If it does, just change the value, update the T_LEAF's
1063 * value, and return it.
19baf839
RO
1064 * If it doesn't, we need to replace it with a T_TNODE.
1065 */
1066
1067 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1068 tn = (struct tnode *) n;
91b9a277 1069
c877efb2 1070 check_tnode(tn);
91b9a277 1071
c877efb2 1072 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1073 tp = tn;
91b9a277 1074 pos = tn->pos + tn->bits;
a07f5f50
SH
1075 n = tnode_get_child(tn,
1076 tkey_extract_bits(key,
1077 tn->pos,
1078 tn->bits));
19baf839 1079
06801916 1080 BUG_ON(n && node_parent(n) != tn);
91b9a277 1081 } else
19baf839
RO
1082 break;
1083 }
1084
1085 /*
1086 * n ----> NULL, LEAF or TNODE
1087 *
c877efb2 1088 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1089 */
1090
91b9a277 1091 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1092
1093 /* Case 1: n is a leaf. Compare prefixes */
1094
c877efb2 1095 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1096 l = (struct leaf *) n;
19baf839 1097 li = leaf_info_new(plen);
91b9a277 1098
fea86ad8
SH
1099 if (!li)
1100 return NULL;
19baf839
RO
1101
1102 fa_head = &li->falh;
1103 insert_leaf_info(&l->list, li);
1104 goto done;
1105 }
19baf839
RO
1106 l = leaf_new();
1107
fea86ad8
SH
1108 if (!l)
1109 return NULL;
19baf839
RO
1110
1111 l->key = key;
1112 li = leaf_info_new(plen);
1113
c877efb2 1114 if (!li) {
387a5487 1115 free_leaf(l);
fea86ad8 1116 return NULL;
f835e471 1117 }
19baf839
RO
1118
1119 fa_head = &li->falh;
1120 insert_leaf_info(&l->list, li);
1121
19baf839 1122 if (t->trie && n == NULL) {
91b9a277 1123 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1124
b299e4f0 1125 node_set_parent((struct rt_trie_node *)l, tp);
19baf839 1126
91b9a277 1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
e3192690 1128 put_child(t, tp, cindex, (struct rt_trie_node *)l);
91b9a277
OJ
1129 } else {
1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1131 /*
1132 * Add a new tnode here
19baf839
RO
1133 * first tnode need some special handling
1134 */
1135
1136 if (tp)
91b9a277 1137 pos = tp->pos+tp->bits;
19baf839 1138 else
91b9a277
OJ
1139 pos = 0;
1140
c877efb2 1141 if (n) {
19baf839
RO
1142 newpos = tkey_mismatch(key, pos, n->key);
1143 tn = tnode_new(n->key, newpos, 1);
91b9a277 1144 } else {
19baf839 1145 newpos = 0;
c877efb2 1146 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1147 }
19baf839 1148
c877efb2 1149 if (!tn) {
f835e471 1150 free_leaf_info(li);
387a5487 1151 free_leaf(l);
fea86ad8 1152 return NULL;
91b9a277
OJ
1153 }
1154
b299e4f0 1155 node_set_parent((struct rt_trie_node *)tn, tp);
19baf839 1156
91b9a277 1157 missbit = tkey_extract_bits(key, newpos, 1);
b299e4f0 1158 put_child(t, tn, missbit, (struct rt_trie_node *)l);
19baf839
RO
1159 put_child(t, tn, 1-missbit, n);
1160
c877efb2 1161 if (tp) {
19baf839 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
e3192690 1163 put_child(t, tp, cindex, (struct rt_trie_node *)tn);
91b9a277 1164 } else {
cf778b00 1165 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
19baf839
RO
1166 tp = tn;
1167 }
1168 }
91b9a277
OJ
1169
1170 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1171 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172 tp, tp->pos, tp->bits, key, plen);
91b9a277 1173
19baf839 1174 /* Rebalance the trie */
2373ce1c 1175
7b85576d 1176 trie_rebalance(t, tp);
f835e471 1177done:
19baf839
RO
1178 return fa_head;
1179}
1180
d562f1f8
RO
1181/*
1182 * Caller must hold RTNL.
1183 */
16c6cf8b 1184int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1185{
1186 struct trie *t = (struct trie *) tb->tb_data;
1187 struct fib_alias *fa, *new_fa;
c877efb2 1188 struct list_head *fa_head = NULL;
19baf839 1189 struct fib_info *fi;
4e902c57
TG
1190 int plen = cfg->fc_dst_len;
1191 u8 tos = cfg->fc_tos;
19baf839
RO
1192 u32 key, mask;
1193 int err;
1194 struct leaf *l;
1195
1196 if (plen > 32)
1197 return -EINVAL;
1198
4e902c57 1199 key = ntohl(cfg->fc_dst);
19baf839 1200
2dfe55b4 1201 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1202
91b9a277 1203 mask = ntohl(inet_make_mask(plen));
19baf839 1204
c877efb2 1205 if (key & ~mask)
19baf839
RO
1206 return -EINVAL;
1207
1208 key = key & mask;
1209
4e902c57
TG
1210 fi = fib_create_info(cfg);
1211 if (IS_ERR(fi)) {
1212 err = PTR_ERR(fi);
19baf839 1213 goto err;
4e902c57 1214 }
19baf839
RO
1215
1216 l = fib_find_node(t, key);
c877efb2 1217 fa = NULL;
19baf839 1218
c877efb2 1219 if (l) {
19baf839
RO
1220 fa_head = get_fa_head(l, plen);
1221 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1222 }
1223
1224 /* Now fa, if non-NULL, points to the first fib alias
1225 * with the same keys [prefix,tos,priority], if such key already
1226 * exists or to the node before which we will insert new one.
1227 *
1228 * If fa is NULL, we will need to allocate a new one and
1229 * insert to the head of f.
1230 *
1231 * If f is NULL, no fib node matched the destination key
1232 * and we need to allocate a new one of those as well.
1233 */
1234
936f6f8e
JA
1235 if (fa && fa->fa_tos == tos &&
1236 fa->fa_info->fib_priority == fi->fib_priority) {
1237 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1238
1239 err = -EEXIST;
4e902c57 1240 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1241 goto out;
1242
936f6f8e
JA
1243 /* We have 2 goals:
1244 * 1. Find exact match for type, scope, fib_info to avoid
1245 * duplicate routes
1246 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1247 */
1248 fa_match = NULL;
1249 fa_first = fa;
1250 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1251 list_for_each_entry_continue(fa, fa_head, fa_list) {
1252 if (fa->fa_tos != tos)
1253 break;
1254 if (fa->fa_info->fib_priority != fi->fib_priority)
1255 break;
1256 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1257 fa->fa_info == fi) {
1258 fa_match = fa;
1259 break;
1260 }
1261 }
1262
4e902c57 1263 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1264 struct fib_info *fi_drop;
1265 u8 state;
1266
936f6f8e
JA
1267 fa = fa_first;
1268 if (fa_match) {
1269 if (fa == fa_match)
1270 err = 0;
6725033f 1271 goto out;
936f6f8e 1272 }
2373ce1c 1273 err = -ENOBUFS;
e94b1766 1274 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1275 if (new_fa == NULL)
1276 goto out;
19baf839
RO
1277
1278 fi_drop = fa->fa_info;
2373ce1c
RO
1279 new_fa->fa_tos = fa->fa_tos;
1280 new_fa->fa_info = fi;
4e902c57 1281 new_fa->fa_type = cfg->fc_type;
19baf839 1282 state = fa->fa_state;
936f6f8e 1283 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1284
2373ce1c
RO
1285 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1286 alias_free_mem_rcu(fa);
19baf839
RO
1287
1288 fib_release_info(fi_drop);
1289 if (state & FA_S_ACCESSED)
76e6ebfb 1290 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1291 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1292 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1293
91b9a277 1294 goto succeeded;
19baf839
RO
1295 }
1296 /* Error if we find a perfect match which
1297 * uses the same scope, type, and nexthop
1298 * information.
1299 */
936f6f8e
JA
1300 if (fa_match)
1301 goto out;
a07f5f50 1302
4e902c57 1303 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1304 fa = fa_first;
19baf839
RO
1305 }
1306 err = -ENOENT;
4e902c57 1307 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1308 goto out;
1309
1310 err = -ENOBUFS;
e94b1766 1311 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1312 if (new_fa == NULL)
1313 goto out;
1314
1315 new_fa->fa_info = fi;
1316 new_fa->fa_tos = tos;
4e902c57 1317 new_fa->fa_type = cfg->fc_type;
19baf839 1318 new_fa->fa_state = 0;
19baf839
RO
1319 /*
1320 * Insert new entry to the list.
1321 */
1322
c877efb2 1323 if (!fa_head) {
fea86ad8
SH
1324 fa_head = fib_insert_node(t, key, plen);
1325 if (unlikely(!fa_head)) {
1326 err = -ENOMEM;
f835e471 1327 goto out_free_new_fa;
fea86ad8 1328 }
f835e471 1329 }
19baf839 1330
21d8c49e
DM
1331 if (!plen)
1332 tb->tb_num_default++;
1333
2373ce1c
RO
1334 list_add_tail_rcu(&new_fa->fa_list,
1335 (fa ? &fa->fa_list : fa_head));
19baf839 1336
76e6ebfb 1337 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1338 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1339 &cfg->fc_nlinfo, 0);
19baf839
RO
1340succeeded:
1341 return 0;
f835e471
RO
1342
1343out_free_new_fa:
1344 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1345out:
1346 fib_release_info(fi);
91b9a277 1347err:
19baf839
RO
1348 return err;
1349}
1350
772cb712 1351/* should be called with rcu_read_lock */
5b470441 1352static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
22bd5b9b 1353 t_key key, const struct flowi4 *flp,
ebc0ffae 1354 struct fib_result *res, int fib_flags)
19baf839 1355{
19baf839
RO
1356 struct leaf_info *li;
1357 struct hlist_head *hhead = &l->list;
1358 struct hlist_node *node;
c877efb2 1359
2373ce1c 1360 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
3be0686b 1361 struct fib_alias *fa;
a07f5f50 1362
5c74501f 1363 if (l->key != (key & li->mask_plen))
19baf839
RO
1364 continue;
1365
3be0686b
DM
1366 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1367 struct fib_info *fi = fa->fa_info;
1368 int nhsel, err;
a07f5f50 1369
22bd5b9b 1370 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1371 continue;
dccd9ecc
DM
1372 if (fi->fib_dead)
1373 continue;
37e826c5 1374 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1375 continue;
1376 fib_alias_accessed(fa);
1377 err = fib_props[fa->fa_type].error;
1378 if (err) {
19baf839 1379#ifdef CONFIG_IP_FIB_TRIE_STATS
1fbc7843 1380 t->stats.semantic_match_passed++;
3be0686b 1381#endif
1fbc7843 1382 return err;
3be0686b
DM
1383 }
1384 if (fi->fib_flags & RTNH_F_DEAD)
1385 continue;
1386 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1387 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1388
1389 if (nh->nh_flags & RTNH_F_DEAD)
1390 continue;
22bd5b9b 1391 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1392 continue;
1393
1394#ifdef CONFIG_IP_FIB_TRIE_STATS
1395 t->stats.semantic_match_passed++;
1396#endif
5c74501f 1397 res->prefixlen = li->plen;
3be0686b
DM
1398 res->nh_sel = nhsel;
1399 res->type = fa->fa_type;
37e826c5 1400 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1401 res->fi = fi;
1402 res->table = tb;
1403 res->fa_head = &li->falh;
1404 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1405 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1406 return 0;
1407 }
1408 }
1409
1410#ifdef CONFIG_IP_FIB_TRIE_STATS
1411 t->stats.semantic_match_miss++;
19baf839 1412#endif
19baf839 1413 }
a07f5f50 1414
2e655571 1415 return 1;
19baf839
RO
1416}
1417
22bd5b9b 1418int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1419 struct fib_result *res, int fib_flags)
19baf839
RO
1420{
1421 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1422 int ret;
b299e4f0 1423 struct rt_trie_node *n;
19baf839 1424 struct tnode *pn;
3b004569 1425 unsigned int pos, bits;
22bd5b9b 1426 t_key key = ntohl(flp->daddr);
3b004569 1427 unsigned int chopped_off;
19baf839 1428 t_key cindex = 0;
3b004569 1429 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1430 struct tnode *cn;
874ffa8f 1431 t_key pref_mismatch;
91b9a277 1432
2373ce1c 1433 rcu_read_lock();
91b9a277 1434
2373ce1c 1435 n = rcu_dereference(t->trie);
c877efb2 1436 if (!n)
19baf839
RO
1437 goto failed;
1438
1439#ifdef CONFIG_IP_FIB_TRIE_STATS
1440 t->stats.gets++;
1441#endif
1442
1443 /* Just a leaf? */
1444 if (IS_LEAF(n)) {
5b470441 1445 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
a07f5f50 1446 goto found;
19baf839 1447 }
a07f5f50 1448
19baf839
RO
1449 pn = (struct tnode *) n;
1450 chopped_off = 0;
c877efb2 1451
91b9a277 1452 while (pn) {
19baf839
RO
1453 pos = pn->pos;
1454 bits = pn->bits;
1455
c877efb2 1456 if (!chopped_off)
ab66b4a7
SH
1457 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1458 pos, bits);
19baf839 1459
b902e573 1460 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1461
1462 if (n == NULL) {
1463#ifdef CONFIG_IP_FIB_TRIE_STATS
1464 t->stats.null_node_hit++;
1465#endif
1466 goto backtrace;
1467 }
1468
91b9a277 1469 if (IS_LEAF(n)) {
5b470441 1470 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
2e655571 1471 if (ret > 0)
91b9a277 1472 goto backtrace;
a07f5f50 1473 goto found;
91b9a277
OJ
1474 }
1475
91b9a277 1476 cn = (struct tnode *)n;
19baf839 1477
91b9a277
OJ
1478 /*
1479 * It's a tnode, and we can do some extra checks here if we
1480 * like, to avoid descending into a dead-end branch.
1481 * This tnode is in the parent's child array at index
1482 * key[p_pos..p_pos+p_bits] but potentially with some bits
1483 * chopped off, so in reality the index may be just a
1484 * subprefix, padded with zero at the end.
1485 * We can also take a look at any skipped bits in this
1486 * tnode - everything up to p_pos is supposed to be ok,
1487 * and the non-chopped bits of the index (se previous
1488 * paragraph) are also guaranteed ok, but the rest is
1489 * considered unknown.
1490 *
1491 * The skipped bits are key[pos+bits..cn->pos].
1492 */
19baf839 1493
91b9a277
OJ
1494 /* If current_prefix_length < pos+bits, we are already doing
1495 * actual prefix matching, which means everything from
1496 * pos+(bits-chopped_off) onward must be zero along some
1497 * branch of this subtree - otherwise there is *no* valid
1498 * prefix present. Here we can only check the skipped
1499 * bits. Remember, since we have already indexed into the
1500 * parent's child array, we know that the bits we chopped of
1501 * *are* zero.
1502 */
19baf839 1503
a07f5f50
SH
1504 /* NOTA BENE: Checking only skipped bits
1505 for the new node here */
19baf839 1506
91b9a277
OJ
1507 if (current_prefix_length < pos+bits) {
1508 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1509 cn->pos - current_prefix_length)
1510 || !(cn->child[0]))
91b9a277
OJ
1511 goto backtrace;
1512 }
19baf839 1513
91b9a277
OJ
1514 /*
1515 * If chopped_off=0, the index is fully validated and we
1516 * only need to look at the skipped bits for this, the new,
1517 * tnode. What we actually want to do is to find out if
1518 * these skipped bits match our key perfectly, or if we will
1519 * have to count on finding a matching prefix further down,
1520 * because if we do, we would like to have some way of
1521 * verifying the existence of such a prefix at this point.
1522 */
19baf839 1523
91b9a277
OJ
1524 /* The only thing we can do at this point is to verify that
1525 * any such matching prefix can indeed be a prefix to our
1526 * key, and if the bits in the node we are inspecting that
1527 * do not match our key are not ZERO, this cannot be true.
1528 * Thus, find out where there is a mismatch (before cn->pos)
1529 * and verify that all the mismatching bits are zero in the
1530 * new tnode's key.
1531 */
19baf839 1532
a07f5f50
SH
1533 /*
1534 * Note: We aren't very concerned about the piece of
1535 * the key that precede pn->pos+pn->bits, since these
1536 * have already been checked. The bits after cn->pos
1537 * aren't checked since these are by definition
1538 * "unknown" at this point. Thus, what we want to see
1539 * is if we are about to enter the "prefix matching"
1540 * state, and in that case verify that the skipped
1541 * bits that will prevail throughout this subtree are
1542 * zero, as they have to be if we are to find a
1543 * matching prefix.
91b9a277
OJ
1544 */
1545
874ffa8f 1546 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1547
a07f5f50
SH
1548 /*
1549 * In short: If skipped bits in this node do not match
1550 * the search key, enter the "prefix matching"
1551 * state.directly.
91b9a277
OJ
1552 */
1553 if (pref_mismatch) {
874ffa8f 1554 int mp = KEYLENGTH - fls(pref_mismatch);
91b9a277 1555
874ffa8f 1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1557 goto backtrace;
1558
1559 if (current_prefix_length >= cn->pos)
1560 current_prefix_length = mp;
c877efb2 1561 }
a07f5f50 1562
91b9a277
OJ
1563 pn = (struct tnode *)n; /* Descend */
1564 chopped_off = 0;
1565 continue;
1566
19baf839
RO
1567backtrace:
1568 chopped_off++;
1569
1570 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1571 while ((chopped_off <= pn->bits)
1572 && !(cindex & (1<<(chopped_off-1))))
19baf839 1573 chopped_off++;
19baf839
RO
1574
1575 /* Decrease current_... with bits chopped off */
1576 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1577 current_prefix_length = pn->pos + pn->bits
1578 - chopped_off;
91b9a277 1579
19baf839 1580 /*
c877efb2 1581 * Either we do the actual chop off according or if we have
19baf839
RO
1582 * chopped off all bits in this tnode walk up to our parent.
1583 */
1584
91b9a277 1585 if (chopped_off <= pn->bits) {
19baf839 1586 cindex &= ~(1 << (chopped_off-1));
91b9a277 1587 } else {
b299e4f0 1588 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
06801916 1589 if (!parent)
19baf839 1590 goto failed;
91b9a277 1591
19baf839 1592 /* Get Child's index */
06801916
SH
1593 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1594 pn = parent;
19baf839
RO
1595 chopped_off = 0;
1596
1597#ifdef CONFIG_IP_FIB_TRIE_STATS
1598 t->stats.backtrack++;
1599#endif
1600 goto backtrace;
c877efb2 1601 }
19baf839
RO
1602 }
1603failed:
c877efb2 1604 ret = 1;
19baf839 1605found:
2373ce1c 1606 rcu_read_unlock();
19baf839
RO
1607 return ret;
1608}
6fc01438 1609EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1610
9195bef7
SH
1611/*
1612 * Remove the leaf and return parent.
1613 */
1614static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1615{
b299e4f0 1616 struct tnode *tp = node_parent((struct rt_trie_node *) l);
c877efb2 1617
9195bef7 1618 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1619
c877efb2 1620 if (tp) {
9195bef7 1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
e3192690 1622 put_child(t, tp, cindex, NULL);
7b85576d 1623 trie_rebalance(t, tp);
91b9a277 1624 } else
a9b3cd7f 1625 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1626
387a5487 1627 free_leaf(l);
19baf839
RO
1628}
1629
d562f1f8
RO
1630/*
1631 * Caller must hold RTNL.
1632 */
16c6cf8b 1633int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1634{
1635 struct trie *t = (struct trie *) tb->tb_data;
1636 u32 key, mask;
4e902c57
TG
1637 int plen = cfg->fc_dst_len;
1638 u8 tos = cfg->fc_tos;
19baf839
RO
1639 struct fib_alias *fa, *fa_to_delete;
1640 struct list_head *fa_head;
1641 struct leaf *l;
91b9a277
OJ
1642 struct leaf_info *li;
1643
c877efb2 1644 if (plen > 32)
19baf839
RO
1645 return -EINVAL;
1646
4e902c57 1647 key = ntohl(cfg->fc_dst);
91b9a277 1648 mask = ntohl(inet_make_mask(plen));
19baf839 1649
c877efb2 1650 if (key & ~mask)
19baf839
RO
1651 return -EINVAL;
1652
1653 key = key & mask;
1654 l = fib_find_node(t, key);
1655
c877efb2 1656 if (!l)
19baf839
RO
1657 return -ESRCH;
1658
1659 fa_head = get_fa_head(l, plen);
1660 fa = fib_find_alias(fa_head, tos, 0);
1661
1662 if (!fa)
1663 return -ESRCH;
1664
0c7770c7 1665 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1666
1667 fa_to_delete = NULL;
936f6f8e
JA
1668 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1669 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1670 struct fib_info *fi = fa->fa_info;
1671
1672 if (fa->fa_tos != tos)
1673 break;
1674
4e902c57
TG
1675 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1676 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1677 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1678 (!cfg->fc_prefsrc ||
1679 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1680 (!cfg->fc_protocol ||
1681 fi->fib_protocol == cfg->fc_protocol) &&
1682 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1683 fa_to_delete = fa;
1684 break;
1685 }
1686 }
1687
91b9a277
OJ
1688 if (!fa_to_delete)
1689 return -ESRCH;
19baf839 1690
91b9a277 1691 fa = fa_to_delete;
4e902c57 1692 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1693 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1694
1695 l = fib_find_node(t, key);
772cb712 1696 li = find_leaf_info(l, plen);
19baf839 1697
2373ce1c 1698 list_del_rcu(&fa->fa_list);
19baf839 1699
21d8c49e
DM
1700 if (!plen)
1701 tb->tb_num_default--;
1702
91b9a277 1703 if (list_empty(fa_head)) {
2373ce1c 1704 hlist_del_rcu(&li->hlist);
91b9a277 1705 free_leaf_info(li);
2373ce1c 1706 }
19baf839 1707
91b9a277 1708 if (hlist_empty(&l->list))
9195bef7 1709 trie_leaf_remove(t, l);
19baf839 1710
91b9a277 1711 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1712 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1713
2373ce1c
RO
1714 fib_release_info(fa->fa_info);
1715 alias_free_mem_rcu(fa);
91b9a277 1716 return 0;
19baf839
RO
1717}
1718
ef3660ce 1719static int trie_flush_list(struct list_head *head)
19baf839
RO
1720{
1721 struct fib_alias *fa, *fa_node;
1722 int found = 0;
1723
1724 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1725 struct fib_info *fi = fa->fa_info;
19baf839 1726
2373ce1c
RO
1727 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1728 list_del_rcu(&fa->fa_list);
1729 fib_release_info(fa->fa_info);
1730 alias_free_mem_rcu(fa);
19baf839
RO
1731 found++;
1732 }
1733 }
1734 return found;
1735}
1736
ef3660ce 1737static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1738{
1739 int found = 0;
1740 struct hlist_head *lih = &l->list;
1741 struct hlist_node *node, *tmp;
1742 struct leaf_info *li = NULL;
1743
1744 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1745 found += trie_flush_list(&li->falh);
19baf839
RO
1746
1747 if (list_empty(&li->falh)) {
2373ce1c 1748 hlist_del_rcu(&li->hlist);
19baf839
RO
1749 free_leaf_info(li);
1750 }
1751 }
1752 return found;
1753}
1754
82cfbb00
SH
1755/*
1756 * Scan for the next right leaf starting at node p->child[idx]
1757 * Since we have back pointer, no recursion necessary.
1758 */
b299e4f0 1759static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
19baf839 1760{
82cfbb00
SH
1761 do {
1762 t_key idx;
c877efb2 1763
c877efb2 1764 if (c)
82cfbb00 1765 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1766 else
82cfbb00 1767 idx = 0;
2373ce1c 1768
82cfbb00
SH
1769 while (idx < 1u << p->bits) {
1770 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1771 if (!c)
91b9a277
OJ
1772 continue;
1773
82cfbb00 1774 if (IS_LEAF(c)) {
0a5c0475 1775 prefetch(rcu_dereference_rtnl(p->child[idx]));
82cfbb00 1776 return (struct leaf *) c;
19baf839 1777 }
82cfbb00
SH
1778
1779 /* Rescan start scanning in new node */
1780 p = (struct tnode *) c;
1781 idx = 0;
19baf839 1782 }
82cfbb00
SH
1783
1784 /* Node empty, walk back up to parent */
b299e4f0 1785 c = (struct rt_trie_node *) p;
a034ee3c 1786 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1787
1788 return NULL; /* Root of trie */
1789}
1790
82cfbb00
SH
1791static struct leaf *trie_firstleaf(struct trie *t)
1792{
a034ee3c 1793 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1794
1795 if (!n)
1796 return NULL;
1797
1798 if (IS_LEAF(n)) /* trie is just a leaf */
1799 return (struct leaf *) n;
1800
1801 return leaf_walk_rcu(n, NULL);
1802}
1803
1804static struct leaf *trie_nextleaf(struct leaf *l)
1805{
b299e4f0 1806 struct rt_trie_node *c = (struct rt_trie_node *) l;
b902e573 1807 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1808
1809 if (!p)
1810 return NULL; /* trie with just one leaf */
1811
1812 return leaf_walk_rcu(p, c);
19baf839
RO
1813}
1814
71d67e66
SH
1815static struct leaf *trie_leafindex(struct trie *t, int index)
1816{
1817 struct leaf *l = trie_firstleaf(t);
1818
ec28cf73 1819 while (l && index-- > 0)
71d67e66 1820 l = trie_nextleaf(l);
ec28cf73 1821
71d67e66
SH
1822 return l;
1823}
1824
1825
d562f1f8
RO
1826/*
1827 * Caller must hold RTNL.
1828 */
16c6cf8b 1829int fib_table_flush(struct fib_table *tb)
19baf839
RO
1830{
1831 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1832 struct leaf *l, *ll = NULL;
82cfbb00 1833 int found = 0;
19baf839 1834
82cfbb00 1835 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1836 found += trie_flush_leaf(l);
19baf839
RO
1837
1838 if (ll && hlist_empty(&ll->list))
9195bef7 1839 trie_leaf_remove(t, ll);
19baf839
RO
1840 ll = l;
1841 }
1842
1843 if (ll && hlist_empty(&ll->list))
9195bef7 1844 trie_leaf_remove(t, ll);
19baf839 1845
0c7770c7 1846 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1847 return found;
1848}
1849
4aa2c466
PE
1850void fib_free_table(struct fib_table *tb)
1851{
1852 kfree(tb);
1853}
1854
a07f5f50
SH
1855static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1856 struct fib_table *tb,
19baf839
RO
1857 struct sk_buff *skb, struct netlink_callback *cb)
1858{
1859 int i, s_i;
1860 struct fib_alias *fa;
32ab5f80 1861 __be32 xkey = htonl(key);
19baf839 1862
71d67e66 1863 s_i = cb->args[5];
19baf839
RO
1864 i = 0;
1865
2373ce1c
RO
1866 /* rcu_read_lock is hold by caller */
1867
1868 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1869 if (i < s_i) {
1870 i++;
1871 continue;
1872 }
19baf839
RO
1873
1874 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1875 cb->nlh->nlmsg_seq,
1876 RTM_NEWROUTE,
1877 tb->tb_id,
1878 fa->fa_type,
be403ea1 1879 xkey,
19baf839
RO
1880 plen,
1881 fa->fa_tos,
64347f78 1882 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1883 cb->args[5] = i;
19baf839 1884 return -1;
91b9a277 1885 }
19baf839
RO
1886 i++;
1887 }
71d67e66 1888 cb->args[5] = i;
19baf839
RO
1889 return skb->len;
1890}
1891
a88ee229
SH
1892static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1893 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1894{
a88ee229
SH
1895 struct leaf_info *li;
1896 struct hlist_node *node;
1897 int i, s_i;
19baf839 1898
71d67e66 1899 s_i = cb->args[4];
a88ee229 1900 i = 0;
19baf839 1901
a88ee229
SH
1902 /* rcu_read_lock is hold by caller */
1903 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1904 if (i < s_i) {
1905 i++;
19baf839 1906 continue;
a88ee229 1907 }
91b9a277 1908
a88ee229 1909 if (i > s_i)
71d67e66 1910 cb->args[5] = 0;
19baf839 1911
a88ee229 1912 if (list_empty(&li->falh))
19baf839
RO
1913 continue;
1914
a88ee229 1915 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1916 cb->args[4] = i;
19baf839
RO
1917 return -1;
1918 }
a88ee229 1919 i++;
19baf839 1920 }
a88ee229 1921
71d67e66 1922 cb->args[4] = i;
19baf839
RO
1923 return skb->len;
1924}
1925
16c6cf8b
SH
1926int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1927 struct netlink_callback *cb)
19baf839 1928{
a88ee229 1929 struct leaf *l;
19baf839 1930 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1931 t_key key = cb->args[2];
71d67e66 1932 int count = cb->args[3];
19baf839 1933
2373ce1c 1934 rcu_read_lock();
d5ce8a0e
SH
1935 /* Dump starting at last key.
1936 * Note: 0.0.0.0/0 (ie default) is first key.
1937 */
71d67e66 1938 if (count == 0)
d5ce8a0e
SH
1939 l = trie_firstleaf(t);
1940 else {
71d67e66
SH
1941 /* Normally, continue from last key, but if that is missing
1942 * fallback to using slow rescan
1943 */
d5ce8a0e 1944 l = fib_find_node(t, key);
71d67e66
SH
1945 if (!l)
1946 l = trie_leafindex(t, count);
d5ce8a0e 1947 }
a88ee229 1948
d5ce8a0e
SH
1949 while (l) {
1950 cb->args[2] = l->key;
a88ee229 1951 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1952 cb->args[3] = count;
a88ee229 1953 rcu_read_unlock();
a88ee229 1954 return -1;
19baf839 1955 }
d5ce8a0e 1956
71d67e66 1957 ++count;
d5ce8a0e 1958 l = trie_nextleaf(l);
71d67e66
SH
1959 memset(&cb->args[4], 0,
1960 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1961 }
71d67e66 1962 cb->args[3] = count;
2373ce1c 1963 rcu_read_unlock();
a88ee229 1964
19baf839 1965 return skb->len;
19baf839
RO
1966}
1967
5348ba85 1968void __init fib_trie_init(void)
7f9b8052 1969{
a07f5f50
SH
1970 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1971 sizeof(struct fib_alias),
bc3c8c1e
SH
1972 0, SLAB_PANIC, NULL);
1973
1974 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1975 max(sizeof(struct leaf),
1976 sizeof(struct leaf_info)),
1977 0, SLAB_PANIC, NULL);
7f9b8052 1978}
19baf839 1979
7f9b8052 1980
5348ba85 1981struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1982{
1983 struct fib_table *tb;
1984 struct trie *t;
1985
19baf839
RO
1986 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1987 GFP_KERNEL);
1988 if (tb == NULL)
1989 return NULL;
1990
1991 tb->tb_id = id;
971b893e 1992 tb->tb_default = -1;
21d8c49e 1993 tb->tb_num_default = 0;
19baf839
RO
1994
1995 t = (struct trie *) tb->tb_data;
c28a1cf4 1996 memset(t, 0, sizeof(*t));
19baf839 1997
19baf839
RO
1998 return tb;
1999}
2000
cb7b593c
SH
2001#ifdef CONFIG_PROC_FS
2002/* Depth first Trie walk iterator */
2003struct fib_trie_iter {
1c340b2f 2004 struct seq_net_private p;
3d3b2d25 2005 struct fib_table *tb;
cb7b593c 2006 struct tnode *tnode;
a034ee3c
ED
2007 unsigned int index;
2008 unsigned int depth;
cb7b593c 2009};
19baf839 2010
b299e4f0 2011static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2012{
cb7b593c 2013 struct tnode *tn = iter->tnode;
a034ee3c 2014 unsigned int cindex = iter->index;
cb7b593c 2015 struct tnode *p;
19baf839 2016
6640e697
EB
2017 /* A single entry routing table */
2018 if (!tn)
2019 return NULL;
2020
cb7b593c
SH
2021 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2022 iter->tnode, iter->index, iter->depth);
2023rescan:
2024 while (cindex < (1<<tn->bits)) {
b299e4f0 2025 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2026
cb7b593c
SH
2027 if (n) {
2028 if (IS_LEAF(n)) {
2029 iter->tnode = tn;
2030 iter->index = cindex + 1;
2031 } else {
2032 /* push down one level */
2033 iter->tnode = (struct tnode *) n;
2034 iter->index = 0;
2035 ++iter->depth;
2036 }
2037 return n;
2038 }
19baf839 2039
cb7b593c
SH
2040 ++cindex;
2041 }
91b9a277 2042
cb7b593c 2043 /* Current node exhausted, pop back up */
b299e4f0 2044 p = node_parent_rcu((struct rt_trie_node *)tn);
cb7b593c
SH
2045 if (p) {
2046 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2047 tn = p;
2048 --iter->depth;
2049 goto rescan;
19baf839 2050 }
cb7b593c
SH
2051
2052 /* got root? */
2053 return NULL;
19baf839
RO
2054}
2055
b299e4f0 2056static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 2057 struct trie *t)
19baf839 2058{
b299e4f0 2059 struct rt_trie_node *n;
5ddf0eb2 2060
132adf54 2061 if (!t)
5ddf0eb2
RO
2062 return NULL;
2063
2064 n = rcu_dereference(t->trie);
3d3b2d25 2065 if (!n)
5ddf0eb2 2066 return NULL;
19baf839 2067
3d3b2d25
SH
2068 if (IS_TNODE(n)) {
2069 iter->tnode = (struct tnode *) n;
2070 iter->index = 0;
2071 iter->depth = 1;
2072 } else {
2073 iter->tnode = NULL;
2074 iter->index = 0;
2075 iter->depth = 0;
91b9a277 2076 }
3d3b2d25
SH
2077
2078 return n;
cb7b593c 2079}
91b9a277 2080
cb7b593c
SH
2081static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2082{
b299e4f0 2083 struct rt_trie_node *n;
cb7b593c 2084 struct fib_trie_iter iter;
91b9a277 2085
cb7b593c 2086 memset(s, 0, sizeof(*s));
91b9a277 2087
cb7b593c 2088 rcu_read_lock();
3d3b2d25 2089 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2090 if (IS_LEAF(n)) {
93672292
SH
2091 struct leaf *l = (struct leaf *)n;
2092 struct leaf_info *li;
2093 struct hlist_node *tmp;
2094
cb7b593c
SH
2095 s->leaves++;
2096 s->totdepth += iter.depth;
2097 if (iter.depth > s->maxdepth)
2098 s->maxdepth = iter.depth;
93672292
SH
2099
2100 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2101 ++s->prefixes;
cb7b593c
SH
2102 } else {
2103 const struct tnode *tn = (const struct tnode *) n;
2104 int i;
2105
2106 s->tnodes++;
132adf54 2107 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2108 s->nodesizes[tn->bits]++;
2109
cb7b593c
SH
2110 for (i = 0; i < (1<<tn->bits); i++)
2111 if (!tn->child[i])
2112 s->nullpointers++;
19baf839 2113 }
19baf839 2114 }
2373ce1c 2115 rcu_read_unlock();
19baf839
RO
2116}
2117
cb7b593c
SH
2118/*
2119 * This outputs /proc/net/fib_triestats
2120 */
2121static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2122{
a034ee3c 2123 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2124
cb7b593c
SH
2125 if (stat->leaves)
2126 avdepth = stat->totdepth*100 / stat->leaves;
2127 else
2128 avdepth = 0;
91b9a277 2129
a07f5f50
SH
2130 seq_printf(seq, "\tAver depth: %u.%02d\n",
2131 avdepth / 100, avdepth % 100);
cb7b593c 2132 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2133
cb7b593c 2134 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2135 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2136
2137 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2138 bytes += sizeof(struct leaf_info) * stat->prefixes;
2139
187b5188 2140 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2141 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2142
06ef921d
RO
2143 max = MAX_STAT_DEPTH;
2144 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2145 max--;
19baf839 2146
cb7b593c
SH
2147 pointers = 0;
2148 for (i = 1; i <= max; i++)
2149 if (stat->nodesizes[i] != 0) {
187b5188 2150 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2151 pointers += (1<<i) * stat->nodesizes[i];
2152 }
2153 seq_putc(seq, '\n');
187b5188 2154 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2155
b299e4f0 2156 bytes += sizeof(struct rt_trie_node *) * pointers;
187b5188
SH
2157 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2158 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2159}
2373ce1c 2160
cb7b593c 2161#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2162static void trie_show_usage(struct seq_file *seq,
2163 const struct trie_use_stats *stats)
2164{
2165 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2166 seq_printf(seq, "gets = %u\n", stats->gets);
2167 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2168 seq_printf(seq, "semantic match passed = %u\n",
2169 stats->semantic_match_passed);
2170 seq_printf(seq, "semantic match miss = %u\n",
2171 stats->semantic_match_miss);
2172 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2173 seq_printf(seq, "skipped node resize = %u\n\n",
2174 stats->resize_node_skipped);
cb7b593c 2175}
66a2f7fd
SH
2176#endif /* CONFIG_IP_FIB_TRIE_STATS */
2177
3d3b2d25 2178static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2179{
3d3b2d25
SH
2180 if (tb->tb_id == RT_TABLE_LOCAL)
2181 seq_puts(seq, "Local:\n");
2182 else if (tb->tb_id == RT_TABLE_MAIN)
2183 seq_puts(seq, "Main:\n");
2184 else
2185 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2186}
19baf839 2187
3d3b2d25 2188
cb7b593c
SH
2189static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2190{
1c340b2f 2191 struct net *net = (struct net *)seq->private;
3d3b2d25 2192 unsigned int h;
877a9bff 2193
d717a9a6 2194 seq_printf(seq,
a07f5f50
SH
2195 "Basic info: size of leaf:"
2196 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2197 sizeof(struct leaf), sizeof(struct tnode));
2198
3d3b2d25
SH
2199 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2200 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2201 struct hlist_node *node;
2202 struct fib_table *tb;
2203
2204 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2205 struct trie *t = (struct trie *) tb->tb_data;
2206 struct trie_stat stat;
877a9bff 2207
3d3b2d25
SH
2208 if (!t)
2209 continue;
2210
2211 fib_table_print(seq, tb);
2212
2213 trie_collect_stats(t, &stat);
2214 trie_show_stats(seq, &stat);
2215#ifdef CONFIG_IP_FIB_TRIE_STATS
2216 trie_show_usage(seq, &t->stats);
2217#endif
2218 }
2219 }
19baf839 2220
cb7b593c 2221 return 0;
19baf839
RO
2222}
2223
cb7b593c 2224static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2225{
de05c557 2226 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2227}
2228
9a32144e 2229static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2230 .owner = THIS_MODULE,
2231 .open = fib_triestat_seq_open,
2232 .read = seq_read,
2233 .llseek = seq_lseek,
b6fcbdb4 2234 .release = single_release_net,
cb7b593c
SH
2235};
2236
b299e4f0 2237static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2238{
1218854a
YH
2239 struct fib_trie_iter *iter = seq->private;
2240 struct net *net = seq_file_net(seq);
cb7b593c 2241 loff_t idx = 0;
3d3b2d25 2242 unsigned int h;
cb7b593c 2243
3d3b2d25
SH
2244 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2245 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2246 struct hlist_node *node;
2247 struct fib_table *tb;
cb7b593c 2248
3d3b2d25 2249 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
b299e4f0 2250 struct rt_trie_node *n;
3d3b2d25
SH
2251
2252 for (n = fib_trie_get_first(iter,
2253 (struct trie *) tb->tb_data);
2254 n; n = fib_trie_get_next(iter))
2255 if (pos == idx++) {
2256 iter->tb = tb;
2257 return n;
2258 }
2259 }
cb7b593c 2260 }
3d3b2d25 2261
19baf839
RO
2262 return NULL;
2263}
2264
cb7b593c 2265static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2266 __acquires(RCU)
19baf839 2267{
cb7b593c 2268 rcu_read_lock();
1218854a 2269 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2270}
2271
cb7b593c 2272static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2273{
cb7b593c 2274 struct fib_trie_iter *iter = seq->private;
1218854a 2275 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2276 struct fib_table *tb = iter->tb;
2277 struct hlist_node *tb_node;
2278 unsigned int h;
b299e4f0 2279 struct rt_trie_node *n;
cb7b593c 2280
19baf839 2281 ++*pos;
3d3b2d25
SH
2282 /* next node in same table */
2283 n = fib_trie_get_next(iter);
2284 if (n)
2285 return n;
19baf839 2286
3d3b2d25
SH
2287 /* walk rest of this hash chain */
2288 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2289 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2290 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2291 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2292 if (n)
2293 goto found;
2294 }
19baf839 2295
3d3b2d25
SH
2296 /* new hash chain */
2297 while (++h < FIB_TABLE_HASHSZ) {
2298 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2299 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2300 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2301 if (n)
2302 goto found;
2303 }
2304 }
cb7b593c 2305 return NULL;
3d3b2d25
SH
2306
2307found:
2308 iter->tb = tb;
2309 return n;
cb7b593c 2310}
19baf839 2311
cb7b593c 2312static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2313 __releases(RCU)
19baf839 2314{
cb7b593c
SH
2315 rcu_read_unlock();
2316}
91b9a277 2317
cb7b593c
SH
2318static void seq_indent(struct seq_file *seq, int n)
2319{
a034ee3c
ED
2320 while (n-- > 0)
2321 seq_puts(seq, " ");
cb7b593c 2322}
19baf839 2323
28d36e37 2324static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2325{
132adf54 2326 switch (s) {
cb7b593c
SH
2327 case RT_SCOPE_UNIVERSE: return "universe";
2328 case RT_SCOPE_SITE: return "site";
2329 case RT_SCOPE_LINK: return "link";
2330 case RT_SCOPE_HOST: return "host";
2331 case RT_SCOPE_NOWHERE: return "nowhere";
2332 default:
28d36e37 2333 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2334 return buf;
2335 }
2336}
19baf839 2337
36cbd3dc 2338static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2339 [RTN_UNSPEC] = "UNSPEC",
2340 [RTN_UNICAST] = "UNICAST",
2341 [RTN_LOCAL] = "LOCAL",
2342 [RTN_BROADCAST] = "BROADCAST",
2343 [RTN_ANYCAST] = "ANYCAST",
2344 [RTN_MULTICAST] = "MULTICAST",
2345 [RTN_BLACKHOLE] = "BLACKHOLE",
2346 [RTN_UNREACHABLE] = "UNREACHABLE",
2347 [RTN_PROHIBIT] = "PROHIBIT",
2348 [RTN_THROW] = "THROW",
2349 [RTN_NAT] = "NAT",
2350 [RTN_XRESOLVE] = "XRESOLVE",
2351};
19baf839 2352
a034ee3c 2353static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2354{
cb7b593c
SH
2355 if (t < __RTN_MAX && rtn_type_names[t])
2356 return rtn_type_names[t];
28d36e37 2357 snprintf(buf, len, "type %u", t);
cb7b593c 2358 return buf;
19baf839
RO
2359}
2360
cb7b593c
SH
2361/* Pretty print the trie */
2362static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2363{
cb7b593c 2364 const struct fib_trie_iter *iter = seq->private;
b299e4f0 2365 struct rt_trie_node *n = v;
c877efb2 2366
3d3b2d25
SH
2367 if (!node_parent_rcu(n))
2368 fib_table_print(seq, iter->tb);
095b8501 2369
cb7b593c
SH
2370 if (IS_TNODE(n)) {
2371 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2372 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2373
1d25cd6c 2374 seq_indent(seq, iter->depth-1);
673d57e7
HH
2375 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2376 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2377 tn->empty_children);
e905a9ed 2378
cb7b593c
SH
2379 } else {
2380 struct leaf *l = (struct leaf *) n;
1328042e
SH
2381 struct leaf_info *li;
2382 struct hlist_node *node;
32ab5f80 2383 __be32 val = htonl(l->key);
cb7b593c
SH
2384
2385 seq_indent(seq, iter->depth);
673d57e7 2386 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2387
2388 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2389 struct fib_alias *fa;
2390
2391 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2392 char buf1[32], buf2[32];
2393
2394 seq_indent(seq, iter->depth+1);
2395 seq_printf(seq, " /%d %s %s", li->plen,
2396 rtn_scope(buf1, sizeof(buf1),
37e826c5 2397 fa->fa_info->fib_scope),
1328042e
SH
2398 rtn_type(buf2, sizeof(buf2),
2399 fa->fa_type));
2400 if (fa->fa_tos)
b9c4d82a 2401 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2402 seq_putc(seq, '\n');
cb7b593c
SH
2403 }
2404 }
19baf839 2405 }
cb7b593c 2406
19baf839
RO
2407 return 0;
2408}
2409
f690808e 2410static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2411 .start = fib_trie_seq_start,
2412 .next = fib_trie_seq_next,
2413 .stop = fib_trie_seq_stop,
2414 .show = fib_trie_seq_show,
19baf839
RO
2415};
2416
cb7b593c 2417static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2418{
1c340b2f
DL
2419 return seq_open_net(inode, file, &fib_trie_seq_ops,
2420 sizeof(struct fib_trie_iter));
19baf839
RO
2421}
2422
9a32144e 2423static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2424 .owner = THIS_MODULE,
2425 .open = fib_trie_seq_open,
2426 .read = seq_read,
2427 .llseek = seq_lseek,
1c340b2f 2428 .release = seq_release_net,
19baf839
RO
2429};
2430
8315f5d8
SH
2431struct fib_route_iter {
2432 struct seq_net_private p;
2433 struct trie *main_trie;
2434 loff_t pos;
2435 t_key key;
2436};
2437
2438static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2439{
2440 struct leaf *l = NULL;
2441 struct trie *t = iter->main_trie;
2442
2443 /* use cache location of last found key */
2444 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2445 pos -= iter->pos;
2446 else {
2447 iter->pos = 0;
2448 l = trie_firstleaf(t);
2449 }
2450
2451 while (l && pos-- > 0) {
2452 iter->pos++;
2453 l = trie_nextleaf(l);
2454 }
2455
2456 if (l)
2457 iter->key = pos; /* remember it */
2458 else
2459 iter->pos = 0; /* forget it */
2460
2461 return l;
2462}
2463
2464static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2465 __acquires(RCU)
2466{
2467 struct fib_route_iter *iter = seq->private;
2468 struct fib_table *tb;
2469
2470 rcu_read_lock();
1218854a 2471 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2472 if (!tb)
2473 return NULL;
2474
2475 iter->main_trie = (struct trie *) tb->tb_data;
2476 if (*pos == 0)
2477 return SEQ_START_TOKEN;
2478 else
2479 return fib_route_get_idx(iter, *pos - 1);
2480}
2481
2482static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2483{
2484 struct fib_route_iter *iter = seq->private;
2485 struct leaf *l = v;
2486
2487 ++*pos;
2488 if (v == SEQ_START_TOKEN) {
2489 iter->pos = 0;
2490 l = trie_firstleaf(iter->main_trie);
2491 } else {
2492 iter->pos++;
2493 l = trie_nextleaf(l);
2494 }
2495
2496 if (l)
2497 iter->key = l->key;
2498 else
2499 iter->pos = 0;
2500 return l;
2501}
2502
2503static void fib_route_seq_stop(struct seq_file *seq, void *v)
2504 __releases(RCU)
2505{
2506 rcu_read_unlock();
2507}
2508
a034ee3c 2509static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2510{
a034ee3c 2511 unsigned int flags = 0;
19baf839 2512
a034ee3c
ED
2513 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2514 flags = RTF_REJECT;
cb7b593c
SH
2515 if (fi && fi->fib_nh->nh_gw)
2516 flags |= RTF_GATEWAY;
32ab5f80 2517 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2518 flags |= RTF_HOST;
2519 flags |= RTF_UP;
2520 return flags;
19baf839
RO
2521}
2522
cb7b593c
SH
2523/*
2524 * This outputs /proc/net/route.
2525 * The format of the file is not supposed to be changed
a034ee3c 2526 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2527 * legacy utilities
2528 */
2529static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2530{
cb7b593c 2531 struct leaf *l = v;
1328042e
SH
2532 struct leaf_info *li;
2533 struct hlist_node *node;
19baf839 2534
cb7b593c
SH
2535 if (v == SEQ_START_TOKEN) {
2536 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2537 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2538 "\tWindow\tIRTT");
2539 return 0;
2540 }
19baf839 2541
1328042e 2542 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2543 struct fib_alias *fa;
32ab5f80 2544 __be32 mask, prefix;
91b9a277 2545
cb7b593c
SH
2546 mask = inet_make_mask(li->plen);
2547 prefix = htonl(l->key);
19baf839 2548
cb7b593c 2549 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2550 const struct fib_info *fi = fa->fa_info;
a034ee3c 2551 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2552 int len;
19baf839 2553
cb7b593c
SH
2554 if (fa->fa_type == RTN_BROADCAST
2555 || fa->fa_type == RTN_MULTICAST)
2556 continue;
19baf839 2557
cb7b593c 2558 if (fi)
5e659e4c
PE
2559 seq_printf(seq,
2560 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2561 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2562 fi->fib_dev ? fi->fib_dev->name : "*",
2563 prefix,
2564 fi->fib_nh->nh_gw, flags, 0, 0,
2565 fi->fib_priority,
2566 mask,
a07f5f50
SH
2567 (fi->fib_advmss ?
2568 fi->fib_advmss + 40 : 0),
cb7b593c 2569 fi->fib_window,
5e659e4c 2570 fi->fib_rtt >> 3, &len);
cb7b593c 2571 else
5e659e4c
PE
2572 seq_printf(seq,
2573 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2574 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2575 prefix, 0, flags, 0, 0, 0,
5e659e4c 2576 mask, 0, 0, 0, &len);
19baf839 2577
5e659e4c 2578 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2579 }
19baf839
RO
2580 }
2581
2582 return 0;
2583}
2584
f690808e 2585static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2586 .start = fib_route_seq_start,
2587 .next = fib_route_seq_next,
2588 .stop = fib_route_seq_stop,
cb7b593c 2589 .show = fib_route_seq_show,
19baf839
RO
2590};
2591
cb7b593c 2592static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2593{
1c340b2f 2594 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2595 sizeof(struct fib_route_iter));
19baf839
RO
2596}
2597
9a32144e 2598static const struct file_operations fib_route_fops = {
cb7b593c
SH
2599 .owner = THIS_MODULE,
2600 .open = fib_route_seq_open,
2601 .read = seq_read,
2602 .llseek = seq_lseek,
1c340b2f 2603 .release = seq_release_net,
19baf839
RO
2604};
2605
61a02653 2606int __net_init fib_proc_init(struct net *net)
19baf839 2607{
61a02653 2608 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2609 goto out1;
2610
61a02653
DL
2611 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2612 &fib_triestat_fops))
cb7b593c
SH
2613 goto out2;
2614
61a02653 2615 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2616 goto out3;
2617
19baf839 2618 return 0;
cb7b593c
SH
2619
2620out3:
61a02653 2621 proc_net_remove(net, "fib_triestat");
cb7b593c 2622out2:
61a02653 2623 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2624out1:
2625 return -ENOMEM;
19baf839
RO
2626}
2627
61a02653 2628void __net_exit fib_proc_exit(struct net *net)
19baf839 2629{
61a02653
DL
2630 proc_net_remove(net, "fib_trie");
2631 proc_net_remove(net, "fib_triestat");
2632 proc_net_remove(net, "route");
19baf839
RO
2633}
2634
2635#endif /* CONFIG_PROC_FS */
This page took 0.865315 seconds and 5 git commands to generate.