Merge branch 'tip/perf/urgent-2' of git://git.kernel.org/pub/scm/linux/kernel/git...
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
70c71606 74#include <linux/prefetch.h>
bc3b2d7f 75#include <linux/export.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839 98
b299e4f0 99struct rt_trie_node {
91b9a277 100 unsigned long parent;
8d965444 101 t_key key;
19baf839
RO
102};
103
104struct leaf {
91b9a277 105 unsigned long parent;
8d965444 106 t_key key;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
113 int plen;
5c74501f 114 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 115 struct list_head falh;
5c74501f 116 struct rcu_head rcu;
19baf839
RO
117};
118
119struct tnode {
91b9a277 120 unsigned long parent;
8d965444 121 t_key key;
112d8cfc
ED
122 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
123 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
124 unsigned int full_children; /* KEYLENGTH bits needed */
125 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
126 union {
127 struct rcu_head rcu;
128 struct work_struct work;
e0f7cb8c 129 struct tnode *tnode_free;
15be75cd 130 };
0a5c0475 131 struct rt_trie_node __rcu *child[0];
19baf839
RO
132};
133
134#ifdef CONFIG_IP_FIB_TRIE_STATS
135struct trie_use_stats {
136 unsigned int gets;
137 unsigned int backtrack;
138 unsigned int semantic_match_passed;
139 unsigned int semantic_match_miss;
140 unsigned int null_node_hit;
2f36895a 141 unsigned int resize_node_skipped;
19baf839
RO
142};
143#endif
144
145struct trie_stat {
146 unsigned int totdepth;
147 unsigned int maxdepth;
148 unsigned int tnodes;
149 unsigned int leaves;
150 unsigned int nullpointers;
93672292 151 unsigned int prefixes;
06ef921d 152 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 153};
19baf839
RO
154
155struct trie {
0a5c0475 156 struct rt_trie_node __rcu *trie;
19baf839
RO
157#ifdef CONFIG_IP_FIB_TRIE_STATS
158 struct trie_use_stats stats;
159#endif
19baf839
RO
160};
161
b299e4f0 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 163 int wasfull);
b299e4f0 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
165static struct tnode *inflate(struct trie *t, struct tnode *tn);
166static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
167/* tnodes to free after resize(); protected by RTNL */
168static struct tnode *tnode_free_head;
c3059477
JP
169static size_t tnode_free_size;
170
171/*
172 * synchronize_rcu after call_rcu for that many pages; it should be especially
173 * useful before resizing the root node with PREEMPT_NONE configs; the value was
174 * obtained experimentally, aiming to avoid visible slowdown.
175 */
176static const int sync_pages = 128;
19baf839 177
e18b890b 178static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 180
0a5c0475
ED
181/*
182 * caller must hold RTNL
183 */
184static inline struct tnode *node_parent(const struct rt_trie_node *node)
06801916 185{
0a5c0475
ED
186 unsigned long parent;
187
188 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
189
190 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
b59cfbf7
ED
191}
192
0a5c0475
ED
193/*
194 * caller must hold RCU read lock or RTNL
195 */
196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
b59cfbf7 197{
0a5c0475
ED
198 unsigned long parent;
199
200 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
201 lockdep_rtnl_is_held());
06801916 202
0a5c0475 203 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
06801916
SH
204}
205
cf778b00 206/* Same as rcu_assign_pointer
6440cc9e
SH
207 * but that macro() assumes that value is a pointer.
208 */
b299e4f0 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
06801916 210{
6440cc9e
SH
211 smp_wmb();
212 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 213}
2373ce1c 214
0a5c0475
ED
215/*
216 * caller must hold RTNL
217 */
218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7
ED
219{
220 BUG_ON(i >= 1U << tn->bits);
2373ce1c 221
0a5c0475 222 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
223}
224
0a5c0475
ED
225/*
226 * caller must hold RCU read lock or RTNL
227 */
228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 229{
0a5c0475 230 BUG_ON(i >= 1U << tn->bits);
19baf839 231
0a5c0475 232 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
233}
234
bb435b8d 235static inline int tnode_child_length(const struct tnode *tn)
19baf839 236{
91b9a277 237 return 1 << tn->bits;
19baf839
RO
238}
239
3b004569 240static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
241{
242 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
243}
244
3b004569 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 246{
91b9a277 247 if (offset < KEYLENGTH)
19baf839 248 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 249 else
19baf839
RO
250 return 0;
251}
252
253static inline int tkey_equals(t_key a, t_key b)
254{
c877efb2 255 return a == b;
19baf839
RO
256}
257
258static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
259{
c877efb2
SH
260 if (bits == 0 || offset >= KEYLENGTH)
261 return 1;
91b9a277
OJ
262 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
263 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 264}
19baf839
RO
265
266static inline int tkey_mismatch(t_key a, int offset, t_key b)
267{
268 t_key diff = a ^ b;
269 int i = offset;
270
c877efb2
SH
271 if (!diff)
272 return 0;
273 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
274 i++;
275 return i;
276}
277
19baf839 278/*
e905a9ed
YH
279 To understand this stuff, an understanding of keys and all their bits is
280 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
281 all of the bits in that key are significant.
282
283 Consider a node 'n' and its parent 'tp'.
284
e905a9ed
YH
285 If n is a leaf, every bit in its key is significant. Its presence is
286 necessitated by path compression, since during a tree traversal (when
287 searching for a leaf - unless we are doing an insertion) we will completely
288 ignore all skipped bits we encounter. Thus we need to verify, at the end of
289 a potentially successful search, that we have indeed been walking the
19baf839
RO
290 correct key path.
291
e905a9ed
YH
292 Note that we can never "miss" the correct key in the tree if present by
293 following the wrong path. Path compression ensures that segments of the key
294 that are the same for all keys with a given prefix are skipped, but the
295 skipped part *is* identical for each node in the subtrie below the skipped
296 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
297 call to tkey_sub_equals() in trie_insert().
298
e905a9ed 299 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
300 have many different meanings.
301
e905a9ed 302 Example:
19baf839
RO
303 _________________________________________________________________
304 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
305 -----------------------------------------------------------------
e905a9ed 306 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
307
308 _________________________________________________________________
309 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
310 -----------------------------------------------------------------
311 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
312
313 tp->pos = 7
314 tp->bits = 3
315 n->pos = 15
91b9a277 316 n->bits = 4
19baf839 317
e905a9ed
YH
318 First, let's just ignore the bits that come before the parent tp, that is
319 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
320 not use them for anything.
321
322 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 323 index into the parent's child array. That is, they will be used to find
19baf839
RO
324 'n' among tp's children.
325
326 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
327 for the node n.
328
e905a9ed 329 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
330 of the bits are really not needed or indeed known in n->key.
331
e905a9ed 332 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 333 n's child array, and will of course be different for each child.
e905a9ed 334
c877efb2 335
19baf839
RO
336 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
337 at this point.
338
339*/
340
0c7770c7 341static inline void check_tnode(const struct tnode *tn)
19baf839 342{
0c7770c7 343 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
344}
345
f5026fab
DL
346static const int halve_threshold = 25;
347static const int inflate_threshold = 50;
345aa031 348static const int halve_threshold_root = 15;
80b71b80 349static const int inflate_threshold_root = 30;
2373ce1c
RO
350
351static void __alias_free_mem(struct rcu_head *head)
19baf839 352{
2373ce1c
RO
353 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
354 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
355}
356
2373ce1c 357static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 358{
2373ce1c
RO
359 call_rcu(&fa->rcu, __alias_free_mem);
360}
91b9a277 361
2373ce1c
RO
362static void __leaf_free_rcu(struct rcu_head *head)
363{
bc3c8c1e
SH
364 struct leaf *l = container_of(head, struct leaf, rcu);
365 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 366}
91b9a277 367
387a5487
SH
368static inline void free_leaf(struct leaf *l)
369{
0c03eca3 370 call_rcu(&l->rcu, __leaf_free_rcu);
387a5487
SH
371}
372
2373ce1c 373static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 374{
bceb0f45 375 kfree_rcu(leaf, rcu);
19baf839
RO
376}
377
8d965444 378static struct tnode *tnode_alloc(size_t size)
f0e36f8c 379{
2373ce1c 380 if (size <= PAGE_SIZE)
8d965444 381 return kzalloc(size, GFP_KERNEL);
15be75cd 382 else
7a1c8e5a 383 return vzalloc(size);
15be75cd 384}
2373ce1c 385
15be75cd
SH
386static void __tnode_vfree(struct work_struct *arg)
387{
388 struct tnode *tn = container_of(arg, struct tnode, work);
389 vfree(tn);
f0e36f8c
PM
390}
391
2373ce1c 392static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 393{
2373ce1c 394 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444 395 size_t size = sizeof(struct tnode) +
b299e4f0 396 (sizeof(struct rt_trie_node *) << tn->bits);
f0e36f8c
PM
397
398 if (size <= PAGE_SIZE)
399 kfree(tn);
15be75cd
SH
400 else {
401 INIT_WORK(&tn->work, __tnode_vfree);
402 schedule_work(&tn->work);
403 }
f0e36f8c
PM
404}
405
2373ce1c
RO
406static inline void tnode_free(struct tnode *tn)
407{
387a5487
SH
408 if (IS_LEAF(tn))
409 free_leaf((struct leaf *) tn);
410 else
550e29bc 411 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
412}
413
e0f7cb8c
JP
414static void tnode_free_safe(struct tnode *tn)
415{
416 BUG_ON(IS_LEAF(tn));
7b85576d
JP
417 tn->tnode_free = tnode_free_head;
418 tnode_free_head = tn;
c3059477 419 tnode_free_size += sizeof(struct tnode) +
b299e4f0 420 (sizeof(struct rt_trie_node *) << tn->bits);
e0f7cb8c
JP
421}
422
423static void tnode_free_flush(void)
424{
425 struct tnode *tn;
426
427 while ((tn = tnode_free_head)) {
428 tnode_free_head = tn->tnode_free;
429 tn->tnode_free = NULL;
430 tnode_free(tn);
431 }
c3059477
JP
432
433 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
434 tnode_free_size = 0;
435 synchronize_rcu();
436 }
e0f7cb8c
JP
437}
438
2373ce1c
RO
439static struct leaf *leaf_new(void)
440{
bc3c8c1e 441 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
442 if (l) {
443 l->parent = T_LEAF;
444 INIT_HLIST_HEAD(&l->list);
445 }
446 return l;
447}
448
449static struct leaf_info *leaf_info_new(int plen)
450{
451 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
452 if (li) {
453 li->plen = plen;
5c74501f 454 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
455 INIT_LIST_HEAD(&li->falh);
456 }
457 return li;
458}
459
a07f5f50 460static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 461{
b299e4f0 462 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
f0e36f8c 463 struct tnode *tn = tnode_alloc(sz);
19baf839 464
91b9a277 465 if (tn) {
2373ce1c 466 tn->parent = T_TNODE;
19baf839
RO
467 tn->pos = pos;
468 tn->bits = bits;
469 tn->key = key;
470 tn->full_children = 0;
471 tn->empty_children = 1<<bits;
472 }
c877efb2 473
a034ee3c 474 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
4ea4bf7e 475 sizeof(struct rt_trie_node *) << bits);
19baf839
RO
476 return tn;
477}
478
19baf839
RO
479/*
480 * Check whether a tnode 'n' is "full", i.e. it is an internal node
481 * and no bits are skipped. See discussion in dyntree paper p. 6
482 */
483
b299e4f0 484static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
19baf839 485{
c877efb2 486 if (n == NULL || IS_LEAF(n))
19baf839
RO
487 return 0;
488
489 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
490}
491
61648d91 492static inline void put_child(struct tnode *tn, int i,
b299e4f0 493 struct rt_trie_node *n)
19baf839
RO
494{
495 tnode_put_child_reorg(tn, i, n, -1);
496}
497
c877efb2 498 /*
19baf839
RO
499 * Add a child at position i overwriting the old value.
500 * Update the value of full_children and empty_children.
501 */
502
b299e4f0 503static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 504 int wasfull)
19baf839 505{
0a5c0475 506 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
507 int isfull;
508
0c7770c7
SH
509 BUG_ON(i >= 1<<tn->bits);
510
19baf839
RO
511 /* update emptyChildren */
512 if (n == NULL && chi != NULL)
513 tn->empty_children++;
514 else if (n != NULL && chi == NULL)
515 tn->empty_children--;
c877efb2 516
19baf839 517 /* update fullChildren */
91b9a277 518 if (wasfull == -1)
19baf839
RO
519 wasfull = tnode_full(tn, chi);
520
521 isfull = tnode_full(tn, n);
c877efb2 522 if (wasfull && !isfull)
19baf839 523 tn->full_children--;
c877efb2 524 else if (!wasfull && isfull)
19baf839 525 tn->full_children++;
91b9a277 526
c877efb2 527 if (n)
06801916 528 node_set_parent(n, tn);
19baf839 529
cf778b00 530 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
531}
532
80b71b80 533#define MAX_WORK 10
b299e4f0 534static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
535{
536 int i;
2f80b3c8 537 struct tnode *old_tn;
e6308be8
RO
538 int inflate_threshold_use;
539 int halve_threshold_use;
80b71b80 540 int max_work;
19baf839 541
e905a9ed 542 if (!tn)
19baf839
RO
543 return NULL;
544
0c7770c7
SH
545 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
546 tn, inflate_threshold, halve_threshold);
19baf839
RO
547
548 /* No children */
549 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 550 tnode_free_safe(tn);
19baf839
RO
551 return NULL;
552 }
553 /* One child */
554 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 555 goto one_child;
c877efb2 556 /*
19baf839
RO
557 * Double as long as the resulting node has a number of
558 * nonempty nodes that are above the threshold.
559 */
560
561 /*
c877efb2
SH
562 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
563 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 564 * Telecommunications, page 6:
c877efb2 565 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
566 * children in the *doubled* node is at least 'high'."
567 *
c877efb2
SH
568 * 'high' in this instance is the variable 'inflate_threshold'. It
569 * is expressed as a percentage, so we multiply it with
570 * tnode_child_length() and instead of multiplying by 2 (since the
571 * child array will be doubled by inflate()) and multiplying
572 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 573 * multiply the left-hand side by 50.
c877efb2
SH
574 *
575 * The left-hand side may look a bit weird: tnode_child_length(tn)
576 * - tn->empty_children is of course the number of non-null children
577 * in the current node. tn->full_children is the number of "full"
19baf839 578 * children, that is non-null tnodes with a skip value of 0.
c877efb2 579 * All of those will be doubled in the resulting inflated tnode, so
19baf839 580 * we just count them one extra time here.
c877efb2 581 *
19baf839 582 * A clearer way to write this would be:
c877efb2 583 *
19baf839 584 * to_be_doubled = tn->full_children;
c877efb2 585 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
586 * tn->full_children;
587 *
588 * new_child_length = tnode_child_length(tn) * 2;
589 *
c877efb2 590 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
591 * new_child_length;
592 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
593 *
594 * ...and so on, tho it would mess up the while () loop.
595 *
19baf839
RO
596 * anyway,
597 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
598 * inflate_threshold
c877efb2 599 *
19baf839
RO
600 * avoid a division:
601 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
602 * inflate_threshold * new_child_length
c877efb2 603 *
19baf839 604 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 605 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 606 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 607 *
19baf839 608 * expand new_child_length:
c877efb2 609 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 610 * tn->full_children) >=
19baf839 611 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 612 *
19baf839 613 * shorten again:
c877efb2 614 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 615 * tn->empty_children) >= inflate_threshold *
19baf839 616 * tnode_child_length(tn)
c877efb2 617 *
19baf839
RO
618 */
619
620 check_tnode(tn);
c877efb2 621
e6308be8
RO
622 /* Keep root node larger */
623
b299e4f0 624 if (!node_parent((struct rt_trie_node *)tn)) {
80b71b80
JL
625 inflate_threshold_use = inflate_threshold_root;
626 halve_threshold_use = halve_threshold_root;
a034ee3c 627 } else {
e6308be8 628 inflate_threshold_use = inflate_threshold;
80b71b80
JL
629 halve_threshold_use = halve_threshold;
630 }
e6308be8 631
80b71b80
JL
632 max_work = MAX_WORK;
633 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
634 50 * (tn->full_children + tnode_child_length(tn)
635 - tn->empty_children)
636 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 637
2f80b3c8
RO
638 old_tn = tn;
639 tn = inflate(t, tn);
a07f5f50 640
2f80b3c8
RO
641 if (IS_ERR(tn)) {
642 tn = old_tn;
2f36895a
RO
643#ifdef CONFIG_IP_FIB_TRIE_STATS
644 t->stats.resize_node_skipped++;
645#endif
646 break;
647 }
19baf839
RO
648 }
649
650 check_tnode(tn);
651
80b71b80 652 /* Return if at least one inflate is run */
a034ee3c 653 if (max_work != MAX_WORK)
b299e4f0 654 return (struct rt_trie_node *) tn;
80b71b80 655
19baf839
RO
656 /*
657 * Halve as long as the number of empty children in this
658 * node is above threshold.
659 */
2f36895a 660
80b71b80
JL
661 max_work = MAX_WORK;
662 while (tn->bits > 1 && max_work-- &&
19baf839 663 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 664 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 665
2f80b3c8
RO
666 old_tn = tn;
667 tn = halve(t, tn);
668 if (IS_ERR(tn)) {
669 tn = old_tn;
2f36895a
RO
670#ifdef CONFIG_IP_FIB_TRIE_STATS
671 t->stats.resize_node_skipped++;
672#endif
673 break;
674 }
675 }
19baf839 676
c877efb2 677
19baf839 678 /* Only one child remains */
80b71b80
JL
679 if (tn->empty_children == tnode_child_length(tn) - 1) {
680one_child:
19baf839 681 for (i = 0; i < tnode_child_length(tn); i++) {
b299e4f0 682 struct rt_trie_node *n;
19baf839 683
0a5c0475 684 n = rtnl_dereference(tn->child[i]);
2373ce1c 685 if (!n)
91b9a277 686 continue;
91b9a277
OJ
687
688 /* compress one level */
689
06801916 690 node_set_parent(n, NULL);
e0f7cb8c 691 tnode_free_safe(tn);
91b9a277 692 return n;
19baf839 693 }
80b71b80 694 }
b299e4f0 695 return (struct rt_trie_node *) tn;
19baf839
RO
696}
697
0a5c0475
ED
698
699static void tnode_clean_free(struct tnode *tn)
700{
701 int i;
702 struct tnode *tofree;
703
704 for (i = 0; i < tnode_child_length(tn); i++) {
705 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
706 if (tofree)
707 tnode_free(tofree);
708 }
709 tnode_free(tn);
710}
711
2f80b3c8 712static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 713{
19baf839
RO
714 struct tnode *oldtnode = tn;
715 int olen = tnode_child_length(tn);
716 int i;
717
0c7770c7 718 pr_debug("In inflate\n");
19baf839
RO
719
720 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
721
0c7770c7 722 if (!tn)
2f80b3c8 723 return ERR_PTR(-ENOMEM);
2f36895a
RO
724
725 /*
c877efb2
SH
726 * Preallocate and store tnodes before the actual work so we
727 * don't get into an inconsistent state if memory allocation
728 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
729 * of tnode is ignored.
730 */
91b9a277
OJ
731
732 for (i = 0; i < olen; i++) {
a07f5f50 733 struct tnode *inode;
2f36895a 734
a07f5f50 735 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
736 if (inode &&
737 IS_TNODE(inode) &&
738 inode->pos == oldtnode->pos + oldtnode->bits &&
739 inode->bits > 1) {
740 struct tnode *left, *right;
ab66b4a7 741 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 742
2f36895a
RO
743 left = tnode_new(inode->key&(~m), inode->pos + 1,
744 inode->bits - 1);
2f80b3c8
RO
745 if (!left)
746 goto nomem;
91b9a277 747
2f36895a
RO
748 right = tnode_new(inode->key|m, inode->pos + 1,
749 inode->bits - 1);
750
e905a9ed 751 if (!right) {
2f80b3c8
RO
752 tnode_free(left);
753 goto nomem;
e905a9ed 754 }
2f36895a 755
61648d91
LM
756 put_child(tn, 2*i, (struct rt_trie_node *) left);
757 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
2f36895a
RO
758 }
759 }
760
91b9a277 761 for (i = 0; i < olen; i++) {
c95aaf9a 762 struct tnode *inode;
b299e4f0 763 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
764 struct tnode *left, *right;
765 int size, j;
c877efb2 766
19baf839
RO
767 /* An empty child */
768 if (node == NULL)
769 continue;
770
771 /* A leaf or an internal node with skipped bits */
772
c877efb2 773 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 774 tn->pos + tn->bits - 1) {
a07f5f50
SH
775 if (tkey_extract_bits(node->key,
776 oldtnode->pos + oldtnode->bits,
777 1) == 0)
61648d91 778 put_child(tn, 2*i, node);
19baf839 779 else
61648d91 780 put_child(tn, 2*i+1, node);
19baf839
RO
781 continue;
782 }
783
784 /* An internal node with two children */
785 inode = (struct tnode *) node;
786
787 if (inode->bits == 1) {
61648d91
LM
788 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
789 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 790
e0f7cb8c 791 tnode_free_safe(inode);
91b9a277 792 continue;
19baf839
RO
793 }
794
91b9a277
OJ
795 /* An internal node with more than two children */
796
797 /* We will replace this node 'inode' with two new
798 * ones, 'left' and 'right', each with half of the
799 * original children. The two new nodes will have
800 * a position one bit further down the key and this
801 * means that the "significant" part of their keys
802 * (see the discussion near the top of this file)
803 * will differ by one bit, which will be "0" in
804 * left's key and "1" in right's key. Since we are
805 * moving the key position by one step, the bit that
806 * we are moving away from - the bit at position
807 * (inode->pos) - is the one that will differ between
808 * left and right. So... we synthesize that bit in the
809 * two new keys.
810 * The mask 'm' below will be a single "one" bit at
811 * the position (inode->pos)
812 */
19baf839 813
91b9a277
OJ
814 /* Use the old key, but set the new significant
815 * bit to zero.
816 */
2f36895a 817
91b9a277 818 left = (struct tnode *) tnode_get_child(tn, 2*i);
61648d91 819 put_child(tn, 2*i, NULL);
2f36895a 820
91b9a277 821 BUG_ON(!left);
2f36895a 822
91b9a277 823 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
61648d91 824 put_child(tn, 2*i+1, NULL);
19baf839 825
91b9a277 826 BUG_ON(!right);
19baf839 827
91b9a277
OJ
828 size = tnode_child_length(left);
829 for (j = 0; j < size; j++) {
61648d91
LM
830 put_child(left, j, rtnl_dereference(inode->child[j]));
831 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 832 }
61648d91
LM
833 put_child(tn, 2*i, resize(t, left));
834 put_child(tn, 2*i+1, resize(t, right));
91b9a277 835
e0f7cb8c 836 tnode_free_safe(inode);
19baf839 837 }
e0f7cb8c 838 tnode_free_safe(oldtnode);
19baf839 839 return tn;
2f80b3c8 840nomem:
0a5c0475
ED
841 tnode_clean_free(tn);
842 return ERR_PTR(-ENOMEM);
19baf839
RO
843}
844
2f80b3c8 845static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
846{
847 struct tnode *oldtnode = tn;
b299e4f0 848 struct rt_trie_node *left, *right;
19baf839
RO
849 int i;
850 int olen = tnode_child_length(tn);
851
0c7770c7 852 pr_debug("In halve\n");
c877efb2
SH
853
854 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 855
2f80b3c8
RO
856 if (!tn)
857 return ERR_PTR(-ENOMEM);
2f36895a
RO
858
859 /*
c877efb2
SH
860 * Preallocate and store tnodes before the actual work so we
861 * don't get into an inconsistent state if memory allocation
862 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
863 * of tnode is ignored.
864 */
865
91b9a277 866 for (i = 0; i < olen; i += 2) {
2f36895a
RO
867 left = tnode_get_child(oldtnode, i);
868 right = tnode_get_child(oldtnode, i+1);
c877efb2 869
2f36895a 870 /* Two nonempty children */
0c7770c7 871 if (left && right) {
2f80b3c8 872 struct tnode *newn;
0c7770c7 873
2f80b3c8 874 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
875
876 if (!newn)
2f80b3c8 877 goto nomem;
0c7770c7 878
61648d91 879 put_child(tn, i/2, (struct rt_trie_node *)newn);
2f36895a 880 }
2f36895a 881
2f36895a 882 }
19baf839 883
91b9a277
OJ
884 for (i = 0; i < olen; i += 2) {
885 struct tnode *newBinNode;
886
19baf839
RO
887 left = tnode_get_child(oldtnode, i);
888 right = tnode_get_child(oldtnode, i+1);
c877efb2 889
19baf839
RO
890 /* At least one of the children is empty */
891 if (left == NULL) {
892 if (right == NULL) /* Both are empty */
893 continue;
61648d91 894 put_child(tn, i/2, right);
91b9a277 895 continue;
0c7770c7 896 }
91b9a277
OJ
897
898 if (right == NULL) {
61648d91 899 put_child(tn, i/2, left);
91b9a277
OJ
900 continue;
901 }
c877efb2 902
19baf839 903 /* Two nonempty children */
91b9a277 904 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
61648d91
LM
905 put_child(tn, i/2, NULL);
906 put_child(newBinNode, 0, left);
907 put_child(newBinNode, 1, right);
908 put_child(tn, i/2, resize(t, newBinNode));
19baf839 909 }
e0f7cb8c 910 tnode_free_safe(oldtnode);
19baf839 911 return tn;
2f80b3c8 912nomem:
0a5c0475
ED
913 tnode_clean_free(tn);
914 return ERR_PTR(-ENOMEM);
19baf839
RO
915}
916
772cb712 917/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
918 via get_fa_head and dump */
919
772cb712 920static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 921{
772cb712 922 struct hlist_head *head = &l->list;
19baf839
RO
923 struct leaf_info *li;
924
b67bfe0d 925 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 926 if (li->plen == plen)
19baf839 927 return li;
91b9a277 928
19baf839
RO
929 return NULL;
930}
931
a07f5f50 932static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 933{
772cb712 934 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 935
91b9a277
OJ
936 if (!li)
937 return NULL;
c877efb2 938
91b9a277 939 return &li->falh;
19baf839
RO
940}
941
942static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
943{
e905a9ed 944 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
945
946 if (hlist_empty(head)) {
947 hlist_add_head_rcu(&new->hlist, head);
948 } else {
b67bfe0d 949 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
950 if (new->plen > li->plen)
951 break;
952
953 last = li;
954 }
955 if (last)
956 hlist_add_after_rcu(&last->hlist, &new->hlist);
957 else
958 hlist_add_before_rcu(&new->hlist, &li->hlist);
959 }
19baf839
RO
960}
961
2373ce1c
RO
962/* rcu_read_lock needs to be hold by caller from readside */
963
19baf839
RO
964static struct leaf *
965fib_find_node(struct trie *t, u32 key)
966{
967 int pos;
968 struct tnode *tn;
b299e4f0 969 struct rt_trie_node *n;
19baf839
RO
970
971 pos = 0;
a034ee3c 972 n = rcu_dereference_rtnl(t->trie);
19baf839
RO
973
974 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
975 tn = (struct tnode *) n;
91b9a277 976
19baf839 977 check_tnode(tn);
91b9a277 978
c877efb2 979 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 980 pos = tn->pos + tn->bits;
a07f5f50
SH
981 n = tnode_get_child_rcu(tn,
982 tkey_extract_bits(key,
983 tn->pos,
984 tn->bits));
91b9a277 985 } else
19baf839
RO
986 break;
987 }
988 /* Case we have found a leaf. Compare prefixes */
989
91b9a277
OJ
990 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
991 return (struct leaf *)n;
992
19baf839
RO
993 return NULL;
994}
995
7b85576d 996static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 997{
19baf839 998 int wasfull;
3ed18d76 999 t_key cindex, key;
06801916 1000 struct tnode *tp;
19baf839 1001
3ed18d76
RO
1002 key = tn->key;
1003
b299e4f0 1004 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
19baf839
RO
1005 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1006 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
e3192690 1007 tn = (struct tnode *)resize(t, tn);
a07f5f50 1008
e3192690 1009 tnode_put_child_reorg(tp, cindex,
b299e4f0 1010 (struct rt_trie_node *)tn, wasfull);
91b9a277 1011
b299e4f0 1012 tp = node_parent((struct rt_trie_node *) tn);
008440e3 1013 if (!tp)
cf778b00 1014 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
008440e3 1015
e0f7cb8c 1016 tnode_free_flush();
06801916 1017 if (!tp)
19baf839 1018 break;
06801916 1019 tn = tp;
19baf839 1020 }
06801916 1021
19baf839 1022 /* Handle last (top) tnode */
7b85576d 1023 if (IS_TNODE(tn))
e3192690 1024 tn = (struct tnode *)resize(t, tn);
19baf839 1025
cf778b00 1026 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
7b85576d 1027 tnode_free_flush();
19baf839
RO
1028}
1029
2373ce1c
RO
1030/* only used from updater-side */
1031
fea86ad8 1032static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1033{
1034 int pos, newpos;
1035 struct tnode *tp = NULL, *tn = NULL;
b299e4f0 1036 struct rt_trie_node *n;
19baf839
RO
1037 struct leaf *l;
1038 int missbit;
c877efb2 1039 struct list_head *fa_head = NULL;
19baf839
RO
1040 struct leaf_info *li;
1041 t_key cindex;
1042
1043 pos = 0;
0a5c0475 1044 n = rtnl_dereference(t->trie);
19baf839 1045
c877efb2
SH
1046 /* If we point to NULL, stop. Either the tree is empty and we should
1047 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1048 * and we should just put our new leaf in that.
c877efb2
SH
1049 * If we point to a T_TNODE, check if it matches our key. Note that
1050 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1051 * not be the parent's 'pos'+'bits'!
1052 *
c877efb2 1053 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1054 * the index from our key, push the T_TNODE and walk the tree.
1055 *
1056 * If it doesn't, we have to replace it with a new T_TNODE.
1057 *
c877efb2
SH
1058 * If we point to a T_LEAF, it might or might not have the same key
1059 * as we do. If it does, just change the value, update the T_LEAF's
1060 * value, and return it.
19baf839
RO
1061 * If it doesn't, we need to replace it with a T_TNODE.
1062 */
1063
1064 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1065 tn = (struct tnode *) n;
91b9a277 1066
c877efb2 1067 check_tnode(tn);
91b9a277 1068
c877efb2 1069 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1070 tp = tn;
91b9a277 1071 pos = tn->pos + tn->bits;
a07f5f50
SH
1072 n = tnode_get_child(tn,
1073 tkey_extract_bits(key,
1074 tn->pos,
1075 tn->bits));
19baf839 1076
06801916 1077 BUG_ON(n && node_parent(n) != tn);
91b9a277 1078 } else
19baf839
RO
1079 break;
1080 }
1081
1082 /*
1083 * n ----> NULL, LEAF or TNODE
1084 *
c877efb2 1085 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1086 */
1087
91b9a277 1088 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1089
1090 /* Case 1: n is a leaf. Compare prefixes */
1091
c877efb2 1092 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1093 l = (struct leaf *) n;
19baf839 1094 li = leaf_info_new(plen);
91b9a277 1095
fea86ad8
SH
1096 if (!li)
1097 return NULL;
19baf839
RO
1098
1099 fa_head = &li->falh;
1100 insert_leaf_info(&l->list, li);
1101 goto done;
1102 }
19baf839
RO
1103 l = leaf_new();
1104
fea86ad8
SH
1105 if (!l)
1106 return NULL;
19baf839
RO
1107
1108 l->key = key;
1109 li = leaf_info_new(plen);
1110
c877efb2 1111 if (!li) {
387a5487 1112 free_leaf(l);
fea86ad8 1113 return NULL;
f835e471 1114 }
19baf839
RO
1115
1116 fa_head = &li->falh;
1117 insert_leaf_info(&l->list, li);
1118
19baf839 1119 if (t->trie && n == NULL) {
91b9a277 1120 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1121
b299e4f0 1122 node_set_parent((struct rt_trie_node *)l, tp);
19baf839 1123
91b9a277 1124 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1125 put_child(tp, cindex, (struct rt_trie_node *)l);
91b9a277
OJ
1126 } else {
1127 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1128 /*
1129 * Add a new tnode here
19baf839
RO
1130 * first tnode need some special handling
1131 */
1132
1133 if (tp)
91b9a277 1134 pos = tp->pos+tp->bits;
19baf839 1135 else
91b9a277
OJ
1136 pos = 0;
1137
c877efb2 1138 if (n) {
19baf839
RO
1139 newpos = tkey_mismatch(key, pos, n->key);
1140 tn = tnode_new(n->key, newpos, 1);
91b9a277 1141 } else {
19baf839 1142 newpos = 0;
c877efb2 1143 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1144 }
19baf839 1145
c877efb2 1146 if (!tn) {
f835e471 1147 free_leaf_info(li);
387a5487 1148 free_leaf(l);
fea86ad8 1149 return NULL;
91b9a277
OJ
1150 }
1151
b299e4f0 1152 node_set_parent((struct rt_trie_node *)tn, tp);
19baf839 1153
91b9a277 1154 missbit = tkey_extract_bits(key, newpos, 1);
61648d91
LM
1155 put_child(tn, missbit, (struct rt_trie_node *)l);
1156 put_child(tn, 1-missbit, n);
19baf839 1157
c877efb2 1158 if (tp) {
19baf839 1159 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1160 put_child(tp, cindex, (struct rt_trie_node *)tn);
91b9a277 1161 } else {
cf778b00 1162 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
19baf839
RO
1163 tp = tn;
1164 }
1165 }
91b9a277
OJ
1166
1167 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1168 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1169 tp, tp->pos, tp->bits, key, plen);
91b9a277 1170
19baf839 1171 /* Rebalance the trie */
2373ce1c 1172
7b85576d 1173 trie_rebalance(t, tp);
f835e471 1174done:
19baf839
RO
1175 return fa_head;
1176}
1177
d562f1f8
RO
1178/*
1179 * Caller must hold RTNL.
1180 */
16c6cf8b 1181int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1182{
1183 struct trie *t = (struct trie *) tb->tb_data;
1184 struct fib_alias *fa, *new_fa;
c877efb2 1185 struct list_head *fa_head = NULL;
19baf839 1186 struct fib_info *fi;
4e902c57
TG
1187 int plen = cfg->fc_dst_len;
1188 u8 tos = cfg->fc_tos;
19baf839
RO
1189 u32 key, mask;
1190 int err;
1191 struct leaf *l;
1192
1193 if (plen > 32)
1194 return -EINVAL;
1195
4e902c57 1196 key = ntohl(cfg->fc_dst);
19baf839 1197
2dfe55b4 1198 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1199
91b9a277 1200 mask = ntohl(inet_make_mask(plen));
19baf839 1201
c877efb2 1202 if (key & ~mask)
19baf839
RO
1203 return -EINVAL;
1204
1205 key = key & mask;
1206
4e902c57
TG
1207 fi = fib_create_info(cfg);
1208 if (IS_ERR(fi)) {
1209 err = PTR_ERR(fi);
19baf839 1210 goto err;
4e902c57 1211 }
19baf839
RO
1212
1213 l = fib_find_node(t, key);
c877efb2 1214 fa = NULL;
19baf839 1215
c877efb2 1216 if (l) {
19baf839
RO
1217 fa_head = get_fa_head(l, plen);
1218 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1219 }
1220
1221 /* Now fa, if non-NULL, points to the first fib alias
1222 * with the same keys [prefix,tos,priority], if such key already
1223 * exists or to the node before which we will insert new one.
1224 *
1225 * If fa is NULL, we will need to allocate a new one and
1226 * insert to the head of f.
1227 *
1228 * If f is NULL, no fib node matched the destination key
1229 * and we need to allocate a new one of those as well.
1230 */
1231
936f6f8e
JA
1232 if (fa && fa->fa_tos == tos &&
1233 fa->fa_info->fib_priority == fi->fib_priority) {
1234 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1235
1236 err = -EEXIST;
4e902c57 1237 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1238 goto out;
1239
936f6f8e
JA
1240 /* We have 2 goals:
1241 * 1. Find exact match for type, scope, fib_info to avoid
1242 * duplicate routes
1243 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1244 */
1245 fa_match = NULL;
1246 fa_first = fa;
1247 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1248 list_for_each_entry_continue(fa, fa_head, fa_list) {
1249 if (fa->fa_tos != tos)
1250 break;
1251 if (fa->fa_info->fib_priority != fi->fib_priority)
1252 break;
1253 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1254 fa->fa_info == fi) {
1255 fa_match = fa;
1256 break;
1257 }
1258 }
1259
4e902c57 1260 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1261 struct fib_info *fi_drop;
1262 u8 state;
1263
936f6f8e
JA
1264 fa = fa_first;
1265 if (fa_match) {
1266 if (fa == fa_match)
1267 err = 0;
6725033f 1268 goto out;
936f6f8e 1269 }
2373ce1c 1270 err = -ENOBUFS;
e94b1766 1271 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1272 if (new_fa == NULL)
1273 goto out;
19baf839
RO
1274
1275 fi_drop = fa->fa_info;
2373ce1c
RO
1276 new_fa->fa_tos = fa->fa_tos;
1277 new_fa->fa_info = fi;
4e902c57 1278 new_fa->fa_type = cfg->fc_type;
19baf839 1279 state = fa->fa_state;
936f6f8e 1280 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1281
2373ce1c
RO
1282 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1283 alias_free_mem_rcu(fa);
19baf839
RO
1284
1285 fib_release_info(fi_drop);
1286 if (state & FA_S_ACCESSED)
4ccfe6d4 1287 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1288 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1289 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1290
91b9a277 1291 goto succeeded;
19baf839
RO
1292 }
1293 /* Error if we find a perfect match which
1294 * uses the same scope, type, and nexthop
1295 * information.
1296 */
936f6f8e
JA
1297 if (fa_match)
1298 goto out;
a07f5f50 1299
4e902c57 1300 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1301 fa = fa_first;
19baf839
RO
1302 }
1303 err = -ENOENT;
4e902c57 1304 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1305 goto out;
1306
1307 err = -ENOBUFS;
e94b1766 1308 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1309 if (new_fa == NULL)
1310 goto out;
1311
1312 new_fa->fa_info = fi;
1313 new_fa->fa_tos = tos;
4e902c57 1314 new_fa->fa_type = cfg->fc_type;
19baf839 1315 new_fa->fa_state = 0;
19baf839
RO
1316 /*
1317 * Insert new entry to the list.
1318 */
1319
c877efb2 1320 if (!fa_head) {
fea86ad8
SH
1321 fa_head = fib_insert_node(t, key, plen);
1322 if (unlikely(!fa_head)) {
1323 err = -ENOMEM;
f835e471 1324 goto out_free_new_fa;
fea86ad8 1325 }
f835e471 1326 }
19baf839 1327
21d8c49e
DM
1328 if (!plen)
1329 tb->tb_num_default++;
1330
2373ce1c
RO
1331 list_add_tail_rcu(&new_fa->fa_list,
1332 (fa ? &fa->fa_list : fa_head));
19baf839 1333
4ccfe6d4 1334 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1335 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1336 &cfg->fc_nlinfo, 0);
19baf839
RO
1337succeeded:
1338 return 0;
f835e471
RO
1339
1340out_free_new_fa:
1341 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1342out:
1343 fib_release_info(fi);
91b9a277 1344err:
19baf839
RO
1345 return err;
1346}
1347
772cb712 1348/* should be called with rcu_read_lock */
5b470441 1349static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
22bd5b9b 1350 t_key key, const struct flowi4 *flp,
ebc0ffae 1351 struct fib_result *res, int fib_flags)
19baf839 1352{
19baf839
RO
1353 struct leaf_info *li;
1354 struct hlist_head *hhead = &l->list;
c877efb2 1355
b67bfe0d 1356 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1357 struct fib_alias *fa;
a07f5f50 1358
5c74501f 1359 if (l->key != (key & li->mask_plen))
19baf839
RO
1360 continue;
1361
3be0686b
DM
1362 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1363 struct fib_info *fi = fa->fa_info;
1364 int nhsel, err;
a07f5f50 1365
22bd5b9b 1366 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1367 continue;
dccd9ecc
DM
1368 if (fi->fib_dead)
1369 continue;
37e826c5 1370 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1371 continue;
1372 fib_alias_accessed(fa);
1373 err = fib_props[fa->fa_type].error;
1374 if (err) {
19baf839 1375#ifdef CONFIG_IP_FIB_TRIE_STATS
1fbc7843 1376 t->stats.semantic_match_passed++;
3be0686b 1377#endif
1fbc7843 1378 return err;
3be0686b
DM
1379 }
1380 if (fi->fib_flags & RTNH_F_DEAD)
1381 continue;
1382 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1383 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1384
1385 if (nh->nh_flags & RTNH_F_DEAD)
1386 continue;
22bd5b9b 1387 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1388 continue;
1389
1390#ifdef CONFIG_IP_FIB_TRIE_STATS
1391 t->stats.semantic_match_passed++;
1392#endif
5c74501f 1393 res->prefixlen = li->plen;
3be0686b
DM
1394 res->nh_sel = nhsel;
1395 res->type = fa->fa_type;
37e826c5 1396 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1397 res->fi = fi;
1398 res->table = tb;
1399 res->fa_head = &li->falh;
1400 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1401 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1402 return 0;
1403 }
1404 }
1405
1406#ifdef CONFIG_IP_FIB_TRIE_STATS
1407 t->stats.semantic_match_miss++;
19baf839 1408#endif
19baf839 1409 }
a07f5f50 1410
2e655571 1411 return 1;
19baf839
RO
1412}
1413
22bd5b9b 1414int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1415 struct fib_result *res, int fib_flags)
19baf839
RO
1416{
1417 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1418 int ret;
b299e4f0 1419 struct rt_trie_node *n;
19baf839 1420 struct tnode *pn;
3b004569 1421 unsigned int pos, bits;
22bd5b9b 1422 t_key key = ntohl(flp->daddr);
3b004569 1423 unsigned int chopped_off;
19baf839 1424 t_key cindex = 0;
3b004569 1425 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1426 struct tnode *cn;
874ffa8f 1427 t_key pref_mismatch;
91b9a277 1428
2373ce1c 1429 rcu_read_lock();
91b9a277 1430
2373ce1c 1431 n = rcu_dereference(t->trie);
c877efb2 1432 if (!n)
19baf839
RO
1433 goto failed;
1434
1435#ifdef CONFIG_IP_FIB_TRIE_STATS
1436 t->stats.gets++;
1437#endif
1438
1439 /* Just a leaf? */
1440 if (IS_LEAF(n)) {
5b470441 1441 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
a07f5f50 1442 goto found;
19baf839 1443 }
a07f5f50 1444
19baf839
RO
1445 pn = (struct tnode *) n;
1446 chopped_off = 0;
c877efb2 1447
91b9a277 1448 while (pn) {
19baf839
RO
1449 pos = pn->pos;
1450 bits = pn->bits;
1451
c877efb2 1452 if (!chopped_off)
ab66b4a7
SH
1453 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1454 pos, bits);
19baf839 1455
b902e573 1456 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1457
1458 if (n == NULL) {
1459#ifdef CONFIG_IP_FIB_TRIE_STATS
1460 t->stats.null_node_hit++;
1461#endif
1462 goto backtrace;
1463 }
1464
91b9a277 1465 if (IS_LEAF(n)) {
5b470441 1466 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
2e655571 1467 if (ret > 0)
91b9a277 1468 goto backtrace;
a07f5f50 1469 goto found;
91b9a277
OJ
1470 }
1471
91b9a277 1472 cn = (struct tnode *)n;
19baf839 1473
91b9a277
OJ
1474 /*
1475 * It's a tnode, and we can do some extra checks here if we
1476 * like, to avoid descending into a dead-end branch.
1477 * This tnode is in the parent's child array at index
1478 * key[p_pos..p_pos+p_bits] but potentially with some bits
1479 * chopped off, so in reality the index may be just a
1480 * subprefix, padded with zero at the end.
1481 * We can also take a look at any skipped bits in this
1482 * tnode - everything up to p_pos is supposed to be ok,
1483 * and the non-chopped bits of the index (se previous
1484 * paragraph) are also guaranteed ok, but the rest is
1485 * considered unknown.
1486 *
1487 * The skipped bits are key[pos+bits..cn->pos].
1488 */
19baf839 1489
91b9a277
OJ
1490 /* If current_prefix_length < pos+bits, we are already doing
1491 * actual prefix matching, which means everything from
1492 * pos+(bits-chopped_off) onward must be zero along some
1493 * branch of this subtree - otherwise there is *no* valid
1494 * prefix present. Here we can only check the skipped
1495 * bits. Remember, since we have already indexed into the
1496 * parent's child array, we know that the bits we chopped of
1497 * *are* zero.
1498 */
19baf839 1499
a07f5f50
SH
1500 /* NOTA BENE: Checking only skipped bits
1501 for the new node here */
19baf839 1502
91b9a277
OJ
1503 if (current_prefix_length < pos+bits) {
1504 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1505 cn->pos - current_prefix_length)
1506 || !(cn->child[0]))
91b9a277
OJ
1507 goto backtrace;
1508 }
19baf839 1509
91b9a277
OJ
1510 /*
1511 * If chopped_off=0, the index is fully validated and we
1512 * only need to look at the skipped bits for this, the new,
1513 * tnode. What we actually want to do is to find out if
1514 * these skipped bits match our key perfectly, or if we will
1515 * have to count on finding a matching prefix further down,
1516 * because if we do, we would like to have some way of
1517 * verifying the existence of such a prefix at this point.
1518 */
19baf839 1519
91b9a277
OJ
1520 /* The only thing we can do at this point is to verify that
1521 * any such matching prefix can indeed be a prefix to our
1522 * key, and if the bits in the node we are inspecting that
1523 * do not match our key are not ZERO, this cannot be true.
1524 * Thus, find out where there is a mismatch (before cn->pos)
1525 * and verify that all the mismatching bits are zero in the
1526 * new tnode's key.
1527 */
19baf839 1528
a07f5f50
SH
1529 /*
1530 * Note: We aren't very concerned about the piece of
1531 * the key that precede pn->pos+pn->bits, since these
1532 * have already been checked. The bits after cn->pos
1533 * aren't checked since these are by definition
1534 * "unknown" at this point. Thus, what we want to see
1535 * is if we are about to enter the "prefix matching"
1536 * state, and in that case verify that the skipped
1537 * bits that will prevail throughout this subtree are
1538 * zero, as they have to be if we are to find a
1539 * matching prefix.
91b9a277
OJ
1540 */
1541
874ffa8f 1542 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1543
a07f5f50
SH
1544 /*
1545 * In short: If skipped bits in this node do not match
1546 * the search key, enter the "prefix matching"
1547 * state.directly.
91b9a277
OJ
1548 */
1549 if (pref_mismatch) {
79cda75a
ED
1550 /* fls(x) = __fls(x) + 1 */
1551 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
91b9a277 1552
874ffa8f 1553 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1554 goto backtrace;
1555
1556 if (current_prefix_length >= cn->pos)
1557 current_prefix_length = mp;
c877efb2 1558 }
a07f5f50 1559
91b9a277
OJ
1560 pn = (struct tnode *)n; /* Descend */
1561 chopped_off = 0;
1562 continue;
1563
19baf839
RO
1564backtrace:
1565 chopped_off++;
1566
1567 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1568 while ((chopped_off <= pn->bits)
1569 && !(cindex & (1<<(chopped_off-1))))
19baf839 1570 chopped_off++;
19baf839
RO
1571
1572 /* Decrease current_... with bits chopped off */
1573 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1574 current_prefix_length = pn->pos + pn->bits
1575 - chopped_off;
91b9a277 1576
19baf839 1577 /*
c877efb2 1578 * Either we do the actual chop off according or if we have
19baf839
RO
1579 * chopped off all bits in this tnode walk up to our parent.
1580 */
1581
91b9a277 1582 if (chopped_off <= pn->bits) {
19baf839 1583 cindex &= ~(1 << (chopped_off-1));
91b9a277 1584 } else {
b299e4f0 1585 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
06801916 1586 if (!parent)
19baf839 1587 goto failed;
91b9a277 1588
19baf839 1589 /* Get Child's index */
06801916
SH
1590 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1591 pn = parent;
19baf839
RO
1592 chopped_off = 0;
1593
1594#ifdef CONFIG_IP_FIB_TRIE_STATS
1595 t->stats.backtrack++;
1596#endif
1597 goto backtrace;
c877efb2 1598 }
19baf839
RO
1599 }
1600failed:
c877efb2 1601 ret = 1;
19baf839 1602found:
2373ce1c 1603 rcu_read_unlock();
19baf839
RO
1604 return ret;
1605}
6fc01438 1606EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1607
9195bef7
SH
1608/*
1609 * Remove the leaf and return parent.
1610 */
1611static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1612{
b299e4f0 1613 struct tnode *tp = node_parent((struct rt_trie_node *) l);
c877efb2 1614
9195bef7 1615 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1616
c877efb2 1617 if (tp) {
9195bef7 1618 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1619 put_child(tp, cindex, NULL);
7b85576d 1620 trie_rebalance(t, tp);
91b9a277 1621 } else
a9b3cd7f 1622 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1623
387a5487 1624 free_leaf(l);
19baf839
RO
1625}
1626
d562f1f8
RO
1627/*
1628 * Caller must hold RTNL.
1629 */
16c6cf8b 1630int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1631{
1632 struct trie *t = (struct trie *) tb->tb_data;
1633 u32 key, mask;
4e902c57
TG
1634 int plen = cfg->fc_dst_len;
1635 u8 tos = cfg->fc_tos;
19baf839
RO
1636 struct fib_alias *fa, *fa_to_delete;
1637 struct list_head *fa_head;
1638 struct leaf *l;
91b9a277
OJ
1639 struct leaf_info *li;
1640
c877efb2 1641 if (plen > 32)
19baf839
RO
1642 return -EINVAL;
1643
4e902c57 1644 key = ntohl(cfg->fc_dst);
91b9a277 1645 mask = ntohl(inet_make_mask(plen));
19baf839 1646
c877efb2 1647 if (key & ~mask)
19baf839
RO
1648 return -EINVAL;
1649
1650 key = key & mask;
1651 l = fib_find_node(t, key);
1652
c877efb2 1653 if (!l)
19baf839
RO
1654 return -ESRCH;
1655
ad5b3102
IM
1656 li = find_leaf_info(l, plen);
1657
1658 if (!li)
1659 return -ESRCH;
1660
1661 fa_head = &li->falh;
19baf839
RO
1662 fa = fib_find_alias(fa_head, tos, 0);
1663
1664 if (!fa)
1665 return -ESRCH;
1666
0c7770c7 1667 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1668
1669 fa_to_delete = NULL;
936f6f8e
JA
1670 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1671 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1672 struct fib_info *fi = fa->fa_info;
1673
1674 if (fa->fa_tos != tos)
1675 break;
1676
4e902c57
TG
1677 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1678 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1679 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1680 (!cfg->fc_prefsrc ||
1681 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1682 (!cfg->fc_protocol ||
1683 fi->fib_protocol == cfg->fc_protocol) &&
1684 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1685 fa_to_delete = fa;
1686 break;
1687 }
1688 }
1689
91b9a277
OJ
1690 if (!fa_to_delete)
1691 return -ESRCH;
19baf839 1692
91b9a277 1693 fa = fa_to_delete;
4e902c57 1694 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1695 &cfg->fc_nlinfo, 0);
91b9a277 1696
2373ce1c 1697 list_del_rcu(&fa->fa_list);
19baf839 1698
21d8c49e
DM
1699 if (!plen)
1700 tb->tb_num_default--;
1701
91b9a277 1702 if (list_empty(fa_head)) {
2373ce1c 1703 hlist_del_rcu(&li->hlist);
91b9a277 1704 free_leaf_info(li);
2373ce1c 1705 }
19baf839 1706
91b9a277 1707 if (hlist_empty(&l->list))
9195bef7 1708 trie_leaf_remove(t, l);
19baf839 1709
91b9a277 1710 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1711 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1712
2373ce1c
RO
1713 fib_release_info(fa->fa_info);
1714 alias_free_mem_rcu(fa);
91b9a277 1715 return 0;
19baf839
RO
1716}
1717
ef3660ce 1718static int trie_flush_list(struct list_head *head)
19baf839
RO
1719{
1720 struct fib_alias *fa, *fa_node;
1721 int found = 0;
1722
1723 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1724 struct fib_info *fi = fa->fa_info;
19baf839 1725
2373ce1c
RO
1726 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1727 list_del_rcu(&fa->fa_list);
1728 fib_release_info(fa->fa_info);
1729 alias_free_mem_rcu(fa);
19baf839
RO
1730 found++;
1731 }
1732 }
1733 return found;
1734}
1735
ef3660ce 1736static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1737{
1738 int found = 0;
1739 struct hlist_head *lih = &l->list;
b67bfe0d 1740 struct hlist_node *tmp;
19baf839
RO
1741 struct leaf_info *li = NULL;
1742
b67bfe0d 1743 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1744 found += trie_flush_list(&li->falh);
19baf839
RO
1745
1746 if (list_empty(&li->falh)) {
2373ce1c 1747 hlist_del_rcu(&li->hlist);
19baf839
RO
1748 free_leaf_info(li);
1749 }
1750 }
1751 return found;
1752}
1753
82cfbb00
SH
1754/*
1755 * Scan for the next right leaf starting at node p->child[idx]
1756 * Since we have back pointer, no recursion necessary.
1757 */
b299e4f0 1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
19baf839 1759{
82cfbb00
SH
1760 do {
1761 t_key idx;
c877efb2 1762
c877efb2 1763 if (c)
82cfbb00 1764 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1765 else
82cfbb00 1766 idx = 0;
2373ce1c 1767
82cfbb00
SH
1768 while (idx < 1u << p->bits) {
1769 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1770 if (!c)
91b9a277
OJ
1771 continue;
1772
82cfbb00 1773 if (IS_LEAF(c)) {
0a5c0475 1774 prefetch(rcu_dereference_rtnl(p->child[idx]));
82cfbb00 1775 return (struct leaf *) c;
19baf839 1776 }
82cfbb00
SH
1777
1778 /* Rescan start scanning in new node */
1779 p = (struct tnode *) c;
1780 idx = 0;
19baf839 1781 }
82cfbb00
SH
1782
1783 /* Node empty, walk back up to parent */
b299e4f0 1784 c = (struct rt_trie_node *) p;
a034ee3c 1785 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1786
1787 return NULL; /* Root of trie */
1788}
1789
82cfbb00
SH
1790static struct leaf *trie_firstleaf(struct trie *t)
1791{
a034ee3c 1792 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1793
1794 if (!n)
1795 return NULL;
1796
1797 if (IS_LEAF(n)) /* trie is just a leaf */
1798 return (struct leaf *) n;
1799
1800 return leaf_walk_rcu(n, NULL);
1801}
1802
1803static struct leaf *trie_nextleaf(struct leaf *l)
1804{
b299e4f0 1805 struct rt_trie_node *c = (struct rt_trie_node *) l;
b902e573 1806 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1807
1808 if (!p)
1809 return NULL; /* trie with just one leaf */
1810
1811 return leaf_walk_rcu(p, c);
19baf839
RO
1812}
1813
71d67e66
SH
1814static struct leaf *trie_leafindex(struct trie *t, int index)
1815{
1816 struct leaf *l = trie_firstleaf(t);
1817
ec28cf73 1818 while (l && index-- > 0)
71d67e66 1819 l = trie_nextleaf(l);
ec28cf73 1820
71d67e66
SH
1821 return l;
1822}
1823
1824
d562f1f8
RO
1825/*
1826 * Caller must hold RTNL.
1827 */
16c6cf8b 1828int fib_table_flush(struct fib_table *tb)
19baf839
RO
1829{
1830 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1831 struct leaf *l, *ll = NULL;
82cfbb00 1832 int found = 0;
19baf839 1833
82cfbb00 1834 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1835 found += trie_flush_leaf(l);
19baf839
RO
1836
1837 if (ll && hlist_empty(&ll->list))
9195bef7 1838 trie_leaf_remove(t, ll);
19baf839
RO
1839 ll = l;
1840 }
1841
1842 if (ll && hlist_empty(&ll->list))
9195bef7 1843 trie_leaf_remove(t, ll);
19baf839 1844
0c7770c7 1845 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1846 return found;
1847}
1848
4aa2c466
PE
1849void fib_free_table(struct fib_table *tb)
1850{
1851 kfree(tb);
1852}
1853
a07f5f50
SH
1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1855 struct fib_table *tb,
19baf839
RO
1856 struct sk_buff *skb, struct netlink_callback *cb)
1857{
1858 int i, s_i;
1859 struct fib_alias *fa;
32ab5f80 1860 __be32 xkey = htonl(key);
19baf839 1861
71d67e66 1862 s_i = cb->args[5];
19baf839
RO
1863 i = 0;
1864
2373ce1c
RO
1865 /* rcu_read_lock is hold by caller */
1866
1867 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1868 if (i < s_i) {
1869 i++;
1870 continue;
1871 }
19baf839 1872
15e47304 1873 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1874 cb->nlh->nlmsg_seq,
1875 RTM_NEWROUTE,
1876 tb->tb_id,
1877 fa->fa_type,
be403ea1 1878 xkey,
19baf839
RO
1879 plen,
1880 fa->fa_tos,
64347f78 1881 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1882 cb->args[5] = i;
19baf839 1883 return -1;
91b9a277 1884 }
19baf839
RO
1885 i++;
1886 }
71d67e66 1887 cb->args[5] = i;
19baf839
RO
1888 return skb->len;
1889}
1890
a88ee229
SH
1891static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1892 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1893{
a88ee229 1894 struct leaf_info *li;
a88ee229 1895 int i, s_i;
19baf839 1896
71d67e66 1897 s_i = cb->args[4];
a88ee229 1898 i = 0;
19baf839 1899
a88ee229 1900 /* rcu_read_lock is hold by caller */
b67bfe0d 1901 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1902 if (i < s_i) {
1903 i++;
19baf839 1904 continue;
a88ee229 1905 }
91b9a277 1906
a88ee229 1907 if (i > s_i)
71d67e66 1908 cb->args[5] = 0;
19baf839 1909
a88ee229 1910 if (list_empty(&li->falh))
19baf839
RO
1911 continue;
1912
a88ee229 1913 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1914 cb->args[4] = i;
19baf839
RO
1915 return -1;
1916 }
a88ee229 1917 i++;
19baf839 1918 }
a88ee229 1919
71d67e66 1920 cb->args[4] = i;
19baf839
RO
1921 return skb->len;
1922}
1923
16c6cf8b
SH
1924int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1925 struct netlink_callback *cb)
19baf839 1926{
a88ee229 1927 struct leaf *l;
19baf839 1928 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1929 t_key key = cb->args[2];
71d67e66 1930 int count = cb->args[3];
19baf839 1931
2373ce1c 1932 rcu_read_lock();
d5ce8a0e
SH
1933 /* Dump starting at last key.
1934 * Note: 0.0.0.0/0 (ie default) is first key.
1935 */
71d67e66 1936 if (count == 0)
d5ce8a0e
SH
1937 l = trie_firstleaf(t);
1938 else {
71d67e66
SH
1939 /* Normally, continue from last key, but if that is missing
1940 * fallback to using slow rescan
1941 */
d5ce8a0e 1942 l = fib_find_node(t, key);
71d67e66
SH
1943 if (!l)
1944 l = trie_leafindex(t, count);
d5ce8a0e 1945 }
a88ee229 1946
d5ce8a0e
SH
1947 while (l) {
1948 cb->args[2] = l->key;
a88ee229 1949 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1950 cb->args[3] = count;
a88ee229 1951 rcu_read_unlock();
a88ee229 1952 return -1;
19baf839 1953 }
d5ce8a0e 1954
71d67e66 1955 ++count;
d5ce8a0e 1956 l = trie_nextleaf(l);
71d67e66
SH
1957 memset(&cb->args[4], 0,
1958 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1959 }
71d67e66 1960 cb->args[3] = count;
2373ce1c 1961 rcu_read_unlock();
a88ee229 1962
19baf839 1963 return skb->len;
19baf839
RO
1964}
1965
5348ba85 1966void __init fib_trie_init(void)
7f9b8052 1967{
a07f5f50
SH
1968 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1969 sizeof(struct fib_alias),
bc3c8c1e
SH
1970 0, SLAB_PANIC, NULL);
1971
1972 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1973 max(sizeof(struct leaf),
1974 sizeof(struct leaf_info)),
1975 0, SLAB_PANIC, NULL);
7f9b8052 1976}
19baf839 1977
7f9b8052 1978
5348ba85 1979struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1980{
1981 struct fib_table *tb;
1982 struct trie *t;
1983
19baf839
RO
1984 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985 GFP_KERNEL);
1986 if (tb == NULL)
1987 return NULL;
1988
1989 tb->tb_id = id;
971b893e 1990 tb->tb_default = -1;
21d8c49e 1991 tb->tb_num_default = 0;
19baf839
RO
1992
1993 t = (struct trie *) tb->tb_data;
c28a1cf4 1994 memset(t, 0, sizeof(*t));
19baf839 1995
19baf839
RO
1996 return tb;
1997}
1998
cb7b593c
SH
1999#ifdef CONFIG_PROC_FS
2000/* Depth first Trie walk iterator */
2001struct fib_trie_iter {
1c340b2f 2002 struct seq_net_private p;
3d3b2d25 2003 struct fib_table *tb;
cb7b593c 2004 struct tnode *tnode;
a034ee3c
ED
2005 unsigned int index;
2006 unsigned int depth;
cb7b593c 2007};
19baf839 2008
b299e4f0 2009static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2010{
cb7b593c 2011 struct tnode *tn = iter->tnode;
a034ee3c 2012 unsigned int cindex = iter->index;
cb7b593c 2013 struct tnode *p;
19baf839 2014
6640e697
EB
2015 /* A single entry routing table */
2016 if (!tn)
2017 return NULL;
2018
cb7b593c
SH
2019 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2020 iter->tnode, iter->index, iter->depth);
2021rescan:
2022 while (cindex < (1<<tn->bits)) {
b299e4f0 2023 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2024
cb7b593c
SH
2025 if (n) {
2026 if (IS_LEAF(n)) {
2027 iter->tnode = tn;
2028 iter->index = cindex + 1;
2029 } else {
2030 /* push down one level */
2031 iter->tnode = (struct tnode *) n;
2032 iter->index = 0;
2033 ++iter->depth;
2034 }
2035 return n;
2036 }
19baf839 2037
cb7b593c
SH
2038 ++cindex;
2039 }
91b9a277 2040
cb7b593c 2041 /* Current node exhausted, pop back up */
b299e4f0 2042 p = node_parent_rcu((struct rt_trie_node *)tn);
cb7b593c
SH
2043 if (p) {
2044 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2045 tn = p;
2046 --iter->depth;
2047 goto rescan;
19baf839 2048 }
cb7b593c
SH
2049
2050 /* got root? */
2051 return NULL;
19baf839
RO
2052}
2053
b299e4f0 2054static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 2055 struct trie *t)
19baf839 2056{
b299e4f0 2057 struct rt_trie_node *n;
5ddf0eb2 2058
132adf54 2059 if (!t)
5ddf0eb2
RO
2060 return NULL;
2061
2062 n = rcu_dereference(t->trie);
3d3b2d25 2063 if (!n)
5ddf0eb2 2064 return NULL;
19baf839 2065
3d3b2d25
SH
2066 if (IS_TNODE(n)) {
2067 iter->tnode = (struct tnode *) n;
2068 iter->index = 0;
2069 iter->depth = 1;
2070 } else {
2071 iter->tnode = NULL;
2072 iter->index = 0;
2073 iter->depth = 0;
91b9a277 2074 }
3d3b2d25
SH
2075
2076 return n;
cb7b593c 2077}
91b9a277 2078
cb7b593c
SH
2079static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2080{
b299e4f0 2081 struct rt_trie_node *n;
cb7b593c 2082 struct fib_trie_iter iter;
91b9a277 2083
cb7b593c 2084 memset(s, 0, sizeof(*s));
91b9a277 2085
cb7b593c 2086 rcu_read_lock();
3d3b2d25 2087 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2088 if (IS_LEAF(n)) {
93672292
SH
2089 struct leaf *l = (struct leaf *)n;
2090 struct leaf_info *li;
93672292 2091
cb7b593c
SH
2092 s->leaves++;
2093 s->totdepth += iter.depth;
2094 if (iter.depth > s->maxdepth)
2095 s->maxdepth = iter.depth;
93672292 2096
b67bfe0d 2097 hlist_for_each_entry_rcu(li, &l->list, hlist)
93672292 2098 ++s->prefixes;
cb7b593c
SH
2099 } else {
2100 const struct tnode *tn = (const struct tnode *) n;
2101 int i;
2102
2103 s->tnodes++;
132adf54 2104 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2105 s->nodesizes[tn->bits]++;
2106
cb7b593c
SH
2107 for (i = 0; i < (1<<tn->bits); i++)
2108 if (!tn->child[i])
2109 s->nullpointers++;
19baf839 2110 }
19baf839 2111 }
2373ce1c 2112 rcu_read_unlock();
19baf839
RO
2113}
2114
cb7b593c
SH
2115/*
2116 * This outputs /proc/net/fib_triestats
2117 */
2118static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2119{
a034ee3c 2120 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2121
cb7b593c
SH
2122 if (stat->leaves)
2123 avdepth = stat->totdepth*100 / stat->leaves;
2124 else
2125 avdepth = 0;
91b9a277 2126
a07f5f50
SH
2127 seq_printf(seq, "\tAver depth: %u.%02d\n",
2128 avdepth / 100, avdepth % 100);
cb7b593c 2129 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2130
cb7b593c 2131 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2132 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2133
2134 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2135 bytes += sizeof(struct leaf_info) * stat->prefixes;
2136
187b5188 2137 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2138 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2139
06ef921d
RO
2140 max = MAX_STAT_DEPTH;
2141 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2142 max--;
19baf839 2143
cb7b593c
SH
2144 pointers = 0;
2145 for (i = 1; i <= max; i++)
2146 if (stat->nodesizes[i] != 0) {
187b5188 2147 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2148 pointers += (1<<i) * stat->nodesizes[i];
2149 }
2150 seq_putc(seq, '\n');
187b5188 2151 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2152
b299e4f0 2153 bytes += sizeof(struct rt_trie_node *) * pointers;
187b5188
SH
2154 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2155 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2156}
2373ce1c 2157
cb7b593c 2158#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2159static void trie_show_usage(struct seq_file *seq,
2160 const struct trie_use_stats *stats)
2161{
2162 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2163 seq_printf(seq, "gets = %u\n", stats->gets);
2164 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2165 seq_printf(seq, "semantic match passed = %u\n",
2166 stats->semantic_match_passed);
2167 seq_printf(seq, "semantic match miss = %u\n",
2168 stats->semantic_match_miss);
2169 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2170 seq_printf(seq, "skipped node resize = %u\n\n",
2171 stats->resize_node_skipped);
cb7b593c 2172}
66a2f7fd
SH
2173#endif /* CONFIG_IP_FIB_TRIE_STATS */
2174
3d3b2d25 2175static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2176{
3d3b2d25
SH
2177 if (tb->tb_id == RT_TABLE_LOCAL)
2178 seq_puts(seq, "Local:\n");
2179 else if (tb->tb_id == RT_TABLE_MAIN)
2180 seq_puts(seq, "Main:\n");
2181 else
2182 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2183}
19baf839 2184
3d3b2d25 2185
cb7b593c
SH
2186static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2187{
1c340b2f 2188 struct net *net = (struct net *)seq->private;
3d3b2d25 2189 unsigned int h;
877a9bff 2190
d717a9a6 2191 seq_printf(seq,
a07f5f50
SH
2192 "Basic info: size of leaf:"
2193 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2194 sizeof(struct leaf), sizeof(struct tnode));
2195
3d3b2d25
SH
2196 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2197 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2198 struct fib_table *tb;
2199
b67bfe0d 2200 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2201 struct trie *t = (struct trie *) tb->tb_data;
2202 struct trie_stat stat;
877a9bff 2203
3d3b2d25
SH
2204 if (!t)
2205 continue;
2206
2207 fib_table_print(seq, tb);
2208
2209 trie_collect_stats(t, &stat);
2210 trie_show_stats(seq, &stat);
2211#ifdef CONFIG_IP_FIB_TRIE_STATS
2212 trie_show_usage(seq, &t->stats);
2213#endif
2214 }
2215 }
19baf839 2216
cb7b593c 2217 return 0;
19baf839
RO
2218}
2219
cb7b593c 2220static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2221{
de05c557 2222 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2223}
2224
9a32144e 2225static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2226 .owner = THIS_MODULE,
2227 .open = fib_triestat_seq_open,
2228 .read = seq_read,
2229 .llseek = seq_lseek,
b6fcbdb4 2230 .release = single_release_net,
cb7b593c
SH
2231};
2232
b299e4f0 2233static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2234{
1218854a
YH
2235 struct fib_trie_iter *iter = seq->private;
2236 struct net *net = seq_file_net(seq);
cb7b593c 2237 loff_t idx = 0;
3d3b2d25 2238 unsigned int h;
cb7b593c 2239
3d3b2d25
SH
2240 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2241 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2242 struct fib_table *tb;
cb7b593c 2243
b67bfe0d 2244 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
b299e4f0 2245 struct rt_trie_node *n;
3d3b2d25
SH
2246
2247 for (n = fib_trie_get_first(iter,
2248 (struct trie *) tb->tb_data);
2249 n; n = fib_trie_get_next(iter))
2250 if (pos == idx++) {
2251 iter->tb = tb;
2252 return n;
2253 }
2254 }
cb7b593c 2255 }
3d3b2d25 2256
19baf839
RO
2257 return NULL;
2258}
2259
cb7b593c 2260static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2261 __acquires(RCU)
19baf839 2262{
cb7b593c 2263 rcu_read_lock();
1218854a 2264 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2265}
2266
cb7b593c 2267static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2268{
cb7b593c 2269 struct fib_trie_iter *iter = seq->private;
1218854a 2270 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2271 struct fib_table *tb = iter->tb;
2272 struct hlist_node *tb_node;
2273 unsigned int h;
b299e4f0 2274 struct rt_trie_node *n;
cb7b593c 2275
19baf839 2276 ++*pos;
3d3b2d25
SH
2277 /* next node in same table */
2278 n = fib_trie_get_next(iter);
2279 if (n)
2280 return n;
19baf839 2281
3d3b2d25
SH
2282 /* walk rest of this hash chain */
2283 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2284 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2285 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2286 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2287 if (n)
2288 goto found;
2289 }
19baf839 2290
3d3b2d25
SH
2291 /* new hash chain */
2292 while (++h < FIB_TABLE_HASHSZ) {
2293 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2294 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2295 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2296 if (n)
2297 goto found;
2298 }
2299 }
cb7b593c 2300 return NULL;
3d3b2d25
SH
2301
2302found:
2303 iter->tb = tb;
2304 return n;
cb7b593c 2305}
19baf839 2306
cb7b593c 2307static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2308 __releases(RCU)
19baf839 2309{
cb7b593c
SH
2310 rcu_read_unlock();
2311}
91b9a277 2312
cb7b593c
SH
2313static void seq_indent(struct seq_file *seq, int n)
2314{
a034ee3c
ED
2315 while (n-- > 0)
2316 seq_puts(seq, " ");
cb7b593c 2317}
19baf839 2318
28d36e37 2319static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2320{
132adf54 2321 switch (s) {
cb7b593c
SH
2322 case RT_SCOPE_UNIVERSE: return "universe";
2323 case RT_SCOPE_SITE: return "site";
2324 case RT_SCOPE_LINK: return "link";
2325 case RT_SCOPE_HOST: return "host";
2326 case RT_SCOPE_NOWHERE: return "nowhere";
2327 default:
28d36e37 2328 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2329 return buf;
2330 }
2331}
19baf839 2332
36cbd3dc 2333static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2334 [RTN_UNSPEC] = "UNSPEC",
2335 [RTN_UNICAST] = "UNICAST",
2336 [RTN_LOCAL] = "LOCAL",
2337 [RTN_BROADCAST] = "BROADCAST",
2338 [RTN_ANYCAST] = "ANYCAST",
2339 [RTN_MULTICAST] = "MULTICAST",
2340 [RTN_BLACKHOLE] = "BLACKHOLE",
2341 [RTN_UNREACHABLE] = "UNREACHABLE",
2342 [RTN_PROHIBIT] = "PROHIBIT",
2343 [RTN_THROW] = "THROW",
2344 [RTN_NAT] = "NAT",
2345 [RTN_XRESOLVE] = "XRESOLVE",
2346};
19baf839 2347
a034ee3c 2348static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2349{
cb7b593c
SH
2350 if (t < __RTN_MAX && rtn_type_names[t])
2351 return rtn_type_names[t];
28d36e37 2352 snprintf(buf, len, "type %u", t);
cb7b593c 2353 return buf;
19baf839
RO
2354}
2355
cb7b593c
SH
2356/* Pretty print the trie */
2357static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2358{
cb7b593c 2359 const struct fib_trie_iter *iter = seq->private;
b299e4f0 2360 struct rt_trie_node *n = v;
c877efb2 2361
3d3b2d25
SH
2362 if (!node_parent_rcu(n))
2363 fib_table_print(seq, iter->tb);
095b8501 2364
cb7b593c
SH
2365 if (IS_TNODE(n)) {
2366 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2367 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2368
1d25cd6c 2369 seq_indent(seq, iter->depth-1);
673d57e7
HH
2370 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2371 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2372 tn->empty_children);
e905a9ed 2373
cb7b593c
SH
2374 } else {
2375 struct leaf *l = (struct leaf *) n;
1328042e 2376 struct leaf_info *li;
32ab5f80 2377 __be32 val = htonl(l->key);
cb7b593c
SH
2378
2379 seq_indent(seq, iter->depth);
673d57e7 2380 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2381
b67bfe0d 2382 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1328042e
SH
2383 struct fib_alias *fa;
2384
2385 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2386 char buf1[32], buf2[32];
2387
2388 seq_indent(seq, iter->depth+1);
2389 seq_printf(seq, " /%d %s %s", li->plen,
2390 rtn_scope(buf1, sizeof(buf1),
37e826c5 2391 fa->fa_info->fib_scope),
1328042e
SH
2392 rtn_type(buf2, sizeof(buf2),
2393 fa->fa_type));
2394 if (fa->fa_tos)
b9c4d82a 2395 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2396 seq_putc(seq, '\n');
cb7b593c
SH
2397 }
2398 }
19baf839 2399 }
cb7b593c 2400
19baf839
RO
2401 return 0;
2402}
2403
f690808e 2404static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2405 .start = fib_trie_seq_start,
2406 .next = fib_trie_seq_next,
2407 .stop = fib_trie_seq_stop,
2408 .show = fib_trie_seq_show,
19baf839
RO
2409};
2410
cb7b593c 2411static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2412{
1c340b2f
DL
2413 return seq_open_net(inode, file, &fib_trie_seq_ops,
2414 sizeof(struct fib_trie_iter));
19baf839
RO
2415}
2416
9a32144e 2417static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2418 .owner = THIS_MODULE,
2419 .open = fib_trie_seq_open,
2420 .read = seq_read,
2421 .llseek = seq_lseek,
1c340b2f 2422 .release = seq_release_net,
19baf839
RO
2423};
2424
8315f5d8
SH
2425struct fib_route_iter {
2426 struct seq_net_private p;
2427 struct trie *main_trie;
2428 loff_t pos;
2429 t_key key;
2430};
2431
2432static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2433{
2434 struct leaf *l = NULL;
2435 struct trie *t = iter->main_trie;
2436
2437 /* use cache location of last found key */
2438 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2439 pos -= iter->pos;
2440 else {
2441 iter->pos = 0;
2442 l = trie_firstleaf(t);
2443 }
2444
2445 while (l && pos-- > 0) {
2446 iter->pos++;
2447 l = trie_nextleaf(l);
2448 }
2449
2450 if (l)
2451 iter->key = pos; /* remember it */
2452 else
2453 iter->pos = 0; /* forget it */
2454
2455 return l;
2456}
2457
2458static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2459 __acquires(RCU)
2460{
2461 struct fib_route_iter *iter = seq->private;
2462 struct fib_table *tb;
2463
2464 rcu_read_lock();
1218854a 2465 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2466 if (!tb)
2467 return NULL;
2468
2469 iter->main_trie = (struct trie *) tb->tb_data;
2470 if (*pos == 0)
2471 return SEQ_START_TOKEN;
2472 else
2473 return fib_route_get_idx(iter, *pos - 1);
2474}
2475
2476static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2477{
2478 struct fib_route_iter *iter = seq->private;
2479 struct leaf *l = v;
2480
2481 ++*pos;
2482 if (v == SEQ_START_TOKEN) {
2483 iter->pos = 0;
2484 l = trie_firstleaf(iter->main_trie);
2485 } else {
2486 iter->pos++;
2487 l = trie_nextleaf(l);
2488 }
2489
2490 if (l)
2491 iter->key = l->key;
2492 else
2493 iter->pos = 0;
2494 return l;
2495}
2496
2497static void fib_route_seq_stop(struct seq_file *seq, void *v)
2498 __releases(RCU)
2499{
2500 rcu_read_unlock();
2501}
2502
a034ee3c 2503static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2504{
a034ee3c 2505 unsigned int flags = 0;
19baf839 2506
a034ee3c
ED
2507 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2508 flags = RTF_REJECT;
cb7b593c
SH
2509 if (fi && fi->fib_nh->nh_gw)
2510 flags |= RTF_GATEWAY;
32ab5f80 2511 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2512 flags |= RTF_HOST;
2513 flags |= RTF_UP;
2514 return flags;
19baf839
RO
2515}
2516
cb7b593c
SH
2517/*
2518 * This outputs /proc/net/route.
2519 * The format of the file is not supposed to be changed
a034ee3c 2520 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2521 * legacy utilities
2522 */
2523static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2524{
cb7b593c 2525 struct leaf *l = v;
1328042e 2526 struct leaf_info *li;
19baf839 2527
cb7b593c
SH
2528 if (v == SEQ_START_TOKEN) {
2529 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2530 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2531 "\tWindow\tIRTT");
2532 return 0;
2533 }
19baf839 2534
b67bfe0d 2535 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2536 struct fib_alias *fa;
32ab5f80 2537 __be32 mask, prefix;
91b9a277 2538
cb7b593c
SH
2539 mask = inet_make_mask(li->plen);
2540 prefix = htonl(l->key);
19baf839 2541
cb7b593c 2542 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2543 const struct fib_info *fi = fa->fa_info;
a034ee3c 2544 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2545 int len;
19baf839 2546
cb7b593c
SH
2547 if (fa->fa_type == RTN_BROADCAST
2548 || fa->fa_type == RTN_MULTICAST)
2549 continue;
19baf839 2550
cb7b593c 2551 if (fi)
5e659e4c
PE
2552 seq_printf(seq,
2553 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2554 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2555 fi->fib_dev ? fi->fib_dev->name : "*",
2556 prefix,
2557 fi->fib_nh->nh_gw, flags, 0, 0,
2558 fi->fib_priority,
2559 mask,
a07f5f50
SH
2560 (fi->fib_advmss ?
2561 fi->fib_advmss + 40 : 0),
cb7b593c 2562 fi->fib_window,
5e659e4c 2563 fi->fib_rtt >> 3, &len);
cb7b593c 2564 else
5e659e4c
PE
2565 seq_printf(seq,
2566 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2567 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2568 prefix, 0, flags, 0, 0, 0,
5e659e4c 2569 mask, 0, 0, 0, &len);
19baf839 2570
5e659e4c 2571 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2572 }
19baf839
RO
2573 }
2574
2575 return 0;
2576}
2577
f690808e 2578static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2579 .start = fib_route_seq_start,
2580 .next = fib_route_seq_next,
2581 .stop = fib_route_seq_stop,
cb7b593c 2582 .show = fib_route_seq_show,
19baf839
RO
2583};
2584
cb7b593c 2585static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2586{
1c340b2f 2587 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2588 sizeof(struct fib_route_iter));
19baf839
RO
2589}
2590
9a32144e 2591static const struct file_operations fib_route_fops = {
cb7b593c
SH
2592 .owner = THIS_MODULE,
2593 .open = fib_route_seq_open,
2594 .read = seq_read,
2595 .llseek = seq_lseek,
1c340b2f 2596 .release = seq_release_net,
19baf839
RO
2597};
2598
61a02653 2599int __net_init fib_proc_init(struct net *net)
19baf839 2600{
d4beaa66 2601 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2602 goto out1;
2603
d4beaa66
G
2604 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2605 &fib_triestat_fops))
cb7b593c
SH
2606 goto out2;
2607
d4beaa66 2608 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2609 goto out3;
2610
19baf839 2611 return 0;
cb7b593c
SH
2612
2613out3:
ece31ffd 2614 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2615out2:
ece31ffd 2616 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2617out1:
2618 return -ENOMEM;
19baf839
RO
2619}
2620
61a02653 2621void __net_exit fib_proc_exit(struct net *net)
19baf839 2622{
ece31ffd
G
2623 remove_proc_entry("fib_trie", net->proc_net);
2624 remove_proc_entry("fib_triestat", net->proc_net);
2625 remove_proc_entry("route", net->proc_net);
19baf839
RO
2626}
2627
2628#endif /* CONFIG_PROC_FS */
This page took 0.999435 seconds and 5 git commands to generate.