[IPV4] fib_trie: dump message multiple part flag
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
fd966255
RO
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
19baf839
RO
51 */
52
05eee48c 53#define VERSION "0.408"
19baf839 54
19baf839
RO
55#include <asm/uaccess.h>
56#include <asm/system.h>
1977f032 57#include <linux/bitops.h>
19baf839
RO
58#include <linux/types.h>
59#include <linux/kernel.h>
19baf839
RO
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
cd8787ab 67#include <linux/inetdevice.h>
19baf839
RO
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
2373ce1c 71#include <linux/rcupdate.h>
19baf839
RO
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
98
99struct node {
91b9a277 100 unsigned long parent;
8d965444 101 t_key key;
19baf839
RO
102};
103
104struct leaf {
91b9a277 105 unsigned long parent;
8d965444 106 t_key key;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
2373ce1c 113 struct rcu_head rcu;
19baf839
RO
114 int plen;
115 struct list_head falh;
116};
117
118struct tnode {
91b9a277 119 unsigned long parent;
8d965444 120 t_key key;
112d8cfc
ED
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
2373ce1c 125 struct rcu_head rcu;
91b9a277 126 struct node *child[0];
19baf839
RO
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131 unsigned int gets;
132 unsigned int backtrack;
133 unsigned int semantic_match_passed;
134 unsigned int semantic_match_miss;
135 unsigned int null_node_hit;
2f36895a 136 unsigned int resize_node_skipped;
19baf839
RO
137};
138#endif
139
140struct trie_stat {
141 unsigned int totdepth;
142 unsigned int maxdepth;
143 unsigned int tnodes;
144 unsigned int leaves;
145 unsigned int nullpointers;
93672292 146 unsigned int prefixes;
06ef921d 147 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 148};
19baf839
RO
149
150struct trie {
91b9a277 151 struct node *trie;
19baf839
RO
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153 struct trie_use_stats stats;
154#endif
19baf839
RO
155};
156
19baf839 157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159 int wasfull);
19baf839 160static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
161static struct tnode *inflate(struct trie *t, struct tnode *tn);
162static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839 163static void tnode_free(struct tnode *tn);
19baf839 164
e18b890b 165static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 166static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 167
06801916
SH
168static inline struct tnode *node_parent(struct node *node)
169{
b59cfbf7
ED
170 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171}
172
173static inline struct tnode *node_parent_rcu(struct node *node)
174{
175 struct tnode *ret = node_parent(node);
06801916 176
06801916
SH
177 return rcu_dereference(ret);
178}
179
180static inline void node_set_parent(struct node *node, struct tnode *ptr)
181{
182 rcu_assign_pointer(node->parent,
183 (unsigned long)ptr | NODE_TYPE(node));
184}
2373ce1c 185
b59cfbf7
ED
186static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
187{
188 BUG_ON(i >= 1U << tn->bits);
2373ce1c 189
b59cfbf7
ED
190 return tn->child[i];
191}
192
193static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 194{
b59cfbf7 195 struct node *ret = tnode_get_child(tn, i);
19baf839 196
b59cfbf7 197 return rcu_dereference(ret);
19baf839
RO
198}
199
bb435b8d 200static inline int tnode_child_length(const struct tnode *tn)
19baf839 201{
91b9a277 202 return 1 << tn->bits;
19baf839
RO
203}
204
ab66b4a7
SH
205static inline t_key mask_pfx(t_key k, unsigned short l)
206{
207 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
208}
209
19baf839
RO
210static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
211{
91b9a277 212 if (offset < KEYLENGTH)
19baf839 213 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 214 else
19baf839
RO
215 return 0;
216}
217
218static inline int tkey_equals(t_key a, t_key b)
219{
c877efb2 220 return a == b;
19baf839
RO
221}
222
223static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
224{
c877efb2
SH
225 if (bits == 0 || offset >= KEYLENGTH)
226 return 1;
91b9a277
OJ
227 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
228 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 229}
19baf839
RO
230
231static inline int tkey_mismatch(t_key a, int offset, t_key b)
232{
233 t_key diff = a ^ b;
234 int i = offset;
235
c877efb2
SH
236 if (!diff)
237 return 0;
238 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
239 i++;
240 return i;
241}
242
19baf839 243/*
e905a9ed
YH
244 To understand this stuff, an understanding of keys and all their bits is
245 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
246 all of the bits in that key are significant.
247
248 Consider a node 'n' and its parent 'tp'.
249
e905a9ed
YH
250 If n is a leaf, every bit in its key is significant. Its presence is
251 necessitated by path compression, since during a tree traversal (when
252 searching for a leaf - unless we are doing an insertion) we will completely
253 ignore all skipped bits we encounter. Thus we need to verify, at the end of
254 a potentially successful search, that we have indeed been walking the
19baf839
RO
255 correct key path.
256
e905a9ed
YH
257 Note that we can never "miss" the correct key in the tree if present by
258 following the wrong path. Path compression ensures that segments of the key
259 that are the same for all keys with a given prefix are skipped, but the
260 skipped part *is* identical for each node in the subtrie below the skipped
261 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
262 call to tkey_sub_equals() in trie_insert().
263
e905a9ed 264 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
265 have many different meanings.
266
e905a9ed 267 Example:
19baf839
RO
268 _________________________________________________________________
269 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
270 -----------------------------------------------------------------
e905a9ed 271 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
272
273 _________________________________________________________________
274 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
275 -----------------------------------------------------------------
276 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
277
278 tp->pos = 7
279 tp->bits = 3
280 n->pos = 15
91b9a277 281 n->bits = 4
19baf839 282
e905a9ed
YH
283 First, let's just ignore the bits that come before the parent tp, that is
284 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
285 not use them for anything.
286
287 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 288 index into the parent's child array. That is, they will be used to find
19baf839
RO
289 'n' among tp's children.
290
291 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
292 for the node n.
293
e905a9ed 294 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
295 of the bits are really not needed or indeed known in n->key.
296
e905a9ed 297 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 298 n's child array, and will of course be different for each child.
e905a9ed 299
c877efb2 300
19baf839
RO
301 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
302 at this point.
303
304*/
305
0c7770c7 306static inline void check_tnode(const struct tnode *tn)
19baf839 307{
0c7770c7 308 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
309}
310
f5026fab
DL
311static const int halve_threshold = 25;
312static const int inflate_threshold = 50;
313static const int halve_threshold_root = 8;
314static const int inflate_threshold_root = 15;
19baf839 315
2373ce1c
RO
316
317static void __alias_free_mem(struct rcu_head *head)
19baf839 318{
2373ce1c
RO
319 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
320 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
321}
322
2373ce1c 323static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 324{
2373ce1c
RO
325 call_rcu(&fa->rcu, __alias_free_mem);
326}
91b9a277 327
2373ce1c
RO
328static void __leaf_free_rcu(struct rcu_head *head)
329{
bc3c8c1e
SH
330 struct leaf *l = container_of(head, struct leaf, rcu);
331 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 332}
91b9a277 333
2373ce1c 334static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 335{
2373ce1c 336 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
337}
338
2373ce1c 339static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 340{
2373ce1c 341 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
342}
343
8d965444 344static struct tnode *tnode_alloc(size_t size)
f0e36f8c 345{
2373ce1c
RO
346 struct page *pages;
347
348 if (size <= PAGE_SIZE)
8d965444 349 return kzalloc(size, GFP_KERNEL);
2373ce1c
RO
350
351 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
352 if (!pages)
353 return NULL;
354
355 return page_address(pages);
f0e36f8c
PM
356}
357
2373ce1c 358static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 359{
2373ce1c 360 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
361 size_t size = sizeof(struct tnode) +
362 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
363
364 if (size <= PAGE_SIZE)
365 kfree(tn);
366 else
367 free_pages((unsigned long)tn, get_order(size));
368}
369
2373ce1c
RO
370static inline void tnode_free(struct tnode *tn)
371{
132adf54 372 if (IS_LEAF(tn)) {
550e29bc
RO
373 struct leaf *l = (struct leaf *) tn;
374 call_rcu_bh(&l->rcu, __leaf_free_rcu);
132adf54 375 } else
550e29bc 376 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
377}
378
379static struct leaf *leaf_new(void)
380{
bc3c8c1e 381 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
382 if (l) {
383 l->parent = T_LEAF;
384 INIT_HLIST_HEAD(&l->list);
385 }
386 return l;
387}
388
389static struct leaf_info *leaf_info_new(int plen)
390{
391 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
392 if (li) {
393 li->plen = plen;
394 INIT_LIST_HEAD(&li->falh);
395 }
396 return li;
397}
398
a07f5f50 399static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 400{
8d965444 401 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 402 struct tnode *tn = tnode_alloc(sz);
19baf839 403
91b9a277 404 if (tn) {
2373ce1c 405 tn->parent = T_TNODE;
19baf839
RO
406 tn->pos = pos;
407 tn->bits = bits;
408 tn->key = key;
409 tn->full_children = 0;
410 tn->empty_children = 1<<bits;
411 }
c877efb2 412
8d965444
ED
413 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
414 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
415 return tn;
416}
417
19baf839
RO
418/*
419 * Check whether a tnode 'n' is "full", i.e. it is an internal node
420 * and no bits are skipped. See discussion in dyntree paper p. 6
421 */
422
bb435b8d 423static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 424{
c877efb2 425 if (n == NULL || IS_LEAF(n))
19baf839
RO
426 return 0;
427
428 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
429}
430
a07f5f50
SH
431static inline void put_child(struct trie *t, struct tnode *tn, int i,
432 struct node *n)
19baf839
RO
433{
434 tnode_put_child_reorg(tn, i, n, -1);
435}
436
c877efb2 437 /*
19baf839
RO
438 * Add a child at position i overwriting the old value.
439 * Update the value of full_children and empty_children.
440 */
441
a07f5f50
SH
442static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
443 int wasfull)
19baf839 444{
2373ce1c 445 struct node *chi = tn->child[i];
19baf839
RO
446 int isfull;
447
0c7770c7
SH
448 BUG_ON(i >= 1<<tn->bits);
449
19baf839
RO
450
451 /* update emptyChildren */
452 if (n == NULL && chi != NULL)
453 tn->empty_children++;
454 else if (n != NULL && chi == NULL)
455 tn->empty_children--;
c877efb2 456
19baf839 457 /* update fullChildren */
91b9a277 458 if (wasfull == -1)
19baf839
RO
459 wasfull = tnode_full(tn, chi);
460
461 isfull = tnode_full(tn, n);
c877efb2 462 if (wasfull && !isfull)
19baf839 463 tn->full_children--;
c877efb2 464 else if (!wasfull && isfull)
19baf839 465 tn->full_children++;
91b9a277 466
c877efb2 467 if (n)
06801916 468 node_set_parent(n, tn);
19baf839 469
2373ce1c 470 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
471}
472
c877efb2 473static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
474{
475 int i;
2f36895a 476 int err = 0;
2f80b3c8 477 struct tnode *old_tn;
e6308be8
RO
478 int inflate_threshold_use;
479 int halve_threshold_use;
05eee48c 480 int max_resize;
19baf839 481
e905a9ed 482 if (!tn)
19baf839
RO
483 return NULL;
484
0c7770c7
SH
485 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
486 tn, inflate_threshold, halve_threshold);
19baf839
RO
487
488 /* No children */
489 if (tn->empty_children == tnode_child_length(tn)) {
490 tnode_free(tn);
491 return NULL;
492 }
493 /* One child */
494 if (tn->empty_children == tnode_child_length(tn) - 1)
495 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 496 struct node *n;
19baf839 497
91b9a277 498 n = tn->child[i];
2373ce1c 499 if (!n)
91b9a277 500 continue;
91b9a277
OJ
501
502 /* compress one level */
06801916 503 node_set_parent(n, NULL);
91b9a277
OJ
504 tnode_free(tn);
505 return n;
19baf839 506 }
c877efb2 507 /*
19baf839
RO
508 * Double as long as the resulting node has a number of
509 * nonempty nodes that are above the threshold.
510 */
511
512 /*
c877efb2
SH
513 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
514 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 515 * Telecommunications, page 6:
c877efb2 516 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
517 * children in the *doubled* node is at least 'high'."
518 *
c877efb2
SH
519 * 'high' in this instance is the variable 'inflate_threshold'. It
520 * is expressed as a percentage, so we multiply it with
521 * tnode_child_length() and instead of multiplying by 2 (since the
522 * child array will be doubled by inflate()) and multiplying
523 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 524 * multiply the left-hand side by 50.
c877efb2
SH
525 *
526 * The left-hand side may look a bit weird: tnode_child_length(tn)
527 * - tn->empty_children is of course the number of non-null children
528 * in the current node. tn->full_children is the number of "full"
19baf839 529 * children, that is non-null tnodes with a skip value of 0.
c877efb2 530 * All of those will be doubled in the resulting inflated tnode, so
19baf839 531 * we just count them one extra time here.
c877efb2 532 *
19baf839 533 * A clearer way to write this would be:
c877efb2 534 *
19baf839 535 * to_be_doubled = tn->full_children;
c877efb2 536 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
537 * tn->full_children;
538 *
539 * new_child_length = tnode_child_length(tn) * 2;
540 *
c877efb2 541 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
542 * new_child_length;
543 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
544 *
545 * ...and so on, tho it would mess up the while () loop.
546 *
19baf839
RO
547 * anyway,
548 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
549 * inflate_threshold
c877efb2 550 *
19baf839
RO
551 * avoid a division:
552 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
553 * inflate_threshold * new_child_length
c877efb2 554 *
19baf839 555 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 556 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 557 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 558 *
19baf839 559 * expand new_child_length:
c877efb2 560 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 561 * tn->full_children) >=
19baf839 562 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 563 *
19baf839 564 * shorten again:
c877efb2 565 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 566 * tn->empty_children) >= inflate_threshold *
19baf839 567 * tnode_child_length(tn)
c877efb2 568 *
19baf839
RO
569 */
570
571 check_tnode(tn);
c877efb2 572
e6308be8
RO
573 /* Keep root node larger */
574
132adf54 575 if (!tn->parent)
e6308be8 576 inflate_threshold_use = inflate_threshold_root;
e905a9ed 577 else
e6308be8
RO
578 inflate_threshold_use = inflate_threshold;
579
2f36895a 580 err = 0;
05eee48c
RO
581 max_resize = 10;
582 while ((tn->full_children > 0 && max_resize-- &&
a07f5f50
SH
583 50 * (tn->full_children + tnode_child_length(tn)
584 - tn->empty_children)
585 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 586
2f80b3c8
RO
587 old_tn = tn;
588 tn = inflate(t, tn);
a07f5f50 589
2f80b3c8
RO
590 if (IS_ERR(tn)) {
591 tn = old_tn;
2f36895a
RO
592#ifdef CONFIG_IP_FIB_TRIE_STATS
593 t->stats.resize_node_skipped++;
594#endif
595 break;
596 }
19baf839
RO
597 }
598
05eee48c
RO
599 if (max_resize < 0) {
600 if (!tn->parent)
a07f5f50
SH
601 pr_warning("Fix inflate_threshold_root."
602 " Now=%d size=%d bits\n",
603 inflate_threshold_root, tn->bits);
05eee48c 604 else
a07f5f50
SH
605 pr_warning("Fix inflate_threshold."
606 " Now=%d size=%d bits\n",
607 inflate_threshold, tn->bits);
05eee48c
RO
608 }
609
19baf839
RO
610 check_tnode(tn);
611
612 /*
613 * Halve as long as the number of empty children in this
614 * node is above threshold.
615 */
2f36895a 616
e6308be8
RO
617
618 /* Keep root node larger */
619
132adf54 620 if (!tn->parent)
e6308be8 621 halve_threshold_use = halve_threshold_root;
e905a9ed 622 else
e6308be8
RO
623 halve_threshold_use = halve_threshold;
624
2f36895a 625 err = 0;
05eee48c
RO
626 max_resize = 10;
627 while (tn->bits > 1 && max_resize-- &&
19baf839 628 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 629 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 630
2f80b3c8
RO
631 old_tn = tn;
632 tn = halve(t, tn);
633 if (IS_ERR(tn)) {
634 tn = old_tn;
2f36895a
RO
635#ifdef CONFIG_IP_FIB_TRIE_STATS
636 t->stats.resize_node_skipped++;
637#endif
638 break;
639 }
640 }
19baf839 641
05eee48c
RO
642 if (max_resize < 0) {
643 if (!tn->parent)
a07f5f50
SH
644 pr_warning("Fix halve_threshold_root."
645 " Now=%d size=%d bits\n",
646 halve_threshold_root, tn->bits);
05eee48c 647 else
a07f5f50
SH
648 pr_warning("Fix halve_threshold."
649 " Now=%d size=%d bits\n",
650 halve_threshold, tn->bits);
05eee48c 651 }
c877efb2 652
19baf839 653 /* Only one child remains */
19baf839
RO
654 if (tn->empty_children == tnode_child_length(tn) - 1)
655 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 656 struct node *n;
19baf839 657
91b9a277 658 n = tn->child[i];
2373ce1c 659 if (!n)
91b9a277 660 continue;
91b9a277
OJ
661
662 /* compress one level */
663
06801916 664 node_set_parent(n, NULL);
91b9a277
OJ
665 tnode_free(tn);
666 return n;
19baf839
RO
667 }
668
669 return (struct node *) tn;
670}
671
2f80b3c8 672static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 673{
19baf839
RO
674 struct tnode *oldtnode = tn;
675 int olen = tnode_child_length(tn);
676 int i;
677
0c7770c7 678 pr_debug("In inflate\n");
19baf839
RO
679
680 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
681
0c7770c7 682 if (!tn)
2f80b3c8 683 return ERR_PTR(-ENOMEM);
2f36895a
RO
684
685 /*
c877efb2
SH
686 * Preallocate and store tnodes before the actual work so we
687 * don't get into an inconsistent state if memory allocation
688 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
689 * of tnode is ignored.
690 */
91b9a277
OJ
691
692 for (i = 0; i < olen; i++) {
a07f5f50 693 struct tnode *inode;
2f36895a 694
a07f5f50 695 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
696 if (inode &&
697 IS_TNODE(inode) &&
698 inode->pos == oldtnode->pos + oldtnode->bits &&
699 inode->bits > 1) {
700 struct tnode *left, *right;
ab66b4a7 701 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 702
2f36895a
RO
703 left = tnode_new(inode->key&(~m), inode->pos + 1,
704 inode->bits - 1);
2f80b3c8
RO
705 if (!left)
706 goto nomem;
91b9a277 707
2f36895a
RO
708 right = tnode_new(inode->key|m, inode->pos + 1,
709 inode->bits - 1);
710
e905a9ed 711 if (!right) {
2f80b3c8
RO
712 tnode_free(left);
713 goto nomem;
e905a9ed 714 }
2f36895a
RO
715
716 put_child(t, tn, 2*i, (struct node *) left);
717 put_child(t, tn, 2*i+1, (struct node *) right);
718 }
719 }
720
91b9a277 721 for (i = 0; i < olen; i++) {
c95aaf9a 722 struct tnode *inode;
19baf839 723 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
724 struct tnode *left, *right;
725 int size, j;
c877efb2 726
19baf839
RO
727 /* An empty child */
728 if (node == NULL)
729 continue;
730
731 /* A leaf or an internal node with skipped bits */
732
c877efb2 733 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 734 tn->pos + tn->bits - 1) {
a07f5f50
SH
735 if (tkey_extract_bits(node->key,
736 oldtnode->pos + oldtnode->bits,
737 1) == 0)
19baf839
RO
738 put_child(t, tn, 2*i, node);
739 else
740 put_child(t, tn, 2*i+1, node);
741 continue;
742 }
743
744 /* An internal node with two children */
745 inode = (struct tnode *) node;
746
747 if (inode->bits == 1) {
748 put_child(t, tn, 2*i, inode->child[0]);
749 put_child(t, tn, 2*i+1, inode->child[1]);
750
751 tnode_free(inode);
91b9a277 752 continue;
19baf839
RO
753 }
754
91b9a277
OJ
755 /* An internal node with more than two children */
756
757 /* We will replace this node 'inode' with two new
758 * ones, 'left' and 'right', each with half of the
759 * original children. The two new nodes will have
760 * a position one bit further down the key and this
761 * means that the "significant" part of their keys
762 * (see the discussion near the top of this file)
763 * will differ by one bit, which will be "0" in
764 * left's key and "1" in right's key. Since we are
765 * moving the key position by one step, the bit that
766 * we are moving away from - the bit at position
767 * (inode->pos) - is the one that will differ between
768 * left and right. So... we synthesize that bit in the
769 * two new keys.
770 * The mask 'm' below will be a single "one" bit at
771 * the position (inode->pos)
772 */
19baf839 773
91b9a277
OJ
774 /* Use the old key, but set the new significant
775 * bit to zero.
776 */
2f36895a 777
91b9a277
OJ
778 left = (struct tnode *) tnode_get_child(tn, 2*i);
779 put_child(t, tn, 2*i, NULL);
2f36895a 780
91b9a277 781 BUG_ON(!left);
2f36895a 782
91b9a277
OJ
783 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
784 put_child(t, tn, 2*i+1, NULL);
19baf839 785
91b9a277 786 BUG_ON(!right);
19baf839 787
91b9a277
OJ
788 size = tnode_child_length(left);
789 for (j = 0; j < size; j++) {
790 put_child(t, left, j, inode->child[j]);
791 put_child(t, right, j, inode->child[j + size]);
19baf839 792 }
91b9a277
OJ
793 put_child(t, tn, 2*i, resize(t, left));
794 put_child(t, tn, 2*i+1, resize(t, right));
795
796 tnode_free(inode);
19baf839
RO
797 }
798 tnode_free(oldtnode);
799 return tn;
2f80b3c8
RO
800nomem:
801 {
802 int size = tnode_child_length(tn);
803 int j;
804
0c7770c7 805 for (j = 0; j < size; j++)
2f80b3c8
RO
806 if (tn->child[j])
807 tnode_free((struct tnode *)tn->child[j]);
808
809 tnode_free(tn);
0c7770c7 810
2f80b3c8
RO
811 return ERR_PTR(-ENOMEM);
812 }
19baf839
RO
813}
814
2f80b3c8 815static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
816{
817 struct tnode *oldtnode = tn;
818 struct node *left, *right;
819 int i;
820 int olen = tnode_child_length(tn);
821
0c7770c7 822 pr_debug("In halve\n");
c877efb2
SH
823
824 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 825
2f80b3c8
RO
826 if (!tn)
827 return ERR_PTR(-ENOMEM);
2f36895a
RO
828
829 /*
c877efb2
SH
830 * Preallocate and store tnodes before the actual work so we
831 * don't get into an inconsistent state if memory allocation
832 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
833 * of tnode is ignored.
834 */
835
91b9a277 836 for (i = 0; i < olen; i += 2) {
2f36895a
RO
837 left = tnode_get_child(oldtnode, i);
838 right = tnode_get_child(oldtnode, i+1);
c877efb2 839
2f36895a 840 /* Two nonempty children */
0c7770c7 841 if (left && right) {
2f80b3c8 842 struct tnode *newn;
0c7770c7 843
2f80b3c8 844 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
845
846 if (!newn)
2f80b3c8 847 goto nomem;
0c7770c7 848
2f80b3c8 849 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 850 }
2f36895a 851
2f36895a 852 }
19baf839 853
91b9a277
OJ
854 for (i = 0; i < olen; i += 2) {
855 struct tnode *newBinNode;
856
19baf839
RO
857 left = tnode_get_child(oldtnode, i);
858 right = tnode_get_child(oldtnode, i+1);
c877efb2 859
19baf839
RO
860 /* At least one of the children is empty */
861 if (left == NULL) {
862 if (right == NULL) /* Both are empty */
863 continue;
864 put_child(t, tn, i/2, right);
91b9a277 865 continue;
0c7770c7 866 }
91b9a277
OJ
867
868 if (right == NULL) {
19baf839 869 put_child(t, tn, i/2, left);
91b9a277
OJ
870 continue;
871 }
c877efb2 872
19baf839 873 /* Two nonempty children */
91b9a277
OJ
874 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
875 put_child(t, tn, i/2, NULL);
91b9a277
OJ
876 put_child(t, newBinNode, 0, left);
877 put_child(t, newBinNode, 1, right);
878 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
879 }
880 tnode_free(oldtnode);
881 return tn;
2f80b3c8
RO
882nomem:
883 {
884 int size = tnode_child_length(tn);
885 int j;
886
0c7770c7 887 for (j = 0; j < size; j++)
2f80b3c8
RO
888 if (tn->child[j])
889 tnode_free((struct tnode *)tn->child[j]);
890
891 tnode_free(tn);
0c7770c7 892
2f80b3c8
RO
893 return ERR_PTR(-ENOMEM);
894 }
19baf839
RO
895}
896
772cb712 897/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
898 via get_fa_head and dump */
899
772cb712 900static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 901{
772cb712 902 struct hlist_head *head = &l->list;
19baf839
RO
903 struct hlist_node *node;
904 struct leaf_info *li;
905
2373ce1c 906 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 907 if (li->plen == plen)
19baf839 908 return li;
91b9a277 909
19baf839
RO
910 return NULL;
911}
912
a07f5f50 913static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 914{
772cb712 915 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 916
91b9a277
OJ
917 if (!li)
918 return NULL;
c877efb2 919
91b9a277 920 return &li->falh;
19baf839
RO
921}
922
923static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
924{
e905a9ed
YH
925 struct leaf_info *li = NULL, *last = NULL;
926 struct hlist_node *node;
927
928 if (hlist_empty(head)) {
929 hlist_add_head_rcu(&new->hlist, head);
930 } else {
931 hlist_for_each_entry(li, node, head, hlist) {
932 if (new->plen > li->plen)
933 break;
934
935 last = li;
936 }
937 if (last)
938 hlist_add_after_rcu(&last->hlist, &new->hlist);
939 else
940 hlist_add_before_rcu(&new->hlist, &li->hlist);
941 }
19baf839
RO
942}
943
2373ce1c
RO
944/* rcu_read_lock needs to be hold by caller from readside */
945
19baf839
RO
946static struct leaf *
947fib_find_node(struct trie *t, u32 key)
948{
949 int pos;
950 struct tnode *tn;
951 struct node *n;
952
953 pos = 0;
2373ce1c 954 n = rcu_dereference(t->trie);
19baf839
RO
955
956 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
957 tn = (struct tnode *) n;
91b9a277 958
19baf839 959 check_tnode(tn);
91b9a277 960
c877efb2 961 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 962 pos = tn->pos + tn->bits;
a07f5f50
SH
963 n = tnode_get_child_rcu(tn,
964 tkey_extract_bits(key,
965 tn->pos,
966 tn->bits));
91b9a277 967 } else
19baf839
RO
968 break;
969 }
970 /* Case we have found a leaf. Compare prefixes */
971
91b9a277
OJ
972 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
973 return (struct leaf *)n;
974
19baf839
RO
975 return NULL;
976}
977
978static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
979{
19baf839 980 int wasfull;
06801916
SH
981 t_key cindex, key = tn->key;
982 struct tnode *tp;
19baf839 983
06801916 984 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
985 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
986 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
987 tn = (struct tnode *) resize(t, (struct tnode *)tn);
988
989 tnode_put_child_reorg((struct tnode *)tp, cindex,
990 (struct node *)tn, wasfull);
91b9a277 991
06801916
SH
992 tp = node_parent((struct node *) tn);
993 if (!tp)
19baf839 994 break;
06801916 995 tn = tp;
19baf839 996 }
06801916 997
19baf839 998 /* Handle last (top) tnode */
c877efb2 999 if (IS_TNODE(tn))
a07f5f50 1000 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1001
a07f5f50 1002 return (struct node *)tn;
19baf839
RO
1003}
1004
2373ce1c
RO
1005/* only used from updater-side */
1006
fea86ad8 1007static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1008{
1009 int pos, newpos;
1010 struct tnode *tp = NULL, *tn = NULL;
1011 struct node *n;
1012 struct leaf *l;
1013 int missbit;
c877efb2 1014 struct list_head *fa_head = NULL;
19baf839
RO
1015 struct leaf_info *li;
1016 t_key cindex;
1017
1018 pos = 0;
c877efb2 1019 n = t->trie;
19baf839 1020
c877efb2
SH
1021 /* If we point to NULL, stop. Either the tree is empty and we should
1022 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1023 * and we should just put our new leaf in that.
c877efb2
SH
1024 * If we point to a T_TNODE, check if it matches our key. Note that
1025 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1026 * not be the parent's 'pos'+'bits'!
1027 *
c877efb2 1028 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1029 * the index from our key, push the T_TNODE and walk the tree.
1030 *
1031 * If it doesn't, we have to replace it with a new T_TNODE.
1032 *
c877efb2
SH
1033 * If we point to a T_LEAF, it might or might not have the same key
1034 * as we do. If it does, just change the value, update the T_LEAF's
1035 * value, and return it.
19baf839
RO
1036 * If it doesn't, we need to replace it with a T_TNODE.
1037 */
1038
1039 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1040 tn = (struct tnode *) n;
91b9a277 1041
c877efb2 1042 check_tnode(tn);
91b9a277 1043
c877efb2 1044 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1045 tp = tn;
91b9a277 1046 pos = tn->pos + tn->bits;
a07f5f50
SH
1047 n = tnode_get_child(tn,
1048 tkey_extract_bits(key,
1049 tn->pos,
1050 tn->bits));
19baf839 1051
06801916 1052 BUG_ON(n && node_parent(n) != tn);
91b9a277 1053 } else
19baf839
RO
1054 break;
1055 }
1056
1057 /*
1058 * n ----> NULL, LEAF or TNODE
1059 *
c877efb2 1060 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1061 */
1062
91b9a277 1063 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1064
1065 /* Case 1: n is a leaf. Compare prefixes */
1066
c877efb2 1067 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1068 l = (struct leaf *) n;
19baf839 1069 li = leaf_info_new(plen);
91b9a277 1070
fea86ad8
SH
1071 if (!li)
1072 return NULL;
19baf839
RO
1073
1074 fa_head = &li->falh;
1075 insert_leaf_info(&l->list, li);
1076 goto done;
1077 }
19baf839
RO
1078 l = leaf_new();
1079
fea86ad8
SH
1080 if (!l)
1081 return NULL;
19baf839
RO
1082
1083 l->key = key;
1084 li = leaf_info_new(plen);
1085
c877efb2 1086 if (!li) {
f835e471 1087 tnode_free((struct tnode *) l);
fea86ad8 1088 return NULL;
f835e471 1089 }
19baf839
RO
1090
1091 fa_head = &li->falh;
1092 insert_leaf_info(&l->list, li);
1093
19baf839 1094 if (t->trie && n == NULL) {
91b9a277 1095 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1096
06801916 1097 node_set_parent((struct node *)l, tp);
19baf839 1098
91b9a277
OJ
1099 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1100 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1101 } else {
1102 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1103 /*
1104 * Add a new tnode here
19baf839
RO
1105 * first tnode need some special handling
1106 */
1107
1108 if (tp)
91b9a277 1109 pos = tp->pos+tp->bits;
19baf839 1110 else
91b9a277
OJ
1111 pos = 0;
1112
c877efb2 1113 if (n) {
19baf839
RO
1114 newpos = tkey_mismatch(key, pos, n->key);
1115 tn = tnode_new(n->key, newpos, 1);
91b9a277 1116 } else {
19baf839 1117 newpos = 0;
c877efb2 1118 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1119 }
19baf839 1120
c877efb2 1121 if (!tn) {
f835e471
RO
1122 free_leaf_info(li);
1123 tnode_free((struct tnode *) l);
fea86ad8 1124 return NULL;
91b9a277
OJ
1125 }
1126
06801916 1127 node_set_parent((struct node *)tn, tp);
19baf839 1128
91b9a277 1129 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1130 put_child(t, tn, missbit, (struct node *)l);
1131 put_child(t, tn, 1-missbit, n);
1132
c877efb2 1133 if (tp) {
19baf839 1134 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1135 put_child(t, (struct tnode *)tp, cindex,
1136 (struct node *)tn);
91b9a277 1137 } else {
a07f5f50 1138 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1139 tp = tn;
1140 }
1141 }
91b9a277
OJ
1142
1143 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1144 pr_warning("fib_trie"
1145 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1146 tp, tp->pos, tp->bits, key, plen);
91b9a277 1147
19baf839 1148 /* Rebalance the trie */
2373ce1c
RO
1149
1150 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471 1151done:
19baf839
RO
1152 return fa_head;
1153}
1154
d562f1f8
RO
1155/*
1156 * Caller must hold RTNL.
1157 */
4e902c57 1158static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1159{
1160 struct trie *t = (struct trie *) tb->tb_data;
1161 struct fib_alias *fa, *new_fa;
c877efb2 1162 struct list_head *fa_head = NULL;
19baf839 1163 struct fib_info *fi;
4e902c57
TG
1164 int plen = cfg->fc_dst_len;
1165 u8 tos = cfg->fc_tos;
19baf839
RO
1166 u32 key, mask;
1167 int err;
1168 struct leaf *l;
1169
1170 if (plen > 32)
1171 return -EINVAL;
1172
4e902c57 1173 key = ntohl(cfg->fc_dst);
19baf839 1174
2dfe55b4 1175 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1176
91b9a277 1177 mask = ntohl(inet_make_mask(plen));
19baf839 1178
c877efb2 1179 if (key & ~mask)
19baf839
RO
1180 return -EINVAL;
1181
1182 key = key & mask;
1183
4e902c57
TG
1184 fi = fib_create_info(cfg);
1185 if (IS_ERR(fi)) {
1186 err = PTR_ERR(fi);
19baf839 1187 goto err;
4e902c57 1188 }
19baf839
RO
1189
1190 l = fib_find_node(t, key);
c877efb2 1191 fa = NULL;
19baf839 1192
c877efb2 1193 if (l) {
19baf839
RO
1194 fa_head = get_fa_head(l, plen);
1195 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1196 }
1197
1198 /* Now fa, if non-NULL, points to the first fib alias
1199 * with the same keys [prefix,tos,priority], if such key already
1200 * exists or to the node before which we will insert new one.
1201 *
1202 * If fa is NULL, we will need to allocate a new one and
1203 * insert to the head of f.
1204 *
1205 * If f is NULL, no fib node matched the destination key
1206 * and we need to allocate a new one of those as well.
1207 */
1208
91b9a277 1209 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1210 struct fib_alias *fa_orig;
1211
1212 err = -EEXIST;
4e902c57 1213 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1214 goto out;
1215
4e902c57 1216 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1217 struct fib_info *fi_drop;
1218 u8 state;
1219
6725033f
JP
1220 if (fi->fib_treeref > 1)
1221 goto out;
1222
2373ce1c 1223 err = -ENOBUFS;
e94b1766 1224 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1225 if (new_fa == NULL)
1226 goto out;
19baf839
RO
1227
1228 fi_drop = fa->fa_info;
2373ce1c
RO
1229 new_fa->fa_tos = fa->fa_tos;
1230 new_fa->fa_info = fi;
4e902c57
TG
1231 new_fa->fa_type = cfg->fc_type;
1232 new_fa->fa_scope = cfg->fc_scope;
19baf839 1233 state = fa->fa_state;
2373ce1c 1234 new_fa->fa_state &= ~FA_S_ACCESSED;
19baf839 1235
2373ce1c
RO
1236 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1237 alias_free_mem_rcu(fa);
19baf839
RO
1238
1239 fib_release_info(fi_drop);
1240 if (state & FA_S_ACCESSED)
91b9a277 1241 rt_cache_flush(-1);
b8f55831
MK
1242 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1243 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1244
91b9a277 1245 goto succeeded;
19baf839
RO
1246 }
1247 /* Error if we find a perfect match which
1248 * uses the same scope, type, and nexthop
1249 * information.
1250 */
1251 fa_orig = fa;
1252 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1253 if (fa->fa_tos != tos)
1254 break;
1255 if (fa->fa_info->fib_priority != fi->fib_priority)
1256 break;
4e902c57
TG
1257 if (fa->fa_type == cfg->fc_type &&
1258 fa->fa_scope == cfg->fc_scope &&
a07f5f50 1259 fa->fa_info == fi)
19baf839 1260 goto out;
19baf839 1261 }
a07f5f50 1262
4e902c57 1263 if (!(cfg->fc_nlflags & NLM_F_APPEND))
19baf839
RO
1264 fa = fa_orig;
1265 }
1266 err = -ENOENT;
4e902c57 1267 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1268 goto out;
1269
1270 err = -ENOBUFS;
e94b1766 1271 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1272 if (new_fa == NULL)
1273 goto out;
1274
1275 new_fa->fa_info = fi;
1276 new_fa->fa_tos = tos;
4e902c57
TG
1277 new_fa->fa_type = cfg->fc_type;
1278 new_fa->fa_scope = cfg->fc_scope;
19baf839 1279 new_fa->fa_state = 0;
19baf839
RO
1280 /*
1281 * Insert new entry to the list.
1282 */
1283
c877efb2 1284 if (!fa_head) {
fea86ad8
SH
1285 fa_head = fib_insert_node(t, key, plen);
1286 if (unlikely(!fa_head)) {
1287 err = -ENOMEM;
f835e471 1288 goto out_free_new_fa;
fea86ad8 1289 }
f835e471 1290 }
19baf839 1291
2373ce1c
RO
1292 list_add_tail_rcu(&new_fa->fa_list,
1293 (fa ? &fa->fa_list : fa_head));
19baf839
RO
1294
1295 rt_cache_flush(-1);
4e902c57 1296 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1297 &cfg->fc_nlinfo, 0);
19baf839
RO
1298succeeded:
1299 return 0;
f835e471
RO
1300
1301out_free_new_fa:
1302 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1303out:
1304 fib_release_info(fi);
91b9a277 1305err:
19baf839
RO
1306 return err;
1307}
1308
2373ce1c 1309
772cb712 1310/* should be called with rcu_read_lock */
a07f5f50
SH
1311static int check_leaf(struct trie *t, struct leaf *l,
1312 t_key key, const struct flowi *flp,
1313 struct fib_result *res)
19baf839 1314{
19baf839
RO
1315 struct leaf_info *li;
1316 struct hlist_head *hhead = &l->list;
1317 struct hlist_node *node;
c877efb2 1318
2373ce1c 1319 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1320 int err;
1321 int plen = li->plen;
1322 __be32 mask = inet_make_mask(plen);
1323
888454c5 1324 if (l->key != (key & ntohl(mask)))
19baf839
RO
1325 continue;
1326
a07f5f50
SH
1327 err = fib_semantic_match(&li->falh, flp, res,
1328 htonl(l->key), mask, plen);
1329
19baf839 1330#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1331 if (err <= 0)
19baf839 1332 t->stats.semantic_match_passed++;
a07f5f50
SH
1333 else
1334 t->stats.semantic_match_miss++;
19baf839 1335#endif
a07f5f50
SH
1336 if (err <= 0)
1337 return plen;
19baf839 1338 }
a07f5f50
SH
1339
1340 return -1;
19baf839
RO
1341}
1342
a07f5f50
SH
1343static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1344 struct fib_result *res)
19baf839
RO
1345{
1346 struct trie *t = (struct trie *) tb->tb_data;
1347 int plen, ret = 0;
1348 struct node *n;
1349 struct tnode *pn;
1350 int pos, bits;
91b9a277 1351 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1352 int chopped_off;
1353 t_key cindex = 0;
1354 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1355 struct tnode *cn;
1356 t_key node_prefix, key_prefix, pref_mismatch;
1357 int mp;
1358
2373ce1c 1359 rcu_read_lock();
91b9a277 1360
2373ce1c 1361 n = rcu_dereference(t->trie);
c877efb2 1362 if (!n)
19baf839
RO
1363 goto failed;
1364
1365#ifdef CONFIG_IP_FIB_TRIE_STATS
1366 t->stats.gets++;
1367#endif
1368
1369 /* Just a leaf? */
1370 if (IS_LEAF(n)) {
a07f5f50
SH
1371 plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1372 if (plen < 0)
1373 goto failed;
1374 ret = 0;
1375 goto found;
19baf839 1376 }
a07f5f50 1377
19baf839
RO
1378 pn = (struct tnode *) n;
1379 chopped_off = 0;
c877efb2 1380
91b9a277 1381 while (pn) {
19baf839
RO
1382 pos = pn->pos;
1383 bits = pn->bits;
1384
c877efb2 1385 if (!chopped_off)
ab66b4a7
SH
1386 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1387 pos, bits);
19baf839
RO
1388
1389 n = tnode_get_child(pn, cindex);
1390
1391 if (n == NULL) {
1392#ifdef CONFIG_IP_FIB_TRIE_STATS
1393 t->stats.null_node_hit++;
1394#endif
1395 goto backtrace;
1396 }
1397
91b9a277 1398 if (IS_LEAF(n)) {
a07f5f50
SH
1399 plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1400 if (plen < 0)
91b9a277 1401 goto backtrace;
a07f5f50
SH
1402
1403 ret = 0;
1404 goto found;
91b9a277
OJ
1405 }
1406
91b9a277 1407 cn = (struct tnode *)n;
19baf839 1408
91b9a277
OJ
1409 /*
1410 * It's a tnode, and we can do some extra checks here if we
1411 * like, to avoid descending into a dead-end branch.
1412 * This tnode is in the parent's child array at index
1413 * key[p_pos..p_pos+p_bits] but potentially with some bits
1414 * chopped off, so in reality the index may be just a
1415 * subprefix, padded with zero at the end.
1416 * We can also take a look at any skipped bits in this
1417 * tnode - everything up to p_pos is supposed to be ok,
1418 * and the non-chopped bits of the index (se previous
1419 * paragraph) are also guaranteed ok, but the rest is
1420 * considered unknown.
1421 *
1422 * The skipped bits are key[pos+bits..cn->pos].
1423 */
19baf839 1424
91b9a277
OJ
1425 /* If current_prefix_length < pos+bits, we are already doing
1426 * actual prefix matching, which means everything from
1427 * pos+(bits-chopped_off) onward must be zero along some
1428 * branch of this subtree - otherwise there is *no* valid
1429 * prefix present. Here we can only check the skipped
1430 * bits. Remember, since we have already indexed into the
1431 * parent's child array, we know that the bits we chopped of
1432 * *are* zero.
1433 */
19baf839 1434
a07f5f50
SH
1435 /* NOTA BENE: Checking only skipped bits
1436 for the new node here */
19baf839 1437
91b9a277
OJ
1438 if (current_prefix_length < pos+bits) {
1439 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1440 cn->pos - current_prefix_length)
1441 || !(cn->child[0]))
91b9a277
OJ
1442 goto backtrace;
1443 }
19baf839 1444
91b9a277
OJ
1445 /*
1446 * If chopped_off=0, the index is fully validated and we
1447 * only need to look at the skipped bits for this, the new,
1448 * tnode. What we actually want to do is to find out if
1449 * these skipped bits match our key perfectly, or if we will
1450 * have to count on finding a matching prefix further down,
1451 * because if we do, we would like to have some way of
1452 * verifying the existence of such a prefix at this point.
1453 */
19baf839 1454
91b9a277
OJ
1455 /* The only thing we can do at this point is to verify that
1456 * any such matching prefix can indeed be a prefix to our
1457 * key, and if the bits in the node we are inspecting that
1458 * do not match our key are not ZERO, this cannot be true.
1459 * Thus, find out where there is a mismatch (before cn->pos)
1460 * and verify that all the mismatching bits are zero in the
1461 * new tnode's key.
1462 */
19baf839 1463
a07f5f50
SH
1464 /*
1465 * Note: We aren't very concerned about the piece of
1466 * the key that precede pn->pos+pn->bits, since these
1467 * have already been checked. The bits after cn->pos
1468 * aren't checked since these are by definition
1469 * "unknown" at this point. Thus, what we want to see
1470 * is if we are about to enter the "prefix matching"
1471 * state, and in that case verify that the skipped
1472 * bits that will prevail throughout this subtree are
1473 * zero, as they have to be if we are to find a
1474 * matching prefix.
91b9a277
OJ
1475 */
1476
ab66b4a7
SH
1477 node_prefix = mask_pfx(cn->key, cn->pos);
1478 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1479 pref_mismatch = key_prefix^node_prefix;
1480 mp = 0;
1481
a07f5f50
SH
1482 /*
1483 * In short: If skipped bits in this node do not match
1484 * the search key, enter the "prefix matching"
1485 * state.directly.
91b9a277
OJ
1486 */
1487 if (pref_mismatch) {
1488 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1489 mp++;
a07f5f50 1490 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1491 }
1492 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1493
1494 if (key_prefix != 0)
1495 goto backtrace;
1496
1497 if (current_prefix_length >= cn->pos)
1498 current_prefix_length = mp;
c877efb2 1499 }
a07f5f50 1500
91b9a277
OJ
1501 pn = (struct tnode *)n; /* Descend */
1502 chopped_off = 0;
1503 continue;
1504
19baf839
RO
1505backtrace:
1506 chopped_off++;
1507
1508 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1509 while ((chopped_off <= pn->bits)
1510 && !(cindex & (1<<(chopped_off-1))))
19baf839 1511 chopped_off++;
19baf839
RO
1512
1513 /* Decrease current_... with bits chopped off */
1514 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1515 current_prefix_length = pn->pos + pn->bits
1516 - chopped_off;
91b9a277 1517
19baf839 1518 /*
c877efb2 1519 * Either we do the actual chop off according or if we have
19baf839
RO
1520 * chopped off all bits in this tnode walk up to our parent.
1521 */
1522
91b9a277 1523 if (chopped_off <= pn->bits) {
19baf839 1524 cindex &= ~(1 << (chopped_off-1));
91b9a277 1525 } else {
06801916
SH
1526 struct tnode *parent = node_parent((struct node *) pn);
1527 if (!parent)
19baf839 1528 goto failed;
91b9a277 1529
19baf839 1530 /* Get Child's index */
06801916
SH
1531 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1532 pn = parent;
19baf839
RO
1533 chopped_off = 0;
1534
1535#ifdef CONFIG_IP_FIB_TRIE_STATS
1536 t->stats.backtrack++;
1537#endif
1538 goto backtrace;
c877efb2 1539 }
19baf839
RO
1540 }
1541failed:
c877efb2 1542 ret = 1;
19baf839 1543found:
2373ce1c 1544 rcu_read_unlock();
19baf839
RO
1545 return ret;
1546}
1547
2373ce1c 1548/* only called from updater side */
19baf839
RO
1549static int trie_leaf_remove(struct trie *t, t_key key)
1550{
1551 t_key cindex;
1552 struct tnode *tp = NULL;
1553 struct node *n = t->trie;
1554 struct leaf *l;
1555
0c7770c7 1556 pr_debug("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1557
1558 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1559 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1560 * T_LEAF may or may not match our key.
1561 */
1562
91b9a277 1563 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1564 struct tnode *tn = (struct tnode *) n;
1565 check_tnode(tn);
a07f5f50
SH
1566 n = tnode_get_child(tn, tkey_extract_bits(key,
1567 tn->pos, tn->bits));
19baf839 1568
06801916 1569 BUG_ON(n && node_parent(n) != tn);
91b9a277 1570 }
19baf839
RO
1571 l = (struct leaf *) n;
1572
c877efb2 1573 if (!n || !tkey_equals(l->key, key))
19baf839 1574 return 0;
c877efb2
SH
1575
1576 /*
1577 * Key found.
1578 * Remove the leaf and rebalance the tree
19baf839 1579 */
06801916 1580 tp = node_parent(n);
19baf839
RO
1581 tnode_free((struct tnode *) n);
1582
c877efb2 1583 if (tp) {
19baf839
RO
1584 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1585 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1586 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1587 } else
2373ce1c 1588 rcu_assign_pointer(t->trie, NULL);
19baf839
RO
1589
1590 return 1;
1591}
1592
d562f1f8
RO
1593/*
1594 * Caller must hold RTNL.
1595 */
4e902c57 1596static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1597{
1598 struct trie *t = (struct trie *) tb->tb_data;
1599 u32 key, mask;
4e902c57
TG
1600 int plen = cfg->fc_dst_len;
1601 u8 tos = cfg->fc_tos;
19baf839
RO
1602 struct fib_alias *fa, *fa_to_delete;
1603 struct list_head *fa_head;
1604 struct leaf *l;
91b9a277
OJ
1605 struct leaf_info *li;
1606
c877efb2 1607 if (plen > 32)
19baf839
RO
1608 return -EINVAL;
1609
4e902c57 1610 key = ntohl(cfg->fc_dst);
91b9a277 1611 mask = ntohl(inet_make_mask(plen));
19baf839 1612
c877efb2 1613 if (key & ~mask)
19baf839
RO
1614 return -EINVAL;
1615
1616 key = key & mask;
1617 l = fib_find_node(t, key);
1618
c877efb2 1619 if (!l)
19baf839
RO
1620 return -ESRCH;
1621
1622 fa_head = get_fa_head(l, plen);
1623 fa = fib_find_alias(fa_head, tos, 0);
1624
1625 if (!fa)
1626 return -ESRCH;
1627
0c7770c7 1628 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1629
1630 fa_to_delete = NULL;
1631 fa_head = fa->fa_list.prev;
2373ce1c 1632
19baf839
RO
1633 list_for_each_entry(fa, fa_head, fa_list) {
1634 struct fib_info *fi = fa->fa_info;
1635
1636 if (fa->fa_tos != tos)
1637 break;
1638
4e902c57
TG
1639 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1640 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1641 fa->fa_scope == cfg->fc_scope) &&
1642 (!cfg->fc_protocol ||
1643 fi->fib_protocol == cfg->fc_protocol) &&
1644 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1645 fa_to_delete = fa;
1646 break;
1647 }
1648 }
1649
91b9a277
OJ
1650 if (!fa_to_delete)
1651 return -ESRCH;
19baf839 1652
91b9a277 1653 fa = fa_to_delete;
4e902c57 1654 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1655 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1656
1657 l = fib_find_node(t, key);
772cb712 1658 li = find_leaf_info(l, plen);
19baf839 1659
2373ce1c 1660 list_del_rcu(&fa->fa_list);
19baf839 1661
91b9a277 1662 if (list_empty(fa_head)) {
2373ce1c 1663 hlist_del_rcu(&li->hlist);
91b9a277 1664 free_leaf_info(li);
2373ce1c 1665 }
19baf839 1666
91b9a277
OJ
1667 if (hlist_empty(&l->list))
1668 trie_leaf_remove(t, key);
19baf839 1669
91b9a277
OJ
1670 if (fa->fa_state & FA_S_ACCESSED)
1671 rt_cache_flush(-1);
19baf839 1672
2373ce1c
RO
1673 fib_release_info(fa->fa_info);
1674 alias_free_mem_rcu(fa);
91b9a277 1675 return 0;
19baf839
RO
1676}
1677
1678static int trie_flush_list(struct trie *t, struct list_head *head)
1679{
1680 struct fib_alias *fa, *fa_node;
1681 int found = 0;
1682
1683 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1684 struct fib_info *fi = fa->fa_info;
19baf839 1685
2373ce1c
RO
1686 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1687 list_del_rcu(&fa->fa_list);
1688 fib_release_info(fa->fa_info);
1689 alias_free_mem_rcu(fa);
19baf839
RO
1690 found++;
1691 }
1692 }
1693 return found;
1694}
1695
1696static int trie_flush_leaf(struct trie *t, struct leaf *l)
1697{
1698 int found = 0;
1699 struct hlist_head *lih = &l->list;
1700 struct hlist_node *node, *tmp;
1701 struct leaf_info *li = NULL;
1702
1703 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1704 found += trie_flush_list(t, &li->falh);
1705
1706 if (list_empty(&li->falh)) {
2373ce1c 1707 hlist_del_rcu(&li->hlist);
19baf839
RO
1708 free_leaf_info(li);
1709 }
1710 }
1711 return found;
1712}
1713
2373ce1c
RO
1714/* rcu_read_lock needs to be hold by caller from readside */
1715
19baf839
RO
1716static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1717{
1718 struct node *c = (struct node *) thisleaf;
1719 struct tnode *p;
1720 int idx;
2373ce1c 1721 struct node *trie = rcu_dereference(t->trie);
19baf839 1722
c877efb2 1723 if (c == NULL) {
2373ce1c 1724 if (trie == NULL)
19baf839
RO
1725 return NULL;
1726
2373ce1c
RO
1727 if (IS_LEAF(trie)) /* trie w. just a leaf */
1728 return (struct leaf *) trie;
19baf839 1729
a07f5f50 1730 p = (struct tnode *)trie; /* Start */
91b9a277 1731 } else
b59cfbf7 1732 p = node_parent_rcu(c);
c877efb2 1733
19baf839
RO
1734 while (p) {
1735 int pos, last;
1736
1737 /* Find the next child of the parent */
c877efb2
SH
1738 if (c)
1739 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1740 else
19baf839
RO
1741 pos = 0;
1742
1743 last = 1 << p->bits;
91b9a277 1744 for (idx = pos; idx < last ; idx++) {
2373ce1c
RO
1745 c = rcu_dereference(p->child[idx]);
1746
1747 if (!c)
91b9a277
OJ
1748 continue;
1749
1750 /* Decend if tnode */
2373ce1c
RO
1751 while (IS_TNODE(c)) {
1752 p = (struct tnode *) c;
e905a9ed 1753 idx = 0;
91b9a277
OJ
1754
1755 /* Rightmost non-NULL branch */
1756 if (p && IS_TNODE(p))
2373ce1c
RO
1757 while (!(c = rcu_dereference(p->child[idx]))
1758 && idx < (1<<p->bits)) idx++;
91b9a277
OJ
1759
1760 /* Done with this tnode? */
2373ce1c 1761 if (idx >= (1 << p->bits) || !c)
91b9a277 1762 goto up;
19baf839 1763 }
2373ce1c 1764 return (struct leaf *) c;
19baf839
RO
1765 }
1766up:
1767 /* No more children go up one step */
91b9a277 1768 c = (struct node *) p;
b59cfbf7 1769 p = node_parent_rcu(c);
19baf839
RO
1770 }
1771 return NULL; /* Ready. Root of trie */
1772}
1773
d562f1f8
RO
1774/*
1775 * Caller must hold RTNL.
1776 */
19baf839
RO
1777static int fn_trie_flush(struct fib_table *tb)
1778{
1779 struct trie *t = (struct trie *) tb->tb_data;
1780 struct leaf *ll = NULL, *l = NULL;
1781 int found = 0, h;
1782
91b9a277 1783 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1784 found += trie_flush_leaf(t, l);
1785
1786 if (ll && hlist_empty(&ll->list))
1787 trie_leaf_remove(t, ll->key);
1788 ll = l;
1789 }
1790
1791 if (ll && hlist_empty(&ll->list))
1792 trie_leaf_remove(t, ll->key);
1793
0c7770c7 1794 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1795 return found;
1796}
1797
a07f5f50
SH
1798static void fn_trie_select_default(struct fib_table *tb,
1799 const struct flowi *flp,
1800 struct fib_result *res)
19baf839
RO
1801{
1802 struct trie *t = (struct trie *) tb->tb_data;
1803 int order, last_idx;
1804 struct fib_info *fi = NULL;
1805 struct fib_info *last_resort;
1806 struct fib_alias *fa = NULL;
1807 struct list_head *fa_head;
1808 struct leaf *l;
1809
1810 last_idx = -1;
1811 last_resort = NULL;
1812 order = -1;
1813
2373ce1c 1814 rcu_read_lock();
c877efb2 1815
19baf839 1816 l = fib_find_node(t, 0);
c877efb2 1817 if (!l)
19baf839
RO
1818 goto out;
1819
1820 fa_head = get_fa_head(l, 0);
c877efb2 1821 if (!fa_head)
19baf839
RO
1822 goto out;
1823
c877efb2 1824 if (list_empty(fa_head))
19baf839
RO
1825 goto out;
1826
2373ce1c 1827 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1828 struct fib_info *next_fi = fa->fa_info;
91b9a277 1829
19baf839
RO
1830 if (fa->fa_scope != res->scope ||
1831 fa->fa_type != RTN_UNICAST)
1832 continue;
91b9a277 1833
19baf839
RO
1834 if (next_fi->fib_priority > res->fi->fib_priority)
1835 break;
1836 if (!next_fi->fib_nh[0].nh_gw ||
1837 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1838 continue;
1839 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1840
19baf839
RO
1841 if (fi == NULL) {
1842 if (next_fi != res->fi)
1843 break;
1844 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1845 &last_idx, tb->tb_default)) {
a2bbe682 1846 fib_result_assign(res, fi);
971b893e 1847 tb->tb_default = order;
19baf839
RO
1848 goto out;
1849 }
1850 fi = next_fi;
1851 order++;
1852 }
1853 if (order <= 0 || fi == NULL) {
971b893e 1854 tb->tb_default = -1;
19baf839
RO
1855 goto out;
1856 }
1857
971b893e
DL
1858 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1859 tb->tb_default)) {
a2bbe682 1860 fib_result_assign(res, fi);
971b893e 1861 tb->tb_default = order;
19baf839
RO
1862 goto out;
1863 }
a2bbe682
DL
1864 if (last_idx >= 0)
1865 fib_result_assign(res, last_resort);
971b893e
DL
1866 tb->tb_default = last_idx;
1867out:
2373ce1c 1868 rcu_read_unlock();
19baf839
RO
1869}
1870
a07f5f50
SH
1871static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1872 struct fib_table *tb,
19baf839
RO
1873 struct sk_buff *skb, struct netlink_callback *cb)
1874{
1875 int i, s_i;
1876 struct fib_alias *fa;
1877
32ab5f80 1878 __be32 xkey = htonl(key);
19baf839 1879
1af5a8c4 1880 s_i = cb->args[4];
19baf839
RO
1881 i = 0;
1882
2373ce1c
RO
1883 /* rcu_read_lock is hold by caller */
1884
1885 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1886 if (i < s_i) {
1887 i++;
1888 continue;
1889 }
78c6671a 1890 BUG_ON(!fa->fa_info);
19baf839
RO
1891
1892 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1893 cb->nlh->nlmsg_seq,
1894 RTM_NEWROUTE,
1895 tb->tb_id,
1896 fa->fa_type,
1897 fa->fa_scope,
be403ea1 1898 xkey,
19baf839
RO
1899 plen,
1900 fa->fa_tos,
64347f78 1901 fa->fa_info, NLM_F_MULTI) < 0) {
1af5a8c4 1902 cb->args[4] = i;
19baf839 1903 return -1;
91b9a277 1904 }
19baf839
RO
1905 i++;
1906 }
1af5a8c4 1907 cb->args[4] = i;
19baf839
RO
1908 return skb->len;
1909}
1910
a07f5f50
SH
1911static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
1912 struct sk_buff *skb, struct netlink_callback *cb)
19baf839
RO
1913{
1914 int h, s_h;
1915 struct list_head *fa_head;
1916 struct leaf *l = NULL;
19baf839 1917
1af5a8c4 1918 s_h = cb->args[3];
19baf839 1919
91b9a277 1920 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1921 if (h < s_h)
1922 continue;
1923 if (h > s_h)
1af5a8c4
PM
1924 memset(&cb->args[4], 0,
1925 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839
RO
1926
1927 fa_head = get_fa_head(l, plen);
91b9a277 1928
c877efb2 1929 if (!fa_head)
19baf839
RO
1930 continue;
1931
c877efb2 1932 if (list_empty(fa_head))
19baf839
RO
1933 continue;
1934
a07f5f50 1935 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
1af5a8c4 1936 cb->args[3] = h;
19baf839
RO
1937 return -1;
1938 }
1939 }
1af5a8c4 1940 cb->args[3] = h;
19baf839
RO
1941 return skb->len;
1942}
1943
a07f5f50
SH
1944static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1945 struct netlink_callback *cb)
19baf839
RO
1946{
1947 int m, s_m;
1948 struct trie *t = (struct trie *) tb->tb_data;
1949
1af5a8c4 1950 s_m = cb->args[2];
19baf839 1951
2373ce1c 1952 rcu_read_lock();
91b9a277 1953 for (m = 0; m <= 32; m++) {
19baf839
RO
1954 if (m < s_m)
1955 continue;
1956 if (m > s_m)
1af5a8c4
PM
1957 memset(&cb->args[3], 0,
1958 sizeof(cb->args) - 3*sizeof(cb->args[0]));
19baf839 1959
a07f5f50 1960 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
1af5a8c4 1961 cb->args[2] = m;
19baf839
RO
1962 goto out;
1963 }
1964 }
2373ce1c 1965 rcu_read_unlock();
1af5a8c4 1966 cb->args[2] = m;
19baf839 1967 return skb->len;
91b9a277 1968out:
2373ce1c 1969 rcu_read_unlock();
19baf839
RO
1970 return -1;
1971}
1972
7f9b8052
SH
1973void __init fib_hash_init(void)
1974{
a07f5f50
SH
1975 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1976 sizeof(struct fib_alias),
bc3c8c1e
SH
1977 0, SLAB_PANIC, NULL);
1978
1979 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1980 max(sizeof(struct leaf),
1981 sizeof(struct leaf_info)),
1982 0, SLAB_PANIC, NULL);
7f9b8052 1983}
19baf839 1984
7f9b8052
SH
1985
1986/* Fix more generic FIB names for init later */
1987struct fib_table *fib_hash_table(u32 id)
19baf839
RO
1988{
1989 struct fib_table *tb;
1990 struct trie *t;
1991
19baf839
RO
1992 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1993 GFP_KERNEL);
1994 if (tb == NULL)
1995 return NULL;
1996
1997 tb->tb_id = id;
971b893e 1998 tb->tb_default = -1;
19baf839
RO
1999 tb->tb_lookup = fn_trie_lookup;
2000 tb->tb_insert = fn_trie_insert;
2001 tb->tb_delete = fn_trie_delete;
2002 tb->tb_flush = fn_trie_flush;
2003 tb->tb_select_default = fn_trie_select_default;
2004 tb->tb_dump = fn_trie_dump;
19baf839
RO
2005
2006 t = (struct trie *) tb->tb_data;
c28a1cf4 2007 memset(t, 0, sizeof(*t));
19baf839 2008
19baf839 2009 if (id == RT_TABLE_LOCAL)
a07f5f50 2010 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2011
2012 return tb;
2013}
2014
cb7b593c
SH
2015#ifdef CONFIG_PROC_FS
2016/* Depth first Trie walk iterator */
2017struct fib_trie_iter {
1c340b2f 2018 struct seq_net_private p;
877a9bff 2019 struct trie *trie_local, *trie_main;
cb7b593c
SH
2020 struct tnode *tnode;
2021 struct trie *trie;
2022 unsigned index;
2023 unsigned depth;
2024};
19baf839 2025
cb7b593c 2026static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2027{
cb7b593c
SH
2028 struct tnode *tn = iter->tnode;
2029 unsigned cindex = iter->index;
2030 struct tnode *p;
19baf839 2031
6640e697
EB
2032 /* A single entry routing table */
2033 if (!tn)
2034 return NULL;
2035
cb7b593c
SH
2036 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2037 iter->tnode, iter->index, iter->depth);
2038rescan:
2039 while (cindex < (1<<tn->bits)) {
b59cfbf7 2040 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2041
cb7b593c
SH
2042 if (n) {
2043 if (IS_LEAF(n)) {
2044 iter->tnode = tn;
2045 iter->index = cindex + 1;
2046 } else {
2047 /* push down one level */
2048 iter->tnode = (struct tnode *) n;
2049 iter->index = 0;
2050 ++iter->depth;
2051 }
2052 return n;
2053 }
19baf839 2054
cb7b593c
SH
2055 ++cindex;
2056 }
91b9a277 2057
cb7b593c 2058 /* Current node exhausted, pop back up */
b59cfbf7 2059 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2060 if (p) {
2061 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2062 tn = p;
2063 --iter->depth;
2064 goto rescan;
19baf839 2065 }
cb7b593c
SH
2066
2067 /* got root? */
2068 return NULL;
19baf839
RO
2069}
2070
cb7b593c
SH
2071static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2072 struct trie *t)
19baf839 2073{
5ddf0eb2
RO
2074 struct node *n ;
2075
132adf54 2076 if (!t)
5ddf0eb2
RO
2077 return NULL;
2078
2079 n = rcu_dereference(t->trie);
2080
132adf54 2081 if (!iter)
5ddf0eb2 2082 return NULL;
19baf839 2083
6640e697
EB
2084 if (n) {
2085 if (IS_TNODE(n)) {
2086 iter->tnode = (struct tnode *) n;
2087 iter->trie = t;
2088 iter->index = 0;
2089 iter->depth = 1;
2090 } else {
2091 iter->tnode = NULL;
2092 iter->trie = t;
2093 iter->index = 0;
2094 iter->depth = 0;
2095 }
cb7b593c 2096 return n;
91b9a277 2097 }
cb7b593c
SH
2098 return NULL;
2099}
91b9a277 2100
cb7b593c
SH
2101static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2102{
2103 struct node *n;
2104 struct fib_trie_iter iter;
91b9a277 2105
cb7b593c 2106 memset(s, 0, sizeof(*s));
91b9a277 2107
cb7b593c
SH
2108 rcu_read_lock();
2109 for (n = fib_trie_get_first(&iter, t); n;
2110 n = fib_trie_get_next(&iter)) {
2111 if (IS_LEAF(n)) {
93672292
SH
2112 struct leaf *l = (struct leaf *)n;
2113 struct leaf_info *li;
2114 struct hlist_node *tmp;
2115
cb7b593c
SH
2116 s->leaves++;
2117 s->totdepth += iter.depth;
2118 if (iter.depth > s->maxdepth)
2119 s->maxdepth = iter.depth;
93672292
SH
2120
2121 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2122 ++s->prefixes;
cb7b593c
SH
2123 } else {
2124 const struct tnode *tn = (const struct tnode *) n;
2125 int i;
2126
2127 s->tnodes++;
132adf54 2128 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2129 s->nodesizes[tn->bits]++;
2130
cb7b593c
SH
2131 for (i = 0; i < (1<<tn->bits); i++)
2132 if (!tn->child[i])
2133 s->nullpointers++;
19baf839 2134 }
19baf839 2135 }
2373ce1c 2136 rcu_read_unlock();
19baf839
RO
2137}
2138
cb7b593c
SH
2139/*
2140 * This outputs /proc/net/fib_triestats
2141 */
2142static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2143{
cb7b593c 2144 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2145
cb7b593c
SH
2146 if (stat->leaves)
2147 avdepth = stat->totdepth*100 / stat->leaves;
2148 else
2149 avdepth = 0;
91b9a277 2150
a07f5f50
SH
2151 seq_printf(seq, "\tAver depth: %u.%02d\n",
2152 avdepth / 100, avdepth % 100);
cb7b593c 2153 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2154
cb7b593c 2155 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2156 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2157
2158 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2159 bytes += sizeof(struct leaf_info) * stat->prefixes;
2160
187b5188 2161 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2162 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2163
06ef921d
RO
2164 max = MAX_STAT_DEPTH;
2165 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2166 max--;
19baf839 2167
cb7b593c
SH
2168 pointers = 0;
2169 for (i = 1; i <= max; i++)
2170 if (stat->nodesizes[i] != 0) {
187b5188 2171 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2172 pointers += (1<<i) * stat->nodesizes[i];
2173 }
2174 seq_putc(seq, '\n');
187b5188 2175 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2176
cb7b593c 2177 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2178 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2179 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2180}
2373ce1c 2181
cb7b593c 2182#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2183static void trie_show_usage(struct seq_file *seq,
2184 const struct trie_use_stats *stats)
2185{
2186 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2187 seq_printf(seq, "gets = %u\n", stats->gets);
2188 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2189 seq_printf(seq, "semantic match passed = %u\n",
2190 stats->semantic_match_passed);
2191 seq_printf(seq, "semantic match miss = %u\n",
2192 stats->semantic_match_miss);
2193 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2194 seq_printf(seq, "skipped node resize = %u\n\n",
2195 stats->resize_node_skipped);
cb7b593c 2196}
66a2f7fd
SH
2197#endif /* CONFIG_IP_FIB_TRIE_STATS */
2198
a07f5f50
SH
2199static void fib_trie_show(struct seq_file *seq, const char *name,
2200 struct trie *trie)
d717a9a6
SH
2201{
2202 struct trie_stat stat;
2203
d717a9a6 2204 trie_collect_stats(trie, &stat);
93672292 2205 seq_printf(seq, "%s:\n", name);
d717a9a6
SH
2206 trie_show_stats(seq, &stat);
2207#ifdef CONFIG_IP_FIB_TRIE_STATS
2208 trie_show_usage(seq, &trie->stats);
2209#endif
2210}
19baf839 2211
cb7b593c
SH
2212static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2213{
1c340b2f 2214 struct net *net = (struct net *)seq->private;
877a9bff
EB
2215 struct fib_table *tb;
2216
d717a9a6 2217 seq_printf(seq,
a07f5f50
SH
2218 "Basic info: size of leaf:"
2219 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2220 sizeof(struct leaf), sizeof(struct tnode));
2221
1c340b2f 2222 tb = fib_get_table(net, RT_TABLE_LOCAL);
877a9bff 2223 if (tb)
d717a9a6 2224 fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
877a9bff 2225
1c340b2f 2226 tb = fib_get_table(net, RT_TABLE_MAIN);
877a9bff 2227 if (tb)
d717a9a6 2228 fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
19baf839 2229
cb7b593c 2230 return 0;
19baf839
RO
2231}
2232
cb7b593c 2233static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2234{
1c340b2f
DL
2235 int err;
2236 struct net *net;
2237
2238 net = get_proc_net(inode);
2239 if (net == NULL)
2240 return -ENXIO;
2241 err = single_open(file, fib_triestat_seq_show, net);
2242 if (err < 0) {
2243 put_net(net);
2244 return err;
2245 }
2246 return 0;
2247}
2248
2249static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2250{
2251 struct seq_file *seq = f->private_data;
2252 put_net(seq->private);
2253 return single_release(ino, f);
19baf839
RO
2254}
2255
9a32144e 2256static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2257 .owner = THIS_MODULE,
2258 .open = fib_triestat_seq_open,
2259 .read = seq_read,
2260 .llseek = seq_lseek,
1c340b2f 2261 .release = fib_triestat_seq_release,
cb7b593c
SH
2262};
2263
2264static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2265 loff_t pos)
19baf839 2266{
cb7b593c
SH
2267 loff_t idx = 0;
2268 struct node *n;
2269
877a9bff 2270 for (n = fib_trie_get_first(iter, iter->trie_local);
cb7b593c
SH
2271 n; ++idx, n = fib_trie_get_next(iter)) {
2272 if (pos == idx)
2273 return n;
2274 }
2275
877a9bff 2276 for (n = fib_trie_get_first(iter, iter->trie_main);
cb7b593c
SH
2277 n; ++idx, n = fib_trie_get_next(iter)) {
2278 if (pos == idx)
2279 return n;
2280 }
19baf839
RO
2281 return NULL;
2282}
2283
cb7b593c 2284static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2285 __acquires(RCU)
19baf839 2286{
877a9bff
EB
2287 struct fib_trie_iter *iter = seq->private;
2288 struct fib_table *tb;
2289
2290 if (!iter->trie_local) {
1c340b2f 2291 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
877a9bff
EB
2292 if (tb)
2293 iter->trie_local = (struct trie *) tb->tb_data;
2294 }
2295 if (!iter->trie_main) {
1c340b2f 2296 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
877a9bff
EB
2297 if (tb)
2298 iter->trie_main = (struct trie *) tb->tb_data;
2299 }
cb7b593c
SH
2300 rcu_read_lock();
2301 if (*pos == 0)
91b9a277 2302 return SEQ_START_TOKEN;
877a9bff 2303 return fib_trie_get_idx(iter, *pos - 1);
19baf839
RO
2304}
2305
cb7b593c 2306static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2307{
cb7b593c
SH
2308 struct fib_trie_iter *iter = seq->private;
2309 void *l = v;
2310
19baf839 2311 ++*pos;
91b9a277 2312 if (v == SEQ_START_TOKEN)
cb7b593c 2313 return fib_trie_get_idx(iter, 0);
19baf839 2314
cb7b593c
SH
2315 v = fib_trie_get_next(iter);
2316 BUG_ON(v == l);
2317 if (v)
2318 return v;
19baf839 2319
cb7b593c 2320 /* continue scan in next trie */
877a9bff
EB
2321 if (iter->trie == iter->trie_local)
2322 return fib_trie_get_first(iter, iter->trie_main);
19baf839 2323
cb7b593c
SH
2324 return NULL;
2325}
19baf839 2326
cb7b593c 2327static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2328 __releases(RCU)
19baf839 2329{
cb7b593c
SH
2330 rcu_read_unlock();
2331}
91b9a277 2332
cb7b593c
SH
2333static void seq_indent(struct seq_file *seq, int n)
2334{
2335 while (n-- > 0) seq_puts(seq, " ");
2336}
19baf839 2337
28d36e37 2338static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2339{
132adf54 2340 switch (s) {
cb7b593c
SH
2341 case RT_SCOPE_UNIVERSE: return "universe";
2342 case RT_SCOPE_SITE: return "site";
2343 case RT_SCOPE_LINK: return "link";
2344 case RT_SCOPE_HOST: return "host";
2345 case RT_SCOPE_NOWHERE: return "nowhere";
2346 default:
28d36e37 2347 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2348 return buf;
2349 }
2350}
19baf839 2351
cb7b593c
SH
2352static const char *rtn_type_names[__RTN_MAX] = {
2353 [RTN_UNSPEC] = "UNSPEC",
2354 [RTN_UNICAST] = "UNICAST",
2355 [RTN_LOCAL] = "LOCAL",
2356 [RTN_BROADCAST] = "BROADCAST",
2357 [RTN_ANYCAST] = "ANYCAST",
2358 [RTN_MULTICAST] = "MULTICAST",
2359 [RTN_BLACKHOLE] = "BLACKHOLE",
2360 [RTN_UNREACHABLE] = "UNREACHABLE",
2361 [RTN_PROHIBIT] = "PROHIBIT",
2362 [RTN_THROW] = "THROW",
2363 [RTN_NAT] = "NAT",
2364 [RTN_XRESOLVE] = "XRESOLVE",
2365};
19baf839 2366
28d36e37 2367static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2368{
cb7b593c
SH
2369 if (t < __RTN_MAX && rtn_type_names[t])
2370 return rtn_type_names[t];
28d36e37 2371 snprintf(buf, len, "type %u", t);
cb7b593c 2372 return buf;
19baf839
RO
2373}
2374
cb7b593c
SH
2375/* Pretty print the trie */
2376static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2377{
cb7b593c
SH
2378 const struct fib_trie_iter *iter = seq->private;
2379 struct node *n = v;
c877efb2 2380
cb7b593c
SH
2381 if (v == SEQ_START_TOKEN)
2382 return 0;
19baf839 2383
b59cfbf7 2384 if (!node_parent_rcu(n)) {
877a9bff 2385 if (iter->trie == iter->trie_local)
095b8501
RO
2386 seq_puts(seq, "<local>:\n");
2387 else
2388 seq_puts(seq, "<main>:\n");
2389 }
2390
cb7b593c
SH
2391 if (IS_TNODE(n)) {
2392 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2393 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2394
1d25cd6c
RO
2395 seq_indent(seq, iter->depth-1);
2396 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
e905a9ed 2397 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
1d25cd6c 2398 tn->empty_children);
e905a9ed 2399
cb7b593c
SH
2400 } else {
2401 struct leaf *l = (struct leaf *) n;
1328042e
SH
2402 struct leaf_info *li;
2403 struct hlist_node *node;
2404
32ab5f80 2405 __be32 val = htonl(l->key);
cb7b593c
SH
2406
2407 seq_indent(seq, iter->depth);
2408 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
1328042e
SH
2409
2410 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2411 struct fib_alias *fa;
2412
2413 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2414 char buf1[32], buf2[32];
2415
2416 seq_indent(seq, iter->depth+1);
2417 seq_printf(seq, " /%d %s %s", li->plen,
2418 rtn_scope(buf1, sizeof(buf1),
2419 fa->fa_scope),
2420 rtn_type(buf2, sizeof(buf2),
2421 fa->fa_type));
2422 if (fa->fa_tos)
2423 seq_printf(seq, "tos =%d\n",
2424 fa->fa_tos);
2425 seq_putc(seq, '\n');
cb7b593c
SH
2426 }
2427 }
19baf839 2428 }
cb7b593c 2429
19baf839
RO
2430 return 0;
2431}
2432
f690808e 2433static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2434 .start = fib_trie_seq_start,
2435 .next = fib_trie_seq_next,
2436 .stop = fib_trie_seq_stop,
2437 .show = fib_trie_seq_show,
19baf839
RO
2438};
2439
cb7b593c 2440static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2441{
1c340b2f
DL
2442 return seq_open_net(inode, file, &fib_trie_seq_ops,
2443 sizeof(struct fib_trie_iter));
19baf839
RO
2444}
2445
9a32144e 2446static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2447 .owner = THIS_MODULE,
2448 .open = fib_trie_seq_open,
2449 .read = seq_read,
2450 .llseek = seq_lseek,
1c340b2f 2451 .release = seq_release_net,
19baf839
RO
2452};
2453
32ab5f80 2454static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2455{
cb7b593c
SH
2456 static unsigned type2flags[RTN_MAX + 1] = {
2457 [7] = RTF_REJECT, [8] = RTF_REJECT,
2458 };
2459 unsigned flags = type2flags[type];
19baf839 2460
cb7b593c
SH
2461 if (fi && fi->fib_nh->nh_gw)
2462 flags |= RTF_GATEWAY;
32ab5f80 2463 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2464 flags |= RTF_HOST;
2465 flags |= RTF_UP;
2466 return flags;
19baf839
RO
2467}
2468
cb7b593c
SH
2469/*
2470 * This outputs /proc/net/route.
2471 * The format of the file is not supposed to be changed
2472 * and needs to be same as fib_hash output to avoid breaking
2473 * legacy utilities
2474 */
2475static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2476{
c9e53cbe 2477 const struct fib_trie_iter *iter = seq->private;
cb7b593c 2478 struct leaf *l = v;
1328042e
SH
2479 struct leaf_info *li;
2480 struct hlist_node *node;
19baf839 2481
cb7b593c
SH
2482 if (v == SEQ_START_TOKEN) {
2483 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2484 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2485 "\tWindow\tIRTT");
2486 return 0;
2487 }
19baf839 2488
877a9bff 2489 if (iter->trie == iter->trie_local)
c9e53cbe 2490 return 0;
a07f5f50 2491
cb7b593c
SH
2492 if (IS_TNODE(l))
2493 return 0;
19baf839 2494
1328042e 2495 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2496 struct fib_alias *fa;
32ab5f80 2497 __be32 mask, prefix;
91b9a277 2498
cb7b593c
SH
2499 if (!li)
2500 continue;
19baf839 2501
cb7b593c
SH
2502 mask = inet_make_mask(li->plen);
2503 prefix = htonl(l->key);
19baf839 2504
cb7b593c 2505 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2506 const struct fib_info *fi = fa->fa_info;
cb7b593c 2507 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
1328042e 2508 char bf[128];
19baf839 2509
cb7b593c
SH
2510 if (fa->fa_type == RTN_BROADCAST
2511 || fa->fa_type == RTN_MULTICAST)
2512 continue;
19baf839 2513
cb7b593c
SH
2514 if (fi)
2515 snprintf(bf, sizeof(bf),
2516 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2517 fi->fib_dev ? fi->fib_dev->name : "*",
2518 prefix,
2519 fi->fib_nh->nh_gw, flags, 0, 0,
2520 fi->fib_priority,
2521 mask,
a07f5f50
SH
2522 (fi->fib_advmss ?
2523 fi->fib_advmss + 40 : 0),
cb7b593c
SH
2524 fi->fib_window,
2525 fi->fib_rtt >> 3);
2526 else
2527 snprintf(bf, sizeof(bf),
2528 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2529 prefix, 0, flags, 0, 0, 0,
2530 mask, 0, 0, 0);
19baf839 2531
cb7b593c
SH
2532 seq_printf(seq, "%-127s\n", bf);
2533 }
19baf839
RO
2534 }
2535
2536 return 0;
2537}
2538
f690808e 2539static const struct seq_operations fib_route_seq_ops = {
cb7b593c
SH
2540 .start = fib_trie_seq_start,
2541 .next = fib_trie_seq_next,
2542 .stop = fib_trie_seq_stop,
2543 .show = fib_route_seq_show,
19baf839
RO
2544};
2545
cb7b593c 2546static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2547{
1c340b2f
DL
2548 return seq_open_net(inode, file, &fib_route_seq_ops,
2549 sizeof(struct fib_trie_iter));
19baf839
RO
2550}
2551
9a32144e 2552static const struct file_operations fib_route_fops = {
cb7b593c
SH
2553 .owner = THIS_MODULE,
2554 .open = fib_route_seq_open,
2555 .read = seq_read,
2556 .llseek = seq_lseek,
1c340b2f 2557 .release = seq_release_net,
19baf839
RO
2558};
2559
61a02653 2560int __net_init fib_proc_init(struct net *net)
19baf839 2561{
61a02653 2562 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2563 goto out1;
2564
61a02653
DL
2565 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2566 &fib_triestat_fops))
cb7b593c
SH
2567 goto out2;
2568
61a02653 2569 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2570 goto out3;
2571
19baf839 2572 return 0;
cb7b593c
SH
2573
2574out3:
61a02653 2575 proc_net_remove(net, "fib_triestat");
cb7b593c 2576out2:
61a02653 2577 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2578out1:
2579 return -ENOMEM;
19baf839
RO
2580}
2581
61a02653 2582void __net_exit fib_proc_exit(struct net *net)
19baf839 2583{
61a02653
DL
2584 proc_net_remove(net, "fib_trie");
2585 proc_net_remove(net, "fib_triestat");
2586 proc_net_remove(net, "route");
19baf839
RO
2587}
2588
2589#endif /* CONFIG_PROC_FS */
This page took 0.497593 seconds and 5 git commands to generate.