[IPV4] fib_trie: Add statistics.
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
fd966255
RO
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
19baf839
RO
51 */
52
05eee48c 53#define VERSION "0.408"
19baf839 54
19baf839
RO
55#include <asm/uaccess.h>
56#include <asm/system.h>
1977f032 57#include <linux/bitops.h>
19baf839
RO
58#include <linux/types.h>
59#include <linux/kernel.h>
19baf839
RO
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
cd8787ab 67#include <linux/inetdevice.h>
19baf839
RO
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
2373ce1c 71#include <linux/rcupdate.h>
19baf839
RO
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
98
99struct node {
91b9a277
OJ
100 t_key key;
101 unsigned long parent;
19baf839
RO
102};
103
104struct leaf {
91b9a277
OJ
105 t_key key;
106 unsigned long parent;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
2373ce1c 113 struct rcu_head rcu;
19baf839
RO
114 int plen;
115 struct list_head falh;
116};
117
118struct tnode {
91b9a277
OJ
119 t_key key;
120 unsigned long parent;
121 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
122 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
123 unsigned short full_children; /* KEYLENGTH bits needed */
124 unsigned short empty_children; /* KEYLENGTH bits needed */
2373ce1c 125 struct rcu_head rcu;
91b9a277 126 struct node *child[0];
19baf839
RO
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131 unsigned int gets;
132 unsigned int backtrack;
133 unsigned int semantic_match_passed;
134 unsigned int semantic_match_miss;
135 unsigned int null_node_hit;
2f36895a 136 unsigned int resize_node_skipped;
19baf839
RO
137};
138#endif
139
140struct trie_stat {
141 unsigned int totdepth;
142 unsigned int maxdepth;
143 unsigned int tnodes;
144 unsigned int leaves;
145 unsigned int nullpointers;
06ef921d 146 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 147};
19baf839
RO
148
149struct trie {
91b9a277 150 struct node *trie;
19baf839
RO
151#ifdef CONFIG_IP_FIB_TRIE_STATS
152 struct trie_use_stats stats;
153#endif
91b9a277 154 int size;
19baf839
RO
155};
156
19baf839
RO
157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
19baf839 159static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
160static struct tnode *inflate(struct trie *t, struct tnode *tn);
161static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839 162static void tnode_free(struct tnode *tn);
19baf839 163
e18b890b 164static struct kmem_cache *fn_alias_kmem __read_mostly;
19baf839 165
06801916
SH
166static inline struct tnode *node_parent(struct node *node)
167{
168 struct tnode *ret;
169
170 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171 return rcu_dereference(ret);
172}
173
174static inline void node_set_parent(struct node *node, struct tnode *ptr)
175{
176 rcu_assign_pointer(node->parent,
177 (unsigned long)ptr | NODE_TYPE(node));
178}
2373ce1c
RO
179
180/* rcu_read_lock needs to be hold by caller from readside */
181
c877efb2 182static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 183{
91b9a277 184 BUG_ON(i >= 1 << tn->bits);
19baf839 185
2373ce1c 186 return rcu_dereference(tn->child[i]);
19baf839
RO
187}
188
bb435b8d 189static inline int tnode_child_length(const struct tnode *tn)
19baf839 190{
91b9a277 191 return 1 << tn->bits;
19baf839
RO
192}
193
ab66b4a7
SH
194static inline t_key mask_pfx(t_key k, unsigned short l)
195{
196 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
197}
198
19baf839
RO
199static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
200{
91b9a277 201 if (offset < KEYLENGTH)
19baf839 202 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 203 else
19baf839
RO
204 return 0;
205}
206
207static inline int tkey_equals(t_key a, t_key b)
208{
c877efb2 209 return a == b;
19baf839
RO
210}
211
212static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
213{
c877efb2
SH
214 if (bits == 0 || offset >= KEYLENGTH)
215 return 1;
91b9a277
OJ
216 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
217 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 218}
19baf839
RO
219
220static inline int tkey_mismatch(t_key a, int offset, t_key b)
221{
222 t_key diff = a ^ b;
223 int i = offset;
224
c877efb2
SH
225 if (!diff)
226 return 0;
227 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
228 i++;
229 return i;
230}
231
19baf839 232/*
e905a9ed
YH
233 To understand this stuff, an understanding of keys and all their bits is
234 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
235 all of the bits in that key are significant.
236
237 Consider a node 'n' and its parent 'tp'.
238
e905a9ed
YH
239 If n is a leaf, every bit in its key is significant. Its presence is
240 necessitated by path compression, since during a tree traversal (when
241 searching for a leaf - unless we are doing an insertion) we will completely
242 ignore all skipped bits we encounter. Thus we need to verify, at the end of
243 a potentially successful search, that we have indeed been walking the
19baf839
RO
244 correct key path.
245
e905a9ed
YH
246 Note that we can never "miss" the correct key in the tree if present by
247 following the wrong path. Path compression ensures that segments of the key
248 that are the same for all keys with a given prefix are skipped, but the
249 skipped part *is* identical for each node in the subtrie below the skipped
250 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
251 call to tkey_sub_equals() in trie_insert().
252
e905a9ed 253 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
254 have many different meanings.
255
e905a9ed 256 Example:
19baf839
RO
257 _________________________________________________________________
258 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
259 -----------------------------------------------------------------
e905a9ed 260 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
261
262 _________________________________________________________________
263 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
264 -----------------------------------------------------------------
265 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
266
267 tp->pos = 7
268 tp->bits = 3
269 n->pos = 15
91b9a277 270 n->bits = 4
19baf839 271
e905a9ed
YH
272 First, let's just ignore the bits that come before the parent tp, that is
273 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
274 not use them for anything.
275
276 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 277 index into the parent's child array. That is, they will be used to find
19baf839
RO
278 'n' among tp's children.
279
280 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
281 for the node n.
282
e905a9ed 283 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
284 of the bits are really not needed or indeed known in n->key.
285
e905a9ed 286 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 287 n's child array, and will of course be different for each child.
e905a9ed 288
c877efb2 289
19baf839
RO
290 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
291 at this point.
292
293*/
294
0c7770c7 295static inline void check_tnode(const struct tnode *tn)
19baf839 296{
0c7770c7 297 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
298}
299
f5026fab
DL
300static const int halve_threshold = 25;
301static const int inflate_threshold = 50;
302static const int halve_threshold_root = 8;
303static const int inflate_threshold_root = 15;
19baf839 304
2373ce1c
RO
305
306static void __alias_free_mem(struct rcu_head *head)
19baf839 307{
2373ce1c
RO
308 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
309 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
310}
311
2373ce1c 312static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 313{
2373ce1c
RO
314 call_rcu(&fa->rcu, __alias_free_mem);
315}
91b9a277 316
2373ce1c
RO
317static void __leaf_free_rcu(struct rcu_head *head)
318{
319 kfree(container_of(head, struct leaf, rcu));
320}
91b9a277 321
2373ce1c 322static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 323{
2373ce1c 324 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
325}
326
2373ce1c 327static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 328{
2373ce1c 329 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
330}
331
f0e36f8c
PM
332static struct tnode *tnode_alloc(unsigned int size)
333{
2373ce1c
RO
334 struct page *pages;
335
336 if (size <= PAGE_SIZE)
337 return kcalloc(size, 1, GFP_KERNEL);
338
339 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
340 if (!pages)
341 return NULL;
342
343 return page_address(pages);
f0e36f8c
PM
344}
345
2373ce1c 346static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 347{
2373ce1c 348 struct tnode *tn = container_of(head, struct tnode, rcu);
f0e36f8c 349 unsigned int size = sizeof(struct tnode) +
2373ce1c 350 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
351
352 if (size <= PAGE_SIZE)
353 kfree(tn);
354 else
355 free_pages((unsigned long)tn, get_order(size));
356}
357
2373ce1c
RO
358static inline void tnode_free(struct tnode *tn)
359{
132adf54 360 if (IS_LEAF(tn)) {
550e29bc
RO
361 struct leaf *l = (struct leaf *) tn;
362 call_rcu_bh(&l->rcu, __leaf_free_rcu);
132adf54 363 } else
550e29bc 364 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
365}
366
367static struct leaf *leaf_new(void)
368{
369 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
370 if (l) {
371 l->parent = T_LEAF;
372 INIT_HLIST_HEAD(&l->list);
373 }
374 return l;
375}
376
377static struct leaf_info *leaf_info_new(int plen)
378{
379 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
380 if (li) {
381 li->plen = plen;
382 INIT_LIST_HEAD(&li->falh);
383 }
384 return li;
385}
386
19baf839
RO
387static struct tnode* tnode_new(t_key key, int pos, int bits)
388{
389 int nchildren = 1<<bits;
390 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 391 struct tnode *tn = tnode_alloc(sz);
19baf839 392
91b9a277 393 if (tn) {
19baf839 394 memset(tn, 0, sz);
2373ce1c 395 tn->parent = T_TNODE;
19baf839
RO
396 tn->pos = pos;
397 tn->bits = bits;
398 tn->key = key;
399 tn->full_children = 0;
400 tn->empty_children = 1<<bits;
401 }
c877efb2 402
0c7770c7
SH
403 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
404 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
405 return tn;
406}
407
19baf839
RO
408/*
409 * Check whether a tnode 'n' is "full", i.e. it is an internal node
410 * and no bits are skipped. See discussion in dyntree paper p. 6
411 */
412
bb435b8d 413static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 414{
c877efb2 415 if (n == NULL || IS_LEAF(n))
19baf839
RO
416 return 0;
417
418 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
419}
420
c877efb2 421static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
422{
423 tnode_put_child_reorg(tn, i, n, -1);
424}
425
c877efb2 426 /*
19baf839
RO
427 * Add a child at position i overwriting the old value.
428 * Update the value of full_children and empty_children.
429 */
430
c877efb2 431static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839 432{
2373ce1c 433 struct node *chi = tn->child[i];
19baf839
RO
434 int isfull;
435
0c7770c7
SH
436 BUG_ON(i >= 1<<tn->bits);
437
19baf839
RO
438
439 /* update emptyChildren */
440 if (n == NULL && chi != NULL)
441 tn->empty_children++;
442 else if (n != NULL && chi == NULL)
443 tn->empty_children--;
c877efb2 444
19baf839 445 /* update fullChildren */
91b9a277 446 if (wasfull == -1)
19baf839
RO
447 wasfull = tnode_full(tn, chi);
448
449 isfull = tnode_full(tn, n);
c877efb2 450 if (wasfull && !isfull)
19baf839 451 tn->full_children--;
c877efb2 452 else if (!wasfull && isfull)
19baf839 453 tn->full_children++;
91b9a277 454
c877efb2 455 if (n)
06801916 456 node_set_parent(n, tn);
19baf839 457
2373ce1c 458 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
459}
460
c877efb2 461static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
462{
463 int i;
2f36895a 464 int err = 0;
2f80b3c8 465 struct tnode *old_tn;
e6308be8
RO
466 int inflate_threshold_use;
467 int halve_threshold_use;
05eee48c 468 int max_resize;
19baf839 469
e905a9ed 470 if (!tn)
19baf839
RO
471 return NULL;
472
0c7770c7
SH
473 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
474 tn, inflate_threshold, halve_threshold);
19baf839
RO
475
476 /* No children */
477 if (tn->empty_children == tnode_child_length(tn)) {
478 tnode_free(tn);
479 return NULL;
480 }
481 /* One child */
482 if (tn->empty_children == tnode_child_length(tn) - 1)
483 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 484 struct node *n;
19baf839 485
91b9a277 486 n = tn->child[i];
2373ce1c 487 if (!n)
91b9a277 488 continue;
91b9a277
OJ
489
490 /* compress one level */
06801916 491 node_set_parent(n, NULL);
91b9a277
OJ
492 tnode_free(tn);
493 return n;
19baf839 494 }
c877efb2 495 /*
19baf839
RO
496 * Double as long as the resulting node has a number of
497 * nonempty nodes that are above the threshold.
498 */
499
500 /*
c877efb2
SH
501 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
502 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 503 * Telecommunications, page 6:
c877efb2 504 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
505 * children in the *doubled* node is at least 'high'."
506 *
c877efb2
SH
507 * 'high' in this instance is the variable 'inflate_threshold'. It
508 * is expressed as a percentage, so we multiply it with
509 * tnode_child_length() and instead of multiplying by 2 (since the
510 * child array will be doubled by inflate()) and multiplying
511 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 512 * multiply the left-hand side by 50.
c877efb2
SH
513 *
514 * The left-hand side may look a bit weird: tnode_child_length(tn)
515 * - tn->empty_children is of course the number of non-null children
516 * in the current node. tn->full_children is the number of "full"
19baf839 517 * children, that is non-null tnodes with a skip value of 0.
c877efb2 518 * All of those will be doubled in the resulting inflated tnode, so
19baf839 519 * we just count them one extra time here.
c877efb2 520 *
19baf839 521 * A clearer way to write this would be:
c877efb2 522 *
19baf839 523 * to_be_doubled = tn->full_children;
c877efb2 524 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
525 * tn->full_children;
526 *
527 * new_child_length = tnode_child_length(tn) * 2;
528 *
c877efb2 529 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
530 * new_child_length;
531 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
532 *
533 * ...and so on, tho it would mess up the while () loop.
534 *
19baf839
RO
535 * anyway,
536 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
537 * inflate_threshold
c877efb2 538 *
19baf839
RO
539 * avoid a division:
540 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
541 * inflate_threshold * new_child_length
c877efb2 542 *
19baf839 543 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 544 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 545 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 546 *
19baf839 547 * expand new_child_length:
c877efb2 548 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 549 * tn->full_children) >=
19baf839 550 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 551 *
19baf839 552 * shorten again:
c877efb2 553 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 554 * tn->empty_children) >= inflate_threshold *
19baf839 555 * tnode_child_length(tn)
c877efb2 556 *
19baf839
RO
557 */
558
559 check_tnode(tn);
c877efb2 560
e6308be8
RO
561 /* Keep root node larger */
562
132adf54 563 if (!tn->parent)
e6308be8 564 inflate_threshold_use = inflate_threshold_root;
e905a9ed 565 else
e6308be8
RO
566 inflate_threshold_use = inflate_threshold;
567
2f36895a 568 err = 0;
05eee48c
RO
569 max_resize = 10;
570 while ((tn->full_children > 0 && max_resize-- &&
19baf839 571 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
e6308be8 572 inflate_threshold_use * tnode_child_length(tn))) {
19baf839 573
2f80b3c8
RO
574 old_tn = tn;
575 tn = inflate(t, tn);
576 if (IS_ERR(tn)) {
577 tn = old_tn;
2f36895a
RO
578#ifdef CONFIG_IP_FIB_TRIE_STATS
579 t->stats.resize_node_skipped++;
580#endif
581 break;
582 }
19baf839
RO
583 }
584
05eee48c
RO
585 if (max_resize < 0) {
586 if (!tn->parent)
587 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
588 inflate_threshold_root, tn->bits);
589 else
590 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
591 inflate_threshold, tn->bits);
592 }
593
19baf839
RO
594 check_tnode(tn);
595
596 /*
597 * Halve as long as the number of empty children in this
598 * node is above threshold.
599 */
2f36895a 600
e6308be8
RO
601
602 /* Keep root node larger */
603
132adf54 604 if (!tn->parent)
e6308be8 605 halve_threshold_use = halve_threshold_root;
e905a9ed 606 else
e6308be8
RO
607 halve_threshold_use = halve_threshold;
608
2f36895a 609 err = 0;
05eee48c
RO
610 max_resize = 10;
611 while (tn->bits > 1 && max_resize-- &&
19baf839 612 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 613 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 614
2f80b3c8
RO
615 old_tn = tn;
616 tn = halve(t, tn);
617 if (IS_ERR(tn)) {
618 tn = old_tn;
2f36895a
RO
619#ifdef CONFIG_IP_FIB_TRIE_STATS
620 t->stats.resize_node_skipped++;
621#endif
622 break;
623 }
624 }
19baf839 625
05eee48c
RO
626 if (max_resize < 0) {
627 if (!tn->parent)
628 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
629 halve_threshold_root, tn->bits);
630 else
631 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
632 halve_threshold, tn->bits);
633 }
c877efb2 634
19baf839 635 /* Only one child remains */
19baf839
RO
636 if (tn->empty_children == tnode_child_length(tn) - 1)
637 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 638 struct node *n;
19baf839 639
91b9a277 640 n = tn->child[i];
2373ce1c 641 if (!n)
91b9a277 642 continue;
91b9a277
OJ
643
644 /* compress one level */
645
06801916 646 node_set_parent(n, NULL);
91b9a277
OJ
647 tnode_free(tn);
648 return n;
19baf839
RO
649 }
650
651 return (struct node *) tn;
652}
653
2f80b3c8 654static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839
RO
655{
656 struct tnode *inode;
657 struct tnode *oldtnode = tn;
658 int olen = tnode_child_length(tn);
659 int i;
660
0c7770c7 661 pr_debug("In inflate\n");
19baf839
RO
662
663 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
664
0c7770c7 665 if (!tn)
2f80b3c8 666 return ERR_PTR(-ENOMEM);
2f36895a
RO
667
668 /*
c877efb2
SH
669 * Preallocate and store tnodes before the actual work so we
670 * don't get into an inconsistent state if memory allocation
671 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
672 * of tnode is ignored.
673 */
91b9a277
OJ
674
675 for (i = 0; i < olen; i++) {
2f36895a
RO
676 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
677
678 if (inode &&
679 IS_TNODE(inode) &&
680 inode->pos == oldtnode->pos + oldtnode->bits &&
681 inode->bits > 1) {
682 struct tnode *left, *right;
ab66b4a7 683 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 684
2f36895a
RO
685 left = tnode_new(inode->key&(~m), inode->pos + 1,
686 inode->bits - 1);
2f80b3c8
RO
687 if (!left)
688 goto nomem;
91b9a277 689
2f36895a
RO
690 right = tnode_new(inode->key|m, inode->pos + 1,
691 inode->bits - 1);
692
e905a9ed 693 if (!right) {
2f80b3c8
RO
694 tnode_free(left);
695 goto nomem;
e905a9ed 696 }
2f36895a
RO
697
698 put_child(t, tn, 2*i, (struct node *) left);
699 put_child(t, tn, 2*i+1, (struct node *) right);
700 }
701 }
702
91b9a277 703 for (i = 0; i < olen; i++) {
19baf839 704 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
705 struct tnode *left, *right;
706 int size, j;
c877efb2 707
19baf839
RO
708 /* An empty child */
709 if (node == NULL)
710 continue;
711
712 /* A leaf or an internal node with skipped bits */
713
c877efb2 714 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 715 tn->pos + tn->bits - 1) {
c877efb2 716 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
717 1) == 0)
718 put_child(t, tn, 2*i, node);
719 else
720 put_child(t, tn, 2*i+1, node);
721 continue;
722 }
723
724 /* An internal node with two children */
725 inode = (struct tnode *) node;
726
727 if (inode->bits == 1) {
728 put_child(t, tn, 2*i, inode->child[0]);
729 put_child(t, tn, 2*i+1, inode->child[1]);
730
731 tnode_free(inode);
91b9a277 732 continue;
19baf839
RO
733 }
734
91b9a277
OJ
735 /* An internal node with more than two children */
736
737 /* We will replace this node 'inode' with two new
738 * ones, 'left' and 'right', each with half of the
739 * original children. The two new nodes will have
740 * a position one bit further down the key and this
741 * means that the "significant" part of their keys
742 * (see the discussion near the top of this file)
743 * will differ by one bit, which will be "0" in
744 * left's key and "1" in right's key. Since we are
745 * moving the key position by one step, the bit that
746 * we are moving away from - the bit at position
747 * (inode->pos) - is the one that will differ between
748 * left and right. So... we synthesize that bit in the
749 * two new keys.
750 * The mask 'm' below will be a single "one" bit at
751 * the position (inode->pos)
752 */
19baf839 753
91b9a277
OJ
754 /* Use the old key, but set the new significant
755 * bit to zero.
756 */
2f36895a 757
91b9a277
OJ
758 left = (struct tnode *) tnode_get_child(tn, 2*i);
759 put_child(t, tn, 2*i, NULL);
2f36895a 760
91b9a277 761 BUG_ON(!left);
2f36895a 762
91b9a277
OJ
763 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
764 put_child(t, tn, 2*i+1, NULL);
19baf839 765
91b9a277 766 BUG_ON(!right);
19baf839 767
91b9a277
OJ
768 size = tnode_child_length(left);
769 for (j = 0; j < size; j++) {
770 put_child(t, left, j, inode->child[j]);
771 put_child(t, right, j, inode->child[j + size]);
19baf839 772 }
91b9a277
OJ
773 put_child(t, tn, 2*i, resize(t, left));
774 put_child(t, tn, 2*i+1, resize(t, right));
775
776 tnode_free(inode);
19baf839
RO
777 }
778 tnode_free(oldtnode);
779 return tn;
2f80b3c8
RO
780nomem:
781 {
782 int size = tnode_child_length(tn);
783 int j;
784
0c7770c7 785 for (j = 0; j < size; j++)
2f80b3c8
RO
786 if (tn->child[j])
787 tnode_free((struct tnode *)tn->child[j]);
788
789 tnode_free(tn);
0c7770c7 790
2f80b3c8
RO
791 return ERR_PTR(-ENOMEM);
792 }
19baf839
RO
793}
794
2f80b3c8 795static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
796{
797 struct tnode *oldtnode = tn;
798 struct node *left, *right;
799 int i;
800 int olen = tnode_child_length(tn);
801
0c7770c7 802 pr_debug("In halve\n");
c877efb2
SH
803
804 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 805
2f80b3c8
RO
806 if (!tn)
807 return ERR_PTR(-ENOMEM);
2f36895a
RO
808
809 /*
c877efb2
SH
810 * Preallocate and store tnodes before the actual work so we
811 * don't get into an inconsistent state if memory allocation
812 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
813 * of tnode is ignored.
814 */
815
91b9a277 816 for (i = 0; i < olen; i += 2) {
2f36895a
RO
817 left = tnode_get_child(oldtnode, i);
818 right = tnode_get_child(oldtnode, i+1);
c877efb2 819
2f36895a 820 /* Two nonempty children */
0c7770c7 821 if (left && right) {
2f80b3c8 822 struct tnode *newn;
0c7770c7 823
2f80b3c8 824 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
825
826 if (!newn)
2f80b3c8 827 goto nomem;
0c7770c7 828
2f80b3c8 829 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 830 }
2f36895a 831
2f36895a 832 }
19baf839 833
91b9a277
OJ
834 for (i = 0; i < olen; i += 2) {
835 struct tnode *newBinNode;
836
19baf839
RO
837 left = tnode_get_child(oldtnode, i);
838 right = tnode_get_child(oldtnode, i+1);
c877efb2 839
19baf839
RO
840 /* At least one of the children is empty */
841 if (left == NULL) {
842 if (right == NULL) /* Both are empty */
843 continue;
844 put_child(t, tn, i/2, right);
91b9a277 845 continue;
0c7770c7 846 }
91b9a277
OJ
847
848 if (right == NULL) {
19baf839 849 put_child(t, tn, i/2, left);
91b9a277
OJ
850 continue;
851 }
c877efb2 852
19baf839 853 /* Two nonempty children */
91b9a277
OJ
854 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
855 put_child(t, tn, i/2, NULL);
91b9a277
OJ
856 put_child(t, newBinNode, 0, left);
857 put_child(t, newBinNode, 1, right);
858 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
859 }
860 tnode_free(oldtnode);
861 return tn;
2f80b3c8
RO
862nomem:
863 {
864 int size = tnode_child_length(tn);
865 int j;
866
0c7770c7 867 for (j = 0; j < size; j++)
2f80b3c8
RO
868 if (tn->child[j])
869 tnode_free((struct tnode *)tn->child[j]);
870
871 tnode_free(tn);
0c7770c7 872
2f80b3c8
RO
873 return ERR_PTR(-ENOMEM);
874 }
19baf839
RO
875}
876
772cb712 877/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
878 via get_fa_head and dump */
879
772cb712 880static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 881{
772cb712 882 struct hlist_head *head = &l->list;
19baf839
RO
883 struct hlist_node *node;
884 struct leaf_info *li;
885
2373ce1c 886 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 887 if (li->plen == plen)
19baf839 888 return li;
91b9a277 889
19baf839
RO
890 return NULL;
891}
892
893static inline struct list_head * get_fa_head(struct leaf *l, int plen)
894{
772cb712 895 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 896
91b9a277
OJ
897 if (!li)
898 return NULL;
c877efb2 899
91b9a277 900 return &li->falh;
19baf839
RO
901}
902
903static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
904{
e905a9ed
YH
905 struct leaf_info *li = NULL, *last = NULL;
906 struct hlist_node *node;
907
908 if (hlist_empty(head)) {
909 hlist_add_head_rcu(&new->hlist, head);
910 } else {
911 hlist_for_each_entry(li, node, head, hlist) {
912 if (new->plen > li->plen)
913 break;
914
915 last = li;
916 }
917 if (last)
918 hlist_add_after_rcu(&last->hlist, &new->hlist);
919 else
920 hlist_add_before_rcu(&new->hlist, &li->hlist);
921 }
19baf839
RO
922}
923
2373ce1c
RO
924/* rcu_read_lock needs to be hold by caller from readside */
925
19baf839
RO
926static struct leaf *
927fib_find_node(struct trie *t, u32 key)
928{
929 int pos;
930 struct tnode *tn;
931 struct node *n;
932
933 pos = 0;
2373ce1c 934 n = rcu_dereference(t->trie);
19baf839
RO
935
936 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
937 tn = (struct tnode *) n;
91b9a277 938
19baf839 939 check_tnode(tn);
91b9a277 940
c877efb2 941 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 942 pos = tn->pos + tn->bits;
19baf839 943 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 944 } else
19baf839
RO
945 break;
946 }
947 /* Case we have found a leaf. Compare prefixes */
948
91b9a277
OJ
949 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
950 return (struct leaf *)n;
951
19baf839
RO
952 return NULL;
953}
954
955static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
956{
19baf839 957 int wasfull;
06801916
SH
958 t_key cindex, key = tn->key;
959 struct tnode *tp;
19baf839 960
06801916 961 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
962 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
963 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
964 tn = (struct tnode *) resize (t, (struct tnode *)tn);
965 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 966
06801916
SH
967 tp = node_parent((struct node *) tn);
968 if (!tp)
19baf839 969 break;
06801916 970 tn = tp;
19baf839 971 }
06801916 972
19baf839 973 /* Handle last (top) tnode */
c877efb2 974 if (IS_TNODE(tn))
19baf839
RO
975 tn = (struct tnode*) resize(t, (struct tnode *)tn);
976
977 return (struct node*) tn;
978}
979
2373ce1c
RO
980/* only used from updater-side */
981
fea86ad8 982static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
983{
984 int pos, newpos;
985 struct tnode *tp = NULL, *tn = NULL;
986 struct node *n;
987 struct leaf *l;
988 int missbit;
c877efb2 989 struct list_head *fa_head = NULL;
19baf839
RO
990 struct leaf_info *li;
991 t_key cindex;
992
993 pos = 0;
c877efb2 994 n = t->trie;
19baf839 995
c877efb2
SH
996 /* If we point to NULL, stop. Either the tree is empty and we should
997 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 998 * and we should just put our new leaf in that.
c877efb2
SH
999 * If we point to a T_TNODE, check if it matches our key. Note that
1000 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1001 * not be the parent's 'pos'+'bits'!
1002 *
c877efb2 1003 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1004 * the index from our key, push the T_TNODE and walk the tree.
1005 *
1006 * If it doesn't, we have to replace it with a new T_TNODE.
1007 *
c877efb2
SH
1008 * If we point to a T_LEAF, it might or might not have the same key
1009 * as we do. If it does, just change the value, update the T_LEAF's
1010 * value, and return it.
19baf839
RO
1011 * If it doesn't, we need to replace it with a T_TNODE.
1012 */
1013
1014 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1015 tn = (struct tnode *) n;
91b9a277 1016
c877efb2 1017 check_tnode(tn);
91b9a277 1018
c877efb2 1019 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1020 tp = tn;
91b9a277 1021 pos = tn->pos + tn->bits;
19baf839
RO
1022 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1023
06801916 1024 BUG_ON(n && node_parent(n) != tn);
91b9a277 1025 } else
19baf839
RO
1026 break;
1027 }
1028
1029 /*
1030 * n ----> NULL, LEAF or TNODE
1031 *
c877efb2 1032 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1033 */
1034
91b9a277 1035 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1036
1037 /* Case 1: n is a leaf. Compare prefixes */
1038
c877efb2 1039 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1040 struct leaf *l = (struct leaf *) n;
1041
19baf839 1042 li = leaf_info_new(plen);
91b9a277 1043
fea86ad8
SH
1044 if (!li)
1045 return NULL;
19baf839
RO
1046
1047 fa_head = &li->falh;
1048 insert_leaf_info(&l->list, li);
1049 goto done;
1050 }
1051 t->size++;
1052 l = leaf_new();
1053
fea86ad8
SH
1054 if (!l)
1055 return NULL;
19baf839
RO
1056
1057 l->key = key;
1058 li = leaf_info_new(plen);
1059
c877efb2 1060 if (!li) {
f835e471 1061 tnode_free((struct tnode *) l);
fea86ad8 1062 return NULL;
f835e471 1063 }
19baf839
RO
1064
1065 fa_head = &li->falh;
1066 insert_leaf_info(&l->list, li);
1067
19baf839 1068 if (t->trie && n == NULL) {
91b9a277 1069 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1070
06801916 1071 node_set_parent((struct node *)l, tp);
19baf839 1072
91b9a277
OJ
1073 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1074 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1075 } else {
1076 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1077 /*
1078 * Add a new tnode here
19baf839
RO
1079 * first tnode need some special handling
1080 */
1081
1082 if (tp)
91b9a277 1083 pos = tp->pos+tp->bits;
19baf839 1084 else
91b9a277
OJ
1085 pos = 0;
1086
c877efb2 1087 if (n) {
19baf839
RO
1088 newpos = tkey_mismatch(key, pos, n->key);
1089 tn = tnode_new(n->key, newpos, 1);
91b9a277 1090 } else {
19baf839 1091 newpos = 0;
c877efb2 1092 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1093 }
19baf839 1094
c877efb2 1095 if (!tn) {
f835e471
RO
1096 free_leaf_info(li);
1097 tnode_free((struct tnode *) l);
fea86ad8 1098 return NULL;
91b9a277
OJ
1099 }
1100
06801916 1101 node_set_parent((struct node *)tn, tp);
19baf839 1102
91b9a277 1103 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1104 put_child(t, tn, missbit, (struct node *)l);
1105 put_child(t, tn, 1-missbit, n);
1106
c877efb2 1107 if (tp) {
19baf839
RO
1108 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1109 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1110 } else {
2373ce1c 1111 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
19baf839
RO
1112 tp = tn;
1113 }
1114 }
91b9a277
OJ
1115
1116 if (tp && tp->pos + tp->bits > 32)
78c6671a 1117 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1118 tp, tp->pos, tp->bits, key, plen);
91b9a277 1119
19baf839 1120 /* Rebalance the trie */
2373ce1c
RO
1121
1122 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471 1123done:
19baf839
RO
1124 return fa_head;
1125}
1126
d562f1f8
RO
1127/*
1128 * Caller must hold RTNL.
1129 */
4e902c57 1130static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1131{
1132 struct trie *t = (struct trie *) tb->tb_data;
1133 struct fib_alias *fa, *new_fa;
c877efb2 1134 struct list_head *fa_head = NULL;
19baf839 1135 struct fib_info *fi;
4e902c57
TG
1136 int plen = cfg->fc_dst_len;
1137 u8 tos = cfg->fc_tos;
19baf839
RO
1138 u32 key, mask;
1139 int err;
1140 struct leaf *l;
1141
1142 if (plen > 32)
1143 return -EINVAL;
1144
4e902c57 1145 key = ntohl(cfg->fc_dst);
19baf839 1146
2dfe55b4 1147 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1148
91b9a277 1149 mask = ntohl(inet_make_mask(plen));
19baf839 1150
c877efb2 1151 if (key & ~mask)
19baf839
RO
1152 return -EINVAL;
1153
1154 key = key & mask;
1155
4e902c57
TG
1156 fi = fib_create_info(cfg);
1157 if (IS_ERR(fi)) {
1158 err = PTR_ERR(fi);
19baf839 1159 goto err;
4e902c57 1160 }
19baf839
RO
1161
1162 l = fib_find_node(t, key);
c877efb2 1163 fa = NULL;
19baf839 1164
c877efb2 1165 if (l) {
19baf839
RO
1166 fa_head = get_fa_head(l, plen);
1167 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1168 }
1169
1170 /* Now fa, if non-NULL, points to the first fib alias
1171 * with the same keys [prefix,tos,priority], if such key already
1172 * exists or to the node before which we will insert new one.
1173 *
1174 * If fa is NULL, we will need to allocate a new one and
1175 * insert to the head of f.
1176 *
1177 * If f is NULL, no fib node matched the destination key
1178 * and we need to allocate a new one of those as well.
1179 */
1180
91b9a277 1181 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1182 struct fib_alias *fa_orig;
1183
1184 err = -EEXIST;
4e902c57 1185 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1186 goto out;
1187
4e902c57 1188 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1189 struct fib_info *fi_drop;
1190 u8 state;
1191
6725033f
JP
1192 if (fi->fib_treeref > 1)
1193 goto out;
1194
2373ce1c 1195 err = -ENOBUFS;
e94b1766 1196 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1197 if (new_fa == NULL)
1198 goto out;
19baf839
RO
1199
1200 fi_drop = fa->fa_info;
2373ce1c
RO
1201 new_fa->fa_tos = fa->fa_tos;
1202 new_fa->fa_info = fi;
4e902c57
TG
1203 new_fa->fa_type = cfg->fc_type;
1204 new_fa->fa_scope = cfg->fc_scope;
19baf839 1205 state = fa->fa_state;
2373ce1c 1206 new_fa->fa_state &= ~FA_S_ACCESSED;
19baf839 1207
2373ce1c
RO
1208 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1209 alias_free_mem_rcu(fa);
19baf839
RO
1210
1211 fib_release_info(fi_drop);
1212 if (state & FA_S_ACCESSED)
91b9a277 1213 rt_cache_flush(-1);
b8f55831
MK
1214 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1215 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1216
91b9a277 1217 goto succeeded;
19baf839
RO
1218 }
1219 /* Error if we find a perfect match which
1220 * uses the same scope, type, and nexthop
1221 * information.
1222 */
1223 fa_orig = fa;
1224 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1225 if (fa->fa_tos != tos)
1226 break;
1227 if (fa->fa_info->fib_priority != fi->fib_priority)
1228 break;
4e902c57
TG
1229 if (fa->fa_type == cfg->fc_type &&
1230 fa->fa_scope == cfg->fc_scope &&
19baf839
RO
1231 fa->fa_info == fi) {
1232 goto out;
1233 }
1234 }
4e902c57 1235 if (!(cfg->fc_nlflags & NLM_F_APPEND))
19baf839
RO
1236 fa = fa_orig;
1237 }
1238 err = -ENOENT;
4e902c57 1239 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1240 goto out;
1241
1242 err = -ENOBUFS;
e94b1766 1243 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1244 if (new_fa == NULL)
1245 goto out;
1246
1247 new_fa->fa_info = fi;
1248 new_fa->fa_tos = tos;
4e902c57
TG
1249 new_fa->fa_type = cfg->fc_type;
1250 new_fa->fa_scope = cfg->fc_scope;
19baf839 1251 new_fa->fa_state = 0;
19baf839
RO
1252 /*
1253 * Insert new entry to the list.
1254 */
1255
c877efb2 1256 if (!fa_head) {
fea86ad8
SH
1257 fa_head = fib_insert_node(t, key, plen);
1258 if (unlikely(!fa_head)) {
1259 err = -ENOMEM;
f835e471 1260 goto out_free_new_fa;
fea86ad8 1261 }
f835e471 1262 }
19baf839 1263
2373ce1c
RO
1264 list_add_tail_rcu(&new_fa->fa_list,
1265 (fa ? &fa->fa_list : fa_head));
19baf839
RO
1266
1267 rt_cache_flush(-1);
4e902c57 1268 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1269 &cfg->fc_nlinfo, 0);
19baf839
RO
1270succeeded:
1271 return 0;
f835e471
RO
1272
1273out_free_new_fa:
1274 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1275out:
1276 fib_release_info(fi);
91b9a277 1277err:
19baf839
RO
1278 return err;
1279}
1280
2373ce1c 1281
772cb712 1282/* should be called with rcu_read_lock */
0c7770c7
SH
1283static inline int check_leaf(struct trie *t, struct leaf *l,
1284 t_key key, int *plen, const struct flowi *flp,
06c74270 1285 struct fib_result *res)
19baf839 1286{
06c74270 1287 int err, i;
888454c5 1288 __be32 mask;
19baf839
RO
1289 struct leaf_info *li;
1290 struct hlist_head *hhead = &l->list;
1291 struct hlist_node *node;
c877efb2 1292
2373ce1c 1293 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
19baf839 1294 i = li->plen;
888454c5
AV
1295 mask = inet_make_mask(i);
1296 if (l->key != (key & ntohl(mask)))
19baf839
RO
1297 continue;
1298
888454c5 1299 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
19baf839
RO
1300 *plen = i;
1301#ifdef CONFIG_IP_FIB_TRIE_STATS
1302 t->stats.semantic_match_passed++;
1303#endif
06c74270 1304 return err;
19baf839
RO
1305 }
1306#ifdef CONFIG_IP_FIB_TRIE_STATS
1307 t->stats.semantic_match_miss++;
1308#endif
1309 }
06c74270 1310 return 1;
19baf839
RO
1311}
1312
1313static int
1314fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1315{
1316 struct trie *t = (struct trie *) tb->tb_data;
1317 int plen, ret = 0;
1318 struct node *n;
1319 struct tnode *pn;
1320 int pos, bits;
91b9a277 1321 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1322 int chopped_off;
1323 t_key cindex = 0;
1324 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1325 struct tnode *cn;
1326 t_key node_prefix, key_prefix, pref_mismatch;
1327 int mp;
1328
2373ce1c 1329 rcu_read_lock();
91b9a277 1330
2373ce1c 1331 n = rcu_dereference(t->trie);
c877efb2 1332 if (!n)
19baf839
RO
1333 goto failed;
1334
1335#ifdef CONFIG_IP_FIB_TRIE_STATS
1336 t->stats.gets++;
1337#endif
1338
1339 /* Just a leaf? */
1340 if (IS_LEAF(n)) {
06c74270 1341 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1342 goto found;
1343 goto failed;
1344 }
1345 pn = (struct tnode *) n;
1346 chopped_off = 0;
c877efb2 1347
91b9a277 1348 while (pn) {
19baf839
RO
1349 pos = pn->pos;
1350 bits = pn->bits;
1351
c877efb2 1352 if (!chopped_off)
ab66b4a7
SH
1353 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1354 pos, bits);
19baf839
RO
1355
1356 n = tnode_get_child(pn, cindex);
1357
1358 if (n == NULL) {
1359#ifdef CONFIG_IP_FIB_TRIE_STATS
1360 t->stats.null_node_hit++;
1361#endif
1362 goto backtrace;
1363 }
1364
91b9a277
OJ
1365 if (IS_LEAF(n)) {
1366 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1367 goto found;
1368 else
1369 goto backtrace;
1370 }
1371
19baf839
RO
1372#define HL_OPTIMIZE
1373#ifdef HL_OPTIMIZE
91b9a277 1374 cn = (struct tnode *)n;
19baf839 1375
91b9a277
OJ
1376 /*
1377 * It's a tnode, and we can do some extra checks here if we
1378 * like, to avoid descending into a dead-end branch.
1379 * This tnode is in the parent's child array at index
1380 * key[p_pos..p_pos+p_bits] but potentially with some bits
1381 * chopped off, so in reality the index may be just a
1382 * subprefix, padded with zero at the end.
1383 * We can also take a look at any skipped bits in this
1384 * tnode - everything up to p_pos is supposed to be ok,
1385 * and the non-chopped bits of the index (se previous
1386 * paragraph) are also guaranteed ok, but the rest is
1387 * considered unknown.
1388 *
1389 * The skipped bits are key[pos+bits..cn->pos].
1390 */
19baf839 1391
91b9a277
OJ
1392 /* If current_prefix_length < pos+bits, we are already doing
1393 * actual prefix matching, which means everything from
1394 * pos+(bits-chopped_off) onward must be zero along some
1395 * branch of this subtree - otherwise there is *no* valid
1396 * prefix present. Here we can only check the skipped
1397 * bits. Remember, since we have already indexed into the
1398 * parent's child array, we know that the bits we chopped of
1399 * *are* zero.
1400 */
19baf839 1401
91b9a277 1402 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
19baf839 1403
91b9a277
OJ
1404 if (current_prefix_length < pos+bits) {
1405 if (tkey_extract_bits(cn->key, current_prefix_length,
1406 cn->pos - current_prefix_length) != 0 ||
1407 !(cn->child[0]))
1408 goto backtrace;
1409 }
19baf839 1410
91b9a277
OJ
1411 /*
1412 * If chopped_off=0, the index is fully validated and we
1413 * only need to look at the skipped bits for this, the new,
1414 * tnode. What we actually want to do is to find out if
1415 * these skipped bits match our key perfectly, or if we will
1416 * have to count on finding a matching prefix further down,
1417 * because if we do, we would like to have some way of
1418 * verifying the existence of such a prefix at this point.
1419 */
19baf839 1420
91b9a277
OJ
1421 /* The only thing we can do at this point is to verify that
1422 * any such matching prefix can indeed be a prefix to our
1423 * key, and if the bits in the node we are inspecting that
1424 * do not match our key are not ZERO, this cannot be true.
1425 * Thus, find out where there is a mismatch (before cn->pos)
1426 * and verify that all the mismatching bits are zero in the
1427 * new tnode's key.
1428 */
19baf839 1429
91b9a277
OJ
1430 /* Note: We aren't very concerned about the piece of the key
1431 * that precede pn->pos+pn->bits, since these have already been
1432 * checked. The bits after cn->pos aren't checked since these are
1433 * by definition "unknown" at this point. Thus, what we want to
1434 * see is if we are about to enter the "prefix matching" state,
1435 * and in that case verify that the skipped bits that will prevail
1436 * throughout this subtree are zero, as they have to be if we are
1437 * to find a matching prefix.
1438 */
1439
ab66b4a7
SH
1440 node_prefix = mask_pfx(cn->key, cn->pos);
1441 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1442 pref_mismatch = key_prefix^node_prefix;
1443 mp = 0;
1444
1445 /* In short: If skipped bits in this node do not match the search
1446 * key, enter the "prefix matching" state.directly.
1447 */
1448 if (pref_mismatch) {
1449 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1450 mp++;
1451 pref_mismatch = pref_mismatch <<1;
1452 }
1453 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1454
1455 if (key_prefix != 0)
1456 goto backtrace;
1457
1458 if (current_prefix_length >= cn->pos)
1459 current_prefix_length = mp;
c877efb2 1460 }
91b9a277
OJ
1461#endif
1462 pn = (struct tnode *)n; /* Descend */
1463 chopped_off = 0;
1464 continue;
1465
19baf839
RO
1466backtrace:
1467 chopped_off++;
1468
1469 /* As zero don't change the child key (cindex) */
91b9a277 1470 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
19baf839 1471 chopped_off++;
19baf839
RO
1472
1473 /* Decrease current_... with bits chopped off */
1474 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1475 current_prefix_length = pn->pos + pn->bits - chopped_off;
91b9a277 1476
19baf839 1477 /*
c877efb2 1478 * Either we do the actual chop off according or if we have
19baf839
RO
1479 * chopped off all bits in this tnode walk up to our parent.
1480 */
1481
91b9a277 1482 if (chopped_off <= pn->bits) {
19baf839 1483 cindex &= ~(1 << (chopped_off-1));
91b9a277 1484 } else {
06801916
SH
1485 struct tnode *parent = node_parent((struct node *) pn);
1486 if (!parent)
19baf839 1487 goto failed;
91b9a277 1488
19baf839 1489 /* Get Child's index */
06801916
SH
1490 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1491 pn = parent;
19baf839
RO
1492 chopped_off = 0;
1493
1494#ifdef CONFIG_IP_FIB_TRIE_STATS
1495 t->stats.backtrack++;
1496#endif
1497 goto backtrace;
c877efb2 1498 }
19baf839
RO
1499 }
1500failed:
c877efb2 1501 ret = 1;
19baf839 1502found:
2373ce1c 1503 rcu_read_unlock();
19baf839
RO
1504 return ret;
1505}
1506
2373ce1c 1507/* only called from updater side */
19baf839
RO
1508static int trie_leaf_remove(struct trie *t, t_key key)
1509{
1510 t_key cindex;
1511 struct tnode *tp = NULL;
1512 struct node *n = t->trie;
1513 struct leaf *l;
1514
0c7770c7 1515 pr_debug("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1516
1517 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1518 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1519 * T_LEAF may or may not match our key.
1520 */
1521
91b9a277 1522 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1523 struct tnode *tn = (struct tnode *) n;
1524 check_tnode(tn);
1525 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1526
06801916 1527 BUG_ON(n && node_parent(n) != tn);
91b9a277 1528 }
19baf839
RO
1529 l = (struct leaf *) n;
1530
c877efb2 1531 if (!n || !tkey_equals(l->key, key))
19baf839 1532 return 0;
c877efb2
SH
1533
1534 /*
1535 * Key found.
1536 * Remove the leaf and rebalance the tree
19baf839
RO
1537 */
1538
19baf839
RO
1539 t->size--;
1540
06801916 1541 tp = node_parent(n);
19baf839
RO
1542 tnode_free((struct tnode *) n);
1543
c877efb2 1544 if (tp) {
19baf839
RO
1545 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1546 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1547 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1548 } else
2373ce1c 1549 rcu_assign_pointer(t->trie, NULL);
19baf839
RO
1550
1551 return 1;
1552}
1553
d562f1f8
RO
1554/*
1555 * Caller must hold RTNL.
1556 */
4e902c57 1557static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1558{
1559 struct trie *t = (struct trie *) tb->tb_data;
1560 u32 key, mask;
4e902c57
TG
1561 int plen = cfg->fc_dst_len;
1562 u8 tos = cfg->fc_tos;
19baf839
RO
1563 struct fib_alias *fa, *fa_to_delete;
1564 struct list_head *fa_head;
1565 struct leaf *l;
91b9a277
OJ
1566 struct leaf_info *li;
1567
c877efb2 1568 if (plen > 32)
19baf839
RO
1569 return -EINVAL;
1570
4e902c57 1571 key = ntohl(cfg->fc_dst);
91b9a277 1572 mask = ntohl(inet_make_mask(plen));
19baf839 1573
c877efb2 1574 if (key & ~mask)
19baf839
RO
1575 return -EINVAL;
1576
1577 key = key & mask;
1578 l = fib_find_node(t, key);
1579
c877efb2 1580 if (!l)
19baf839
RO
1581 return -ESRCH;
1582
1583 fa_head = get_fa_head(l, plen);
1584 fa = fib_find_alias(fa_head, tos, 0);
1585
1586 if (!fa)
1587 return -ESRCH;
1588
0c7770c7 1589 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1590
1591 fa_to_delete = NULL;
1592 fa_head = fa->fa_list.prev;
2373ce1c 1593
19baf839
RO
1594 list_for_each_entry(fa, fa_head, fa_list) {
1595 struct fib_info *fi = fa->fa_info;
1596
1597 if (fa->fa_tos != tos)
1598 break;
1599
4e902c57
TG
1600 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1601 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1602 fa->fa_scope == cfg->fc_scope) &&
1603 (!cfg->fc_protocol ||
1604 fi->fib_protocol == cfg->fc_protocol) &&
1605 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1606 fa_to_delete = fa;
1607 break;
1608 }
1609 }
1610
91b9a277
OJ
1611 if (!fa_to_delete)
1612 return -ESRCH;
19baf839 1613
91b9a277 1614 fa = fa_to_delete;
4e902c57 1615 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1616 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1617
1618 l = fib_find_node(t, key);
772cb712 1619 li = find_leaf_info(l, plen);
19baf839 1620
2373ce1c 1621 list_del_rcu(&fa->fa_list);
19baf839 1622
91b9a277 1623 if (list_empty(fa_head)) {
2373ce1c 1624 hlist_del_rcu(&li->hlist);
91b9a277 1625 free_leaf_info(li);
2373ce1c 1626 }
19baf839 1627
91b9a277
OJ
1628 if (hlist_empty(&l->list))
1629 trie_leaf_remove(t, key);
19baf839 1630
91b9a277
OJ
1631 if (fa->fa_state & FA_S_ACCESSED)
1632 rt_cache_flush(-1);
19baf839 1633
2373ce1c
RO
1634 fib_release_info(fa->fa_info);
1635 alias_free_mem_rcu(fa);
91b9a277 1636 return 0;
19baf839
RO
1637}
1638
1639static int trie_flush_list(struct trie *t, struct list_head *head)
1640{
1641 struct fib_alias *fa, *fa_node;
1642 int found = 0;
1643
1644 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1645 struct fib_info *fi = fa->fa_info;
19baf839 1646
2373ce1c
RO
1647 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1648 list_del_rcu(&fa->fa_list);
1649 fib_release_info(fa->fa_info);
1650 alias_free_mem_rcu(fa);
19baf839
RO
1651 found++;
1652 }
1653 }
1654 return found;
1655}
1656
1657static int trie_flush_leaf(struct trie *t, struct leaf *l)
1658{
1659 int found = 0;
1660 struct hlist_head *lih = &l->list;
1661 struct hlist_node *node, *tmp;
1662 struct leaf_info *li = NULL;
1663
1664 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1665 found += trie_flush_list(t, &li->falh);
1666
1667 if (list_empty(&li->falh)) {
2373ce1c 1668 hlist_del_rcu(&li->hlist);
19baf839
RO
1669 free_leaf_info(li);
1670 }
1671 }
1672 return found;
1673}
1674
2373ce1c
RO
1675/* rcu_read_lock needs to be hold by caller from readside */
1676
19baf839
RO
1677static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1678{
1679 struct node *c = (struct node *) thisleaf;
1680 struct tnode *p;
1681 int idx;
2373ce1c 1682 struct node *trie = rcu_dereference(t->trie);
19baf839 1683
c877efb2 1684 if (c == NULL) {
2373ce1c 1685 if (trie == NULL)
19baf839
RO
1686 return NULL;
1687
2373ce1c
RO
1688 if (IS_LEAF(trie)) /* trie w. just a leaf */
1689 return (struct leaf *) trie;
19baf839 1690
2373ce1c 1691 p = (struct tnode*) trie; /* Start */
91b9a277 1692 } else
06801916 1693 p = node_parent(c);
c877efb2 1694
19baf839
RO
1695 while (p) {
1696 int pos, last;
1697
1698 /* Find the next child of the parent */
c877efb2
SH
1699 if (c)
1700 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1701 else
19baf839
RO
1702 pos = 0;
1703
1704 last = 1 << p->bits;
91b9a277 1705 for (idx = pos; idx < last ; idx++) {
2373ce1c
RO
1706 c = rcu_dereference(p->child[idx]);
1707
1708 if (!c)
91b9a277
OJ
1709 continue;
1710
1711 /* Decend if tnode */
2373ce1c
RO
1712 while (IS_TNODE(c)) {
1713 p = (struct tnode *) c;
e905a9ed 1714 idx = 0;
91b9a277
OJ
1715
1716 /* Rightmost non-NULL branch */
1717 if (p && IS_TNODE(p))
2373ce1c
RO
1718 while (!(c = rcu_dereference(p->child[idx]))
1719 && idx < (1<<p->bits)) idx++;
91b9a277
OJ
1720
1721 /* Done with this tnode? */
2373ce1c 1722 if (idx >= (1 << p->bits) || !c)
91b9a277 1723 goto up;
19baf839 1724 }
2373ce1c 1725 return (struct leaf *) c;
19baf839
RO
1726 }
1727up:
1728 /* No more children go up one step */
91b9a277 1729 c = (struct node *) p;
06801916 1730 p = node_parent(c);
19baf839
RO
1731 }
1732 return NULL; /* Ready. Root of trie */
1733}
1734
d562f1f8
RO
1735/*
1736 * Caller must hold RTNL.
1737 */
19baf839
RO
1738static int fn_trie_flush(struct fib_table *tb)
1739{
1740 struct trie *t = (struct trie *) tb->tb_data;
1741 struct leaf *ll = NULL, *l = NULL;
1742 int found = 0, h;
1743
91b9a277 1744 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1745 found += trie_flush_leaf(t, l);
1746
1747 if (ll && hlist_empty(&ll->list))
1748 trie_leaf_remove(t, ll->key);
1749 ll = l;
1750 }
1751
1752 if (ll && hlist_empty(&ll->list))
1753 trie_leaf_remove(t, ll->key);
1754
0c7770c7 1755 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1756 return found;
1757}
1758
19baf839
RO
1759static void
1760fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1761{
1762 struct trie *t = (struct trie *) tb->tb_data;
1763 int order, last_idx;
1764 struct fib_info *fi = NULL;
1765 struct fib_info *last_resort;
1766 struct fib_alias *fa = NULL;
1767 struct list_head *fa_head;
1768 struct leaf *l;
1769
1770 last_idx = -1;
1771 last_resort = NULL;
1772 order = -1;
1773
2373ce1c 1774 rcu_read_lock();
c877efb2 1775
19baf839 1776 l = fib_find_node(t, 0);
c877efb2 1777 if (!l)
19baf839
RO
1778 goto out;
1779
1780 fa_head = get_fa_head(l, 0);
c877efb2 1781 if (!fa_head)
19baf839
RO
1782 goto out;
1783
c877efb2 1784 if (list_empty(fa_head))
19baf839
RO
1785 goto out;
1786
2373ce1c 1787 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1788 struct fib_info *next_fi = fa->fa_info;
91b9a277 1789
19baf839
RO
1790 if (fa->fa_scope != res->scope ||
1791 fa->fa_type != RTN_UNICAST)
1792 continue;
91b9a277 1793
19baf839
RO
1794 if (next_fi->fib_priority > res->fi->fib_priority)
1795 break;
1796 if (!next_fi->fib_nh[0].nh_gw ||
1797 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1798 continue;
1799 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1800
19baf839
RO
1801 if (fi == NULL) {
1802 if (next_fi != res->fi)
1803 break;
1804 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1805 &last_idx, tb->tb_default)) {
a2bbe682 1806 fib_result_assign(res, fi);
971b893e 1807 tb->tb_default = order;
19baf839
RO
1808 goto out;
1809 }
1810 fi = next_fi;
1811 order++;
1812 }
1813 if (order <= 0 || fi == NULL) {
971b893e 1814 tb->tb_default = -1;
19baf839
RO
1815 goto out;
1816 }
1817
971b893e
DL
1818 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1819 tb->tb_default)) {
a2bbe682 1820 fib_result_assign(res, fi);
971b893e 1821 tb->tb_default = order;
19baf839
RO
1822 goto out;
1823 }
a2bbe682
DL
1824 if (last_idx >= 0)
1825 fib_result_assign(res, last_resort);
971b893e
DL
1826 tb->tb_default = last_idx;
1827out:
2373ce1c 1828 rcu_read_unlock();
19baf839
RO
1829}
1830
c877efb2 1831static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1832 struct sk_buff *skb, struct netlink_callback *cb)
1833{
1834 int i, s_i;
1835 struct fib_alias *fa;
1836
32ab5f80 1837 __be32 xkey = htonl(key);
19baf839 1838
1af5a8c4 1839 s_i = cb->args[4];
19baf839
RO
1840 i = 0;
1841
2373ce1c
RO
1842 /* rcu_read_lock is hold by caller */
1843
1844 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1845 if (i < s_i) {
1846 i++;
1847 continue;
1848 }
78c6671a 1849 BUG_ON(!fa->fa_info);
19baf839
RO
1850
1851 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1852 cb->nlh->nlmsg_seq,
1853 RTM_NEWROUTE,
1854 tb->tb_id,
1855 fa->fa_type,
1856 fa->fa_scope,
be403ea1 1857 xkey,
19baf839
RO
1858 plen,
1859 fa->fa_tos,
90f66914 1860 fa->fa_info, 0) < 0) {
1af5a8c4 1861 cb->args[4] = i;
19baf839 1862 return -1;
91b9a277 1863 }
19baf839
RO
1864 i++;
1865 }
1af5a8c4 1866 cb->args[4] = i;
19baf839
RO
1867 return skb->len;
1868}
1869
c877efb2 1870static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1871 struct netlink_callback *cb)
1872{
1873 int h, s_h;
1874 struct list_head *fa_head;
1875 struct leaf *l = NULL;
19baf839 1876
1af5a8c4 1877 s_h = cb->args[3];
19baf839 1878
91b9a277 1879 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1880 if (h < s_h)
1881 continue;
1882 if (h > s_h)
1af5a8c4
PM
1883 memset(&cb->args[4], 0,
1884 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839
RO
1885
1886 fa_head = get_fa_head(l, plen);
91b9a277 1887
c877efb2 1888 if (!fa_head)
19baf839
RO
1889 continue;
1890
c877efb2 1891 if (list_empty(fa_head))
19baf839
RO
1892 continue;
1893
1894 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1af5a8c4 1895 cb->args[3] = h;
19baf839
RO
1896 return -1;
1897 }
1898 }
1af5a8c4 1899 cb->args[3] = h;
19baf839
RO
1900 return skb->len;
1901}
1902
1903static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1904{
1905 int m, s_m;
1906 struct trie *t = (struct trie *) tb->tb_data;
1907
1af5a8c4 1908 s_m = cb->args[2];
19baf839 1909
2373ce1c 1910 rcu_read_lock();
91b9a277 1911 for (m = 0; m <= 32; m++) {
19baf839
RO
1912 if (m < s_m)
1913 continue;
1914 if (m > s_m)
1af5a8c4
PM
1915 memset(&cb->args[3], 0,
1916 sizeof(cb->args) - 3*sizeof(cb->args[0]));
19baf839
RO
1917
1918 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1af5a8c4 1919 cb->args[2] = m;
19baf839
RO
1920 goto out;
1921 }
1922 }
2373ce1c 1923 rcu_read_unlock();
1af5a8c4 1924 cb->args[2] = m;
19baf839 1925 return skb->len;
91b9a277 1926out:
2373ce1c 1927 rcu_read_unlock();
19baf839
RO
1928 return -1;
1929}
1930
1931/* Fix more generic FIB names for init later */
1932
7b1a74fd 1933struct fib_table *fib_hash_init(u32 id)
19baf839
RO
1934{
1935 struct fib_table *tb;
1936 struct trie *t;
1937
1938 if (fn_alias_kmem == NULL)
1939 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1940 sizeof(struct fib_alias),
1941 0, SLAB_HWCACHE_ALIGN,
20c2df83 1942 NULL);
19baf839
RO
1943
1944 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1945 GFP_KERNEL);
1946 if (tb == NULL)
1947 return NULL;
1948
1949 tb->tb_id = id;
971b893e 1950 tb->tb_default = -1;
19baf839
RO
1951 tb->tb_lookup = fn_trie_lookup;
1952 tb->tb_insert = fn_trie_insert;
1953 tb->tb_delete = fn_trie_delete;
1954 tb->tb_flush = fn_trie_flush;
1955 tb->tb_select_default = fn_trie_select_default;
1956 tb->tb_dump = fn_trie_dump;
19baf839
RO
1957
1958 t = (struct trie *) tb->tb_data;
c28a1cf4 1959 memset(t, 0, sizeof(*t));
19baf839 1960
19baf839 1961 if (id == RT_TABLE_LOCAL)
78c6671a 1962 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
1963
1964 return tb;
1965}
1966
cb7b593c
SH
1967#ifdef CONFIG_PROC_FS
1968/* Depth first Trie walk iterator */
1969struct fib_trie_iter {
1c340b2f 1970 struct seq_net_private p;
877a9bff 1971 struct trie *trie_local, *trie_main;
cb7b593c
SH
1972 struct tnode *tnode;
1973 struct trie *trie;
1974 unsigned index;
1975 unsigned depth;
1976};
19baf839 1977
cb7b593c 1978static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1979{
cb7b593c
SH
1980 struct tnode *tn = iter->tnode;
1981 unsigned cindex = iter->index;
1982 struct tnode *p;
19baf839 1983
6640e697
EB
1984 /* A single entry routing table */
1985 if (!tn)
1986 return NULL;
1987
cb7b593c
SH
1988 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1989 iter->tnode, iter->index, iter->depth);
1990rescan:
1991 while (cindex < (1<<tn->bits)) {
1992 struct node *n = tnode_get_child(tn, cindex);
19baf839 1993
cb7b593c
SH
1994 if (n) {
1995 if (IS_LEAF(n)) {
1996 iter->tnode = tn;
1997 iter->index = cindex + 1;
1998 } else {
1999 /* push down one level */
2000 iter->tnode = (struct tnode *) n;
2001 iter->index = 0;
2002 ++iter->depth;
2003 }
2004 return n;
2005 }
19baf839 2006
cb7b593c
SH
2007 ++cindex;
2008 }
91b9a277 2009
cb7b593c 2010 /* Current node exhausted, pop back up */
06801916 2011 p = node_parent((struct node *)tn);
cb7b593c
SH
2012 if (p) {
2013 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2014 tn = p;
2015 --iter->depth;
2016 goto rescan;
19baf839 2017 }
cb7b593c
SH
2018
2019 /* got root? */
2020 return NULL;
19baf839
RO
2021}
2022
cb7b593c
SH
2023static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2024 struct trie *t)
19baf839 2025{
5ddf0eb2
RO
2026 struct node *n ;
2027
132adf54 2028 if (!t)
5ddf0eb2
RO
2029 return NULL;
2030
2031 n = rcu_dereference(t->trie);
2032
132adf54 2033 if (!iter)
5ddf0eb2 2034 return NULL;
19baf839 2035
6640e697
EB
2036 if (n) {
2037 if (IS_TNODE(n)) {
2038 iter->tnode = (struct tnode *) n;
2039 iter->trie = t;
2040 iter->index = 0;
2041 iter->depth = 1;
2042 } else {
2043 iter->tnode = NULL;
2044 iter->trie = t;
2045 iter->index = 0;
2046 iter->depth = 0;
2047 }
cb7b593c 2048 return n;
91b9a277 2049 }
cb7b593c
SH
2050 return NULL;
2051}
91b9a277 2052
cb7b593c
SH
2053static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2054{
2055 struct node *n;
2056 struct fib_trie_iter iter;
91b9a277 2057
cb7b593c 2058 memset(s, 0, sizeof(*s));
91b9a277 2059
cb7b593c
SH
2060 rcu_read_lock();
2061 for (n = fib_trie_get_first(&iter, t); n;
2062 n = fib_trie_get_next(&iter)) {
2063 if (IS_LEAF(n)) {
2064 s->leaves++;
2065 s->totdepth += iter.depth;
2066 if (iter.depth > s->maxdepth)
2067 s->maxdepth = iter.depth;
2068 } else {
2069 const struct tnode *tn = (const struct tnode *) n;
2070 int i;
2071
2072 s->tnodes++;
132adf54 2073 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2074 s->nodesizes[tn->bits]++;
2075
cb7b593c
SH
2076 for (i = 0; i < (1<<tn->bits); i++)
2077 if (!tn->child[i])
2078 s->nullpointers++;
19baf839 2079 }
19baf839 2080 }
2373ce1c 2081 rcu_read_unlock();
19baf839
RO
2082}
2083
cb7b593c
SH
2084/*
2085 * This outputs /proc/net/fib_triestats
2086 */
2087static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2088{
cb7b593c 2089 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2090
cb7b593c
SH
2091 if (stat->leaves)
2092 avdepth = stat->totdepth*100 / stat->leaves;
2093 else
2094 avdepth = 0;
91b9a277 2095
187b5188 2096 seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 );
cb7b593c 2097 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2098
cb7b593c 2099 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
91b9a277 2100
cb7b593c 2101 bytes = sizeof(struct leaf) * stat->leaves;
187b5188 2102 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2103 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2104
06ef921d
RO
2105 max = MAX_STAT_DEPTH;
2106 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2107 max--;
19baf839 2108
cb7b593c
SH
2109 pointers = 0;
2110 for (i = 1; i <= max; i++)
2111 if (stat->nodesizes[i] != 0) {
187b5188 2112 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2113 pointers += (1<<i) * stat->nodesizes[i];
2114 }
2115 seq_putc(seq, '\n');
187b5188 2116 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2117
cb7b593c 2118 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2119 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2120 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2121}
2373ce1c 2122
cb7b593c 2123#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2124static void trie_show_usage(struct seq_file *seq,
2125 const struct trie_use_stats *stats)
2126{
2127 seq_printf(seq, "\nCounters:\n---------\n");
2128 seq_printf(seq,"gets = %u\n", stats->gets);
2129 seq_printf(seq,"backtracks = %u\n", stats->backtrack);
2130 seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed);
2131 seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss);
2132 seq_printf(seq,"null node hit= %u\n", stats->null_node_hit);
2133 seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped);
cb7b593c 2134}
66a2f7fd
SH
2135#endif /* CONFIG_IP_FIB_TRIE_STATS */
2136
19baf839 2137
cb7b593c
SH
2138static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2139{
1c340b2f 2140 struct net *net = (struct net *)seq->private;
877a9bff 2141 struct trie *trie_local, *trie_main;
cb7b593c 2142 struct trie_stat *stat;
877a9bff
EB
2143 struct fib_table *tb;
2144
2145 trie_local = NULL;
1c340b2f 2146 tb = fib_get_table(net, RT_TABLE_LOCAL);
877a9bff
EB
2147 if (tb)
2148 trie_local = (struct trie *) tb->tb_data;
2149
2150 trie_main = NULL;
1c340b2f 2151 tb = fib_get_table(net, RT_TABLE_MAIN);
877a9bff
EB
2152 if (tb)
2153 trie_main = (struct trie *) tb->tb_data;
2154
91b9a277 2155
cb7b593c
SH
2156 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2157 if (!stat)
2158 return -ENOMEM;
91b9a277 2159
cb7b593c
SH
2160 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2161 sizeof(struct leaf), sizeof(struct tnode));
91b9a277 2162
cb7b593c
SH
2163 if (trie_local) {
2164 seq_printf(seq, "Local:\n");
2165 trie_collect_stats(trie_local, stat);
2166 trie_show_stats(seq, stat);
66a2f7fd
SH
2167#ifdef CONFIG_IP_FIB_TRIE_STATS
2168 trie_show_usage(seq, &trie_local->stats);
2169#endif
cb7b593c 2170 }
91b9a277 2171
cb7b593c
SH
2172 if (trie_main) {
2173 seq_printf(seq, "Main:\n");
2174 trie_collect_stats(trie_main, stat);
2175 trie_show_stats(seq, stat);
66a2f7fd
SH
2176#ifdef CONFIG_IP_FIB_TRIE_STATS
2177 trie_show_usage(seq, &trie_main->stats);
2178#endif
19baf839 2179 }
cb7b593c 2180 kfree(stat);
19baf839 2181
cb7b593c 2182 return 0;
19baf839
RO
2183}
2184
cb7b593c 2185static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2186{
1c340b2f
DL
2187 int err;
2188 struct net *net;
2189
2190 net = get_proc_net(inode);
2191 if (net == NULL)
2192 return -ENXIO;
2193 err = single_open(file, fib_triestat_seq_show, net);
2194 if (err < 0) {
2195 put_net(net);
2196 return err;
2197 }
2198 return 0;
2199}
2200
2201static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2202{
2203 struct seq_file *seq = f->private_data;
2204 put_net(seq->private);
2205 return single_release(ino, f);
19baf839
RO
2206}
2207
9a32144e 2208static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2209 .owner = THIS_MODULE,
2210 .open = fib_triestat_seq_open,
2211 .read = seq_read,
2212 .llseek = seq_lseek,
1c340b2f 2213 .release = fib_triestat_seq_release,
cb7b593c
SH
2214};
2215
2216static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2217 loff_t pos)
19baf839 2218{
cb7b593c
SH
2219 loff_t idx = 0;
2220 struct node *n;
2221
877a9bff 2222 for (n = fib_trie_get_first(iter, iter->trie_local);
cb7b593c
SH
2223 n; ++idx, n = fib_trie_get_next(iter)) {
2224 if (pos == idx)
2225 return n;
2226 }
2227
877a9bff 2228 for (n = fib_trie_get_first(iter, iter->trie_main);
cb7b593c
SH
2229 n; ++idx, n = fib_trie_get_next(iter)) {
2230 if (pos == idx)
2231 return n;
2232 }
19baf839
RO
2233 return NULL;
2234}
2235
cb7b593c 2236static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
19baf839 2237{
877a9bff
EB
2238 struct fib_trie_iter *iter = seq->private;
2239 struct fib_table *tb;
2240
2241 if (!iter->trie_local) {
1c340b2f 2242 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
877a9bff
EB
2243 if (tb)
2244 iter->trie_local = (struct trie *) tb->tb_data;
2245 }
2246 if (!iter->trie_main) {
1c340b2f 2247 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
877a9bff
EB
2248 if (tb)
2249 iter->trie_main = (struct trie *) tb->tb_data;
2250 }
cb7b593c
SH
2251 rcu_read_lock();
2252 if (*pos == 0)
91b9a277 2253 return SEQ_START_TOKEN;
877a9bff 2254 return fib_trie_get_idx(iter, *pos - 1);
19baf839
RO
2255}
2256
cb7b593c 2257static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2258{
cb7b593c
SH
2259 struct fib_trie_iter *iter = seq->private;
2260 void *l = v;
2261
19baf839 2262 ++*pos;
91b9a277 2263 if (v == SEQ_START_TOKEN)
cb7b593c 2264 return fib_trie_get_idx(iter, 0);
19baf839 2265
cb7b593c
SH
2266 v = fib_trie_get_next(iter);
2267 BUG_ON(v == l);
2268 if (v)
2269 return v;
19baf839 2270
cb7b593c 2271 /* continue scan in next trie */
877a9bff
EB
2272 if (iter->trie == iter->trie_local)
2273 return fib_trie_get_first(iter, iter->trie_main);
19baf839 2274
cb7b593c
SH
2275 return NULL;
2276}
19baf839 2277
cb7b593c 2278static void fib_trie_seq_stop(struct seq_file *seq, void *v)
19baf839 2279{
cb7b593c
SH
2280 rcu_read_unlock();
2281}
91b9a277 2282
cb7b593c
SH
2283static void seq_indent(struct seq_file *seq, int n)
2284{
2285 while (n-- > 0) seq_puts(seq, " ");
2286}
19baf839 2287
cb7b593c
SH
2288static inline const char *rtn_scope(enum rt_scope_t s)
2289{
2290 static char buf[32];
19baf839 2291
132adf54 2292 switch (s) {
cb7b593c
SH
2293 case RT_SCOPE_UNIVERSE: return "universe";
2294 case RT_SCOPE_SITE: return "site";
2295 case RT_SCOPE_LINK: return "link";
2296 case RT_SCOPE_HOST: return "host";
2297 case RT_SCOPE_NOWHERE: return "nowhere";
2298 default:
2299 snprintf(buf, sizeof(buf), "scope=%d", s);
2300 return buf;
2301 }
2302}
19baf839 2303
cb7b593c
SH
2304static const char *rtn_type_names[__RTN_MAX] = {
2305 [RTN_UNSPEC] = "UNSPEC",
2306 [RTN_UNICAST] = "UNICAST",
2307 [RTN_LOCAL] = "LOCAL",
2308 [RTN_BROADCAST] = "BROADCAST",
2309 [RTN_ANYCAST] = "ANYCAST",
2310 [RTN_MULTICAST] = "MULTICAST",
2311 [RTN_BLACKHOLE] = "BLACKHOLE",
2312 [RTN_UNREACHABLE] = "UNREACHABLE",
2313 [RTN_PROHIBIT] = "PROHIBIT",
2314 [RTN_THROW] = "THROW",
2315 [RTN_NAT] = "NAT",
2316 [RTN_XRESOLVE] = "XRESOLVE",
2317};
19baf839 2318
cb7b593c
SH
2319static inline const char *rtn_type(unsigned t)
2320{
2321 static char buf[32];
19baf839 2322
cb7b593c
SH
2323 if (t < __RTN_MAX && rtn_type_names[t])
2324 return rtn_type_names[t];
187b5188 2325 snprintf(buf, sizeof(buf), "type %u", t);
cb7b593c 2326 return buf;
19baf839
RO
2327}
2328
cb7b593c
SH
2329/* Pretty print the trie */
2330static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2331{
cb7b593c
SH
2332 const struct fib_trie_iter *iter = seq->private;
2333 struct node *n = v;
c877efb2 2334
cb7b593c
SH
2335 if (v == SEQ_START_TOKEN)
2336 return 0;
19baf839 2337
06801916 2338 if (!node_parent(n)) {
877a9bff 2339 if (iter->trie == iter->trie_local)
095b8501
RO
2340 seq_puts(seq, "<local>:\n");
2341 else
2342 seq_puts(seq, "<main>:\n");
2343 }
2344
cb7b593c
SH
2345 if (IS_TNODE(n)) {
2346 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2347 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2348
1d25cd6c
RO
2349 seq_indent(seq, iter->depth-1);
2350 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
e905a9ed 2351 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
1d25cd6c 2352 tn->empty_children);
e905a9ed 2353
cb7b593c
SH
2354 } else {
2355 struct leaf *l = (struct leaf *) n;
2356 int i;
32ab5f80 2357 __be32 val = htonl(l->key);
cb7b593c
SH
2358
2359 seq_indent(seq, iter->depth);
2360 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2361 for (i = 32; i >= 0; i--) {
772cb712 2362 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c
SH
2363 if (li) {
2364 struct fib_alias *fa;
2365 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2366 seq_indent(seq, iter->depth+1);
2367 seq_printf(seq, " /%d %s %s", i,
2368 rtn_scope(fa->fa_scope),
2369 rtn_type(fa->fa_type));
2370 if (fa->fa_tos)
2371 seq_printf(seq, "tos =%d\n",
2372 fa->fa_tos);
2373 seq_putc(seq, '\n');
2374 }
2375 }
2376 }
19baf839 2377 }
cb7b593c 2378
19baf839
RO
2379 return 0;
2380}
2381
f690808e 2382static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2383 .start = fib_trie_seq_start,
2384 .next = fib_trie_seq_next,
2385 .stop = fib_trie_seq_stop,
2386 .show = fib_trie_seq_show,
19baf839
RO
2387};
2388
cb7b593c 2389static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2390{
1c340b2f
DL
2391 return seq_open_net(inode, file, &fib_trie_seq_ops,
2392 sizeof(struct fib_trie_iter));
19baf839
RO
2393}
2394
9a32144e 2395static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2396 .owner = THIS_MODULE,
2397 .open = fib_trie_seq_open,
2398 .read = seq_read,
2399 .llseek = seq_lseek,
1c340b2f 2400 .release = seq_release_net,
19baf839
RO
2401};
2402
32ab5f80 2403static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2404{
cb7b593c
SH
2405 static unsigned type2flags[RTN_MAX + 1] = {
2406 [7] = RTF_REJECT, [8] = RTF_REJECT,
2407 };
2408 unsigned flags = type2flags[type];
19baf839 2409
cb7b593c
SH
2410 if (fi && fi->fib_nh->nh_gw)
2411 flags |= RTF_GATEWAY;
32ab5f80 2412 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2413 flags |= RTF_HOST;
2414 flags |= RTF_UP;
2415 return flags;
19baf839
RO
2416}
2417
cb7b593c
SH
2418/*
2419 * This outputs /proc/net/route.
2420 * The format of the file is not supposed to be changed
2421 * and needs to be same as fib_hash output to avoid breaking
2422 * legacy utilities
2423 */
2424static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2425{
c9e53cbe 2426 const struct fib_trie_iter *iter = seq->private;
cb7b593c
SH
2427 struct leaf *l = v;
2428 int i;
2429 char bf[128];
19baf839 2430
cb7b593c
SH
2431 if (v == SEQ_START_TOKEN) {
2432 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2433 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2434 "\tWindow\tIRTT");
2435 return 0;
2436 }
19baf839 2437
877a9bff 2438 if (iter->trie == iter->trie_local)
c9e53cbe 2439 return 0;
cb7b593c
SH
2440 if (IS_TNODE(l))
2441 return 0;
19baf839 2442
cb7b593c 2443 for (i=32; i>=0; i--) {
772cb712 2444 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c 2445 struct fib_alias *fa;
32ab5f80 2446 __be32 mask, prefix;
91b9a277 2447
cb7b593c
SH
2448 if (!li)
2449 continue;
19baf839 2450
cb7b593c
SH
2451 mask = inet_make_mask(li->plen);
2452 prefix = htonl(l->key);
19baf839 2453
cb7b593c 2454 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2455 const struct fib_info *fi = fa->fa_info;
cb7b593c 2456 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2457
cb7b593c
SH
2458 if (fa->fa_type == RTN_BROADCAST
2459 || fa->fa_type == RTN_MULTICAST)
2460 continue;
19baf839 2461
cb7b593c
SH
2462 if (fi)
2463 snprintf(bf, sizeof(bf),
2464 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2465 fi->fib_dev ? fi->fib_dev->name : "*",
2466 prefix,
2467 fi->fib_nh->nh_gw, flags, 0, 0,
2468 fi->fib_priority,
2469 mask,
2470 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2471 fi->fib_window,
2472 fi->fib_rtt >> 3);
2473 else
2474 snprintf(bf, sizeof(bf),
2475 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2476 prefix, 0, flags, 0, 0, 0,
2477 mask, 0, 0, 0);
19baf839 2478
cb7b593c
SH
2479 seq_printf(seq, "%-127s\n", bf);
2480 }
19baf839
RO
2481 }
2482
2483 return 0;
2484}
2485
f690808e 2486static const struct seq_operations fib_route_seq_ops = {
cb7b593c
SH
2487 .start = fib_trie_seq_start,
2488 .next = fib_trie_seq_next,
2489 .stop = fib_trie_seq_stop,
2490 .show = fib_route_seq_show,
19baf839
RO
2491};
2492
cb7b593c 2493static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2494{
1c340b2f
DL
2495 return seq_open_net(inode, file, &fib_route_seq_ops,
2496 sizeof(struct fib_trie_iter));
19baf839
RO
2497}
2498
9a32144e 2499static const struct file_operations fib_route_fops = {
cb7b593c
SH
2500 .owner = THIS_MODULE,
2501 .open = fib_route_seq_open,
2502 .read = seq_read,
2503 .llseek = seq_lseek,
1c340b2f 2504 .release = seq_release_net,
19baf839
RO
2505};
2506
61a02653 2507int __net_init fib_proc_init(struct net *net)
19baf839 2508{
61a02653 2509 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2510 goto out1;
2511
61a02653
DL
2512 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2513 &fib_triestat_fops))
cb7b593c
SH
2514 goto out2;
2515
61a02653 2516 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2517 goto out3;
2518
19baf839 2519 return 0;
cb7b593c
SH
2520
2521out3:
61a02653 2522 proc_net_remove(net, "fib_triestat");
cb7b593c 2523out2:
61a02653 2524 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2525out1:
2526 return -ENOMEM;
19baf839
RO
2527}
2528
61a02653 2529void __net_exit fib_proc_exit(struct net *net)
19baf839 2530{
61a02653
DL
2531 proc_net_remove(net, "fib_trie");
2532 proc_net_remove(net, "fib_triestat");
2533 proc_net_remove(net, "route");
19baf839
RO
2534}
2535
2536#endif /* CONFIG_PROC_FS */
This page took 0.514345 seconds and 5 git commands to generate.