Merge branch 'hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelv...
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
05eee48c 51#define VERSION "0.408"
19baf839 52
19baf839
RO
53#include <asm/uaccess.h>
54#include <asm/system.h>
1977f032 55#include <linux/bitops.h>
19baf839
RO
56#include <linux/types.h>
57#include <linux/kernel.h>
19baf839
RO
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
cd8787ab 65#include <linux/inetdevice.h>
19baf839
RO
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
2373ce1c 69#include <linux/rcupdate.h>
19baf839
RO
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
457c4cbc 74#include <net/net_namespace.h>
19baf839
RO
75#include <net/ip.h>
76#include <net/protocol.h>
77#include <net/route.h>
78#include <net/tcp.h>
79#include <net/sock.h>
80#include <net/ip_fib.h>
81#include "fib_lookup.h"
82
06ef921d 83#define MAX_STAT_DEPTH 32
19baf839 84
19baf839 85#define KEYLENGTH (8*sizeof(t_key))
19baf839 86
19baf839
RO
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93
91b9a277
OJ
94#define IS_TNODE(n) (!(n->parent & T_LEAF))
95#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
96
97struct node {
91b9a277 98 unsigned long parent;
8d965444 99 t_key key;
19baf839
RO
100};
101
102struct leaf {
91b9a277 103 unsigned long parent;
8d965444 104 t_key key;
19baf839 105 struct hlist_head list;
2373ce1c 106 struct rcu_head rcu;
19baf839
RO
107};
108
109struct leaf_info {
110 struct hlist_node hlist;
2373ce1c 111 struct rcu_head rcu;
19baf839
RO
112 int plen;
113 struct list_head falh;
114};
115
116struct tnode {
91b9a277 117 unsigned long parent;
8d965444 118 t_key key;
112d8cfc
ED
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
123 union {
124 struct rcu_head rcu;
125 struct work_struct work;
e0f7cb8c 126 struct tnode *tnode_free;
15be75cd 127 };
91b9a277 128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
93672292 148 unsigned int prefixes;
06ef921d 149 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 150};
19baf839
RO
151
152struct trie {
91b9a277 153 struct node *trie;
19baf839
RO
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
19baf839
RO
157};
158
19baf839 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 int wasfull);
19baf839 162static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
19baf839 167
e18b890b 168static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 169static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 170
06801916
SH
171static inline struct tnode *node_parent(struct node *node)
172{
b59cfbf7
ED
173 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174}
175
176static inline struct tnode *node_parent_rcu(struct node *node)
177{
178 struct tnode *ret = node_parent(node);
06801916 179
06801916
SH
180 return rcu_dereference(ret);
181}
182
6440cc9e
SH
183/* Same as rcu_assign_pointer
184 * but that macro() assumes that value is a pointer.
185 */
06801916
SH
186static inline void node_set_parent(struct node *node, struct tnode *ptr)
187{
6440cc9e
SH
188 smp_wmb();
189 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 190}
2373ce1c 191
b59cfbf7
ED
192static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
193{
194 BUG_ON(i >= 1U << tn->bits);
2373ce1c 195
b59cfbf7
ED
196 return tn->child[i];
197}
198
199static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 200{
b59cfbf7 201 struct node *ret = tnode_get_child(tn, i);
19baf839 202
b59cfbf7 203 return rcu_dereference(ret);
19baf839
RO
204}
205
bb435b8d 206static inline int tnode_child_length(const struct tnode *tn)
19baf839 207{
91b9a277 208 return 1 << tn->bits;
19baf839
RO
209}
210
ab66b4a7
SH
211static inline t_key mask_pfx(t_key k, unsigned short l)
212{
213 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
214}
215
19baf839
RO
216static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
217{
91b9a277 218 if (offset < KEYLENGTH)
19baf839 219 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 220 else
19baf839
RO
221 return 0;
222}
223
224static inline int tkey_equals(t_key a, t_key b)
225{
c877efb2 226 return a == b;
19baf839
RO
227}
228
229static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
230{
c877efb2
SH
231 if (bits == 0 || offset >= KEYLENGTH)
232 return 1;
91b9a277
OJ
233 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
234 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 235}
19baf839
RO
236
237static inline int tkey_mismatch(t_key a, int offset, t_key b)
238{
239 t_key diff = a ^ b;
240 int i = offset;
241
c877efb2
SH
242 if (!diff)
243 return 0;
244 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
245 i++;
246 return i;
247}
248
19baf839 249/*
e905a9ed
YH
250 To understand this stuff, an understanding of keys and all their bits is
251 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
252 all of the bits in that key are significant.
253
254 Consider a node 'n' and its parent 'tp'.
255
e905a9ed
YH
256 If n is a leaf, every bit in its key is significant. Its presence is
257 necessitated by path compression, since during a tree traversal (when
258 searching for a leaf - unless we are doing an insertion) we will completely
259 ignore all skipped bits we encounter. Thus we need to verify, at the end of
260 a potentially successful search, that we have indeed been walking the
19baf839
RO
261 correct key path.
262
e905a9ed
YH
263 Note that we can never "miss" the correct key in the tree if present by
264 following the wrong path. Path compression ensures that segments of the key
265 that are the same for all keys with a given prefix are skipped, but the
266 skipped part *is* identical for each node in the subtrie below the skipped
267 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
268 call to tkey_sub_equals() in trie_insert().
269
e905a9ed 270 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
271 have many different meanings.
272
e905a9ed 273 Example:
19baf839
RO
274 _________________________________________________________________
275 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276 -----------------------------------------------------------------
e905a9ed 277 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
278
279 _________________________________________________________________
280 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281 -----------------------------------------------------------------
282 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
283
284 tp->pos = 7
285 tp->bits = 3
286 n->pos = 15
91b9a277 287 n->bits = 4
19baf839 288
e905a9ed
YH
289 First, let's just ignore the bits that come before the parent tp, that is
290 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
291 not use them for anything.
292
293 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 294 index into the parent's child array. That is, they will be used to find
19baf839
RO
295 'n' among tp's children.
296
297 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
298 for the node n.
299
e905a9ed 300 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
301 of the bits are really not needed or indeed known in n->key.
302
e905a9ed 303 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 304 n's child array, and will of course be different for each child.
e905a9ed 305
c877efb2 306
19baf839
RO
307 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
308 at this point.
309
310*/
311
0c7770c7 312static inline void check_tnode(const struct tnode *tn)
19baf839 313{
0c7770c7 314 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
315}
316
f5026fab
DL
317static const int halve_threshold = 25;
318static const int inflate_threshold = 50;
319static const int halve_threshold_root = 8;
320static const int inflate_threshold_root = 15;
19baf839 321
2373ce1c
RO
322
323static void __alias_free_mem(struct rcu_head *head)
19baf839 324{
2373ce1c
RO
325 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
327}
328
2373ce1c 329static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 330{
2373ce1c
RO
331 call_rcu(&fa->rcu, __alias_free_mem);
332}
91b9a277 333
2373ce1c
RO
334static void __leaf_free_rcu(struct rcu_head *head)
335{
bc3c8c1e
SH
336 struct leaf *l = container_of(head, struct leaf, rcu);
337 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 338}
91b9a277 339
387a5487
SH
340static inline void free_leaf(struct leaf *l)
341{
342 call_rcu_bh(&l->rcu, __leaf_free_rcu);
343}
344
2373ce1c 345static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 346{
2373ce1c 347 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
348}
349
2373ce1c 350static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 351{
2373ce1c 352 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
353}
354
8d965444 355static struct tnode *tnode_alloc(size_t size)
f0e36f8c 356{
2373ce1c 357 if (size <= PAGE_SIZE)
8d965444 358 return kzalloc(size, GFP_KERNEL);
15be75cd
SH
359 else
360 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
361}
2373ce1c 362
15be75cd
SH
363static void __tnode_vfree(struct work_struct *arg)
364{
365 struct tnode *tn = container_of(arg, struct tnode, work);
366 vfree(tn);
f0e36f8c
PM
367}
368
2373ce1c 369static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 370{
2373ce1c 371 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
372 size_t size = sizeof(struct tnode) +
373 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
374
375 if (size <= PAGE_SIZE)
376 kfree(tn);
15be75cd
SH
377 else {
378 INIT_WORK(&tn->work, __tnode_vfree);
379 schedule_work(&tn->work);
380 }
f0e36f8c
PM
381}
382
2373ce1c
RO
383static inline void tnode_free(struct tnode *tn)
384{
387a5487
SH
385 if (IS_LEAF(tn))
386 free_leaf((struct leaf *) tn);
387 else
550e29bc 388 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
389}
390
e0f7cb8c
JP
391static void tnode_free_safe(struct tnode *tn)
392{
393 BUG_ON(IS_LEAF(tn));
394
395 if (node_parent((struct node *) tn)) {
396 tn->tnode_free = tnode_free_head;
397 tnode_free_head = tn;
398 } else {
399 tnode_free(tn);
400 }
401}
402
403static void tnode_free_flush(void)
404{
405 struct tnode *tn;
406
407 while ((tn = tnode_free_head)) {
408 tnode_free_head = tn->tnode_free;
409 tn->tnode_free = NULL;
410 tnode_free(tn);
411 }
412}
413
2373ce1c
RO
414static struct leaf *leaf_new(void)
415{
bc3c8c1e 416 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
417 if (l) {
418 l->parent = T_LEAF;
419 INIT_HLIST_HEAD(&l->list);
420 }
421 return l;
422}
423
424static struct leaf_info *leaf_info_new(int plen)
425{
426 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
427 if (li) {
428 li->plen = plen;
429 INIT_LIST_HEAD(&li->falh);
430 }
431 return li;
432}
433
a07f5f50 434static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 435{
8d965444 436 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 437 struct tnode *tn = tnode_alloc(sz);
19baf839 438
91b9a277 439 if (tn) {
2373ce1c 440 tn->parent = T_TNODE;
19baf839
RO
441 tn->pos = pos;
442 tn->bits = bits;
443 tn->key = key;
444 tn->full_children = 0;
445 tn->empty_children = 1<<bits;
446 }
c877efb2 447
8d965444
ED
448 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
449 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
450 return tn;
451}
452
19baf839
RO
453/*
454 * Check whether a tnode 'n' is "full", i.e. it is an internal node
455 * and no bits are skipped. See discussion in dyntree paper p. 6
456 */
457
bb435b8d 458static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 459{
c877efb2 460 if (n == NULL || IS_LEAF(n))
19baf839
RO
461 return 0;
462
463 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
464}
465
a07f5f50
SH
466static inline void put_child(struct trie *t, struct tnode *tn, int i,
467 struct node *n)
19baf839
RO
468{
469 tnode_put_child_reorg(tn, i, n, -1);
470}
471
c877efb2 472 /*
19baf839
RO
473 * Add a child at position i overwriting the old value.
474 * Update the value of full_children and empty_children.
475 */
476
a07f5f50
SH
477static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
478 int wasfull)
19baf839 479{
2373ce1c 480 struct node *chi = tn->child[i];
19baf839
RO
481 int isfull;
482
0c7770c7
SH
483 BUG_ON(i >= 1<<tn->bits);
484
19baf839
RO
485 /* update emptyChildren */
486 if (n == NULL && chi != NULL)
487 tn->empty_children++;
488 else if (n != NULL && chi == NULL)
489 tn->empty_children--;
c877efb2 490
19baf839 491 /* update fullChildren */
91b9a277 492 if (wasfull == -1)
19baf839
RO
493 wasfull = tnode_full(tn, chi);
494
495 isfull = tnode_full(tn, n);
c877efb2 496 if (wasfull && !isfull)
19baf839 497 tn->full_children--;
c877efb2 498 else if (!wasfull && isfull)
19baf839 499 tn->full_children++;
91b9a277 500
c877efb2 501 if (n)
06801916 502 node_set_parent(n, tn);
19baf839 503
2373ce1c 504 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
505}
506
c877efb2 507static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
508{
509 int i;
2f36895a 510 int err = 0;
2f80b3c8 511 struct tnode *old_tn;
e6308be8
RO
512 int inflate_threshold_use;
513 int halve_threshold_use;
05eee48c 514 int max_resize;
19baf839 515
e905a9ed 516 if (!tn)
19baf839
RO
517 return NULL;
518
0c7770c7
SH
519 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
520 tn, inflate_threshold, halve_threshold);
19baf839
RO
521
522 /* No children */
523 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 524 tnode_free_safe(tn);
19baf839
RO
525 return NULL;
526 }
527 /* One child */
528 if (tn->empty_children == tnode_child_length(tn) - 1)
529 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 530 struct node *n;
19baf839 531
91b9a277 532 n = tn->child[i];
2373ce1c 533 if (!n)
91b9a277 534 continue;
91b9a277
OJ
535
536 /* compress one level */
06801916 537 node_set_parent(n, NULL);
e0f7cb8c 538 tnode_free_safe(tn);
91b9a277 539 return n;
19baf839 540 }
c877efb2 541 /*
19baf839
RO
542 * Double as long as the resulting node has a number of
543 * nonempty nodes that are above the threshold.
544 */
545
546 /*
c877efb2
SH
547 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
548 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 549 * Telecommunications, page 6:
c877efb2 550 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
551 * children in the *doubled* node is at least 'high'."
552 *
c877efb2
SH
553 * 'high' in this instance is the variable 'inflate_threshold'. It
554 * is expressed as a percentage, so we multiply it with
555 * tnode_child_length() and instead of multiplying by 2 (since the
556 * child array will be doubled by inflate()) and multiplying
557 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 558 * multiply the left-hand side by 50.
c877efb2
SH
559 *
560 * The left-hand side may look a bit weird: tnode_child_length(tn)
561 * - tn->empty_children is of course the number of non-null children
562 * in the current node. tn->full_children is the number of "full"
19baf839 563 * children, that is non-null tnodes with a skip value of 0.
c877efb2 564 * All of those will be doubled in the resulting inflated tnode, so
19baf839 565 * we just count them one extra time here.
c877efb2 566 *
19baf839 567 * A clearer way to write this would be:
c877efb2 568 *
19baf839 569 * to_be_doubled = tn->full_children;
c877efb2 570 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
571 * tn->full_children;
572 *
573 * new_child_length = tnode_child_length(tn) * 2;
574 *
c877efb2 575 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
576 * new_child_length;
577 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
578 *
579 * ...and so on, tho it would mess up the while () loop.
580 *
19baf839
RO
581 * anyway,
582 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
583 * inflate_threshold
c877efb2 584 *
19baf839
RO
585 * avoid a division:
586 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
587 * inflate_threshold * new_child_length
c877efb2 588 *
19baf839 589 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 590 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 591 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 592 *
19baf839 593 * expand new_child_length:
c877efb2 594 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 595 * tn->full_children) >=
19baf839 596 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 597 *
19baf839 598 * shorten again:
c877efb2 599 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 600 * tn->empty_children) >= inflate_threshold *
19baf839 601 * tnode_child_length(tn)
c877efb2 602 *
19baf839
RO
603 */
604
605 check_tnode(tn);
c877efb2 606
e6308be8
RO
607 /* Keep root node larger */
608
132adf54 609 if (!tn->parent)
e6308be8 610 inflate_threshold_use = inflate_threshold_root;
e905a9ed 611 else
e6308be8
RO
612 inflate_threshold_use = inflate_threshold;
613
2f36895a 614 err = 0;
05eee48c
RO
615 max_resize = 10;
616 while ((tn->full_children > 0 && max_resize-- &&
a07f5f50
SH
617 50 * (tn->full_children + tnode_child_length(tn)
618 - tn->empty_children)
619 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 620
2f80b3c8
RO
621 old_tn = tn;
622 tn = inflate(t, tn);
a07f5f50 623
2f80b3c8
RO
624 if (IS_ERR(tn)) {
625 tn = old_tn;
2f36895a
RO
626#ifdef CONFIG_IP_FIB_TRIE_STATS
627 t->stats.resize_node_skipped++;
628#endif
629 break;
630 }
19baf839
RO
631 }
632
05eee48c
RO
633 if (max_resize < 0) {
634 if (!tn->parent)
a07f5f50
SH
635 pr_warning("Fix inflate_threshold_root."
636 " Now=%d size=%d bits\n",
637 inflate_threshold_root, tn->bits);
05eee48c 638 else
a07f5f50
SH
639 pr_warning("Fix inflate_threshold."
640 " Now=%d size=%d bits\n",
641 inflate_threshold, tn->bits);
05eee48c
RO
642 }
643
19baf839
RO
644 check_tnode(tn);
645
646 /*
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
649 */
2f36895a 650
e6308be8
RO
651
652 /* Keep root node larger */
653
132adf54 654 if (!tn->parent)
e6308be8 655 halve_threshold_use = halve_threshold_root;
e905a9ed 656 else
e6308be8
RO
657 halve_threshold_use = halve_threshold;
658
2f36895a 659 err = 0;
05eee48c
RO
660 max_resize = 10;
661 while (tn->bits > 1 && max_resize-- &&
19baf839 662 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 663 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 664
2f80b3c8
RO
665 old_tn = tn;
666 tn = halve(t, tn);
667 if (IS_ERR(tn)) {
668 tn = old_tn;
2f36895a
RO
669#ifdef CONFIG_IP_FIB_TRIE_STATS
670 t->stats.resize_node_skipped++;
671#endif
672 break;
673 }
674 }
19baf839 675
05eee48c
RO
676 if (max_resize < 0) {
677 if (!tn->parent)
a07f5f50
SH
678 pr_warning("Fix halve_threshold_root."
679 " Now=%d size=%d bits\n",
680 halve_threshold_root, tn->bits);
05eee48c 681 else
a07f5f50
SH
682 pr_warning("Fix halve_threshold."
683 " Now=%d size=%d bits\n",
684 halve_threshold, tn->bits);
05eee48c 685 }
c877efb2 686
19baf839 687 /* Only one child remains */
19baf839
RO
688 if (tn->empty_children == tnode_child_length(tn) - 1)
689 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 690 struct node *n;
19baf839 691
91b9a277 692 n = tn->child[i];
2373ce1c 693 if (!n)
91b9a277 694 continue;
91b9a277
OJ
695
696 /* compress one level */
697
06801916 698 node_set_parent(n, NULL);
e0f7cb8c 699 tnode_free_safe(tn);
91b9a277 700 return n;
19baf839
RO
701 }
702
703 return (struct node *) tn;
704}
705
2f80b3c8 706static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 707{
19baf839
RO
708 struct tnode *oldtnode = tn;
709 int olen = tnode_child_length(tn);
710 int i;
711
0c7770c7 712 pr_debug("In inflate\n");
19baf839
RO
713
714 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
715
0c7770c7 716 if (!tn)
2f80b3c8 717 return ERR_PTR(-ENOMEM);
2f36895a
RO
718
719 /*
c877efb2
SH
720 * Preallocate and store tnodes before the actual work so we
721 * don't get into an inconsistent state if memory allocation
722 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
723 * of tnode is ignored.
724 */
91b9a277
OJ
725
726 for (i = 0; i < olen; i++) {
a07f5f50 727 struct tnode *inode;
2f36895a 728
a07f5f50 729 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
730 if (inode &&
731 IS_TNODE(inode) &&
732 inode->pos == oldtnode->pos + oldtnode->bits &&
733 inode->bits > 1) {
734 struct tnode *left, *right;
ab66b4a7 735 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 736
2f36895a
RO
737 left = tnode_new(inode->key&(~m), inode->pos + 1,
738 inode->bits - 1);
2f80b3c8
RO
739 if (!left)
740 goto nomem;
91b9a277 741
2f36895a
RO
742 right = tnode_new(inode->key|m, inode->pos + 1,
743 inode->bits - 1);
744
e905a9ed 745 if (!right) {
2f80b3c8
RO
746 tnode_free(left);
747 goto nomem;
e905a9ed 748 }
2f36895a
RO
749
750 put_child(t, tn, 2*i, (struct node *) left);
751 put_child(t, tn, 2*i+1, (struct node *) right);
752 }
753 }
754
91b9a277 755 for (i = 0; i < olen; i++) {
c95aaf9a 756 struct tnode *inode;
19baf839 757 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
758 struct tnode *left, *right;
759 int size, j;
c877efb2 760
19baf839
RO
761 /* An empty child */
762 if (node == NULL)
763 continue;
764
765 /* A leaf or an internal node with skipped bits */
766
c877efb2 767 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 768 tn->pos + tn->bits - 1) {
a07f5f50
SH
769 if (tkey_extract_bits(node->key,
770 oldtnode->pos + oldtnode->bits,
771 1) == 0)
19baf839
RO
772 put_child(t, tn, 2*i, node);
773 else
774 put_child(t, tn, 2*i+1, node);
775 continue;
776 }
777
778 /* An internal node with two children */
779 inode = (struct tnode *) node;
780
781 if (inode->bits == 1) {
782 put_child(t, tn, 2*i, inode->child[0]);
783 put_child(t, tn, 2*i+1, inode->child[1]);
784
e0f7cb8c 785 tnode_free_safe(inode);
91b9a277 786 continue;
19baf839
RO
787 }
788
91b9a277
OJ
789 /* An internal node with more than two children */
790
791 /* We will replace this node 'inode' with two new
792 * ones, 'left' and 'right', each with half of the
793 * original children. The two new nodes will have
794 * a position one bit further down the key and this
795 * means that the "significant" part of their keys
796 * (see the discussion near the top of this file)
797 * will differ by one bit, which will be "0" in
798 * left's key and "1" in right's key. Since we are
799 * moving the key position by one step, the bit that
800 * we are moving away from - the bit at position
801 * (inode->pos) - is the one that will differ between
802 * left and right. So... we synthesize that bit in the
803 * two new keys.
804 * The mask 'm' below will be a single "one" bit at
805 * the position (inode->pos)
806 */
19baf839 807
91b9a277
OJ
808 /* Use the old key, but set the new significant
809 * bit to zero.
810 */
2f36895a 811
91b9a277
OJ
812 left = (struct tnode *) tnode_get_child(tn, 2*i);
813 put_child(t, tn, 2*i, NULL);
2f36895a 814
91b9a277 815 BUG_ON(!left);
2f36895a 816
91b9a277
OJ
817 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
818 put_child(t, tn, 2*i+1, NULL);
19baf839 819
91b9a277 820 BUG_ON(!right);
19baf839 821
91b9a277
OJ
822 size = tnode_child_length(left);
823 for (j = 0; j < size; j++) {
824 put_child(t, left, j, inode->child[j]);
825 put_child(t, right, j, inode->child[j + size]);
19baf839 826 }
91b9a277
OJ
827 put_child(t, tn, 2*i, resize(t, left));
828 put_child(t, tn, 2*i+1, resize(t, right));
829
e0f7cb8c 830 tnode_free_safe(inode);
19baf839 831 }
e0f7cb8c 832 tnode_free_safe(oldtnode);
19baf839 833 return tn;
2f80b3c8
RO
834nomem:
835 {
836 int size = tnode_child_length(tn);
837 int j;
838
0c7770c7 839 for (j = 0; j < size; j++)
2f80b3c8
RO
840 if (tn->child[j])
841 tnode_free((struct tnode *)tn->child[j]);
842
843 tnode_free(tn);
0c7770c7 844
2f80b3c8
RO
845 return ERR_PTR(-ENOMEM);
846 }
19baf839
RO
847}
848
2f80b3c8 849static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
850{
851 struct tnode *oldtnode = tn;
852 struct node *left, *right;
853 int i;
854 int olen = tnode_child_length(tn);
855
0c7770c7 856 pr_debug("In halve\n");
c877efb2
SH
857
858 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 859
2f80b3c8
RO
860 if (!tn)
861 return ERR_PTR(-ENOMEM);
2f36895a
RO
862
863 /*
c877efb2
SH
864 * Preallocate and store tnodes before the actual work so we
865 * don't get into an inconsistent state if memory allocation
866 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
867 * of tnode is ignored.
868 */
869
91b9a277 870 for (i = 0; i < olen; i += 2) {
2f36895a
RO
871 left = tnode_get_child(oldtnode, i);
872 right = tnode_get_child(oldtnode, i+1);
c877efb2 873
2f36895a 874 /* Two nonempty children */
0c7770c7 875 if (left && right) {
2f80b3c8 876 struct tnode *newn;
0c7770c7 877
2f80b3c8 878 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
879
880 if (!newn)
2f80b3c8 881 goto nomem;
0c7770c7 882
2f80b3c8 883 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 884 }
2f36895a 885
2f36895a 886 }
19baf839 887
91b9a277
OJ
888 for (i = 0; i < olen; i += 2) {
889 struct tnode *newBinNode;
890
19baf839
RO
891 left = tnode_get_child(oldtnode, i);
892 right = tnode_get_child(oldtnode, i+1);
c877efb2 893
19baf839
RO
894 /* At least one of the children is empty */
895 if (left == NULL) {
896 if (right == NULL) /* Both are empty */
897 continue;
898 put_child(t, tn, i/2, right);
91b9a277 899 continue;
0c7770c7 900 }
91b9a277
OJ
901
902 if (right == NULL) {
19baf839 903 put_child(t, tn, i/2, left);
91b9a277
OJ
904 continue;
905 }
c877efb2 906
19baf839 907 /* Two nonempty children */
91b9a277
OJ
908 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
909 put_child(t, tn, i/2, NULL);
91b9a277
OJ
910 put_child(t, newBinNode, 0, left);
911 put_child(t, newBinNode, 1, right);
912 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 913 }
e0f7cb8c 914 tnode_free_safe(oldtnode);
19baf839 915 return tn;
2f80b3c8
RO
916nomem:
917 {
918 int size = tnode_child_length(tn);
919 int j;
920
0c7770c7 921 for (j = 0; j < size; j++)
2f80b3c8
RO
922 if (tn->child[j])
923 tnode_free((struct tnode *)tn->child[j]);
924
925 tnode_free(tn);
0c7770c7 926
2f80b3c8
RO
927 return ERR_PTR(-ENOMEM);
928 }
19baf839
RO
929}
930
772cb712 931/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
932 via get_fa_head and dump */
933
772cb712 934static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 935{
772cb712 936 struct hlist_head *head = &l->list;
19baf839
RO
937 struct hlist_node *node;
938 struct leaf_info *li;
939
2373ce1c 940 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 941 if (li->plen == plen)
19baf839 942 return li;
91b9a277 943
19baf839
RO
944 return NULL;
945}
946
a07f5f50 947static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 948{
772cb712 949 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 950
91b9a277
OJ
951 if (!li)
952 return NULL;
c877efb2 953
91b9a277 954 return &li->falh;
19baf839
RO
955}
956
957static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
958{
e905a9ed
YH
959 struct leaf_info *li = NULL, *last = NULL;
960 struct hlist_node *node;
961
962 if (hlist_empty(head)) {
963 hlist_add_head_rcu(&new->hlist, head);
964 } else {
965 hlist_for_each_entry(li, node, head, hlist) {
966 if (new->plen > li->plen)
967 break;
968
969 last = li;
970 }
971 if (last)
972 hlist_add_after_rcu(&last->hlist, &new->hlist);
973 else
974 hlist_add_before_rcu(&new->hlist, &li->hlist);
975 }
19baf839
RO
976}
977
2373ce1c
RO
978/* rcu_read_lock needs to be hold by caller from readside */
979
19baf839
RO
980static struct leaf *
981fib_find_node(struct trie *t, u32 key)
982{
983 int pos;
984 struct tnode *tn;
985 struct node *n;
986
987 pos = 0;
2373ce1c 988 n = rcu_dereference(t->trie);
19baf839
RO
989
990 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
991 tn = (struct tnode *) n;
91b9a277 992
19baf839 993 check_tnode(tn);
91b9a277 994
c877efb2 995 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 996 pos = tn->pos + tn->bits;
a07f5f50
SH
997 n = tnode_get_child_rcu(tn,
998 tkey_extract_bits(key,
999 tn->pos,
1000 tn->bits));
91b9a277 1001 } else
19baf839
RO
1002 break;
1003 }
1004 /* Case we have found a leaf. Compare prefixes */
1005
91b9a277
OJ
1006 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1007 return (struct leaf *)n;
1008
19baf839
RO
1009 return NULL;
1010}
1011
1012static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
1013{
19baf839 1014 int wasfull;
3ed18d76 1015 t_key cindex, key;
06801916 1016 struct tnode *tp;
19baf839 1017
3ed18d76
RO
1018 key = tn->key;
1019
06801916 1020 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
1021 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1022 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
1023 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1024
1025 tnode_put_child_reorg((struct tnode *)tp, cindex,
1026 (struct node *)tn, wasfull);
91b9a277 1027
06801916 1028 tp = node_parent((struct node *) tn);
e0f7cb8c 1029 tnode_free_flush();
06801916 1030 if (!tp)
19baf839 1031 break;
06801916 1032 tn = tp;
19baf839 1033 }
06801916 1034
19baf839 1035 /* Handle last (top) tnode */
e0f7cb8c 1036 if (IS_TNODE(tn)) {
a07f5f50 1037 tn = (struct tnode *)resize(t, (struct tnode *)tn);
e0f7cb8c
JP
1038 tnode_free_flush();
1039 }
19baf839 1040
a07f5f50 1041 return (struct node *)tn;
19baf839
RO
1042}
1043
2373ce1c
RO
1044/* only used from updater-side */
1045
fea86ad8 1046static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1047{
1048 int pos, newpos;
1049 struct tnode *tp = NULL, *tn = NULL;
1050 struct node *n;
1051 struct leaf *l;
1052 int missbit;
c877efb2 1053 struct list_head *fa_head = NULL;
19baf839
RO
1054 struct leaf_info *li;
1055 t_key cindex;
1056
1057 pos = 0;
c877efb2 1058 n = t->trie;
19baf839 1059
c877efb2
SH
1060 /* If we point to NULL, stop. Either the tree is empty and we should
1061 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1062 * and we should just put our new leaf in that.
c877efb2
SH
1063 * If we point to a T_TNODE, check if it matches our key. Note that
1064 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1065 * not be the parent's 'pos'+'bits'!
1066 *
c877efb2 1067 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1068 * the index from our key, push the T_TNODE and walk the tree.
1069 *
1070 * If it doesn't, we have to replace it with a new T_TNODE.
1071 *
c877efb2
SH
1072 * If we point to a T_LEAF, it might or might not have the same key
1073 * as we do. If it does, just change the value, update the T_LEAF's
1074 * value, and return it.
19baf839
RO
1075 * If it doesn't, we need to replace it with a T_TNODE.
1076 */
1077
1078 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1079 tn = (struct tnode *) n;
91b9a277 1080
c877efb2 1081 check_tnode(tn);
91b9a277 1082
c877efb2 1083 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1084 tp = tn;
91b9a277 1085 pos = tn->pos + tn->bits;
a07f5f50
SH
1086 n = tnode_get_child(tn,
1087 tkey_extract_bits(key,
1088 tn->pos,
1089 tn->bits));
19baf839 1090
06801916 1091 BUG_ON(n && node_parent(n) != tn);
91b9a277 1092 } else
19baf839
RO
1093 break;
1094 }
1095
1096 /*
1097 * n ----> NULL, LEAF or TNODE
1098 *
c877efb2 1099 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1100 */
1101
91b9a277 1102 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1103
1104 /* Case 1: n is a leaf. Compare prefixes */
1105
c877efb2 1106 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1107 l = (struct leaf *) n;
19baf839 1108 li = leaf_info_new(plen);
91b9a277 1109
fea86ad8
SH
1110 if (!li)
1111 return NULL;
19baf839
RO
1112
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1115 goto done;
1116 }
19baf839
RO
1117 l = leaf_new();
1118
fea86ad8
SH
1119 if (!l)
1120 return NULL;
19baf839
RO
1121
1122 l->key = key;
1123 li = leaf_info_new(plen);
1124
c877efb2 1125 if (!li) {
387a5487 1126 free_leaf(l);
fea86ad8 1127 return NULL;
f835e471 1128 }
19baf839
RO
1129
1130 fa_head = &li->falh;
1131 insert_leaf_info(&l->list, li);
1132
19baf839 1133 if (t->trie && n == NULL) {
91b9a277 1134 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1135
06801916 1136 node_set_parent((struct node *)l, tp);
19baf839 1137
91b9a277
OJ
1138 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1139 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1140 } else {
1141 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1142 /*
1143 * Add a new tnode here
19baf839
RO
1144 * first tnode need some special handling
1145 */
1146
1147 if (tp)
91b9a277 1148 pos = tp->pos+tp->bits;
19baf839 1149 else
91b9a277
OJ
1150 pos = 0;
1151
c877efb2 1152 if (n) {
19baf839
RO
1153 newpos = tkey_mismatch(key, pos, n->key);
1154 tn = tnode_new(n->key, newpos, 1);
91b9a277 1155 } else {
19baf839 1156 newpos = 0;
c877efb2 1157 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1158 }
19baf839 1159
c877efb2 1160 if (!tn) {
f835e471 1161 free_leaf_info(li);
387a5487 1162 free_leaf(l);
fea86ad8 1163 return NULL;
91b9a277
OJ
1164 }
1165
06801916 1166 node_set_parent((struct node *)tn, tp);
19baf839 1167
91b9a277 1168 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1169 put_child(t, tn, missbit, (struct node *)l);
1170 put_child(t, tn, 1-missbit, n);
1171
c877efb2 1172 if (tp) {
19baf839 1173 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1174 put_child(t, (struct tnode *)tp, cindex,
1175 (struct node *)tn);
91b9a277 1176 } else {
a07f5f50 1177 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1178 tp = tn;
1179 }
1180 }
91b9a277
OJ
1181
1182 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1183 pr_warning("fib_trie"
1184 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1185 tp, tp->pos, tp->bits, key, plen);
91b9a277 1186
19baf839 1187 /* Rebalance the trie */
2373ce1c
RO
1188
1189 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471 1190done:
19baf839
RO
1191 return fa_head;
1192}
1193
d562f1f8
RO
1194/*
1195 * Caller must hold RTNL.
1196 */
4e902c57 1197static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1198{
1199 struct trie *t = (struct trie *) tb->tb_data;
1200 struct fib_alias *fa, *new_fa;
c877efb2 1201 struct list_head *fa_head = NULL;
19baf839 1202 struct fib_info *fi;
4e902c57
TG
1203 int plen = cfg->fc_dst_len;
1204 u8 tos = cfg->fc_tos;
19baf839
RO
1205 u32 key, mask;
1206 int err;
1207 struct leaf *l;
1208
1209 if (plen > 32)
1210 return -EINVAL;
1211
4e902c57 1212 key = ntohl(cfg->fc_dst);
19baf839 1213
2dfe55b4 1214 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1215
91b9a277 1216 mask = ntohl(inet_make_mask(plen));
19baf839 1217
c877efb2 1218 if (key & ~mask)
19baf839
RO
1219 return -EINVAL;
1220
1221 key = key & mask;
1222
4e902c57
TG
1223 fi = fib_create_info(cfg);
1224 if (IS_ERR(fi)) {
1225 err = PTR_ERR(fi);
19baf839 1226 goto err;
4e902c57 1227 }
19baf839
RO
1228
1229 l = fib_find_node(t, key);
c877efb2 1230 fa = NULL;
19baf839 1231
c877efb2 1232 if (l) {
19baf839
RO
1233 fa_head = get_fa_head(l, plen);
1234 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1235 }
1236
1237 /* Now fa, if non-NULL, points to the first fib alias
1238 * with the same keys [prefix,tos,priority], if such key already
1239 * exists or to the node before which we will insert new one.
1240 *
1241 * If fa is NULL, we will need to allocate a new one and
1242 * insert to the head of f.
1243 *
1244 * If f is NULL, no fib node matched the destination key
1245 * and we need to allocate a new one of those as well.
1246 */
1247
936f6f8e
JA
1248 if (fa && fa->fa_tos == tos &&
1249 fa->fa_info->fib_priority == fi->fib_priority) {
1250 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1251
1252 err = -EEXIST;
4e902c57 1253 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1254 goto out;
1255
936f6f8e
JA
1256 /* We have 2 goals:
1257 * 1. Find exact match for type, scope, fib_info to avoid
1258 * duplicate routes
1259 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1260 */
1261 fa_match = NULL;
1262 fa_first = fa;
1263 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1264 list_for_each_entry_continue(fa, fa_head, fa_list) {
1265 if (fa->fa_tos != tos)
1266 break;
1267 if (fa->fa_info->fib_priority != fi->fib_priority)
1268 break;
1269 if (fa->fa_type == cfg->fc_type &&
1270 fa->fa_scope == cfg->fc_scope &&
1271 fa->fa_info == fi) {
1272 fa_match = fa;
1273 break;
1274 }
1275 }
1276
4e902c57 1277 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1278 struct fib_info *fi_drop;
1279 u8 state;
1280
936f6f8e
JA
1281 fa = fa_first;
1282 if (fa_match) {
1283 if (fa == fa_match)
1284 err = 0;
6725033f 1285 goto out;
936f6f8e 1286 }
2373ce1c 1287 err = -ENOBUFS;
e94b1766 1288 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1289 if (new_fa == NULL)
1290 goto out;
19baf839
RO
1291
1292 fi_drop = fa->fa_info;
2373ce1c
RO
1293 new_fa->fa_tos = fa->fa_tos;
1294 new_fa->fa_info = fi;
4e902c57
TG
1295 new_fa->fa_type = cfg->fc_type;
1296 new_fa->fa_scope = cfg->fc_scope;
19baf839 1297 state = fa->fa_state;
936f6f8e 1298 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1299
2373ce1c
RO
1300 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1301 alias_free_mem_rcu(fa);
19baf839
RO
1302
1303 fib_release_info(fi_drop);
1304 if (state & FA_S_ACCESSED)
76e6ebfb 1305 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1306 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1307 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1308
91b9a277 1309 goto succeeded;
19baf839
RO
1310 }
1311 /* Error if we find a perfect match which
1312 * uses the same scope, type, and nexthop
1313 * information.
1314 */
936f6f8e
JA
1315 if (fa_match)
1316 goto out;
a07f5f50 1317
4e902c57 1318 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1319 fa = fa_first;
19baf839
RO
1320 }
1321 err = -ENOENT;
4e902c57 1322 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1323 goto out;
1324
1325 err = -ENOBUFS;
e94b1766 1326 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1327 if (new_fa == NULL)
1328 goto out;
1329
1330 new_fa->fa_info = fi;
1331 new_fa->fa_tos = tos;
4e902c57
TG
1332 new_fa->fa_type = cfg->fc_type;
1333 new_fa->fa_scope = cfg->fc_scope;
19baf839 1334 new_fa->fa_state = 0;
19baf839
RO
1335 /*
1336 * Insert new entry to the list.
1337 */
1338
c877efb2 1339 if (!fa_head) {
fea86ad8
SH
1340 fa_head = fib_insert_node(t, key, plen);
1341 if (unlikely(!fa_head)) {
1342 err = -ENOMEM;
f835e471 1343 goto out_free_new_fa;
fea86ad8 1344 }
f835e471 1345 }
19baf839 1346
2373ce1c
RO
1347 list_add_tail_rcu(&new_fa->fa_list,
1348 (fa ? &fa->fa_list : fa_head));
19baf839 1349
76e6ebfb 1350 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1351 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1352 &cfg->fc_nlinfo, 0);
19baf839
RO
1353succeeded:
1354 return 0;
f835e471
RO
1355
1356out_free_new_fa:
1357 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1358out:
1359 fib_release_info(fi);
91b9a277 1360err:
19baf839
RO
1361 return err;
1362}
1363
772cb712 1364/* should be called with rcu_read_lock */
a07f5f50
SH
1365static int check_leaf(struct trie *t, struct leaf *l,
1366 t_key key, const struct flowi *flp,
1367 struct fib_result *res)
19baf839 1368{
19baf839
RO
1369 struct leaf_info *li;
1370 struct hlist_head *hhead = &l->list;
1371 struct hlist_node *node;
c877efb2 1372
2373ce1c 1373 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1374 int err;
1375 int plen = li->plen;
1376 __be32 mask = inet_make_mask(plen);
1377
888454c5 1378 if (l->key != (key & ntohl(mask)))
19baf839
RO
1379 continue;
1380
e204a345 1381 err = fib_semantic_match(&li->falh, flp, res, plen);
a07f5f50 1382
19baf839 1383#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1384 if (err <= 0)
19baf839 1385 t->stats.semantic_match_passed++;
a07f5f50
SH
1386 else
1387 t->stats.semantic_match_miss++;
19baf839 1388#endif
a07f5f50 1389 if (err <= 0)
2e655571 1390 return err;
19baf839 1391 }
a07f5f50 1392
2e655571 1393 return 1;
19baf839
RO
1394}
1395
a07f5f50
SH
1396static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1397 struct fib_result *res)
19baf839
RO
1398{
1399 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1400 int ret;
19baf839
RO
1401 struct node *n;
1402 struct tnode *pn;
1403 int pos, bits;
91b9a277 1404 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1405 int chopped_off;
1406 t_key cindex = 0;
1407 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1408 struct tnode *cn;
1409 t_key node_prefix, key_prefix, pref_mismatch;
1410 int mp;
1411
2373ce1c 1412 rcu_read_lock();
91b9a277 1413
2373ce1c 1414 n = rcu_dereference(t->trie);
c877efb2 1415 if (!n)
19baf839
RO
1416 goto failed;
1417
1418#ifdef CONFIG_IP_FIB_TRIE_STATS
1419 t->stats.gets++;
1420#endif
1421
1422 /* Just a leaf? */
1423 if (IS_LEAF(n)) {
2e655571 1424 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
a07f5f50 1425 goto found;
19baf839 1426 }
a07f5f50 1427
19baf839
RO
1428 pn = (struct tnode *) n;
1429 chopped_off = 0;
c877efb2 1430
91b9a277 1431 while (pn) {
19baf839
RO
1432 pos = pn->pos;
1433 bits = pn->bits;
1434
c877efb2 1435 if (!chopped_off)
ab66b4a7
SH
1436 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1437 pos, bits);
19baf839
RO
1438
1439 n = tnode_get_child(pn, cindex);
1440
1441 if (n == NULL) {
1442#ifdef CONFIG_IP_FIB_TRIE_STATS
1443 t->stats.null_node_hit++;
1444#endif
1445 goto backtrace;
1446 }
1447
91b9a277 1448 if (IS_LEAF(n)) {
2e655571
BH
1449 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1450 if (ret > 0)
91b9a277 1451 goto backtrace;
a07f5f50 1452 goto found;
91b9a277
OJ
1453 }
1454
91b9a277 1455 cn = (struct tnode *)n;
19baf839 1456
91b9a277
OJ
1457 /*
1458 * It's a tnode, and we can do some extra checks here if we
1459 * like, to avoid descending into a dead-end branch.
1460 * This tnode is in the parent's child array at index
1461 * key[p_pos..p_pos+p_bits] but potentially with some bits
1462 * chopped off, so in reality the index may be just a
1463 * subprefix, padded with zero at the end.
1464 * We can also take a look at any skipped bits in this
1465 * tnode - everything up to p_pos is supposed to be ok,
1466 * and the non-chopped bits of the index (se previous
1467 * paragraph) are also guaranteed ok, but the rest is
1468 * considered unknown.
1469 *
1470 * The skipped bits are key[pos+bits..cn->pos].
1471 */
19baf839 1472
91b9a277
OJ
1473 /* If current_prefix_length < pos+bits, we are already doing
1474 * actual prefix matching, which means everything from
1475 * pos+(bits-chopped_off) onward must be zero along some
1476 * branch of this subtree - otherwise there is *no* valid
1477 * prefix present. Here we can only check the skipped
1478 * bits. Remember, since we have already indexed into the
1479 * parent's child array, we know that the bits we chopped of
1480 * *are* zero.
1481 */
19baf839 1482
a07f5f50
SH
1483 /* NOTA BENE: Checking only skipped bits
1484 for the new node here */
19baf839 1485
91b9a277
OJ
1486 if (current_prefix_length < pos+bits) {
1487 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1488 cn->pos - current_prefix_length)
1489 || !(cn->child[0]))
91b9a277
OJ
1490 goto backtrace;
1491 }
19baf839 1492
91b9a277
OJ
1493 /*
1494 * If chopped_off=0, the index is fully validated and we
1495 * only need to look at the skipped bits for this, the new,
1496 * tnode. What we actually want to do is to find out if
1497 * these skipped bits match our key perfectly, or if we will
1498 * have to count on finding a matching prefix further down,
1499 * because if we do, we would like to have some way of
1500 * verifying the existence of such a prefix at this point.
1501 */
19baf839 1502
91b9a277
OJ
1503 /* The only thing we can do at this point is to verify that
1504 * any such matching prefix can indeed be a prefix to our
1505 * key, and if the bits in the node we are inspecting that
1506 * do not match our key are not ZERO, this cannot be true.
1507 * Thus, find out where there is a mismatch (before cn->pos)
1508 * and verify that all the mismatching bits are zero in the
1509 * new tnode's key.
1510 */
19baf839 1511
a07f5f50
SH
1512 /*
1513 * Note: We aren't very concerned about the piece of
1514 * the key that precede pn->pos+pn->bits, since these
1515 * have already been checked. The bits after cn->pos
1516 * aren't checked since these are by definition
1517 * "unknown" at this point. Thus, what we want to see
1518 * is if we are about to enter the "prefix matching"
1519 * state, and in that case verify that the skipped
1520 * bits that will prevail throughout this subtree are
1521 * zero, as they have to be if we are to find a
1522 * matching prefix.
91b9a277
OJ
1523 */
1524
ab66b4a7
SH
1525 node_prefix = mask_pfx(cn->key, cn->pos);
1526 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1527 pref_mismatch = key_prefix^node_prefix;
1528 mp = 0;
1529
a07f5f50
SH
1530 /*
1531 * In short: If skipped bits in this node do not match
1532 * the search key, enter the "prefix matching"
1533 * state.directly.
91b9a277
OJ
1534 */
1535 if (pref_mismatch) {
1536 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1537 mp++;
a07f5f50 1538 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1539 }
1540 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1541
1542 if (key_prefix != 0)
1543 goto backtrace;
1544
1545 if (current_prefix_length >= cn->pos)
1546 current_prefix_length = mp;
c877efb2 1547 }
a07f5f50 1548
91b9a277
OJ
1549 pn = (struct tnode *)n; /* Descend */
1550 chopped_off = 0;
1551 continue;
1552
19baf839
RO
1553backtrace:
1554 chopped_off++;
1555
1556 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1557 while ((chopped_off <= pn->bits)
1558 && !(cindex & (1<<(chopped_off-1))))
19baf839 1559 chopped_off++;
19baf839
RO
1560
1561 /* Decrease current_... with bits chopped off */
1562 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1563 current_prefix_length = pn->pos + pn->bits
1564 - chopped_off;
91b9a277 1565
19baf839 1566 /*
c877efb2 1567 * Either we do the actual chop off according or if we have
19baf839
RO
1568 * chopped off all bits in this tnode walk up to our parent.
1569 */
1570
91b9a277 1571 if (chopped_off <= pn->bits) {
19baf839 1572 cindex &= ~(1 << (chopped_off-1));
91b9a277 1573 } else {
06801916
SH
1574 struct tnode *parent = node_parent((struct node *) pn);
1575 if (!parent)
19baf839 1576 goto failed;
91b9a277 1577
19baf839 1578 /* Get Child's index */
06801916
SH
1579 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1580 pn = parent;
19baf839
RO
1581 chopped_off = 0;
1582
1583#ifdef CONFIG_IP_FIB_TRIE_STATS
1584 t->stats.backtrack++;
1585#endif
1586 goto backtrace;
c877efb2 1587 }
19baf839
RO
1588 }
1589failed:
c877efb2 1590 ret = 1;
19baf839 1591found:
2373ce1c 1592 rcu_read_unlock();
19baf839
RO
1593 return ret;
1594}
1595
9195bef7
SH
1596/*
1597 * Remove the leaf and return parent.
1598 */
1599static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1600{
9195bef7 1601 struct tnode *tp = node_parent((struct node *) l);
c877efb2 1602
9195bef7 1603 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1604
c877efb2 1605 if (tp) {
9195bef7 1606 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1607 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1608 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1609 } else
2373ce1c 1610 rcu_assign_pointer(t->trie, NULL);
19baf839 1611
387a5487 1612 free_leaf(l);
19baf839
RO
1613}
1614
d562f1f8
RO
1615/*
1616 * Caller must hold RTNL.
1617 */
4e902c57 1618static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1619{
1620 struct trie *t = (struct trie *) tb->tb_data;
1621 u32 key, mask;
4e902c57
TG
1622 int plen = cfg->fc_dst_len;
1623 u8 tos = cfg->fc_tos;
19baf839
RO
1624 struct fib_alias *fa, *fa_to_delete;
1625 struct list_head *fa_head;
1626 struct leaf *l;
91b9a277
OJ
1627 struct leaf_info *li;
1628
c877efb2 1629 if (plen > 32)
19baf839
RO
1630 return -EINVAL;
1631
4e902c57 1632 key = ntohl(cfg->fc_dst);
91b9a277 1633 mask = ntohl(inet_make_mask(plen));
19baf839 1634
c877efb2 1635 if (key & ~mask)
19baf839
RO
1636 return -EINVAL;
1637
1638 key = key & mask;
1639 l = fib_find_node(t, key);
1640
c877efb2 1641 if (!l)
19baf839
RO
1642 return -ESRCH;
1643
1644 fa_head = get_fa_head(l, plen);
1645 fa = fib_find_alias(fa_head, tos, 0);
1646
1647 if (!fa)
1648 return -ESRCH;
1649
0c7770c7 1650 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1651
1652 fa_to_delete = NULL;
936f6f8e
JA
1653 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1654 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1655 struct fib_info *fi = fa->fa_info;
1656
1657 if (fa->fa_tos != tos)
1658 break;
1659
4e902c57
TG
1660 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1661 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1662 fa->fa_scope == cfg->fc_scope) &&
1663 (!cfg->fc_protocol ||
1664 fi->fib_protocol == cfg->fc_protocol) &&
1665 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1666 fa_to_delete = fa;
1667 break;
1668 }
1669 }
1670
91b9a277
OJ
1671 if (!fa_to_delete)
1672 return -ESRCH;
19baf839 1673
91b9a277 1674 fa = fa_to_delete;
4e902c57 1675 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1676 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1677
1678 l = fib_find_node(t, key);
772cb712 1679 li = find_leaf_info(l, plen);
19baf839 1680
2373ce1c 1681 list_del_rcu(&fa->fa_list);
19baf839 1682
91b9a277 1683 if (list_empty(fa_head)) {
2373ce1c 1684 hlist_del_rcu(&li->hlist);
91b9a277 1685 free_leaf_info(li);
2373ce1c 1686 }
19baf839 1687
91b9a277 1688 if (hlist_empty(&l->list))
9195bef7 1689 trie_leaf_remove(t, l);
19baf839 1690
91b9a277 1691 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1692 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1693
2373ce1c
RO
1694 fib_release_info(fa->fa_info);
1695 alias_free_mem_rcu(fa);
91b9a277 1696 return 0;
19baf839
RO
1697}
1698
ef3660ce 1699static int trie_flush_list(struct list_head *head)
19baf839
RO
1700{
1701 struct fib_alias *fa, *fa_node;
1702 int found = 0;
1703
1704 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1705 struct fib_info *fi = fa->fa_info;
19baf839 1706
2373ce1c
RO
1707 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1708 list_del_rcu(&fa->fa_list);
1709 fib_release_info(fa->fa_info);
1710 alias_free_mem_rcu(fa);
19baf839
RO
1711 found++;
1712 }
1713 }
1714 return found;
1715}
1716
ef3660ce 1717static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1718{
1719 int found = 0;
1720 struct hlist_head *lih = &l->list;
1721 struct hlist_node *node, *tmp;
1722 struct leaf_info *li = NULL;
1723
1724 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1725 found += trie_flush_list(&li->falh);
19baf839
RO
1726
1727 if (list_empty(&li->falh)) {
2373ce1c 1728 hlist_del_rcu(&li->hlist);
19baf839
RO
1729 free_leaf_info(li);
1730 }
1731 }
1732 return found;
1733}
1734
82cfbb00
SH
1735/*
1736 * Scan for the next right leaf starting at node p->child[idx]
1737 * Since we have back pointer, no recursion necessary.
1738 */
1739static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
19baf839 1740{
82cfbb00
SH
1741 do {
1742 t_key idx;
c877efb2 1743
c877efb2 1744 if (c)
82cfbb00 1745 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1746 else
82cfbb00 1747 idx = 0;
2373ce1c 1748
82cfbb00
SH
1749 while (idx < 1u << p->bits) {
1750 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1751 if (!c)
91b9a277
OJ
1752 continue;
1753
82cfbb00
SH
1754 if (IS_LEAF(c)) {
1755 prefetch(p->child[idx]);
1756 return (struct leaf *) c;
19baf839 1757 }
82cfbb00
SH
1758
1759 /* Rescan start scanning in new node */
1760 p = (struct tnode *) c;
1761 idx = 0;
19baf839 1762 }
82cfbb00
SH
1763
1764 /* Node empty, walk back up to parent */
91b9a277 1765 c = (struct node *) p;
82cfbb00
SH
1766 } while ( (p = node_parent_rcu(c)) != NULL);
1767
1768 return NULL; /* Root of trie */
1769}
1770
82cfbb00
SH
1771static struct leaf *trie_firstleaf(struct trie *t)
1772{
1773 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1774
1775 if (!n)
1776 return NULL;
1777
1778 if (IS_LEAF(n)) /* trie is just a leaf */
1779 return (struct leaf *) n;
1780
1781 return leaf_walk_rcu(n, NULL);
1782}
1783
1784static struct leaf *trie_nextleaf(struct leaf *l)
1785{
1786 struct node *c = (struct node *) l;
1787 struct tnode *p = node_parent(c);
1788
1789 if (!p)
1790 return NULL; /* trie with just one leaf */
1791
1792 return leaf_walk_rcu(p, c);
19baf839
RO
1793}
1794
71d67e66
SH
1795static struct leaf *trie_leafindex(struct trie *t, int index)
1796{
1797 struct leaf *l = trie_firstleaf(t);
1798
ec28cf73 1799 while (l && index-- > 0)
71d67e66 1800 l = trie_nextleaf(l);
ec28cf73 1801
71d67e66
SH
1802 return l;
1803}
1804
1805
d562f1f8
RO
1806/*
1807 * Caller must hold RTNL.
1808 */
19baf839
RO
1809static int fn_trie_flush(struct fib_table *tb)
1810{
1811 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1812 struct leaf *l, *ll = NULL;
82cfbb00 1813 int found = 0;
19baf839 1814
82cfbb00 1815 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1816 found += trie_flush_leaf(l);
19baf839
RO
1817
1818 if (ll && hlist_empty(&ll->list))
9195bef7 1819 trie_leaf_remove(t, ll);
19baf839
RO
1820 ll = l;
1821 }
1822
1823 if (ll && hlist_empty(&ll->list))
9195bef7 1824 trie_leaf_remove(t, ll);
19baf839 1825
0c7770c7 1826 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1827 return found;
1828}
1829
a07f5f50
SH
1830static void fn_trie_select_default(struct fib_table *tb,
1831 const struct flowi *flp,
1832 struct fib_result *res)
19baf839
RO
1833{
1834 struct trie *t = (struct trie *) tb->tb_data;
1835 int order, last_idx;
1836 struct fib_info *fi = NULL;
1837 struct fib_info *last_resort;
1838 struct fib_alias *fa = NULL;
1839 struct list_head *fa_head;
1840 struct leaf *l;
1841
1842 last_idx = -1;
1843 last_resort = NULL;
1844 order = -1;
1845
2373ce1c 1846 rcu_read_lock();
c877efb2 1847
19baf839 1848 l = fib_find_node(t, 0);
c877efb2 1849 if (!l)
19baf839
RO
1850 goto out;
1851
1852 fa_head = get_fa_head(l, 0);
c877efb2 1853 if (!fa_head)
19baf839
RO
1854 goto out;
1855
c877efb2 1856 if (list_empty(fa_head))
19baf839
RO
1857 goto out;
1858
2373ce1c 1859 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1860 struct fib_info *next_fi = fa->fa_info;
91b9a277 1861
19baf839
RO
1862 if (fa->fa_scope != res->scope ||
1863 fa->fa_type != RTN_UNICAST)
1864 continue;
91b9a277 1865
19baf839
RO
1866 if (next_fi->fib_priority > res->fi->fib_priority)
1867 break;
1868 if (!next_fi->fib_nh[0].nh_gw ||
1869 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1870 continue;
1871 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1872
19baf839
RO
1873 if (fi == NULL) {
1874 if (next_fi != res->fi)
1875 break;
1876 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1877 &last_idx, tb->tb_default)) {
a2bbe682 1878 fib_result_assign(res, fi);
971b893e 1879 tb->tb_default = order;
19baf839
RO
1880 goto out;
1881 }
1882 fi = next_fi;
1883 order++;
1884 }
1885 if (order <= 0 || fi == NULL) {
971b893e 1886 tb->tb_default = -1;
19baf839
RO
1887 goto out;
1888 }
1889
971b893e
DL
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1891 tb->tb_default)) {
a2bbe682 1892 fib_result_assign(res, fi);
971b893e 1893 tb->tb_default = order;
19baf839
RO
1894 goto out;
1895 }
a2bbe682
DL
1896 if (last_idx >= 0)
1897 fib_result_assign(res, last_resort);
971b893e
DL
1898 tb->tb_default = last_idx;
1899out:
2373ce1c 1900 rcu_read_unlock();
19baf839
RO
1901}
1902
a07f5f50
SH
1903static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1904 struct fib_table *tb,
19baf839
RO
1905 struct sk_buff *skb, struct netlink_callback *cb)
1906{
1907 int i, s_i;
1908 struct fib_alias *fa;
32ab5f80 1909 __be32 xkey = htonl(key);
19baf839 1910
71d67e66 1911 s_i = cb->args[5];
19baf839
RO
1912 i = 0;
1913
2373ce1c
RO
1914 /* rcu_read_lock is hold by caller */
1915
1916 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1917 if (i < s_i) {
1918 i++;
1919 continue;
1920 }
19baf839
RO
1921
1922 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1923 cb->nlh->nlmsg_seq,
1924 RTM_NEWROUTE,
1925 tb->tb_id,
1926 fa->fa_type,
1927 fa->fa_scope,
be403ea1 1928 xkey,
19baf839
RO
1929 plen,
1930 fa->fa_tos,
64347f78 1931 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1932 cb->args[5] = i;
19baf839 1933 return -1;
91b9a277 1934 }
19baf839
RO
1935 i++;
1936 }
71d67e66 1937 cb->args[5] = i;
19baf839
RO
1938 return skb->len;
1939}
1940
a88ee229
SH
1941static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1942 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1943{
a88ee229
SH
1944 struct leaf_info *li;
1945 struct hlist_node *node;
1946 int i, s_i;
19baf839 1947
71d67e66 1948 s_i = cb->args[4];
a88ee229 1949 i = 0;
19baf839 1950
a88ee229
SH
1951 /* rcu_read_lock is hold by caller */
1952 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1953 if (i < s_i) {
1954 i++;
19baf839 1955 continue;
a88ee229 1956 }
91b9a277 1957
a88ee229 1958 if (i > s_i)
71d67e66 1959 cb->args[5] = 0;
19baf839 1960
a88ee229 1961 if (list_empty(&li->falh))
19baf839
RO
1962 continue;
1963
a88ee229 1964 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1965 cb->args[4] = i;
19baf839
RO
1966 return -1;
1967 }
a88ee229 1968 i++;
19baf839 1969 }
a88ee229 1970
71d67e66 1971 cb->args[4] = i;
19baf839
RO
1972 return skb->len;
1973}
1974
a07f5f50
SH
1975static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1976 struct netlink_callback *cb)
19baf839 1977{
a88ee229 1978 struct leaf *l;
19baf839 1979 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1980 t_key key = cb->args[2];
71d67e66 1981 int count = cb->args[3];
19baf839 1982
2373ce1c 1983 rcu_read_lock();
d5ce8a0e
SH
1984 /* Dump starting at last key.
1985 * Note: 0.0.0.0/0 (ie default) is first key.
1986 */
71d67e66 1987 if (count == 0)
d5ce8a0e
SH
1988 l = trie_firstleaf(t);
1989 else {
71d67e66
SH
1990 /* Normally, continue from last key, but if that is missing
1991 * fallback to using slow rescan
1992 */
d5ce8a0e 1993 l = fib_find_node(t, key);
71d67e66
SH
1994 if (!l)
1995 l = trie_leafindex(t, count);
d5ce8a0e 1996 }
a88ee229 1997
d5ce8a0e
SH
1998 while (l) {
1999 cb->args[2] = l->key;
a88ee229 2000 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 2001 cb->args[3] = count;
a88ee229 2002 rcu_read_unlock();
a88ee229 2003 return -1;
19baf839 2004 }
d5ce8a0e 2005
71d67e66 2006 ++count;
d5ce8a0e 2007 l = trie_nextleaf(l);
71d67e66
SH
2008 memset(&cb->args[4], 0,
2009 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 2010 }
71d67e66 2011 cb->args[3] = count;
2373ce1c 2012 rcu_read_unlock();
a88ee229 2013
19baf839 2014 return skb->len;
19baf839
RO
2015}
2016
7f9b8052
SH
2017void __init fib_hash_init(void)
2018{
a07f5f50
SH
2019 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2020 sizeof(struct fib_alias),
bc3c8c1e
SH
2021 0, SLAB_PANIC, NULL);
2022
2023 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2024 max(sizeof(struct leaf),
2025 sizeof(struct leaf_info)),
2026 0, SLAB_PANIC, NULL);
7f9b8052 2027}
19baf839 2028
7f9b8052
SH
2029
2030/* Fix more generic FIB names for init later */
2031struct fib_table *fib_hash_table(u32 id)
19baf839
RO
2032{
2033 struct fib_table *tb;
2034 struct trie *t;
2035
19baf839
RO
2036 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2037 GFP_KERNEL);
2038 if (tb == NULL)
2039 return NULL;
2040
2041 tb->tb_id = id;
971b893e 2042 tb->tb_default = -1;
19baf839
RO
2043 tb->tb_lookup = fn_trie_lookup;
2044 tb->tb_insert = fn_trie_insert;
2045 tb->tb_delete = fn_trie_delete;
2046 tb->tb_flush = fn_trie_flush;
2047 tb->tb_select_default = fn_trie_select_default;
2048 tb->tb_dump = fn_trie_dump;
19baf839
RO
2049
2050 t = (struct trie *) tb->tb_data;
c28a1cf4 2051 memset(t, 0, sizeof(*t));
19baf839 2052
19baf839 2053 if (id == RT_TABLE_LOCAL)
a07f5f50 2054 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2055
2056 return tb;
2057}
2058
cb7b593c
SH
2059#ifdef CONFIG_PROC_FS
2060/* Depth first Trie walk iterator */
2061struct fib_trie_iter {
1c340b2f 2062 struct seq_net_private p;
3d3b2d25 2063 struct fib_table *tb;
cb7b593c 2064 struct tnode *tnode;
cb7b593c
SH
2065 unsigned index;
2066 unsigned depth;
2067};
19baf839 2068
cb7b593c 2069static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2070{
cb7b593c
SH
2071 struct tnode *tn = iter->tnode;
2072 unsigned cindex = iter->index;
2073 struct tnode *p;
19baf839 2074
6640e697
EB
2075 /* A single entry routing table */
2076 if (!tn)
2077 return NULL;
2078
cb7b593c
SH
2079 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2080 iter->tnode, iter->index, iter->depth);
2081rescan:
2082 while (cindex < (1<<tn->bits)) {
b59cfbf7 2083 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2084
cb7b593c
SH
2085 if (n) {
2086 if (IS_LEAF(n)) {
2087 iter->tnode = tn;
2088 iter->index = cindex + 1;
2089 } else {
2090 /* push down one level */
2091 iter->tnode = (struct tnode *) n;
2092 iter->index = 0;
2093 ++iter->depth;
2094 }
2095 return n;
2096 }
19baf839 2097
cb7b593c
SH
2098 ++cindex;
2099 }
91b9a277 2100
cb7b593c 2101 /* Current node exhausted, pop back up */
b59cfbf7 2102 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2103 if (p) {
2104 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2105 tn = p;
2106 --iter->depth;
2107 goto rescan;
19baf839 2108 }
cb7b593c
SH
2109
2110 /* got root? */
2111 return NULL;
19baf839
RO
2112}
2113
cb7b593c
SH
2114static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2115 struct trie *t)
19baf839 2116{
3d3b2d25 2117 struct node *n;
5ddf0eb2 2118
132adf54 2119 if (!t)
5ddf0eb2
RO
2120 return NULL;
2121
2122 n = rcu_dereference(t->trie);
3d3b2d25 2123 if (!n)
5ddf0eb2 2124 return NULL;
19baf839 2125
3d3b2d25
SH
2126 if (IS_TNODE(n)) {
2127 iter->tnode = (struct tnode *) n;
2128 iter->index = 0;
2129 iter->depth = 1;
2130 } else {
2131 iter->tnode = NULL;
2132 iter->index = 0;
2133 iter->depth = 0;
91b9a277 2134 }
3d3b2d25
SH
2135
2136 return n;
cb7b593c 2137}
91b9a277 2138
cb7b593c
SH
2139static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2140{
2141 struct node *n;
2142 struct fib_trie_iter iter;
91b9a277 2143
cb7b593c 2144 memset(s, 0, sizeof(*s));
91b9a277 2145
cb7b593c 2146 rcu_read_lock();
3d3b2d25 2147 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2148 if (IS_LEAF(n)) {
93672292
SH
2149 struct leaf *l = (struct leaf *)n;
2150 struct leaf_info *li;
2151 struct hlist_node *tmp;
2152
cb7b593c
SH
2153 s->leaves++;
2154 s->totdepth += iter.depth;
2155 if (iter.depth > s->maxdepth)
2156 s->maxdepth = iter.depth;
93672292
SH
2157
2158 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2159 ++s->prefixes;
cb7b593c
SH
2160 } else {
2161 const struct tnode *tn = (const struct tnode *) n;
2162 int i;
2163
2164 s->tnodes++;
132adf54 2165 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2166 s->nodesizes[tn->bits]++;
2167
cb7b593c
SH
2168 for (i = 0; i < (1<<tn->bits); i++)
2169 if (!tn->child[i])
2170 s->nullpointers++;
19baf839 2171 }
19baf839 2172 }
2373ce1c 2173 rcu_read_unlock();
19baf839
RO
2174}
2175
cb7b593c
SH
2176/*
2177 * This outputs /proc/net/fib_triestats
2178 */
2179static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2180{
cb7b593c 2181 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2182
cb7b593c
SH
2183 if (stat->leaves)
2184 avdepth = stat->totdepth*100 / stat->leaves;
2185 else
2186 avdepth = 0;
91b9a277 2187
a07f5f50
SH
2188 seq_printf(seq, "\tAver depth: %u.%02d\n",
2189 avdepth / 100, avdepth % 100);
cb7b593c 2190 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2191
cb7b593c 2192 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2193 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2194
2195 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2196 bytes += sizeof(struct leaf_info) * stat->prefixes;
2197
187b5188 2198 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2199 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2200
06ef921d
RO
2201 max = MAX_STAT_DEPTH;
2202 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2203 max--;
19baf839 2204
cb7b593c
SH
2205 pointers = 0;
2206 for (i = 1; i <= max; i++)
2207 if (stat->nodesizes[i] != 0) {
187b5188 2208 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2209 pointers += (1<<i) * stat->nodesizes[i];
2210 }
2211 seq_putc(seq, '\n');
187b5188 2212 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2213
cb7b593c 2214 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2215 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2216 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2217}
2373ce1c 2218
cb7b593c 2219#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2220static void trie_show_usage(struct seq_file *seq,
2221 const struct trie_use_stats *stats)
2222{
2223 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2224 seq_printf(seq, "gets = %u\n", stats->gets);
2225 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2226 seq_printf(seq, "semantic match passed = %u\n",
2227 stats->semantic_match_passed);
2228 seq_printf(seq, "semantic match miss = %u\n",
2229 stats->semantic_match_miss);
2230 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2231 seq_printf(seq, "skipped node resize = %u\n\n",
2232 stats->resize_node_skipped);
cb7b593c 2233}
66a2f7fd
SH
2234#endif /* CONFIG_IP_FIB_TRIE_STATS */
2235
3d3b2d25 2236static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2237{
3d3b2d25
SH
2238 if (tb->tb_id == RT_TABLE_LOCAL)
2239 seq_puts(seq, "Local:\n");
2240 else if (tb->tb_id == RT_TABLE_MAIN)
2241 seq_puts(seq, "Main:\n");
2242 else
2243 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2244}
19baf839 2245
3d3b2d25 2246
cb7b593c
SH
2247static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2248{
1c340b2f 2249 struct net *net = (struct net *)seq->private;
3d3b2d25 2250 unsigned int h;
877a9bff 2251
d717a9a6 2252 seq_printf(seq,
a07f5f50
SH
2253 "Basic info: size of leaf:"
2254 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2255 sizeof(struct leaf), sizeof(struct tnode));
2256
3d3b2d25
SH
2257 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2258 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2259 struct hlist_node *node;
2260 struct fib_table *tb;
2261
2262 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2263 struct trie *t = (struct trie *) tb->tb_data;
2264 struct trie_stat stat;
877a9bff 2265
3d3b2d25
SH
2266 if (!t)
2267 continue;
2268
2269 fib_table_print(seq, tb);
2270
2271 trie_collect_stats(t, &stat);
2272 trie_show_stats(seq, &stat);
2273#ifdef CONFIG_IP_FIB_TRIE_STATS
2274 trie_show_usage(seq, &t->stats);
2275#endif
2276 }
2277 }
19baf839 2278
cb7b593c 2279 return 0;
19baf839
RO
2280}
2281
cb7b593c 2282static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2283{
de05c557 2284 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2285}
2286
9a32144e 2287static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2288 .owner = THIS_MODULE,
2289 .open = fib_triestat_seq_open,
2290 .read = seq_read,
2291 .llseek = seq_lseek,
b6fcbdb4 2292 .release = single_release_net,
cb7b593c
SH
2293};
2294
1218854a 2295static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2296{
1218854a
YH
2297 struct fib_trie_iter *iter = seq->private;
2298 struct net *net = seq_file_net(seq);
cb7b593c 2299 loff_t idx = 0;
3d3b2d25 2300 unsigned int h;
cb7b593c 2301
3d3b2d25
SH
2302 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2303 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2304 struct hlist_node *node;
2305 struct fib_table *tb;
cb7b593c 2306
3d3b2d25
SH
2307 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2308 struct node *n;
2309
2310 for (n = fib_trie_get_first(iter,
2311 (struct trie *) tb->tb_data);
2312 n; n = fib_trie_get_next(iter))
2313 if (pos == idx++) {
2314 iter->tb = tb;
2315 return n;
2316 }
2317 }
cb7b593c 2318 }
3d3b2d25 2319
19baf839
RO
2320 return NULL;
2321}
2322
cb7b593c 2323static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2324 __acquires(RCU)
19baf839 2325{
cb7b593c 2326 rcu_read_lock();
1218854a 2327 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2328}
2329
cb7b593c 2330static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2331{
cb7b593c 2332 struct fib_trie_iter *iter = seq->private;
1218854a 2333 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2334 struct fib_table *tb = iter->tb;
2335 struct hlist_node *tb_node;
2336 unsigned int h;
2337 struct node *n;
cb7b593c 2338
19baf839 2339 ++*pos;
3d3b2d25
SH
2340 /* next node in same table */
2341 n = fib_trie_get_next(iter);
2342 if (n)
2343 return n;
19baf839 2344
3d3b2d25
SH
2345 /* walk rest of this hash chain */
2346 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2347 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2348 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2349 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2350 if (n)
2351 goto found;
2352 }
19baf839 2353
3d3b2d25
SH
2354 /* new hash chain */
2355 while (++h < FIB_TABLE_HASHSZ) {
2356 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2357 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2358 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2359 if (n)
2360 goto found;
2361 }
2362 }
cb7b593c 2363 return NULL;
3d3b2d25
SH
2364
2365found:
2366 iter->tb = tb;
2367 return n;
cb7b593c 2368}
19baf839 2369
cb7b593c 2370static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2371 __releases(RCU)
19baf839 2372{
cb7b593c
SH
2373 rcu_read_unlock();
2374}
91b9a277 2375
cb7b593c
SH
2376static void seq_indent(struct seq_file *seq, int n)
2377{
2378 while (n-- > 0) seq_puts(seq, " ");
2379}
19baf839 2380
28d36e37 2381static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2382{
132adf54 2383 switch (s) {
cb7b593c
SH
2384 case RT_SCOPE_UNIVERSE: return "universe";
2385 case RT_SCOPE_SITE: return "site";
2386 case RT_SCOPE_LINK: return "link";
2387 case RT_SCOPE_HOST: return "host";
2388 case RT_SCOPE_NOWHERE: return "nowhere";
2389 default:
28d36e37 2390 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2391 return buf;
2392 }
2393}
19baf839 2394
cb7b593c
SH
2395static const char *rtn_type_names[__RTN_MAX] = {
2396 [RTN_UNSPEC] = "UNSPEC",
2397 [RTN_UNICAST] = "UNICAST",
2398 [RTN_LOCAL] = "LOCAL",
2399 [RTN_BROADCAST] = "BROADCAST",
2400 [RTN_ANYCAST] = "ANYCAST",
2401 [RTN_MULTICAST] = "MULTICAST",
2402 [RTN_BLACKHOLE] = "BLACKHOLE",
2403 [RTN_UNREACHABLE] = "UNREACHABLE",
2404 [RTN_PROHIBIT] = "PROHIBIT",
2405 [RTN_THROW] = "THROW",
2406 [RTN_NAT] = "NAT",
2407 [RTN_XRESOLVE] = "XRESOLVE",
2408};
19baf839 2409
28d36e37 2410static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2411{
cb7b593c
SH
2412 if (t < __RTN_MAX && rtn_type_names[t])
2413 return rtn_type_names[t];
28d36e37 2414 snprintf(buf, len, "type %u", t);
cb7b593c 2415 return buf;
19baf839
RO
2416}
2417
cb7b593c
SH
2418/* Pretty print the trie */
2419static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2420{
cb7b593c
SH
2421 const struct fib_trie_iter *iter = seq->private;
2422 struct node *n = v;
c877efb2 2423
3d3b2d25
SH
2424 if (!node_parent_rcu(n))
2425 fib_table_print(seq, iter->tb);
095b8501 2426
cb7b593c
SH
2427 if (IS_TNODE(n)) {
2428 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2429 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2430
1d25cd6c 2431 seq_indent(seq, iter->depth-1);
673d57e7
HH
2432 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2433 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2434 tn->empty_children);
e905a9ed 2435
cb7b593c
SH
2436 } else {
2437 struct leaf *l = (struct leaf *) n;
1328042e
SH
2438 struct leaf_info *li;
2439 struct hlist_node *node;
32ab5f80 2440 __be32 val = htonl(l->key);
cb7b593c
SH
2441
2442 seq_indent(seq, iter->depth);
673d57e7 2443 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2444
2445 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2446 struct fib_alias *fa;
2447
2448 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2449 char buf1[32], buf2[32];
2450
2451 seq_indent(seq, iter->depth+1);
2452 seq_printf(seq, " /%d %s %s", li->plen,
2453 rtn_scope(buf1, sizeof(buf1),
2454 fa->fa_scope),
2455 rtn_type(buf2, sizeof(buf2),
2456 fa->fa_type));
2457 if (fa->fa_tos)
b9c4d82a 2458 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2459 seq_putc(seq, '\n');
cb7b593c
SH
2460 }
2461 }
19baf839 2462 }
cb7b593c 2463
19baf839
RO
2464 return 0;
2465}
2466
f690808e 2467static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2468 .start = fib_trie_seq_start,
2469 .next = fib_trie_seq_next,
2470 .stop = fib_trie_seq_stop,
2471 .show = fib_trie_seq_show,
19baf839
RO
2472};
2473
cb7b593c 2474static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2475{
1c340b2f
DL
2476 return seq_open_net(inode, file, &fib_trie_seq_ops,
2477 sizeof(struct fib_trie_iter));
19baf839
RO
2478}
2479
9a32144e 2480static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2481 .owner = THIS_MODULE,
2482 .open = fib_trie_seq_open,
2483 .read = seq_read,
2484 .llseek = seq_lseek,
1c340b2f 2485 .release = seq_release_net,
19baf839
RO
2486};
2487
8315f5d8
SH
2488struct fib_route_iter {
2489 struct seq_net_private p;
2490 struct trie *main_trie;
2491 loff_t pos;
2492 t_key key;
2493};
2494
2495static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2496{
2497 struct leaf *l = NULL;
2498 struct trie *t = iter->main_trie;
2499
2500 /* use cache location of last found key */
2501 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2502 pos -= iter->pos;
2503 else {
2504 iter->pos = 0;
2505 l = trie_firstleaf(t);
2506 }
2507
2508 while (l && pos-- > 0) {
2509 iter->pos++;
2510 l = trie_nextleaf(l);
2511 }
2512
2513 if (l)
2514 iter->key = pos; /* remember it */
2515 else
2516 iter->pos = 0; /* forget it */
2517
2518 return l;
2519}
2520
2521static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2522 __acquires(RCU)
2523{
2524 struct fib_route_iter *iter = seq->private;
2525 struct fib_table *tb;
2526
2527 rcu_read_lock();
1218854a 2528 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2529 if (!tb)
2530 return NULL;
2531
2532 iter->main_trie = (struct trie *) tb->tb_data;
2533 if (*pos == 0)
2534 return SEQ_START_TOKEN;
2535 else
2536 return fib_route_get_idx(iter, *pos - 1);
2537}
2538
2539static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2540{
2541 struct fib_route_iter *iter = seq->private;
2542 struct leaf *l = v;
2543
2544 ++*pos;
2545 if (v == SEQ_START_TOKEN) {
2546 iter->pos = 0;
2547 l = trie_firstleaf(iter->main_trie);
2548 } else {
2549 iter->pos++;
2550 l = trie_nextleaf(l);
2551 }
2552
2553 if (l)
2554 iter->key = l->key;
2555 else
2556 iter->pos = 0;
2557 return l;
2558}
2559
2560static void fib_route_seq_stop(struct seq_file *seq, void *v)
2561 __releases(RCU)
2562{
2563 rcu_read_unlock();
2564}
2565
32ab5f80 2566static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2567{
cb7b593c
SH
2568 static unsigned type2flags[RTN_MAX + 1] = {
2569 [7] = RTF_REJECT, [8] = RTF_REJECT,
2570 };
2571 unsigned flags = type2flags[type];
19baf839 2572
cb7b593c
SH
2573 if (fi && fi->fib_nh->nh_gw)
2574 flags |= RTF_GATEWAY;
32ab5f80 2575 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2576 flags |= RTF_HOST;
2577 flags |= RTF_UP;
2578 return flags;
19baf839
RO
2579}
2580
cb7b593c
SH
2581/*
2582 * This outputs /proc/net/route.
2583 * The format of the file is not supposed to be changed
2584 * and needs to be same as fib_hash output to avoid breaking
2585 * legacy utilities
2586 */
2587static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2588{
cb7b593c 2589 struct leaf *l = v;
1328042e
SH
2590 struct leaf_info *li;
2591 struct hlist_node *node;
19baf839 2592
cb7b593c
SH
2593 if (v == SEQ_START_TOKEN) {
2594 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2595 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2596 "\tWindow\tIRTT");
2597 return 0;
2598 }
19baf839 2599
1328042e 2600 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2601 struct fib_alias *fa;
32ab5f80 2602 __be32 mask, prefix;
91b9a277 2603
cb7b593c
SH
2604 mask = inet_make_mask(li->plen);
2605 prefix = htonl(l->key);
19baf839 2606
cb7b593c 2607 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2608 const struct fib_info *fi = fa->fa_info;
cb7b593c 2609 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2610 int len;
19baf839 2611
cb7b593c
SH
2612 if (fa->fa_type == RTN_BROADCAST
2613 || fa->fa_type == RTN_MULTICAST)
2614 continue;
19baf839 2615
cb7b593c 2616 if (fi)
5e659e4c
PE
2617 seq_printf(seq,
2618 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2619 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2620 fi->fib_dev ? fi->fib_dev->name : "*",
2621 prefix,
2622 fi->fib_nh->nh_gw, flags, 0, 0,
2623 fi->fib_priority,
2624 mask,
a07f5f50
SH
2625 (fi->fib_advmss ?
2626 fi->fib_advmss + 40 : 0),
cb7b593c 2627 fi->fib_window,
5e659e4c 2628 fi->fib_rtt >> 3, &len);
cb7b593c 2629 else
5e659e4c
PE
2630 seq_printf(seq,
2631 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2632 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2633 prefix, 0, flags, 0, 0, 0,
5e659e4c 2634 mask, 0, 0, 0, &len);
19baf839 2635
5e659e4c 2636 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2637 }
19baf839
RO
2638 }
2639
2640 return 0;
2641}
2642
f690808e 2643static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2644 .start = fib_route_seq_start,
2645 .next = fib_route_seq_next,
2646 .stop = fib_route_seq_stop,
cb7b593c 2647 .show = fib_route_seq_show,
19baf839
RO
2648};
2649
cb7b593c 2650static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2651{
1c340b2f 2652 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2653 sizeof(struct fib_route_iter));
19baf839
RO
2654}
2655
9a32144e 2656static const struct file_operations fib_route_fops = {
cb7b593c
SH
2657 .owner = THIS_MODULE,
2658 .open = fib_route_seq_open,
2659 .read = seq_read,
2660 .llseek = seq_lseek,
1c340b2f 2661 .release = seq_release_net,
19baf839
RO
2662};
2663
61a02653 2664int __net_init fib_proc_init(struct net *net)
19baf839 2665{
61a02653 2666 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2667 goto out1;
2668
61a02653
DL
2669 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2670 &fib_triestat_fops))
cb7b593c
SH
2671 goto out2;
2672
61a02653 2673 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2674 goto out3;
2675
19baf839 2676 return 0;
cb7b593c
SH
2677
2678out3:
61a02653 2679 proc_net_remove(net, "fib_triestat");
cb7b593c 2680out2:
61a02653 2681 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2682out1:
2683 return -ENOMEM;
19baf839
RO
2684}
2685
61a02653 2686void __net_exit fib_proc_exit(struct net *net)
19baf839 2687{
61a02653
DL
2688 proc_net_remove(net, "fib_trie");
2689 proc_net_remove(net, "fib_triestat");
2690 proc_net_remove(net, "route");
19baf839
RO
2691}
2692
2693#endif /* CONFIG_PROC_FS */
This page took 0.716565 seconds and 5 git commands to generate.