[IPV4] fib_trie: put leaf nodes in a slab cache
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
51 */
52
53 #define VERSION "0.408"
54
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84
85 #define MAX_STAT_DEPTH 32
86
/* Number of bits in a key; t_key is a 32-bit host-order IPv4 address. */
87 #define KEYLENGTH (8*sizeof(t_key))
88
89 typedef unsigned int t_key;
90
/*
 * The low bit of every node's ->parent word encodes the node type
 * (leaf or internal tnode); the real parent pointer is recovered by
 * masking that bit off — see node_parent() below.
 */
91 #define T_TNODE 0
92 #define T_LEAF 1
93 #define NODE_TYPE_MASK 0x1UL
94 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
96 #define IS_TNODE(n) (!(n->parent & T_LEAF))
97 #define IS_LEAF(n) (n->parent & T_LEAF)
98
/*
 * Common header shared by leaves and tnodes; a struct node pointer is
 * downcast to struct leaf / struct tnode after checking NODE_TYPE().
 */
99 struct node {
100 unsigned long parent;
101 t_key key;
102 };
103
/* Leaf: one destination address; per-prefix-length info hangs off 'list'. */
104 struct leaf {
105 unsigned long parent;
106 t_key key;
107 struct hlist_head list;
108 struct rcu_head rcu;
109 };
110
/* Per (leaf, prefix-length) bucket: the fib_alias list for that plen. */
111 struct leaf_info {
112 struct hlist_node hlist;
113 struct rcu_head rcu;
114 int plen;
115 struct list_head falh;
116 };
117
/* Internal node: 2^bits child slots stored inline after the header. */
118 struct tnode {
119 unsigned long parent;
120 t_key key;
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
125 struct rcu_head rcu;
126 struct node *child[0];
127 };
128
129 #ifdef CONFIG_IP_FIB_TRIE_STATS
130 struct trie_use_stats {
131 unsigned int gets;
132 unsigned int backtrack;
133 unsigned int semantic_match_passed;
134 unsigned int semantic_match_miss;
135 unsigned int null_node_hit;
136 unsigned int resize_node_skipped;
137 };
138 #endif
139
140 struct trie_stat {
141 unsigned int totdepth;
142 unsigned int maxdepth;
143 unsigned int tnodes;
144 unsigned int leaves;
145 unsigned int nullpointers;
146 unsigned int nodesizes[MAX_STAT_DEPTH];
147 };
148
149 struct trie {
150 struct node *trie;
151 unsigned int size;
152 #ifdef CONFIG_IP_FIB_TRIE_STATS
153 struct trie_use_stats stats;
154 #endif
155 };
156
157 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
159 static struct node *resize(struct trie *t, struct tnode *tn);
160 static struct tnode *inflate(struct trie *t, struct tnode *tn);
161 static struct tnode *halve(struct trie *t, struct tnode *tn);
162 static void tnode_free(struct tnode *tn);
163
164 static struct kmem_cache *fn_alias_kmem __read_mostly;
165 static struct kmem_cache *trie_leaf_kmem __read_mostly;
166
/* Strip the type bit to recover the parent tnode pointer (NULL at root). */
167 static inline struct tnode *node_parent(struct node *node)
168 {
169 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
170 }
171
/* RCU-safe variant for readers walking the trie without the RTNL. */
172 static inline struct tnode *node_parent_rcu(struct node *node)
173 {
174 struct tnode *ret = node_parent(node);
175
176 return rcu_dereference(ret);
177 }
178
/* Store a new parent while preserving the node's leaf/tnode type bit. */
179 static inline void node_set_parent(struct node *node, struct tnode *ptr)
180 {
181 rcu_assign_pointer(node->parent,
182 (unsigned long)ptr | NODE_TYPE(node));
183 }
184
185 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
186 {
187 BUG_ON(i >= 1U << tn->bits);
188
189 return tn->child[i];
190 }
191
192 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
193 {
194 struct node *ret = tnode_get_child(tn, i);
195
196 return rcu_dereference(ret);
197 }
198
/* Number of child slots in a tnode: always a power of two. */
199 static inline int tnode_child_length(const struct tnode *tn)
200 {
201 return 1 << tn->bits;
202 }
203
204 static inline t_key mask_pfx(t_key k, unsigned short l)
205 {
206 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
207 }
208
209 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
210 {
211 if (offset < KEYLENGTH)
212 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
213 else
214 return 0;
215 }
216
217 static inline int tkey_equals(t_key a, t_key b)
218 {
219 return a == b;
220 }
221
222 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
223 {
224 if (bits == 0 || offset >= KEYLENGTH)
225 return 1;
226 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
227 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
228 }
229
230 static inline int tkey_mismatch(t_key a, int offset, t_key b)
231 {
232 t_key diff = a ^ b;
233 int i = offset;
234
235 if (!diff)
236 return 0;
237 while ((diff << i) >> (KEYLENGTH-1) == 0)
238 i++;
239 return i;
240 }
241
242 /*
243 To understand this stuff, an understanding of keys and all their bits is
244 necessary. Every node in the trie has a key associated with it, but not
245 all of the bits in that key are significant.
246
247 Consider a node 'n' and its parent 'tp'.
248
249 If n is a leaf, every bit in its key is significant. Its presence is
250 necessitated by path compression, since during a tree traversal (when
251 searching for a leaf - unless we are doing an insertion) we will completely
252 ignore all skipped bits we encounter. Thus we need to verify, at the end of
253 a potentially successful search, that we have indeed been walking the
254 correct key path.
255
256 Note that we can never "miss" the correct key in the tree if present by
257 following the wrong path. Path compression ensures that segments of the key
258 that are the same for all keys with a given prefix are skipped, but the
259 skipped part *is* identical for each node in the subtrie below the skipped
260 bit! trie_insert() in this implementation takes care of that - note the
261 call to tkey_sub_equals() in trie_insert().
262
263 if n is an internal node - a 'tnode' here, the various parts of its key
264 have many different meanings.
265
266 Example:
267 _________________________________________________________________
268 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
269 -----------------------------------------------------------------
270 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
271
272 _________________________________________________________________
273 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
274 -----------------------------------------------------------------
275 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
276
277 tp->pos = 7
278 tp->bits = 3
279 n->pos = 15
280 n->bits = 4
281
282 First, let's just ignore the bits that come before the parent tp, that is
283 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
284 not use them for anything.
285
286 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
287 index into the parent's child array. That is, they will be used to find
288 'n' among tp's children.
289
290 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
291 for the node n.
292
293 All the bits we have seen so far are significant to the node n. The rest
294 of the bits are really not needed or indeed known in n->key.
295
296 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
297 n's child array, and will of course be different for each child.
298
299
300 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
301 at this point.
302
303 */
304
/* Sanity check: a tnode's index field must not extend past the 32-bit key. */
305 static inline void check_tnode(const struct tnode *tn)
306 {
307 WARN_ON(tn && tn->pos+tn->bits > 32);
308 }
309
/*
 * Fill-factor thresholds (percent) that drive halve()/inflate() in
 * resize(); the *_root values keep the root node larger than interior
 * nodes.
 */
310 static const int halve_threshold = 25;
311 static const int inflate_threshold = 50;
312 static const int halve_threshold_root = 8;
313 static const int inflate_threshold_root = 15;
314
315
/* RCU callback: return a fib_alias to its slab cache. */
316 static void __alias_free_mem(struct rcu_head *head)
317 {
318 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
319 kmem_cache_free(fn_alias_kmem, fa);
320 }
321
/* Defer freeing of 'fa' until all current RCU readers are done with it. */
322 static inline void alias_free_mem_rcu(struct fib_alias *fa)
323 {
324 call_rcu(&fa->rcu, __alias_free_mem);
325 }
326
/* RCU callback: return a leaf to the dedicated leaf slab cache. */
327 static void __leaf_free_rcu(struct rcu_head *head)
328 {
329 struct leaf *l = container_of(head, struct leaf, rcu);
330 kmem_cache_free(trie_leaf_kmem, l);
331 }
332
/* RCU callback: leaf_info is plain kmalloc memory. */
333 static void __leaf_info_free_rcu(struct rcu_head *head)
334 {
335 kfree(container_of(head, struct leaf_info, rcu));
336 }
337
338 static inline void free_leaf_info(struct leaf_info *leaf)
339 {
340 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
341 }
342
/*
 * Allocate zeroed storage for a tnode. Small nodes come from the slab
 * (kzalloc); nodes larger than a page go straight to the page allocator
 * to avoid high-order slab objects. __tnode_free_rcu() mirrors this
 * size-based split when freeing.
 */
343 static struct tnode *tnode_alloc(size_t size)
344 {
345 struct page *pages;
346
347 if (size <= PAGE_SIZE)
348 return kzalloc(size, GFP_KERNEL);
349
350 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
351 if (!pages)
352 return NULL;
353
354 return page_address(pages);
355 }
356
/*
 * RCU callback: free a tnode through the same path it was allocated by
 * tnode_alloc() — kfree for sub-page nodes, free_pages otherwise.
 */
357 static void __tnode_free_rcu(struct rcu_head *head)
358 {
359 struct tnode *tn = container_of(head, struct tnode, rcu);
360 size_t size = sizeof(struct tnode) +
361 (sizeof(struct node *) << tn->bits);
362
363 if (size <= PAGE_SIZE)
364 kfree(tn);
365 else
366 free_pages((unsigned long)tn, get_order(size));
367 }
368
/*
 * RCU-defer freeing of a node; dispatches on the type bit since callers
 * pass both leaves (cast to tnode) and real tnodes here.
 *
 * NOTE(review): leaves are queued on the _bh RCU flavor while tnodes use
 * plain call_rcu — this is only safe if the read side holds the matching
 * RCU flavor for each; confirm lookup runs under rcu_read_lock_bh or
 * unify both on call_rcu.
 */
368 static inline void tnode_free(struct tnode *tn)
369 {
370 if (IS_LEAF(tn)) {
371 struct leaf *l = (struct leaf *) tn;
372 call_rcu_bh(&l->rcu, __leaf_free_rcu);
373 } else
374 call_rcu(&tn->rcu, __tnode_free_rcu);
375 }
377
/*
 * Allocate a leaf from the slab cache. The parent word starts as T_LEAF
 * (type bit set, no parent yet); note the slab memory is not zeroed, so
 * the caller must set ->key before linking the leaf into the trie.
 */
378 static struct leaf *leaf_new(void)
379 {
380 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
381 if (l) {
382 l->parent = T_LEAF;
383 INIT_HLIST_HEAD(&l->list);
384 }
385 return l;
386 }
387
/* Allocate a leaf_info for prefix length 'plen' with an empty alias list. */
388 static struct leaf_info *leaf_info_new(int plen)
389 {
390 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
391 if (li) {
392 li->plen = plen;
393 INIT_LIST_HEAD(&li->falh);
394 }
395 return li;
396 }
397
398 static struct tnode* tnode_new(t_key key, int pos, int bits)
399 {
400 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
401 struct tnode *tn = tnode_alloc(sz);
402
403 if (tn) {
404 tn->parent = T_TNODE;
405 tn->pos = pos;
406 tn->bits = bits;
407 tn->key = key;
408 tn->full_children = 0;
409 tn->empty_children = 1<<bits;
410 }
411
412 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
413 (unsigned long) (sizeof(struct node) << bits));
414 return tn;
415 }
416
417 /*
418 * Check whether a tnode 'n' is "full", i.e. it is an internal node
419 * and no bits are skipped. See discussion in dyntree paper p. 6
420 */
421
/* 'full' means n's index starts exactly where tn's index ends. */
422 static inline int tnode_full(const struct tnode *tn, const struct node *n)
423 {
424 if (n == NULL || IS_LEAF(n))
425 return 0;
426
427 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
428 }
429
/* Set child slot 'i' of 'tn'; wasfull == -1 means "recompute fullness". */
430 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
431 {
432 tnode_put_child_reorg(tn, i, n, -1);
433 }
434
435 /*
436 * Add a child at position i overwriting the old value.
437 * Update the value of full_children and empty_children.
438 */
439
/*
 * Install child 'n' in slot 'i' of 'tn', replacing the old value, and
 * keep the empty_children/full_children counters in step. 'wasfull'
 * caches whether the old child was full; pass -1 to have it recomputed.
 * The slot itself is published with rcu_assign_pointer so concurrent
 * readers see either the old or the new child.
 */
440 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
441 {
442 struct node *chi = tn->child[i];
443 int isfull;
444
445 BUG_ON(i >= 1<<tn->bits);
446
447
448 /* update emptyChildren */
449 if (n == NULL && chi != NULL)
450 tn->empty_children++;
451 else if (n != NULL && chi == NULL)
452 tn->empty_children--;
453
454 /* update fullChildren */
455 if (wasfull == -1)
456 wasfull = tnode_full(tn, chi);
457
458 isfull = tnode_full(tn, n);
459 if (wasfull && !isfull)
460 tn->full_children--;
461 else if (!wasfull && isfull)
462 tn->full_children++;
463
/* Re-parent before publishing so readers never see a stale parent. */
464 if (n)
465 node_set_parent(n, tn);
466
467 rcu_assign_pointer(tn->child[i], n);
468 }
469
470 static struct node *resize(struct trie *t, struct tnode *tn)
471 {
472 int i;
473 int err = 0;
474 struct tnode *old_tn;
475 int inflate_threshold_use;
476 int halve_threshold_use;
477 int max_resize;
478
479 if (!tn)
480 return NULL;
481
482 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
483 tn, inflate_threshold, halve_threshold);
484
485 /* No children */
486 if (tn->empty_children == tnode_child_length(tn)) {
487 tnode_free(tn);
488 return NULL;
489 }
490 /* One child */
491 if (tn->empty_children == tnode_child_length(tn) - 1)
492 for (i = 0; i < tnode_child_length(tn); i++) {
493 struct node *n;
494
495 n = tn->child[i];
496 if (!n)
497 continue;
498
499 /* compress one level */
500 node_set_parent(n, NULL);
501 tnode_free(tn);
502 return n;
503 }
504 /*
505 * Double as long as the resulting node has a number of
506 * nonempty nodes that are above the threshold.
507 */
508
509 /*
510 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
511 * the Helsinki University of Technology and Matti Tikkanen of Nokia
512 * Telecommunications, page 6:
513 * "A node is doubled if the ratio of non-empty children to all
514 * children in the *doubled* node is at least 'high'."
515 *
516 * 'high' in this instance is the variable 'inflate_threshold'. It
517 * is expressed as a percentage, so we multiply it with
518 * tnode_child_length() and instead of multiplying by 2 (since the
519 * child array will be doubled by inflate()) and multiplying
520 * the left-hand side by 100 (to handle the percentage thing) we
521 * multiply the left-hand side by 50.
522 *
523 * The left-hand side may look a bit weird: tnode_child_length(tn)
524 * - tn->empty_children is of course the number of non-null children
525 * in the current node. tn->full_children is the number of "full"
526 * children, that is non-null tnodes with a skip value of 0.
527 * All of those will be doubled in the resulting inflated tnode, so
528 * we just count them one extra time here.
529 *
530 * A clearer way to write this would be:
531 *
532 * to_be_doubled = tn->full_children;
533 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
534 * tn->full_children;
535 *
536 * new_child_length = tnode_child_length(tn) * 2;
537 *
538 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
539 * new_child_length;
540 * if (new_fill_factor >= inflate_threshold)
541 *
542 * ...and so on, tho it would mess up the while () loop.
543 *
544 * anyway,
545 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
546 * inflate_threshold
547 *
548 * avoid a division:
549 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
550 * inflate_threshold * new_child_length
551 *
552 * expand not_to_be_doubled and to_be_doubled, and shorten:
553 * 100 * (tnode_child_length(tn) - tn->empty_children +
554 * tn->full_children) >= inflate_threshold * new_child_length
555 *
556 * expand new_child_length:
557 * 100 * (tnode_child_length(tn) - tn->empty_children +
558 * tn->full_children) >=
559 * inflate_threshold * tnode_child_length(tn) * 2
560 *
561 * shorten again:
562 * 50 * (tn->full_children + tnode_child_length(tn) -
563 * tn->empty_children) >= inflate_threshold *
564 * tnode_child_length(tn)
565 *
566 */
567
568 check_tnode(tn);
569
570 /* Keep root node larger */
571
572 if (!tn->parent)
573 inflate_threshold_use = inflate_threshold_root;
574 else
575 inflate_threshold_use = inflate_threshold;
576
577 err = 0;
578 max_resize = 10;
579 while ((tn->full_children > 0 && max_resize-- &&
580 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
581 inflate_threshold_use * tnode_child_length(tn))) {
582
583 old_tn = tn;
584 tn = inflate(t, tn);
585 if (IS_ERR(tn)) {
586 tn = old_tn;
587 #ifdef CONFIG_IP_FIB_TRIE_STATS
588 t->stats.resize_node_skipped++;
589 #endif
590 break;
591 }
592 }
593
594 if (max_resize < 0) {
595 if (!tn->parent)
596 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
597 inflate_threshold_root, tn->bits);
598 else
599 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
600 inflate_threshold, tn->bits);
601 }
602
603 check_tnode(tn);
604
605 /*
606 * Halve as long as the number of empty children in this
607 * node is above threshold.
608 */
609
610
611 /* Keep root node larger */
612
613 if (!tn->parent)
614 halve_threshold_use = halve_threshold_root;
615 else
616 halve_threshold_use = halve_threshold;
617
618 err = 0;
619 max_resize = 10;
620 while (tn->bits > 1 && max_resize-- &&
621 100 * (tnode_child_length(tn) - tn->empty_children) <
622 halve_threshold_use * tnode_child_length(tn)) {
623
624 old_tn = tn;
625 tn = halve(t, tn);
626 if (IS_ERR(tn)) {
627 tn = old_tn;
628 #ifdef CONFIG_IP_FIB_TRIE_STATS
629 t->stats.resize_node_skipped++;
630 #endif
631 break;
632 }
633 }
634
635 if (max_resize < 0) {
636 if (!tn->parent)
637 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
638 halve_threshold_root, tn->bits);
639 else
640 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
641 halve_threshold, tn->bits);
642 }
643
644 /* Only one child remains */
645 if (tn->empty_children == tnode_child_length(tn) - 1)
646 for (i = 0; i < tnode_child_length(tn); i++) {
647 struct node *n;
648
649 n = tn->child[i];
650 if (!n)
651 continue;
652
653 /* compress one level */
654
655 node_set_parent(n, NULL);
656 tnode_free(tn);
657 return n;
658 }
659
660 return (struct node *) tn;
661 }
662
/*
 * Double tn's child array (bits+1) to improve its fill factor. Children
 * of the old node are redistributed into the new, larger node; "full"
 * children (tnodes with no skipped bits) are split into a left/right
 * pair one position further down the key. Returns the new tnode, or
 * ERR_PTR(-ENOMEM) leaving the original node intact.
 */
663 static struct tnode *inflate(struct trie *t, struct tnode *tn)
664 {
665 struct tnode *oldtnode = tn;
666 int olen = tnode_child_length(tn);
667 int i;
668
669 pr_debug("In inflate\n");
670
671 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
672
673 if (!tn)
674 return ERR_PTR(-ENOMEM);
675
676 /*
677 * Preallocate and store tnodes before the actual work so we
678 * don't get into an inconsistent state if memory allocation
679 * fails. In case of failure we return the oldnode and inflate
680 * of tnode is ignored.
681 */
682
683 for (i = 0; i < olen; i++) {
684 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
685
686 if (inode &&
687 IS_TNODE(inode) &&
688 inode->pos == oldtnode->pos + oldtnode->bits &&
689 inode->bits > 1) {
690 struct tnode *left, *right;
/* m: single bit at inode->pos — the bit that distinguishes left/right. */
691 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
692
693 left = tnode_new(inode->key&(~m), inode->pos + 1,
694 inode->bits - 1);
695 if (!left)
696 goto nomem;
697
698 right = tnode_new(inode->key|m, inode->pos + 1,
699 inode->bits - 1);
700
701 if (!right) {
702 tnode_free(left);
703 goto nomem;
704 }
705
706 put_child(t, tn, 2*i, (struct node *) left);
707 put_child(t, tn, 2*i+1, (struct node *) right);
708 }
709 }
710
/* Second pass: move every old child into its new slot(s). */
711 for (i = 0; i < olen; i++) {
712 struct tnode *inode;
713 struct node *node = tnode_get_child(oldtnode, i);
714 struct tnode *left, *right;
715 int size, j;
716
717 /* An empty child */
718 if (node == NULL)
719 continue;
720
721 /* A leaf or an internal node with skipped bits */
722
723 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
724 tn->pos + tn->bits - 1) {
725 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
726 1) == 0)
727 put_child(t, tn, 2*i, node);
728 else
729 put_child(t, tn, 2*i+1, node);
730 continue;
731 }
732
733 /* An internal node with two children */
734 inode = (struct tnode *) node;
735
736 if (inode->bits == 1) {
737 put_child(t, tn, 2*i, inode->child[0]);
738 put_child(t, tn, 2*i+1, inode->child[1]);
739
740 tnode_free(inode);
741 continue;
742 }
743
744 /* An internal node with more than two children */
745
746 /* We will replace this node 'inode' with two new
747 * ones, 'left' and 'right', each with half of the
748 * original children. The two new nodes will have
749 * a position one bit further down the key and this
750 * means that the "significant" part of their keys
751 * (see the discussion near the top of this file)
752 * will differ by one bit, which will be "0" in
753 * left's key and "1" in right's key. Since we are
754 * moving the key position by one step, the bit that
755 * we are moving away from - the bit at position
756 * (inode->pos) - is the one that will differ between
757 * left and right. So... we synthesize that bit in the
758 * two new keys.
759 * The mask 'm' below will be a single "one" bit at
760 * the position (inode->pos)
761 */
762
763 /* Use the old key, but set the new significant
764 * bit to zero.
765 */
766
/* left/right were preallocated by the first pass above. */
767 left = (struct tnode *) tnode_get_child(tn, 2*i);
768 put_child(t, tn, 2*i, NULL);
769
770 BUG_ON(!left);
771
772 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
773 put_child(t, tn, 2*i+1, NULL);
774
775 BUG_ON(!right);
776
777 size = tnode_child_length(left);
778 for (j = 0; j < size; j++) {
779 put_child(t, left, j, inode->child[j]);
780 put_child(t, right, j, inode->child[j + size]);
781 }
782 put_child(t, tn, 2*i, resize(t, left));
783 put_child(t, tn, 2*i+1, resize(t, right));
784
785 tnode_free(inode);
786 }
787 tnode_free(oldtnode);
788 return tn;
789 nomem:
790 {
/* Allocation failed mid-preallocation: free everything built so far. */
791 int size = tnode_child_length(tn);
792 int j;
793
794 for (j = 0; j < size; j++)
795 if (tn->child[j])
796 tnode_free((struct tnode *)tn->child[j]);
797
798 tnode_free(tn);
799
800 return ERR_PTR(-ENOMEM);
801 }
802 }
803
/*
 * Halve tn's child array (bits-1). Sibling pairs that are both nonempty
 * are joined under a new one-bit binary tnode; pairs with a single child
 * just move down. Returns the new tnode, or ERR_PTR(-ENOMEM) leaving
 * the original node intact.
 */
804 static struct tnode *halve(struct trie *t, struct tnode *tn)
805 {
806 struct tnode *oldtnode = tn;
807 struct node *left, *right;
808 int i;
809 int olen = tnode_child_length(tn);
810
811 pr_debug("In halve\n");
812
813 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
814
815 if (!tn)
816 return ERR_PTR(-ENOMEM);
817
818 /*
819 * Preallocate and store tnodes before the actual work so we
820 * don't get into an inconsistent state if memory allocation
821 * fails. In case of failure we return the oldnode and halve
822 * of tnode is ignored.
823 */
824
825 for (i = 0; i < olen; i += 2) {
826 left = tnode_get_child(oldtnode, i);
827 right = tnode_get_child(oldtnode, i+1);
828
829 /* Two nonempty children */
830 if (left && right) {
831 struct tnode *newn;
832
833 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
834
835 if (!newn)
836 goto nomem;
837
838 put_child(t, tn, i/2, (struct node *)newn);
839 }
840
841 }
842
/* Second pass: fill the preallocated binary nodes / move lone children. */
843 for (i = 0; i < olen; i += 2) {
844 struct tnode *newBinNode;
845
846 left = tnode_get_child(oldtnode, i);
847 right = tnode_get_child(oldtnode, i+1);
848
849 /* At least one of the children is empty */
850 if (left == NULL) {
851 if (right == NULL) /* Both are empty */
852 continue;
853 put_child(t, tn, i/2, right);
854 continue;
855 }
856
857 if (right == NULL) {
858 put_child(t, tn, i/2, left);
859 continue;
860 }
861
862 /* Two nonempty children */
863 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
864 put_child(t, tn, i/2, NULL);
865 put_child(t, newBinNode, 0, left);
866 put_child(t, newBinNode, 1, right);
867 put_child(t, tn, i/2, resize(t, newBinNode));
868 }
869 tnode_free(oldtnode);
870 return tn;
871 nomem:
872 {
/* Allocation failed mid-preallocation: free everything built so far. */
873 int size = tnode_child_length(tn);
874 int j;
875
876 for (j = 0; j < size; j++)
877 if (tn->child[j])
878 tnode_free((struct tnode *)tn->child[j]);
879
880 tnode_free(tn);
881
882 return ERR_PTR(-ENOMEM);
883 }
884 }
885
886 /* Read side must hold rcu_read_lock(); currently the dump routines
887 call this via get_fa_head and dump */
888
/* Find the leaf_info for prefix length 'plen' on leaf 'l', or NULL. */
889 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
890 {
891 struct hlist_head *head = &l->list;
892 struct hlist_node *node;
893 struct leaf_info *li;
894
895 hlist_for_each_entry_rcu(li, node, head, hlist)
896 if (li->plen == plen)
897 return li;
898
899 return NULL;
900 }
901
/* Return the fib_alias list for (leaf, plen), or NULL if none exists. */
902 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
903 {
904 struct leaf_info *li = find_leaf_info(l, plen);
905
906 if (!li)
907 return NULL;
908
909 return &li->falh;
910 }
911
/*
 * Insert 'new' into the leaf's leaf_info list, keeping it sorted by
 * descending prefix length (longest prefix first). Uses the RCU list
 * primitives so concurrent readers always see a consistent list.
 */
912 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
913 {
914 struct leaf_info *li = NULL, *last = NULL;
915 struct hlist_node *node;
916
917 if (hlist_empty(head)) {
918 hlist_add_head_rcu(&new->hlist, head);
919 } else {
920 hlist_for_each_entry(li, node, head, hlist) {
921 if (new->plen > li->plen)
922 break;
923
924 last = li;
925 }
/* last == NULL means 'new' sorts before the first entry. */
926 if (last)
927 hlist_add_after_rcu(&last->hlist, &new->hlist);
928 else
929 hlist_add_before_rcu(&new->hlist, &li->hlist);
930 }
931 }
932
933 /* rcu_read_lock needs to be held by the caller on the read side */
934
/*
 * Exact-match lookup: walk the trie following 'key' and return the leaf
 * whose key equals it, or NULL. Skipped bits are re-verified at each
 * tnode (tkey_sub_equals) and by the final full-key comparison.
 */
935 static struct leaf *
936 fib_find_node(struct trie *t, u32 key)
937 {
938 int pos;
939 struct tnode *tn;
940 struct node *n;
941
942 pos = 0;
943 n = rcu_dereference(t->trie);
944
945 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
946 tn = (struct tnode *) n;
947
948 check_tnode(tn);
949
950 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
951 pos = tn->pos + tn->bits;
952 n = tnode_get_child_rcu(tn, tkey_extract_bits(key, tn->pos, tn->bits));
953 } else
954 break;
955 }
956 /* Case we have found a leaf. Compare prefixes */
957
958 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
959 return (struct leaf *)n;
960
961 return NULL;
962 }
963
/*
 * Walk from 'tn' back up to the root, resizing each tnode on the way
 * and re-linking the (possibly replaced) node into its parent's slot.
 * Returns the new root node.
 *
 * NOTE(review): if resize() returned NULL here the following
 * node_parent() call would dereference a NULL pointer; on this path the
 * resized node should always retain at least one child — confirm.
 */
964 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
965 {
966 int wasfull;
967 t_key cindex, key = tn->key;
968 struct tnode *tp;
969
970 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
971 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
972 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
973 tn = (struct tnode *) resize (t, (struct tnode *)tn);
974 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
975
976 tp = node_parent((struct node *) tn);
977 if (!tp)
978 break;
979 tn = tp;
980 }
981
982 /* Handle last (top) tnode */
983 if (IS_TNODE(tn))
984 tn = (struct tnode*) resize(t, (struct tnode *)tn);
985
986 return (struct node*) tn;
987 }
988
989 /* only used from updater-side */
990
/*
 * Insert (key, plen) into the trie, creating a leaf and/or intermediate
 * tnode as needed, then rebalance. Returns the fib_alias list head of
 * the (new or existing) leaf_info, or NULL on allocation failure.
 * Caller holds the RTNL (updater side).
 */
991 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
992 {
993 int pos, newpos;
994 struct tnode *tp = NULL, *tn = NULL;
995 struct node *n;
996 struct leaf *l;
997 int missbit;
998 struct list_head *fa_head = NULL;
999 struct leaf_info *li;
1000 t_key cindex;
1001
1002 pos = 0;
1003 n = t->trie;
1004
1005 /* If we point to NULL, stop. Either the tree is empty and we should
1006 * just put a new leaf in if, or we have reached an empty child slot,
1007 * and we should just put our new leaf in that.
1008 * If we point to a T_TNODE, check if it matches our key. Note that
1009 * a T_TNODE might be skipping any number of bits - its 'pos' need
1010 * not be the parent's 'pos'+'bits'!
1011 *
1012 * If it does match the current key, get pos/bits from it, extract
1013 * the index from our key, push the T_TNODE and walk the tree.
1014 *
1015 * If it doesn't, we have to replace it with a new T_TNODE.
1016 *
1017 * If we point to a T_LEAF, it might or might not have the same key
1018 * as we do. If it does, just change the value, update the T_LEAF's
1019 * value, and return it.
1020 * If it doesn't, we need to replace it with a T_TNODE.
1021 */
1022
1023 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1024 tn = (struct tnode *) n;
1025
1026 check_tnode(tn);
1027
1028 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1029 tp = tn;
1030 pos = tn->pos + tn->bits;
1031 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1032
1033 BUG_ON(n && node_parent(n) != tn);
1034 } else
1035 break;
1036 }
1037
1038 /*
1039 * n ----> NULL, LEAF or TNODE
1040 *
1041 * tp is n's (parent) ----> NULL or TNODE
1042 */
1043
1044 BUG_ON(tp && IS_LEAF(tp));
1045
1046 /* Case 1: n is a leaf. Compare prefixes */
1047
1048 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
/* Same destination already present: just add a leaf_info for 'plen'. */
1049 l = (struct leaf *) n;
1050 li = leaf_info_new(plen);
1051
1052 if (!li)
1053 return NULL;
1054
1055 fa_head = &li->falh;
1056 insert_leaf_info(&l->list, li);
1057 goto done;
1058 }
1059 l = leaf_new();
1060
1061 if (!l)
1062 return NULL;
1063
1064 l->key = key;
1065 li = leaf_info_new(plen);
1066
1067 if (!li) {
1068 tnode_free((struct tnode *) l);
1069 return NULL;
1070 }
1071
1072 fa_head = &li->falh;
1073 insert_leaf_info(&l->list, li);
1074
1075 if (t->trie && n == NULL) {
1076 /* Case 2: n is NULL, and will just insert a new leaf */
1077
1078 node_set_parent((struct node *)l, tp);
1079
1080 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1081 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1082 } else {
1083 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1084 /*
1085 * Add a new tnode here
1086 * first tnode need some special handling
1087 */
1088
1089 if (tp)
1090 pos = tp->pos+tp->bits;
1091 else
1092 pos = 0;
1093
1094 if (n) {
/* Branch at the first bit where the keys disagree. */
1095 newpos = tkey_mismatch(key, pos, n->key);
1096 tn = tnode_new(n->key, newpos, 1);
1097 } else {
1098 newpos = 0;
1099 tn = tnode_new(key, newpos, 1); /* First tnode */
1100 }
1101
1102 if (!tn) {
1103 free_leaf_info(li);
1104 tnode_free((struct tnode *) l);
1105 return NULL;
1106 }
1107
1108 node_set_parent((struct node *)tn, tp);
1109
/* The differing bit decides which slot each of the two keys takes. */
1110 missbit = tkey_extract_bits(key, newpos, 1);
1111 put_child(t, tn, missbit, (struct node *)l);
1112 put_child(t, tn, 1-missbit, n);
1113
1114 if (tp) {
1115 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1116 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1117 } else {
1118 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1119 tp = tn;
1120 }
1121 }
1122
1123 if (tp && tp->pos + tp->bits > 32)
1124 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1125 tp, tp->pos, tp->bits, key, plen);
1126
1127 /* Rebalance the trie */
1128
1129 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1130 done:
1131 return fa_head;
1132 }
1133
1134 /*
1135 * Caller must hold RTNL.
1136 */
1137 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1138 {
1139 struct trie *t = (struct trie *) tb->tb_data;
1140 struct fib_alias *fa, *new_fa;
1141 struct list_head *fa_head = NULL;
1142 struct fib_info *fi;
1143 int plen = cfg->fc_dst_len;
1144 u8 tos = cfg->fc_tos;
1145 u32 key, mask;
1146 int err;
1147 struct leaf *l;
1148
1149 if (plen > 32)
1150 return -EINVAL;
1151
1152 key = ntohl(cfg->fc_dst);
1153
1154 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1155
1156 mask = ntohl(inet_make_mask(plen));
1157
1158 if (key & ~mask)
1159 return -EINVAL;
1160
1161 key = key & mask;
1162
1163 fi = fib_create_info(cfg);
1164 if (IS_ERR(fi)) {
1165 err = PTR_ERR(fi);
1166 goto err;
1167 }
1168
1169 l = fib_find_node(t, key);
1170 fa = NULL;
1171
1172 if (l) {
1173 fa_head = get_fa_head(l, plen);
1174 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1175 }
1176
1177 /* Now fa, if non-NULL, points to the first fib alias
1178 * with the same keys [prefix,tos,priority], if such key already
1179 * exists or to the node before which we will insert new one.
1180 *
1181 * If fa is NULL, we will need to allocate a new one and
1182 * insert to the head of f.
1183 *
1184 * If f is NULL, no fib node matched the destination key
1185 * and we need to allocate a new one of those as well.
1186 */
1187
1188 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1189 struct fib_alias *fa_orig;
1190
1191 err = -EEXIST;
1192 if (cfg->fc_nlflags & NLM_F_EXCL)
1193 goto out;
1194
1195 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1196 struct fib_info *fi_drop;
1197 u8 state;
1198
1199 if (fi->fib_treeref > 1)
1200 goto out;
1201
1202 err = -ENOBUFS;
1203 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1204 if (new_fa == NULL)
1205 goto out;
1206
1207 fi_drop = fa->fa_info;
1208 new_fa->fa_tos = fa->fa_tos;
1209 new_fa->fa_info = fi;
1210 new_fa->fa_type = cfg->fc_type;
1211 new_fa->fa_scope = cfg->fc_scope;
1212 state = fa->fa_state;
1213 new_fa->fa_state &= ~FA_S_ACCESSED;
1214
1215 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1216 alias_free_mem_rcu(fa);
1217
1218 fib_release_info(fi_drop);
1219 if (state & FA_S_ACCESSED)
1220 rt_cache_flush(-1);
1221 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1222 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1223
1224 goto succeeded;
1225 }
1226 /* Error if we find a perfect match which
1227 * uses the same scope, type, and nexthop
1228 * information.
1229 */
1230 fa_orig = fa;
1231 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1232 if (fa->fa_tos != tos)
1233 break;
1234 if (fa->fa_info->fib_priority != fi->fib_priority)
1235 break;
1236 if (fa->fa_type == cfg->fc_type &&
1237 fa->fa_scope == cfg->fc_scope &&
1238 fa->fa_info == fi) {
1239 goto out;
1240 }
1241 }
1242 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1243 fa = fa_orig;
1244 }
1245 err = -ENOENT;
1246 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1247 goto out;
1248
1249 err = -ENOBUFS;
1250 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1251 if (new_fa == NULL)
1252 goto out;
1253
1254 new_fa->fa_info = fi;
1255 new_fa->fa_tos = tos;
1256 new_fa->fa_type = cfg->fc_type;
1257 new_fa->fa_scope = cfg->fc_scope;
1258 new_fa->fa_state = 0;
1259 /*
1260 * Insert new entry to the list.
1261 */
1262
1263 if (!fa_head) {
1264 fa_head = fib_insert_node(t, key, plen);
1265 if (unlikely(!fa_head)) {
1266 err = -ENOMEM;
1267 goto out_free_new_fa;
1268 }
1269 }
1270
1271 list_add_tail_rcu(&new_fa->fa_list,
1272 (fa ? &fa->fa_list : fa_head));
1273
1274 t->size++;
1275
1276 rt_cache_flush(-1);
1277 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1278 &cfg->fc_nlinfo, 0);
1279 succeeded:
1280 return 0;
1281
1282 out_free_new_fa:
1283 kmem_cache_free(fn_alias_kmem, new_fa);
1284 out:
1285 fib_release_info(fi);
1286 err:
1287 return err;
1288 }
1289
1290
/* should be called with rcu_read_lock */

/*
 * Try to semantically match the flow against every prefix length
 * stored in leaf l.  On a match (or hard error) the matched prefix
 * length is stored through *plen and fib_semantic_match()'s result
 * (<= 0) is returned.  A return of 1 means "no match, keep searching".
 */
static inline int check_leaf(struct trie *t, struct leaf *l,
			     t_key key, int *plen, const struct flowi *flp,
			     struct fib_result *res)
{
	int err, i;
	__be32 mask;
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		i = li->plen;
		mask = inet_make_mask(i);
		/* the leaf key must cover the lookup key under this mask */
		if (l->key != (key & ntohl(mask)))
			continue;

		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
			*plen = i;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.semantic_match_passed++;
#endif
			return err;
		}
#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
#endif
	}
	return 1;
}
1321
/*
 * Longest-prefix-match lookup of flp->fl4_dst in the LC-trie.
 *
 * Walks down the trie using the key bits to index into each tnode's
 * child array; on a dead end it backtracks (chopping index bits from
 * the right) to find progressively shorter matching prefixes.
 * current_prefix_length tracks how many leading key bits are still
 * required to match.  Returns <= 0 from check_leaf() on a match
 * (result stored in *res), or 1 when nothing matched.
 *
 * Must be called under rcu_read_lock() (taken here).
 */
static int
fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int plen, ret = 0;
	struct node *n;
	struct tnode *pn;
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
			goto found;
		goto failed;
	}
	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* after a backtrack, cindex was adjusted in place below */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
				goto found;
			else
				goto backtrace;
		}

#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length) != 0 ||
			    !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/* Note: We aren't very concerned about the piece of the key
		 * that precede pn->pos+pn->bits, since these have already been
		 * checked. The bits after cn->pos aren't checked since these are
		 * by definition "unknown" at this point. Thus, what we want to
		 * see is if we are about to enter the "prefix matching" state,
		 * and in that case verify that the skipped bits that will prevail
		 * throughout this subtree are zero, as they have to be if we are
		 * to find a matching prefix.
		 */

		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/* In short: If skipped bits in this node do not match the search
		 * key, enter the "prefix matching" state.directly.
		 */
		if (pref_mismatch) {
			/* find the position of the first mismatching bit */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch <<1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}
#endif
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits - chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent((struct node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1515
/* only called from updater side */

/*
 * Find the leaf whose key equals @key exactly, unlink it from the trie
 * and rebalance.  Returns 1 if a leaf was removed, 0 if no exact match
 * was found.
 */
static int trie_leaf_remove(struct trie *t, t_key key)
{
	t_key cindex;
	struct tnode *tp = NULL;
	struct node *n = t->trie;
	struct leaf *l;

	pr_debug("entering trie_leaf_remove(%p)\n", n);

	/* Note that in the case skipped bits, those bits are *not* checked!
	 * When we finish this, we will have NULL or a T_LEAF, and the
	 * T_LEAF may or may not match our key.
	 */

	while (n != NULL && IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		check_tnode(tn);
		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));

		BUG_ON(n && node_parent(n) != tn);
	}
	l = (struct leaf *) n;

	if (!n || !tkey_equals(l->key, key))
		return 0;

	/*
	 * Key found.
	 * Remove the leaf and rebalance the tree
	 */

	t->size--;

	tp = node_parent(n);
	/* NOTE(review): the leaf is released through tnode_free() - confirm
	 * it handles leaves (leaf slab cache) as well as tnodes. */
	tnode_free((struct tnode *) n);

	if (tp) {
		/* clear the parent slot and rebalance from there up */
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, NULL);
		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
	} else
		rcu_assign_pointer(t->trie, NULL);

	return 1;
}
1562
/*
 * Delete the route described by cfg from table tb.
 * Caller must hold RTNL.
 *
 * Returns 0 on success, -EINVAL on a malformed prefix, -ESRCH when no
 * matching route exists.
 */
static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* host bits outside the prefix are not allowed */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* step back one entry so the walk below starts at fa itself */
	fa_head = fa->fa_list.prev;

	list_for_each_entry(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		/* unspecified selectors in cfg act as wildcards */
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == cfg->fc_scope) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	/* drop the leaf_info when its alias list became empty */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	/* and the leaf itself when no leaf_info is left on it */
	if (hlist_empty(&l->list))
		trie_leaf_remove(t, key);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(-1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1647
1648 static int trie_flush_list(struct trie *t, struct list_head *head)
1649 {
1650 struct fib_alias *fa, *fa_node;
1651 int found = 0;
1652
1653 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1654 struct fib_info *fi = fa->fa_info;
1655
1656 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1657 list_del_rcu(&fa->fa_list);
1658 fib_release_info(fa->fa_info);
1659 alias_free_mem_rcu(fa);
1660 found++;
1661 }
1662 }
1663 return found;
1664 }
1665
1666 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1667 {
1668 int found = 0;
1669 struct hlist_head *lih = &l->list;
1670 struct hlist_node *node, *tmp;
1671 struct leaf_info *li = NULL;
1672
1673 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1674 found += trie_flush_list(t, &li->falh);
1675
1676 if (list_empty(&li->falh)) {
1677 hlist_del_rcu(&li->hlist);
1678 free_leaf_info(li);
1679 }
1680 }
1681 return found;
1682 }
1683
1684 /* rcu_read_lock needs to be hold by caller from readside */
1685
1686 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1687 {
1688 struct node *c = (struct node *) thisleaf;
1689 struct tnode *p;
1690 int idx;
1691 struct node *trie = rcu_dereference(t->trie);
1692
1693 if (c == NULL) {
1694 if (trie == NULL)
1695 return NULL;
1696
1697 if (IS_LEAF(trie)) /* trie w. just a leaf */
1698 return (struct leaf *) trie;
1699
1700 p = (struct tnode*) trie; /* Start */
1701 } else
1702 p = node_parent_rcu(c);
1703
1704 while (p) {
1705 int pos, last;
1706
1707 /* Find the next child of the parent */
1708 if (c)
1709 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1710 else
1711 pos = 0;
1712
1713 last = 1 << p->bits;
1714 for (idx = pos; idx < last ; idx++) {
1715 c = rcu_dereference(p->child[idx]);
1716
1717 if (!c)
1718 continue;
1719
1720 /* Decend if tnode */
1721 while (IS_TNODE(c)) {
1722 p = (struct tnode *) c;
1723 idx = 0;
1724
1725 /* Rightmost non-NULL branch */
1726 if (p && IS_TNODE(p))
1727 while (!(c = rcu_dereference(p->child[idx]))
1728 && idx < (1<<p->bits)) idx++;
1729
1730 /* Done with this tnode? */
1731 if (idx >= (1 << p->bits) || !c)
1732 goto up;
1733 }
1734 return (struct leaf *) c;
1735 }
1736 up:
1737 /* No more children go up one step */
1738 c = (struct node *) p;
1739 p = node_parent_rcu(c);
1740 }
1741 return NULL; /* Ready. Root of trie */
1742 }
1743
1744 /*
1745 * Caller must hold RTNL.
1746 */
1747 static int fn_trie_flush(struct fib_table *tb)
1748 {
1749 struct trie *t = (struct trie *) tb->tb_data;
1750 struct leaf *ll = NULL, *l = NULL;
1751 int found = 0, h;
1752
1753 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1754 found += trie_flush_leaf(t, l);
1755
1756 if (ll && hlist_empty(&ll->list))
1757 trie_leaf_remove(t, ll->key);
1758 ll = l;
1759 }
1760
1761 if (ll && hlist_empty(&ll->list))
1762 trie_leaf_remove(t, ll->key);
1763
1764 pr_debug("trie_flush found=%d\n", found);
1765 return found;
1766 }
1767
/*
 * Pick a default route among the /0 aliases stored at key 0.
 *
 * Scans the candidate unicast gateways in priority order, skipping
 * dead nexthops (fib_detect_death), and assigns the chosen fib_info
 * to *res.  tb->tb_default caches the index of the current choice
 * (-1 when none).  Runs under rcu_read_lock.
 */
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	/* default routes live at key 0, prefix length 0 */
	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		/* only unicast routes in the requested scope qualify */
		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		/* candidate must be a directly reachable gateway */
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}
	/* everything looked dead: fall back to the last resort, if any */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	rcu_read_unlock();
}
1839
1840 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1841 struct sk_buff *skb, struct netlink_callback *cb)
1842 {
1843 int i, s_i;
1844 struct fib_alias *fa;
1845
1846 __be32 xkey = htonl(key);
1847
1848 s_i = cb->args[4];
1849 i = 0;
1850
1851 /* rcu_read_lock is hold by caller */
1852
1853 list_for_each_entry_rcu(fa, fah, fa_list) {
1854 if (i < s_i) {
1855 i++;
1856 continue;
1857 }
1858 BUG_ON(!fa->fa_info);
1859
1860 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1861 cb->nlh->nlmsg_seq,
1862 RTM_NEWROUTE,
1863 tb->tb_id,
1864 fa->fa_type,
1865 fa->fa_scope,
1866 xkey,
1867 plen,
1868 fa->fa_tos,
1869 fa->fa_info, 0) < 0) {
1870 cb->args[4] = i;
1871 return -1;
1872 }
1873 i++;
1874 }
1875 cb->args[4] = i;
1876 return skb->len;
1877 }
1878
1879 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1880 struct netlink_callback *cb)
1881 {
1882 int h, s_h;
1883 struct list_head *fa_head;
1884 struct leaf *l = NULL;
1885
1886 s_h = cb->args[3];
1887
1888 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1889 if (h < s_h)
1890 continue;
1891 if (h > s_h)
1892 memset(&cb->args[4], 0,
1893 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1894
1895 fa_head = get_fa_head(l, plen);
1896
1897 if (!fa_head)
1898 continue;
1899
1900 if (list_empty(fa_head))
1901 continue;
1902
1903 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1904 cb->args[3] = h;
1905 return -1;
1906 }
1907 }
1908 cb->args[3] = h;
1909 return skb->len;
1910 }
1911
1912 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1913 {
1914 int m, s_m;
1915 struct trie *t = (struct trie *) tb->tb_data;
1916
1917 s_m = cb->args[2];
1918
1919 rcu_read_lock();
1920 for (m = 0; m <= 32; m++) {
1921 if (m < s_m)
1922 continue;
1923 if (m > s_m)
1924 memset(&cb->args[3], 0,
1925 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1926
1927 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1928 cb->args[2] = m;
1929 goto out;
1930 }
1931 }
1932 rcu_read_unlock();
1933 cb->args[2] = m;
1934 return skb->len;
1935 out:
1936 rcu_read_unlock();
1937 return -1;
1938 }
1939
/*
 * Create the slab caches used by the trie: one for fib aliases and a
 * shared one for leaves and leaf_info (sized for the larger of the
 * two).  SLAB_PANIC makes a boot-time allocation failure fatal.
 */
void __init fib_hash_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	/* leaves and leaf_info share one cache; use the larger size */
	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
1950
1951
1952 /* Fix more generic FIB names for init later */
1953 struct fib_table *fib_hash_table(u32 id)
1954 {
1955 struct fib_table *tb;
1956 struct trie *t;
1957
1958 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1959 GFP_KERNEL);
1960 if (tb == NULL)
1961 return NULL;
1962
1963 tb->tb_id = id;
1964 tb->tb_default = -1;
1965 tb->tb_lookup = fn_trie_lookup;
1966 tb->tb_insert = fn_trie_insert;
1967 tb->tb_delete = fn_trie_delete;
1968 tb->tb_flush = fn_trie_flush;
1969 tb->tb_select_default = fn_trie_select_default;
1970 tb->tb_dump = fn_trie_dump;
1971
1972 t = (struct trie *) tb->tb_data;
1973 memset(t, 0, sizeof(*t));
1974
1975 if (id == RT_TABLE_LOCAL)
1976 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1977
1978 return tb;
1979 }
1980
1981 #ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;		/* seq_file per-net private data */
	struct trie *trie_local, *trie_main;	/* the two tables being walked */
	struct tnode *tnode;			/* tnode holding the next child to visit */
	struct trie *trie;			/* trie the walk is currently inside */
	unsigned index;				/* next child slot in tnode */
	unsigned depth;				/* current depth below the root */
};
1991
/*
 * Advance the depth-first walk one node and return it, or NULL when
 * the trie is exhausted.  iter->tnode/iter->index always name the next
 * child slot to visit; iter->depth tracks the current level.
 */
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct node *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* resume after this slot next time */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu((struct node *)tn);
	if (p) {
		/* continue with the slot after ours in the parent */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2036
2037 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2038 struct trie *t)
2039 {
2040 struct node *n ;
2041
2042 if (!t)
2043 return NULL;
2044
2045 n = rcu_dereference(t->trie);
2046
2047 if (!iter)
2048 return NULL;
2049
2050 if (n) {
2051 if (IS_TNODE(n)) {
2052 iter->tnode = (struct tnode *) n;
2053 iter->trie = t;
2054 iter->index = 0;
2055 iter->depth = 1;
2056 } else {
2057 iter->tnode = NULL;
2058 iter->trie = t;
2059 iter->index = 0;
2060 iter->depth = 0;
2061 }
2062 return n;
2063 }
2064 return NULL;
2065 }
2066
2067 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068 {
2069 struct node *n;
2070 struct fib_trie_iter iter;
2071
2072 memset(s, 0, sizeof(*s));
2073
2074 rcu_read_lock();
2075 for (n = fib_trie_get_first(&iter, t); n;
2076 n = fib_trie_get_next(&iter)) {
2077 if (IS_LEAF(n)) {
2078 s->leaves++;
2079 s->totdepth += iter.depth;
2080 if (iter.depth > s->maxdepth)
2081 s->maxdepth = iter.depth;
2082 } else {
2083 const struct tnode *tn = (const struct tnode *) n;
2084 int i;
2085
2086 s->tnodes++;
2087 if (tn->bits < MAX_STAT_DEPTH)
2088 s->nodesizes[tn->bits]++;
2089
2090 for (i = 0; i < (1<<tn->bits); i++)
2091 if (!tn->child[i])
2092 s->nullpointers++;
2093 }
2094 }
2095 rcu_read_unlock();
2096 }
2097
2098 /*
2099 * This outputs /proc/net/fib_triestats
2100 */
2101 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2102 {
2103 unsigned i, max, pointers, bytes, avdepth;
2104
2105 if (stat->leaves)
2106 avdepth = stat->totdepth*100 / stat->leaves;
2107 else
2108 avdepth = 0;
2109
2110 seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 );
2111 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2112
2113 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2114
2115 bytes = sizeof(struct leaf) * stat->leaves;
2116 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2117 bytes += sizeof(struct tnode) * stat->tnodes;
2118
2119 max = MAX_STAT_DEPTH;
2120 while (max > 0 && stat->nodesizes[max-1] == 0)
2121 max--;
2122
2123 pointers = 0;
2124 for (i = 1; i <= max; i++)
2125 if (stat->nodesizes[i] != 0) {
2126 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2127 pointers += (1<<i) * stat->nodesizes[i];
2128 }
2129 seq_putc(seq, '\n');
2130 seq_printf(seq, "\tPointers: %u\n", pointers);
2131
2132 bytes += sizeof(struct node *) * pointers;
2133 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2134 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2135 }
2136
2137 #ifdef CONFIG_IP_FIB_TRIE_STATS
2138 static void trie_show_usage(struct seq_file *seq,
2139 const struct trie_use_stats *stats)
2140 {
2141 seq_printf(seq, "\nCounters:\n---------\n");
2142 seq_printf(seq,"gets = %u\n", stats->gets);
2143 seq_printf(seq,"backtracks = %u\n", stats->backtrack);
2144 seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed);
2145 seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss);
2146 seq_printf(seq,"null node hit= %u\n", stats->null_node_hit);
2147 seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped);
2148 }
2149 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2150
2151 static void fib_trie_show(struct seq_file *seq, const char *name, struct trie *trie)
2152 {
2153 struct trie_stat stat;
2154
2155 seq_printf(seq, "%s: %d\n", name, trie->size);
2156 trie_collect_stats(trie, &stat);
2157 trie_show_stats(seq, &stat);
2158 #ifdef CONFIG_IP_FIB_TRIE_STATS
2159 trie_show_usage(seq, &trie->stats);
2160 #endif
2161 }
2162
2163 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2164 {
2165 struct net *net = (struct net *)seq->private;
2166 struct fib_table *tb;
2167
2168 seq_printf(seq,
2169 "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2170 sizeof(struct leaf), sizeof(struct tnode));
2171
2172 tb = fib_get_table(net, RT_TABLE_LOCAL);
2173 if (tb)
2174 fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
2175
2176 tb = fib_get_table(net, RT_TABLE_MAIN);
2177 if (tb)
2178 fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
2179
2180 return 0;
2181 }
2182
2183 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2184 {
2185 int err;
2186 struct net *net;
2187
2188 net = get_proc_net(inode);
2189 if (net == NULL)
2190 return -ENXIO;
2191 err = single_open(file, fib_triestat_seq_show, net);
2192 if (err < 0) {
2193 put_net(net);
2194 return err;
2195 }
2196 return 0;
2197 }
2198
2199 static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2200 {
2201 struct seq_file *seq = f->private_data;
2202 put_net(seq->private);
2203 return single_release(ino, f);
2204 }
2205
/* File operations for /proc/net/fib_triestats */
static const struct file_operations fib_triestat_fops = {
	.owner = THIS_MODULE,
	.open = fib_triestat_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = fib_triestat_seq_release,
};
2213
2214 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2215 loff_t pos)
2216 {
2217 loff_t idx = 0;
2218 struct node *n;
2219
2220 for (n = fib_trie_get_first(iter, iter->trie_local);
2221 n; ++idx, n = fib_trie_get_next(iter)) {
2222 if (pos == idx)
2223 return n;
2224 }
2225
2226 for (n = fib_trie_get_first(iter, iter->trie_main);
2227 n; ++idx, n = fib_trie_get_next(iter)) {
2228 if (pos == idx)
2229 return n;
2230 }
2231 return NULL;
2232 }
2233
2234 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2235 __acquires(RCU)
2236 {
2237 struct fib_trie_iter *iter = seq->private;
2238 struct fib_table *tb;
2239
2240 if (!iter->trie_local) {
2241 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2242 if (tb)
2243 iter->trie_local = (struct trie *) tb->tb_data;
2244 }
2245 if (!iter->trie_main) {
2246 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2247 if (tb)
2248 iter->trie_main = (struct trie *) tb->tb_data;
2249 }
2250 rcu_read_lock();
2251 if (*pos == 0)
2252 return SEQ_START_TOKEN;
2253 return fib_trie_get_idx(iter, *pos - 1);
2254 }
2255
2256 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257 {
2258 struct fib_trie_iter *iter = seq->private;
2259 void *l = v;
2260
2261 ++*pos;
2262 if (v == SEQ_START_TOKEN)
2263 return fib_trie_get_idx(iter, 0);
2264
2265 v = fib_trie_get_next(iter);
2266 BUG_ON(v == l);
2267 if (v)
2268 return v;
2269
2270 /* continue scan in next trie */
2271 if (iter->trie == iter->trie_local)
2272 return fib_trie_get_first(iter, iter->trie_main);
2273
2274 return NULL;
2275 }
2276
/* seq_file stop: release the RCU read lock taken in fib_trie_seq_start(). */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2282
/* Emit three spaces of indentation per level. */
static void seq_indent(struct seq_file *seq, int n)
{
	int i;

	for (i = 0; i < n; i++)
		seq_puts(seq, "   ");
}
2287
2288 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2289 {
2290 switch (s) {
2291 case RT_SCOPE_UNIVERSE: return "universe";
2292 case RT_SCOPE_SITE: return "site";
2293 case RT_SCOPE_LINK: return "link";
2294 case RT_SCOPE_HOST: return "host";
2295 case RT_SCOPE_NOWHERE: return "nowhere";
2296 default:
2297 snprintf(buf, len, "scope=%d", s);
2298 return buf;
2299 }
2300 }
2301
/* Printable names for RTN_* route types, indexed by type value. */
static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2316
2317 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2318 {
2319 if (t < __RTN_MAX && rtn_type_names[t])
2320 return rtn_type_names[t];
2321 snprintf(buf, len, "type %u", t);
2322 return buf;
2323 }
2324
2325 /* Pretty print the trie */
2326 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2327 {
2328 const struct fib_trie_iter *iter = seq->private;
2329 struct node *n = v;
2330
2331 if (v == SEQ_START_TOKEN)
2332 return 0;
2333
2334 if (!node_parent_rcu(n)) {
2335 if (iter->trie == iter->trie_local)
2336 seq_puts(seq, "<local>:\n");
2337 else
2338 seq_puts(seq, "<main>:\n");
2339 }
2340
2341 if (IS_TNODE(n)) {
2342 struct tnode *tn = (struct tnode *) n;
2343 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2344
2345 seq_indent(seq, iter->depth-1);
2346 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2347 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2348 tn->empty_children);
2349
2350 } else {
2351 struct leaf *l = (struct leaf *) n;
2352 int i;
2353 __be32 val = htonl(l->key);
2354
2355 seq_indent(seq, iter->depth);
2356 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2357 for (i = 32; i >= 0; i--) {
2358 struct leaf_info *li = find_leaf_info(l, i);
2359
2360 if (li) {
2361 struct fib_alias *fa;
2362
2363 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2364 char buf1[32], buf2[32];
2365
2366 seq_indent(seq, iter->depth+1);
2367 seq_printf(seq, " /%d %s %s", i,
2368 rtn_scope(buf1, sizeof(buf1),
2369 fa->fa_scope),
2370 rtn_type(buf2, sizeof(buf2),
2371 fa->fa_type));
2372 if (fa->fa_tos)
2373 seq_printf(seq, "tos =%d\n",
2374 fa->fa_tos);
2375 seq_putc(seq, '\n');
2376 }
2377 }
2378 }
2379 }
2380
2381 return 0;
2382 }
2383
/* seq_file iterator for /proc/net/fib_trie. */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2390
/* open() handler for /proc/net/fib_trie: attach the seq iterator with a
 * per-open, per-netns fib_trie_iter as its private state. */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2396
/* file_operations for /proc/net/fib_trie. */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2404
2405 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2406 {
2407 static unsigned type2flags[RTN_MAX + 1] = {
2408 [7] = RTF_REJECT, [8] = RTF_REJECT,
2409 };
2410 unsigned flags = type2flags[type];
2411
2412 if (fi && fi->fib_nh->nh_gw)
2413 flags |= RTF_GATEWAY;
2414 if (mask == htonl(0xFFFFFFFF))
2415 flags |= RTF_HOST;
2416 flags |= RTF_UP;
2417 return flags;
2418 }
2419
2420 /*
2421 * This outputs /proc/net/route.
2422 * The format of the file is not supposed to be changed
2423 * and needs to be same as fib_hash output to avoid breaking
2424 * legacy utilities
2425 */
2426 static int fib_route_seq_show(struct seq_file *seq, void *v)
2427 {
2428 const struct fib_trie_iter *iter = seq->private;
2429 struct leaf *l = v;
2430 int i;
2431 char bf[128];
2432
2433 if (v == SEQ_START_TOKEN) {
2434 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2435 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2436 "\tWindow\tIRTT");
2437 return 0;
2438 }
2439
2440 if (iter->trie == iter->trie_local)
2441 return 0;
2442 if (IS_TNODE(l))
2443 return 0;
2444
2445 for (i=32; i>=0; i--) {
2446 struct leaf_info *li = find_leaf_info(l, i);
2447 struct fib_alias *fa;
2448 __be32 mask, prefix;
2449
2450 if (!li)
2451 continue;
2452
2453 mask = inet_make_mask(li->plen);
2454 prefix = htonl(l->key);
2455
2456 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2457 const struct fib_info *fi = fa->fa_info;
2458 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2459
2460 if (fa->fa_type == RTN_BROADCAST
2461 || fa->fa_type == RTN_MULTICAST)
2462 continue;
2463
2464 if (fi)
2465 snprintf(bf, sizeof(bf),
2466 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2467 fi->fib_dev ? fi->fib_dev->name : "*",
2468 prefix,
2469 fi->fib_nh->nh_gw, flags, 0, 0,
2470 fi->fib_priority,
2471 mask,
2472 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2473 fi->fib_window,
2474 fi->fib_rtt >> 3);
2475 else
2476 snprintf(bf, sizeof(bf),
2477 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2478 prefix, 0, flags, 0, 0, 0,
2479 mask, 0, 0, 0);
2480
2481 seq_printf(seq, "%-127s\n", bf);
2482 }
2483 }
2484
2485 return 0;
2486 }
2487
/* seq_file iterator for /proc/net/route (shares start/next/stop with
 * the fib_trie iterator; only the show callback differs). */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_route_seq_show,
};
2494
/* open() handler for /proc/net/route: attach the seq iterator with a
 * per-open, per-netns fib_trie_iter as its private state. */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2500
/* file_operations for /proc/net/route. */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2508
2509 int __net_init fib_proc_init(struct net *net)
2510 {
2511 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2512 goto out1;
2513
2514 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2515 &fib_triestat_fops))
2516 goto out2;
2517
2518 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2519 goto out3;
2520
2521 return 0;
2522
2523 out3:
2524 proc_net_remove(net, "fib_triestat");
2525 out2:
2526 proc_net_remove(net, "fib_trie");
2527 out1:
2528 return -ENOMEM;
2529 }
2530
2531 void __net_exit fib_proc_exit(struct net *net)
2532 {
2533 proc_net_remove(net, "fib_trie");
2534 proc_net_remove(net, "fib_triestat");
2535 proc_net_remove(net, "route");
2536 }
2537
2538 #endif /* CONFIG_PROC_FS */
This page took 0.091251 seconds and 6 git commands to generate.