[IPV4] fib_trie: cleanup
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
51 */
52
53 #define VERSION "0.408"
54
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83
84 #undef CONFIG_IP_FIB_TRIE_STATS
85 #define MAX_STAT_DEPTH 32
86
/* Number of bits in a key (t_key is 32 bits for IPv4). */
#define KEYLENGTH	(8*sizeof(t_key))
/*
 * Mask key 'k' down to its 'l' most significant (prefix) bits.
 * Every macro argument is parenthesized so that expression arguments
 * expand without operator-precedence surprises.
 */
#define MASK_PFX(k, l)	(((l) == 0) ? 0 : ((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))
/* Build a mask of 'bits' one-bits starting at bit position 'offset' (MSB = bit 0). */
#define TKEY_GET_MASK(offset, bits)	(((bits) == 0) ? 0 : ((t_key)(-1) << (KEYLENGTH - (bits))) >> (offset))
90
91 typedef unsigned int t_key;
92
/* The low bit of ->parent tags a node as internal (tnode) or leaf. */
#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
#define NODE_TYPE(node)	((node)->parent & NODE_TYPE_MASK)

/* Argument parenthesized so expression arguments expand safely. */
#define IS_TNODE(n)	(!((n)->parent & T_LEAF))
#define IS_LEAF(n)	((n)->parent & T_LEAF)
100
/*
 * Common header shared by struct leaf and struct tnode: the key and the
 * tagged parent pointer (low bit is the node type, see NODE_TYPE()).
 */
struct node {
	t_key key;
	unsigned long parent;	/* tagged parent pointer */
};
105
/* External node: carries the per-prefix-length leaf_info chain. */
struct leaf {
	t_key key;
	unsigned long parent;	/* tagged parent pointer, low bit is T_LEAF */
	struct hlist_head list;	/* leaf_info entries for this key */
	struct rcu_head rcu;	/* deferred free via call_rcu_bh() */
};
112
/* One entry per prefix length stored at a leaf; falh is its fib_alias list. */
struct leaf_info {
	struct hlist_node hlist;	/* linkage into leaf->list */
	struct rcu_head rcu;
	int plen;			/* prefix length this entry represents */
	struct list_head falh;		/* fib_alias entries for this prefix */
};
119
/*
 * Internal node: an array of 2^bits children, indexed by the key bits
 * starting at position 'pos' (counting from the MSB).
 */
struct tnode {
	t_key key;
	unsigned long parent;		/* tagged parent pointer, low bit is T_TNODE (0) */
	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short full_children;	/* KEYLENGTH bits needed */
	unsigned short empty_children;	/* KEYLENGTH bits needed */
	struct rcu_head rcu;
	struct node *child[0];		/* variable-size child array */
};
130
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Lookup/resize counters, maintained only when stats are compiled in. */
struct trie_use_stats {
	unsigned int gets;			/* total lookups */
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;	/* resize aborted (allocation failure) */
};
#endif
141
/* Aggregate shape statistics gathered by walking a trie. */
struct trie_stat {
	unsigned int totdepth;		/* sum of leaf depths */
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;	/* empty child slots */
	unsigned int nodesizes[MAX_STAT_DEPTH];	/* presumably a histogram of tnode sizes -- TODO confirm against the walker */
};
150
/* A routing trie: RCU-protected root pointer plus bookkeeping. */
struct trie {
	struct node *trie;	/* root node, read under RCU */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
	int size;		/* number of leaves */
	unsigned int revision;	/* bumped on every successful insert */
};
159
160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 static void tnode_free(struct tnode *tn);
166
167 static struct kmem_cache *fn_alias_kmem __read_mostly;
168 static struct trie *trie_local = NULL, *trie_main = NULL;
169
/*
 * Strip the type tag from ->parent and return the parent tnode
 * (NULL for the root).  Caller must be in an RCU read-side critical
 * section or hold the update-side lock.
 */
static inline struct tnode *node_parent(struct node *node)
{
	struct tnode *ret;

	ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
	return rcu_dereference(ret);
}
177
/* Publish a new parent pointer while preserving the node's type tag. */
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
	rcu_assign_pointer(node->parent,
			   (unsigned long)ptr | NODE_TYPE(node));
}
183
/* rcu_read_lock needs to be held by the caller on the read side. */

static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
	/* index must lie inside the 2^bits child array */
	BUG_ON(i >= 1 << tn->bits);

	return rcu_dereference(tn->child[i]);
}
192
193 static inline int tnode_child_length(const struct tnode *tn)
194 {
195 return 1 << tn->bits;
196 }
197
198 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
199 {
200 if (offset < KEYLENGTH)
201 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
202 else
203 return 0;
204 }
205
206 static inline int tkey_equals(t_key a, t_key b)
207 {
208 return a == b;
209 }
210
211 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
212 {
213 if (bits == 0 || offset >= KEYLENGTH)
214 return 1;
215 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
216 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
217 }
218
219 static inline int tkey_mismatch(t_key a, int offset, t_key b)
220 {
221 t_key diff = a ^ b;
222 int i = offset;
223
224 if (!diff)
225 return 0;
226 while ((diff << i) >> (KEYLENGTH-1) == 0)
227 i++;
228 return i;
229 }
230
231 /*
232 To understand this stuff, an understanding of keys and all their bits is
233 necessary. Every node in the trie has a key associated with it, but not
234 all of the bits in that key are significant.
235
236 Consider a node 'n' and its parent 'tp'.
237
238 If n is a leaf, every bit in its key is significant. Its presence is
239 necessitated by path compression, since during a tree traversal (when
240 searching for a leaf - unless we are doing an insertion) we will completely
241 ignore all skipped bits we encounter. Thus we need to verify, at the end of
242 a potentially successful search, that we have indeed been walking the
243 correct key path.
244
245 Note that we can never "miss" the correct key in the tree if present by
246 following the wrong path. Path compression ensures that segments of the key
247 that are the same for all keys with a given prefix are skipped, but the
248 skipped part *is* identical for each node in the subtrie below the skipped
249 bit! trie_insert() in this implementation takes care of that - note the
250 call to tkey_sub_equals() in trie_insert().
251
252 if n is an internal node - a 'tnode' here, the various parts of its key
253 have many different meanings.
254
255 Example:
256 _________________________________________________________________
257 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
258 -----------------------------------------------------------------
259 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
260
261 _________________________________________________________________
262 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
263 -----------------------------------------------------------------
264 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
265
266 tp->pos = 7
267 tp->bits = 3
268 n->pos = 15
269 n->bits = 4
270
271 First, let's just ignore the bits that come before the parent tp, that is
272 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
273 not use them for anything.
274
275 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
276 index into the parent's child array. That is, they will be used to find
277 'n' among tp's children.
278
279 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
280 for the node n.
281
282 All the bits we have seen so far are significant to the node n. The rest
283 of the bits are really not needed or indeed known in n->key.
284
285 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
286 n's child array, and will of course be different for each child.
287
288
289 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
290 at this point.
291
292 */
293
294 static inline void check_tnode(const struct tnode *tn)
295 {
296 WARN_ON(tn && tn->pos+tn->bits > 32);
297 }
298
/*
 * Resize tuning, in percent: a node is inflated when the doubled node
 * would be at least inflate_threshold full, and halved when occupancy
 * drops below halve_threshold.  The root gets more permissive values
 * so it stays large.
 */
static int halve_threshold = 25;
static int inflate_threshold = 50;
static int halve_threshold_root = 8;
static int inflate_threshold_root = 15;
303
304
/* RCU callback: return a fib_alias to its slab cache. */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}
310
/* Free a fib_alias after the current RCU grace period. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
315
/* RCU callback: release a leaf allocated with kmalloc(). */
static void __leaf_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf, rcu));
}
320
/* RCU callback: release a leaf_info allocated with kmalloc(). */
static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}
325
/* Free a leaf_info after the current RCU grace period. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
330
/*
 * Allocate 'size' zeroed bytes for a tnode.  Small nodes come from the
 * slab allocator; larger ones fall back to whole pages.
 * NOTE(review): kcalloc(size, 1, ...) works but kzalloc(size, GFP_KERNEL)
 * would state the intent more directly -- confirm it is in scope here.
 */
static struct tnode *tnode_alloc(unsigned int size)
{
	struct page *pages;

	if (size <= PAGE_SIZE)
		return kcalloc(size, 1, GFP_KERNEL);

	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
	if (!pages)
		return NULL;

	return page_address(pages);
}
344
/*
 * RCU callback: release a tnode through whichever allocator produced it
 * (slab for small nodes, the page allocator otherwise; must mirror the
 * size cutoff in tnode_alloc()).
 */
static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	unsigned int size = sizeof(struct tnode) +
			    (1 << tn->bits) * sizeof(struct node *);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else
		free_pages((unsigned long)tn, get_order(size));
}
356
/*
 * Defer freeing of a node until readers are done.  Despite the name it
 * also accepts leaves, dispatching on the type tag.
 * NOTE(review): leaves are freed via call_rcu_bh() while tnodes use
 * call_rcu() -- confirm the mixed grace-period domains are intentional.
 */
static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn)) {
		struct leaf *l = (struct leaf *) tn;
		call_rcu_bh(&l->rcu, __leaf_free_rcu);
	} else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}
365
366 static struct leaf *leaf_new(void)
367 {
368 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
369 if (l) {
370 l->parent = T_LEAF;
371 INIT_HLIST_HEAD(&l->list);
372 }
373 return l;
374 }
375
376 static struct leaf_info *leaf_info_new(int plen)
377 {
378 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
379 if (li) {
380 li->plen = plen;
381 INIT_LIST_HEAD(&li->falh);
382 }
383 return li;
384 }
385
386 static struct tnode* tnode_new(t_key key, int pos, int bits)
387 {
388 int nchildren = 1<<bits;
389 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
390 struct tnode *tn = tnode_alloc(sz);
391
392 if (tn) {
393 memset(tn, 0, sz);
394 tn->parent = T_TNODE;
395 tn->pos = pos;
396 tn->bits = bits;
397 tn->key = key;
398 tn->full_children = 0;
399 tn->empty_children = 1<<bits;
400 }
401
402 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
403 (unsigned int) (sizeof(struct node) * 1<<bits));
404 return tn;
405 }
406
407 /*
408 * Check whether a tnode 'n' is "full", i.e. it is an internal node
409 * and no bits are skipped. See discussion in dyntree paper p. 6
410 */
411
412 static inline int tnode_full(const struct tnode *tn, const struct node *n)
413 {
414 if (n == NULL || IS_LEAF(n))
415 return 0;
416
417 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
418 }
419
/* Store child 'n' in slot 'i' of 'tn', recomputing the fullness counters. */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
424
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * 'wasfull' caches tnode_full() of the old child; pass -1 to have it
 * recomputed here.
 */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);


	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	/* publish last, after the child itself is fully initialized */
	rcu_assign_pointer(tn->child[i], n);
}
459
/*
 * Shrink or grow tnode 'tn' so its fill factor stays inside the
 * configured thresholds.  Returns the new subtree root: NULL when the
 * node was empty, the lone child when only one remained, or the
 * (possibly repeatedly inflated/halved) tnode otherwise.  Runs on the
 * update side only.
 */
static struct node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	int err = 0;	/* NOTE(review): never read -- looks vestigial */
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_resize;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */
			node_set_parent(n, NULL);
			tnode_free(tn);
			return n;
		}
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger */

	if (!tn->parent)
		inflate_threshold_use = inflate_threshold_root;
	else
		inflate_threshold_use = inflate_threshold;

	err = 0;
	/* max_resize bounds the number of doubling/halving passes so a
	 * pathological threshold setting cannot loop forever */
	max_resize = 10;
	while ((tn->full_children > 0 && max_resize-- &&
		50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
		inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);
		if (IS_ERR(tn)) {
			/* allocation failed: keep the old node untouched */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	if (max_resize < 0) {
		if (!tn->parent)
			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
			       inflate_threshold_root, tn->bits);
		else
			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
			       inflate_threshold, tn->bits);
	}

	check_tnode(tn);

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */


	/* Keep root node larger */

	if (!tn->parent)
		halve_threshold_use = halve_threshold_root;
	else
		halve_threshold_use = halve_threshold;

	err = 0;
	max_resize = 10;
	while (tn->bits > 1 && max_resize-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			/* allocation failed: keep the old node untouched */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	if (max_resize < 0) {
		if (!tn->parent)
			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
			       halve_threshold_root, tn->bits);
		else
			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
			       halve_threshold, tn->bits);
	}

	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */

			node_set_parent(n, NULL);
			tnode_free(tn);
			return n;
		}

	return (struct node *) tn;
}
652
/*
 * Replace 'tn' with a tnode covering one more key bit (twice as many
 * children), redistributing the old children.  Returns the new node, or
 * ERR_PTR(-ENOMEM) leaving the original tree untouched on failure.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *inode;
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		/* NOTE(review): this 'inode' shadows the function-scope one */
		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);

		/* full child with several children: pre-split into two halves */
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			t_key m = TKEY_GET_MASK(inode->pos, 1);

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		    tn->pos + tn->bits - 1) {
			/* the extra bit decides which of the two new slots it lands in */
			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			/* fold the 2-way child directly into the new node */
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
		 */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);	/* preallocated in the first pass above */

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);	/* preallocated in the first pass above */

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free(inode);
	}
	tnode_free(oldtnode);
	return tn;
nomem:
	{
		/* undo the preallocation pass before reporting failure */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
793
/*
 * Replace 'tn' with a tnode covering one fewer key bit (half as many
 * children), merging sibling pairs into new 1-bit tnodes where needed.
 * Returns the new node, or ERR_PTR(-ENOMEM) leaving the original tree
 * untouched on failure.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct node *)newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)	/* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children: fill the 1-bit node preallocated above */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free(oldtnode);
	return tn;
nomem:
	{
		/* undo the preallocation pass before reporting failure */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
875
/* Reset a trie to the empty state (NULL root, zeroed counters/stats). */
static void trie_init(struct trie *t)
{
	if (!t)
		return;

	t->size = 0;
	rcu_assign_pointer(t->trie, NULL);
	t->revision = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	memset(&t->stats, 0, sizeof(struct trie_use_stats));
#endif
}
888
/* readside must use rcu_read_lock currently dump routines
   via get_fa_head and dump */

/* Find the leaf_info entry for prefix length 'plen' on leaf 'l', or NULL. */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct hlist_node *node;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, node, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}
904
905 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
906 {
907 struct leaf_info *li = find_leaf_info(l, plen);
908
909 if (!li)
910 return NULL;
911
912 return &li->falh;
913 }
914
/*
 * Insert 'new' into a leaf's leaf_info list, keeping the list sorted by
 * decreasing prefix length (longest prefix first).
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		/* find the first entry with a shorter prefix than 'new' */
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			/* 'new' has the longest prefix: goes in front of 'li' */
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
935
/* rcu_read_lock needs to be held by the caller on the read side. */

/*
 * Walk the trie looking for an exact-key leaf.  Skipped bits are
 * re-verified with tkey_sub_equals() on the way down and by the final
 * full-key compare.  Returns the leaf, or NULL when no exact match.
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
	n = rcu_dereference(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
966
/*
 * Resize 'tn' and every ancestor up to the root after an insertion,
 * reattaching each resized subtree to its parent.  Returns the (possibly
 * new) root node.  Update side only.
 */
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key = tn->key;
	struct tnode *tp;

	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize (t, (struct tnode *)tn);
		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);

		/* resize() may have replaced the node; refetch the parent */
		tp = node_parent((struct node *) tn);
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode*) resize(t, (struct tnode *)tn);

	return (struct node*) tn;
}
991
/* only used from updater-side */

/*
 * Find or create the leaf for 'key' and attach a leaf_info for 'plen',
 * rebalancing the trie afterwards.  Returns the fib_alias list head for
 * the (key, plen) pair, or NULL with *err set to -ENOMEM on failure.
 */
static struct list_head *
fib_insert_node(struct trie *t, int *err, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = t->trie;

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		/* NOTE(review): this 'l' shadows the function-scope one */
		struct leaf *l = (struct leaf *) n;

		li = leaf_info_new(plen);

		if (!li) {
			*err = -ENOMEM;
			goto err;
		}

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	t->size++;
	l = leaf_new();

	if (!l) {
		*err = -ENOMEM;
		goto err;
	}

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		tnode_free((struct tnode *) l);
		*err = -ENOMEM;
		goto err;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* branch at the first bit where the keys diverge */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			tnode_free((struct tnode *) l);
			*err = -ENOMEM;
			goto err;
		}

		node_set_parent((struct node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
		       tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
	t->revision++;
err:
	return fa_head;
}
1147
1148 /*
1149 * Caller must hold RTNL.
1150 */
1151 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1152 {
1153 struct trie *t = (struct trie *) tb->tb_data;
1154 struct fib_alias *fa, *new_fa;
1155 struct list_head *fa_head = NULL;
1156 struct fib_info *fi;
1157 int plen = cfg->fc_dst_len;
1158 u8 tos = cfg->fc_tos;
1159 u32 key, mask;
1160 int err;
1161 struct leaf *l;
1162
1163 if (plen > 32)
1164 return -EINVAL;
1165
1166 key = ntohl(cfg->fc_dst);
1167
1168 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1169
1170 mask = ntohl(inet_make_mask(plen));
1171
1172 if (key & ~mask)
1173 return -EINVAL;
1174
1175 key = key & mask;
1176
1177 fi = fib_create_info(cfg);
1178 if (IS_ERR(fi)) {
1179 err = PTR_ERR(fi);
1180 goto err;
1181 }
1182
1183 l = fib_find_node(t, key);
1184 fa = NULL;
1185
1186 if (l) {
1187 fa_head = get_fa_head(l, plen);
1188 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1189 }
1190
1191 /* Now fa, if non-NULL, points to the first fib alias
1192 * with the same keys [prefix,tos,priority], if such key already
1193 * exists or to the node before which we will insert new one.
1194 *
1195 * If fa is NULL, we will need to allocate a new one and
1196 * insert to the head of f.
1197 *
1198 * If f is NULL, no fib node matched the destination key
1199 * and we need to allocate a new one of those as well.
1200 */
1201
1202 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1203 struct fib_alias *fa_orig;
1204
1205 err = -EEXIST;
1206 if (cfg->fc_nlflags & NLM_F_EXCL)
1207 goto out;
1208
1209 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1210 struct fib_info *fi_drop;
1211 u8 state;
1212
1213 err = -ENOBUFS;
1214 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1215 if (new_fa == NULL)
1216 goto out;
1217
1218 fi_drop = fa->fa_info;
1219 new_fa->fa_tos = fa->fa_tos;
1220 new_fa->fa_info = fi;
1221 new_fa->fa_type = cfg->fc_type;
1222 new_fa->fa_scope = cfg->fc_scope;
1223 state = fa->fa_state;
1224 new_fa->fa_state &= ~FA_S_ACCESSED;
1225
1226 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1227 alias_free_mem_rcu(fa);
1228
1229 fib_release_info(fi_drop);
1230 if (state & FA_S_ACCESSED)
1231 rt_cache_flush(-1);
1232 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1233 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1234
1235 goto succeeded;
1236 }
1237 /* Error if we find a perfect match which
1238 * uses the same scope, type, and nexthop
1239 * information.
1240 */
1241 fa_orig = fa;
1242 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1243 if (fa->fa_tos != tos)
1244 break;
1245 if (fa->fa_info->fib_priority != fi->fib_priority)
1246 break;
1247 if (fa->fa_type == cfg->fc_type &&
1248 fa->fa_scope == cfg->fc_scope &&
1249 fa->fa_info == fi) {
1250 goto out;
1251 }
1252 }
1253 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1254 fa = fa_orig;
1255 }
1256 err = -ENOENT;
1257 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1258 goto out;
1259
1260 err = -ENOBUFS;
1261 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1262 if (new_fa == NULL)
1263 goto out;
1264
1265 new_fa->fa_info = fi;
1266 new_fa->fa_tos = tos;
1267 new_fa->fa_type = cfg->fc_type;
1268 new_fa->fa_scope = cfg->fc_scope;
1269 new_fa->fa_state = 0;
1270 /*
1271 * Insert new entry to the list.
1272 */
1273
1274 if (!fa_head) {
1275 err = 0;
1276 fa_head = fib_insert_node(t, &err, key, plen);
1277 if (err)
1278 goto out_free_new_fa;
1279 }
1280
1281 list_add_tail_rcu(&new_fa->fa_list,
1282 (fa ? &fa->fa_list : fa_head));
1283
1284 rt_cache_flush(-1);
1285 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1286 &cfg->fc_nlinfo, 0);
1287 succeeded:
1288 return 0;
1289
1290 out_free_new_fa:
1291 kmem_cache_free(fn_alias_kmem, new_fa);
1292 out:
1293 fib_release_info(fi);
1294 err:
1295 return err;
1296 }
1297
1298
1299 /* should be called with rcu_read_lock */
1300 static inline int check_leaf(struct trie *t, struct leaf *l,
1301 t_key key, int *plen, const struct flowi *flp,
1302 struct fib_result *res)
1303 {
1304 int err, i;
1305 __be32 mask;
1306 struct leaf_info *li;
1307 struct hlist_head *hhead = &l->list;
1308 struct hlist_node *node;
1309
1310 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1311 i = li->plen;
1312 mask = inet_make_mask(i);
1313 if (l->key != (key & ntohl(mask)))
1314 continue;
1315
1316 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1317 *plen = i;
1318 #ifdef CONFIG_IP_FIB_TRIE_STATS
1319 t->stats.semantic_match_passed++;
1320 #endif
1321 return err;
1322 }
1323 #ifdef CONFIG_IP_FIB_TRIE_STATS
1324 t->stats.semantic_match_miss++;
1325 #endif
1326 }
1327 return 1;
1328 }
1329
/*
 * Longest-prefix-match lookup of flp->fl4_dst in the trie.
 * Fills in *res and returns the (<= 0) fib_semantic_match() result on
 * success, or 1 when no route matches.  Takes rcu_read_lock itself.
 */
static int
fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int plen, ret = 0;
	struct node *n;
	struct tnode *pn;
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
			goto found;
		goto failed;
	}
	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* Recompute the child index only when we are not resuming
		 * after a backtrack (backtracking adjusts cindex itself). */
		if (!chopped_off)
			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
				goto found;
			else
				goto backtrace;
		}

#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length) != 0 ||
			    !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/* Note: We aren't very concerned about the piece of the key
		 * that precede pn->pos+pn->bits, since these have already been
		 * checked. The bits after cn->pos aren't checked since these are
		 * by definition "unknown" at this point. Thus, what we want to
		 * see is if we are about to enter the "prefix matching" state,
		 * and in that case verify that the skipped bits that will prevail
		 * throughout this subtree are zero, as they have to be if we are
		 * to find a matching prefix.
		 */

		node_prefix = MASK_PFX(cn->key, cn->pos);
		key_prefix = MASK_PFX(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/* In short: If skipped bits in this node do not match the search
		 * key, enter the "prefix matching" state.directly.
		 */
		if (pref_mismatch) {
			/* Shift left until the first mismatching bit reaches
			 * the MSB; mp then counts the matching prefix bits. */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch <<1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}
#endif
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits - chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent((struct node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1522
1523 /* only called from updater side */
/* Remove the leaf keyed by @key and rebalance the trie.
 * Returns 1 if a matching leaf was found and removed, 0 otherwise. */
static int trie_leaf_remove(struct trie *t, t_key key)
{
	t_key cindex;
	struct tnode *tp = NULL;
	struct node *n = t->trie;
	struct leaf *l;

	pr_debug("entering trie_leaf_remove(%p)\n", n);

	/* Note that in the case skipped bits, those bits are *not* checked!
	 * When we finish this, we will have NULL or a T_LEAF, and the
	 * T_LEAF may or may not match our key.
	 */

	while (n != NULL && IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		check_tnode(tn);
		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));

		/* tnode_get_child() must return a direct child of tn */
		BUG_ON(n && node_parent(n) != tn);
	}
	l = (struct leaf *) n;

	/* Skipped bits were never compared during the descent, so the
	 * full key must be verified here. */
	if (!n || !tkey_equals(l->key, key))
		return 0;

	/*
	 * Key found.
	 * Remove the leaf and rebalance the tree
	 */

	t->revision++;
	t->size--;

	tp = node_parent(n);
	tnode_free((struct tnode *) n);

	if (tp) {
		/* Clear the parent's child slot, then rebalance upward */
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, NULL);
		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
	} else
		/* The removed leaf was the trie root */
		rcu_assign_pointer(t->trie, NULL);

	return 1;
}
1570
1571 /*
1572 * Caller must hold RTNL.
1573 */
1574 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1575 {
1576 struct trie *t = (struct trie *) tb->tb_data;
1577 u32 key, mask;
1578 int plen = cfg->fc_dst_len;
1579 u8 tos = cfg->fc_tos;
1580 struct fib_alias *fa, *fa_to_delete;
1581 struct list_head *fa_head;
1582 struct leaf *l;
1583 struct leaf_info *li;
1584
1585 if (plen > 32)
1586 return -EINVAL;
1587
1588 key = ntohl(cfg->fc_dst);
1589 mask = ntohl(inet_make_mask(plen));
1590
1591 if (key & ~mask)
1592 return -EINVAL;
1593
1594 key = key & mask;
1595 l = fib_find_node(t, key);
1596
1597 if (!l)
1598 return -ESRCH;
1599
1600 fa_head = get_fa_head(l, plen);
1601 fa = fib_find_alias(fa_head, tos, 0);
1602
1603 if (!fa)
1604 return -ESRCH;
1605
1606 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1607
1608 fa_to_delete = NULL;
1609 fa_head = fa->fa_list.prev;
1610
1611 list_for_each_entry(fa, fa_head, fa_list) {
1612 struct fib_info *fi = fa->fa_info;
1613
1614 if (fa->fa_tos != tos)
1615 break;
1616
1617 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1618 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1619 fa->fa_scope == cfg->fc_scope) &&
1620 (!cfg->fc_protocol ||
1621 fi->fib_protocol == cfg->fc_protocol) &&
1622 fib_nh_match(cfg, fi) == 0) {
1623 fa_to_delete = fa;
1624 break;
1625 }
1626 }
1627
1628 if (!fa_to_delete)
1629 return -ESRCH;
1630
1631 fa = fa_to_delete;
1632 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1633 &cfg->fc_nlinfo, 0);
1634
1635 l = fib_find_node(t, key);
1636 li = find_leaf_info(l, plen);
1637
1638 list_del_rcu(&fa->fa_list);
1639
1640 if (list_empty(fa_head)) {
1641 hlist_del_rcu(&li->hlist);
1642 free_leaf_info(li);
1643 }
1644
1645 if (hlist_empty(&l->list))
1646 trie_leaf_remove(t, key);
1647
1648 if (fa->fa_state & FA_S_ACCESSED)
1649 rt_cache_flush(-1);
1650
1651 fib_release_info(fa->fa_info);
1652 alias_free_mem_rcu(fa);
1653 return 0;
1654 }
1655
1656 static int trie_flush_list(struct trie *t, struct list_head *head)
1657 {
1658 struct fib_alias *fa, *fa_node;
1659 int found = 0;
1660
1661 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1662 struct fib_info *fi = fa->fa_info;
1663
1664 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1665 list_del_rcu(&fa->fa_list);
1666 fib_release_info(fa->fa_info);
1667 alias_free_mem_rcu(fa);
1668 found++;
1669 }
1670 }
1671 return found;
1672 }
1673
1674 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1675 {
1676 int found = 0;
1677 struct hlist_head *lih = &l->list;
1678 struct hlist_node *node, *tmp;
1679 struct leaf_info *li = NULL;
1680
1681 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1682 found += trie_flush_list(t, &li->falh);
1683
1684 if (list_empty(&li->falh)) {
1685 hlist_del_rcu(&li->hlist);
1686 free_leaf_info(li);
1687 }
1688 }
1689 return found;
1690 }
1691
1692 /* rcu_read_lock needs to be hold by caller from readside */
1693
1694 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1695 {
1696 struct node *c = (struct node *) thisleaf;
1697 struct tnode *p;
1698 int idx;
1699 struct node *trie = rcu_dereference(t->trie);
1700
1701 if (c == NULL) {
1702 if (trie == NULL)
1703 return NULL;
1704
1705 if (IS_LEAF(trie)) /* trie w. just a leaf */
1706 return (struct leaf *) trie;
1707
1708 p = (struct tnode*) trie; /* Start */
1709 } else
1710 p = node_parent(c);
1711
1712 while (p) {
1713 int pos, last;
1714
1715 /* Find the next child of the parent */
1716 if (c)
1717 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1718 else
1719 pos = 0;
1720
1721 last = 1 << p->bits;
1722 for (idx = pos; idx < last ; idx++) {
1723 c = rcu_dereference(p->child[idx]);
1724
1725 if (!c)
1726 continue;
1727
1728 /* Decend if tnode */
1729 while (IS_TNODE(c)) {
1730 p = (struct tnode *) c;
1731 idx = 0;
1732
1733 /* Rightmost non-NULL branch */
1734 if (p && IS_TNODE(p))
1735 while (!(c = rcu_dereference(p->child[idx]))
1736 && idx < (1<<p->bits)) idx++;
1737
1738 /* Done with this tnode? */
1739 if (idx >= (1 << p->bits) || !c)
1740 goto up;
1741 }
1742 return (struct leaf *) c;
1743 }
1744 up:
1745 /* No more children go up one step */
1746 c = (struct node *) p;
1747 p = node_parent(c);
1748 }
1749 return NULL; /* Ready. Root of trie */
1750 }
1751
1752 /*
1753 * Caller must hold RTNL.
1754 */
1755 static int fn_trie_flush(struct fib_table *tb)
1756 {
1757 struct trie *t = (struct trie *) tb->tb_data;
1758 struct leaf *ll = NULL, *l = NULL;
1759 int found = 0, h;
1760
1761 t->revision++;
1762
1763 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1764 found += trie_flush_leaf(t, l);
1765
1766 if (ll && hlist_empty(&ll->list))
1767 trie_leaf_remove(t, ll->key);
1768 ll = l;
1769 }
1770
1771 if (ll && hlist_empty(&ll->list))
1772 trie_leaf_remove(t, ll->key);
1773
1774 pr_debug("trie_flush found=%d\n", found);
1775 return found;
1776 }
1777
1778 static int trie_last_dflt = -1;
1779
/*
 * Pick a usable default route (key 0/0) for *res, preferring a live
 * nexthop and falling back to the last resort found by
 * fib_detect_death().  Updates trie_last_dflt with the chosen index.
 */
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	/* The default routes live at key 0 with prefix length 0 */
	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* NOTE(review): res->fi is dereferenced without a NULL
		 * check here — presumably the caller guarantees a current
		 * result; verify against callers. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, &trie_last_dflt)) {
			/* fi is alive: make it the result, taking a ref */
			if (res->fi)
				fib_info_put(res->fi);
			res->fi = fi;
			atomic_inc(&fi->fib_clntref);
			trie_last_dflt = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		trie_last_dflt = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = fi;
		atomic_inc(&fi->fib_clntref);
		trie_last_dflt = order;
		goto out;
	}
	/* Everything looked dead: fall back to the last resort, if any */
	if (last_idx >= 0) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = last_resort;
		if (last_resort)
			atomic_inc(&last_resort->fib_clntref);
	}
	trie_last_dflt = last_idx;
out:;
	rcu_read_unlock();
}
1861
1862 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1863 struct sk_buff *skb, struct netlink_callback *cb)
1864 {
1865 int i, s_i;
1866 struct fib_alias *fa;
1867
1868 __be32 xkey = htonl(key);
1869
1870 s_i = cb->args[4];
1871 i = 0;
1872
1873 /* rcu_read_lock is hold by caller */
1874
1875 list_for_each_entry_rcu(fa, fah, fa_list) {
1876 if (i < s_i) {
1877 i++;
1878 continue;
1879 }
1880 BUG_ON(!fa->fa_info);
1881
1882 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1883 cb->nlh->nlmsg_seq,
1884 RTM_NEWROUTE,
1885 tb->tb_id,
1886 fa->fa_type,
1887 fa->fa_scope,
1888 xkey,
1889 plen,
1890 fa->fa_tos,
1891 fa->fa_info, 0) < 0) {
1892 cb->args[4] = i;
1893 return -1;
1894 }
1895 i++;
1896 }
1897 cb->args[4] = i;
1898 return skb->len;
1899 }
1900
1901 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1902 struct netlink_callback *cb)
1903 {
1904 int h, s_h;
1905 struct list_head *fa_head;
1906 struct leaf *l = NULL;
1907
1908 s_h = cb->args[3];
1909
1910 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1911 if (h < s_h)
1912 continue;
1913 if (h > s_h)
1914 memset(&cb->args[4], 0,
1915 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1916
1917 fa_head = get_fa_head(l, plen);
1918
1919 if (!fa_head)
1920 continue;
1921
1922 if (list_empty(fa_head))
1923 continue;
1924
1925 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1926 cb->args[3] = h;
1927 return -1;
1928 }
1929 }
1930 cb->args[3] = h;
1931 return skb->len;
1932 }
1933
1934 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1935 {
1936 int m, s_m;
1937 struct trie *t = (struct trie *) tb->tb_data;
1938
1939 s_m = cb->args[2];
1940
1941 rcu_read_lock();
1942 for (m = 0; m <= 32; m++) {
1943 if (m < s_m)
1944 continue;
1945 if (m > s_m)
1946 memset(&cb->args[3], 0,
1947 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1948
1949 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1950 cb->args[2] = m;
1951 goto out;
1952 }
1953 }
1954 rcu_read_unlock();
1955 cb->args[2] = m;
1956 return skb->len;
1957 out:
1958 rcu_read_unlock();
1959 return -1;
1960 }
1961
1962 /* Fix more generic FIB names for init later */
1963
1964 #ifdef CONFIG_IP_MULTIPLE_TABLES
1965 struct fib_table * fib_hash_init(u32 id)
1966 #else
1967 struct fib_table * __init fib_hash_init(u32 id)
1968 #endif
1969 {
1970 struct fib_table *tb;
1971 struct trie *t;
1972
1973 if (fn_alias_kmem == NULL)
1974 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1975 sizeof(struct fib_alias),
1976 0, SLAB_HWCACHE_ALIGN,
1977 NULL);
1978
1979 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1980 GFP_KERNEL);
1981 if (tb == NULL)
1982 return NULL;
1983
1984 tb->tb_id = id;
1985 tb->tb_lookup = fn_trie_lookup;
1986 tb->tb_insert = fn_trie_insert;
1987 tb->tb_delete = fn_trie_delete;
1988 tb->tb_flush = fn_trie_flush;
1989 tb->tb_select_default = fn_trie_select_default;
1990 tb->tb_dump = fn_trie_dump;
1991 memset(tb->tb_data, 0, sizeof(struct trie));
1992
1993 t = (struct trie *) tb->tb_data;
1994
1995 trie_init(t);
1996
1997 if (id == RT_TABLE_LOCAL)
1998 trie_local = t;
1999 else if (id == RT_TABLE_MAIN)
2000 trie_main = t;
2001
2002 if (id == RT_TABLE_LOCAL)
2003 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
2004
2005 return tb;
2006 }
2007
2008 #ifdef CONFIG_PROC_FS
2009 /* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct tnode *tnode;	/* tnode whose children are being scanned */
	struct trie *trie;	/* trie currently being walked */
	unsigned index;		/* next child slot to visit in tnode */
	unsigned depth;		/* depth of the current node below the root */
};
2016
/* Advance @iter and return the next node in depth-first preorder,
 * or NULL when the walk is finished. */
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct node *n = tnode_get_child(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Stay at this tnode; resume at next slot */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent((struct node *)tn);
	if (p) {
		/* Resume in the parent just past this tnode's own slot */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2061
2062 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2063 struct trie *t)
2064 {
2065 struct node *n ;
2066
2067 if (!t)
2068 return NULL;
2069
2070 n = rcu_dereference(t->trie);
2071
2072 if (!iter)
2073 return NULL;
2074
2075 if (n) {
2076 if (IS_TNODE(n)) {
2077 iter->tnode = (struct tnode *) n;
2078 iter->trie = t;
2079 iter->index = 0;
2080 iter->depth = 1;
2081 } else {
2082 iter->tnode = NULL;
2083 iter->trie = t;
2084 iter->index = 0;
2085 iter->depth = 0;
2086 }
2087 return n;
2088 }
2089 return NULL;
2090 }
2091
2092 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2093 {
2094 struct node *n;
2095 struct fib_trie_iter iter;
2096
2097 memset(s, 0, sizeof(*s));
2098
2099 rcu_read_lock();
2100 for (n = fib_trie_get_first(&iter, t); n;
2101 n = fib_trie_get_next(&iter)) {
2102 if (IS_LEAF(n)) {
2103 s->leaves++;
2104 s->totdepth += iter.depth;
2105 if (iter.depth > s->maxdepth)
2106 s->maxdepth = iter.depth;
2107 } else {
2108 const struct tnode *tn = (const struct tnode *) n;
2109 int i;
2110
2111 s->tnodes++;
2112 if (tn->bits < MAX_STAT_DEPTH)
2113 s->nodesizes[tn->bits]++;
2114
2115 for (i = 0; i < (1<<tn->bits); i++)
2116 if (!tn->child[i])
2117 s->nullpointers++;
2118 }
2119 }
2120 rcu_read_unlock();
2121 }
2122
2123 /*
2124 * This outputs /proc/net/fib_triestats
2125 */
2126 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2127 {
2128 unsigned i, max, pointers, bytes, avdepth;
2129
2130 if (stat->leaves)
2131 avdepth = stat->totdepth*100 / stat->leaves;
2132 else
2133 avdepth = 0;
2134
2135 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2136 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2137
2138 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2139
2140 bytes = sizeof(struct leaf) * stat->leaves;
2141 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2142 bytes += sizeof(struct tnode) * stat->tnodes;
2143
2144 max = MAX_STAT_DEPTH;
2145 while (max > 0 && stat->nodesizes[max-1] == 0)
2146 max--;
2147
2148 pointers = 0;
2149 for (i = 1; i <= max; i++)
2150 if (stat->nodesizes[i] != 0) {
2151 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2152 pointers += (1<<i) * stat->nodesizes[i];
2153 }
2154 seq_putc(seq, '\n');
2155 seq_printf(seq, "\tPointers: %d\n", pointers);
2156
2157 bytes += sizeof(struct node *) * pointers;
2158 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2159 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2160
2161 #ifdef CONFIG_IP_FIB_TRIE_STATS
2162 seq_printf(seq, "Counters:\n---------\n");
2163 seq_printf(seq,"gets = %d\n", t->stats.gets);
2164 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2165 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2166 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2167 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2168 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2169 #ifdef CLEAR_STATS
2170 memset(&(t->stats), 0, sizeof(t->stats));
2171 #endif
2172 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2173 }
2174
2175 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2176 {
2177 struct trie_stat *stat;
2178
2179 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2180 if (!stat)
2181 return -ENOMEM;
2182
2183 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2184 sizeof(struct leaf), sizeof(struct tnode));
2185
2186 if (trie_local) {
2187 seq_printf(seq, "Local:\n");
2188 trie_collect_stats(trie_local, stat);
2189 trie_show_stats(seq, stat);
2190 }
2191
2192 if (trie_main) {
2193 seq_printf(seq, "Main:\n");
2194 trie_collect_stats(trie_main, stat);
2195 trie_show_stats(seq, stat);
2196 }
2197 kfree(stat);
2198
2199 return 0;
2200 }
2201
/* /proc/net/fib_triestat is a one-shot dump, hence single_open(). */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, fib_triestat_seq_show, NULL);
}
2206
/* File operations for /proc/net/fib_triestat */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release,
};
2214
2215 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2216 loff_t pos)
2217 {
2218 loff_t idx = 0;
2219 struct node *n;
2220
2221 for (n = fib_trie_get_first(iter, trie_local);
2222 n; ++idx, n = fib_trie_get_next(iter)) {
2223 if (pos == idx)
2224 return n;
2225 }
2226
2227 for (n = fib_trie_get_first(iter, trie_main);
2228 n; ++idx, n = fib_trie_get_next(iter)) {
2229 if (pos == idx)
2230 return n;
2231 }
2232 return NULL;
2233 }
2234
2235 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2236 {
2237 rcu_read_lock();
2238 if (*pos == 0)
2239 return SEQ_START_TOKEN;
2240 return fib_trie_get_idx(seq->private, *pos - 1);
2241 }
2242
2243 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2244 {
2245 struct fib_trie_iter *iter = seq->private;
2246 void *l = v;
2247
2248 ++*pos;
2249 if (v == SEQ_START_TOKEN)
2250 return fib_trie_get_idx(iter, 0);
2251
2252 v = fib_trie_get_next(iter);
2253 BUG_ON(v == l);
2254 if (v)
2255 return v;
2256
2257 /* continue scan in next trie */
2258 if (iter->trie == trie_local)
2259 return fib_trie_get_first(iter, trie_main);
2260
2261 return NULL;
2262 }
2263
/* seq_file stop: drop the RCU read lock taken in fib_trie_seq_start(). */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}
2268
/* Emit @n indentation steps for the trie pretty-printer. */
static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0) seq_puts(seq, " ");
}
2273
/* Human-readable name for a route scope.
 * NB: the fallback formats into a static buffer, so the returned
 * pointer is not reentrant/thread-safe in that case. */
static inline const char *rtn_scope(enum rt_scope_t s)
{
	static char buf[32];

	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE: return "site";
	case RT_SCOPE_LINK: return "link";
	case RT_SCOPE_HOST: return "host";
	case RT_SCOPE_NOWHERE: return "nowhere";
	default:
		snprintf(buf, sizeof(buf), "scope=%d", s);
		return buf;
	}
}
2289
/* Printable names for the RTN_* route types, indexed by type value */
static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2304
2305 static inline const char *rtn_type(unsigned t)
2306 {
2307 static char buf[32];
2308
2309 if (t < __RTN_MAX && rtn_type_names[t])
2310 return rtn_type_names[t];
2311 snprintf(buf, sizeof(buf), "type %d", t);
2312 return buf;
2313 }
2314
2315 /* Pretty print the trie */
2316 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2317 {
2318 const struct fib_trie_iter *iter = seq->private;
2319 struct node *n = v;
2320
2321 if (v == SEQ_START_TOKEN)
2322 return 0;
2323
2324 if (!node_parent(n)) {
2325 if (iter->trie == trie_local)
2326 seq_puts(seq, "<local>:\n");
2327 else
2328 seq_puts(seq, "<main>:\n");
2329 }
2330
2331 if (IS_TNODE(n)) {
2332 struct tnode *tn = (struct tnode *) n;
2333 __be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2334
2335 seq_indent(seq, iter->depth-1);
2336 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2337 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2338 tn->empty_children);
2339
2340 } else {
2341 struct leaf *l = (struct leaf *) n;
2342 int i;
2343 __be32 val = htonl(l->key);
2344
2345 seq_indent(seq, iter->depth);
2346 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2347 for (i = 32; i >= 0; i--) {
2348 struct leaf_info *li = find_leaf_info(l, i);
2349 if (li) {
2350 struct fib_alias *fa;
2351 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2352 seq_indent(seq, iter->depth+1);
2353 seq_printf(seq, " /%d %s %s", i,
2354 rtn_scope(fa->fa_scope),
2355 rtn_type(fa->fa_type));
2356 if (fa->fa_tos)
2357 seq_printf(seq, "tos =%d\n",
2358 fa->fa_tos);
2359 seq_putc(seq, '\n');
2360 }
2361 }
2362 }
2363 }
2364
2365 return 0;
2366 }
2367
/* seq_file iterator operations for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2374
2375 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2376 {
2377 struct seq_file *seq;
2378 int rc = -ENOMEM;
2379 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2380
2381 if (!s)
2382 goto out;
2383
2384 rc = seq_open(file, &fib_trie_seq_ops);
2385 if (rc)
2386 goto out_kfree;
2387
2388 seq = file->private_data;
2389 seq->private = s;
2390 memset(s, 0, sizeof(*s));
2391 out:
2392 return rc;
2393 out_kfree:
2394 kfree(s);
2395 goto out;
2396 }
2397
/* File operations for /proc/net/fib_trie; iterator state is freed by
 * seq_release_private() on close. */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
2405
2406 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2407 {
2408 static unsigned type2flags[RTN_MAX + 1] = {
2409 [7] = RTF_REJECT, [8] = RTF_REJECT,
2410 };
2411 unsigned flags = type2flags[type];
2412
2413 if (fi && fi->fib_nh->nh_gw)
2414 flags |= RTF_GATEWAY;
2415 if (mask == htonl(0xFFFFFFFF))
2416 flags |= RTF_HOST;
2417 flags |= RTF_UP;
2418 return flags;
2419 }
2420
2421 /*
2422 * This outputs /proc/net/route.
2423 * The format of the file is not supposed to be changed
2424 * and needs to be same as fib_hash output to avoid breaking
2425 * legacy utilities
2426 */
2427 static int fib_route_seq_show(struct seq_file *seq, void *v)
2428 {
2429 const struct fib_trie_iter *iter = seq->private;
2430 struct leaf *l = v;
2431 int i;
2432 char bf[128];
2433
2434 if (v == SEQ_START_TOKEN) {
2435 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2436 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2437 "\tWindow\tIRTT");
2438 return 0;
2439 }
2440
2441 if (iter->trie == trie_local)
2442 return 0;
2443 if (IS_TNODE(l))
2444 return 0;
2445
2446 for (i=32; i>=0; i--) {
2447 struct leaf_info *li = find_leaf_info(l, i);
2448 struct fib_alias *fa;
2449 __be32 mask, prefix;
2450
2451 if (!li)
2452 continue;
2453
2454 mask = inet_make_mask(li->plen);
2455 prefix = htonl(l->key);
2456
2457 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2458 const struct fib_info *fi = fa->fa_info;
2459 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2460
2461 if (fa->fa_type == RTN_BROADCAST
2462 || fa->fa_type == RTN_MULTICAST)
2463 continue;
2464
2465 if (fi)
2466 snprintf(bf, sizeof(bf),
2467 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2468 fi->fib_dev ? fi->fib_dev->name : "*",
2469 prefix,
2470 fi->fib_nh->nh_gw, flags, 0, 0,
2471 fi->fib_priority,
2472 mask,
2473 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2474 fi->fib_window,
2475 fi->fib_rtt >> 3);
2476 else
2477 snprintf(bf, sizeof(bf),
2478 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2479 prefix, 0, flags, 0, 0, 0,
2480 mask, 0, 0, 0);
2481
2482 seq_printf(seq, "%-127s\n", bf);
2483 }
2484 }
2485
2486 return 0;
2487 }
2488
/* seq_file iterator for /proc/net/route (legacy routing table dump);
 * reuses the trie walk but shows only main-table leaves. */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_route_seq_show,
};
2495
2496 static int fib_route_seq_open(struct inode *inode, struct file *file)
2497 {
2498 struct seq_file *seq;
2499 int rc = -ENOMEM;
2500 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2501
2502 if (!s)
2503 goto out;
2504
2505 rc = seq_open(file, &fib_route_seq_ops);
2506 if (rc)
2507 goto out_kfree;
2508
2509 seq = file->private_data;
2510 seq->private = s;
2511 memset(s, 0, sizeof(*s));
2512 out:
2513 return rc;
2514 out_kfree:
2515 kfree(s);
2516 goto out;
2517 }
2518
/* File operations for /proc/net/route; iterator state is freed by
 * seq_release_private() on close. */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
2526
2527 int __init fib_proc_init(void)
2528 {
2529 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2530 goto out1;
2531
2532 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2533 goto out2;
2534
2535 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2536 goto out3;
2537
2538 return 0;
2539
2540 out3:
2541 proc_net_remove("fib_triestat");
2542 out2:
2543 proc_net_remove("fib_trie");
2544 out1:
2545 return -ENOMEM;
2546 }
2547
/* Remove the /proc/net entries created by fib_proc_init().
 * NOTE(review): marked __init (not __exit) — presumably only called
 * from init-time error unwinding; confirm against the caller. */
void __init fib_proc_exit(void)
{
	proc_net_remove("fib_trie");
	proc_net_remove("fib_triestat");
	proc_net_remove("route");
}
2554
2555 #endif /* CONFIG_PROC_FS */
This page took 0.083986 seconds and 5 git commands to generate.