[IPV4] fib_trie: removes a memset() call in tnode_new()
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15  * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
51 */
52
53 #define VERSION "0.408"
54
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84
/* Maximum trie depth tracked by the statistics code. */
#define MAX_STAT_DEPTH 32

/* Number of bits in a key (32 for an IPv4 address). */
#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

/*
 * The low bit of the 'parent' word encodes the node type; the
 * remaining bits hold the parent tnode pointer (see node_parent()).
 */
#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)

/* Common header shared by leaf and tnode; both can be cast to this. */
struct node {
	t_key key;
	unsigned long parent;
};

/* External node: holds all routes whose prefix maps to 'key'. */
struct leaf {
	t_key key;
	unsigned long parent;
	struct hlist_head list;		/* of leaf_info, one per prefix len */
	struct rcu_head rcu;
};

/* Per-prefix-length bucket hanging off a leaf. */
struct leaf_info {
	struct hlist_node hlist;
	struct rcu_head rcu;
	int plen;			/* prefix length this entry covers */
	struct list_head falh;		/* fib_alias list for this prefix */
};

/* Internal node with a 2^bits child array; see layout essay above. */
struct tnode {
	t_key key;
	unsigned long parent;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned short full_children;	/* KEYLENGTH bits needed */
	unsigned short empty_children;	/* KEYLENGTH bits needed */
	struct rcu_head rcu;
	struct node *child[0];		/* variable-size trailing array */
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Lookup-path event counters, exposed via /proc. */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

/* Snapshot of trie shape, gathered by walking the whole structure. */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

/* One FIB table instance. */
struct trie {
	struct node *trie;		/* root node (RCU-protected) */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
	int size;			/* number of leaves */
};
156
157 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
159 static struct node *resize(struct trie *t, struct tnode *tn);
160 static struct tnode *inflate(struct trie *t, struct tnode *tn);
161 static struct tnode *halve(struct trie *t, struct tnode *tn);
162 static void tnode_free(struct tnode *tn);
163
164 static struct kmem_cache *fn_alias_kmem __read_mostly;
165
166 static inline struct tnode *node_parent(struct node *node)
167 {
168 struct tnode *ret;
169
170 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171 return rcu_dereference(ret);
172 }
173
174 static inline void node_set_parent(struct node *node, struct tnode *ptr)
175 {
176 rcu_assign_pointer(node->parent,
177 (unsigned long)ptr | NODE_TYPE(node));
178 }
179
180 /* rcu_read_lock needs to be held by the caller on the read side */
181
/* Fetch child slot @i of @tn; caller must hold rcu_read_lock(). */
static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
	/* index must lie within the 2^bits child array */
	BUG_ON(i >= 1 << tn->bits);

	return rcu_dereference(tn->child[i]);
}
188
/* Number of child slots in @tn: always a power of two (2^bits). */
static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}
193
194 static inline t_key mask_pfx(t_key k, unsigned short l)
195 {
196 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
197 }
198
199 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
200 {
201 if (offset < KEYLENGTH)
202 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
203 else
204 return 0;
205 }
206
207 static inline int tkey_equals(t_key a, t_key b)
208 {
209 return a == b;
210 }
211
/*
 * Compare the @bits-bit sub-keys of @a and @b starting @offset bits
 * from the MSB.  A zero-length or out-of-range window matches trivially.
 */
static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	/* clamp so the right shift below stays in range */
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	/* shift the window to the top, then drop everything below it */
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
219
220 static inline int tkey_mismatch(t_key a, int offset, t_key b)
221 {
222 t_key diff = a ^ b;
223 int i = offset;
224
225 if (!diff)
226 return 0;
227 while ((diff << i) >> (KEYLENGTH-1) == 0)
228 i++;
229 return i;
230 }
231
232 /*
233 To understand this stuff, an understanding of keys and all their bits is
234 necessary. Every node in the trie has a key associated with it, but not
235 all of the bits in that key are significant.
236
237 Consider a node 'n' and its parent 'tp'.
238
239 If n is a leaf, every bit in its key is significant. Its presence is
240 necessitated by path compression, since during a tree traversal (when
241 searching for a leaf - unless we are doing an insertion) we will completely
242 ignore all skipped bits we encounter. Thus we need to verify, at the end of
243 a potentially successful search, that we have indeed been walking the
244 correct key path.
245
246 Note that we can never "miss" the correct key in the tree if present by
247 following the wrong path. Path compression ensures that segments of the key
248 that are the same for all keys with a given prefix are skipped, but the
249 skipped part *is* identical for each node in the subtrie below the skipped
250 bit! trie_insert() in this implementation takes care of that - note the
251 call to tkey_sub_equals() in trie_insert().
252
253 if n is an internal node - a 'tnode' here, the various parts of its key
254 have many different meanings.
255
256 Example:
257 _________________________________________________________________
258 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
259 -----------------------------------------------------------------
260 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
261
262 _________________________________________________________________
263 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
264 -----------------------------------------------------------------
265 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
266
267 tp->pos = 7
268 tp->bits = 3
269 n->pos = 15
270 n->bits = 4
271
272 First, let's just ignore the bits that come before the parent tp, that is
273 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
274 not use them for anything.
275
276 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
277 index into the parent's child array. That is, they will be used to find
278 'n' among tp's children.
279
280 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
281 for the node n.
282
283 All the bits we have seen so far are significant to the node n. The rest
284 of the bits are really not needed or indeed known in n->key.
285
286 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
287 n's child array, and will of course be different for each child.
288
289
290 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
291 at this point.
292
293 */
294
/* Sanity check: a tnode's index window must not run past the key. */
static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}
299
/* Fill-factor thresholds (percent) driving halve()/inflate() in resize().
 * The root node uses laxer values so it stays large. */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 8;
static const int inflate_threshold_root = 15;
304
305
306 static void __alias_free_mem(struct rcu_head *head)
307 {
308 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
309 kmem_cache_free(fn_alias_kmem, fa);
310 }
311
/* Free @fa after the current RCU grace period. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
316
/* RCU callback: free a leaf node. */
static void __leaf_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf, rcu));
}
321
/* RCU callback: free a leaf_info bucket. */
static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}
326
/* Free @leaf after the current RCU grace period. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
331
332 static struct tnode *tnode_alloc(unsigned int size)
333 {
334 struct page *pages;
335
336 if (size <= PAGE_SIZE)
337 return kcalloc(size, 1, GFP_KERNEL);
338
339 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
340 if (!pages)
341 return NULL;
342
343 return page_address(pages);
344 }
345
/*
 * RCU callback: free a tnode, choosing kfree() or free_pages() to
 * match the allocation strategy used by tnode_alloc().
 */
static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	/* recompute the allocation size from the node's own 'bits' */
	unsigned int size = sizeof(struct tnode) +
		(1 << tn->bits) * sizeof(struct node *);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else
		free_pages((unsigned long)tn, get_order(size));
}
357
/*
 * Defer-free a node after an RCU grace period.  Despite the name this
 * accepts leaves too: the type bit in 'parent' tells them apart.
 * NOTE(review): leaves go through call_rcu_bh() while tnodes use
 * call_rcu() - presumably because leaves are reachable from the
 * softirq lookup path; confirm before changing.
 */
static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn)) {
		struct leaf *l = (struct leaf *) tn;
		call_rcu_bh(&l->rcu, __leaf_free_rcu);
	} else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}
366
367 static struct leaf *leaf_new(void)
368 {
369 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
370 if (l) {
371 l->parent = T_LEAF;
372 INIT_HLIST_HEAD(&l->list);
373 }
374 return l;
375 }
376
377 static struct leaf_info *leaf_info_new(int plen)
378 {
379 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
380 if (li) {
381 li->plen = plen;
382 INIT_LIST_HEAD(&li->falh);
383 }
384 return li;
385 }
386
387 static struct tnode* tnode_new(t_key key, int pos, int bits)
388 {
389 int nchildren = 1<<bits;
390 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
391 struct tnode *tn = tnode_alloc(sz);
392
393 if (tn) {
394 tn->parent = T_TNODE;
395 tn->pos = pos;
396 tn->bits = bits;
397 tn->key = key;
398 tn->full_children = 0;
399 tn->empty_children = 1<<bits;
400 }
401
402 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
403 (unsigned int) (sizeof(struct node) * 1<<bits));
404 return tn;
405 }
406
407 /*
408 * Check whether a tnode 'n' is "full", i.e. it is an internal node
409 * and no bits are skipped. See discussion in dyntree paper p. 6
410 */
411
412 static inline int tnode_full(const struct tnode *tn, const struct node *n)
413 {
414 if (n == NULL || IS_LEAF(n))
415 return 0;
416
417 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
418 }
419
/* Install child @n in slot @i of @tn; -1 makes the helper recompute
 * whether the old child was "full". */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
424
425 /*
426 * Add a child at position i overwriting the old value.
427 * Update the value of full_children and empty_children.
428 */
429
/*
 * Store child @n in slot @i of @tn, keeping the empty_children and
 * full_children counters consistent.  @wasfull says whether the old
 * child was "full" (see tnode_full()); pass -1 to have it recomputed.
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);


	/* update emptyChildren: slot transitions NULL <-> non-NULL */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	/* publish last so readers never see a child before its parent link */
	rcu_assign_pointer(tn->child[i], n);
}
459
460 static struct node *resize(struct trie *t, struct tnode *tn)
461 {
462 int i;
463 int err = 0;
464 struct tnode *old_tn;
465 int inflate_threshold_use;
466 int halve_threshold_use;
467 int max_resize;
468
469 if (!tn)
470 return NULL;
471
472 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
473 tn, inflate_threshold, halve_threshold);
474
475 /* No children */
476 if (tn->empty_children == tnode_child_length(tn)) {
477 tnode_free(tn);
478 return NULL;
479 }
480 /* One child */
481 if (tn->empty_children == tnode_child_length(tn) - 1)
482 for (i = 0; i < tnode_child_length(tn); i++) {
483 struct node *n;
484
485 n = tn->child[i];
486 if (!n)
487 continue;
488
489 /* compress one level */
490 node_set_parent(n, NULL);
491 tnode_free(tn);
492 return n;
493 }
494 /*
495 * Double as long as the resulting node has a number of
496 * nonempty nodes that are above the threshold.
497 */
498
499 /*
500 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
501 * the Helsinki University of Technology and Matti Tikkanen of Nokia
502 * Telecommunications, page 6:
503 * "A node is doubled if the ratio of non-empty children to all
504 * children in the *doubled* node is at least 'high'."
505 *
506 * 'high' in this instance is the variable 'inflate_threshold'. It
507 * is expressed as a percentage, so we multiply it with
508 * tnode_child_length() and instead of multiplying by 2 (since the
509 * child array will be doubled by inflate()) and multiplying
510 * the left-hand side by 100 (to handle the percentage thing) we
511 * multiply the left-hand side by 50.
512 *
513 * The left-hand side may look a bit weird: tnode_child_length(tn)
514 * - tn->empty_children is of course the number of non-null children
515 * in the current node. tn->full_children is the number of "full"
516 * children, that is non-null tnodes with a skip value of 0.
517 * All of those will be doubled in the resulting inflated tnode, so
518 * we just count them one extra time here.
519 *
520 * A clearer way to write this would be:
521 *
522 * to_be_doubled = tn->full_children;
523 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
524 * tn->full_children;
525 *
526 * new_child_length = tnode_child_length(tn) * 2;
527 *
528 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
529 * new_child_length;
530 * if (new_fill_factor >= inflate_threshold)
531 *
532 * ...and so on, tho it would mess up the while () loop.
533 *
534 * anyway,
535 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
536 * inflate_threshold
537 *
538 * avoid a division:
539 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
540 * inflate_threshold * new_child_length
541 *
542 * expand not_to_be_doubled and to_be_doubled, and shorten:
543 * 100 * (tnode_child_length(tn) - tn->empty_children +
544 * tn->full_children) >= inflate_threshold * new_child_length
545 *
546 * expand new_child_length:
547 * 100 * (tnode_child_length(tn) - tn->empty_children +
548 * tn->full_children) >=
549 * inflate_threshold * tnode_child_length(tn) * 2
550 *
551 * shorten again:
552 * 50 * (tn->full_children + tnode_child_length(tn) -
553 * tn->empty_children) >= inflate_threshold *
554 * tnode_child_length(tn)
555 *
556 */
557
558 check_tnode(tn);
559
560 /* Keep root node larger */
561
562 if (!tn->parent)
563 inflate_threshold_use = inflate_threshold_root;
564 else
565 inflate_threshold_use = inflate_threshold;
566
567 err = 0;
568 max_resize = 10;
569 while ((tn->full_children > 0 && max_resize-- &&
570 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
571 inflate_threshold_use * tnode_child_length(tn))) {
572
573 old_tn = tn;
574 tn = inflate(t, tn);
575 if (IS_ERR(tn)) {
576 tn = old_tn;
577 #ifdef CONFIG_IP_FIB_TRIE_STATS
578 t->stats.resize_node_skipped++;
579 #endif
580 break;
581 }
582 }
583
584 if (max_resize < 0) {
585 if (!tn->parent)
586 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
587 inflate_threshold_root, tn->bits);
588 else
589 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
590 inflate_threshold, tn->bits);
591 }
592
593 check_tnode(tn);
594
595 /*
596 * Halve as long as the number of empty children in this
597 * node is above threshold.
598 */
599
600
601 /* Keep root node larger */
602
603 if (!tn->parent)
604 halve_threshold_use = halve_threshold_root;
605 else
606 halve_threshold_use = halve_threshold;
607
608 err = 0;
609 max_resize = 10;
610 while (tn->bits > 1 && max_resize-- &&
611 100 * (tnode_child_length(tn) - tn->empty_children) <
612 halve_threshold_use * tnode_child_length(tn)) {
613
614 old_tn = tn;
615 tn = halve(t, tn);
616 if (IS_ERR(tn)) {
617 tn = old_tn;
618 #ifdef CONFIG_IP_FIB_TRIE_STATS
619 t->stats.resize_node_skipped++;
620 #endif
621 break;
622 }
623 }
624
625 if (max_resize < 0) {
626 if (!tn->parent)
627 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
628 halve_threshold_root, tn->bits);
629 else
630 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
631 halve_threshold, tn->bits);
632 }
633
634 /* Only one child remains */
635 if (tn->empty_children == tnode_child_length(tn) - 1)
636 for (i = 0; i < tnode_child_length(tn); i++) {
637 struct node *n;
638
639 n = tn->child[i];
640 if (!n)
641 continue;
642
643 /* compress one level */
644
645 node_set_parent(n, NULL);
646 tnode_free(tn);
647 return n;
648 }
649
650 return (struct node *) tn;
651 }
652
/*
 * Replace @tn with a node that indexes one more bit (double the child
 * array).  Children of the old node are redistributed into the new one;
 * "full" children (tnodes with no skipped bits) are split in two.
 * Returns the new node, or ERR_PTR(-ENOMEM) leaving the trie unchanged.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * First pass: preallocate and store tnodes before the actual work
	 * so we don't get into an inconsistent state if memory allocation
	 * fails.  In case of failure we return the oldnode and the inflate
	 * of the tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);

		/* only "full" children with more than two slots need the
		 * left/right replacement nodes allocated up front */
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* single bit set at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	/* Second pass: move the children over */
	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits: goes whole
		 * into one of the two slots, chosen by the next key bit */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
					     1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children: promote them directly */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children.  The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key.  The nodes were
		 * allocated in the first pass above. */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		/* first half of inode's children go left, second half right */
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free(inode);
	}
	tnode_free(oldtnode);
	return tn;
nomem:
	{
		/* free the partially-populated replacement and bail out */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
793
/*
 * Replace @tn with a node that indexes one bit less (half the child
 * array).  Pairs of old slots collapse into one; where both slots of a
 * pair are occupied a new 1-bit binary tnode is interposed.
 * Returns the new node, or ERR_PTR(-ENOMEM) leaving the trie unchanged.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * First pass: preallocate and store tnodes before the actual work
	 * so we don't get into an inconsistent state if memory allocation
	 * fails.  In case of failure we return the oldnode and the halve
	 * of the tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children need a binary node to hold them */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct node *)newn);
		}

	}

	/* Second pass: move the children over */
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children: fill the preallocated binary node */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free(oldtnode);
	return tn;
nomem:
	{
		/* free the partially-populated replacement and bail out */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
875
876 /* The read side must hold rcu_read_lock(); currently the dump
877    routines use this via get_fa_head() and dump */
878
/* Find the leaf_info bucket for prefix length @plen on leaf @l,
 * or NULL if the leaf holds no routes of that length. */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct hlist_node *node;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, node, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}
891
892 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
893 {
894 struct leaf_info *li = find_leaf_info(l, plen);
895
896 if (!li)
897 return NULL;
898
899 return &li->falh;
900 }
901
/*
 * Insert @new into a leaf's info list, keeping it sorted by
 * descending prefix length so the longest prefixes are seen first.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		/* find the last entry with plen >= new->plen */
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			/* new has the largest plen: goes before the head */
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
922
923 /* rcu_read_lock needs to be held by the caller on the read side */
924
/*
 * Walk the trie looking for the leaf whose key is exactly @key.
 * Returns the leaf, or NULL if no exact match exists.  The final
 * tkey_equals() check is needed because skipped bits are ignored
 * during the descent (see the key layout essay above).
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
	n = rcu_dereference(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		/* descend only while the already-seen bits match */
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
953
/*
 * Re-run resize() on @tn and on every ancestor up to the root,
 * re-linking each resized node into its parent.  Returns the (possibly
 * new) root node that the caller must store into t->trie.
 */
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key = tn->key;
	struct tnode *tp;

	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		/* capture "fullness" before resize may change the node */
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize (t, (struct tnode *)tn);
		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);

		tp = node_parent((struct node *) tn);
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode*) resize(t, (struct tnode *)tn);

	return (struct node*) tn;
}
978
979 /* only used from updater-side */
980
/*
 * Insert key @key with prefix length @plen into the trie, creating the
 * leaf and any intermediate tnode needed, then rebalance.  Returns the
 * fib_alias list head of the (new or existing) leaf_info bucket, or
 * NULL on allocation failure.  Caller must hold RTNL (updater side).
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = t->trie;

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		/* key already present: just add a leaf_info for this plen */
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	t->size++;
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		tnode_free((struct tnode *) l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* branch at the first bit where the keys diverge */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			tnode_free((struct tnode *) l);
			return NULL;
		}

		node_set_parent((struct node *)tn, tp);

		/* the differing bit decides which side the new leaf goes */
		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
		       tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
	return fa_head;
}
1124
1125 /*
1126 * Caller must hold RTNL.
1127 */
1128 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1129 {
1130 struct trie *t = (struct trie *) tb->tb_data;
1131 struct fib_alias *fa, *new_fa;
1132 struct list_head *fa_head = NULL;
1133 struct fib_info *fi;
1134 int plen = cfg->fc_dst_len;
1135 u8 tos = cfg->fc_tos;
1136 u32 key, mask;
1137 int err;
1138 struct leaf *l;
1139
1140 if (plen > 32)
1141 return -EINVAL;
1142
1143 key = ntohl(cfg->fc_dst);
1144
1145 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1146
1147 mask = ntohl(inet_make_mask(plen));
1148
1149 if (key & ~mask)
1150 return -EINVAL;
1151
1152 key = key & mask;
1153
1154 fi = fib_create_info(cfg);
1155 if (IS_ERR(fi)) {
1156 err = PTR_ERR(fi);
1157 goto err;
1158 }
1159
1160 l = fib_find_node(t, key);
1161 fa = NULL;
1162
1163 if (l) {
1164 fa_head = get_fa_head(l, plen);
1165 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1166 }
1167
1168 /* Now fa, if non-NULL, points to the first fib alias
1169 * with the same keys [prefix,tos,priority], if such key already
1170 * exists or to the node before which we will insert new one.
1171 *
1172 * If fa is NULL, we will need to allocate a new one and
1173 * insert to the head of f.
1174 *
1175 * If f is NULL, no fib node matched the destination key
1176 * and we need to allocate a new one of those as well.
1177 */
1178
1179 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1180 struct fib_alias *fa_orig;
1181
1182 err = -EEXIST;
1183 if (cfg->fc_nlflags & NLM_F_EXCL)
1184 goto out;
1185
1186 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1187 struct fib_info *fi_drop;
1188 u8 state;
1189
1190 if (fi->fib_treeref > 1)
1191 goto out;
1192
1193 err = -ENOBUFS;
1194 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1195 if (new_fa == NULL)
1196 goto out;
1197
1198 fi_drop = fa->fa_info;
1199 new_fa->fa_tos = fa->fa_tos;
1200 new_fa->fa_info = fi;
1201 new_fa->fa_type = cfg->fc_type;
1202 new_fa->fa_scope = cfg->fc_scope;
1203 state = fa->fa_state;
1204 new_fa->fa_state &= ~FA_S_ACCESSED;
1205
1206 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1207 alias_free_mem_rcu(fa);
1208
1209 fib_release_info(fi_drop);
1210 if (state & FA_S_ACCESSED)
1211 rt_cache_flush(-1);
1212 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1213 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1214
1215 goto succeeded;
1216 }
1217 /* Error if we find a perfect match which
1218 * uses the same scope, type, and nexthop
1219 * information.
1220 */
1221 fa_orig = fa;
1222 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1223 if (fa->fa_tos != tos)
1224 break;
1225 if (fa->fa_info->fib_priority != fi->fib_priority)
1226 break;
1227 if (fa->fa_type == cfg->fc_type &&
1228 fa->fa_scope == cfg->fc_scope &&
1229 fa->fa_info == fi) {
1230 goto out;
1231 }
1232 }
1233 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1234 fa = fa_orig;
1235 }
1236 err = -ENOENT;
1237 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1238 goto out;
1239
1240 err = -ENOBUFS;
1241 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1242 if (new_fa == NULL)
1243 goto out;
1244
1245 new_fa->fa_info = fi;
1246 new_fa->fa_tos = tos;
1247 new_fa->fa_type = cfg->fc_type;
1248 new_fa->fa_scope = cfg->fc_scope;
1249 new_fa->fa_state = 0;
1250 /*
1251 * Insert new entry to the list.
1252 */
1253
1254 if (!fa_head) {
1255 fa_head = fib_insert_node(t, key, plen);
1256 if (unlikely(!fa_head)) {
1257 err = -ENOMEM;
1258 goto out_free_new_fa;
1259 }
1260 }
1261
1262 list_add_tail_rcu(&new_fa->fa_list,
1263 (fa ? &fa->fa_list : fa_head));
1264
1265 rt_cache_flush(-1);
1266 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1267 &cfg->fc_nlinfo, 0);
1268 succeeded:
1269 return 0;
1270
1271 out_free_new_fa:
1272 kmem_cache_free(fn_alias_kmem, new_fa);
1273 out:
1274 fib_release_info(fi);
1275 err:
1276 return err;
1277 }
1278
1279
1280 /* should be called with rcu_read_lock */
/*
 * Check every leaf_info (one per prefix length) hanging off leaf @l
 * against the flow @flp.  On a semantic match, store the matched
 * prefix length in *plen and return fib_semantic_match()'s result
 * (<= 0).  Returns 1 when nothing on this leaf matches.
 *
 * Should be called with rcu_read_lock held.
 */
static inline int check_leaf(struct trie *t, struct leaf *l,
			     t_key key, int *plen, const struct flowi *flp,
			     struct fib_result *res)
{
	int err, i;
	__be32 mask;
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		i = li->plen;
		mask = inet_make_mask(i);
		/* Skip prefix lengths that do not cover the lookup key. */
		if (l->key != (key & ntohl(mask)))
			continue;

		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
			*plen = i;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.semantic_match_passed++;
#endif
			return err;
		}
#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
#endif
	}
	/* No prefix on this leaf matched. */
	return 1;
}
1310
/*
 * Longest-prefix-match lookup on the trie.
 *
 * Walks down the trie indexed by the destination address; whenever a
 * subtree provably cannot contain a matching prefix, it backtracks by
 * chopping trailing bits off the current child index (cindex) and, once
 * all index bits are chopped, by walking up to the parent tnode.
 * current_prefix_length tracks how many leading key bits are still
 * required to match exactly.
 *
 * Runs under its own rcu_read_lock().  Returns check_leaf()'s result
 * (<= 0) on a hit, or 1 when no route matches.
 */
static int
fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int plen, ret = 0;
	struct node *n;
	struct tnode *pn;
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
			goto found;
		goto failed;
	}
	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* After a backtrace, cindex already holds the (chopped)
		 * index to retry; only recompute it on a fresh descent.
		 */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
				goto found;
			else
				goto backtrace;
		}

#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length) != 0 ||
			    !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/* Note: We aren't very concerned about the piece of the key
		 * that precede pn->pos+pn->bits, since these have already been
		 * checked. The bits after cn->pos aren't checked since these are
		 * by definition "unknown" at this point. Thus, what we want to
		 * see is if we are about to enter the "prefix matching" state,
		 * and in that case verify that the skipped bits that will prevail
		 * throughout this subtree are zero, as they have to be if we are
		 * to find a matching prefix.
		 */

		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/* In short: If skipped bits in this node do not match the search
		 * key, enter the "prefix matching" state directly.
		 */
		if (pref_mismatch) {
			/* Find the position of the first mismatching bit. */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch <<1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}
#endif
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits - chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent((struct node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1504
1505 /* only called from updater side */
/*
 * Find the leaf with key @key, unlink it from the trie and rebalance.
 * Returns 1 if a leaf was removed, 0 if no exact match was found.
 * Only called from updater side (RTNL held).
 */
static int trie_leaf_remove(struct trie *t, t_key key)
{
	t_key cindex;
	struct tnode *tp = NULL;
	struct node *n = t->trie;
	struct leaf *l;

	pr_debug("entering trie_leaf_remove(%p)\n", n);

	/* Note that in the case skipped bits, those bits are *not* checked!
	 * When we finish this, we will have NULL or a T_LEAF, and the
	 * T_LEAF may or may not match our key.
	 */

	while (n != NULL && IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		check_tnode(tn);
		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));

		BUG_ON(n && node_parent(n) != tn);
	}
	l = (struct leaf *) n;

	/* Skipped bits were not verified above; check the full key now. */
	if (!n || !tkey_equals(l->key, key))
		return 0;

	/*
	 * Key found.
	 * Remove the leaf and rebalance the tree
	 */

	t->size--;

	tp = node_parent(n);
	tnode_free((struct tnode *) n);

	if (tp) {
		/* Clear the parent's child slot, then rebalance upward. */
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, NULL);
		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
	} else
		rcu_assign_pointer(t->trie, NULL);

	return 1;
}
1551
1552 /*
1553 * Caller must hold RTNL.
1554 */
/*
 * Delete one route (described by cfg) from the trie-backed table.
 * Caller must hold RTNL.
 *
 * Returns 0 on success, -EINVAL for a malformed prefix, or -ESRCH
 * when no matching alias exists.
 */
static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* Host bits set beyond the prefix length are invalid. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* Scan forward from fa (its predecessor acts as the list head)
	 * for an alias matching the type/scope/protocol/nexthop filters.
	 */
	fa_head = fa->fa_list.prev;

	list_for_each_entry(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == cfg->fc_scope) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	/* Drop the per-plen leaf_info once its alias list is empty. */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	/* Drop the leaf itself once no prefix length remains on it. */
	if (hlist_empty(&l->list))
		trie_leaf_remove(t, key);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(-1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1636
/*
 * Remove from @head every alias whose fib_info is marked RTNH_F_DEAD.
 * Returns the number of aliases removed.  Called under RTNL (via
 * fn_trie_flush); uses RCU-safe list deletion.
 */
static int trie_flush_list(struct trie *t, struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}
1654
/*
 * Flush dead aliases from every leaf_info hanging off leaf @l and
 * free leaf_infos whose alias lists drain completely.
 * Returns the total number of aliases removed.
 */
static int trie_flush_leaf(struct trie *t, struct leaf *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
		found += trie_flush_list(t, &li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}
1672
1673 /* rcu_read_lock needs to be hold by caller from readside */
1674
1675 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1676 {
1677 struct node *c = (struct node *) thisleaf;
1678 struct tnode *p;
1679 int idx;
1680 struct node *trie = rcu_dereference(t->trie);
1681
1682 if (c == NULL) {
1683 if (trie == NULL)
1684 return NULL;
1685
1686 if (IS_LEAF(trie)) /* trie w. just a leaf */
1687 return (struct leaf *) trie;
1688
1689 p = (struct tnode*) trie; /* Start */
1690 } else
1691 p = node_parent(c);
1692
1693 while (p) {
1694 int pos, last;
1695
1696 /* Find the next child of the parent */
1697 if (c)
1698 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1699 else
1700 pos = 0;
1701
1702 last = 1 << p->bits;
1703 for (idx = pos; idx < last ; idx++) {
1704 c = rcu_dereference(p->child[idx]);
1705
1706 if (!c)
1707 continue;
1708
1709 /* Decend if tnode */
1710 while (IS_TNODE(c)) {
1711 p = (struct tnode *) c;
1712 idx = 0;
1713
1714 /* Rightmost non-NULL branch */
1715 if (p && IS_TNODE(p))
1716 while (!(c = rcu_dereference(p->child[idx]))
1717 && idx < (1<<p->bits)) idx++;
1718
1719 /* Done with this tnode? */
1720 if (idx >= (1 << p->bits) || !c)
1721 goto up;
1722 }
1723 return (struct leaf *) c;
1724 }
1725 up:
1726 /* No more children go up one step */
1727 c = (struct node *) p;
1728 p = node_parent(c);
1729 }
1730 return NULL; /* Ready. Root of trie */
1731 }
1732
1733 /*
1734 * Caller must hold RTNL.
1735 */
1736 static int fn_trie_flush(struct fib_table *tb)
1737 {
1738 struct trie *t = (struct trie *) tb->tb_data;
1739 struct leaf *ll = NULL, *l = NULL;
1740 int found = 0, h;
1741
1742 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1743 found += trie_flush_leaf(t, l);
1744
1745 if (ll && hlist_empty(&ll->list))
1746 trie_leaf_remove(t, ll->key);
1747 ll = l;
1748 }
1749
1750 if (ll && hlist_empty(&ll->list))
1751 trie_leaf_remove(t, ll->key);
1752
1753 pr_debug("trie_flush found=%d\n", found);
1754 return found;
1755 }
1756
/*
 * Select a default route among the /0 aliases (mirrors fib_hash's
 * algorithm): walk candidates in priority order, skipping entries
 * whose scope/type do not fit, and fall back via fib_detect_death()
 * to the last route not known to be dead.  Updates tb->tb_default
 * with the chosen candidate's order, or -1.
 */
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	/* The default routes all live on the key-0 leaf at plen 0. */
	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Candidates are priority-ordered; stop at worse ones. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}
	/* All candidates look dead: use the last resort, if any. */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	rcu_read_unlock();
}
1828
1829 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1830 struct sk_buff *skb, struct netlink_callback *cb)
1831 {
1832 int i, s_i;
1833 struct fib_alias *fa;
1834
1835 __be32 xkey = htonl(key);
1836
1837 s_i = cb->args[4];
1838 i = 0;
1839
1840 /* rcu_read_lock is hold by caller */
1841
1842 list_for_each_entry_rcu(fa, fah, fa_list) {
1843 if (i < s_i) {
1844 i++;
1845 continue;
1846 }
1847 BUG_ON(!fa->fa_info);
1848
1849 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1850 cb->nlh->nlmsg_seq,
1851 RTM_NEWROUTE,
1852 tb->tb_id,
1853 fa->fa_type,
1854 fa->fa_scope,
1855 xkey,
1856 plen,
1857 fa->fa_tos,
1858 fa->fa_info, 0) < 0) {
1859 cb->args[4] = i;
1860 return -1;
1861 }
1862 i++;
1863 }
1864 cb->args[4] = i;
1865 return skb->len;
1866 }
1867
1868 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1869 struct netlink_callback *cb)
1870 {
1871 int h, s_h;
1872 struct list_head *fa_head;
1873 struct leaf *l = NULL;
1874
1875 s_h = cb->args[3];
1876
1877 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1878 if (h < s_h)
1879 continue;
1880 if (h > s_h)
1881 memset(&cb->args[4], 0,
1882 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1883
1884 fa_head = get_fa_head(l, plen);
1885
1886 if (!fa_head)
1887 continue;
1888
1889 if (list_empty(fa_head))
1890 continue;
1891
1892 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1893 cb->args[3] = h;
1894 return -1;
1895 }
1896 }
1897 cb->args[3] = h;
1898 return skb->len;
1899 }
1900
1901 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1902 {
1903 int m, s_m;
1904 struct trie *t = (struct trie *) tb->tb_data;
1905
1906 s_m = cb->args[2];
1907
1908 rcu_read_lock();
1909 for (m = 0; m <= 32; m++) {
1910 if (m < s_m)
1911 continue;
1912 if (m > s_m)
1913 memset(&cb->args[3], 0,
1914 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1915
1916 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1917 cb->args[2] = m;
1918 goto out;
1919 }
1920 }
1921 rcu_read_unlock();
1922 cb->args[2] = m;
1923 return skb->len;
1924 out:
1925 rcu_read_unlock();
1926 return -1;
1927 }
1928
1929 /* Fix more generic FIB names for init later */
1930
1931 struct fib_table *fib_hash_init(u32 id)
1932 {
1933 struct fib_table *tb;
1934 struct trie *t;
1935
1936 if (fn_alias_kmem == NULL)
1937 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1938 sizeof(struct fib_alias),
1939 0, SLAB_HWCACHE_ALIGN,
1940 NULL);
1941
1942 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1943 GFP_KERNEL);
1944 if (tb == NULL)
1945 return NULL;
1946
1947 tb->tb_id = id;
1948 tb->tb_default = -1;
1949 tb->tb_lookup = fn_trie_lookup;
1950 tb->tb_insert = fn_trie_insert;
1951 tb->tb_delete = fn_trie_delete;
1952 tb->tb_flush = fn_trie_flush;
1953 tb->tb_select_default = fn_trie_select_default;
1954 tb->tb_dump = fn_trie_dump;
1955
1956 t = (struct trie *) tb->tb_data;
1957 memset(t, 0, sizeof(*t));
1958
1959 if (id == RT_TABLE_LOCAL)
1960 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1961
1962 return tb;
1963 }
1964
1965 #ifdef CONFIG_PROC_FS
1966 /* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;	/* NOTE(review): presumably must stay
					 * first for seq_open_net() — confirm */
	struct trie *trie_local, *trie_main;	/* the two tries walked in order */
	struct tnode *tnode;		/* tnode holding the last node returned */
	struct trie *trie;		/* trie currently being walked */
	unsigned index;			/* next child slot to scan in tnode */
	unsigned depth;			/* current depth; root tnode is 1 */
};
1975
/*
 * Advance the depth-first walk one step: resume scanning iter->tnode
 * at child slot iter->index, descending into tnodes and popping back
 * to the parent when a node's children are exhausted.
 * Returns the next node (leaf or tnode) or NULL at the end.
 */
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct node *n = tnode_get_child(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Remember where to resume after this leaf. */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent((struct node *)tn);
	if (p) {
		/* Resume at the slot just after the one we came from. */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2020
2021 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2022 struct trie *t)
2023 {
2024 struct node *n ;
2025
2026 if (!t)
2027 return NULL;
2028
2029 n = rcu_dereference(t->trie);
2030
2031 if (!iter)
2032 return NULL;
2033
2034 if (n) {
2035 if (IS_TNODE(n)) {
2036 iter->tnode = (struct tnode *) n;
2037 iter->trie = t;
2038 iter->index = 0;
2039 iter->depth = 1;
2040 } else {
2041 iter->tnode = NULL;
2042 iter->trie = t;
2043 iter->index = 0;
2044 iter->depth = 0;
2045 }
2046 return n;
2047 }
2048 return NULL;
2049 }
2050
2051 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2052 {
2053 struct node *n;
2054 struct fib_trie_iter iter;
2055
2056 memset(s, 0, sizeof(*s));
2057
2058 rcu_read_lock();
2059 for (n = fib_trie_get_first(&iter, t); n;
2060 n = fib_trie_get_next(&iter)) {
2061 if (IS_LEAF(n)) {
2062 s->leaves++;
2063 s->totdepth += iter.depth;
2064 if (iter.depth > s->maxdepth)
2065 s->maxdepth = iter.depth;
2066 } else {
2067 const struct tnode *tn = (const struct tnode *) n;
2068 int i;
2069
2070 s->tnodes++;
2071 if (tn->bits < MAX_STAT_DEPTH)
2072 s->nodesizes[tn->bits]++;
2073
2074 for (i = 0; i < (1<<tn->bits); i++)
2075 if (!tn->child[i])
2076 s->nullpointers++;
2077 }
2078 }
2079 rcu_read_unlock();
2080 }
2081
2082 /*
2083 * This outputs /proc/net/fib_triestats
2084 */
2085 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2086 {
2087 unsigned i, max, pointers, bytes, avdepth;
2088
2089 if (stat->leaves)
2090 avdepth = stat->totdepth*100 / stat->leaves;
2091 else
2092 avdepth = 0;
2093
2094 seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 );
2095 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2096
2097 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2098
2099 bytes = sizeof(struct leaf) * stat->leaves;
2100 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2101 bytes += sizeof(struct tnode) * stat->tnodes;
2102
2103 max = MAX_STAT_DEPTH;
2104 while (max > 0 && stat->nodesizes[max-1] == 0)
2105 max--;
2106
2107 pointers = 0;
2108 for (i = 1; i <= max; i++)
2109 if (stat->nodesizes[i] != 0) {
2110 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2111 pointers += (1<<i) * stat->nodesizes[i];
2112 }
2113 seq_putc(seq, '\n');
2114 seq_printf(seq, "\tPointers: %u\n", pointers);
2115
2116 bytes += sizeof(struct node *) * pointers;
2117 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2118 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2119 }
2120
2121 #ifdef CONFIG_IP_FIB_TRIE_STATS
2122 static void trie_show_usage(struct seq_file *seq,
2123 const struct trie_use_stats *stats)
2124 {
2125 seq_printf(seq, "\nCounters:\n---------\n");
2126 seq_printf(seq,"gets = %u\n", stats->gets);
2127 seq_printf(seq,"backtracks = %u\n", stats->backtrack);
2128 seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed);
2129 seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss);
2130 seq_printf(seq,"null node hit= %u\n", stats->null_node_hit);
2131 seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped);
2132 }
2133 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2134
2135
2136 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2137 {
2138 struct net *net = (struct net *)seq->private;
2139 struct trie *trie_local, *trie_main;
2140 struct trie_stat *stat;
2141 struct fib_table *tb;
2142
2143 trie_local = NULL;
2144 tb = fib_get_table(net, RT_TABLE_LOCAL);
2145 if (tb)
2146 trie_local = (struct trie *) tb->tb_data;
2147
2148 trie_main = NULL;
2149 tb = fib_get_table(net, RT_TABLE_MAIN);
2150 if (tb)
2151 trie_main = (struct trie *) tb->tb_data;
2152
2153
2154 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2155 if (!stat)
2156 return -ENOMEM;
2157
2158 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2159 sizeof(struct leaf), sizeof(struct tnode));
2160
2161 if (trie_local) {
2162 seq_printf(seq, "Local:\n");
2163 trie_collect_stats(trie_local, stat);
2164 trie_show_stats(seq, stat);
2165 #ifdef CONFIG_IP_FIB_TRIE_STATS
2166 trie_show_usage(seq, &trie_local->stats);
2167 #endif
2168 }
2169
2170 if (trie_main) {
2171 seq_printf(seq, "Main:\n");
2172 trie_collect_stats(trie_main, stat);
2173 trie_show_stats(seq, stat);
2174 #ifdef CONFIG_IP_FIB_TRIE_STATS
2175 trie_show_usage(seq, &trie_main->stats);
2176 #endif
2177 }
2178 kfree(stat);
2179
2180 return 0;
2181 }
2182
2183 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2184 {
2185 int err;
2186 struct net *net;
2187
2188 net = get_proc_net(inode);
2189 if (net == NULL)
2190 return -ENXIO;
2191 err = single_open(file, fib_triestat_seq_show, net);
2192 if (err < 0) {
2193 put_net(net);
2194 return err;
2195 }
2196 return 0;
2197 }
2198
2199 static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2200 {
2201 struct seq_file *seq = f->private_data;
2202 put_net(seq->private);
2203 return single_release(ino, f);
2204 }
2205
/* File operations for /proc/net/fib_triestats (single_open based). */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = fib_triestat_seq_release,
};
2213
2214 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2215 loff_t pos)
2216 {
2217 loff_t idx = 0;
2218 struct node *n;
2219
2220 for (n = fib_trie_get_first(iter, iter->trie_local);
2221 n; ++idx, n = fib_trie_get_next(iter)) {
2222 if (pos == idx)
2223 return n;
2224 }
2225
2226 for (n = fib_trie_get_first(iter, iter->trie_main);
2227 n; ++idx, n = fib_trie_get_next(iter)) {
2228 if (pos == idx)
2229 return n;
2230 }
2231 return NULL;
2232 }
2233
2234 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2235 __acquires(RCU)
2236 {
2237 struct fib_trie_iter *iter = seq->private;
2238 struct fib_table *tb;
2239
2240 if (!iter->trie_local) {
2241 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2242 if (tb)
2243 iter->trie_local = (struct trie *) tb->tb_data;
2244 }
2245 if (!iter->trie_main) {
2246 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2247 if (tb)
2248 iter->trie_main = (struct trie *) tb->tb_data;
2249 }
2250 rcu_read_lock();
2251 if (*pos == 0)
2252 return SEQ_START_TOKEN;
2253 return fib_trie_get_idx(iter, *pos - 1);
2254 }
2255
2256 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257 {
2258 struct fib_trie_iter *iter = seq->private;
2259 void *l = v;
2260
2261 ++*pos;
2262 if (v == SEQ_START_TOKEN)
2263 return fib_trie_get_idx(iter, 0);
2264
2265 v = fib_trie_get_next(iter);
2266 BUG_ON(v == l);
2267 if (v)
2268 return v;
2269
2270 /* continue scan in next trie */
2271 if (iter->trie == iter->trie_local)
2272 return fib_trie_get_first(iter, iter->trie_main);
2273
2274 return NULL;
2275 }
2276
/* seq_file stop: pairs with the rcu_read_lock() in fib_trie_seq_start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2282
/* Emit @n single-space indent steps. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, " ");
}
2287
2288 static inline const char *rtn_scope(enum rt_scope_t s)
2289 {
2290 static char buf[32];
2291
2292 switch (s) {
2293 case RT_SCOPE_UNIVERSE: return "universe";
2294 case RT_SCOPE_SITE: return "site";
2295 case RT_SCOPE_LINK: return "link";
2296 case RT_SCOPE_HOST: return "host";
2297 case RT_SCOPE_NOWHERE: return "nowhere";
2298 default:
2299 snprintf(buf, sizeof(buf), "scope=%d", s);
2300 return buf;
2301 }
2302 }
2303
/* Printable names for the RTN_* route types, indexed by type value. */
static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2318
2319 static inline const char *rtn_type(unsigned t)
2320 {
2321 static char buf[32];
2322
2323 if (t < __RTN_MAX && rtn_type_names[t])
2324 return rtn_type_names[t];
2325 snprintf(buf, sizeof(buf), "type %u", t);
2326 return buf;
2327 }
2328
2329 /* Pretty print the trie */
2330 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2331 {
2332 const struct fib_trie_iter *iter = seq->private;
2333 struct node *n = v;
2334
2335 if (v == SEQ_START_TOKEN)
2336 return 0;
2337
2338 if (!node_parent(n)) {
2339 if (iter->trie == iter->trie_local)
2340 seq_puts(seq, "<local>:\n");
2341 else
2342 seq_puts(seq, "<main>:\n");
2343 }
2344
2345 if (IS_TNODE(n)) {
2346 struct tnode *tn = (struct tnode *) n;
2347 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2348
2349 seq_indent(seq, iter->depth-1);
2350 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2351 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2352 tn->empty_children);
2353
2354 } else {
2355 struct leaf *l = (struct leaf *) n;
2356 int i;
2357 __be32 val = htonl(l->key);
2358
2359 seq_indent(seq, iter->depth);
2360 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2361 for (i = 32; i >= 0; i--) {
2362 struct leaf_info *li = find_leaf_info(l, i);
2363 if (li) {
2364 struct fib_alias *fa;
2365 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2366 seq_indent(seq, iter->depth+1);
2367 seq_printf(seq, " /%d %s %s", i,
2368 rtn_scope(fa->fa_scope),
2369 rtn_type(fa->fa_type));
2370 if (fa->fa_tos)
2371 seq_printf(seq, "tos =%d\n",
2372 fa->fa_tos);
2373 seq_putc(seq, '\n');
2374 }
2375 }
2376 }
2377 }
2378
2379 return 0;
2380 }
2381
/* seq_file iterator operations for /proc/net/fib_trie. */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2388
2389 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2390 {
2391 return seq_open_net(inode, file, &fib_trie_seq_ops,
2392 sizeof(struct fib_trie_iter));
2393 }
2394
/* File operations for /proc/net/fib_trie (read-only seq_file). */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2402
2403 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2404 {
2405 static unsigned type2flags[RTN_MAX + 1] = {
2406 [7] = RTF_REJECT, [8] = RTF_REJECT,
2407 };
2408 unsigned flags = type2flags[type];
2409
2410 if (fi && fi->fib_nh->nh_gw)
2411 flags |= RTF_GATEWAY;
2412 if (mask == htonl(0xFFFFFFFF))
2413 flags |= RTF_HOST;
2414 flags |= RTF_UP;
2415 return flags;
2416 }
2417
/*
 * This outputs /proc/net/route.
 * The format of the file is not supposed to be changed
 * and needs to be same as fib_hash output to avoid breaking
 * legacy utilities (route(8), netstat(8) parse it field-by-field).
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct leaf *l = v;
	int i;
	char bf[128];	/* one formatted route line (padded to 127 chars) */

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	/* Only the main table is reported here; skip the local trie. */
	if (iter->trie == iter->trie_local)
		return 0;
	/* Internal nodes carry no routes; only leaves are printed. */
	if (IS_TNODE(l))
		return 0;

	/* Walk every possible prefix length, most specific first. */
	for (i=32; i>=0; i--) {
		struct leaf_info *li = find_leaf_info(l, i);
		struct fib_alias *fa;
		__be32 mask, prefix;

		if (!li)
			continue;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);

			/* fib_hash omits these types as well; keep parity. */
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			if (fi)
				snprintf(bf, sizeof(bf),
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 /* MTU column: advmss + 40, presumably
					  * IP+TCP header allowance as in
					  * fib_hash — confirm against it. */
					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3);
			else
				snprintf(bf, sizeof(bf),
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0);

			seq_printf(seq, "%-127s\n", bf);
		}
	}

	return 0;
}
2485
/* seq_file iterator operations backing /proc/net/route (shares the
 * trie walkers with /proc/net/fib_trie, only the show step differs). */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_route_seq_show,
};
2492
2493 static int fib_route_seq_open(struct inode *inode, struct file *file)
2494 {
2495 return seq_open_net(inode, file, &fib_route_seq_ops,
2496 sizeof(struct fib_trie_iter));
2497 }
2498
/* File operations for the legacy /proc/net/route (read-only seq_file). */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2506
2507 int __net_init fib_proc_init(struct net *net)
2508 {
2509 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2510 goto out1;
2511
2512 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2513 &fib_triestat_fops))
2514 goto out2;
2515
2516 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2517 goto out3;
2518
2519 return 0;
2520
2521 out3:
2522 proc_net_remove(net, "fib_triestat");
2523 out2:
2524 proc_net_remove(net, "fib_trie");
2525 out1:
2526 return -ENOMEM;
2527 }
2528
2529 void __net_exit fib_proc_exit(struct net *net)
2530 {
2531 proc_net_remove(net, "fib_trie");
2532 proc_net_remove(net, "fib_triestat");
2533 proc_net_remove(net, "route");
2534 }
2535
2536 #endif /* CONFIG_PROC_FS */