net/ipv4/fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46 #define VERSION "0.403"
47
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/rcupdate.h>
66 #include <linux/skbuff.h>
67 #include <linux/netlink.h>
68 #include <linux/init.h>
69 #include <linux/list.h>
70 #include <net/ip.h>
71 #include <net/protocol.h>
72 #include <net/route.h>
73 #include <net/tcp.h>
74 #include <net/sock.h>
75 #include <net/ip_fib.h>
76 #include "fib_lookup.h"
77
78 #undef CONFIG_IP_FIB_TRIE_STATS
79 #define MAX_CHILDS 16384
80
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
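/*
 * Illustrative examples (not part of the original source), assuming a
 * 32-bit t_key (KEYLENGTH == 32):
 *
 *   MASK_PFX(0xC0A80101, 24)  == 0xC0A80100   keep only the top 24 bits
 *   TKEY_GET_MASK(8, 4)       == 0x00F00000   4 one-bits starting at bit 8,
 *                                             counted from the MSB
 */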
84
85 typedef unsigned int t_key;
86
87 #define T_TNODE 0
88 #define T_LEAF 1
89 #define NODE_TYPE_MASK 0x1UL
90 #define NODE_PARENT(node) \
91 ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
92
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
95 #define NODE_SET_PARENT(node, ptr) \
96 rcu_assign_pointer((node)->parent, \
97 ((unsigned long)(ptr)) | NODE_TYPE(node))
98
99 #define IS_TNODE(n) (!(n->parent & T_LEAF))
100 #define IS_LEAF(n) (n->parent & T_LEAF)
101
102 struct node {
103 t_key key;
104 unsigned long parent;
105 };
106
107 struct leaf {
108 t_key key;
109 unsigned long parent;
110 struct hlist_head list;
111 struct rcu_head rcu;
112 };
113
114 struct leaf_info {
115 struct hlist_node hlist;
116 struct rcu_head rcu;
117 int plen;
118 struct list_head falh;
119 };
120
121 struct tnode {
122 t_key key;
123 unsigned long parent;
124 unsigned short pos:5; /* log2(KEYLENGTH) bits needed */
125 unsigned short bits:5; /* log2(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct rcu_head rcu;
129 struct node *child[0];
130 };
131
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
140 };
141 #endif
142
143 struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
150 };
151
152 struct trie {
153 struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156 #endif
157 int size;
158 unsigned int revision;
159 };
160
161 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
162 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 static void tnode_free(struct tnode *tn);
167
168 static kmem_cache_t *fn_alias_kmem __read_mostly;
169 static struct trie *trie_local = NULL, *trie_main = NULL;
170
171
172 /* rcu_read_lock needs to be held by the caller on the read side */
173
174 static inline struct node *tnode_get_child(struct tnode *tn, int i)
175 {
176 BUG_ON(i >= 1 << tn->bits);
177
178 return rcu_dereference(tn->child[i]);
179 }
180
181 static inline int tnode_child_length(const struct tnode *tn)
182 {
183 return 1 << tn->bits;
184 }
185
186 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
187 {
188 if (offset < KEYLENGTH)
189 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
190 else
191 return 0;
192 }
193
194 static inline int tkey_equals(t_key a, t_key b)
195 {
196 return a == b;
197 }
198
199 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
200 {
201 if (bits == 0 || offset >= KEYLENGTH)
202 return 1;
203 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
204 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
205 }
206
207 static inline int tkey_mismatch(t_key a, int offset, t_key b)
208 {
209 t_key diff = a ^ b;
210 int i = offset;
211
212 if (!diff)
213 return 0;
214 while ((diff << i) >> (KEYLENGTH-1) == 0)
215 i++;
216 return i;
217 }
218
219 /*
220 To understand this stuff, an understanding of keys and all their bits is
221 necessary. Every node in the trie has a key associated with it, but not
222 all of the bits in that key are significant.
223
224 Consider a node 'n' and its parent 'tp'.
225
226 If n is a leaf, every bit in its key is significant. Its presence is
227 necessitated by path compression, since during a tree traversal (when
228 searching for a leaf - unless we are doing an insertion) we will completely
229 ignore all skipped bits we encounter. Thus we need to verify, at the end of
230 a potentially successful search, that we have indeed been walking the
231 correct key path.
232
233 Note that we can never "miss" the correct key in the tree if present by
234 following the wrong path. Path compression ensures that segments of the key
235 that are the same for all keys with a given prefix are skipped, but the
236 skipped part *is* identical for each node in the subtrie below the skipped
237 bit! fib_insert_node() in this implementation takes care of that - note the
238 call to tkey_sub_equals() in fib_insert_node().
239
240 If n is an internal node - a 'tnode' here - the various parts of its key
241 have many different meanings.
242
243 Example:
244 _________________________________________________________________
245 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
246 -----------------------------------------------------------------
247 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
248
249 _________________________________________________________________
250 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
251 -----------------------------------------------------------------
252 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
253
254 tp->pos = 7
255 tp->bits = 3
256 n->pos = 15
257 n->bits = 4
258
259 First, let's just ignore the bits that come before the parent tp, that is
260 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
261 not use them for anything.
262
263 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
264 index into the parent's child array. That is, they will be used to find
265 'n' among tp's children.
266
267 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
268 for the node n.
269
270 All the bits we have seen so far are significant to the node n. The rest
271 of the bits are really not needed or indeed known in n->key.
272
273 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
274 n's child array, and will of course be different for each child.
275
276
277 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
278 at this point.
279
280 */
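/*
 * A small worked example (illustrative, not from the original source),
 * using the layout above (tp->pos = 7, tp->bits = 3, n->pos = 15,
 * n->bits = 4) and the helpers defined earlier in this file:
 *
 *   index of n in tp's child array - the three "N" bits (7..9):
 *       cindex = tkey_extract_bits(n->key, tp->pos, tp->bits);
 *
 *   index of the next child below n - the four "C" bits (15..18):
 *       cindex = tkey_extract_bits(key, n->pos, n->bits);
 *
 * tkey_extract_bits(a, 7, 3) shifts 'a' left by 7 to drop the "i" bits
 * and then right by (KEYLENGTH - 3), leaving only bits 7..9.
 */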
281
282 static inline void check_tnode(const struct tnode *tn)
283 {
284 WARN_ON(tn && tn->pos+tn->bits > 32);
285 }
286
287 static int halve_threshold = 25;
288 static int inflate_threshold = 50;
289
290
291 static void __alias_free_mem(struct rcu_head *head)
292 {
293 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
294 kmem_cache_free(fn_alias_kmem, fa);
295 }
296
297 static inline void alias_free_mem_rcu(struct fib_alias *fa)
298 {
299 call_rcu(&fa->rcu, __alias_free_mem);
300 }
301
302 static void __leaf_free_rcu(struct rcu_head *head)
303 {
304 kfree(container_of(head, struct leaf, rcu));
305 }
306
307 static inline void free_leaf(struct leaf *leaf)
308 {
309 call_rcu(&leaf->rcu, __leaf_free_rcu);
310 }
311
312 static void __leaf_info_free_rcu(struct rcu_head *head)
313 {
314 kfree(container_of(head, struct leaf_info, rcu));
315 }
316
317 static inline void free_leaf_info(struct leaf_info *leaf)
318 {
319 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
320 }
321
322 static struct tnode *tnode_alloc(unsigned int size)
323 {
324 struct page *pages;
325
326 if (size <= PAGE_SIZE)
327 return kcalloc(size, 1, GFP_KERNEL);
328
329 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
330 if (!pages)
331 return NULL;
332
333 return page_address(pages);
334 }
335
336 static void __tnode_free_rcu(struct rcu_head *head)
337 {
338 struct tnode *tn = container_of(head, struct tnode, rcu);
339 unsigned int size = sizeof(struct tnode) +
340 (1 << tn->bits) * sizeof(struct node *);
341
342 if (size <= PAGE_SIZE)
343 kfree(tn);
344 else
345 free_pages((unsigned long)tn, get_order(size));
346 }
347
348 static inline void tnode_free(struct tnode *tn)
349 {
350 call_rcu(&tn->rcu, __tnode_free_rcu);
351 }
352
353 static struct leaf *leaf_new(void)
354 {
355 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
356 if (l) {
357 l->parent = T_LEAF;
358 INIT_HLIST_HEAD(&l->list);
359 }
360 return l;
361 }
362
363 static struct leaf_info *leaf_info_new(int plen)
364 {
365 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
366 if (li) {
367 li->plen = plen;
368 INIT_LIST_HEAD(&li->falh);
369 }
370 return li;
371 }
372
373 static struct tnode* tnode_new(t_key key, int pos, int bits)
374 {
375 int nchildren = 1<<bits;
376 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
377 struct tnode *tn = tnode_alloc(sz);
378
379 if (tn) {
380 memset(tn, 0, sz);
381 tn->parent = T_TNODE;
382 tn->pos = pos;
383 tn->bits = bits;
384 tn->key = key;
385 tn->full_children = 0;
386 tn->empty_children = 1<<bits;
387 }
388
389 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
390 (unsigned int) (sizeof(struct node) * 1<<bits));
391 return tn;
392 }
393
394 /*
395 * Check whether a tnode 'n' is "full", i.e. it is an internal node
396 * and no bits are skipped. See discussion in dyntree paper p. 6
397 */
398
399 static inline int tnode_full(const struct tnode *tn, const struct node *n)
400 {
401 if (n == NULL || IS_LEAF(n))
402 return 0;
403
404 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
405 }
406
407 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
408 {
409 tnode_put_child_reorg(tn, i, n, -1);
410 }
411
412 /*
413 * Add a child at position i overwriting the old value.
414 * Update the value of full_children and empty_children.
415 */
416
417 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
418 {
419 struct node *chi = tn->child[i];
420 int isfull;
421
422 BUG_ON(i >= 1<<tn->bits);
423
424
425 /* update emptyChildren */
426 if (n == NULL && chi != NULL)
427 tn->empty_children++;
428 else if (n != NULL && chi == NULL)
429 tn->empty_children--;
430
431 /* update fullChildren */
432 if (wasfull == -1)
433 wasfull = tnode_full(tn, chi);
434
435 isfull = tnode_full(tn, n);
436 if (wasfull && !isfull)
437 tn->full_children--;
438 else if (!wasfull && isfull)
439 tn->full_children++;
440
441 if (n)
442 NODE_SET_PARENT(n, tn);
443
444 rcu_assign_pointer(tn->child[i], n);
445 }
446
447 static struct node *resize(struct trie *t, struct tnode *tn)
448 {
449 int i;
450 int err = 0;
451 struct tnode *old_tn;
452
453 if (!tn)
454 return NULL;
455
456 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
457 tn, inflate_threshold, halve_threshold);
458
459 /* No children */
460 if (tn->empty_children == tnode_child_length(tn)) {
461 tnode_free(tn);
462 return NULL;
463 }
464 /* One child */
465 if (tn->empty_children == tnode_child_length(tn) - 1)
466 for (i = 0; i < tnode_child_length(tn); i++) {
467 struct node *n;
468
469 n = tn->child[i];
470 if (!n)
471 continue;
472
473 /* compress one level */
474 NODE_SET_PARENT(n, NULL);
475 tnode_free(tn);
476 return n;
477 }
478 /*
479 * Double as long as the resulting node has a fraction of non-empty
480 * children that is above the threshold.
481 */
482
483 /*
484 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
485 * the Helsinki University of Technology and Matti Tikkanen of Nokia
486 * Telecommunications, page 6:
487 * "A node is doubled if the ratio of non-empty children to all
488 * children in the *doubled* node is at least 'high'."
489 *
490 * 'high' in this instance is the variable 'inflate_threshold'. It
491 * is expressed as a percentage, so we multiply it with
492 * tnode_child_length() and instead of multiplying by 2 (since the
493 * child array will be doubled by inflate()) and multiplying
494 * the left-hand side by 100 (to handle the percentage thing) we
495 * multiply the left-hand side by 50.
496 *
497 * The left-hand side may look a bit weird: tnode_child_length(tn)
498 * - tn->empty_children is of course the number of non-null children
499 * in the current node. tn->full_children is the number of "full"
500 * children, that is non-null tnodes with a skip value of 0.
501 * All of those will be doubled in the resulting inflated tnode, so
502 * we just count them one extra time here.
503 *
504 * A clearer way to write this would be:
505 *
506 * to_be_doubled = tn->full_children;
507 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
508 * tn->full_children;
509 *
510 * new_child_length = tnode_child_length(tn) * 2;
511 *
512 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
513 * new_child_length;
514 * if (new_fill_factor >= inflate_threshold)
515 *
516 * ...and so on, though it would mess up the while () loop.
517 *
518 * anyway,
519 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
520 * inflate_threshold
521 *
522 * avoid a division:
523 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
524 * inflate_threshold * new_child_length
525 *
526 * expand not_to_be_doubled and to_be_doubled, and shorten:
527 * 100 * (tnode_child_length(tn) - tn->empty_children +
528 * tn->full_children) >= inflate_threshold * new_child_length
529 *
530 * expand new_child_length:
531 * 100 * (tnode_child_length(tn) - tn->empty_children +
532 * tn->full_children) >=
533 * inflate_threshold * tnode_child_length(tn) * 2
534 *
535 * shorten again:
536 * 50 * (tn->full_children + tnode_child_length(tn) -
537 * tn->empty_children) >= inflate_threshold *
538 * tnode_child_length(tn)
539 *
540 */
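/*
 * Worked example for the condition below (illustrative numbers, not
 * from the original source), with inflate_threshold = 50 and a tnode
 * of 16 children (bits = 4) that has empty_children = 4 and
 * full_children = 6:
 *
 *   50 * (6 + 16 - 4) = 900  >=  50 * 16 = 800   ->  inflate
 *
 * i.e. the doubled node would be 100 * (6 + 2*6) / 32 = 56.25 % full,
 * which is above the 50 % threshold.
 */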
541
542 check_tnode(tn);
543
544 err = 0;
545 while ((tn->full_children > 0 &&
546 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
547 inflate_threshold * tnode_child_length(tn))) {
548
549 old_tn = tn;
550 tn = inflate(t, tn);
551 if (IS_ERR(tn)) {
552 tn = old_tn;
553 #ifdef CONFIG_IP_FIB_TRIE_STATS
554 t->stats.resize_node_skipped++;
555 #endif
556 break;
557 }
558 }
559
560 check_tnode(tn);
561
562 /*
563 * Halve as long as the number of empty children in this
564 * node is above threshold.
565 */
566
567 err = 0;
568 while (tn->bits > 1 &&
569 100 * (tnode_child_length(tn) - tn->empty_children) <
570 halve_threshold * tnode_child_length(tn)) {
571
572 old_tn = tn;
573 tn = halve(t, tn);
574 if (IS_ERR(tn)) {
575 tn = old_tn;
576 #ifdef CONFIG_IP_FIB_TRIE_STATS
577 t->stats.resize_node_skipped++;
578 #endif
579 break;
580 }
581 }
582
583
584 /* Only one child remains */
585 if (tn->empty_children == tnode_child_length(tn) - 1)
586 for (i = 0; i < tnode_child_length(tn); i++) {
587 struct node *n;
588
589 n = tn->child[i];
590 if (!n)
591 continue;
592
593 /* compress one level */
594
595 NODE_SET_PARENT(n, NULL);
596 tnode_free(tn);
597 return n;
598 }
599
600 return (struct node *) tn;
601 }
602
603 static struct tnode *inflate(struct trie *t, struct tnode *tn)
604 {
605 struct tnode *inode;
606 struct tnode *oldtnode = tn;
607 int olen = tnode_child_length(tn);
608 int i;
609
610 pr_debug("In inflate\n");
611
612 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
613
614 if (!tn)
615 return ERR_PTR(-ENOMEM);
616
617 /*
618 * Preallocate and store tnodes before the actual work so we
619 * don't get into an inconsistent state if memory allocation
620 * fails. In case of failure the old node is kept and the
621 * inflate of the tnode is abandoned.
622 */
623
624 for (i = 0; i < olen; i++) {
625 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
626
627 if (inode &&
628 IS_TNODE(inode) &&
629 inode->pos == oldtnode->pos + oldtnode->bits &&
630 inode->bits > 1) {
631 struct tnode *left, *right;
632 t_key m = TKEY_GET_MASK(inode->pos, 1);
633
634 left = tnode_new(inode->key&(~m), inode->pos + 1,
635 inode->bits - 1);
636 if (!left)
637 goto nomem;
638
639 right = tnode_new(inode->key|m, inode->pos + 1,
640 inode->bits - 1);
641
642 if (!right) {
643 tnode_free(left);
644 goto nomem;
645 }
646
647 put_child(t, tn, 2*i, (struct node *) left);
648 put_child(t, tn, 2*i+1, (struct node *) right);
649 }
650 }
651
652 for (i = 0; i < olen; i++) {
653 struct node *node = tnode_get_child(oldtnode, i);
654 struct tnode *left, *right;
655 int size, j;
656
657 /* An empty child */
658 if (node == NULL)
659 continue;
660
661 /* A leaf or an internal node with skipped bits */
662
663 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
664 tn->pos + tn->bits - 1) {
665 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
666 1) == 0)
667 put_child(t, tn, 2*i, node);
668 else
669 put_child(t, tn, 2*i+1, node);
670 continue;
671 }
672
673 /* An internal node with two children */
674 inode = (struct tnode *) node;
675
676 if (inode->bits == 1) {
677 put_child(t, tn, 2*i, inode->child[0]);
678 put_child(t, tn, 2*i+1, inode->child[1]);
679
680 tnode_free(inode);
681 continue;
682 }
683
684 /* An internal node with more than two children */
685
686 /* We will replace this node 'inode' with two new
687 * ones, 'left' and 'right', each with half of the
688 * original children. The two new nodes will have
689 * a position one bit further down the key and this
690 * means that the "significant" part of their keys
691 * (see the discussion near the top of this file)
692 * will differ by one bit, which will be "0" in
693 * left's key and "1" in right's key. Since we are
694 * moving the key position by one step, the bit that
695 * we are moving away from - the bit at position
696 * (inode->pos) - is the one that will differ between
697 * left and right. So... we synthesize that bit in the
698 * two new keys.
699 * The mask 'm' below will be a single "one" bit at
700 * the position (inode->pos)
701 */
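/* Illustrative example (not from the original source): with
 * inode->pos == 12 and KEYLENGTH == 32, the mask computed in the
 * preallocation loop above is
 *
 *   m = TKEY_GET_MASK(12, 1) == 0x00080000
 *
 * so 'left' was created with key == inode->key & ~m (bit 12 cleared)
 * and 'right' with key == inode->key | m (bit 12 set), both one step
 * further down, at pos == 13.
 */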
702
703 /* Use the old key, but set the new significant
704 * bit to zero.
705 */
706
707 left = (struct tnode *) tnode_get_child(tn, 2*i);
708 put_child(t, tn, 2*i, NULL);
709
710 BUG_ON(!left);
711
712 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
713 put_child(t, tn, 2*i+1, NULL);
714
715 BUG_ON(!right);
716
717 size = tnode_child_length(left);
718 for (j = 0; j < size; j++) {
719 put_child(t, left, j, inode->child[j]);
720 put_child(t, right, j, inode->child[j + size]);
721 }
722 put_child(t, tn, 2*i, resize(t, left));
723 put_child(t, tn, 2*i+1, resize(t, right));
724
725 tnode_free(inode);
726 }
727 tnode_free(oldtnode);
728 return tn;
729 nomem:
730 {
731 int size = tnode_child_length(tn);
732 int j;
733
734 for (j = 0; j < size; j++)
735 if (tn->child[j])
736 tnode_free((struct tnode *)tn->child[j]);
737
738 tnode_free(tn);
739
740 return ERR_PTR(-ENOMEM);
741 }
742 }
743
744 static struct tnode *halve(struct trie *t, struct tnode *tn)
745 {
746 struct tnode *oldtnode = tn;
747 struct node *left, *right;
748 int i;
749 int olen = tnode_child_length(tn);
750
751 pr_debug("In halve\n");
752
753 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
754
755 if (!tn)
756 return ERR_PTR(-ENOMEM);
757
758 /*
759 * Preallocate and store tnodes before the actual work so we
760 * don't get into an inconsistent state if memory allocation
761 * fails. In case of failure the old node is kept and the
762 * halve of the tnode is abandoned.
763 */
764
765 for (i = 0; i < olen; i += 2) {
766 left = tnode_get_child(oldtnode, i);
767 right = tnode_get_child(oldtnode, i+1);
768
769 /* Two nonempty children */
770 if (left && right) {
771 struct tnode *newn;
772
773 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
774
775 if (!newn)
776 goto nomem;
777
778 put_child(t, tn, i/2, (struct node *)newn);
779 }
780
781 }
782
783 for (i = 0; i < olen; i += 2) {
784 struct tnode *newBinNode;
785
786 left = tnode_get_child(oldtnode, i);
787 right = tnode_get_child(oldtnode, i+1);
788
789 /* At least one of the children is empty */
790 if (left == NULL) {
791 if (right == NULL) /* Both are empty */
792 continue;
793 put_child(t, tn, i/2, right);
794 continue;
795 }
796
797 if (right == NULL) {
798 put_child(t, tn, i/2, left);
799 continue;
800 }
801
802 /* Two nonempty children */
803 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
804 put_child(t, tn, i/2, NULL);
805 put_child(t, newBinNode, 0, left);
806 put_child(t, newBinNode, 1, right);
807 put_child(t, tn, i/2, resize(t, newBinNode));
808 }
809 tnode_free(oldtnode);
810 return tn;
811 nomem:
812 {
813 int size = tnode_child_length(tn);
814 int j;
815
816 for (j = 0; j < size; j++)
817 if (tn->child[j])
818 tnode_free((struct tnode *)tn->child[j]);
819
820 tnode_free(tn);
821
822 return ERR_PTR(-ENOMEM);
823 }
824 }
825
826 static void trie_init(struct trie *t)
827 {
828 if (!t)
829 return;
830
831 t->size = 0;
832 rcu_assign_pointer(t->trie, NULL);
833 t->revision = 0;
834 #ifdef CONFIG_IP_FIB_TRIE_STATS
835 memset(&t->stats, 0, sizeof(struct trie_use_stats));
836 #endif
837 }
838
839 /* The read side must hold rcu_read_lock; currently only the dump
840 routines do, via get_fa_head and dump */
841
842 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
843 {
844 struct hlist_node *node;
845 struct leaf_info *li;
846
847 hlist_for_each_entry_rcu(li, node, head, hlist)
848 if (li->plen == plen)
849 return li;
850
851 return NULL;
852 }
853
854 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
855 {
856 struct leaf_info *li = find_leaf_info(&l->list, plen);
857
858 if (!li)
859 return NULL;
860
861 return &li->falh;
862 }
863
864 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
865 {
866 struct leaf_info *li = NULL, *last = NULL;
867 struct hlist_node *node;
868
869 if (hlist_empty(head)) {
870 hlist_add_head_rcu(&new->hlist, head);
871 } else {
872 hlist_for_each_entry(li, node, head, hlist) {
873 if (new->plen > li->plen)
874 break;
875
876 last = li;
877 }
878 if (last)
879 hlist_add_after_rcu(&last->hlist, &new->hlist);
880 else
881 hlist_add_before_rcu(&new->hlist, &li->hlist);
882 }
883 }
884
885 /* rcu_read_lock needs to be held by the caller on the read side */
886
887 static struct leaf *
888 fib_find_node(struct trie *t, u32 key)
889 {
890 int pos;
891 struct tnode *tn;
892 struct node *n;
893
894 pos = 0;
895 n = rcu_dereference(t->trie);
896
897 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
898 tn = (struct tnode *) n;
899
900 check_tnode(tn);
901
902 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
903 pos = tn->pos + tn->bits;
904 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
905 } else
906 break;
907 }
908 /* Case we have found a leaf. Compare prefixes */
909
910 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
911 return (struct leaf *)n;
912
913 return NULL;
914 }
915
916 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
917 {
918 int wasfull;
919 t_key cindex, key;
920 struct tnode *tp = NULL;
921
922 key = tn->key;
923
924 while (tn != NULL && NODE_PARENT(tn) != NULL) {
925
926 tp = NODE_PARENT(tn);
927 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
928 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
929 tn = (struct tnode *) resize (t, (struct tnode *)tn);
930 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
931
932 if (!NODE_PARENT(tn))
933 break;
934
935 tn = NODE_PARENT(tn);
936 }
937 /* Handle last (top) tnode */
938 if (IS_TNODE(tn))
939 tn = (struct tnode*) resize(t, (struct tnode *)tn);
940
941 return (struct node*) tn;
942 }
943
944 /* only used from updater-side */
945
946 static struct list_head *
947 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
948 {
949 int pos, newpos;
950 struct tnode *tp = NULL, *tn = NULL;
951 struct node *n;
952 struct leaf *l;
953 int missbit;
954 struct list_head *fa_head = NULL;
955 struct leaf_info *li;
956 t_key cindex;
957
958 pos = 0;
959 n = t->trie;
960
961 /* If we point to NULL, stop. Either the tree is empty and we should
962 * just put a new leaf in it, or we have reached an empty child slot,
963 * and we should just put our new leaf in that.
964 * If we point to a T_TNODE, check if it matches our key. Note that
965 * a T_TNODE might be skipping any number of bits - its 'pos' need
966 * not be the parent's 'pos'+'bits'!
967 *
968 * If it does match the current key, get pos/bits from it, extract
969 * the index from our key, push the T_TNODE and walk the tree.
970 *
971 * If it doesn't, we have to replace it with a new T_TNODE.
972 *
973 * If we point to a T_LEAF, it might or might not have the same key
974 * as we do. If it does, just change the value, update the T_LEAF's
975 * value, and return it.
976 * If it doesn't, we need to replace it with a T_TNODE.
977 */
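/* A minimal sketch (illustrative comment only) of the three outcomes
 * handled below:
 *
 *   n == NULL                   -> case 2: hang the new leaf off tp
 *   IS_LEAF(n) and key matches  -> case 1: add a new leaf_info/alias
 *                                  to the existing leaf
 *   otherwise                   -> case 3: insert a new 1-bit tnode at
 *                                  the first mismatching bit, with the
 *                                  old node and the new leaf as children
 */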
978
979 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
980 tn = (struct tnode *) n;
981
982 check_tnode(tn);
983
984 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
985 tp = tn;
986 pos = tn->pos + tn->bits;
987 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
988
989 BUG_ON(n && NODE_PARENT(n) != tn);
990 } else
991 break;
992 }
993
994 /*
995 * n ----> NULL, LEAF or TNODE
996 *
997 * tp is n's (parent) ----> NULL or TNODE
998 */
999
1000 BUG_ON(tp && IS_LEAF(tp));
1001
1002 /* Case 1: n is a leaf. Compare prefixes */
1003
1004 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1005 struct leaf *l = (struct leaf *) n;
1006
1007 li = leaf_info_new(plen);
1008
1009 if (!li) {
1010 *err = -ENOMEM;
1011 goto err;
1012 }
1013
1014 fa_head = &li->falh;
1015 insert_leaf_info(&l->list, li);
1016 goto done;
1017 }
1018 t->size++;
1019 l = leaf_new();
1020
1021 if (!l) {
1022 *err = -ENOMEM;
1023 goto err;
1024 }
1025
1026 l->key = key;
1027 li = leaf_info_new(plen);
1028
1029 if (!li) {
1030 tnode_free((struct tnode *) l);
1031 *err = -ENOMEM;
1032 goto err;
1033 }
1034
1035 fa_head = &li->falh;
1036 insert_leaf_info(&l->list, li);
1037
1038 if (t->trie && n == NULL) {
1039 /* Case 2: n is NULL, and will just insert a new leaf */
1040
1041 NODE_SET_PARENT(l, tp);
1042
1043 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1044 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1045 } else {
1046 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1047 /*
1048 * Add a new tnode here
1049 * the first tnode needs some special handling
1050 */
1051
1052 if (tp)
1053 pos = tp->pos+tp->bits;
1054 else
1055 pos = 0;
1056
1057 if (n) {
1058 newpos = tkey_mismatch(key, pos, n->key);
1059 tn = tnode_new(n->key, newpos, 1);
1060 } else {
1061 newpos = 0;
1062 tn = tnode_new(key, newpos, 1); /* First tnode */
1063 }
1064
1065 if (!tn) {
1066 free_leaf_info(li);
1067 tnode_free((struct tnode *) l);
1068 *err = -ENOMEM;
1069 goto err;
1070 }
1071
1072 NODE_SET_PARENT(tn, tp);
1073
1074 missbit = tkey_extract_bits(key, newpos, 1);
1075 put_child(t, tn, missbit, (struct node *)l);
1076 put_child(t, tn, 1-missbit, n);
1077
1078 if (tp) {
1079 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1080 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1081 } else {
1082 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1083 tp = tn;
1084 }
1085 }
1086
1087 if (tp && tp->pos + tp->bits > 32)
1088 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1089 tp, tp->pos, tp->bits, key, plen);
1090
1091 /* Rebalance the trie */
1092
1093 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1094 done:
1095 t->revision++;
1096 err:
1097 return fa_head;
1098 }
1099
1100 static int
1101 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1102 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1103 {
1104 struct trie *t = (struct trie *) tb->tb_data;
1105 struct fib_alias *fa, *new_fa;
1106 struct list_head *fa_head = NULL;
1107 struct fib_info *fi;
1108 int plen = r->rtm_dst_len;
1109 int type = r->rtm_type;
1110 u8 tos = r->rtm_tos;
1111 u32 key, mask;
1112 int err;
1113 struct leaf *l;
1114
1115 if (plen > 32)
1116 return -EINVAL;
1117
1118 key = 0;
1119 if (rta->rta_dst)
1120 memcpy(&key, rta->rta_dst, 4);
1121
1122 key = ntohl(key);
1123
1124 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1125
1126 mask = ntohl(inet_make_mask(plen));
1127
1128 if (key & ~mask)
1129 return -EINVAL;
1130
1131 key = key & mask;
1132
1133 fi = fib_create_info(r, rta, nlhdr, &err);
1134
1135 if (!fi)
1136 goto err;
1137
1138 l = fib_find_node(t, key);
1139 fa = NULL;
1140
1141 if (l) {
1142 fa_head = get_fa_head(l, plen);
1143 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1144 }
1145
1146 /* Now fa, if non-NULL, points to the first fib alias
1147 * with the same keys [prefix,tos,priority], if such key already
1148 * exists or to the node before which we will insert new one.
1149 *
1150 * If fa is NULL, we will need to allocate a new one and
1151 * insert to the head of f.
1152 *
1153 * If f is NULL, no fib node matched the destination key
1154 * and we need to allocate a new one of those as well.
1155 */
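/* Summary of the netlink flag handling below (illustrative comment
 * only, not from the original source):
 *
 *   NLM_F_EXCL     - a matching [prefix,tos,priority] entry exists -> -EEXIST
 *   NLM_F_REPLACE  - swap the fib_info of the existing alias in place
 *   NLM_F_APPEND   - add the new alias after the existing matches
 *   NLM_F_CREATE   - required to add an entry that does not exist yet
 */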
1156
1157 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1158 struct fib_alias *fa_orig;
1159
1160 err = -EEXIST;
1161 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1162 goto out;
1163
1164 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1165 struct fib_info *fi_drop;
1166 u8 state;
1167
1168 err = -ENOBUFS;
1169 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1170 if (new_fa == NULL)
1171 goto out;
1172
1173 fi_drop = fa->fa_info;
1174 new_fa->fa_tos = fa->fa_tos;
1175 new_fa->fa_info = fi;
1176 new_fa->fa_type = type;
1177 new_fa->fa_scope = r->rtm_scope;
1178 state = fa->fa_state;
1179 new_fa->fa_state &= ~FA_S_ACCESSED;
1180
1181 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1182 alias_free_mem_rcu(fa);
1183
1184 fib_release_info(fi_drop);
1185 if (state & FA_S_ACCESSED)
1186 rt_cache_flush(-1);
1187
1188 goto succeeded;
1189 }
1190 /* Error if we find a perfect match which
1191 * uses the same scope, type, and nexthop
1192 * information.
1193 */
1194 fa_orig = fa;
1195 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1196 if (fa->fa_tos != tos)
1197 break;
1198 if (fa->fa_info->fib_priority != fi->fib_priority)
1199 break;
1200 if (fa->fa_type == type &&
1201 fa->fa_scope == r->rtm_scope &&
1202 fa->fa_info == fi) {
1203 goto out;
1204 }
1205 }
1206 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1207 fa = fa_orig;
1208 }
1209 err = -ENOENT;
1210 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1211 goto out;
1212
1213 err = -ENOBUFS;
1214 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1215 if (new_fa == NULL)
1216 goto out;
1217
1218 new_fa->fa_info = fi;
1219 new_fa->fa_tos = tos;
1220 new_fa->fa_type = type;
1221 new_fa->fa_scope = r->rtm_scope;
1222 new_fa->fa_state = 0;
1223 /*
1224 * Insert new entry to the list.
1225 */
1226
1227 if (!fa_head) {
1228 err = 0;
1229 fa_head = fib_insert_node(t, &err, key, plen);
1230 if (err)
1231 goto out_free_new_fa;
1232 }
1233
1234 list_add_tail_rcu(&new_fa->fa_list,
1235 (fa ? &fa->fa_list : fa_head));
1236
1237 rt_cache_flush(-1);
1238 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1239 succeeded:
1240 return 0;
1241
1242 out_free_new_fa:
1243 kmem_cache_free(fn_alias_kmem, new_fa);
1244 out:
1245 fib_release_info(fi);
1246 err:
1247 return err;
1248 }
1249
1250
1251 /* should be called with rcu_read_lock held */
1252 static inline int check_leaf(struct trie *t, struct leaf *l,
1253 t_key key, int *plen, const struct flowi *flp,
1254 struct fib_result *res)
1255 {
1256 int err, i;
1257 t_key mask;
1258 struct leaf_info *li;
1259 struct hlist_head *hhead = &l->list;
1260 struct hlist_node *node;
1261
1262 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1263 i = li->plen;
1264 mask = ntohl(inet_make_mask(i));
1265 if (l->key != (key & mask))
1266 continue;
1267
1268 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1269 *plen = i;
1270 #ifdef CONFIG_IP_FIB_TRIE_STATS
1271 t->stats.semantic_match_passed++;
1272 #endif
1273 return err;
1274 }
1275 #ifdef CONFIG_IP_FIB_TRIE_STATS
1276 t->stats.semantic_match_miss++;
1277 #endif
1278 }
1279 return 1;
1280 }
1281
1282 static int
1283 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1284 {
1285 struct trie *t = (struct trie *) tb->tb_data;
1286 int plen, ret = 0;
1287 struct node *n;
1288 struct tnode *pn;
1289 int pos, bits;
1290 t_key key = ntohl(flp->fl4_dst);
1291 int chopped_off;
1292 t_key cindex = 0;
1293 int current_prefix_length = KEYLENGTH;
1294 struct tnode *cn;
1295 t_key node_prefix, key_prefix, pref_mismatch;
1296 int mp;
1297
1298 rcu_read_lock();
1299
1300 n = rcu_dereference(t->trie);
1301 if (!n)
1302 goto failed;
1303
1304 #ifdef CONFIG_IP_FIB_TRIE_STATS
1305 t->stats.gets++;
1306 #endif
1307
1308 /* Just a leaf? */
1309 if (IS_LEAF(n)) {
1310 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1311 goto found;
1312 goto failed;
1313 }
1314 pn = (struct tnode *) n;
1315 chopped_off = 0;
1316
1317 while (pn) {
1318 pos = pn->pos;
1319 bits = pn->bits;
1320
1321 if (!chopped_off)
1322 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1323
1324 n = tnode_get_child(pn, cindex);
1325
1326 if (n == NULL) {
1327 #ifdef CONFIG_IP_FIB_TRIE_STATS
1328 t->stats.null_node_hit++;
1329 #endif
1330 goto backtrace;
1331 }
1332
1333 if (IS_LEAF(n)) {
1334 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1335 goto found;
1336 else
1337 goto backtrace;
1338 }
1339
1340 #define HL_OPTIMIZE
1341 #ifdef HL_OPTIMIZE
1342 cn = (struct tnode *)n;
1343
1344 /*
1345 * It's a tnode, and we can do some extra checks here if we
1346 * like, to avoid descending into a dead-end branch.
1347 * This tnode is in the parent's child array at index
1348 * key[p_pos..p_pos+p_bits] but potentially with some bits
1349 * chopped off, so in reality the index may be just a
1350 * subprefix, padded with zero at the end.
1351 * We can also take a look at any skipped bits in this
1352 * tnode - everything up to p_pos is supposed to be ok,
1353 * and the non-chopped bits of the index (see previous
1354 * paragraph) are also guaranteed ok, but the rest is
1355 * considered unknown.
1356 *
1357 * The skipped bits are key[pos+bits..cn->pos].
1358 */
1359
1360 /* If current_prefix_length < pos+bits, we are already doing
1361 * actual prefix matching, which means everything from
1362 * pos+(bits-chopped_off) onward must be zero along some
1363 * branch of this subtree - otherwise there is *no* valid
1364 * prefix present. Here we can only check the skipped
1365 * bits. Remember, since we have already indexed into the
1366 * parent's child array, we know that the bits we chopped off
1367 * *are* zero.
1368 */
1369
1370 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1371
1372 if (current_prefix_length < pos+bits) {
1373 if (tkey_extract_bits(cn->key, current_prefix_length,
1374 cn->pos - current_prefix_length) != 0 ||
1375 !(cn->child[0]))
1376 goto backtrace;
1377 }
1378
1379 /*
1380 * If chopped_off=0, the index is fully validated and we
1381 * only need to look at the skipped bits for this, the new,
1382 * tnode. What we actually want to do is to find out if
1383 * these skipped bits match our key perfectly, or if we will
1384 * have to count on finding a matching prefix further down,
1385 * because if we do, we would like to have some way of
1386 * verifying the existence of such a prefix at this point.
1387 */
1388
1389 /* The only thing we can do at this point is to verify that
1390 * any such matching prefix can indeed be a prefix to our
1391 * key, and if the bits in the node we are inspecting that
1392 * do not match our key are not ZERO, this cannot be true.
1393 * Thus, find out where there is a mismatch (before cn->pos)
1394 * and verify that all the mismatching bits are zero in the
1395 * new tnode's key.
1396 */
1397
1398 /* Note: We aren't very concerned about the piece of the key
1399 * that precedes pn->pos+pn->bits, since these have already been
1400 * checked. The bits after cn->pos aren't checked since these are
1401 * by definition "unknown" at this point. Thus, what we want to
1402 * see is if we are about to enter the "prefix matching" state,
1403 * and in that case verify that the skipped bits that will prevail
1404 * throughout this subtree are zero, as they have to be if we are
1405 * to find a matching prefix.
1406 */
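/* Worked example (illustrative, not from the original source): say
 * cn->pos == 20 and the search key first disagrees with cn->key at
 * bit 13. Then pref_mismatch has its highest set bit at 13, so
 * mp == 13. If bits 13..19 of cn->key are all zero, a shorter
 * matching prefix may still exist below: current_prefix_length is
 * cut down to 13 (if it was not already shorter). Otherwise no
 * prefix of the key can live in this subtree and we backtrack.
 */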
1407
1408 node_prefix = MASK_PFX(cn->key, cn->pos);
1409 key_prefix = MASK_PFX(key, cn->pos);
1410 pref_mismatch = key_prefix^node_prefix;
1411 mp = 0;
1412
1413 /* In short: If skipped bits in this node do not match the search
1414 * key, enter the "prefix matching" state directly.
1415 */
1416 if (pref_mismatch) {
1417 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1418 mp++;
1419 pref_mismatch = pref_mismatch <<1;
1420 }
1421 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1422
1423 if (key_prefix != 0)
1424 goto backtrace;
1425
1426 if (current_prefix_length >= cn->pos)
1427 current_prefix_length = mp;
1428 }
1429 #endif
1430 pn = (struct tnode *)n; /* Descend */
1431 chopped_off = 0;
1432 continue;
1433
1434 backtrace:
1435 chopped_off++;
1436
1437 /* Skip bits that are zero, as they don't change the child key (cindex) */
1438 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1439 chopped_off++;
1440
1441 /* Decrease current_... with bits chopped off */
1442 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1443 current_prefix_length = pn->pos + pn->bits - chopped_off;
1444
1445 /*
1446 * Either we do the actual chop-off here, or, if we have chopped
1447 * off all bits in this tnode, we walk up to our parent.
1448 */
1449
1450 if (chopped_off <= pn->bits) {
1451 cindex &= ~(1 << (chopped_off-1));
1452 } else {
1453 if (NODE_PARENT(pn) == NULL)
1454 goto failed;
1455
1456 /* Get Child's index */
1457 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1458 pn = NODE_PARENT(pn);
1459 chopped_off = 0;
1460
1461 #ifdef CONFIG_IP_FIB_TRIE_STATS
1462 t->stats.backtrack++;
1463 #endif
1464 goto backtrace;
1465 }
1466 }
1467 failed:
1468 ret = 1;
1469 found:
1470 rcu_read_unlock();
1471 return ret;
1472 }
1473
1474 /* only called from updater side */
1475 static int trie_leaf_remove(struct trie *t, t_key key)
1476 {
1477 t_key cindex;
1478 struct tnode *tp = NULL;
1479 struct node *n = t->trie;
1480 struct leaf *l;
1481
1482 pr_debug("entering trie_leaf_remove(%p)\n", n);
1483
1484 /* Note that in the case of skipped bits, those bits are *not* checked!
1485 * When we finish this, we will have NULL or a T_LEAF, and the
1486 * T_LEAF may or may not match our key.
1487 */
1488
1489 while (n != NULL && IS_TNODE(n)) {
1490 struct tnode *tn = (struct tnode *) n;
1491 check_tnode(tn);
1492 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1493
1494 BUG_ON(n && NODE_PARENT(n) != tn);
1495 }
1496 l = (struct leaf *) n;
1497
1498 if (!n || !tkey_equals(l->key, key))
1499 return 0;
1500
1501 /*
1502 * Key found.
1503 * Remove the leaf and rebalance the tree
1504 */
1505
1506 t->revision++;
1507 t->size--;
1508
1509 preempt_disable();
1510 tp = NODE_PARENT(n);
1511 tnode_free((struct tnode *) n);
1512
1513 if (tp) {
1514 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1515 put_child(t, (struct tnode *)tp, cindex, NULL);
1516 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1517 } else
1518 rcu_assign_pointer(t->trie, NULL);
1519 preempt_enable();
1520
1521 return 1;
1522 }
1523
1524 static int
1525 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1526 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1527 {
1528 struct trie *t = (struct trie *) tb->tb_data;
1529 u32 key, mask;
1530 int plen = r->rtm_dst_len;
1531 u8 tos = r->rtm_tos;
1532 struct fib_alias *fa, *fa_to_delete;
1533 struct list_head *fa_head;
1534 struct leaf *l;
1535 struct leaf_info *li;
1536
1537
1538 if (plen > 32)
1539 return -EINVAL;
1540
1541 key = 0;
1542 if (rta->rta_dst)
1543 memcpy(&key, rta->rta_dst, 4);
1544
1545 key = ntohl(key);
1546 mask = ntohl(inet_make_mask(plen));
1547
1548 if (key & ~mask)
1549 return -EINVAL;
1550
1551 key = key & mask;
1552 l = fib_find_node(t, key);
1553
1554 if (!l)
1555 return -ESRCH;
1556
1557 fa_head = get_fa_head(l, plen);
1558 fa = fib_find_alias(fa_head, tos, 0);
1559
1560 if (!fa)
1561 return -ESRCH;
1562
1563 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1564
1565 fa_to_delete = NULL;
1566 fa_head = fa->fa_list.prev;
1567
1568 list_for_each_entry(fa, fa_head, fa_list) {
1569 struct fib_info *fi = fa->fa_info;
1570
1571 if (fa->fa_tos != tos)
1572 break;
1573
1574 if ((!r->rtm_type ||
1575 fa->fa_type == r->rtm_type) &&
1576 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1577 fa->fa_scope == r->rtm_scope) &&
1578 (!r->rtm_protocol ||
1579 fi->fib_protocol == r->rtm_protocol) &&
1580 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1581 fa_to_delete = fa;
1582 break;
1583 }
1584 }
1585
1586 if (!fa_to_delete)
1587 return -ESRCH;
1588
1589 fa = fa_to_delete;
1590 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1591
1592 l = fib_find_node(t, key);
1593 li = find_leaf_info(&l->list, plen);
1594
1595 list_del_rcu(&fa->fa_list);
1596
1597 if (list_empty(fa_head)) {
1598 hlist_del_rcu(&li->hlist);
1599 free_leaf_info(li);
1600 }
1601
1602 if (hlist_empty(&l->list))
1603 trie_leaf_remove(t, key);
1604
1605 if (fa->fa_state & FA_S_ACCESSED)
1606 rt_cache_flush(-1);
1607
1608 fib_release_info(fa->fa_info);
1609 alias_free_mem_rcu(fa);
1610 return 0;
1611 }
1612
1613 static int trie_flush_list(struct trie *t, struct list_head *head)
1614 {
1615 struct fib_alias *fa, *fa_node;
1616 int found = 0;
1617
1618 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1619 struct fib_info *fi = fa->fa_info;
1620
1621 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1622 list_del_rcu(&fa->fa_list);
1623 fib_release_info(fa->fa_info);
1624 alias_free_mem_rcu(fa);
1625 found++;
1626 }
1627 }
1628 return found;
1629 }
1630
1631 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1632 {
1633 int found = 0;
1634 struct hlist_head *lih = &l->list;
1635 struct hlist_node *node, *tmp;
1636 struct leaf_info *li = NULL;
1637
1638 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1639 found += trie_flush_list(t, &li->falh);
1640
1641 if (list_empty(&li->falh)) {
1642 hlist_del_rcu(&li->hlist);
1643 free_leaf_info(li);
1644 }
1645 }
1646 return found;
1647 }
1648
1649 /* rcu_read_lock needs to be held by the caller on the read side */
1650
1651 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1652 {
1653 struct node *c = (struct node *) thisleaf;
1654 struct tnode *p;
1655 int idx;
1656 struct node *trie = rcu_dereference(t->trie);
1657
1658 if (c == NULL) {
1659 if (trie == NULL)
1660 return NULL;
1661
1662 if (IS_LEAF(trie)) /* trie w. just a leaf */
1663 return (struct leaf *) trie;
1664
1665 p = (struct tnode*) trie; /* Start */
1666 } else
1667 p = (struct tnode *) NODE_PARENT(c);
1668
1669 while (p) {
1670 int pos, last;
1671
1672 /* Find the next child of the parent */
1673 if (c)
1674 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1675 else
1676 pos = 0;
1677
1678 last = 1 << p->bits;
1679 for (idx = pos; idx < last ; idx++) {
1680 c = rcu_dereference(p->child[idx]);
1681
1682 if (!c)
1683 continue;
1684
1685 /* Descend if tnode */
1686 while (IS_TNODE(c)) {
1687 p = (struct tnode *) c;
1688 idx = 0;
1689
1690 /* Rightmost non-NULL branch */
1691 if (p && IS_TNODE(p))
1692 while (idx < (1<<p->bits) &&
1693 !(c = rcu_dereference(p->child[idx]))) idx++;
1694
1695 /* Done with this tnode? */
1696 if (idx >= (1 << p->bits) || !c)
1697 goto up;
1698 }
1699 return (struct leaf *) c;
1700 }
1701 up:
1702 /* No more children, go up one step */
1703 c = (struct node *) p;
1704 p = (struct tnode *) NODE_PARENT(p);
1705 }
1706 return NULL; /* Ready. Root of trie */
1707 }
1708
1709 static int fn_trie_flush(struct fib_table *tb)
1710 {
1711 struct trie *t = (struct trie *) tb->tb_data;
1712 struct leaf *ll = NULL, *l = NULL;
1713 int found = 0, h;
1714
1715 t->revision++;
1716
1717 rcu_read_lock();
1718 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1719 found += trie_flush_leaf(t, l);
1720
1721 if (ll && hlist_empty(&ll->list))
1722 trie_leaf_remove(t, ll->key);
1723 ll = l;
1724 }
1725 rcu_read_unlock();
1726
1727 if (ll && hlist_empty(&ll->list))
1728 trie_leaf_remove(t, ll->key);
1729
1730 pr_debug("trie_flush found=%d\n", found);
1731 return found;
1732 }
1733
1734 static int trie_last_dflt = -1;
1735
1736 static void
1737 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1738 {
1739 struct trie *t = (struct trie *) tb->tb_data;
1740 int order, last_idx;
1741 struct fib_info *fi = NULL;
1742 struct fib_info *last_resort;
1743 struct fib_alias *fa = NULL;
1744 struct list_head *fa_head;
1745 struct leaf *l;
1746
1747 last_idx = -1;
1748 last_resort = NULL;
1749 order = -1;
1750
1751 rcu_read_lock();
1752
1753 l = fib_find_node(t, 0);
1754 if (!l)
1755 goto out;
1756
1757 fa_head = get_fa_head(l, 0);
1758 if (!fa_head)
1759 goto out;
1760
1761 if (list_empty(fa_head))
1762 goto out;
1763
1764 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1765 struct fib_info *next_fi = fa->fa_info;
1766
1767 if (fa->fa_scope != res->scope ||
1768 fa->fa_type != RTN_UNICAST)
1769 continue;
1770
1771 if (next_fi->fib_priority > res->fi->fib_priority)
1772 break;
1773 if (!next_fi->fib_nh[0].nh_gw ||
1774 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1775 continue;
1776 fa->fa_state |= FA_S_ACCESSED;
1777
1778 if (fi == NULL) {
1779 if (next_fi != res->fi)
1780 break;
1781 } else if (!fib_detect_death(fi, order, &last_resort,
1782 &last_idx, &trie_last_dflt)) {
1783 if (res->fi)
1784 fib_info_put(res->fi);
1785 res->fi = fi;
1786 atomic_inc(&fi->fib_clntref);
1787 trie_last_dflt = order;
1788 goto out;
1789 }
1790 fi = next_fi;
1791 order++;
1792 }
1793 if (order <= 0 || fi == NULL) {
1794 trie_last_dflt = -1;
1795 goto out;
1796 }
1797
1798 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1799 if (res->fi)
1800 fib_info_put(res->fi);
1801 res->fi = fi;
1802 atomic_inc(&fi->fib_clntref);
1803 trie_last_dflt = order;
1804 goto out;
1805 }
1806 if (last_idx >= 0) {
1807 if (res->fi)
1808 fib_info_put(res->fi);
1809 res->fi = last_resort;
1810 if (last_resort)
1811 atomic_inc(&last_resort->fib_clntref);
1812 }
1813 trie_last_dflt = last_idx;
1814 out:;
1815 rcu_read_unlock();
1816 }
1817
1818 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1819 struct sk_buff *skb, struct netlink_callback *cb)
1820 {
1821 int i, s_i;
1822 struct fib_alias *fa;
1823
1824 u32 xkey = htonl(key);
1825
1826 s_i = cb->args[3];
1827 i = 0;
1828
1829 /* rcu_read_lock is held by the caller */
1830
1831 list_for_each_entry_rcu(fa, fah, fa_list) {
1832 if (i < s_i) {
1833 i++;
1834 continue;
1835 }
1836 if (fa->fa_info == NULL) {
1837 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1838 i++;
1839 continue;
1840 }
1841 if (fa->fa_info->fib_nh == NULL) {
1842 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1843 i++;
1844 continue;
1845 }
1846
1847 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1848 cb->nlh->nlmsg_seq,
1849 RTM_NEWROUTE,
1850 tb->tb_id,
1851 fa->fa_type,
1852 fa->fa_scope,
1853 &xkey,
1854 plen,
1855 fa->fa_tos,
1856 fa->fa_info, 0) < 0) {
1857 cb->args[3] = i;
1858 return -1;
1859 }
1860 i++;
1861 }
1862 cb->args[3] = i;
1863 return skb->len;
1864 }
1865
1866 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1867 struct netlink_callback *cb)
1868 {
1869 int h, s_h;
1870 struct list_head *fa_head;
1871 struct leaf *l = NULL;
1872
1873 s_h = cb->args[2];
1874
1875 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1876 if (h < s_h)
1877 continue;
1878 if (h > s_h)
1879 memset(&cb->args[3], 0,
1880 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1881
1882 fa_head = get_fa_head(l, plen);
1883
1884 if (!fa_head)
1885 continue;
1886
1887 if (list_empty(fa_head))
1888 continue;
1889
1890 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1891 cb->args[2] = h;
1892 return -1;
1893 }
1894 }
1895 cb->args[2] = h;
1896 return skb->len;
1897 }
1898
1899 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1900 {
1901 int m, s_m;
1902 struct trie *t = (struct trie *) tb->tb_data;
1903
1904 s_m = cb->args[1];
1905
1906 rcu_read_lock();
1907 for (m = 0; m <= 32; m++) {
1908 if (m < s_m)
1909 continue;
1910 if (m > s_m)
1911 memset(&cb->args[2], 0,
1912 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1913
1914 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1915 cb->args[1] = m;
1916 goto out;
1917 }
1918 }
1919 rcu_read_unlock();
1920 cb->args[1] = m;
1921 return skb->len;
1922 out:
1923 rcu_read_unlock();
1924 return -1;
1925 }
1926
1927 /* Fix more generic FIB names for init later */
1928
1929 #ifdef CONFIG_IP_MULTIPLE_TABLES
1930 struct fib_table * fib_hash_init(int id)
1931 #else
1932 struct fib_table * __init fib_hash_init(int id)
1933 #endif
1934 {
1935 struct fib_table *tb;
1936 struct trie *t;
1937
1938 if (fn_alias_kmem == NULL)
1939 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1940 sizeof(struct fib_alias),
1941 0, SLAB_HWCACHE_ALIGN,
1942 NULL, NULL);
1943
1944 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1945 GFP_KERNEL);
1946 if (tb == NULL)
1947 return NULL;
1948
1949 tb->tb_id = id;
1950 tb->tb_lookup = fn_trie_lookup;
1951 tb->tb_insert = fn_trie_insert;
1952 tb->tb_delete = fn_trie_delete;
1953 tb->tb_flush = fn_trie_flush;
1954 tb->tb_select_default = fn_trie_select_default;
1955 tb->tb_dump = fn_trie_dump;
1956 memset(tb->tb_data, 0, sizeof(struct trie));
1957
1958 t = (struct trie *) tb->tb_data;
1959
1960 trie_init(t);
1961
1962 if (id == RT_TABLE_LOCAL)
1963 trie_local = t;
1964 else if (id == RT_TABLE_MAIN)
1965 trie_main = t;
1966
1967 if (id == RT_TABLE_LOCAL)
1968 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1969
1970 return tb;
1971 }
1972
1973 #ifdef CONFIG_PROC_FS
1974 /* Depth first Trie walk iterator */
1975 struct fib_trie_iter {
1976 struct tnode *tnode;
1977 struct trie *trie;
1978 unsigned index;
1979 unsigned depth;
1980 };
1981
1982 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1983 {
1984 struct tnode *tn = iter->tnode;
1985 unsigned cindex = iter->index;
1986 struct tnode *p;
1987
1988 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1989 iter->tnode, iter->index, iter->depth);
1990 rescan:
1991 while (cindex < (1<<tn->bits)) {
1992 struct node *n = tnode_get_child(tn, cindex);
1993
1994 if (n) {
1995 if (IS_LEAF(n)) {
1996 iter->tnode = tn;
1997 iter->index = cindex + 1;
1998 } else {
1999 /* push down one level */
2000 iter->tnode = (struct tnode *) n;
2001 iter->index = 0;
2002 ++iter->depth;
2003 }
2004 return n;
2005 }
2006
2007 ++cindex;
2008 }
2009
2010 /* Current node exhausted, pop back up */
2011 p = NODE_PARENT(tn);
2012 if (p) {
2013 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2014 tn = p;
2015 --iter->depth;
2016 goto rescan;
2017 }
2018
2019 /* got root? */
2020 return NULL;
2021 }
2022
2023 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2024 struct trie *t)
2025 {
2026 struct node *n = rcu_dereference(t->trie);
2027
2028 if (n && IS_TNODE(n)) {
2029 iter->tnode = (struct tnode *) n;
2030 iter->trie = t;
2031 iter->index = 0;
2032 iter->depth = 0;
2033 return n;
2034 }
2035 return NULL;
2036 }
2037
2038 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2039 {
2040 struct node *n;
2041 struct fib_trie_iter iter;
2042
2043 memset(s, 0, sizeof(*s));
2044
2045 rcu_read_lock();
2046 for (n = fib_trie_get_first(&iter, t); n;
2047 n = fib_trie_get_next(&iter)) {
2048 if (IS_LEAF(n)) {
2049 s->leaves++;
2050 s->totdepth += iter.depth;
2051 if (iter.depth > s->maxdepth)
2052 s->maxdepth = iter.depth;
2053 } else {
2054 const struct tnode *tn = (const struct tnode *) n;
2055 int i;
2056
2057 s->tnodes++;
2058 s->nodesizes[tn->bits]++;
2059 for (i = 0; i < (1<<tn->bits); i++)
2060 if (!tn->child[i])
2061 s->nullpointers++;
2062 }
2063 }
2064 rcu_read_unlock();
2065 }
2066
2067 /*
2068 * This outputs /proc/net/fib_triestats
2069 */
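/* Example of the resulting output (numbers are made up, for
 * illustration only):
 *
 *      Aver depth:     2.23
 *      Max depth:      6
 *      Leaves:         1024
 *      Internal nodes: 312
 *        1: 150  2: 100  4: 62
 *      Pointers: 1692
 * Null ptrs: 357
 * Total size: 36 kB
 */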
2070 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2071 {
2072 unsigned i, max, pointers, bytes, avdepth;
2073
2074 if (stat->leaves)
2075 avdepth = stat->totdepth*100 / stat->leaves;
2076 else
2077 avdepth = 0;
2078
2079 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2080 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2081
2082 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2083
2084 bytes = sizeof(struct leaf) * stat->leaves;
2085 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2086 bytes += sizeof(struct tnode) * stat->tnodes;
2087
2088 max = MAX_CHILDS-1;
2089 while (max > 0 && stat->nodesizes[max] == 0)
2090 max--;
2091
2092 pointers = 0;
2093 for (i = 1; i <= max; i++)
2094 if (stat->nodesizes[i] != 0) {
2095 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2096 pointers += (1<<i) * stat->nodesizes[i];
2097 }
2098 seq_putc(seq, '\n');
2099 seq_printf(seq, "\tPointers: %d\n", pointers);
2100
2101 bytes += sizeof(struct node *) * pointers;
2102 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2103 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2104
2105 #ifdef CONFIG_IP_FIB_TRIE_STATS
2106 seq_printf(seq, "Counters:\n---------\n");
2107 seq_printf(seq,"gets = %d\n", t->stats.gets);
2108 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2109 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2110 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2111 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2112 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2113 #ifdef CLEAR_STATS
2114 memset(&(t->stats), 0, sizeof(t->stats));
2115 #endif
2116 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2117 }
2118
2119 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2120 {
2121 struct trie_stat *stat;
2122
2123 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2124 if (!stat)
2125 return -ENOMEM;
2126
2127 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2128 sizeof(struct leaf), sizeof(struct tnode));
2129
2130 if (trie_local) {
2131 seq_printf(seq, "Local:\n");
2132 trie_collect_stats(trie_local, stat);
2133 trie_show_stats(seq, trie_local, stat);
2134 }
2135
2136 if (trie_main) {
2137 seq_printf(seq, "Main:\n");
2138 trie_collect_stats(trie_main, stat);
2139 trie_show_stats(seq, trie_main, stat);
2140 }
2141 kfree(stat);
2142
2143 return 0;
2144 }
2145
2146 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2147 {
2148 return single_open(file, fib_triestat_seq_show, NULL);
2149 }
2150
2151 static struct file_operations fib_triestat_fops = {
2152 .owner = THIS_MODULE,
2153 .open = fib_triestat_seq_open,
2154 .read = seq_read,
2155 .llseek = seq_lseek,
2156 .release = single_release,
2157 };
2158
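/*
 * Return the pos'th node of a walk over both tries (local first, then
 * main), or NULL when pos is past the end.  The walk restarts from the
 * beginning on every call, so this is O(pos), which is acceptable for
 * seq_file output.
 */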
2159 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2160 loff_t pos)
2161 {
2162 loff_t idx = 0;
2163 struct node *n;
2164
2165 for (n = fib_trie_get_first(iter, trie_local);
2166 n; ++idx, n = fib_trie_get_next(iter)) {
2167 if (pos == idx)
2168 return n;
2169 }
2170
2171 for (n = fib_trie_get_first(iter, trie_main);
2172 n; ++idx, n = fib_trie_get_next(iter)) {
2173 if (pos == idx)
2174 return n;
2175 }
2176 return NULL;
2177 }
2178
2179 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2180 {
2181 rcu_read_lock();
2182 if (*pos == 0)
2183 return SEQ_START_TOKEN;
2184 return fib_trie_get_idx(seq->private, *pos - 1);
2185 }
2186
2187 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2188 {
2189 struct fib_trie_iter *iter = seq->private;
2190 void *l = v;
2191
2192 ++*pos;
2193 if (v == SEQ_START_TOKEN)
2194 return fib_trie_get_idx(iter, 0);
2195
2196 v = fib_trie_get_next(iter);
2197 BUG_ON(v == l);
2198 if (v)
2199 return v;
2200
2201 /* continue scan in next trie */
2202 if (iter->trie == trie_local)
2203 return fib_trie_get_first(iter, trie_main);
2204
2205 return NULL;
2206 }
2207
2208 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2209 {
2210 rcu_read_unlock();
2211 }
2212
2213 static void seq_indent(struct seq_file *seq, int n)
2214 {
2215 while (n-- > 0) seq_puts(seq, " ");
2216 }
2217
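/*
 * Human readable name of a route scope.  Known scopes return constant
 * strings; unknown ones are formatted into a shared static buffer, so
 * that return value is not re-entrant.
 */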
2218 static inline const char *rtn_scope(enum rt_scope_t s)
2219 {
2220 static char buf[32];
2221
2222 switch(s) {
2223 case RT_SCOPE_UNIVERSE: return "universe";
2224 case RT_SCOPE_SITE: return "site";
2225 case RT_SCOPE_LINK: return "link";
2226 case RT_SCOPE_HOST: return "host";
2227 case RT_SCOPE_NOWHERE: return "nowhere";
2228 default:
2229 snprintf(buf, sizeof(buf), "scope=%d", s);
2230 return buf;
2231 }
2232 }
2233
2234 static const char *rtn_type_names[__RTN_MAX] = {
2235 [RTN_UNSPEC] = "UNSPEC",
2236 [RTN_UNICAST] = "UNICAST",
2237 [RTN_LOCAL] = "LOCAL",
2238 [RTN_BROADCAST] = "BROADCAST",
2239 [RTN_ANYCAST] = "ANYCAST",
2240 [RTN_MULTICAST] = "MULTICAST",
2241 [RTN_BLACKHOLE] = "BLACKHOLE",
2242 [RTN_UNREACHABLE] = "UNREACHABLE",
2243 [RTN_PROHIBIT] = "PROHIBIT",
2244 [RTN_THROW] = "THROW",
2245 [RTN_NAT] = "NAT",
2246 [RTN_XRESOLVE] = "XRESOLVE",
2247 };
2248
2249 static inline const char *rtn_type(unsigned t)
2250 {
2251 static char buf[32];
2252
2253 if (t < __RTN_MAX && rtn_type_names[t])
2254 return rtn_type_names[t];
2255 snprintf(buf, sizeof(buf), "type %d", t);
2256 return buf;
2257 }
2258
2259 /* Pretty print the trie */
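/*
 * Example of the layout (illustrative addresses):
 *
 *	<main>:
 *	 +-- 0.0.0.0/0
 *	    |-- 10.1.2.0
 *	       /24 universe UNICAST
 */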
2260 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2261 {
2262 const struct fib_trie_iter *iter = seq->private;
2263 struct node *n = v;
2264
2265 if (v == SEQ_START_TOKEN)
2266 return 0;
2267
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *) n;
2270 t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2271
2272 if (!NODE_PARENT(n)) {
2273 if (iter->trie == trie_local)
2274 seq_puts(seq, "<local>:\n");
2275 else
2276 seq_puts(seq, "<main>:\n");
2277 } else {
2278 seq_indent(seq, iter->depth-1);
2279 seq_printf(seq, " +-- %d.%d.%d.%d/%d\n",
2280 NIPQUAD(prf), tn->pos);
2281 }
2282 } else {
2283 struct leaf *l = (struct leaf *) n;
2284 int i;
2285 u32 val = ntohl(l->key);
2286
2287 seq_indent(seq, iter->depth);
2288 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2289 for (i = 32; i >= 0; i--) {
2290 struct leaf_info *li = find_leaf_info(&l->list, i);
2291 if (li) {
2292 struct fib_alias *fa;
2293 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2294 seq_indent(seq, iter->depth+1);
2295 seq_printf(seq, " /%d %s %s", i,
2296 rtn_scope(fa->fa_scope),
2297 rtn_type(fa->fa_type));
2298 if (fa->fa_tos)
2299 seq_printf(seq, " tos=%d",
2300 fa->fa_tos);
2301 seq_putc(seq, '\n');
2302 }
2303 }
2304 }
2305 }
2306
2307 return 0;
2308 }
2309
2310 static struct seq_operations fib_trie_seq_ops = {
2311 .start = fib_trie_seq_start,
2312 .next = fib_trie_seq_next,
2313 .stop = fib_trie_seq_stop,
2314 .show = fib_trie_seq_show,
2315 };
2316
2317 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2318 {
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2322
2323 if (!s)
2324 goto out;
2325
2326 rc = seq_open(file, &fib_trie_seq_ops);
2327 if (rc)
2328 goto out_kfree;
2329
2330 seq = file->private_data;
2331 seq->private = s;
2332 memset(s, 0, sizeof(*s));
2333 out:
2334 return rc;
2335 out_kfree:
2336 kfree(s);
2337 goto out;
2338 }
2339
2340 static struct file_operations fib_trie_fops = {
2341 .owner = THIS_MODULE,
2342 .open = fib_trie_seq_open,
2343 .read = seq_read,
2344 .llseek = seq_lseek,
2345 .release = seq_release_private,
2346 };
2347
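/*
 * Map a route to the legacy RTF_* flag word shown in /proc/net/route:
 * unreachable/prohibit routes get RTF_REJECT, a nexthop gateway adds
 * RTF_GATEWAY, a /32 mask adds RTF_HOST, and every route printed is
 * RTF_UP.
 */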
2348 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2349 {
2350 static unsigned type2flags[RTN_MAX + 1] = {
2351 [RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2352 };
2353 unsigned flags = type2flags[type];
2354
2355 if (fi && fi->fib_nh->nh_gw)
2356 flags |= RTF_GATEWAY;
2357 if (mask == 0xFFFFFFFF)
2358 flags |= RTF_HOST;
2359 flags |= RTF_UP;
2360 return flags;
2361 }
2362
2363 /*
2364 * This outputs /proc/net/route.
2365 * The format of the file is not supposed to be changed
2366 * and needs to be the same as the fib_hash output, so that
2367 * legacy utilities are not broken.
2368 */
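/*
 * Each data line carries, tab separated:
 *	Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT
 * with Destination, Gateway and Mask as 8-digit hex, in the same byte
 * order fib_hash has always used.
 */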
2369 static int fib_route_seq_show(struct seq_file *seq, void *v)
2370 {
2371 struct leaf *l = v;
2372 int i;
2373 char bf[128];
2374
2375 if (v == SEQ_START_TOKEN) {
2376 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2377 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2378 "\tWindow\tIRTT");
2379 return 0;
2380 }
2381
2382 if (IS_TNODE(l))
2383 return 0;
2384
2385 for (i = 32; i >= 0; i--) {
2386 struct leaf_info *li = find_leaf_info(&l->list, i);
2387 struct fib_alias *fa;
2388 u32 mask, prefix;
2389
2390 if (!li)
2391 continue;
2392
2393 mask = inet_make_mask(li->plen);
2394 prefix = htonl(l->key);
2395
2396 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2397 const struct fib_info *fi = rcu_dereference(fa->fa_info);
2398 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2399
2400 if (fa->fa_type == RTN_BROADCAST
2401 || fa->fa_type == RTN_MULTICAST)
2402 continue;
2403
2404 if (fi)
2405 snprintf(bf, sizeof(bf),
2406 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2407 fi->fib_dev ? fi->fib_dev->name : "*",
2408 prefix,
2409 fi->fib_nh->nh_gw, flags, 0, 0,
2410 fi->fib_priority,
2411 mask,
2412 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2413 fi->fib_window,
2414 fi->fib_rtt >> 3);
2415 else
2416 snprintf(bf, sizeof(bf),
2417 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2418 prefix, 0, flags, 0, 0, 0,
2419 mask, 0, 0, 0);
2420
2421 seq_printf(seq, "%-127s\n", bf);
2422 }
2423 }
2424
2425 return 0;
2426 }
2427
2428 static struct seq_operations fib_route_seq_ops = {
2429 .start = fib_trie_seq_start,
2430 .next = fib_trie_seq_next,
2431 .stop = fib_trie_seq_stop,
2432 .show = fib_route_seq_show,
2433 };
2434
2435 static int fib_route_seq_open(struct inode *inode, struct file *file)
2436 {
2437 struct seq_file *seq;
2438 int rc = -ENOMEM;
2439 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2440
2441 if (!s)
2442 goto out;
2443
2444 rc = seq_open(file, &fib_route_seq_ops);
2445 if (rc)
2446 goto out_kfree;
2447
2448 seq = file->private_data;
2449 seq->private = s;
2450 memset(s, 0, sizeof(*s));
2451 out:
2452 return rc;
2453 out_kfree:
2454 kfree(s);
2455 goto out;
2456 }
2457
2458 static struct file_operations fib_route_fops = {
2459 .owner = THIS_MODULE,
2460 .open = fib_route_seq_open,
2461 .read = seq_read,
2462 .llseek = seq_lseek,
2463 .release = seq_release_private,
2464 };
2465
2466 int __init fib_proc_init(void)
2467 {
2468 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2469 goto out1;
2470
2471 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2472 goto out2;
2473
2474 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2475 goto out3;
2476
2477 return 0;
2478
2479 out3:
2480 proc_net_remove("fib_triestat");
2481 out2:
2482 proc_net_remove("fib_trie");
2483 out1:
2484 return -ENOMEM;
2485 }
2486
2487 void __init fib_proc_exit(void)
2488 {
2489 proc_net_remove("fib_trie");
2490 proc_net_remove("fib_triestat");
2491 proc_net_remove("route");
2492 }
2493
2494 #endif /* CONFIG_PROC_FS */