[NET]: Introduce inet_connection_sock
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46 #define VERSION "0.325"
47
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/sock.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
76
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
79
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85 static DEFINE_RWLOCK(fib_lock);
86
87 typedef unsigned int t_key;
88
89 #define T_TNODE 0
90 #define T_LEAF 1
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
101
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
104
105 struct node {
106 t_key key;
107 unsigned long _parent;
108 };
109
110 struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114 };
115
116 struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120 };
121
122 struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130 };
131
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
140 };
141 #endif
142
143 struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
150 };
151
152 struct trie {
153 struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156 #endif
157 int size;
158 unsigned int revision;
159 };
160
161 static int trie_debug = 0;
162
163 static int tnode_full(struct tnode *tn, struct node *n);
164 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166 static int tnode_child_length(struct tnode *tn);
167 static struct node *resize(struct trie *t, struct tnode *tn);
168 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
170 static void tnode_free(struct tnode *tn);
171 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173 extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175
176 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179 static kmem_cache_t *fn_alias_kmem;
180 static struct trie *trie_local = NULL, *trie_main = NULL;
181
182 static void trie_bug(char *err)
183 {
184 printk("Trie Bug: %s\n", err);
185 BUG();
186 }
187
188 static inline struct node *tnode_get_child(struct tnode *tn, int i)
189 {
190 if (i >= 1<<tn->bits)
191 trie_bug("tnode_get_child");
192
193 return tn->child[i];
194 }
195
196 static inline int tnode_child_length(struct tnode *tn)
197 {
198 return 1<<tn->bits;
199 }
200
201 /*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217 */
218
219 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220 {
221 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else
224 return 0;
225 }
226
227 static inline int tkey_equals(t_key a, t_key b)
228 {
229 return a == b;
230 }
231
232 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233 {
234 if (bits == 0 || offset >= KEYLENGTH)
235 return 1;
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238 }
239
240 static inline int tkey_mismatch(t_key a, int offset, t_key b)
241 {
242 t_key diff = a ^ b;
243 int i = offset;
244
245 if (!diff)
246 return 0;
247 while ((diff << i) >> (KEYLENGTH-1) == 0)
248 i++;
249 return i;
250 }
251
252 /* Candiate for fib_semantics */
253
254 static void fn_free_alias(struct fib_alias *fa)
255 {
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258 }
259
260 /*
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
264
265 Consider a node 'n' and its parent 'tp'.
266
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitaded by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
272 correct key path.
273
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
280
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
283
284 Example:
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
289
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
294
295 tp->pos = 7
296 tp->bits = 3
297 n->pos = 15
298 n->bits=4
299
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
303
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
307
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309 for the node n.
310
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
313
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
316
317
318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319 at this point.
320
321 */
322
323 static void check_tnode(struct tnode *tn)
324 {
325 if (tn && tn->pos+tn->bits > 32) {
326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
327 }
328 }
329
330 static int halve_threshold = 25;
331 static int inflate_threshold = 50;
332
333 static struct leaf *leaf_new(void)
334 {
335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
336 if (l) {
337 NODE_INIT_PARENT(l, T_LEAF);
338 INIT_HLIST_HEAD(&l->list);
339 }
340 return l;
341 }
342
343 static struct leaf_info *leaf_info_new(int plen)
344 {
345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
346 if (li) {
347 li->plen = plen;
348 INIT_LIST_HEAD(&li->falh);
349 }
350 return li;
351 }
352
353 static inline void free_leaf(struct leaf *l)
354 {
355 kfree(l);
356 }
357
358 static inline void free_leaf_info(struct leaf_info *li)
359 {
360 kfree(li);
361 }
362
363 static struct tnode *tnode_alloc(unsigned int size)
364 {
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
367 } else {
368 return (struct tnode *)
369 __get_free_pages(GFP_KERNEL, get_order(size));
370 }
371 }
372
373 static void __tnode_free(struct tnode *tn)
374 {
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
377
378 if (size <= PAGE_SIZE)
379 kfree(tn);
380 else
381 free_pages((unsigned long)tn, get_order(size));
382 }
383
384 static struct tnode* tnode_new(t_key key, int pos, int bits)
385 {
386 int nchildren = 1<<bits;
387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
388 struct tnode *tn = tnode_alloc(sz);
389
390 if (tn) {
391 memset(tn, 0, sz);
392 NODE_INIT_PARENT(tn, T_TNODE);
393 tn->pos = pos;
394 tn->bits = bits;
395 tn->key = key;
396 tn->full_children = 0;
397 tn->empty_children = 1<<bits;
398 }
399
400 if (trie_debug > 0)
401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
402 (unsigned int) (sizeof(struct node) * 1<<bits));
403 return tn;
404 }
405
406 static void tnode_free(struct tnode *tn)
407 {
408 if (!tn) {
409 trie_bug("tnode_free\n");
410 }
411 if (IS_LEAF(tn)) {
412 free_leaf((struct leaf *)tn);
413 if (trie_debug > 0 )
414 printk("FL %p \n", tn);
415 }
416 else if (IS_TNODE(tn)) {
417 __tnode_free(tn);
418 if (trie_debug > 0 )
419 printk("FT %p \n", tn);
420 }
421 else {
422 trie_bug("tnode_free\n");
423 }
424 }
425
426 /*
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6
429 */
430
431 static inline int tnode_full(struct tnode *tn, struct node *n)
432 {
433 if (n == NULL || IS_LEAF(n))
434 return 0;
435
436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
437 }
438
439 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
440 {
441 tnode_put_child_reorg(tn, i, n, -1);
442 }
443
444 /*
445 * Add a child at position i overwriting the old value.
446 * Update the value of full_children and empty_children.
447 */
448
449 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
450 {
451 struct node *chi;
452 int isfull;
453
454 if (i >= 1<<tn->bits) {
455 printk("bits=%d, i=%d\n", tn->bits, i);
456 trie_bug("tnode_put_child_reorg bits");
457 }
458 write_lock_bh(&fib_lock);
459 chi = tn->child[i];
460
461 /* update emptyChildren */
462 if (n == NULL && chi != NULL)
463 tn->empty_children++;
464 else if (n != NULL && chi == NULL)
465 tn->empty_children--;
466
467 /* update fullChildren */
468 if (wasfull == -1)
469 wasfull = tnode_full(tn, chi);
470
471 isfull = tnode_full(tn, n);
472 if (wasfull && !isfull)
473 tn->full_children--;
474
475 else if (!wasfull && isfull)
476 tn->full_children++;
477 if (n)
478 NODE_SET_PARENT(n, tn);
479
480 tn->child[i] = n;
481 write_unlock_bh(&fib_lock);
482 }
483
484 static struct node *resize(struct trie *t, struct tnode *tn)
485 {
486 int i;
487 int err = 0;
488
489 if (!tn)
490 return NULL;
491
492 if (trie_debug)
493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494 tn, inflate_threshold, halve_threshold);
495
496 /* No children */
497 if (tn->empty_children == tnode_child_length(tn)) {
498 tnode_free(tn);
499 return NULL;
500 }
501 /* One child */
502 if (tn->empty_children == tnode_child_length(tn) - 1)
503 for (i = 0; i < tnode_child_length(tn); i++) {
504
505 write_lock_bh(&fib_lock);
506 if (tn->child[i] != NULL) {
507
508 /* compress one level */
509 struct node *n = tn->child[i];
510 if (n)
511 NODE_INIT_PARENT(n, NODE_TYPE(n));
512
513 write_unlock_bh(&fib_lock);
514 tnode_free(tn);
515 return n;
516 }
517 write_unlock_bh(&fib_lock);
518 }
519 /*
520 * Double as long as the resulting node has a number of
521 * nonempty nodes that are above the threshold.
522 */
523
524 /*
525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
527 * Telecommunications, page 6:
528 * "A node is doubled if the ratio of non-empty children to all
529 * children in the *doubled* node is at least 'high'."
530 *
531 * 'high' in this instance is the variable 'inflate_threshold'. It
532 * is expressed as a percentage, so we multiply it with
533 * tnode_child_length() and instead of multiplying by 2 (since the
534 * child array will be doubled by inflate()) and multiplying
535 * the left-hand side by 100 (to handle the percentage thing) we
536 * multiply the left-hand side by 50.
537 *
538 * The left-hand side may look a bit weird: tnode_child_length(tn)
539 * - tn->empty_children is of course the number of non-null children
540 * in the current node. tn->full_children is the number of "full"
541 * children, that is non-null tnodes with a skip value of 0.
542 * All of those will be doubled in the resulting inflated tnode, so
543 * we just count them one extra time here.
544 *
545 * A clearer way to write this would be:
546 *
547 * to_be_doubled = tn->full_children;
548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
549 * tn->full_children;
550 *
551 * new_child_length = tnode_child_length(tn) * 2;
552 *
553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
554 * new_child_length;
555 * if (new_fill_factor >= inflate_threshold)
556 *
557 * ...and so on, tho it would mess up the while () loop.
558 *
559 * anyway,
560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
561 * inflate_threshold
562 *
563 * avoid a division:
564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
565 * inflate_threshold * new_child_length
566 *
567 * expand not_to_be_doubled and to_be_doubled, and shorten:
568 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 * tn->full_children ) >= inflate_threshold * new_child_length
570 *
571 * expand new_child_length:
572 * 100 * (tnode_child_length(tn) - tn->empty_children +
573 * tn->full_children ) >=
574 * inflate_threshold * tnode_child_length(tn) * 2
575 *
576 * shorten again:
577 * 50 * (tn->full_children + tnode_child_length(tn) -
578 * tn->empty_children ) >= inflate_threshold *
579 * tnode_child_length(tn)
580 *
581 */
582
583 check_tnode(tn);
584
585 err = 0;
586 while ((tn->full_children > 0 &&
587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
588 inflate_threshold * tnode_child_length(tn))) {
589
590 tn = inflate(t, tn, &err);
591
592 if (err) {
593 #ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
595 #endif
596 break;
597 }
598 }
599
600 check_tnode(tn);
601
602 /*
603 * Halve as long as the number of empty children in this
604 * node is above threshold.
605 */
606
607 err = 0;
608 while (tn->bits > 1 &&
609 100 * (tnode_child_length(tn) - tn->empty_children) <
610 halve_threshold * tnode_child_length(tn)) {
611
612 tn = halve(t, tn, &err);
613
614 if (err) {
615 #ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
617 #endif
618 break;
619 }
620 }
621
622
623 /* Only one child remains */
624
625 if (tn->empty_children == tnode_child_length(tn) - 1)
626 for (i = 0; i < tnode_child_length(tn); i++) {
627
628 write_lock_bh(&fib_lock);
629 if (tn->child[i] != NULL) {
630 /* compress one level */
631 struct node *n = tn->child[i];
632
633 if (n)
634 NODE_INIT_PARENT(n, NODE_TYPE(n));
635
636 write_unlock_bh(&fib_lock);
637 tnode_free(tn);
638 return n;
639 }
640 write_unlock_bh(&fib_lock);
641 }
642
643 return (struct node *) tn;
644 }
645
646 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
647 {
648 struct tnode *inode;
649 struct tnode *oldtnode = tn;
650 int olen = tnode_child_length(tn);
651 int i;
652
653 if (trie_debug)
654 printk("In inflate\n");
655
656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657
658 if (!tn) {
659 *err = -ENOMEM;
660 return oldtnode;
661 }
662
663 /*
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
666 * fails. In case of failure we return the oldnode and inflate
667 * of tnode is ignored.
668 */
669
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672
673 if (inode &&
674 IS_TNODE(inode) &&
675 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) {
677 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
680
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
683
684 if (!left) {
685 *err = -ENOMEM;
686 break;
687 }
688
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
692 if (!right) {
693 *err = -ENOMEM;
694 break;
695 }
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
702 if (*err) {
703 int size = tnode_child_length(tn);
704 int j;
705
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
711
712 *err = -ENOMEM;
713 return oldtnode;
714 }
715
716 for(i = 0; i < olen; i++) {
717 struct node *node = tnode_get_child(oldtnode, i);
718
719 /* An empty child */
720 if (node == NULL)
721 continue;
722
723 /* A leaf or an internal node with skipped bits */
724
725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
726 tn->pos + tn->bits - 1) {
727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
728 1) == 0)
729 put_child(t, tn, 2*i, node);
730 else
731 put_child(t, tn, 2*i+1, node);
732 continue;
733 }
734
735 /* An internal node with two children */
736 inode = (struct tnode *) node;
737
738 if (inode->bits == 1) {
739 put_child(t, tn, 2*i, inode->child[0]);
740 put_child(t, tn, 2*i+1, inode->child[1]);
741
742 tnode_free(inode);
743 }
744
745 /* An internal node with more than two children */
746 else {
747 struct tnode *left, *right;
748 int size, j;
749
750 /* We will replace this node 'inode' with two new
751 * ones, 'left' and 'right', each with half of the
752 * original children. The two new nodes will have
753 * a position one bit further down the key and this
754 * means that the "significant" part of their keys
755 * (see the discussion near the top of this file)
756 * will differ by one bit, which will be "0" in
757 * left's key and "1" in right's key. Since we are
758 * moving the key position by one step, the bit that
759 * we are moving away from - the bit at position
760 * (inode->pos) - is the one that will differ between
761 * left and right. So... we synthesize that bit in the
762 * two new keys.
763 * The mask 'm' below will be a single "one" bit at
764 * the position (inode->pos)
765 */
766
767 /* Use the old key, but set the new significant
768 * bit to zero.
769 */
770
771 left = (struct tnode *) tnode_get_child(tn, 2*i);
772 put_child(t, tn, 2*i, NULL);
773
774 if (!left)
775 BUG();
776
777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
778 put_child(t, tn, 2*i+1, NULL);
779
780 if (!right)
781 BUG();
782
783 size = tnode_child_length(left);
784 for(j = 0; j < size; j++) {
785 put_child(t, left, j, inode->child[j]);
786 put_child(t, right, j, inode->child[j + size]);
787 }
788 put_child(t, tn, 2*i, resize(t, left));
789 put_child(t, tn, 2*i+1, resize(t, right));
790
791 tnode_free(inode);
792 }
793 }
794 tnode_free(oldtnode);
795 return tn;
796 }
797
798 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
799 {
800 struct tnode *oldtnode = tn;
801 struct node *left, *right;
802 int i;
803 int olen = tnode_child_length(tn);
804
805 if (trie_debug) printk("In halve\n");
806
807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808
809 if (!tn) {
810 *err = -ENOMEM;
811 return oldtnode;
812 }
813
814 /*
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
817 * fails. In case of failure we return the oldnode and halve
818 * of tnode is ignored.
819 */
820
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
824
825 /* Two nonempty children */
826 if (left && right) {
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829
830 if (!newBinNode) {
831 *err = -ENOMEM;
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837
838 if (*err) {
839 int size = tnode_child_length(tn);
840 int j;
841
842 for(j = 0; j < size; j++)
843 if (tn->child[j])
844 tnode_free((struct tnode *)tn->child[j]);
845
846 tnode_free(tn);
847
848 *err = -ENOMEM;
849 return oldtnode;
850 }
851
852 for(i = 0; i < olen; i += 2) {
853 left = tnode_get_child(oldtnode, i);
854 right = tnode_get_child(oldtnode, i+1);
855
856 /* At least one of the children is empty */
857 if (left == NULL) {
858 if (right == NULL) /* Both are empty */
859 continue;
860 put_child(t, tn, i/2, right);
861 } else if (right == NULL)
862 put_child(t, tn, i/2, left);
863
864 /* Two nonempty children */
865 else {
866 struct tnode *newBinNode =
867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
869
870 if (!newBinNode)
871 BUG();
872
873 put_child(t, newBinNode, 0, left);
874 put_child(t, newBinNode, 1, right);
875 put_child(t, tn, i/2, resize(t, newBinNode));
876 }
877 }
878 tnode_free(oldtnode);
879 return tn;
880 }
881
882 static void *trie_init(struct trie *t)
883 {
884 if (t) {
885 t->size = 0;
886 t->trie = NULL;
887 t->revision = 0;
888 #ifdef CONFIG_IP_FIB_TRIE_STATS
889 memset(&t->stats, 0, sizeof(struct trie_use_stats));
890 #endif
891 }
892 return t;
893 }
894
895 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
896 {
897 struct hlist_node *node;
898 struct leaf_info *li;
899
900 hlist_for_each_entry(li, node, head, hlist) {
901 if (li->plen == plen)
902 return li;
903 }
904 return NULL;
905 }
906
907 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
908 {
909 struct list_head *fa_head = NULL;
910 struct leaf_info *li = find_leaf_info(&l->list, plen);
911
912 if (li)
913 fa_head = &li->falh;
914
915 return fa_head;
916 }
917
918 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
919 {
920 struct leaf_info *li = NULL, *last = NULL;
921 struct hlist_node *node, *tmp;
922
923 write_lock_bh(&fib_lock);
924
925 if (hlist_empty(head))
926 hlist_add_head(&new->hlist, head);
927 else {
928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
929
930 if (new->plen > li->plen)
931 break;
932
933 last = li;
934 }
935 if (last)
936 hlist_add_after(&last->hlist, &new->hlist);
937 else
938 hlist_add_before(&new->hlist, &li->hlist);
939 }
940 write_unlock_bh(&fib_lock);
941 }
942
943 static struct leaf *
944 fib_find_node(struct trie *t, u32 key)
945 {
946 int pos;
947 struct tnode *tn;
948 struct node *n;
949
950 pos = 0;
951 n = t->trie;
952
953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
954 tn = (struct tnode *) n;
955
956 check_tnode(tn);
957
958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
959 pos=tn->pos + tn->bits;
960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
961 }
962 else
963 break;
964 }
965 /* Case we have found a leaf. Compare prefixes */
966
967 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
968 struct leaf *l = (struct leaf *) n;
969 return l;
970 }
971 return NULL;
972 }
973
974 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
975 {
976 int i = 0;
977 int wasfull;
978 t_key cindex, key;
979 struct tnode *tp = NULL;
980
981 if (!tn)
982 BUG();
983
984 key = tn->key;
985 i = 0;
986
987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
988
989 if (i > 10) {
990 printk("Rebalance tn=%p \n", tn);
991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
992
993 printk("Rebalance tp=%p \n", tp);
994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
995 }
996
997 if (i > 12) BUG();
998 i++;
999
1000 tp = NODE_PARENT(tn);
1001 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1005
1006 if (!NODE_PARENT(tn))
1007 break;
1008
1009 tn = NODE_PARENT(tn);
1010 }
1011 /* Handle last (top) tnode */
1012 if (IS_TNODE(tn))
1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1014
1015 return (struct node*) tn;
1016 }
1017
1018 static struct list_head *
1019 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1020 {
1021 int pos, newpos;
1022 struct tnode *tp = NULL, *tn = NULL;
1023 struct node *n;
1024 struct leaf *l;
1025 int missbit;
1026 struct list_head *fa_head = NULL;
1027 struct leaf_info *li;
1028 t_key cindex;
1029
1030 pos = 0;
1031 n = t->trie;
1032
1033 /* If we point to NULL, stop. Either the tree is empty and we should
1034 * just put a new leaf in if, or we have reached an empty child slot,
1035 * and we should just put our new leaf in that.
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 * not be the parent's 'pos'+'bits'!
1039 *
1040 * If it does match the current key, get pos/bits from it, extract
1041 * the index from our key, push the T_TNODE and walk the tree.
1042 *
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1044 *
1045 * If we point to a T_LEAF, it might or might not have the same key
1046 * as we do. If it does, just change the value, update the T_LEAF's
1047 * value, and return it.
1048 * If it doesn't, we need to replace it with a T_TNODE.
1049 */
1050
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1052 tn = (struct tnode *) n;
1053
1054 check_tnode(tn);
1055
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057 tp = tn;
1058 pos=tn->pos + tn->bits;
1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1060
1061 if (n && NODE_PARENT(n) != tn) {
1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1063 BUG();
1064 }
1065 }
1066 else
1067 break;
1068 }
1069
1070 /*
1071 * n ----> NULL, LEAF or TNODE
1072 *
1073 * tp is n's (parent) ----> NULL or TNODE
1074 */
1075
1076 if (tp && IS_LEAF(tp))
1077 BUG();
1078
1079
1080 /* Case 1: n is a leaf. Compare prefixes */
1081
1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083 struct leaf *l = ( struct leaf *) n;
1084
1085 li = leaf_info_new(plen);
1086
1087 if (!li) {
1088 *err = -ENOMEM;
1089 goto err;
1090 }
1091
1092 fa_head = &li->falh;
1093 insert_leaf_info(&l->list, li);
1094 goto done;
1095 }
1096 t->size++;
1097 l = leaf_new();
1098
1099 if (!l) {
1100 *err = -ENOMEM;
1101 goto err;
1102 }
1103
1104 l->key = key;
1105 li = leaf_info_new(plen);
1106
1107 if (!li) {
1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
1112
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1115
1116 /* Case 2: n is NULL, and will just insert a new leaf */
1117 if (t->trie && n == NULL) {
1118
1119 NODE_SET_PARENT(l, tp);
1120
1121 if (!tp)
1122 BUG();
1123
1124 else {
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1127 }
1128 }
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 else {
1131 /*
1132 * Add a new tnode here
1133 * first tnode need some special handling
1134 */
1135
1136 if (tp)
1137 pos=tp->pos+tp->bits;
1138 else
1139 pos=0;
1140 if (n) {
1141 newpos = tkey_mismatch(key, pos, n->key);
1142 tn = tnode_new(n->key, newpos, 1);
1143 }
1144 else {
1145 newpos = 0;
1146 tn = tnode_new(key, newpos, 1); /* First tnode */
1147 }
1148
1149 if (!tn) {
1150 free_leaf_info(li);
1151 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM;
1153 goto err;
1154 }
1155
1156 NODE_SET_PARENT(tn, tp);
1157
1158 missbit=tkey_extract_bits(key, newpos, 1);
1159 put_child(t, tn, missbit, (struct node *)l);
1160 put_child(t, tn, 1-missbit, n);
1161
1162 if (tp) {
1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1165 }
1166 else {
1167 t->trie = (struct node*) tn; /* First tnode */
1168 tp = tn;
1169 }
1170 }
1171 if (tp && tp->pos+tp->bits > 32) {
1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1173 tp, tp->pos, tp->bits, key, plen);
1174 }
1175 /* Rebalance the trie */
1176 t->trie = trie_rebalance(t, tp);
1177 done:
1178 t->revision++;
1179 err:;
1180 return fa_head;
1181 }
1182
1183 static int
1184 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1185 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1186 {
1187 struct trie *t = (struct trie *) tb->tb_data;
1188 struct fib_alias *fa, *new_fa;
1189 struct list_head *fa_head = NULL;
1190 struct fib_info *fi;
1191 int plen = r->rtm_dst_len;
1192 int type = r->rtm_type;
1193 u8 tos = r->rtm_tos;
1194 u32 key, mask;
1195 int err;
1196 struct leaf *l;
1197
1198 if (plen > 32)
1199 return -EINVAL;
1200
1201 key = 0;
1202 if (rta->rta_dst)
1203 memcpy(&key, rta->rta_dst, 4);
1204
1205 key = ntohl(key);
1206
1207 if (trie_debug)
1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1209
1210 mask = ntohl( inet_make_mask(plen) );
1211
1212 if (key & ~mask)
1213 return -EINVAL;
1214
1215 key = key & mask;
1216
1217 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1218 goto err;
1219
1220 l = fib_find_node(t, key);
1221 fa = NULL;
1222
1223 if (l) {
1224 fa_head = get_fa_head(l, plen);
1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1226 }
1227
1228 /* Now fa, if non-NULL, points to the first fib alias
1229 * with the same keys [prefix,tos,priority], if such key already
1230 * exists or to the node before which we will insert new one.
1231 *
1232 * If fa is NULL, we will need to allocate a new one and
1233 * insert to the head of f.
1234 *
1235 * If f is NULL, no fib node matched the destination key
1236 * and we need to allocate a new one of those as well.
1237 */
1238
1239 if (fa &&
1240 fa->fa_info->fib_priority == fi->fib_priority) {
1241 struct fib_alias *fa_orig;
1242
1243 err = -EEXIST;
1244 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1245 goto out;
1246
1247 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1248 struct fib_info *fi_drop;
1249 u8 state;
1250
1251 write_lock_bh(&fib_lock);
1252
1253 fi_drop = fa->fa_info;
1254 fa->fa_info = fi;
1255 fa->fa_type = type;
1256 fa->fa_scope = r->rtm_scope;
1257 state = fa->fa_state;
1258 fa->fa_state &= ~FA_S_ACCESSED;
1259
1260 write_unlock_bh(&fib_lock);
1261
1262 fib_release_info(fi_drop);
1263 if (state & FA_S_ACCESSED)
1264 rt_cache_flush(-1);
1265
1266 goto succeeded;
1267 }
1268 /* Error if we find a perfect match which
1269 * uses the same scope, type, and nexthop
1270 * information.
1271 */
1272 fa_orig = fa;
1273 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1274 if (fa->fa_tos != tos)
1275 break;
1276 if (fa->fa_info->fib_priority != fi->fib_priority)
1277 break;
1278 if (fa->fa_type == type &&
1279 fa->fa_scope == r->rtm_scope &&
1280 fa->fa_info == fi) {
1281 goto out;
1282 }
1283 }
1284 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1285 fa = fa_orig;
1286 }
1287 err = -ENOENT;
1288 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1289 goto out;
1290
1291 err = -ENOBUFS;
1292 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1293 if (new_fa == NULL)
1294 goto out;
1295
1296 new_fa->fa_info = fi;
1297 new_fa->fa_tos = tos;
1298 new_fa->fa_type = type;
1299 new_fa->fa_scope = r->rtm_scope;
1300 new_fa->fa_state = 0;
1301 #if 0
1302 new_fa->dst = NULL;
1303 #endif
1304 /*
1305 * Insert new entry to the list.
1306 */
1307
1308 if (!fa_head) {
1309 fa_head = fib_insert_node(t, &err, key, plen);
1310 err = 0;
1311 if (err)
1312 goto out_free_new_fa;
1313 }
1314
1315 write_lock_bh(&fib_lock);
1316
1317 list_add_tail(&new_fa->fa_list,
1318 (fa ? &fa->fa_list : fa_head));
1319
1320 write_unlock_bh(&fib_lock);
1321
1322 rt_cache_flush(-1);
1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1324 succeeded:
1325 return 0;
1326
1327 out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa);
1329 out:
1330 fib_release_info(fi);
1331 err:;
1332 return err;
1333 }
1334
1335 static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1336 struct fib_result *res)
1337 {
1338 int err, i;
1339 t_key mask;
1340 struct leaf_info *li;
1341 struct hlist_head *hhead = &l->list;
1342 struct hlist_node *node;
1343
1344 hlist_for_each_entry(li, node, hhead, hlist) {
1345
1346 i = li->plen;
1347 mask = ntohl(inet_make_mask(i));
1348 if (l->key != (key & mask))
1349 continue;
1350
1351 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1352 *plen = i;
1353 #ifdef CONFIG_IP_FIB_TRIE_STATS
1354 t->stats.semantic_match_passed++;
1355 #endif
1356 return err;
1357 }
1358 #ifdef CONFIG_IP_FIB_TRIE_STATS
1359 t->stats.semantic_match_miss++;
1360 #endif
1361 }
1362 return 1;
1363 }
1364
1365 static int
1366 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1367 {
1368 struct trie *t = (struct trie *) tb->tb_data;
1369 int plen, ret = 0;
1370 struct node *n;
1371 struct tnode *pn;
1372 int pos, bits;
1373 t_key key=ntohl(flp->fl4_dst);
1374 int chopped_off;
1375 t_key cindex = 0;
1376 int current_prefix_length = KEYLENGTH;
1377 n = t->trie;
1378
1379 read_lock(&fib_lock);
1380 if (!n)
1381 goto failed;
1382
1383 #ifdef CONFIG_IP_FIB_TRIE_STATS
1384 t->stats.gets++;
1385 #endif
1386
1387 /* Just a leaf? */
1388 if (IS_LEAF(n)) {
1389 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1390 goto found;
1391 goto failed;
1392 }
1393 pn = (struct tnode *) n;
1394 chopped_off = 0;
1395
1396 while (pn) {
1397
1398 pos = pn->pos;
1399 bits = pn->bits;
1400
1401 if (!chopped_off)
1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1403
1404 n = tnode_get_child(pn, cindex);
1405
1406 if (n == NULL) {
1407 #ifdef CONFIG_IP_FIB_TRIE_STATS
1408 t->stats.null_node_hit++;
1409 #endif
1410 goto backtrace;
1411 }
1412
1413 if (IS_TNODE(n)) {
1414 #define HL_OPTIMIZE
1415 #ifdef HL_OPTIMIZE
1416 struct tnode *cn = (struct tnode *)n;
1417 t_key node_prefix, key_prefix, pref_mismatch;
1418 int mp;
1419
1420 /*
1421 * It's a tnode, and we can do some extra checks here if we
1422 * like, to avoid descending into a dead-end branch.
1423 * This tnode is in the parent's child array at index
1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1425 * chopped off, so in reality the index may be just a
1426 * subprefix, padded with zero at the end.
1427 * We can also take a look at any skipped bits in this
1428 * tnode - everything up to p_pos is supposed to be ok,
1429 * and the non-chopped bits of the index (se previous
1430 * paragraph) are also guaranteed ok, but the rest is
1431 * considered unknown.
1432 *
1433 * The skipped bits are key[pos+bits..cn->pos].
1434 */
1435
1436 /* If current_prefix_length < pos+bits, we are already doing
1437 * actual prefix matching, which means everything from
1438 * pos+(bits-chopped_off) onward must be zero along some
1439 * branch of this subtree - otherwise there is *no* valid
1440 * prefix present. Here we can only check the skipped
1441 * bits. Remember, since we have already indexed into the
1442 * parent's child array, we know that the bits we chopped of
1443 * *are* zero.
1444 */
1445
1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1447
1448 if (current_prefix_length < pos+bits) {
1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1450 cn->pos - current_prefix_length) != 0 ||
1451 !(cn->child[0]))
1452 goto backtrace;
1453 }
1454
1455 /*
1456 * If chopped_off=0, the index is fully validated and we
1457 * only need to look at the skipped bits for this, the new,
1458 * tnode. What we actually want to do is to find out if
1459 * these skipped bits match our key perfectly, or if we will
1460 * have to count on finding a matching prefix further down,
1461 * because if we do, we would like to have some way of
1462 * verifying the existence of such a prefix at this point.
1463 */
1464
1465 /* The only thing we can do at this point is to verify that
1466 * any such matching prefix can indeed be a prefix to our
1467 * key, and if the bits in the node we are inspecting that
1468 * do not match our key are not ZERO, this cannot be true.
1469 * Thus, find out where there is a mismatch (before cn->pos)
1470 * and verify that all the mismatching bits are zero in the
1471 * new tnode's key.
1472 */
1473
1474 /* Note: We aren't very concerned about the piece of the key
1475 * that precede pn->pos+pn->bits, since these have already been
1476 * checked. The bits after cn->pos aren't checked since these are
1477 * by definition "unknown" at this point. Thus, what we want to
1478 * see is if we are about to enter the "prefix matching" state,
1479 * and in that case verify that the skipped bits that will prevail
1480 * throughout this subtree are zero, as they have to be if we are
1481 * to find a matching prefix.
1482 */
1483
1484 node_prefix = MASK_PFX(cn->key, cn->pos);
1485 key_prefix = MASK_PFX(key, cn->pos);
1486 pref_mismatch = key_prefix^node_prefix;
1487 mp = 0;
1488
1489 /* In short: If skipped bits in this node do not match the search
1490 * key, enter the "prefix matching" state.directly.
1491 */
1492 if (pref_mismatch) {
1493 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1494 mp++;
1495 pref_mismatch = pref_mismatch <<1;
1496 }
1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1498
1499 if (key_prefix != 0)
1500 goto backtrace;
1501
1502 if (current_prefix_length >= cn->pos)
1503 current_prefix_length=mp;
1504 }
1505 #endif
1506 pn = (struct tnode *)n; /* Descend */
1507 chopped_off = 0;
1508 continue;
1509 }
1510 if (IS_LEAF(n)) {
1511 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1512 goto found;
1513 }
1514 backtrace:
1515 chopped_off++;
1516
1517 /* As zero don't change the child key (cindex) */
1518 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1519 chopped_off++;
1520 }
1521
1522 /* Decrease current_... with bits chopped off */
1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
1525
1526 /*
1527 * Either we do the actual chop off according or if we have
1528 * chopped off all bits in this tnode walk up to our parent.
1529 */
1530
1531 if (chopped_off <= pn->bits)
1532 cindex &= ~(1 << (chopped_off-1));
1533 else {
1534 if (NODE_PARENT(pn) == NULL)
1535 goto failed;
1536
1537 /* Get Child's index */
1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1539 pn = NODE_PARENT(pn);
1540 chopped_off = 0;
1541
1542 #ifdef CONFIG_IP_FIB_TRIE_STATS
1543 t->stats.backtrack++;
1544 #endif
1545 goto backtrace;
1546 }
1547 }
1548 failed:
1549 ret = 1;
1550 found:
1551 read_unlock(&fib_lock);
1552 return ret;
1553 }
1554
1555 static int trie_leaf_remove(struct trie *t, t_key key)
1556 {
1557 t_key cindex;
1558 struct tnode *tp = NULL;
1559 struct node *n = t->trie;
1560 struct leaf *l;
1561
1562 if (trie_debug)
1563 printk("entering trie_leaf_remove(%p)\n", n);
1564
1565 /* Note that in the case skipped bits, those bits are *not* checked!
1566 * When we finish this, we will have NULL or a T_LEAF, and the
1567 * T_LEAF may or may not match our key.
1568 */
1569
1570 while (n != NULL && IS_TNODE(n)) {
1571 struct tnode *tn = (struct tnode *) n;
1572 check_tnode(tn);
1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1574
1575 if (n && NODE_PARENT(n) != tn) {
1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1577 BUG();
1578 }
1579 }
1580 l = (struct leaf *) n;
1581
1582 if (!n || !tkey_equals(l->key, key))
1583 return 0;
1584
1585 /*
1586 * Key found.
1587 * Remove the leaf and rebalance the tree
1588 */
1589
1590 t->revision++;
1591 t->size--;
1592
1593 tp = NODE_PARENT(n);
1594 tnode_free((struct tnode *) n);
1595
1596 if (tp) {
1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1599 t->trie = trie_rebalance(t, tp);
1600 }
1601 else
1602 t->trie = NULL;
1603
1604 return 1;
1605 }
1606
1607 static int
1608 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1609 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1610 {
1611 struct trie *t = (struct trie *) tb->tb_data;
1612 u32 key, mask;
1613 int plen = r->rtm_dst_len;
1614 u8 tos = r->rtm_tos;
1615 struct fib_alias *fa, *fa_to_delete;
1616 struct list_head *fa_head;
1617 struct leaf *l;
1618
1619 if (plen > 32)
1620 return -EINVAL;
1621
1622 key = 0;
1623 if (rta->rta_dst)
1624 memcpy(&key, rta->rta_dst, 4);
1625
1626 key = ntohl(key);
1627 mask = ntohl( inet_make_mask(plen) );
1628
1629 if (key & ~mask)
1630 return -EINVAL;
1631
1632 key = key & mask;
1633 l = fib_find_node(t, key);
1634
1635 if (!l)
1636 return -ESRCH;
1637
1638 fa_head = get_fa_head(l, plen);
1639 fa = fib_find_alias(fa_head, tos, 0);
1640
1641 if (!fa)
1642 return -ESRCH;
1643
1644 if (trie_debug)
1645 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1646
1647 fa_to_delete = NULL;
1648 fa_head = fa->fa_list.prev;
1649 list_for_each_entry(fa, fa_head, fa_list) {
1650 struct fib_info *fi = fa->fa_info;
1651
1652 if (fa->fa_tos != tos)
1653 break;
1654
1655 if ((!r->rtm_type ||
1656 fa->fa_type == r->rtm_type) &&
1657 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1658 fa->fa_scope == r->rtm_scope) &&
1659 (!r->rtm_protocol ||
1660 fi->fib_protocol == r->rtm_protocol) &&
1661 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1662 fa_to_delete = fa;
1663 break;
1664 }
1665 }
1666
1667 if (fa_to_delete) {
1668 int kill_li = 0;
1669 struct leaf_info *li;
1670
1671 fa = fa_to_delete;
1672 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1673
1674 l = fib_find_node(t, key);
1675 li = find_leaf_info(&l->list, plen);
1676
1677 write_lock_bh(&fib_lock);
1678
1679 list_del(&fa->fa_list);
1680
1681 if (list_empty(fa_head)) {
1682 hlist_del(&li->hlist);
1683 kill_li = 1;
1684 }
1685 write_unlock_bh(&fib_lock);
1686
1687 if (kill_li)
1688 free_leaf_info(li);
1689
1690 if (hlist_empty(&l->list))
1691 trie_leaf_remove(t, key);
1692
1693 if (fa->fa_state & FA_S_ACCESSED)
1694 rt_cache_flush(-1);
1695
1696 fn_free_alias(fa);
1697 return 0;
1698 }
1699 return -ESRCH;
1700 }
1701
1702 static int trie_flush_list(struct trie *t, struct list_head *head)
1703 {
1704 struct fib_alias *fa, *fa_node;
1705 int found = 0;
1706
1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1708 struct fib_info *fi = fa->fa_info;
1709
1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1711
1712 write_lock_bh(&fib_lock);
1713 list_del(&fa->fa_list);
1714 write_unlock_bh(&fib_lock);
1715
1716 fn_free_alias(fa);
1717 found++;
1718 }
1719 }
1720 return found;
1721 }
1722
1723 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1724 {
1725 int found = 0;
1726 struct hlist_head *lih = &l->list;
1727 struct hlist_node *node, *tmp;
1728 struct leaf_info *li = NULL;
1729
1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1731
1732 found += trie_flush_list(t, &li->falh);
1733
1734 if (list_empty(&li->falh)) {
1735
1736 write_lock_bh(&fib_lock);
1737 hlist_del(&li->hlist);
1738 write_unlock_bh(&fib_lock);
1739
1740 free_leaf_info(li);
1741 }
1742 }
1743 return found;
1744 }
1745
1746 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1747 {
1748 struct node *c = (struct node *) thisleaf;
1749 struct tnode *p;
1750 int idx;
1751
1752 if (c == NULL) {
1753 if (t->trie == NULL)
1754 return NULL;
1755
1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1757 return (struct leaf *) t->trie;
1758
1759 p = (struct tnode*) t->trie; /* Start */
1760 }
1761 else
1762 p = (struct tnode *) NODE_PARENT(c);
1763
1764 while (p) {
1765 int pos, last;
1766
1767 /* Find the next child of the parent */
1768 if (c)
1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1770 else
1771 pos = 0;
1772
1773 last = 1 << p->bits;
1774 for(idx = pos; idx < last ; idx++) {
1775 if (p->child[idx]) {
1776
1777 /* Decend if tnode */
1778
1779 while (IS_TNODE(p->child[idx])) {
1780 p = (struct tnode*) p->child[idx];
1781 idx = 0;
1782
1783 /* Rightmost non-NULL branch */
1784 if (p && IS_TNODE(p))
1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1786
1787 /* Done with this tnode? */
1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
1789 goto up;
1790 }
1791 return (struct leaf*) p->child[idx];
1792 }
1793 }
1794 up:
1795 /* No more children go up one step */
1796 c = (struct node*) p;
1797 p = (struct tnode *) NODE_PARENT(p);
1798 }
1799 return NULL; /* Ready. Root of trie */
1800 }
1801
1802 static int fn_trie_flush(struct fib_table *tb)
1803 {
1804 struct trie *t = (struct trie *) tb->tb_data;
1805 struct leaf *ll = NULL, *l = NULL;
1806 int found = 0, h;
1807
1808 t->revision++;
1809
1810 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1811 found += trie_flush_leaf(t, l);
1812
1813 if (ll && hlist_empty(&ll->list))
1814 trie_leaf_remove(t, ll->key);
1815 ll = l;
1816 }
1817
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll->key);
1820
1821 if (trie_debug)
1822 printk("trie_flush found=%d\n", found);
1823 return found;
1824 }
1825
1826 static int trie_last_dflt=-1;
1827
1828 static void
1829 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1830 {
1831 struct trie *t = (struct trie *) tb->tb_data;
1832 int order, last_idx;
1833 struct fib_info *fi = NULL;
1834 struct fib_info *last_resort;
1835 struct fib_alias *fa = NULL;
1836 struct list_head *fa_head;
1837 struct leaf *l;
1838
1839 last_idx = -1;
1840 last_resort = NULL;
1841 order = -1;
1842
1843 read_lock(&fib_lock);
1844
1845 l = fib_find_node(t, 0);
1846 if (!l)
1847 goto out;
1848
1849 fa_head = get_fa_head(l, 0);
1850 if (!fa_head)
1851 goto out;
1852
1853 if (list_empty(fa_head))
1854 goto out;
1855
1856 list_for_each_entry(fa, fa_head, fa_list) {
1857 struct fib_info *next_fi = fa->fa_info;
1858
1859 if (fa->fa_scope != res->scope ||
1860 fa->fa_type != RTN_UNICAST)
1861 continue;
1862
1863 if (next_fi->fib_priority > res->fi->fib_priority)
1864 break;
1865 if (!next_fi->fib_nh[0].nh_gw ||
1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1867 continue;
1868 fa->fa_state |= FA_S_ACCESSED;
1869
1870 if (fi == NULL) {
1871 if (next_fi != res->fi)
1872 break;
1873 } else if (!fib_detect_death(fi, order, &last_resort,
1874 &last_idx, &trie_last_dflt)) {
1875 if (res->fi)
1876 fib_info_put(res->fi);
1877 res->fi = fi;
1878 atomic_inc(&fi->fib_clntref);
1879 trie_last_dflt = order;
1880 goto out;
1881 }
1882 fi = next_fi;
1883 order++;
1884 }
1885 if (order <= 0 || fi == NULL) {
1886 trie_last_dflt = -1;
1887 goto out;
1888 }
1889
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1891 if (res->fi)
1892 fib_info_put(res->fi);
1893 res->fi = fi;
1894 atomic_inc(&fi->fib_clntref);
1895 trie_last_dflt = order;
1896 goto out;
1897 }
1898 if (last_idx >= 0) {
1899 if (res->fi)
1900 fib_info_put(res->fi);
1901 res->fi = last_resort;
1902 if (last_resort)
1903 atomic_inc(&last_resort->fib_clntref);
1904 }
1905 trie_last_dflt = last_idx;
1906 out:;
1907 read_unlock(&fib_lock);
1908 }
1909
1910 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1911 struct sk_buff *skb, struct netlink_callback *cb)
1912 {
1913 int i, s_i;
1914 struct fib_alias *fa;
1915
1916 u32 xkey=htonl(key);
1917
1918 s_i=cb->args[3];
1919 i = 0;
1920
1921 list_for_each_entry(fa, fah, fa_list) {
1922 if (i < s_i) {
1923 i++;
1924 continue;
1925 }
1926 if (fa->fa_info->fib_nh == NULL) {
1927 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1928 i++;
1929 continue;
1930 }
1931 if (fa->fa_info == NULL) {
1932 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1933 i++;
1934 continue;
1935 }
1936
1937 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1938 cb->nlh->nlmsg_seq,
1939 RTM_NEWROUTE,
1940 tb->tb_id,
1941 fa->fa_type,
1942 fa->fa_scope,
1943 &xkey,
1944 plen,
1945 fa->fa_tos,
1946 fa->fa_info, 0) < 0) {
1947 cb->args[3] = i;
1948 return -1;
1949 }
1950 i++;
1951 }
1952 cb->args[3]=i;
1953 return skb->len;
1954 }
1955
1956 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1957 struct netlink_callback *cb)
1958 {
1959 int h, s_h;
1960 struct list_head *fa_head;
1961 struct leaf *l = NULL;
1962 s_h=cb->args[2];
1963
1964 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1965
1966 if (h < s_h)
1967 continue;
1968 if (h > s_h)
1969 memset(&cb->args[3], 0,
1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1971
1972 fa_head = get_fa_head(l, plen);
1973
1974 if (!fa_head)
1975 continue;
1976
1977 if (list_empty(fa_head))
1978 continue;
1979
1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1981 cb->args[2]=h;
1982 return -1;
1983 }
1984 }
1985 cb->args[2]=h;
1986 return skb->len;
1987 }
1988
1989 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1990 {
1991 int m, s_m;
1992 struct trie *t = (struct trie *) tb->tb_data;
1993
1994 s_m = cb->args[1];
1995
1996 read_lock(&fib_lock);
1997 for (m=0; m<=32; m++) {
1998
1999 if (m < s_m)
2000 continue;
2001 if (m > s_m)
2002 memset(&cb->args[2], 0,
2003 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2004
2005 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2006 cb->args[1] = m;
2007 goto out;
2008 }
2009 }
2010 read_unlock(&fib_lock);
2011 cb->args[1] = m;
2012 return skb->len;
2013 out:
2014 read_unlock(&fib_lock);
2015 return -1;
2016 }
2017
2018 /* Fix more generic FIB names for init later */
2019
2020 #ifdef CONFIG_IP_MULTIPLE_TABLES
2021 struct fib_table * fib_hash_init(int id)
2022 #else
2023 struct fib_table * __init fib_hash_init(int id)
2024 #endif
2025 {
2026 struct fib_table *tb;
2027 struct trie *t;
2028
2029 if (fn_alias_kmem == NULL)
2030 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2031 sizeof(struct fib_alias),
2032 0, SLAB_HWCACHE_ALIGN,
2033 NULL, NULL);
2034
2035 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2036 GFP_KERNEL);
2037 if (tb == NULL)
2038 return NULL;
2039
2040 tb->tb_id = id;
2041 tb->tb_lookup = fn_trie_lookup;
2042 tb->tb_insert = fn_trie_insert;
2043 tb->tb_delete = fn_trie_delete;
2044 tb->tb_flush = fn_trie_flush;
2045 tb->tb_select_default = fn_trie_select_default;
2046 tb->tb_dump = fn_trie_dump;
2047 memset(tb->tb_data, 0, sizeof(struct trie));
2048
2049 t = (struct trie *) tb->tb_data;
2050
2051 trie_init(t);
2052
2053 if (id == RT_TABLE_LOCAL)
2054 trie_local = t;
2055 else if (id == RT_TABLE_MAIN)
2056 trie_main = t;
2057
2058 if (id == RT_TABLE_LOCAL)
2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2060
2061 return tb;
2062 }
2063
2064 /* Trie dump functions */
2065
2066 static void putspace_seq(struct seq_file *seq, int n)
2067 {
2068 while (n--) seq_printf(seq, " ");
2069 }
2070
2071 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2072 {
2073 while (bits--)
2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2075 }
2076
2077 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2078 int pend, int cindex, int bits)
2079 {
2080 putspace_seq(seq, indent);
2081 if (IS_LEAF(n))
2082 seq_printf(seq, "|");
2083 else
2084 seq_printf(seq, "+");
2085 if (bits) {
2086 seq_printf(seq, "%d/", cindex);
2087 printbin_seq(seq, cindex, bits);
2088 seq_printf(seq, ": ");
2089 }
2090 else
2091 seq_printf(seq, "<root>: ");
2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2093
2094 if (IS_LEAF(n))
2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2097 else {
2098 int plen = ((struct tnode *)n)->pos;
2099 t_key prf=MASK_PFX(n->key, plen);
2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2102 }
2103 if (IS_LEAF(n)) {
2104 struct leaf *l=(struct leaf *)n;
2105 struct fib_alias *fa;
2106 int i;
2107 for (i=32; i>=0; i--)
2108 if (find_leaf_info(&l->list, i)) {
2109
2110 struct list_head *fa_head = get_fa_head(l, i);
2111
2112 if (!fa_head)
2113 continue;
2114
2115 if (list_empty(fa_head))
2116 continue;
2117
2118 putspace_seq(seq, indent+2);
2119 seq_printf(seq, "{/%d...dumping}\n", i);
2120
2121
2122 list_for_each_entry(fa, fa_head, fa_list) {
2123 putspace_seq(seq, indent+2);
2124 if (fa->fa_info->fib_nh == NULL) {
2125 seq_printf(seq, "Error _fib_nh=NULL\n");
2126 continue;
2127 }
2128 if (fa->fa_info == NULL) {
2129 seq_printf(seq, "Error fa_info=NULL\n");
2130 continue;
2131 }
2132
2133 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2134 fa->fa_type,
2135 fa->fa_scope,
2136 fa->fa_tos);
2137 }
2138 }
2139 }
2140 else if (IS_TNODE(n)) {
2141 struct tnode *tn = (struct tnode *)n;
2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2145 seq_printf(seq, "}\n");
2146 putspace_seq(seq, indent); seq_printf(seq, "| ");
2147 seq_printf(seq, "{pos=%d", tn->pos);
2148 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2149 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2150 putspace_seq(seq, indent); seq_printf(seq, "| ");
2151 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2152 }
2153 }
2154
2155 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2156 {
2157 struct node *n = t->trie;
2158 int cindex=0;
2159 int indent=1;
2160 int pend=0;
2161 int depth = 0;
2162
2163 read_lock(&fib_lock);
2164
2165 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2166 if (n) {
2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2168 if (IS_TNODE(n)) {
2169 struct tnode *tn = (struct tnode *)n;
2170 pend = tn->pos+tn->bits;
2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2172 indent += 3;
2173 depth++;
2174
2175 while (tn && cindex < (1 << tn->bits)) {
2176 if (tn->child[cindex]) {
2177
2178 /* Got a child */
2179
2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2181 if (IS_LEAF(tn->child[cindex])) {
2182 cindex++;
2183
2184 }
2185 else {
2186 /*
2187 * New tnode. Decend one level
2188 */
2189
2190 depth++;
2191 n = tn->child[cindex];
2192 tn = (struct tnode *)n;
2193 pend = tn->pos+tn->bits;
2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2195 indent+=3;
2196 cindex=0;
2197 }
2198 }
2199 else
2200 cindex++;
2201
2202 /*
2203 * Test if we are done
2204 */
2205
2206 while (cindex >= (1 << tn->bits)) {
2207
2208 /*
2209 * Move upwards and test for root
2210 * pop off all traversed nodes
2211 */
2212
2213 if (NODE_PARENT(tn) == NULL) {
2214 tn = NULL;
2215 n = NULL;
2216 break;
2217 }
2218 else {
2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2220 tn = NODE_PARENT(tn);
2221 cindex++;
2222 n = (struct node *)tn;
2223 pend = tn->pos+tn->bits;
2224 indent-=3;
2225 depth--;
2226 }
2227 }
2228 }
2229 }
2230 else n = NULL;
2231 }
2232 else seq_printf(seq, "------ trie is empty\n");
2233
2234 read_unlock(&fib_lock);
2235 }
2236
2237 static struct trie_stat *trie_stat_new(void)
2238 {
2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2240 int i;
2241
2242 if (s) {
2243 s->totdepth = 0;
2244 s->maxdepth = 0;
2245 s->tnodes = 0;
2246 s->leaves = 0;
2247 s->nullpointers = 0;
2248
2249 for(i=0; i< MAX_CHILDS; i++)
2250 s->nodesizes[i] = 0;
2251 }
2252 return s;
2253 }
2254
2255 static struct trie_stat *trie_collect_stats(struct trie *t)
2256 {
2257 struct node *n = t->trie;
2258 struct trie_stat *s = trie_stat_new();
2259 int cindex = 0;
2260 int indent = 1;
2261 int pend = 0;
2262 int depth = 0;
2263
2264 read_lock(&fib_lock);
2265
2266 if (s) {
2267 if (n) {
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *)n;
2270 pend = tn->pos+tn->bits;
2271 indent += 3;
2272 s->nodesizes[tn->bits]++;
2273 depth++;
2274
2275 while (tn && cindex < (1 << tn->bits)) {
2276 if (tn->child[cindex]) {
2277 /* Got a child */
2278
2279 if (IS_LEAF(tn->child[cindex])) {
2280 cindex++;
2281
2282 /* stats */
2283 if (depth > s->maxdepth)
2284 s->maxdepth = depth;
2285 s->totdepth += depth;
2286 s->leaves++;
2287 }
2288
2289 else {
2290 /*
2291 * New tnode. Decend one level
2292 */
2293
2294 s->tnodes++;
2295 s->nodesizes[tn->bits]++;
2296 depth++;
2297
2298 n = tn->child[cindex];
2299 tn = (struct tnode *)n;
2300 pend = tn->pos+tn->bits;
2301
2302 indent += 3;
2303 cindex = 0;
2304 }
2305 }
2306 else {
2307 cindex++;
2308 s->nullpointers++;
2309 }
2310
2311 /*
2312 * Test if we are done
2313 */
2314
2315 while (cindex >= (1 << tn->bits)) {
2316
2317 /*
2318 * Move upwards and test for root
2319 * pop off all traversed nodes
2320 */
2321
2322
2323 if (NODE_PARENT(tn) == NULL) {
2324 tn = NULL;
2325 n = NULL;
2326 break;
2327 }
2328 else {
2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2330 tn = NODE_PARENT(tn);
2331 cindex++;
2332 n = (struct node *)tn;
2333 pend = tn->pos+tn->bits;
2334 indent -= 3;
2335 depth--;
2336 }
2337 }
2338 }
2339 }
2340 else n = NULL;
2341 }
2342 }
2343
2344 read_unlock(&fib_lock);
2345 return s;
2346 }
2347
2348 #ifdef CONFIG_PROC_FS
2349
2350 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2351 {
2352 return NULL;
2353 }
2354
2355 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2356 {
2357 return NULL;
2358 }
2359
2360 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2361 {
2362 void *v = NULL;
2363
2364 if (ip_fib_main_table)
2365 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2366 return v;
2367 }
2368
2369 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2370 {
2371 ++*pos;
2372 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2373 }
2374
2375 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2376 {
2377
2378 }
2379
2380 /*
2381 * This outputs /proc/net/fib_triestats
2382 *
2383 * It always works in backward compatibility mode.
2384 * The format of the file is not supposed to be changed.
2385 */
2386
2387 static void collect_and_show(struct trie *t, struct seq_file *seq)
2388 {
2389 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2390 int i, max, pointers;
2391 struct trie_stat *stat;
2392 int avdepth;
2393
2394 stat = trie_collect_stats(t);
2395
2396 bytes=0;
2397 seq_printf(seq, "trie=%p\n", t);
2398
2399 if (stat) {
2400 if (stat->leaves)
2401 avdepth=stat->totdepth*100 / stat->leaves;
2402 else
2403 avdepth=0;
2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2406
2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2408 bytes += sizeof(struct leaf) * stat->leaves;
2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2410 bytes += sizeof(struct tnode) * stat->tnodes;
2411
2412 max = MAX_CHILDS-1;
2413
2414 while (max >= 0 && stat->nodesizes[max] == 0)
2415 max--;
2416 pointers = 0;
2417
2418 for (i = 1; i <= max; i++)
2419 if (stat->nodesizes[i] != 0) {
2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2421 pointers += (1<<i) * stat->nodesizes[i];
2422 }
2423 seq_printf(seq, "\n");
2424 seq_printf(seq, "Pointers: %d\n", pointers);
2425 bytes += sizeof(struct node *) * pointers;
2426 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2427 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2428
2429 kfree(stat);
2430 }
2431
2432 #ifdef CONFIG_IP_FIB_TRIE_STATS
2433 seq_printf(seq, "Counters:\n---------\n");
2434 seq_printf(seq,"gets = %d\n", t->stats.gets);
2435 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2440 #ifdef CLEAR_STATS
2441 memset(&(t->stats), 0, sizeof(t->stats));
2442 #endif
2443 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2444 }
2445
2446 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2447 {
2448 char bf[128];
2449
2450 if (v == SEQ_START_TOKEN) {
2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2452 sizeof(struct leaf), sizeof(struct tnode));
2453 if (trie_local)
2454 collect_and_show(trie_local, seq);
2455
2456 if (trie_main)
2457 collect_and_show(trie_main, seq);
2458 }
2459 else {
2460 snprintf(bf, sizeof(bf),
2461 "*\t%08X\t%08X", 200, 400);
2462
2463 seq_printf(seq, "%-127s\n", bf);
2464 }
2465 return 0;
2466 }
2467
2468 static struct seq_operations fib_triestat_seq_ops = {
2469 .start = fib_triestat_seq_start,
2470 .next = fib_triestat_seq_next,
2471 .stop = fib_triestat_seq_stop,
2472 .show = fib_triestat_seq_show,
2473 };
2474
2475 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2476 {
2477 struct seq_file *seq;
2478 int rc = -ENOMEM;
2479
2480 rc = seq_open(file, &fib_triestat_seq_ops);
2481 if (rc)
2482 goto out_kfree;
2483
2484 seq = file->private_data;
2485 out:
2486 return rc;
2487 out_kfree:
2488 goto out;
2489 }
2490
2491 static struct file_operations fib_triestat_seq_fops = {
2492 .owner = THIS_MODULE,
2493 .open = fib_triestat_seq_open,
2494 .read = seq_read,
2495 .llseek = seq_lseek,
2496 .release = seq_release_private,
2497 };
2498
2499 int __init fib_stat_proc_init(void)
2500 {
2501 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2502 return -ENOMEM;
2503 return 0;
2504 }
2505
2506 void __init fib_stat_proc_exit(void)
2507 {
2508 proc_net_remove("fib_triestat");
2509 }
2510
2511 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2512 {
2513 return NULL;
2514 }
2515
2516 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2517 {
2518 return NULL;
2519 }
2520
2521 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2522 {
2523 void *v = NULL;
2524
2525 if (ip_fib_main_table)
2526 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2527 return v;
2528 }
2529
2530 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531 {
2532 ++*pos;
2533 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2534 }
2535
2536 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2537 {
2538
2539 }
2540
2541 /*
2542 * This outputs /proc/net/fib_trie.
2543 *
2544 * It always works in backward compatibility mode.
2545 * The format of the file is not supposed to be changed.
2546 */
2547
2548 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2549 {
2550 char bf[128];
2551
2552 if (v == SEQ_START_TOKEN) {
2553 if (trie_local)
2554 trie_dump_seq(seq, trie_local);
2555
2556 if (trie_main)
2557 trie_dump_seq(seq, trie_main);
2558 }
2559
2560 else {
2561 snprintf(bf, sizeof(bf),
2562 "*\t%08X\t%08X", 200, 400);
2563 seq_printf(seq, "%-127s\n", bf);
2564 }
2565
2566 return 0;
2567 }
2568
2569 static struct seq_operations fib_trie_seq_ops = {
2570 .start = fib_trie_seq_start,
2571 .next = fib_trie_seq_next,
2572 .stop = fib_trie_seq_stop,
2573 .show = fib_trie_seq_show,
2574 };
2575
2576 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2577 {
2578 struct seq_file *seq;
2579 int rc = -ENOMEM;
2580
2581 rc = seq_open(file, &fib_trie_seq_ops);
2582 if (rc)
2583 goto out_kfree;
2584
2585 seq = file->private_data;
2586 out:
2587 return rc;
2588 out_kfree:
2589 goto out;
2590 }
2591
2592 static struct file_operations fib_trie_seq_fops = {
2593 .owner = THIS_MODULE,
2594 .open = fib_trie_seq_open,
2595 .read = seq_read,
2596 .llseek = seq_lseek,
2597 .release= seq_release_private,
2598 };
2599
2600 int __init fib_proc_init(void)
2601 {
2602 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2603 return -ENOMEM;
2604 return 0;
2605 }
2606
2607 void __init fib_proc_exit(void)
2608 {
2609 proc_net_remove("fib_trie");
2610 }
2611
2612 #endif /* CONFIG_PROC_FS */
This page took 0.127565 seconds and 5 git commands to generate.