58ba9f4f2c92eade0c84b5b14c84e2754d0bbccb
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51 #define VERSION "0.408"
52
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
82
/* Bound on the per-depth histogram kept in struct trie_stat */
#define MAX_STAT_DEPTH 32

/* Number of bits in a key: 32 for an IPv4 address */
#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

/* The low bit of node->parent encodes the node type; the remaining
 * bits hold the parent pointer itself. */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
/* Common header shared by leaves and tnodes.  The low bit of 'parent'
 * is the node type (T_LEAF/T_TNODE); the rest is the parent pointer. */
struct node {
	unsigned long parent;
	t_key key;
};
101
/* External (leaf) node: holds a full key plus a list of leaf_info
 * entries, one per prefix length stored under this key. */
struct leaf {
	unsigned long parent;
	t_key key;
	struct hlist_head list;	/* leaf_info entries */
	struct rcu_head rcu;	/* for deferred freeing */
};
108
/* Per-prefix-length bucket hanging off a leaf. */
struct leaf_info {
	struct hlist_node hlist;	/* link on leaf->list */
	struct rcu_head rcu;
	int plen;			/* prefix length */
	struct list_head falh;		/* fib_alias list head */
};
115
/* Internal trie node.  'pos'/'bits' select the key slice used to index
 * the child array; child[] has 1 << bits slots allocated together with
 * the node (see tnode_new()). */
struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {
		/* Mutually exclusive teardown bookkeeping: */
		struct rcu_head rcu;		/* deferred kfree */
		struct work_struct work;	/* deferred vfree */
		struct tnode *tnode_free;	/* link on RTNL free list */
	};
	struct node *child[0];
};
130
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Lookup/maintenance event counters, compiled in only when
 * CONFIG_IP_FIB_TRIE_STATS is set. */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif
141
/* Snapshot of trie shape statistics gathered by a full walk. */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];	/* tnode count per 'bits' */
};
151
/* A routing table instance: the root node plus optional counters. */
struct trie {
	struct node *trie;	/* root; RCU-protected */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};
158
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 static size_t tnode_free_size;
168
169 /*
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
173 */
174 static const int sync_pages = 128;
175
176 static struct kmem_cache *fn_alias_kmem __read_mostly;
177 static struct kmem_cache *trie_leaf_kmem __read_mostly;
178
179 static inline struct tnode *node_parent(struct node *node)
180 {
181 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
182 }
183
/* RCU-aware variant of node_parent() for read-side walkers.
 * The temporary local is deliberate: rcu_dereference() needs an
 * lvalue to take the address of. */
static inline struct tnode *node_parent_rcu(struct node *node)
{
	struct tnode *ret = node_parent(node);

	return rcu_dereference(ret);
}
190
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 * The smp_wmb() orders the node's initialization before the store
 * that publishes the new parent value, preserving the type bit. */
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
199
/* Return child slot i of 'tn'; i must be below 1 << tn->bits. */
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return tn->child[i];
}
206
/* RCU-aware variant of tnode_get_child() for read-side walkers.
 * The temporary local is deliberate: rcu_dereference() needs an
 * lvalue to take the address of. */
static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
	struct node *ret = tnode_get_child(tn, i);

	return rcu_dereference(ret);
}
213
/* Number of child slots in 'tn', i.e. 2^bits. */
static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}
218
219 static inline t_key mask_pfx(t_key k, unsigned short l)
220 {
221 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
222 }
223
224 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225 {
226 if (offset < KEYLENGTH)
227 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
228 else
229 return 0;
230 }
231
232 static inline int tkey_equals(t_key a, t_key b)
233 {
234 return a == b;
235 }
236
237 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
238 {
239 if (bits == 0 || offset >= KEYLENGTH)
240 return 1;
241 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
242 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
243 }
244
245 static inline int tkey_mismatch(t_key a, int offset, t_key b)
246 {
247 t_key diff = a ^ b;
248 int i = offset;
249
250 if (!diff)
251 return 0;
252 while ((diff << i) >> (KEYLENGTH-1) == 0)
253 i++;
254 return i;
255 }
256
257 /*
258 To understand this stuff, an understanding of keys and all their bits is
259 necessary. Every node in the trie has a key associated with it, but not
260 all of the bits in that key are significant.
261
262 Consider a node 'n' and its parent 'tp'.
263
264 If n is a leaf, every bit in its key is significant. Its presence is
265 necessitated by path compression, since during a tree traversal (when
266 searching for a leaf - unless we are doing an insertion) we will completely
267 ignore all skipped bits we encounter. Thus we need to verify, at the end of
268 a potentially successful search, that we have indeed been walking the
269 correct key path.
270
271 Note that we can never "miss" the correct key in the tree if present by
272 following the wrong path. Path compression ensures that segments of the key
273 that are the same for all keys with a given prefix are skipped, but the
274 skipped part *is* identical for each node in the subtrie below the skipped
275 bit! trie_insert() in this implementation takes care of that - note the
276 call to tkey_sub_equals() in trie_insert().
277
278 if n is an internal node - a 'tnode' here, the various parts of its key
279 have many different meanings.
280
281 Example:
282 _________________________________________________________________
283 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
284 -----------------------------------------------------------------
285 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
286
287 _________________________________________________________________
288 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
289 -----------------------------------------------------------------
290 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
291
292 tp->pos = 7
293 tp->bits = 3
294 n->pos = 15
295 n->bits = 4
296
297 First, let's just ignore the bits that come before the parent tp, that is
298 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
299 not use them for anything.
300
301 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
302 index into the parent's child array. That is, they will be used to find
303 'n' among tp's children.
304
305 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
306 for the node n.
307
308 All the bits we have seen so far are significant to the node n. The rest
309 of the bits are really not needed or indeed known in n->key.
310
311 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
312 n's child array, and will of course be different for each child.
313
314
315 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
316 at this point.
317
318 */
319
320 static inline void check_tnode(const struct tnode *tn)
321 {
322 WARN_ON(tn && tn->pos+tn->bits > 32);
323 }
324
/* Occupancy thresholds (percent) that drive halve()/inflate() in
 * resize().  The root uses an eager inflate / lax halve pair to keep
 * the top of the trie larger. */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 25;
329
330
331 static void __alias_free_mem(struct rcu_head *head)
332 {
333 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
334 kmem_cache_free(fn_alias_kmem, fa);
335 }
336
/* Defer freeing of a fib_alias until after an RCU grace period. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
341
342 static void __leaf_free_rcu(struct rcu_head *head)
343 {
344 struct leaf *l = container_of(head, struct leaf, rcu);
345 kmem_cache_free(trie_leaf_kmem, l);
346 }
347
/* Defer freeing of a leaf until after an RCU-bh grace period
 * (note the _bh flavor, unlike the other free helpers here). */
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}
352
/* RCU callback: free a leaf_info allocated with kmalloc(). */
static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}
357
/* Defer freeing of a leaf_info until after an RCU grace period. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
362
363 static struct tnode *tnode_alloc(size_t size)
364 {
365 if (size <= PAGE_SIZE)
366 return kzalloc(size, GFP_KERNEL);
367 else
368 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
369 }
370
/* Workqueue callback: vfree() a large tnode from process context
 * (scheduled by __tnode_free_rcu for nodes bigger than a page). */
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}
376
377 static void __tnode_free_rcu(struct rcu_head *head)
378 {
379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits);
382
383 if (size <= PAGE_SIZE)
384 kfree(tn);
385 else {
386 INIT_WORK(&tn->work, __tnode_vfree);
387 schedule_work(&tn->work);
388 }
389 }
390
391 static inline void tnode_free(struct tnode *tn)
392 {
393 if (IS_LEAF(tn))
394 free_leaf((struct leaf *) tn);
395 else
396 call_rcu(&tn->rcu, __tnode_free_rcu);
397 }
398
399 static void tnode_free_safe(struct tnode *tn)
400 {
401 BUG_ON(IS_LEAF(tn));
402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits);
406 }
407
408 static void tnode_free_flush(void)
409 {
410 struct tnode *tn;
411
412 while ((tn = tnode_free_head)) {
413 tnode_free_head = tn->tnode_free;
414 tn->tnode_free = NULL;
415 tnode_free(tn);
416 }
417
418 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
419 tnode_free_size = 0;
420 synchronize_rcu();
421 }
422 }
423
424 static struct leaf *leaf_new(void)
425 {
426 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
427 if (l) {
428 l->parent = T_LEAF;
429 INIT_HLIST_HEAD(&l->list);
430 }
431 return l;
432 }
433
434 static struct leaf_info *leaf_info_new(int plen)
435 {
436 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
437 if (li) {
438 li->plen = plen;
439 INIT_LIST_HEAD(&li->falh);
440 }
441 return li;
442 }
443
444 static struct tnode *tnode_new(t_key key, int pos, int bits)
445 {
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
447 struct tnode *tn = tnode_alloc(sz);
448
449 if (tn) {
450 tn->parent = T_TNODE;
451 tn->pos = pos;
452 tn->bits = bits;
453 tn->key = key;
454 tn->full_children = 0;
455 tn->empty_children = 1<<bits;
456 }
457
458 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
459 (unsigned long) (sizeof(struct node) << bits));
460 return tn;
461 }
462
463 /*
464 * Check whether a tnode 'n' is "full", i.e. it is an internal node
465 * and no bits are skipped. See discussion in dyntree paper p. 6
466 */
467
468 static inline int tnode_full(const struct tnode *tn, const struct node *n)
469 {
470 if (n == NULL || IS_LEAF(n))
471 return 0;
472
473 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
474 }
475
/* Store child 'n' in slot i of 'tn'; wrapper around
 * tnode_put_child_reorg() that lets it recompute 'wasfull' (-1). */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
481
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * 'wasfull' is the previous tnode_full() state of the slot; pass -1
 * to have it recomputed here.
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	/* set the parent before publishing the child to readers */
	if (n)
		node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
516
/*
 * Rebalance tnode 'tn' after a change in its occupancy:
 *  - free it and return NULL when every child slot is empty,
 *  - path-compress and return the single child when only one is left,
 *  - otherwise inflate() while dense and halve() while sparse, each
 *    bounded to max_resize rounds.
 * Returns the node that should take tn's place in its parent.
 */
static struct node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	int err = 0;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_resize;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */
			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 *
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson
	 * of the Helsinki University of Technology and Matti Tikkanen of
	 * Nokia Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' is the percentage 'inflate_threshold'.  With
	 * to_be_doubled = tn->full_children (those children are doubled
	 * by inflate()), not_to_be_doubled the remaining non-empty
	 * children, and new_child_length = 2 * tnode_child_length(tn),
	 * the condition
	 *
	 *   100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length
	 *     >= inflate_threshold
	 *
	 * rewritten without division becomes the loop condition below:
	 *
	 *   50 * (tn->full_children + tnode_child_length(tn)
	 *         - tn->empty_children)
	 *     >= inflate_threshold * tnode_child_length(tn)
	 */

	check_tnode(tn);

	/* Keep root node larger */

	if (!tn->parent)
		inflate_threshold_use = inflate_threshold_root;
	else
		inflate_threshold_use = inflate_threshold;

	err = 0;
	max_resize = 10;
	while ((tn->full_children > 0 && max_resize-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		if (IS_ERR(tn)) {
			/* out of memory: keep the old node untouched */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	if (max_resize < 0) {
		if (!tn->parent)
			pr_warning("Fix inflate_threshold_root."
				   " Now=%d size=%d bits\n",
				   inflate_threshold_root, tn->bits);
		else
			pr_warning("Fix inflate_threshold."
				   " Now=%d size=%d bits\n",
				   inflate_threshold, tn->bits);
	}

	check_tnode(tn);

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	/* Keep root node larger */

	if (!tn->parent)
		halve_threshold_use = halve_threshold_root;
	else
		halve_threshold_use = halve_threshold;

	err = 0;
	max_resize = 10;
	while (tn->bits > 1 && max_resize-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			/* out of memory: keep the old node untouched */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	if (max_resize < 0) {
		if (!tn->parent)
			pr_warning("Fix halve_threshold_root."
				   " Now=%d size=%d bits\n",
				   halve_threshold_root, tn->bits);
		else
			pr_warning("Fix halve_threshold."
				   " Now=%d size=%d bits\n",
				   halve_threshold, tn->bits);
	}

	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */

			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}

	return (struct node *) tn;
}
715
/*
 * Replace 'tn' with a new tnode that uses one more index bit (twice as
 * many children), redistributing the old children.  All new tnodes are
 * allocated up front so an allocation failure leaves the trie
 * untouched; returns the new tnode or ERR_PTR(-ENOMEM).
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* single "one" bit at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		    tn->pos + tn->bits - 1) {
			/* the extra index bit decides which slot it lands in */
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
		 */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* free the partially built replacement, keep the old node */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
858
/*
 * Replace 'tn' with a new tnode that uses one fewer index bit (half as
 * many children); sibling pairs that are both non-empty get a new
 * one-bit binary tnode.  All new tnodes are allocated up front so an
 * allocation failure leaves the trie untouched; returns the new tnode
 * or ERR_PTR(-ENOMEM).
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct node *)newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children: fill the binary node
		 * preallocated in the first pass */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* free the partially built replacement, keep the old node */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
940
/* Readers must hold rcu_read_lock; currently that is the dump
   routines, via get_fa_head and dump */
943
944 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
945 {
946 struct hlist_head *head = &l->list;
947 struct hlist_node *node;
948 struct leaf_info *li;
949
950 hlist_for_each_entry_rcu(li, node, head, hlist)
951 if (li->plen == plen)
952 return li;
953
954 return NULL;
955 }
956
957 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
958 {
959 struct leaf_info *li = find_leaf_info(l, plen);
960
961 if (!li)
962 return NULL;
963
964 return &li->falh;
965 }
966
/* Insert 'new' into the leaf's info list, which is kept sorted by
 * descending prefix length; uses the RCU-safe hlist insert variants
 * so concurrent readers always see a consistent list. */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		/* 'last' is the last entry with a longer prefix, if any */
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
987
/* rcu_read_lock must be held by the caller on the read side */
989
/*
 * Walk the trie looking for the leaf whose key equals 'key' exactly;
 * returns NULL when no such leaf exists.  Skipped bits are verified
 * at each tnode via tkey_sub_equals(), and the full key is compared
 * once a leaf is reached (see the long comment on path compression
 * earlier in this file).
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
	n = rcu_dereference(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			/* key matches all bits up to tn->pos; descend */
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
1021
/*
 * Walk from 'tn' up to the root, resize()-ing each tnode on the path
 * and re-linking the result into its parent.  The root pointer is
 * republished whenever the top node changes, and the deferred-free
 * list is flushed after each step.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct node *)tn, wasfull);

		tp = node_parent((struct node *) tn);
		if (!tp)
			rcu_assign_pointer(t->trie, (struct node *)tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode *)resize(t, (struct tnode *)tn);

	rcu_assign_pointer(t->trie, (struct node *)tn);
	tnode_free_flush();

	return;
}
1057
/* only used from updater-side */

/*
 * Find or create the leaf for 'key' and attach a new leaf_info for
 * prefix length 'plen', inserting an intermediate tnode where the key
 * diverges from the existing trie; finally rebalance.  Returns the
 * fib_alias list head of the leaf_info, or NULL on allocation failure.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = t->trie;

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 * Add a new tnode here
		 * first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* branch at the first bit where the keys differ */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct node *)tn);
		} else {
			/* new tnode becomes the root */
			rcu_assign_pointer(t->trie, (struct node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1207
1208 /*
1209 * Caller must hold RTNL.
1210 */
/*
 * fn_trie_insert() - insert the route described by @cfg into table @tb.
 *
 * Validates the destination prefix, builds a fib_info, then either
 * replaces/extends an existing alias list for the (prefix, tos,
 * priority) key or creates a brand-new leaf via fib_insert_node().
 * Returns 0 on success or a negative errno.
 */
1211 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1212 {
1213 struct trie *t = (struct trie *) tb->tb_data;
1214 struct fib_alias *fa, *new_fa;
1215 struct list_head *fa_head = NULL;
1216 struct fib_info *fi;
1217 int plen = cfg->fc_dst_len;
1218 u8 tos = cfg->fc_tos;
1219 u32 key, mask;
1220 int err;
1221 struct leaf *l;
1222 
1223 if (plen > 32)
1224 return -EINVAL;
1225 
1226 key = ntohl(cfg->fc_dst);
1227 
1228 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1229 
1230 mask = ntohl(inet_make_mask(plen));
1231 
/* Reject a destination with host bits set beyond the prefix length. */
1232 if (key & ~mask)
1233 return -EINVAL;
1234 
1235 key = key & mask;
1236 
1237 fi = fib_create_info(cfg);
1238 if (IS_ERR(fi)) {
1239 err = PTR_ERR(fi);
1240 goto err;
1241 }
1242 
1243 l = fib_find_node(t, key);
1244 fa = NULL;
1245 
1246 if (l) {
1247 fa_head = get_fa_head(l, plen);
1248 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1249 }
1250 
1251 /* Now fa, if non-NULL, points to the first fib alias
1252 * with the same keys [prefix,tos,priority], if such key already
1253 * exists or to the node before which we will insert new one.
1254 *
1255 * If fa is NULL, we will need to allocate a new one and
1256 * insert to the head of f.
1257 *
1258 * If f is NULL, no fib node matched the destination key
1259 * and we need to allocate a new one of those as well.
1260 */
1261 
1262 if (fa && fa->fa_tos == tos &&
1263 fa->fa_info->fib_priority == fi->fib_priority) {
1264 struct fib_alias *fa_first, *fa_match;
1265 
1266 err = -EEXIST;
1267 if (cfg->fc_nlflags & NLM_F_EXCL)
1268 goto out;
1269 
1270 /* We have 2 goals:
1271 * 1. Find exact match for type, scope, fib_info to avoid
1272 * duplicate routes
1273 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1274 */
1275 fa_match = NULL;
1276 fa_first = fa;
/* Step back one entry so the _continue iteration starts at fa itself. */
1277 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1278 list_for_each_entry_continue(fa, fa_head, fa_list) {
1279 if (fa->fa_tos != tos)
1280 break;
1281 if (fa->fa_info->fib_priority != fi->fib_priority)
1282 break;
1283 if (fa->fa_type == cfg->fc_type &&
1284 fa->fa_scope == cfg->fc_scope &&
1285 fa->fa_info == fi) {
1286 fa_match = fa;
1287 break;
1288 }
1289 }
1290 
1291 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1292 struct fib_info *fi_drop;
1293 u8 state;
1294 
1295 fa = fa_first;
1296 if (fa_match) {
/* Replacing an entry with an identical one is a no-op, not an error. */
1297 if (fa == fa_match)
1298 err = 0;
1299 goto out;
1300 }
1301 err = -ENOBUFS;
1302 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1303 if (new_fa == NULL)
1304 goto out;
1305 
1306 fi_drop = fa->fa_info;
1307 new_fa->fa_tos = fa->fa_tos;
1308 new_fa->fa_info = fi;
1309 new_fa->fa_type = cfg->fc_type;
1310 new_fa->fa_scope = cfg->fc_scope;
1311 state = fa->fa_state;
1312 new_fa->fa_state = state & ~FA_S_ACCESSED;
1313 
/* RCU-safe swap: readers see either the old or the new alias. */
1314 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1315 alias_free_mem_rcu(fa);
1316 
1317 fib_release_info(fi_drop);
1318 if (state & FA_S_ACCESSED)
1319 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1320 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1321 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1322 
1323 goto succeeded;
1324 }
1325 /* Error if we find a perfect match which
1326 * uses the same scope, type, and nexthop
1327 * information.
1328 */
1329 if (fa_match)
1330 goto out;
1331 
1332 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1333 fa = fa_first;
1334 }
/* No existing alias to reuse: creating one requires NLM_F_CREATE. */
1335 err = -ENOENT;
1336 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1337 goto out;
1338 
1339 err = -ENOBUFS;
1340 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1341 if (new_fa == NULL)
1342 goto out;
1343 
1344 new_fa->fa_info = fi;
1345 new_fa->fa_tos = tos;
1346 new_fa->fa_type = cfg->fc_type;
1347 new_fa->fa_scope = cfg->fc_scope;
1348 new_fa->fa_state = 0;
1349 /*
1350 * Insert new entry to the list.
1351 */
1352 
1353 if (!fa_head) {
1354 fa_head = fib_insert_node(t, key, plen);
1355 if (unlikely(!fa_head)) {
1356 err = -ENOMEM;
1357 goto out_free_new_fa;
1358 }
1359 }
1360 
1361 list_add_tail_rcu(&new_fa->fa_list,
1362 (fa ? &fa->fa_list : fa_head));
1363 
1364 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1365 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1366 &cfg->fc_nlinfo, 0);
1367 succeeded:
1368 return 0;
1369 
1370 out_free_new_fa:
1371 kmem_cache_free(fn_alias_kmem, new_fa);
1372 out:
1373 fib_release_info(fi);
1374 err:
1375 return err;
1376 }
1377
1378 /* should be called with rcu_read_lock */
/*
 * check_leaf() - try to match flow @flp against the prefixes stored in
 * leaf @l.  Walks the leaf's per-prefix-length leaf_info entries and
 * defers to fib_semantic_match() for those covering @key.
 * Returns fib_semantic_match()'s result (<= 0) on a hit, or 1 when no
 * prefix in this leaf matches.
 */
1379 static int check_leaf(struct trie *t, struct leaf *l,
1380 t_key key, const struct flowi *flp,
1381 struct fib_result *res)
1382 {
1383 struct leaf_info *li;
1384 struct hlist_head *hhead = &l->list;
1385 struct hlist_node *node;
1386 
1387 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1388 int err;
1389 int plen = li->plen;
1390 __be32 mask = inet_make_mask(plen);
1391 
/* Skip prefixes that do not cover the looked-up key. */
1392 if (l->key != (key & ntohl(mask)))
1393 continue;
1394 
1395 err = fib_semantic_match(&li->falh, flp, res, plen);
1396 
1397 #ifdef CONFIG_IP_FIB_TRIE_STATS
1398 if (err <= 0)
1399 t->stats.semantic_match_passed++;
1400 else
1401 t->stats.semantic_match_miss++;
1402 #endif
1403 if (err <= 0)
1404 return err;
1405 }
1406 
1407 return 1;
1408 }
1409
/*
 * fn_trie_lookup() - LPC-trie route lookup for flow @flp in table @tb.
 *
 * Walks the trie under rcu_read_lock, descending by extracting index
 * bits from the destination key, and backtracks (chopping index bits
 * and eventually climbing to the parent) whenever a subtree cannot
 * contain a matching prefix.  On a leaf hit the result of
 * check_leaf() is returned; 1 means no route matched.
 */
1410 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1411 struct fib_result *res)
1412 {
1413 struct trie *t = (struct trie *) tb->tb_data;
1414 int ret;
1415 struct node *n;
1416 struct tnode *pn;
1417 int pos, bits;
1418 t_key key = ntohl(flp->fl4_dst);
1419 int chopped_off;
1420 t_key cindex = 0;
1421 int current_prefix_length = KEYLENGTH;
1422 struct tnode *cn;
1423 t_key node_prefix, key_prefix, pref_mismatch;
1424 int mp;
1425 
1426 rcu_read_lock();
1427 
1428 n = rcu_dereference(t->trie);
1429 if (!n)
1430 goto failed;
1431 
1432 #ifdef CONFIG_IP_FIB_TRIE_STATS
1433 t->stats.gets++;
1434 #endif
1435 
1436 /* Just a leaf? */
1437 if (IS_LEAF(n)) {
1438 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1439 goto found;
1440 }
1441 
1442 pn = (struct tnode *) n;
1443 chopped_off = 0;
1444 
1445 while (pn) {
1446 pos = pn->pos;
1447 bits = pn->bits;
1448 
/* After a backtrack, cindex already holds the (chopped) index to try. */
1449 if (!chopped_off)
1450 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1451 pos, bits);
1452 
1453 n = tnode_get_child(pn, cindex);
1454 
1455 if (n == NULL) {
1456 #ifdef CONFIG_IP_FIB_TRIE_STATS
1457 t->stats.null_node_hit++;
1458 #endif
1459 goto backtrace;
1460 }
1461 
1462 if (IS_LEAF(n)) {
1463 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
/* ret > 0: leaf had no matching prefix, keep searching shorter ones. */
1464 if (ret > 0)
1465 goto backtrace;
1466 goto found;
1467 }
1468 
1469 cn = (struct tnode *)n;
1470 
1471 /*
1472 * It's a tnode, and we can do some extra checks here if we
1473 * like, to avoid descending into a dead-end branch.
1474 * This tnode is in the parent's child array at index
1475 * key[p_pos..p_pos+p_bits] but potentially with some bits
1476 * chopped off, so in reality the index may be just a
1477 * subprefix, padded with zero at the end.
1478 * We can also take a look at any skipped bits in this
1479 * tnode - everything up to p_pos is supposed to be ok,
1480 * and the non-chopped bits of the index (se previous
1481 * paragraph) are also guaranteed ok, but the rest is
1482 * considered unknown.
1483 *
1484 * The skipped bits are key[pos+bits..cn->pos].
1485 */
1486 
1487 /* If current_prefix_length < pos+bits, we are already doing
1488 * actual prefix matching, which means everything from
1489 * pos+(bits-chopped_off) onward must be zero along some
1490 * branch of this subtree - otherwise there is *no* valid
1491 * prefix present. Here we can only check the skipped
1492 * bits. Remember, since we have already indexed into the
1493 * parent's child array, we know that the bits we chopped of
1494 * *are* zero.
1495 */
1496 
1497 /* NOTA BENE: Checking only skipped bits
1498 for the new node here */
1499 
1500 if (current_prefix_length < pos+bits) {
1501 if (tkey_extract_bits(cn->key, current_prefix_length,
1502 cn->pos - current_prefix_length)
1503 || !(cn->child[0]))
1504 goto backtrace;
1505 }
1506 
1507 /*
1508 * If chopped_off=0, the index is fully validated and we
1509 * only need to look at the skipped bits for this, the new,
1510 * tnode. What we actually want to do is to find out if
1511 * these skipped bits match our key perfectly, or if we will
1512 * have to count on finding a matching prefix further down,
1513 * because if we do, we would like to have some way of
1514 * verifying the existence of such a prefix at this point.
1515 */
1516 
1517 /* The only thing we can do at this point is to verify that
1518 * any such matching prefix can indeed be a prefix to our
1519 * key, and if the bits in the node we are inspecting that
1520 * do not match our key are not ZERO, this cannot be true.
1521 * Thus, find out where there is a mismatch (before cn->pos)
1522 * and verify that all the mismatching bits are zero in the
1523 * new tnode's key.
1524 */
1525 
1526 /*
1527 * Note: We aren't very concerned about the piece of
1528 * the key that precede pn->pos+pn->bits, since these
1529 * have already been checked. The bits after cn->pos
1530 * aren't checked since these are by definition
1531 * "unknown" at this point. Thus, what we want to see
1532 * is if we are about to enter the "prefix matching"
1533 * state, and in that case verify that the skipped
1534 * bits that will prevail throughout this subtree are
1535 * zero, as they have to be if we are to find a
1536 * matching prefix.
1537 */
1538 
1539 node_prefix = mask_pfx(cn->key, cn->pos);
1540 key_prefix = mask_pfx(key, cn->pos);
1541 pref_mismatch = key_prefix^node_prefix;
1542 mp = 0;
1543 
1544 /*
1545 * In short: If skipped bits in this node do not match
1546 * the search key, enter the "prefix matching"
1547 * state.directly.
1548 */
1549 if (pref_mismatch) {
/* Find the bit position of the first (most significant) mismatch. */
1550 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1551 mp++;
1552 pref_mismatch = pref_mismatch << 1;
1553 }
1554 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1555 
1556 if (key_prefix != 0)
1557 goto backtrace;
1558 
1559 if (current_prefix_length >= cn->pos)
1560 current_prefix_length = mp;
1561 }
1562 
1563 pn = (struct tnode *)n; /* Descend */
1564 chopped_off = 0;
1565 continue;
1566 
1567 backtrace:
1568 chopped_off++;
1569 
1570 /* As zero don't change the child key (cindex) */
1571 while ((chopped_off <= pn->bits)
1572 && !(cindex & (1<<(chopped_off-1))))
1573 chopped_off++;
1574 
1575 /* Decrease current_... with bits chopped off */
1576 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1577 current_prefix_length = pn->pos + pn->bits
1578 - chopped_off;
1579 
1580 /*
1581 * Either we do the actual chop off according or if we have
1582 * chopped off all bits in this tnode walk up to our parent.
1583 */
1584 
1585 if (chopped_off <= pn->bits) {
1586 cindex &= ~(1 << (chopped_off-1));
1587 } else {
1588 struct tnode *parent = node_parent((struct node *) pn);
1589 if (!parent)
1590 goto failed;
1591 
1592 /* Get Child's index */
1593 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1594 pn = parent;
1595 chopped_off = 0;
1596 
1597 #ifdef CONFIG_IP_FIB_TRIE_STATS
1598 t->stats.backtrack++;
1599 #endif
1600 goto backtrace;
1601 }
1602 }
1603 failed:
1604 ret = 1;
1605 found:
1606 rcu_read_unlock();
1607 return ret;
1608 }
1609
1610 /*
1611 * Remove the leaf and return parent.
1612 */
1613 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1614 {
1615 struct tnode *tp = node_parent((struct node *) l);
1616
1617 pr_debug("entering trie_leaf_remove(%p)\n", l);
1618
1619 if (tp) {
1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1621 put_child(t, (struct tnode *)tp, cindex, NULL);
1622 trie_rebalance(t, tp);
1623 } else
1624 rcu_assign_pointer(t->trie, NULL);
1625
1626 free_leaf(l);
1627 }
1628
1629 /*
1630 * Caller must hold RTNL.
1631 */
/*
 * fn_trie_delete() - remove the route described by @cfg from @tb.
 *
 * Finds the leaf for the masked destination, scans its alias list for
 * an entry matching tos/type/scope/protocol/nexthop, unlinks it and
 * prunes the leaf_info and the leaf itself when they become empty.
 * Returns 0 on success or a negative errno (-ESRCH when not found).
 */
1632 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1633 {
1634 struct trie *t = (struct trie *) tb->tb_data;
1635 u32 key, mask;
1636 int plen = cfg->fc_dst_len;
1637 u8 tos = cfg->fc_tos;
1638 struct fib_alias *fa, *fa_to_delete;
1639 struct list_head *fa_head;
1640 struct leaf *l;
1641 struct leaf_info *li;
1642 
1643 if (plen > 32)
1644 return -EINVAL;
1645 
1646 key = ntohl(cfg->fc_dst);
1647 mask = ntohl(inet_make_mask(plen));
1648 
1649 if (key & ~mask)
1650 return -EINVAL;
1651 
1652 key = key & mask;
1653 l = fib_find_node(t, key);
1654 
1655 if (!l)
1656 return -ESRCH;
1657 
1658 fa_head = get_fa_head(l, plen);
1659 fa = fib_find_alias(fa_head, tos, 0);
1660 
1661 if (!fa)
1662 return -ESRCH;
1663 
1664 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1665 
1666 fa_to_delete = NULL;
/* Step back one entry so the _continue iteration starts at fa itself. */
1667 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1668 list_for_each_entry_continue(fa, fa_head, fa_list) {
1669 struct fib_info *fi = fa->fa_info;
1670 
1671 if (fa->fa_tos != tos)
1672 break;
1673 
/* Unset fields in @cfg act as wildcards for the match below. */
1674 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1675 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1676 fa->fa_scope == cfg->fc_scope) &&
1677 (!cfg->fc_protocol ||
1678 fi->fib_protocol == cfg->fc_protocol) &&
1679 fib_nh_match(cfg, fi) == 0) {
1680 fa_to_delete = fa;
1681 break;
1682 }
1683 }
1684 
1685 if (!fa_to_delete)
1686 return -ESRCH;
1687 
1688 fa = fa_to_delete;
1689 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1690 &cfg->fc_nlinfo, 0);
1691 
1692 l = fib_find_node(t, key);
1693 li = find_leaf_info(l, plen);
1694 
1695 list_del_rcu(&fa->fa_list);
1696 
/* Drop the per-prefix-length bucket once its alias list empties. */
1697 if (list_empty(fa_head)) {
1698 hlist_del_rcu(&li->hlist);
1699 free_leaf_info(li);
1700 }
1701 
/* And drop the leaf itself once it holds no prefixes at all. */
1702 if (hlist_empty(&l->list))
1703 trie_leaf_remove(t, l);
1704 
1705 if (fa->fa_state & FA_S_ACCESSED)
1706 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1707 
1708 fib_release_info(fa->fa_info);
1709 alias_free_mem_rcu(fa);
1710 return 0;
1711 }
1712
1713 static int trie_flush_list(struct list_head *head)
1714 {
1715 struct fib_alias *fa, *fa_node;
1716 int found = 0;
1717
1718 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1719 struct fib_info *fi = fa->fa_info;
1720
1721 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1722 list_del_rcu(&fa->fa_list);
1723 fib_release_info(fa->fa_info);
1724 alias_free_mem_rcu(fa);
1725 found++;
1726 }
1727 }
1728 return found;
1729 }
1730
/*
 * trie_flush_leaf() - flush dead aliases from every prefix length
 * stored in leaf @l, freeing any leaf_info whose alias list becomes
 * empty.  Returns the number of aliases removed.
 */
1731 static int trie_flush_leaf(struct leaf *l)
1732 {
1733 int found = 0;
1734 struct hlist_head *lih = &l->list;
1735 struct hlist_node *node, *tmp;
1736 struct leaf_info *li = NULL;
1737 
1738 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1739 found += trie_flush_list(&li->falh);
1740 
1741 if (list_empty(&li->falh)) {
1742 hlist_del_rcu(&li->hlist);
1743 free_leaf_info(li);
1744 }
1745 }
1746 return found;
1747 }
1748
1749 /*
1750 * Scan for the next right leaf starting at node p->child[idx]
1751 * Since we have back pointer, no recursion necessary.
1752 */
1753 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1754 {
1755 do {
1756 t_key idx;
1757
1758 if (c)
1759 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1760 else
1761 idx = 0;
1762
1763 while (idx < 1u << p->bits) {
1764 c = tnode_get_child_rcu(p, idx++);
1765 if (!c)
1766 continue;
1767
1768 if (IS_LEAF(c)) {
1769 prefetch(p->child[idx]);
1770 return (struct leaf *) c;
1771 }
1772
1773 /* Rescan start scanning in new node */
1774 p = (struct tnode *) c;
1775 idx = 0;
1776 }
1777
1778 /* Node empty, walk back up to parent */
1779 c = (struct node *) p;
1780 } while ( (p = node_parent_rcu(c)) != NULL);
1781
1782 return NULL; /* Root of trie */
1783 }
1784
1785 static struct leaf *trie_firstleaf(struct trie *t)
1786 {
1787 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1788
1789 if (!n)
1790 return NULL;
1791
1792 if (IS_LEAF(n)) /* trie is just a leaf */
1793 return (struct leaf *) n;
1794
1795 return leaf_walk_rcu(n, NULL);
1796 }
1797
1798 static struct leaf *trie_nextleaf(struct leaf *l)
1799 {
1800 struct node *c = (struct node *) l;
1801 struct tnode *p = node_parent(c);
1802
1803 if (!p)
1804 return NULL; /* trie with just one leaf */
1805
1806 return leaf_walk_rcu(p, c);
1807 }
1808
/*
 * Return the @index'th leaf (0-based, in key order) of trie @t, or
 * NULL when the trie holds fewer leaves.
 */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *cursor;

	for (cursor = trie_firstleaf(t); cursor && index > 0; index--)
		cursor = trie_nextleaf(cursor);

	return cursor;
}
1818
1819
1820 /*
1821 * Caller must hold RTNL.
1822 */
/*
 * fn_trie_flush() - remove all dead routes from table @tb.
 * Returns the number of aliases flushed.
 *
 * Emptied-leaf removal is deferred by one iteration (tracked in @ll):
 * the previous leaf is only removed after the walk has advanced past
 * it, so trie_nextleaf() is never called on a just-freed leaf.
 */
1823 static int fn_trie_flush(struct fib_table *tb)
1824 {
1825 struct trie *t = (struct trie *) tb->tb_data;
1826 struct leaf *l, *ll = NULL;
1827 int found = 0;
1828 
1829 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1830 found += trie_flush_leaf(l);
1831 
1832 if (ll && hlist_empty(&ll->list))
1833 trie_leaf_remove(t, ll);
1834 ll = l;
1835 }
1836 
/* Handle the final leaf, which the loop above never revisits. */
1837 if (ll && hlist_empty(&ll->list))
1838 trie_leaf_remove(t, ll);
1839 
1840 pr_debug("trie_flush found=%d\n", found);
1841 return found;
1842 }
1843
/*
 * fn_trie_select_default() - choose a default route among the unicast
 * aliases stored under the 0.0.0.0/0 leaf.  Prefers a live fib_info,
 * using fib_detect_death() to skip dead candidates and falling back
 * to a "last resort" entry when needed.  Updates @res and records the
 * chosen position in tb->tb_default (-1 when nothing usable).
 */
1844 static void fn_trie_select_default(struct fib_table *tb,
1845 const struct flowi *flp,
1846 struct fib_result *res)
1847 {
1848 struct trie *t = (struct trie *) tb->tb_data;
1849 int order, last_idx;
1850 struct fib_info *fi = NULL;
1851 struct fib_info *last_resort;
1852 struct fib_alias *fa = NULL;
1853 struct list_head *fa_head;
1854 struct leaf *l;
1855 
1856 last_idx = -1;
1857 last_resort = NULL;
1858 order = -1;
1859 
1860 rcu_read_lock();
1861 
/* Default routes live in the leaf for key 0 (0.0.0.0/0). */
1862 l = fib_find_node(t, 0);
1863 if (!l)
1864 goto out;
1865 
1866 fa_head = get_fa_head(l, 0);
1867 if (!fa_head)
1868 goto out;
1869 
1870 if (list_empty(fa_head))
1871 goto out;
1872 
1873 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1874 struct fib_info *next_fi = fa->fa_info;
1875 
1876 if (fa->fa_scope != res->scope ||
1877 fa->fa_type != RTN_UNICAST)
1878 continue;
1879 
1880 if (next_fi->fib_priority > res->fi->fib_priority)
1881 break;
/* Only gatewayed, link-scope nexthops qualify as defaults here. */
1882 if (!next_fi->fib_nh[0].nh_gw ||
1883 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1884 continue;
1885 fa->fa_state |= FA_S_ACCESSED;
1886 
1887 if (fi == NULL) {
1888 if (next_fi != res->fi)
1889 break;
1890 } else if (!fib_detect_death(fi, order, &last_resort,
1891 &last_idx, tb->tb_default)) {
1892 fib_result_assign(res, fi);
1893 tb->tb_default = order;
1894 goto out;
1895 }
1896 fi = next_fi;
1897 order++;
1898 }
1899 if (order <= 0 || fi == NULL) {
1900 tb->tb_default = -1;
1901 goto out;
1902 }
1903 
1904 if (!fib_detect_death(fi, order, &last_resort,
1905 tb->tb_default)) {
1906 fib_result_assign(res, fi);
1907 tb->tb_default = order;
1908 goto out;
1909 }
/* Everything looked dead: fall back to the last-resort candidate. */
1910 if (last_idx >= 0)
1911 fib_result_assign(res, last_resort);
1912 tb->tb_default = last_idx;
1913 out:
1914 rcu_read_unlock();
1915 }
1916
/*
 * fn_trie_dump_fa() - emit one RTM_NEWROUTE record per alias on @fah
 * into @skb, resuming at the alias index saved in cb->args[5].
 * Returns skb->len on completion, or -1 when the skb filled up (the
 * current position is saved so the next dump call can resume).
 */
1917 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1918 struct fib_table *tb,
1919 struct sk_buff *skb, struct netlink_callback *cb)
1920 {
1921 int i, s_i;
1922 struct fib_alias *fa;
1923 __be32 xkey = htonl(key);
1924 
1925 s_i = cb->args[5];
1926 i = 0;
1927 
1928 /* rcu_read_lock is hold by caller */
1929 
1930 list_for_each_entry_rcu(fa, fah, fa_list) {
/* Skip aliases already emitted by a previous dump invocation. */
1931 if (i < s_i) {
1932 i++;
1933 continue;
1934 }
1935 
1936 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1937 cb->nlh->nlmsg_seq,
1938 RTM_NEWROUTE,
1939 tb->tb_id,
1940 fa->fa_type,
1941 fa->fa_scope,
1942 xkey,
1943 plen,
1944 fa->fa_tos,
1945 fa->fa_info, NLM_F_MULTI) < 0) {
1946 cb->args[5] = i;
1947 return -1;
1948 }
1949 i++;
1950 }
1951 cb->args[5] = i;
1952 return skb->len;
1953 }
1954
/*
 * fn_trie_dump_leaf() - dump every prefix length stored in leaf @l,
 * resuming at the leaf_info index saved in cb->args[4] and resetting
 * the per-alias cursor (cb->args[5]) whenever a new leaf_info is
 * entered.  Returns skb->len, or -1 when the skb filled up.
 */
1955 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1956 struct sk_buff *skb, struct netlink_callback *cb)
1957 {
1958 struct leaf_info *li;
1959 struct hlist_node *node;
1960 int i, s_i;
1961 
1962 s_i = cb->args[4];
1963 i = 0;
1964 
1965 /* rcu_read_lock is hold by caller */
1966 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1967 if (i < s_i) {
1968 i++;
1969 continue;
1970 }
1971 
/* Moving past the resume point: restart the alias cursor. */
1972 if (i > s_i)
1973 cb->args[5] = 0;
1974 
1975 if (list_empty(&li->falh))
1976 continue;
1977 
1978 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1979 cb->args[4] = i;
1980 return -1;
1981 }
1982 i++;
1983 }
1984 
1985 cb->args[4] = i;
1986 return skb->len;
1987 }
1988
/*
 * fn_trie_dump() - netlink dump of all routes in table @tb.
 *
 * Resumes from the leaf key saved in cb->args[2]; if that leaf has
 * vanished since the previous call, falls back to a positional rescan
 * via trie_leafindex() using the leaf count in cb->args[3].  Clears
 * the lower cursor args when advancing to the next leaf.
 */
1989 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1990 struct netlink_callback *cb)
1991 {
1992 struct leaf *l;
1993 struct trie *t = (struct trie *) tb->tb_data;
1994 t_key key = cb->args[2];
1995 int count = cb->args[3];
1996 
1997 rcu_read_lock();
1998 /* Dump starting at last key.
1999 * Note: 0.0.0.0/0 (ie default) is first key.
2000 */
2001 if (count == 0)
2002 l = trie_firstleaf(t);
2003 else {
2004 /* Normally, continue from last key, but if that is missing
2005 * fallback to using slow rescan
2006 */
2007 l = fib_find_node(t, key);
2008 if (!l)
2009 l = trie_leafindex(t, count);
2010 }
2011 
2012 while (l) {
2013 cb->args[2] = l->key;
2014 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2015 cb->args[3] = count;
2016 rcu_read_unlock();
2017 return -1;
2018 }
2019 
2020 ++count;
2021 l = trie_nextleaf(l);
/* New leaf: reset the leaf_info/alias cursors (args[4] and up). */
2022 memset(&cb->args[4], 0,
2023 sizeof(cb->args) - 4*sizeof(cb->args[0]));
2024 }
2025 cb->args[3] = count;
2026 rcu_read_unlock();
2027 
2028 return skb->len;
2029 }
2030
2031 void __init fib_hash_init(void)
2032 {
2033 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2034 sizeof(struct fib_alias),
2035 0, SLAB_PANIC, NULL);
2036
2037 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2038 max(sizeof(struct leaf),
2039 sizeof(struct leaf_info)),
2040 0, SLAB_PANIC, NULL);
2041 }
2042
2043
2044 /* Fix more generic FIB names for init later */
2045 struct fib_table *fib_hash_table(u32 id)
2046 {
2047 struct fib_table *tb;
2048 struct trie *t;
2049
2050 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2051 GFP_KERNEL);
2052 if (tb == NULL)
2053 return NULL;
2054
2055 tb->tb_id = id;
2056 tb->tb_default = -1;
2057 tb->tb_lookup = fn_trie_lookup;
2058 tb->tb_insert = fn_trie_insert;
2059 tb->tb_delete = fn_trie_delete;
2060 tb->tb_flush = fn_trie_flush;
2061 tb->tb_select_default = fn_trie_select_default;
2062 tb->tb_dump = fn_trie_dump;
2063
2064 t = (struct trie *) tb->tb_data;
2065 memset(t, 0, sizeof(*t));
2066
2067 if (id == RT_TABLE_LOCAL)
2068 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2069
2070 return tb;
2071 }
2072
2073 #ifdef CONFIG_PROC_FS
2074 /* Depth first Trie walk iterator */
2075 struct fib_trie_iter {
2076 struct seq_net_private p;
2077 struct fib_table *tb;		/* table currently being walked */
2078 struct tnode *tnode;		/* current parent tnode (NULL for a lone-leaf trie) */
2079 unsigned index;		/* next child slot of tnode to visit */
2080 unsigned depth;		/* depth of tnode within the trie */
2081 };
2082
/*
 * fib_trie_get_next() - advance @iter one step in the depth-first walk
 * and return the node reached, or NULL when the walk is finished.
 * A leaf advances the sibling index; a tnode is descended into; an
 * exhausted tnode pops back up to its parent and continues there.
 */
2083 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2084 {
2085 struct tnode *tn = iter->tnode;
2086 unsigned cindex = iter->index;
2087 struct tnode *p;
2088 
2089 /* A single entry routing table */
2090 if (!tn)
2091 return NULL;
2092 
2093 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2094 iter->tnode, iter->index, iter->depth);
2095 rescan:
2096 while (cindex < (1<<tn->bits)) {
2097 struct node *n = tnode_get_child_rcu(tn, cindex);
2098 
2099 if (n) {
2100 if (IS_LEAF(n)) {
2101 iter->tnode = tn;
2102 iter->index = cindex + 1;
2103 } else {
2104 /* push down one level */
2105 iter->tnode = (struct tnode *) n;
2106 iter->index = 0;
2107 ++iter->depth;
2108 }
2109 return n;
2110 }
2111 
2112 ++cindex;
2113 }
2114 
2115 /* Current node exhausted, pop back up */
2116 p = node_parent_rcu((struct node *)tn);
2117 if (p) {
/* Resume in the parent just after the slot tn occupied. */
2118 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2119 tn = p;
2120 --iter->depth;
2121 goto rescan;
2122 }
2123 
2124 /* got root? */
2125 return NULL;
2126 }
2127
2128 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2129 struct trie *t)
2130 {
2131 struct node *n;
2132
2133 if (!t)
2134 return NULL;
2135
2136 n = rcu_dereference(t->trie);
2137 if (!n)
2138 return NULL;
2139
2140 if (IS_TNODE(n)) {
2141 iter->tnode = (struct tnode *) n;
2142 iter->index = 0;
2143 iter->depth = 1;
2144 } else {
2145 iter->tnode = NULL;
2146 iter->index = 0;
2147 iter->depth = 0;
2148 }
2149
2150 return n;
2151 }
2152
/*
 * trie_collect_stats() - walk the whole trie under RCU and fill @s
 * with counts of leaves, prefixes, tnodes and NULL child pointers,
 * the tnode size histogram, and depth statistics.
 */
2153 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2154 {
2155 struct node *n;
2156 struct fib_trie_iter iter;
2157 
2158 memset(s, 0, sizeof(*s));
2159 
2160 rcu_read_lock();
2161 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2162 if (IS_LEAF(n)) {
2163 struct leaf *l = (struct leaf *)n;
2164 struct leaf_info *li;
2165 struct hlist_node *tmp;
2166 
2167 s->leaves++;
2168 s->totdepth += iter.depth;
2169 if (iter.depth > s->maxdepth)
2170 s->maxdepth = iter.depth;
2171 
/* Each leaf_info is one stored prefix length (one prefix). */
2172 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2173 ++s->prefixes;
2174 } else {
2175 const struct tnode *tn = (const struct tnode *) n;
2176 int i;
2177 
2178 s->tnodes++;
2179 if (tn->bits < MAX_STAT_DEPTH)
2180 s->nodesizes[tn->bits]++;
2181 
2182 for (i = 0; i < (1<<tn->bits); i++)
2183 if (!tn->child[i])
2184 s->nullpointers++;
2185 }
2186 }
2187 rcu_read_unlock();
2188 }
2189
2190 /*
2191 * This outputs /proc/net/fib_triestats
2192 */
2193 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2194 {
2195 unsigned i, max, pointers, bytes, avdepth;
2196
2197 if (stat->leaves)
2198 avdepth = stat->totdepth*100 / stat->leaves;
2199 else
2200 avdepth = 0;
2201
2202 seq_printf(seq, "\tAver depth: %u.%02d\n",
2203 avdepth / 100, avdepth % 100);
2204 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2205
2206 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2207 bytes = sizeof(struct leaf) * stat->leaves;
2208
2209 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2210 bytes += sizeof(struct leaf_info) * stat->prefixes;
2211
2212 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2213 bytes += sizeof(struct tnode) * stat->tnodes;
2214
2215 max = MAX_STAT_DEPTH;
2216 while (max > 0 && stat->nodesizes[max-1] == 0)
2217 max--;
2218
2219 pointers = 0;
2220 for (i = 1; i <= max; i++)
2221 if (stat->nodesizes[i] != 0) {
2222 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2223 pointers += (1<<i) * stat->nodesizes[i];
2224 }
2225 seq_putc(seq, '\n');
2226 seq_printf(seq, "\tPointers: %u\n", pointers);
2227
2228 bytes += sizeof(struct node *) * pointers;
2229 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2230 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2231 }
2232
2233 #ifdef CONFIG_IP_FIB_TRIE_STATS
/* Dump the runtime lookup counters kept in struct trie_use_stats. */
2234 static void trie_show_usage(struct seq_file *seq,
2235 const struct trie_use_stats *stats)
2236 {
2237 seq_printf(seq, "\nCounters:\n---------\n");
2238 seq_printf(seq, "gets = %u\n", stats->gets);
2239 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2240 seq_printf(seq, "semantic match passed = %u\n",
2241 stats->semantic_match_passed);
2242 seq_printf(seq, "semantic match miss = %u\n",
2243 stats->semantic_match_miss);
2244 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2245 seq_printf(seq, "skipped node resize = %u\n\n",
2246 stats->resize_node_skipped);
2247 }
2248 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2249
2250 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2251 {
2252 if (tb->tb_id == RT_TABLE_LOCAL)
2253 seq_puts(seq, "Local:\n");
2254 else if (tb->tb_id == RT_TABLE_MAIN)
2255 seq_puts(seq, "Main:\n");
2256 else
2257 seq_printf(seq, "Id %d:\n", tb->tb_id);
2258 }
2259
2260
/*
 * seq_file show handler for /proc/net/fib_triestats: prints basic
 * node sizes, then per-table structural statistics (and, when
 * CONFIG_IP_FIB_TRIE_STATS is set, usage counters) for every FIB
 * table in the namespace.
 */
2261 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2262 {
2263 struct net *net = (struct net *)seq->private;
2264 unsigned int h;
2265 
2266 seq_printf(seq,
2267 "Basic info: size of leaf:"
2268 " %Zd bytes, size of tnode: %Zd bytes.\n",
2269 sizeof(struct leaf), sizeof(struct tnode));
2270 
2271 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2272 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2273 struct hlist_node *node;
2274 struct fib_table *tb;
2275 
2276 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2277 struct trie *t = (struct trie *) tb->tb_data;
2278 struct trie_stat stat;
2279 
2280 if (!t)
2281 continue;
2282 
2283 fib_table_print(seq, tb);
2284 
2285 trie_collect_stats(t, &stat);
2286 trie_show_stats(seq, &stat);
2287 #ifdef CONFIG_IP_FIB_TRIE_STATS
2288 trie_show_usage(seq, &t->stats);
2289 #endif
2290 }
2291 }
2292 
2293 return 0;
2294 }
2295
/* Open handler: single-shot, namespace-aware seq_file. */
2296 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2297 {
2298 return single_open_net(inode, file, fib_triestat_seq_show);
2299 }
2300
/* File operations for /proc/net/fib_triestats. */
2301 static const struct file_operations fib_triestat_fops = {
2302 .owner = THIS_MODULE,
2303 .open = fib_triestat_seq_open,
2304 .read = seq_read,
2305 .llseek = seq_lseek,
2306 .release = single_release_net,
2307 };
2308
/*
 * fib_trie_get_idx() - position @iter on the @pos'th node (counting
 * across every table in every hash chain of the namespace) and return
 * it, or NULL when @pos is past the end.  Used to (re)start a
 * seq_file walk.
 */
2309 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2310 {
2311 struct fib_trie_iter *iter = seq->private;
2312 struct net *net = seq_file_net(seq);
2313 loff_t idx = 0;
2314 unsigned int h;
2315 
2316 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2317 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2318 struct hlist_node *node;
2319 struct fib_table *tb;
2320 
2321 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2322 struct node *n;
2323 
2324 for (n = fib_trie_get_first(iter,
2325 (struct trie *) tb->tb_data);
2326 n; n = fib_trie_get_next(iter))
2327 if (pos == idx++) {
2328 iter->tb = tb;
2329 return n;
2330 }
2331 }
2332 }
2333 
2334 return NULL;
2335 }
2336
/* seq_file start: take the RCU read lock and seek to position *pos. */
2337 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2338 __acquires(RCU)
2339 {
2340 rcu_read_lock();
2341 return fib_trie_get_idx(seq, *pos);
2342 }
2343
/*
 * seq_file next: continue within the current table, then through the
 * remainder of its hash chain, then through subsequent hash chains.
 * Returns the next node, or NULL at the end of all tables.
 */
2344 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2345 {
2346 struct fib_trie_iter *iter = seq->private;
2347 struct net *net = seq_file_net(seq);
2348 struct fib_table *tb = iter->tb;
2349 struct hlist_node *tb_node;
2350 unsigned int h;
2351 struct node *n;
2352 
2353 ++*pos;
2354 /* next node in same table */
2355 n = fib_trie_get_next(iter);
2356 if (n)
2357 return n;
2358 
2359 /* walk rest of this hash chain */
2360 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2361 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2362 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2363 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2364 if (n)
2365 goto found;
2366 }
2367 
2368 /* new hash chain */
2369 while (++h < FIB_TABLE_HASHSZ) {
2370 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2371 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2372 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2373 if (n)
2374 goto found;
2375 }
2376 }
2377 return NULL;
2378 
2379 found:
2380 iter->tb = tb;
2381 return n;
2382 }
2383
/* seq_file stop: drop the RCU read lock taken in ->start. */
2384 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2385 __releases(RCU)
2386 {
2387 rcu_read_unlock();
2388 }
2389
/* Emit @n three-space indentation units to @seq. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2394
2395 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2396 {
2397 switch (s) {
2398 case RT_SCOPE_UNIVERSE: return "universe";
2399 case RT_SCOPE_SITE: return "site";
2400 case RT_SCOPE_LINK: return "link";
2401 case RT_SCOPE_HOST: return "host";
2402 case RT_SCOPE_NOWHERE: return "nowhere";
2403 default:
2404 snprintf(buf, len, "scope=%d", s);
2405 return buf;
2406 }
2407 }
2408
/* Printable names for the RTN_* route types, indexed by type value. */
2409 static const char *rtn_type_names[__RTN_MAX] = {
2410 [RTN_UNSPEC] = "UNSPEC",
2411 [RTN_UNICAST] = "UNICAST",
2412 [RTN_LOCAL] = "LOCAL",
2413 [RTN_BROADCAST] = "BROADCAST",
2414 [RTN_ANYCAST] = "ANYCAST",
2415 [RTN_MULTICAST] = "MULTICAST",
2416 [RTN_BLACKHOLE] = "BLACKHOLE",
2417 [RTN_UNREACHABLE] = "UNREACHABLE",
2418 [RTN_PROHIBIT] = "PROHIBIT",
2419 [RTN_THROW] = "THROW",
2420 [RTN_NAT] = "NAT",
2421 [RTN_XRESOLVE] = "XRESOLVE",
2422 };
2423
2424 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2425 {
2426 if (t < __RTN_MAX && rtn_type_names[t])
2427 return rtn_type_names[t];
2428 snprintf(buf, len, "type %u", t);
2429 return buf;
2430 }
2431
/*
 * Pretty print the trie: one line per internal node ("+--") or leaf
 * ("|--"), indented by depth, followed by the scope/type of each
 * fib_alias hanging off a leaf.  Runs under the RCU read lock.
 */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;

	/* a node with no parent is a trie root: print the table header */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
		&prf, tn->pos, tn->bits, tn->full_children,
		tn->empty_children);

	} else {
		struct leaf *l = (struct leaf *) n;
		struct leaf_info *li;
		struct hlist_node *node;
		__be32 val = htonl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %pI4\n", &val);

		/* one line per (prefix length, alias) pair on this leaf */
		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, " /%d %s %s", li->plen,
				rtn_scope(buf1, sizeof(buf1),
				fa->fa_scope),
				rtn_type(buf2, sizeof(buf2),
				fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2480
/* seq_file iterator operations for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next = fib_trie_seq_next,
	.stop = fib_trie_seq_stop,
	.show = fib_trie_seq_show,
};
2487
/* open(): allocate a per-net fib_trie_iter as the seq_file private data */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			sizeof(struct fib_trie_iter));
}
2493
/* file operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner = THIS_MODULE,
	.open = fib_trie_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2501
/*
 * Iterator state for /proc/net/route: walks the leaves of the main
 * table's trie, caching the last leaf visited so a continued read can
 * resume without rescanning from the first leaf.
 */
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;	/* trie of RT_TABLE_MAIN */
	loff_t pos;		/* leaf index of the cached position */
	t_key key;		/* trie key of the cached leaf */
};
2508
2509 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2510 {
2511 struct leaf *l = NULL;
2512 struct trie *t = iter->main_trie;
2513
2514 /* use cache location of last found key */
2515 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2516 pos -= iter->pos;
2517 else {
2518 iter->pos = 0;
2519 l = trie_firstleaf(t);
2520 }
2521
2522 while (l && pos-- > 0) {
2523 iter->pos++;
2524 l = trie_nextleaf(l);
2525 }
2526
2527 if (l)
2528 iter->key = pos; /* remember it */
2529 else
2530 iter->pos = 0; /* forget it */
2531
2532 return l;
2533 }
2534
2535 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2536 __acquires(RCU)
2537 {
2538 struct fib_route_iter *iter = seq->private;
2539 struct fib_table *tb;
2540
2541 rcu_read_lock();
2542 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2543 if (!tb)
2544 return NULL;
2545
2546 iter->main_trie = (struct trie *) tb->tb_data;
2547 if (*pos == 0)
2548 return SEQ_START_TOKEN;
2549 else
2550 return fib_route_get_idx(iter, *pos - 1);
2551 }
2552
2553 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2554 {
2555 struct fib_route_iter *iter = seq->private;
2556 struct leaf *l = v;
2557
2558 ++*pos;
2559 if (v == SEQ_START_TOKEN) {
2560 iter->pos = 0;
2561 l = trie_firstleaf(iter->main_trie);
2562 } else {
2563 iter->pos++;
2564 l = trie_nextleaf(l);
2565 }
2566
2567 if (l)
2568 iter->key = l->key;
2569 else
2570 iter->pos = 0;
2571 return l;
2572 }
2573
/* seq_file stop: drop the RCU read lock taken in fib_route_seq_start(). */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2579
2580 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2581 {
2582 static unsigned type2flags[RTN_MAX + 1] = {
2583 [7] = RTF_REJECT, [8] = RTF_REJECT,
2584 };
2585 unsigned flags = type2flags[type];
2586
2587 if (fi && fi->fib_nh->nh_gw)
2588 flags |= RTF_GATEWAY;
2589 if (mask == htonl(0xFFFFFFFF))
2590 flags |= RTF_HOST;
2591 flags |= RTF_UP;
2592 return flags;
2593 }
2594
2595 /*
2596 * This outputs /proc/net/route.
2597 * The format of the file is not supposed to be changed
2598 * and needs to be same as fib_hash output to avoid breaking
2599 * legacy utilities
2600 */
2601 static int fib_route_seq_show(struct seq_file *seq, void *v)
2602 {
2603 struct leaf *l = v;
2604 struct leaf_info *li;
2605 struct hlist_node *node;
2606
2607 if (v == SEQ_START_TOKEN) {
2608 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2609 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2610 "\tWindow\tIRTT");
2611 return 0;
2612 }
2613
2614 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2615 struct fib_alias *fa;
2616 __be32 mask, prefix;
2617
2618 mask = inet_make_mask(li->plen);
2619 prefix = htonl(l->key);
2620
2621 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2622 const struct fib_info *fi = fa->fa_info;
2623 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2624 int len;
2625
2626 if (fa->fa_type == RTN_BROADCAST
2627 || fa->fa_type == RTN_MULTICAST)
2628 continue;
2629
2630 if (fi)
2631 seq_printf(seq,
2632 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2633 "%d\t%08X\t%d\t%u\t%u%n",
2634 fi->fib_dev ? fi->fib_dev->name : "*",
2635 prefix,
2636 fi->fib_nh->nh_gw, flags, 0, 0,
2637 fi->fib_priority,
2638 mask,
2639 (fi->fib_advmss ?
2640 fi->fib_advmss + 40 : 0),
2641 fi->fib_window,
2642 fi->fib_rtt >> 3, &len);
2643 else
2644 seq_printf(seq,
2645 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2646 "%d\t%08X\t%d\t%u\t%u%n",
2647 prefix, 0, flags, 0, 0, 0,
2648 mask, 0, 0, 0, &len);
2649
2650 seq_printf(seq, "%*s\n", 127 - len, "");
2651 }
2652 }
2653
2654 return 0;
2655 }
2656
/* seq_file iterator operations for /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start = fib_route_seq_start,
	.next = fib_route_seq_next,
	.stop = fib_route_seq_stop,
	.show = fib_route_seq_show,
};
2663
/* open(): allocate a per-net fib_route_iter as the seq_file private data */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			sizeof(struct fib_route_iter));
}
2669
/* file operations for /proc/net/route */
static const struct file_operations fib_route_fops = {
	.owner = THIS_MODULE,
	.open = fib_route_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2677
/*
 * Create the /proc/net entries (fib_trie, fib_triestat, route) for a
 * network namespace.  Returns 0 on success or -ENOMEM on failure,
 * unwinding any entries already created.
 */
int __net_init fib_proc_init(struct net *net)
{
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
		goto out1;

	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
	&fib_triestat_fops))
		goto out2;

	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
		goto out3;

	return 0;

out3:
	proc_net_remove(net, "fib_triestat");
out2:
	proc_net_remove(net, "fib_trie");
out1:
	return -ENOMEM;
}
2699
2700 void __net_exit fib_proc_exit(struct net *net)
2701 {
2702 proc_net_remove(net, "fib_trie");
2703 proc_net_remove(net, "fib_triestat");
2704 proc_net_remove(net, "route");
2705 }
2706
2707 #endif /* CONFIG_PROC_FS */
This page took 0.153551 seconds and 4 git commands to generate.