Merge branch 'hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelv...
[deliverable/linux.git] / net / ipv4 / fib_trie.c
... / ...
CommitLineData
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.408"
52
53#include <asm/uaccess.h>
54#include <asm/system.h>
55#include <linux/bitops.h>
56#include <linux/types.h>
57#include <linux/kernel.h>
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
65#include <linux/inetdevice.h>
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
69#include <linux/rcupdate.h>
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
74#include <net/net_namespace.h>
75#include <net/ip.h>
76#include <net/protocol.h>
77#include <net/route.h>
78#include <net/tcp.h>
79#include <net/sock.h>
80#include <net/ip_fib.h>
81#include "fib_lookup.h"
82
/* Depth limit for the per-level statistics array in struct trie_stat. */
#define MAX_STAT_DEPTH 32

/* Number of significant bits in a key (8 * sizeof(t_key)). */
#define KEYLENGTH (8*sizeof(t_key))

/* A trie key; holds an IPv4 destination address in host byte order. */
typedef unsigned int t_key;
88
/* The low bit of the parent word encodes the node type. */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

/* Parenthesize the macro argument so compound expressions expand safely. */
#define IS_TNODE(n) (!((n)->parent & T_LEAF))
#define IS_LEAF(n) ((n)->parent & T_LEAF)
96
/* Common header shared by leaves and tnodes: the parent word (low bit is
 * the node type, see NODE_TYPE_MASK) followed by the key.  Either node
 * kind may be accessed through this struct.
 */
struct node {
	unsigned long parent;
	t_key key;
};

/* Terminal node holding one key; per-prefix-length info hangs off 'list'. */
struct leaf {
	unsigned long parent;
	t_key key;
	struct hlist_head list;		/* of struct leaf_info */
	struct rcu_head rcu;
};

/* Per prefix-length data on a leaf; 'falh' chains the fib aliases. */
struct leaf_info {
	struct hlist_node hlist;
	struct rcu_head rcu;
	int plen;
	struct list_head falh;
};

/* Internal node with a 2^bits child array starting at key bit 'pos'. */
struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {				/* mutually exclusive uses at end of life */
		struct rcu_head rcu;
		struct work_struct work;
		struct tnode *tnode_free;
	};
	struct node *child[0];		/* flexible child array, 1<<bits slots */
};
130
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Lookup/resize event counters, maintained when stats are compiled in. */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

/* Shape statistics gathered by walking the trie. */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

/* One routing trie: the root node plus optional usage counters. */
struct trie {
	struct node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};
158
/* Forward declarations for the resize machinery below. */
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;

/* Slab caches for fib aliases and leaves. */
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
170
171static inline struct tnode *node_parent(struct node *node)
172{
173 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174}
175
/* RCU-read-side variant of node_parent(). */
static inline struct tnode *node_parent_rcu(struct node *node)
{
	return rcu_dereference(node_parent(node));
}
182
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
	/* barrier before publishing, mirroring rcu_assign_pointer() */
	smp_wmb();
	/* preserve the node-type bit while installing the new parent */
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
191
/* Return child slot i of tn; i must be < 1<<bits. */
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return tn->child[i];
}
198
/* RCU-read-side variant of tnode_get_child(). */
static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
	return rcu_dereference(tnode_get_child(tn, i));
}
205
/* Number of child slots in tn: 2^bits. */
static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}
210
211static inline t_key mask_pfx(t_key k, unsigned short l)
212{
213 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
214}
215
/* Extract 'bits' bits of 'a' starting at bit 'offset' (bit 0 = MSB);
 * returns 0 when the offset lies past the key. */
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}
223
224static inline int tkey_equals(t_key a, t_key b)
225{
226 return a == b;
227}
228
/* Compare the bit range [offset, offset+bits) of a and b; trivially
 * equal for an empty range or an offset past the key.  'bits' is
 * clamped to KEYLENGTH to keep the shifts defined. */
static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
236
/* Position of the first differing bit of a and b at or after 'offset';
 * returns 0 when the keys are identical. */
static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	/* scan towards the LSB until a set bit of 'diff' reaches the top */
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}
248
249/*
250 To understand this stuff, an understanding of keys and all their bits is
251 necessary. Every node in the trie has a key associated with it, but not
252 all of the bits in that key are significant.
253
254 Consider a node 'n' and its parent 'tp'.
255
256 If n is a leaf, every bit in its key is significant. Its presence is
257 necessitated by path compression, since during a tree traversal (when
258 searching for a leaf - unless we are doing an insertion) we will completely
259 ignore all skipped bits we encounter. Thus we need to verify, at the end of
260 a potentially successful search, that we have indeed been walking the
261 correct key path.
262
263 Note that we can never "miss" the correct key in the tree if present by
264 following the wrong path. Path compression ensures that segments of the key
265 that are the same for all keys with a given prefix are skipped, but the
266 skipped part *is* identical for each node in the subtrie below the skipped
267 bit! trie_insert() in this implementation takes care of that - note the
268 call to tkey_sub_equals() in trie_insert().
269
270 if n is an internal node - a 'tnode' here, the various parts of its key
271 have many different meanings.
272
273 Example:
274 _________________________________________________________________
275 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276 -----------------------------------------------------------------
277 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
278
279 _________________________________________________________________
280 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281 -----------------------------------------------------------------
282 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
283
284 tp->pos = 7
285 tp->bits = 3
286 n->pos = 15
287 n->bits = 4
288
289 First, let's just ignore the bits that come before the parent tp, that is
290 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291 not use them for anything.
292
293 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294 index into the parent's child array. That is, they will be used to find
295 'n' among tp's children.
296
297 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
298 for the node n.
299
300 All the bits we have seen so far are significant to the node n. The rest
301 of the bits are really not needed or indeed known in n->key.
302
303 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304 n's child array, and will of course be different for each child.
305
306
307 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
308 at this point.
309
310*/
311
/* Sanity check: a tnode's index bits must not extend past bit 32. */
static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}
316
/* Resize thresholds used by resize(), expressed in percent of the child
 * array; the *_root values apply only to the root tnode (see the "Keep
 * root node larger" comments in resize()). */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 8;
static const int inflate_threshold_root = 15;
321
322
323static void __alias_free_mem(struct rcu_head *head)
324{
325 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326 kmem_cache_free(fn_alias_kmem, fa);
327}
328
/* Free a fib_alias after an RCU grace period. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
333
334static void __leaf_free_rcu(struct rcu_head *head)
335{
336 struct leaf *l = container_of(head, struct leaf, rcu);
337 kmem_cache_free(trie_leaf_kmem, l);
338}
339
/* Free a leaf after an rcu_bh grace period. */
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}
344
/* RCU callback: kfree a leaf_info. */
static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}
349
/* Free a leaf_info after an RCU grace period. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
354
355static struct tnode *tnode_alloc(size_t size)
356{
357 if (size <= PAGE_SIZE)
358 return kzalloc(size, GFP_KERNEL);
359 else
360 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
361}
362
/* Work handler that vfrees a large tnode; presumably bounced through a
 * work item because vfree is not safe from the RCU callback context —
 * TODO confirm against vfree's context requirements. */
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}
368
/* RCU callback: free a tnode.  Recomputes the allocation size so it can
 * mirror tnode_alloc(): kfree for slab-sized nodes, a deferred vfree
 * (via __tnode_vfree) for vmalloc'ed ones. */
static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	size_t size = sizeof(struct tnode) +
		      (sizeof(struct node *) << tn->bits);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
}
382
/* Schedule tn — which may actually be a leaf, see the type bit — for
 * freeing after a grace period. */
static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}
390
/* Queue a tnode for freeing: nodes that still have a parent are chained
 * on tnode_free_head (flushed later by tnode_free_flush(), under RTNL),
 * while orphan nodes are freed immediately. */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));

	if (node_parent((struct node *) tn)) {
		tn->tnode_free = tnode_free_head;
		tnode_free_head = tn;
	} else {
		tnode_free(tn);
	}
}
402
403static void tnode_free_flush(void)
404{
405 struct tnode *tn;
406
407 while ((tn = tnode_free_head)) {
408 tnode_free_head = tn->tnode_free;
409 tn->tnode_free = NULL;
410 tnode_free(tn);
411 }
412}
413
414static struct leaf *leaf_new(void)
415{
416 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
417 if (l) {
418 l->parent = T_LEAF;
419 INIT_HLIST_HEAD(&l->list);
420 }
421 return l;
422}
423
424static struct leaf_info *leaf_info_new(int plen)
425{
426 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
427 if (li) {
428 li->plen = plen;
429 INIT_LIST_HEAD(&li->falh);
430 }
431 return li;
432}
433
434static struct tnode *tnode_new(t_key key, int pos, int bits)
435{
436 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
437 struct tnode *tn = tnode_alloc(sz);
438
439 if (tn) {
440 tn->parent = T_TNODE;
441 tn->pos = pos;
442 tn->bits = bits;
443 tn->key = key;
444 tn->full_children = 0;
445 tn->empty_children = 1<<bits;
446 }
447
448 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
449 (unsigned long) (sizeof(struct node) << bits));
450 return tn;
451}
452
453/*
454 * Check whether a tnode 'n' is "full", i.e. it is an internal node
455 * and no bits are skipped. See discussion in dyntree paper p. 6
456 */
457
458static inline int tnode_full(const struct tnode *tn, const struct node *n)
459{
460 if (n == NULL || IS_LEAF(n))
461 return 0;
462
463 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
464}
465
/* Install n as child i of tn, keeping the full/empty counters correct. */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);	/* -1: recompute wasfull */
}
471
 /*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * A wasfull of -1 means "unknown — recompute from the old child".
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	/* publish last, so readers never see a child before its parent
	 * pointer is set */
	rcu_assign_pointer(tn->child[i], n);
}
506
507static struct node *resize(struct trie *t, struct tnode *tn)
508{
509 int i;
510 int err = 0;
511 struct tnode *old_tn;
512 int inflate_threshold_use;
513 int halve_threshold_use;
514 int max_resize;
515
516 if (!tn)
517 return NULL;
518
519 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
520 tn, inflate_threshold, halve_threshold);
521
522 /* No children */
523 if (tn->empty_children == tnode_child_length(tn)) {
524 tnode_free_safe(tn);
525 return NULL;
526 }
527 /* One child */
528 if (tn->empty_children == tnode_child_length(tn) - 1)
529 for (i = 0; i < tnode_child_length(tn); i++) {
530 struct node *n;
531
532 n = tn->child[i];
533 if (!n)
534 continue;
535
536 /* compress one level */
537 node_set_parent(n, NULL);
538 tnode_free_safe(tn);
539 return n;
540 }
541 /*
542 * Double as long as the resulting node has a number of
543 * nonempty nodes that are above the threshold.
544 */
545
546 /*
547 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
548 * the Helsinki University of Technology and Matti Tikkanen of Nokia
549 * Telecommunications, page 6:
550 * "A node is doubled if the ratio of non-empty children to all
551 * children in the *doubled* node is at least 'high'."
552 *
553 * 'high' in this instance is the variable 'inflate_threshold'. It
554 * is expressed as a percentage, so we multiply it with
555 * tnode_child_length() and instead of multiplying by 2 (since the
556 * child array will be doubled by inflate()) and multiplying
557 * the left-hand side by 100 (to handle the percentage thing) we
558 * multiply the left-hand side by 50.
559 *
560 * The left-hand side may look a bit weird: tnode_child_length(tn)
561 * - tn->empty_children is of course the number of non-null children
562 * in the current node. tn->full_children is the number of "full"
563 * children, that is non-null tnodes with a skip value of 0.
564 * All of those will be doubled in the resulting inflated tnode, so
565 * we just count them one extra time here.
566 *
567 * A clearer way to write this would be:
568 *
569 * to_be_doubled = tn->full_children;
570 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
571 * tn->full_children;
572 *
573 * new_child_length = tnode_child_length(tn) * 2;
574 *
575 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
576 * new_child_length;
577 * if (new_fill_factor >= inflate_threshold)
578 *
579 * ...and so on, tho it would mess up the while () loop.
580 *
581 * anyway,
582 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
583 * inflate_threshold
584 *
585 * avoid a division:
586 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
587 * inflate_threshold * new_child_length
588 *
589 * expand not_to_be_doubled and to_be_doubled, and shorten:
590 * 100 * (tnode_child_length(tn) - tn->empty_children +
591 * tn->full_children) >= inflate_threshold * new_child_length
592 *
593 * expand new_child_length:
594 * 100 * (tnode_child_length(tn) - tn->empty_children +
595 * tn->full_children) >=
596 * inflate_threshold * tnode_child_length(tn) * 2
597 *
598 * shorten again:
599 * 50 * (tn->full_children + tnode_child_length(tn) -
600 * tn->empty_children) >= inflate_threshold *
601 * tnode_child_length(tn)
602 *
603 */
604
605 check_tnode(tn);
606
607 /* Keep root node larger */
608
609 if (!tn->parent)
610 inflate_threshold_use = inflate_threshold_root;
611 else
612 inflate_threshold_use = inflate_threshold;
613
614 err = 0;
615 max_resize = 10;
616 while ((tn->full_children > 0 && max_resize-- &&
617 50 * (tn->full_children + tnode_child_length(tn)
618 - tn->empty_children)
619 >= inflate_threshold_use * tnode_child_length(tn))) {
620
621 old_tn = tn;
622 tn = inflate(t, tn);
623
624 if (IS_ERR(tn)) {
625 tn = old_tn;
626#ifdef CONFIG_IP_FIB_TRIE_STATS
627 t->stats.resize_node_skipped++;
628#endif
629 break;
630 }
631 }
632
633 if (max_resize < 0) {
634 if (!tn->parent)
635 pr_warning("Fix inflate_threshold_root."
636 " Now=%d size=%d bits\n",
637 inflate_threshold_root, tn->bits);
638 else
639 pr_warning("Fix inflate_threshold."
640 " Now=%d size=%d bits\n",
641 inflate_threshold, tn->bits);
642 }
643
644 check_tnode(tn);
645
646 /*
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
649 */
650
651
652 /* Keep root node larger */
653
654 if (!tn->parent)
655 halve_threshold_use = halve_threshold_root;
656 else
657 halve_threshold_use = halve_threshold;
658
659 err = 0;
660 max_resize = 10;
661 while (tn->bits > 1 && max_resize-- &&
662 100 * (tnode_child_length(tn) - tn->empty_children) <
663 halve_threshold_use * tnode_child_length(tn)) {
664
665 old_tn = tn;
666 tn = halve(t, tn);
667 if (IS_ERR(tn)) {
668 tn = old_tn;
669#ifdef CONFIG_IP_FIB_TRIE_STATS
670 t->stats.resize_node_skipped++;
671#endif
672 break;
673 }
674 }
675
676 if (max_resize < 0) {
677 if (!tn->parent)
678 pr_warning("Fix halve_threshold_root."
679 " Now=%d size=%d bits\n",
680 halve_threshold_root, tn->bits);
681 else
682 pr_warning("Fix halve_threshold."
683 " Now=%d size=%d bits\n",
684 halve_threshold, tn->bits);
685 }
686
687 /* Only one child remains */
688 if (tn->empty_children == tnode_child_length(tn) - 1)
689 for (i = 0; i < tnode_child_length(tn); i++) {
690 struct node *n;
691
692 n = tn->child[i];
693 if (!n)
694 continue;
695
696 /* compress one level */
697
698 node_set_parent(n, NULL);
699 tnode_free_safe(tn);
700 return n;
701 }
702
703 return (struct node *) tn;
704}
705
/* Double tn's child array (bits+1), splitting full child tnodes in two
 * and redistributing the rest.  Returns the new tnode, or
 * ERR_PTR(-ENOMEM) with the old node left untouched. */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* single bit set at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			/* the next key bit decides which half it goes to */
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
		 */

		/* the preallocated halves from the first pass */
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* undo the first pass: free every preallocated child */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
848
/* Halve tn's child array (bits-1), pairing adjacent children under new
 * one-bit tnodes where both are present.  Returns the new tnode, or
 * ERR_PTR(-ENOMEM) with the old node left untouched. */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct node *)newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children: hang both off the one-bit tnode
		 * preallocated in the first pass */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	{
		/* undo the first pass: free every preallocated child */
		int size = tnode_child_length(tn);
		int j;

		for (j = 0; j < size; j++)
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);

		return ERR_PTR(-ENOMEM);
	}
}
930
/* readside must use rcu_read_lock; currently this applies to the dump
   routines reached via get_fa_head and dump */

/* Find the leaf_info with prefix length 'plen' on leaf l, or NULL. */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct hlist_node *node;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, node, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}
946
947static inline struct list_head *get_fa_head(struct leaf *l, int plen)
948{
949 struct leaf_info *li = find_leaf_info(l, plen);
950
951 if (!li)
952 return NULL;
953
954 return &li->falh;
955}
956
/* Insert 'new' into the leaf's list, kept sorted by descending plen. */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		/* after the last entry with a larger plen, or before the
		 * first entry when 'new' has the largest plen */
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
977
/* rcu_read_lock needs to be held by the caller on the read side */

/* Look up the leaf whose key is exactly 'key'; NULL when absent. */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
	n = rcu_dereference(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		/* descend while the (possibly skipped) bits seen so far
		 * match the key */
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
1011
/* Walk from tn up to the root, resizing each tnode on the way and
 * re-linking the result into its parent; returns the new root node. */
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct node *)tn, wasfull);

		tp = node_parent((struct node *) tn);
		/* deferred frees queued by resize() are safe to run now */
		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn)) {
		tn = (struct tnode *)resize(t, (struct tnode *)tn);
		tnode_free_flush();
	}

	return (struct node *)tn;
}
1043
/* Only used from the updater side (RTNL held).
 * Insert a leaf for 'key' into the trie (or reuse an existing one) and
 * return the fib-alias list head for prefix length 'plen'; NULL when an
 * allocation fails. */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = t->trie;

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* branch at the first bit where key and n differ */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
	return fa_head;
}
1193
1194/*
1195 * Caller must hold RTNL.
1196 */
1197static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1198{
1199 struct trie *t = (struct trie *) tb->tb_data;
1200 struct fib_alias *fa, *new_fa;
1201 struct list_head *fa_head = NULL;
1202 struct fib_info *fi;
1203 int plen = cfg->fc_dst_len;
1204 u8 tos = cfg->fc_tos;
1205 u32 key, mask;
1206 int err;
1207 struct leaf *l;
1208
1209 if (plen > 32)
1210 return -EINVAL;
1211
1212 key = ntohl(cfg->fc_dst);
1213
1214 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1215
1216 mask = ntohl(inet_make_mask(plen));
1217
1218 if (key & ~mask)
1219 return -EINVAL;
1220
1221 key = key & mask;
1222
1223 fi = fib_create_info(cfg);
1224 if (IS_ERR(fi)) {
1225 err = PTR_ERR(fi);
1226 goto err;
1227 }
1228
1229 l = fib_find_node(t, key);
1230 fa = NULL;
1231
1232 if (l) {
1233 fa_head = get_fa_head(l, plen);
1234 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1235 }
1236
1237 /* Now fa, if non-NULL, points to the first fib alias
1238 * with the same keys [prefix,tos,priority], if such key already
1239 * exists or to the node before which we will insert new one.
1240 *
1241 * If fa is NULL, we will need to allocate a new one and
1242 * insert to the head of f.
1243 *
1244 * If f is NULL, no fib node matched the destination key
1245 * and we need to allocate a new one of those as well.
1246 */
1247
1248 if (fa && fa->fa_tos == tos &&
1249 fa->fa_info->fib_priority == fi->fib_priority) {
1250 struct fib_alias *fa_first, *fa_match;
1251
1252 err = -EEXIST;
1253 if (cfg->fc_nlflags & NLM_F_EXCL)
1254 goto out;
1255
1256 /* We have 2 goals:
1257 * 1. Find exact match for type, scope, fib_info to avoid
1258 * duplicate routes
1259 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1260 */
1261 fa_match = NULL;
1262 fa_first = fa;
1263 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1264 list_for_each_entry_continue(fa, fa_head, fa_list) {
1265 if (fa->fa_tos != tos)
1266 break;
1267 if (fa->fa_info->fib_priority != fi->fib_priority)
1268 break;
1269 if (fa->fa_type == cfg->fc_type &&
1270 fa->fa_scope == cfg->fc_scope &&
1271 fa->fa_info == fi) {
1272 fa_match = fa;
1273 break;
1274 }
1275 }
1276
1277 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1278 struct fib_info *fi_drop;
1279 u8 state;
1280
1281 fa = fa_first;
1282 if (fa_match) {
1283 if (fa == fa_match)
1284 err = 0;
1285 goto out;
1286 }
1287 err = -ENOBUFS;
1288 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1289 if (new_fa == NULL)
1290 goto out;
1291
1292 fi_drop = fa->fa_info;
1293 new_fa->fa_tos = fa->fa_tos;
1294 new_fa->fa_info = fi;
1295 new_fa->fa_type = cfg->fc_type;
1296 new_fa->fa_scope = cfg->fc_scope;
1297 state = fa->fa_state;
1298 new_fa->fa_state = state & ~FA_S_ACCESSED;
1299
1300 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1301 alias_free_mem_rcu(fa);
1302
1303 fib_release_info(fi_drop);
1304 if (state & FA_S_ACCESSED)
1305 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1306 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1307 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1308
1309 goto succeeded;
1310 }
1311 /* Error if we find a perfect match which
1312 * uses the same scope, type, and nexthop
1313 * information.
1314 */
1315 if (fa_match)
1316 goto out;
1317
1318 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1319 fa = fa_first;
1320 }
1321 err = -ENOENT;
1322 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1323 goto out;
1324
1325 err = -ENOBUFS;
1326 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1327 if (new_fa == NULL)
1328 goto out;
1329
1330 new_fa->fa_info = fi;
1331 new_fa->fa_tos = tos;
1332 new_fa->fa_type = cfg->fc_type;
1333 new_fa->fa_scope = cfg->fc_scope;
1334 new_fa->fa_state = 0;
1335 /*
1336 * Insert new entry to the list.
1337 */
1338
1339 if (!fa_head) {
1340 fa_head = fib_insert_node(t, key, plen);
1341 if (unlikely(!fa_head)) {
1342 err = -ENOMEM;
1343 goto out_free_new_fa;
1344 }
1345 }
1346
1347 list_add_tail_rcu(&new_fa->fa_list,
1348 (fa ? &fa->fa_list : fa_head));
1349
1350 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1351 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1352 &cfg->fc_nlinfo, 0);
1353succeeded:
1354 return 0;
1355
1356out_free_new_fa:
1357 kmem_cache_free(fn_alias_kmem, new_fa);
1358out:
1359 fib_release_info(fi);
1360err:
1361 return err;
1362}
1363
/*
 * Check every prefix (leaf_info) hanging off leaf @l against flow @flp.
 * Should be called with rcu_read_lock held.
 *
 * Returns <= 0 once fib_semantic_match() settles the lookup (0: result
 * stored in @res, negative: error), or 1 to make the caller backtrack.
 */
static int check_leaf(struct trie *t, struct leaf *l,
		      t_key key, const struct flowi *flp,
		      struct fib_result *res)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		int err;
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

		/* Skip prefixes that do not cover the looked-up key. */
		if (l->key != (key & ntohl(mask)))
			continue;

		err = fib_semantic_match(&li->falh, flp, res, plen);

#ifdef CONFIG_IP_FIB_TRIE_STATS
		if (err <= 0)
			t->stats.semantic_match_passed++;
		else
			t->stats.semantic_match_miss++;
#endif
		if (err <= 0)
			return err;
	}

	return 1;
}
1395
/*
 * Longest-prefix-match lookup of flp->fl4_dst in table @tb.
 *
 * Descends the LC-trie indexing with bits of the key; on a dead end it
 * backtracks by chopping trailing bits off the current child index
 * (cindex) and, when exhausted, walking up to the parent tnode.
 * current_prefix_length tracks how many leading key bits are still
 * required to match ("prefix matching" state).
 *
 * Runs entirely under rcu_read_lock; returns the result of check_leaf()
 * (<= 0) or 1 when no route matched.
 */
static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
			  struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct node *n;
	struct tnode *pn;
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* After a backtrack cindex was already adjusted; only
		 * recompute it when entering a node normally.
		 */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
		 */
		if (pref_mismatch) {
			/* Find the position of the first mismatching bit. */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch << 1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent((struct node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1595
1596/*
1597 * Remove the leaf and return parent.
1598 */
1599static void trie_leaf_remove(struct trie *t, struct leaf *l)
1600{
1601 struct tnode *tp = node_parent((struct node *) l);
1602
1603 pr_debug("entering trie_leaf_remove(%p)\n", l);
1604
1605 if (tp) {
1606 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1607 put_child(t, (struct tnode *)tp, cindex, NULL);
1608 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1609 } else
1610 rcu_assign_pointer(t->trie, NULL);
1611
1612 free_leaf(l);
1613}
1614
1615/*
1616 * Caller must hold RTNL.
1617 */
1618static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1619{
1620 struct trie *t = (struct trie *) tb->tb_data;
1621 u32 key, mask;
1622 int plen = cfg->fc_dst_len;
1623 u8 tos = cfg->fc_tos;
1624 struct fib_alias *fa, *fa_to_delete;
1625 struct list_head *fa_head;
1626 struct leaf *l;
1627 struct leaf_info *li;
1628
1629 if (plen > 32)
1630 return -EINVAL;
1631
1632 key = ntohl(cfg->fc_dst);
1633 mask = ntohl(inet_make_mask(plen));
1634
1635 if (key & ~mask)
1636 return -EINVAL;
1637
1638 key = key & mask;
1639 l = fib_find_node(t, key);
1640
1641 if (!l)
1642 return -ESRCH;
1643
1644 fa_head = get_fa_head(l, plen);
1645 fa = fib_find_alias(fa_head, tos, 0);
1646
1647 if (!fa)
1648 return -ESRCH;
1649
1650 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1651
1652 fa_to_delete = NULL;
1653 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1654 list_for_each_entry_continue(fa, fa_head, fa_list) {
1655 struct fib_info *fi = fa->fa_info;
1656
1657 if (fa->fa_tos != tos)
1658 break;
1659
1660 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1661 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1662 fa->fa_scope == cfg->fc_scope) &&
1663 (!cfg->fc_protocol ||
1664 fi->fib_protocol == cfg->fc_protocol) &&
1665 fib_nh_match(cfg, fi) == 0) {
1666 fa_to_delete = fa;
1667 break;
1668 }
1669 }
1670
1671 if (!fa_to_delete)
1672 return -ESRCH;
1673
1674 fa = fa_to_delete;
1675 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1676 &cfg->fc_nlinfo, 0);
1677
1678 l = fib_find_node(t, key);
1679 li = find_leaf_info(l, plen);
1680
1681 list_del_rcu(&fa->fa_list);
1682
1683 if (list_empty(fa_head)) {
1684 hlist_del_rcu(&li->hlist);
1685 free_leaf_info(li);
1686 }
1687
1688 if (hlist_empty(&l->list))
1689 trie_leaf_remove(t, l);
1690
1691 if (fa->fa_state & FA_S_ACCESSED)
1692 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1693
1694 fib_release_info(fa->fa_info);
1695 alias_free_mem_rcu(fa);
1696 return 0;
1697}
1698
/*
 * Remove from @head every alias whose fib_info is marked dead
 * (RTNH_F_DEAD).  Returns the number of aliases removed.
 */
static int trie_flush_list(struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}
1716
/*
 * Flush dead aliases from every leaf_info on leaf @l, freeing each
 * leaf_info whose alias list drains empty.  Returns the total number of
 * aliases removed.  The leaf itself is left for the caller to reap.
 */
static int trie_flush_leaf(struct leaf *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
		found += trie_flush_list(&li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}
1734
1735/*
1736 * Scan for the next right leaf starting at node p->child[idx]
1737 * Since we have back pointer, no recursion necessary.
1738 */
1739static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1740{
1741 do {
1742 t_key idx;
1743
1744 if (c)
1745 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1746 else
1747 idx = 0;
1748
1749 while (idx < 1u << p->bits) {
1750 c = tnode_get_child_rcu(p, idx++);
1751 if (!c)
1752 continue;
1753
1754 if (IS_LEAF(c)) {
1755 prefetch(p->child[idx]);
1756 return (struct leaf *) c;
1757 }
1758
1759 /* Rescan start scanning in new node */
1760 p = (struct tnode *) c;
1761 idx = 0;
1762 }
1763
1764 /* Node empty, walk back up to parent */
1765 c = (struct node *) p;
1766 } while ( (p = node_parent_rcu(c)) != NULL);
1767
1768 return NULL; /* Root of trie */
1769}
1770
/*
 * Return the leftmost (first in-order) leaf of the trie, or NULL when
 * the trie is empty.  Must run under rcu_read_lock or RTNL.
 */
static struct leaf *trie_firstleaf(struct trie *t)
{
	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);

	if (!n)
		return NULL;

	if (IS_LEAF(n))		/* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}
1783
1784static struct leaf *trie_nextleaf(struct leaf *l)
1785{
1786 struct node *c = (struct node *) l;
1787 struct tnode *p = node_parent(c);
1788
1789 if (!p)
1790 return NULL; /* trie with just one leaf */
1791
1792 return leaf_walk_rcu(p, c);
1793}
1794
/*
 * Return the index'th leaf (0-based) in traversal order, or NULL when
 * the trie holds fewer leaves.
 */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l;

	for (l = trie_firstleaf(t); l && index > 0; index--)
		l = trie_nextleaf(l);

	return l;
}
1804
1805
1806/*
1807 * Caller must hold RTNL.
1808 */
1809static int fn_trie_flush(struct fib_table *tb)
1810{
1811 struct trie *t = (struct trie *) tb->tb_data;
1812 struct leaf *l, *ll = NULL;
1813 int found = 0;
1814
1815 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1816 found += trie_flush_leaf(l);
1817
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll);
1820 ll = l;
1821 }
1822
1823 if (ll && hlist_empty(&ll->list))
1824 trie_leaf_remove(t, ll);
1825
1826 pr_debug("trie_flush found=%d\n", found);
1827 return found;
1828}
1829
/*
 * Choose a default route (prefix 0/0) for @res among the aliases of the
 * all-zero leaf, round-robining via tb->tb_default and skipping routes
 * whose nexthop is detected dead by fib_detect_death().
 */
static void fn_trie_select_default(struct fib_table *tb,
				   const struct flowi *flp,
				   struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	/* The default routes hang off the all-zero key with plen 0. */
	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Aliases are priority-ordered; stop at worse metrics. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
				tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}
	/* Everything looks dead: fall back to the last resort, if any. */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	rcu_read_unlock();
}
1902
/*
 * Dump the aliases of one (key, plen) prefix to a netlink skb, resuming
 * from cb->args[5] (the last alias index emitted).  Returns skb->len on
 * completion or -1 when the skb filled up (position saved for resume).
 */
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		/* Skip aliases already emitted in a previous pass. */
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  fa->fa_scope,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}
1940
/*
 * Dump all prefixes (leaf_infos) of leaf @l, resuming from cb->args[4].
 * Returns skb->len on completion or -1 when the skb filled up.
 */
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
			continue;
		}

		/* Fresh leaf_info: restart its alias cursor. */
		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
1974
/*
 * Netlink dump of the whole table.  Resumes at the leaf key saved in
 * cb->args[2]; if that leaf disappeared between dump calls, falls back
 * to positional rescan via the leaf count in cb->args[3].
 */
static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
			struct netlink_callback *cb)
{
	struct leaf *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* Leaf done: clear the per-leaf resume cursors (args 4-5). */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}
2016
/*
 * Create the slab caches used by the trie: one for fib_alias entries
 * and one sized to hold either a leaf or a leaf_info.  SLAB_PANIC
 * makes allocation failure fatal at boot.
 */
void __init fib_hash_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
2028
2029
2030/* Fix more generic FIB names for init later */
2031struct fib_table *fib_hash_table(u32 id)
2032{
2033 struct fib_table *tb;
2034 struct trie *t;
2035
2036 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2037 GFP_KERNEL);
2038 if (tb == NULL)
2039 return NULL;
2040
2041 tb->tb_id = id;
2042 tb->tb_default = -1;
2043 tb->tb_lookup = fn_trie_lookup;
2044 tb->tb_insert = fn_trie_insert;
2045 tb->tb_delete = fn_trie_delete;
2046 tb->tb_flush = fn_trie_flush;
2047 tb->tb_select_default = fn_trie_select_default;
2048 tb->tb_dump = fn_trie_dump;
2049
2050 t = (struct trie *) tb->tb_data;
2051 memset(t, 0, sizeof(*t));
2052
2053 if (id == RT_TABLE_LOCAL)
2054 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2055
2056 return tb;
2057}
2058
2059#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;	/* must be first (seq_file net) */
	struct fib_table *tb;		/* table currently being walked */
	struct tnode *tnode;		/* tnode to resume the walk from */
	unsigned index;			/* next child slot within tnode */
	unsigned depth;			/* current depth below the root */
};
2068
/*
 * Advance the depth-first iterator and return the next node (leaf or
 * tnode), or NULL when the walk is complete.  State (tnode/index/depth)
 * is kept in @iter so the walk can be resumed between calls.
 */
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct node *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Stay on this tnode, resume at next slot. */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu((struct node *)tn);
	if (p) {
		/* Resume in the parent just after the slot we came from. */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2113
2114static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2115 struct trie *t)
2116{
2117 struct node *n;
2118
2119 if (!t)
2120 return NULL;
2121
2122 n = rcu_dereference(t->trie);
2123 if (!n)
2124 return NULL;
2125
2126 if (IS_TNODE(n)) {
2127 iter->tnode = (struct tnode *) n;
2128 iter->index = 0;
2129 iter->depth = 1;
2130 } else {
2131 iter->tnode = NULL;
2132 iter->index = 0;
2133 iter->depth = 0;
2134 }
2135
2136 return n;
2137}
2138
/*
 * Walk the whole trie under RCU and fill @s with structural statistics:
 * leaf/prefix/tnode counts, depth figures, child-array sizes, and the
 * number of NULL child slots.
 */
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct node *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			/* One prefix per leaf_info hanging off the leaf. */
			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
			if (tn->bits < MAX_STAT_DEPTH)
				s->nodesizes[tn->bits]++;

			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
		}
	}
	rcu_read_unlock();
}
2175
2176/*
2177 * This outputs /proc/net/fib_triestats
2178 */
2179static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2180{
2181 unsigned i, max, pointers, bytes, avdepth;
2182
2183 if (stat->leaves)
2184 avdepth = stat->totdepth*100 / stat->leaves;
2185 else
2186 avdepth = 0;
2187
2188 seq_printf(seq, "\tAver depth: %u.%02d\n",
2189 avdepth / 100, avdepth % 100);
2190 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2191
2192 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2193 bytes = sizeof(struct leaf) * stat->leaves;
2194
2195 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2196 bytes += sizeof(struct leaf_info) * stat->prefixes;
2197
2198 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2199 bytes += sizeof(struct tnode) * stat->tnodes;
2200
2201 max = MAX_STAT_DEPTH;
2202 while (max > 0 && stat->nodesizes[max-1] == 0)
2203 max--;
2204
2205 pointers = 0;
2206 for (i = 1; i <= max; i++)
2207 if (stat->nodesizes[i] != 0) {
2208 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2209 pointers += (1<<i) * stat->nodesizes[i];
2210 }
2211 seq_putc(seq, '\n');
2212 seq_printf(seq, "\tPointers: %u\n", pointers);
2213
2214 bytes += sizeof(struct node *) * pointers;
2215 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2216 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2217}
2218
2219#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Print the runtime lookup counters (gets, backtracks, match hits and
 * misses, NULL-child hits, skipped resizes) to /proc/net/fib_triestats.
 * Only built when CONFIG_IP_FIB_TRIE_STATS is enabled.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
}
2234#endif /* CONFIG_IP_FIB_TRIE_STATS */
2235
2236static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2237{
2238 if (tb->tb_id == RT_TABLE_LOCAL)
2239 seq_puts(seq, "Local:\n");
2240 else if (tb->tb_id == RT_TABLE_MAIN)
2241 seq_puts(seq, "Main:\n");
2242 else
2243 seq_printf(seq, "Id %d:\n", tb->tb_id);
2244}
2245
2246
/*
 * seq_file show handler for /proc/net/fib_triestats: iterate every FIB
 * table in this netns and print its structural (and, if enabled,
 * usage) statistics.
 */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct leaf), sizeof(struct tnode));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}

	return 0;
}
2281
/* open() handler: single-shot, per-netns seq_file. */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
2286
/* File operations for /proc/net/fib_triestats. */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release_net,
};
2294
/*
 * Position the trie iterator at the pos'th node across all FIB tables
 * of this netns (in hash-bucket order).  Returns that node and records
 * its table in iter->tb, or NULL when pos is past the end.
 */
static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
2322
/* seq_file start: take the RCU read lock for the whole iteration. */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
2329
/*
 * seq_file next: continue the walk within the current table, then fall
 * through to the rest of its hash chain, then to subsequent hash
 * buckets.  Returns the next node or NULL at end of all tables.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct node *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}
2369
/* seq_file stop: drop the RCU read lock taken in start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2375
/* Emit three spaces of indentation per depth level. */
static void seq_indent(struct seq_file *seq, int n)
{
	while (n > 0) {
		seq_puts(seq, "   ");
		n--;
	}
}
2380
/*
 * Return a human-readable name for route scope @s; unknown scopes are
 * formatted as "scope=N" into the caller-supplied @buf.
 */
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
		snprintf(buf, len, "scope=%d", s);
		return buf;
	}
}
2394
/* Printable names for the RTN_* route types, indexed by type value. */
static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2409
2410static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2411{
2412 if (t < __RTN_MAX && rtn_type_names[t])
2413 return rtn_type_names[t];
2414 snprintf(buf, len, "type %u", t);
2415 return buf;
2416}
2417
/* Pretty print the trie */
/*
 * seq_file show handler for /proc/net/fib_trie: one indented line per
 * tnode ("+--") or leaf ("|--"), with each leaf's prefixes and their
 * scope/type/tos listed beneath it.
 */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;

	/* A parentless node is a trie root: print the table header. */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
			   tn->empty_children);

	} else {
		struct leaf *l = (struct leaf *) n;
		struct leaf_info *li;
		struct hlist_node *node;
		__be32 val = htonl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, " /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2466
/* seq_file iterator operations for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next = fib_trie_seq_next,
	.stop = fib_trie_seq_stop,
	.show = fib_trie_seq_show,
};
2473
/* ->open for /proc/net/fib_trie: per-netns seq_file with a
 * struct fib_trie_iter as private iterator state */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2479
/* file operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2487
/* Iterator state for /proc/net/route traversal of the main table */
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;	/* RT_TABLE_MAIN trie being walked */
	loff_t	pos;		/* logical position of the cached leaf */
	t_key	key;		/* key of last leaf returned (resume hint) */
};
2494
2495static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2496{
2497 struct leaf *l = NULL;
2498 struct trie *t = iter->main_trie;
2499
2500 /* use cache location of last found key */
2501 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2502 pos -= iter->pos;
2503 else {
2504 iter->pos = 0;
2505 l = trie_firstleaf(t);
2506 }
2507
2508 while (l && pos-- > 0) {
2509 iter->pos++;
2510 l = trie_nextleaf(l);
2511 }
2512
2513 if (l)
2514 iter->key = pos; /* remember it */
2515 else
2516 iter->pos = 0; /* forget it */
2517
2518 return l;
2519}
2520
2521static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2522 __acquires(RCU)
2523{
2524 struct fib_route_iter *iter = seq->private;
2525 struct fib_table *tb;
2526
2527 rcu_read_lock();
2528 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2529 if (!tb)
2530 return NULL;
2531
2532 iter->main_trie = (struct trie *) tb->tb_data;
2533 if (*pos == 0)
2534 return SEQ_START_TOKEN;
2535 else
2536 return fib_route_get_idx(iter, *pos - 1);
2537}
2538
2539static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2540{
2541 struct fib_route_iter *iter = seq->private;
2542 struct leaf *l = v;
2543
2544 ++*pos;
2545 if (v == SEQ_START_TOKEN) {
2546 iter->pos = 0;
2547 l = trie_firstleaf(iter->main_trie);
2548 } else {
2549 iter->pos++;
2550 l = trie_nextleaf(l);
2551 }
2552
2553 if (l)
2554 iter->key = l->key;
2555 else
2556 iter->pos = 0;
2557 return l;
2558}
2559
/* seq_file ->stop: drop the RCU read lock taken by ->start */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2565
2566static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2567{
2568 static unsigned type2flags[RTN_MAX + 1] = {
2569 [7] = RTF_REJECT, [8] = RTF_REJECT,
2570 };
2571 unsigned flags = type2flags[type];
2572
2573 if (fi && fi->fib_nh->nh_gw)
2574 flags |= RTF_GATEWAY;
2575 if (mask == htonl(0xFFFFFFFF))
2576 flags |= RTF_HOST;
2577 flags |= RTF_UP;
2578 return flags;
2579}
2580
2581/*
2582 * This outputs /proc/net/route.
2583 * The format of the file is not supposed to be changed
2584 * and needs to be same as fib_hash output to avoid breaking
2585 * legacy utilities
2586 */
/* seq_file ->show for /proc/net/route: print the header for
 * SEQ_START_TOKEN, otherwise one fixed-width line per routable alias
 * of leaf @v.  Broadcast/multicast entries are skipped, matching the
 * legacy fib_hash output. */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct leaf *l = v;
	struct leaf_info *li;
	struct hlist_node *node;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
			/* chars written so far, captured via %n below */
			int len;

			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3, &len);
			else
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0, &len);

			/* pad to the fixed 127-column record width that
			 * legacy route(8)-style readers expect */
			seq_printf(seq, "%*s\n", 127 - len, "");
		}
	}

	return 0;
}
2642
/* seq_file iterator operations for /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start = fib_route_seq_start,
	.next = fib_route_seq_next,
	.stop = fib_route_seq_stop,
	.show = fib_route_seq_show,
};
2649
/* ->open for /proc/net/route: per-netns seq_file with a
 * struct fib_route_iter as private iterator state */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}
2655
/* file operations for /proc/net/route */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2663
/* Create the per-namespace /proc/net entries (fib_trie, fib_triestat,
 * route).  Returns 0 on success or -ENOMEM, removing any entries
 * already created via the goto unwind chain. */
int __net_init fib_proc_init(struct net *net)
{
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
		goto out1;

	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
		goto out2;

	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
		goto out3;

	return 0;

out3:
	proc_net_remove(net, "fib_triestat");
out2:
	proc_net_remove(net, "fib_trie");
out1:
	return -ENOMEM;
}
2685
/* Remove the per-namespace /proc/net entries created by fib_proc_init() */
void __net_exit fib_proc_exit(struct net *net)
{
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
}
2692
2693#endif /* CONFIG_PROC_FS */
This page took 0.032591 seconds and 5 git commands to generate.