[NET]: Introduce inet_connection_sock
[deliverable/linux.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
2f36895a 46#define VERSION "0.325"
19baf839
RO
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \
c877efb2 93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
19baf839 94#define NODE_SET_PARENT(_node, _ptr) \
c877efb2 95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
19baf839
RO
96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
c877efb2 98 ((_node)->_parent = (_type))
19baf839 99#define NODE_TYPE(_node) \
c877efb2 100 ((_node)->_parent & NODE_TYPE_MASK)
19baf839
RO
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
2f36895a 139 unsigned int resize_node_skipped;
19baf839
RO
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
c877efb2 150};
19baf839
RO
151
152struct trie {
153 struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
157 int size;
158 unsigned int revision;
159};
160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn);
2f36895a
RO
168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
19baf839
RO
170static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179static kmem_cache_t *fn_alias_kmem;
180static struct trie *trie_local = NULL, *trie_main = NULL;
181
182static void trie_bug(char *err)
183{
184 printk("Trie Bug: %s\n", err);
185 BUG();
186}
187
c877efb2 188static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 189{
c877efb2 190 if (i >= 1<<tn->bits)
19baf839
RO
191 trie_bug("tnode_get_child");
192
193 return tn->child[i];
194}
195
196static inline int tnode_child_length(struct tnode *tn)
197{
198 return 1<<tn->bits;
199}
200
201/*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
c877efb2 205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{
221 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else
224 return 0;
225}
226
227static inline int tkey_equals(t_key a, t_key b)
228{
c877efb2 229 return a == b;
19baf839
RO
230}
231
232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{
c877efb2
SH
234 if (bits == 0 || offset >= KEYLENGTH)
235 return 1;
19baf839
RO
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 238}
19baf839
RO
239
240static inline int tkey_mismatch(t_key a, int offset, t_key b)
241{
242 t_key diff = a ^ b;
243 int i = offset;
244
c877efb2
SH
245 if (!diff)
246 return 0;
247 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
248 i++;
249 return i;
250}
251
252/* Candiate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/*
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
264
265 Consider a node 'n' and its parent 'tp'.
266
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitaded by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
272 correct key path.
273
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
280
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
283
284 Example:
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
289
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
294
295 tp->pos = 7
296 tp->bits = 3
297 n->pos = 15
298 n->bits=4
299
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
303
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
307
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309 for the node n.
310
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
313
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
316
c877efb2 317
19baf839
RO
318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319 at this point.
320
321*/
322
323static void check_tnode(struct tnode *tn)
324{
c877efb2 325 if (tn && tn->pos+tn->bits > 32) {
19baf839
RO
326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
327 }
328}
329
330static int halve_threshold = 25;
331static int inflate_threshold = 50;
332
333static struct leaf *leaf_new(void)
334{
335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
c877efb2 336 if (l) {
19baf839
RO
337 NODE_INIT_PARENT(l, T_LEAF);
338 INIT_HLIST_HEAD(&l->list);
339 }
340 return l;
341}
342
343static struct leaf_info *leaf_info_new(int plen)
344{
345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
c877efb2 346 if (li) {
f835e471
RO
347 li->plen = plen;
348 INIT_LIST_HEAD(&li->falh);
349 }
19baf839
RO
350 return li;
351}
352
353static inline void free_leaf(struct leaf *l)
354{
355 kfree(l);
356}
357
358static inline void free_leaf_info(struct leaf_info *li)
359{
360 kfree(li);
361}
362
f0e36f8c
PM
363static struct tnode *tnode_alloc(unsigned int size)
364{
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
367 } else {
368 return (struct tnode *)
c877efb2 369 __get_free_pages(GFP_KERNEL, get_order(size));
f0e36f8c
PM
370 }
371}
372
373static void __tnode_free(struct tnode *tn)
374{
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
377
378 if (size <= PAGE_SIZE)
379 kfree(tn);
380 else
381 free_pages((unsigned long)tn, get_order(size));
382}
383
19baf839
RO
384static struct tnode* tnode_new(t_key key, int pos, int bits)
385{
386 int nchildren = 1<<bits;
387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 388 struct tnode *tn = tnode_alloc(sz);
19baf839 389
c877efb2 390 if (tn) {
19baf839
RO
391 memset(tn, 0, sz);
392 NODE_INIT_PARENT(tn, T_TNODE);
393 tn->pos = pos;
394 tn->bits = bits;
395 tn->key = key;
396 tn->full_children = 0;
397 tn->empty_children = 1<<bits;
398 }
c877efb2
SH
399
400 if (trie_debug > 0)
19baf839
RO
401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
402 (unsigned int) (sizeof(struct node) * 1<<bits));
403 return tn;
404}
405
406static void tnode_free(struct tnode *tn)
407{
c877efb2 408 if (!tn) {
19baf839
RO
409 trie_bug("tnode_free\n");
410 }
c877efb2 411 if (IS_LEAF(tn)) {
19baf839 412 free_leaf((struct leaf *)tn);
c877efb2 413 if (trie_debug > 0 )
19baf839
RO
414 printk("FL %p \n", tn);
415 }
c877efb2 416 else if (IS_TNODE(tn)) {
f0e36f8c 417 __tnode_free(tn);
c877efb2 418 if (trie_debug > 0 )
19baf839
RO
419 printk("FT %p \n", tn);
420 }
421 else {
422 trie_bug("tnode_free\n");
423 }
424}
425
426/*
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6
429 */
430
431static inline int tnode_full(struct tnode *tn, struct node *n)
432{
c877efb2 433 if (n == NULL || IS_LEAF(n))
19baf839
RO
434 return 0;
435
436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
437}
438
c877efb2 439static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
440{
441 tnode_put_child_reorg(tn, i, n, -1);
442}
443
c877efb2 444 /*
19baf839
RO
445 * Add a child at position i overwriting the old value.
446 * Update the value of full_children and empty_children.
447 */
448
c877efb2 449static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839
RO
450{
451 struct node *chi;
452 int isfull;
453
c877efb2 454 if (i >= 1<<tn->bits) {
19baf839
RO
455 printk("bits=%d, i=%d\n", tn->bits, i);
456 trie_bug("tnode_put_child_reorg bits");
457 }
458 write_lock_bh(&fib_lock);
c877efb2 459 chi = tn->child[i];
19baf839
RO
460
461 /* update emptyChildren */
462 if (n == NULL && chi != NULL)
463 tn->empty_children++;
464 else if (n != NULL && chi == NULL)
465 tn->empty_children--;
c877efb2 466
19baf839
RO
467 /* update fullChildren */
468 if (wasfull == -1)
469 wasfull = tnode_full(tn, chi);
470
471 isfull = tnode_full(tn, n);
c877efb2 472 if (wasfull && !isfull)
19baf839 473 tn->full_children--;
c877efb2
SH
474
475 else if (!wasfull && isfull)
19baf839 476 tn->full_children++;
c877efb2
SH
477 if (n)
478 NODE_SET_PARENT(n, tn);
19baf839
RO
479
480 tn->child[i] = n;
481 write_unlock_bh(&fib_lock);
482}
483
c877efb2 484static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
485{
486 int i;
2f36895a 487 int err = 0;
19baf839
RO
488
489 if (!tn)
490 return NULL;
491
c877efb2
SH
492 if (trie_debug)
493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
19baf839
RO
494 tn, inflate_threshold, halve_threshold);
495
496 /* No children */
497 if (tn->empty_children == tnode_child_length(tn)) {
498 tnode_free(tn);
499 return NULL;
500 }
501 /* One child */
502 if (tn->empty_children == tnode_child_length(tn) - 1)
503 for (i = 0; i < tnode_child_length(tn); i++) {
504
505 write_lock_bh(&fib_lock);
506 if (tn->child[i] != NULL) {
507
508 /* compress one level */
509 struct node *n = tn->child[i];
c877efb2 510 if (n)
19baf839
RO
511 NODE_INIT_PARENT(n, NODE_TYPE(n));
512
513 write_unlock_bh(&fib_lock);
514 tnode_free(tn);
515 return n;
516 }
517 write_unlock_bh(&fib_lock);
518 }
c877efb2 519 /*
19baf839
RO
520 * Double as long as the resulting node has a number of
521 * nonempty nodes that are above the threshold.
522 */
523
524 /*
c877efb2
SH
525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 527 * Telecommunications, page 6:
c877efb2 528 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
529 * children in the *doubled* node is at least 'high'."
530 *
c877efb2
SH
531 * 'high' in this instance is the variable 'inflate_threshold'. It
532 * is expressed as a percentage, so we multiply it with
533 * tnode_child_length() and instead of multiplying by 2 (since the
534 * child array will be doubled by inflate()) and multiplying
535 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 536 * multiply the left-hand side by 50.
c877efb2
SH
537 *
538 * The left-hand side may look a bit weird: tnode_child_length(tn)
539 * - tn->empty_children is of course the number of non-null children
540 * in the current node. tn->full_children is the number of "full"
19baf839 541 * children, that is non-null tnodes with a skip value of 0.
c877efb2 542 * All of those will be doubled in the resulting inflated tnode, so
19baf839 543 * we just count them one extra time here.
c877efb2 544 *
19baf839 545 * A clearer way to write this would be:
c877efb2 546 *
19baf839 547 * to_be_doubled = tn->full_children;
c877efb2 548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
549 * tn->full_children;
550 *
551 * new_child_length = tnode_child_length(tn) * 2;
552 *
c877efb2 553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
554 * new_child_length;
555 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
556 *
557 * ...and so on, tho it would mess up the while () loop.
558 *
19baf839
RO
559 * anyway,
560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
561 * inflate_threshold
c877efb2 562 *
19baf839
RO
563 * avoid a division:
564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
565 * inflate_threshold * new_child_length
c877efb2 566 *
19baf839 567 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 568 * 100 * (tnode_child_length(tn) - tn->empty_children +
19baf839 569 * tn->full_children ) >= inflate_threshold * new_child_length
c877efb2 570 *
19baf839 571 * expand new_child_length:
c877efb2 572 * 100 * (tnode_child_length(tn) - tn->empty_children +
19baf839
RO
573 * tn->full_children ) >=
574 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 575 *
19baf839 576 * shorten again:
c877efb2
SH
577 * 50 * (tn->full_children + tnode_child_length(tn) -
578 * tn->empty_children ) >= inflate_threshold *
19baf839 579 * tnode_child_length(tn)
c877efb2 580 *
19baf839
RO
581 */
582
583 check_tnode(tn);
c877efb2 584
2f36895a 585 err = 0;
19baf839
RO
586 while ((tn->full_children > 0 &&
587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
588 inflate_threshold * tnode_child_length(tn))) {
589
2f36895a
RO
590 tn = inflate(t, tn, &err);
591
c877efb2 592 if (err) {
2f36895a
RO
593#ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
595#endif
596 break;
597 }
19baf839
RO
598 }
599
600 check_tnode(tn);
601
602 /*
603 * Halve as long as the number of empty children in this
604 * node is above threshold.
605 */
2f36895a
RO
606
607 err = 0;
19baf839
RO
608 while (tn->bits > 1 &&
609 100 * (tnode_child_length(tn) - tn->empty_children) <
2f36895a
RO
610 halve_threshold * tnode_child_length(tn)) {
611
612 tn = halve(t, tn, &err);
613
c877efb2 614 if (err) {
2f36895a
RO
615#ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
617#endif
618 break;
619 }
620 }
19baf839 621
c877efb2 622
19baf839
RO
623 /* Only one child remains */
624
625 if (tn->empty_children == tnode_child_length(tn) - 1)
626 for (i = 0; i < tnode_child_length(tn); i++) {
c877efb2 627
19baf839
RO
628 write_lock_bh(&fib_lock);
629 if (tn->child[i] != NULL) {
630 /* compress one level */
631 struct node *n = tn->child[i];
632
c877efb2 633 if (n)
19baf839
RO
634 NODE_INIT_PARENT(n, NODE_TYPE(n));
635
636 write_unlock_bh(&fib_lock);
637 tnode_free(tn);
638 return n;
639 }
640 write_unlock_bh(&fib_lock);
641 }
642
643 return (struct node *) tn;
644}
645
2f36895a 646static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
19baf839
RO
647{
648 struct tnode *inode;
649 struct tnode *oldtnode = tn;
650 int olen = tnode_child_length(tn);
651 int i;
652
c877efb2 653 if (trie_debug)
19baf839
RO
654 printk("In inflate\n");
655
656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657
2f36895a
RO
658 if (!tn) {
659 *err = -ENOMEM;
660 return oldtnode;
661 }
662
663 /*
c877efb2
SH
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
666 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
667 * of tnode is ignored.
668 */
c877efb2 669
2f36895a
RO
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672
673 if (inode &&
674 IS_TNODE(inode) &&
675 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) {
677 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 680
2f36895a
RO
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
683
c877efb2
SH
684 if (!left) {
685 *err = -ENOMEM;
2f36895a
RO
686 break;
687 }
c877efb2 688
2f36895a
RO
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
c877efb2
SH
692 if (!right) {
693 *err = -ENOMEM;
2f36895a
RO
694 break;
695 }
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
c877efb2 702 if (*err) {
2f36895a
RO
703 int size = tnode_child_length(tn);
704 int j;
705
c877efb2
SH
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
2f36895a
RO
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
c877efb2 711
2f36895a
RO
712 *err = -ENOMEM;
713 return oldtnode;
714 }
19baf839
RO
715
716 for(i = 0; i < olen; i++) {
717 struct node *node = tnode_get_child(oldtnode, i);
c877efb2 718
19baf839
RO
719 /* An empty child */
720 if (node == NULL)
721 continue;
722
723 /* A leaf or an internal node with skipped bits */
724
c877efb2 725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 726 tn->pos + tn->bits - 1) {
c877efb2 727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
728 1) == 0)
729 put_child(t, tn, 2*i, node);
730 else
731 put_child(t, tn, 2*i+1, node);
732 continue;
733 }
734
735 /* An internal node with two children */
736 inode = (struct tnode *) node;
737
738 if (inode->bits == 1) {
739 put_child(t, tn, 2*i, inode->child[0]);
740 put_child(t, tn, 2*i+1, inode->child[1]);
741
742 tnode_free(inode);
743 }
744
745 /* An internal node with more than two children */
746 else {
747 struct tnode *left, *right;
748 int size, j;
749
c877efb2
SH
750 /* We will replace this node 'inode' with two new
751 * ones, 'left' and 'right', each with half of the
752 * original children. The two new nodes will have
753 * a position one bit further down the key and this
754 * means that the "significant" part of their keys
755 * (see the discussion near the top of this file)
756 * will differ by one bit, which will be "0" in
757 * left's key and "1" in right's key. Since we are
758 * moving the key position by one step, the bit that
759 * we are moving away from - the bit at position
760 * (inode->pos) - is the one that will differ between
19baf839
RO
761 * left and right. So... we synthesize that bit in the
762 * two new keys.
c877efb2 763 * The mask 'm' below will be a single "one" bit at
19baf839
RO
764 * the position (inode->pos)
765 */
766
c877efb2
SH
767 /* Use the old key, but set the new significant
768 * bit to zero.
19baf839 769 */
19baf839 770
2f36895a
RO
771 left = (struct tnode *) tnode_get_child(tn, 2*i);
772 put_child(t, tn, 2*i, NULL);
773
c877efb2 774 if (!left)
2f36895a
RO
775 BUG();
776
777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
778 put_child(t, tn, 2*i+1, NULL);
779
c877efb2 780 if (!right)
2f36895a 781 BUG();
19baf839 782
19baf839
RO
783 size = tnode_child_length(left);
784 for(j = 0; j < size; j++) {
785 put_child(t, left, j, inode->child[j]);
786 put_child(t, right, j, inode->child[j + size]);
787 }
788 put_child(t, tn, 2*i, resize(t, left));
789 put_child(t, tn, 2*i+1, resize(t, right));
790
791 tnode_free(inode);
792 }
793 }
794 tnode_free(oldtnode);
795 return tn;
796}
797
2f36895a 798static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
19baf839
RO
799{
800 struct tnode *oldtnode = tn;
801 struct node *left, *right;
802 int i;
803 int olen = tnode_child_length(tn);
804
c877efb2
SH
805 if (trie_debug) printk("In halve\n");
806
807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 808
2f36895a
RO
809 if (!tn) {
810 *err = -ENOMEM;
811 return oldtnode;
812 }
813
814 /*
c877efb2
SH
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
817 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
818 * of tnode is ignored.
819 */
820
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
c877efb2 824
2f36895a 825 /* Two nonempty children */
c877efb2 826 if (left && right) {
2f36895a
RO
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829
c877efb2
SH
830 if (!newBinNode) {
831 *err = -ENOMEM;
2f36895a
RO
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837
c877efb2 838 if (*err) {
2f36895a
RO
839 int size = tnode_child_length(tn);
840 int j;
841
c877efb2
SH
842 for(j = 0; j < size; j++)
843 if (tn->child[j])
2f36895a
RO
844 tnode_free((struct tnode *)tn->child[j]);
845
846 tnode_free(tn);
c877efb2 847
2f36895a
RO
848 *err = -ENOMEM;
849 return oldtnode;
850 }
19baf839
RO
851
852 for(i = 0; i < olen; i += 2) {
853 left = tnode_get_child(oldtnode, i);
854 right = tnode_get_child(oldtnode, i+1);
c877efb2 855
19baf839
RO
856 /* At least one of the children is empty */
857 if (left == NULL) {
858 if (right == NULL) /* Both are empty */
859 continue;
860 put_child(t, tn, i/2, right);
861 } else if (right == NULL)
862 put_child(t, tn, i/2, left);
c877efb2 863
19baf839
RO
864 /* Two nonempty children */
865 else {
866 struct tnode *newBinNode =
2f36895a
RO
867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
19baf839 869
c877efb2 870 if (!newBinNode)
2f36895a 871 BUG();
19baf839
RO
872
873 put_child(t, newBinNode, 0, left);
874 put_child(t, newBinNode, 1, right);
875 put_child(t, tn, i/2, resize(t, newBinNode));
876 }
877 }
878 tnode_free(oldtnode);
879 return tn;
880}
881
882static void *trie_init(struct trie *t)
883{
c877efb2 884 if (t) {
19baf839
RO
885 t->size = 0;
886 t->trie = NULL;
887 t->revision = 0;
888#ifdef CONFIG_IP_FIB_TRIE_STATS
889 memset(&t->stats, 0, sizeof(struct trie_use_stats));
890#endif
891 }
892 return t;
893}
894
895static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
896{
897 struct hlist_node *node;
898 struct leaf_info *li;
899
900 hlist_for_each_entry(li, node, head, hlist) {
c877efb2 901 if (li->plen == plen)
19baf839
RO
902 return li;
903 }
904 return NULL;
905}
906
907static inline struct list_head * get_fa_head(struct leaf *l, int plen)
908{
c877efb2 909 struct list_head *fa_head = NULL;
19baf839 910 struct leaf_info *li = find_leaf_info(&l->list, plen);
c877efb2
SH
911
912 if (li)
19baf839 913 fa_head = &li->falh;
c877efb2 914
19baf839
RO
915 return fa_head;
916}
917
918static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
919{
c877efb2 920 struct leaf_info *li = NULL, *last = NULL;
19baf839
RO
921 struct hlist_node *node, *tmp;
922
923 write_lock_bh(&fib_lock);
c877efb2
SH
924
925 if (hlist_empty(head))
19baf839
RO
926 hlist_add_head(&new->hlist, head);
927 else {
928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
c877efb2
SH
929
930 if (new->plen > li->plen)
19baf839 931 break;
c877efb2 932
19baf839
RO
933 last = li;
934 }
c877efb2 935 if (last)
19baf839 936 hlist_add_after(&last->hlist, &new->hlist);
c877efb2 937 else
19baf839
RO
938 hlist_add_before(&new->hlist, &li->hlist);
939 }
940 write_unlock_bh(&fib_lock);
941}
942
943static struct leaf *
944fib_find_node(struct trie *t, u32 key)
945{
946 int pos;
947 struct tnode *tn;
948 struct node *n;
949
950 pos = 0;
c877efb2 951 n = t->trie;
19baf839
RO
952
953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
954 tn = (struct tnode *) n;
c877efb2 955
19baf839 956 check_tnode(tn);
c877efb2
SH
957
958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839
RO
959 pos=tn->pos + tn->bits;
960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
961 }
962 else
963 break;
964 }
965 /* Case we have found a leaf. Compare prefixes */
966
967 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
968 struct leaf *l = (struct leaf *) n;
969 return l;
970 }
971 return NULL;
972}
973
974static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
975{
976 int i = 0;
977 int wasfull;
978 t_key cindex, key;
979 struct tnode *tp = NULL;
980
c877efb2 981 if (!tn)
19baf839 982 BUG();
c877efb2 983
19baf839
RO
984 key = tn->key;
985 i = 0;
986
987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
988
c877efb2 989 if (i > 10) {
19baf839 990 printk("Rebalance tn=%p \n", tn);
c877efb2
SH
991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
992
19baf839 993 printk("Rebalance tp=%p \n", tp);
c877efb2 994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
19baf839
RO
995 }
996
c877efb2 997 if (i > 12) BUG();
19baf839
RO
998 i++;
999
1000 tp = NODE_PARENT(tn);
1001 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
c877efb2
SH
1005
1006 if (!NODE_PARENT(tn))
19baf839
RO
1007 break;
1008
1009 tn = NODE_PARENT(tn);
1010 }
1011 /* Handle last (top) tnode */
c877efb2 1012 if (IS_TNODE(tn))
19baf839
RO
1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1014
1015 return (struct node*) tn;
1016}
1017
f835e471
RO
1018static struct list_head *
1019fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
1020{
1021 int pos, newpos;
1022 struct tnode *tp = NULL, *tn = NULL;
1023 struct node *n;
1024 struct leaf *l;
1025 int missbit;
c877efb2 1026 struct list_head *fa_head = NULL;
19baf839
RO
1027 struct leaf_info *li;
1028 t_key cindex;
1029
1030 pos = 0;
c877efb2 1031 n = t->trie;
19baf839 1032
c877efb2
SH
1033 /* If we point to NULL, stop. Either the tree is empty and we should
1034 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1035 * and we should just put our new leaf in that.
c877efb2
SH
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1038 * not be the parent's 'pos'+'bits'!
1039 *
c877efb2 1040 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1041 * the index from our key, push the T_TNODE and walk the tree.
1042 *
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1044 *
c877efb2
SH
1045 * If we point to a T_LEAF, it might or might not have the same key
1046 * as we do. If it does, just change the value, update the T_LEAF's
1047 * value, and return it.
19baf839
RO
1048 * If it doesn't, we need to replace it with a T_TNODE.
1049 */
1050
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1052 tn = (struct tnode *) n;
19baf839 1053
c877efb2
SH
1054 check_tnode(tn);
1055
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839
RO
1057 tp = tn;
1058 pos=tn->pos + tn->bits;
1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1060
c877efb2 1061 if (n && NODE_PARENT(n) != tn) {
19baf839
RO
1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1063 BUG();
1064 }
1065 }
1066 else
1067 break;
1068 }
1069
1070 /*
1071 * n ----> NULL, LEAF or TNODE
1072 *
c877efb2 1073 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1074 */
1075
c877efb2 1076 if (tp && IS_LEAF(tp))
19baf839
RO
1077 BUG();
1078
19baf839
RO
1079
1080 /* Case 1: n is a leaf. Compare prefixes */
1081
c877efb2 1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
19baf839 1083 struct leaf *l = ( struct leaf *) n;
c877efb2 1084
19baf839 1085 li = leaf_info_new(plen);
c877efb2
SH
1086
1087 if (!li) {
f835e471
RO
1088 *err = -ENOMEM;
1089 goto err;
1090 }
19baf839
RO
1091
1092 fa_head = &li->falh;
1093 insert_leaf_info(&l->list, li);
1094 goto done;
1095 }
1096 t->size++;
1097 l = leaf_new();
1098
c877efb2 1099 if (!l) {
f835e471
RO
1100 *err = -ENOMEM;
1101 goto err;
1102 }
19baf839
RO
1103
1104 l->key = key;
1105 li = leaf_info_new(plen);
1106
c877efb2 1107 if (!li) {
f835e471
RO
1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
19baf839
RO
1112
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1115
1116 /* Case 2: n is NULL, and will just insert a new leaf */
1117 if (t->trie && n == NULL) {
1118
1119 NODE_SET_PARENT(l, tp);
c877efb2
SH
1120
1121 if (!tp)
19baf839
RO
1122 BUG();
1123
1124 else {
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1127 }
1128 }
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 else {
c877efb2
SH
1131 /*
1132 * Add a new tnode here
19baf839
RO
1133 * first tnode need some special handling
1134 */
1135
1136 if (tp)
1137 pos=tp->pos+tp->bits;
1138 else
1139 pos=0;
c877efb2 1140 if (n) {
19baf839
RO
1141 newpos = tkey_mismatch(key, pos, n->key);
1142 tn = tnode_new(n->key, newpos, 1);
1143 }
1144 else {
1145 newpos = 0;
c877efb2 1146 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1147 }
19baf839 1148
c877efb2 1149 if (!tn) {
f835e471
RO
1150 free_leaf_info(li);
1151 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM;
1153 goto err;
c877efb2
SH
1154 }
1155
19baf839
RO
1156 NODE_SET_PARENT(tn, tp);
1157
1158 missbit=tkey_extract_bits(key, newpos, 1);
1159 put_child(t, tn, missbit, (struct node *)l);
1160 put_child(t, tn, 1-missbit, n);
1161
c877efb2 1162 if (tp) {
19baf839
RO
1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1165 }
c877efb2 1166 else {
19baf839
RO
1167 t->trie = (struct node*) tn; /* First tnode */
1168 tp = tn;
1169 }
1170 }
c877efb2
SH
1171 if (tp && tp->pos+tp->bits > 32) {
1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839
RO
1173 tp, tp->pos, tp->bits, key, plen);
1174 }
1175 /* Rebalance the trie */
1176 t->trie = trie_rebalance(t, tp);
f835e471
RO
1177done:
1178 t->revision++;
1179err:;
19baf839
RO
1180 return fa_head;
1181}
1182
1183static int
1184fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1185 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1186{
1187 struct trie *t = (struct trie *) tb->tb_data;
1188 struct fib_alias *fa, *new_fa;
c877efb2 1189 struct list_head *fa_head = NULL;
19baf839
RO
1190 struct fib_info *fi;
1191 int plen = r->rtm_dst_len;
1192 int type = r->rtm_type;
1193 u8 tos = r->rtm_tos;
1194 u32 key, mask;
1195 int err;
1196 struct leaf *l;
1197
1198 if (plen > 32)
1199 return -EINVAL;
1200
1201 key = 0;
c877efb2 1202 if (rta->rta_dst)
19baf839
RO
1203 memcpy(&key, rta->rta_dst, 4);
1204
1205 key = ntohl(key);
1206
c877efb2 1207 if (trie_debug)
19baf839
RO
1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1209
c877efb2 1210 mask = ntohl( inet_make_mask(plen) );
19baf839 1211
c877efb2 1212 if (key & ~mask)
19baf839
RO
1213 return -EINVAL;
1214
1215 key = key & mask;
1216
1217 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1218 goto err;
1219
1220 l = fib_find_node(t, key);
c877efb2 1221 fa = NULL;
19baf839 1222
c877efb2 1223 if (l) {
19baf839
RO
1224 fa_head = get_fa_head(l, plen);
1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1226 }
1227
1228 /* Now fa, if non-NULL, points to the first fib alias
1229 * with the same keys [prefix,tos,priority], if such key already
1230 * exists or to the node before which we will insert new one.
1231 *
1232 * If fa is NULL, we will need to allocate a new one and
1233 * insert to the head of f.
1234 *
1235 * If f is NULL, no fib node matched the destination key
1236 * and we need to allocate a new one of those as well.
1237 */
1238
1239 if (fa &&
1240 fa->fa_info->fib_priority == fi->fib_priority) {
1241 struct fib_alias *fa_orig;
1242
1243 err = -EEXIST;
1244 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1245 goto out;
1246
1247 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1248 struct fib_info *fi_drop;
1249 u8 state;
1250
1251 write_lock_bh(&fib_lock);
1252
1253 fi_drop = fa->fa_info;
1254 fa->fa_info = fi;
1255 fa->fa_type = type;
1256 fa->fa_scope = r->rtm_scope;
1257 state = fa->fa_state;
1258 fa->fa_state &= ~FA_S_ACCESSED;
1259
1260 write_unlock_bh(&fib_lock);
1261
1262 fib_release_info(fi_drop);
1263 if (state & FA_S_ACCESSED)
1264 rt_cache_flush(-1);
1265
1266 goto succeeded;
1267 }
1268 /* Error if we find a perfect match which
1269 * uses the same scope, type, and nexthop
1270 * information.
1271 */
1272 fa_orig = fa;
1273 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1274 if (fa->fa_tos != tos)
1275 break;
1276 if (fa->fa_info->fib_priority != fi->fib_priority)
1277 break;
1278 if (fa->fa_type == type &&
1279 fa->fa_scope == r->rtm_scope &&
1280 fa->fa_info == fi) {
1281 goto out;
1282 }
1283 }
1284 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1285 fa = fa_orig;
1286 }
1287 err = -ENOENT;
1288 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1289 goto out;
1290
1291 err = -ENOBUFS;
1292 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1293 if (new_fa == NULL)
1294 goto out;
1295
1296 new_fa->fa_info = fi;
1297 new_fa->fa_tos = tos;
1298 new_fa->fa_type = type;
1299 new_fa->fa_scope = r->rtm_scope;
1300 new_fa->fa_state = 0;
1301#if 0
c877efb2 1302 new_fa->dst = NULL;
19baf839
RO
1303#endif
1304 /*
1305 * Insert new entry to the list.
1306 */
1307
c877efb2 1308 if (!fa_head) {
f835e471
RO
1309 fa_head = fib_insert_node(t, &err, key, plen);
1310 err = 0;
c877efb2 1311 if (err)
f835e471
RO
1312 goto out_free_new_fa;
1313 }
19baf839
RO
1314
1315 write_lock_bh(&fib_lock);
1316
1317 list_add_tail(&new_fa->fa_list,
1318 (fa ? &fa->fa_list : fa_head));
1319
1320 write_unlock_bh(&fib_lock);
1321
1322 rt_cache_flush(-1);
1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1324succeeded:
1325 return 0;
f835e471
RO
1326
1327out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1329out:
1330 fib_release_info(fi);
c877efb2 1331err:;
19baf839
RO
1332 return err;
1333}
1334
c877efb2 1335static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
06c74270 1336 struct fib_result *res)
19baf839 1337{
06c74270 1338 int err, i;
19baf839
RO
1339 t_key mask;
1340 struct leaf_info *li;
1341 struct hlist_head *hhead = &l->list;
1342 struct hlist_node *node;
c877efb2 1343
19baf839
RO
1344 hlist_for_each_entry(li, node, hhead, hlist) {
1345
1346 i = li->plen;
1347 mask = ntohl(inet_make_mask(i));
c877efb2 1348 if (l->key != (key & mask))
19baf839
RO
1349 continue;
1350
06c74270 1351 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
19baf839
RO
1352 *plen = i;
1353#ifdef CONFIG_IP_FIB_TRIE_STATS
1354 t->stats.semantic_match_passed++;
1355#endif
06c74270 1356 return err;
19baf839
RO
1357 }
1358#ifdef CONFIG_IP_FIB_TRIE_STATS
1359 t->stats.semantic_match_miss++;
1360#endif
1361 }
06c74270 1362 return 1;
19baf839
RO
1363}
1364
1365static int
1366fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1367{
1368 struct trie *t = (struct trie *) tb->tb_data;
1369 int plen, ret = 0;
1370 struct node *n;
1371 struct tnode *pn;
1372 int pos, bits;
1373 t_key key=ntohl(flp->fl4_dst);
1374 int chopped_off;
1375 t_key cindex = 0;
1376 int current_prefix_length = KEYLENGTH;
1377 n = t->trie;
1378
1379 read_lock(&fib_lock);
c877efb2 1380 if (!n)
19baf839
RO
1381 goto failed;
1382
1383#ifdef CONFIG_IP_FIB_TRIE_STATS
1384 t->stats.gets++;
1385#endif
1386
1387 /* Just a leaf? */
1388 if (IS_LEAF(n)) {
06c74270 1389 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1390 goto found;
1391 goto failed;
1392 }
1393 pn = (struct tnode *) n;
1394 chopped_off = 0;
c877efb2 1395
19baf839
RO
1396 while (pn) {
1397
1398 pos = pn->pos;
1399 bits = pn->bits;
1400
c877efb2 1401 if (!chopped_off)
19baf839
RO
1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1403
1404 n = tnode_get_child(pn, cindex);
1405
1406 if (n == NULL) {
1407#ifdef CONFIG_IP_FIB_TRIE_STATS
1408 t->stats.null_node_hit++;
1409#endif
1410 goto backtrace;
1411 }
1412
1413 if (IS_TNODE(n)) {
1414#define HL_OPTIMIZE
1415#ifdef HL_OPTIMIZE
1416 struct tnode *cn = (struct tnode *)n;
1417 t_key node_prefix, key_prefix, pref_mismatch;
1418 int mp;
1419
1420 /*
c877efb2 1421 * It's a tnode, and we can do some extra checks here if we
19baf839 1422 * like, to avoid descending into a dead-end branch.
c877efb2
SH
1423 * This tnode is in the parent's child array at index
1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1425 * chopped off, so in reality the index may be just a
19baf839 1426 * subprefix, padded with zero at the end.
c877efb2
SH
1427 * We can also take a look at any skipped bits in this
1428 * tnode - everything up to p_pos is supposed to be ok,
19baf839 1429 * and the non-chopped bits of the index (se previous
c877efb2 1430 * paragraph) are also guaranteed ok, but the rest is
19baf839
RO
1431 * considered unknown.
1432 *
1433 * The skipped bits are key[pos+bits..cn->pos].
1434 */
c877efb2
SH
1435
1436 /* If current_prefix_length < pos+bits, we are already doing
1437 * actual prefix matching, which means everything from
1438 * pos+(bits-chopped_off) onward must be zero along some
1439 * branch of this subtree - otherwise there is *no* valid
19baf839 1440 * prefix present. Here we can only check the skipped
c877efb2
SH
1441 * bits. Remember, since we have already indexed into the
1442 * parent's child array, we know that the bits we chopped of
19baf839
RO
1443 * *are* zero.
1444 */
1445
1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
c877efb2 1447
19baf839
RO
1448 if (current_prefix_length < pos+bits) {
1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1450 cn->pos - current_prefix_length) != 0 ||
1451 !(cn->child[0]))
1452 goto backtrace;
1453 }
1454
1455 /*
c877efb2
SH
1456 * If chopped_off=0, the index is fully validated and we
1457 * only need to look at the skipped bits for this, the new,
19baf839
RO
1458 * tnode. What we actually want to do is to find out if
1459 * these skipped bits match our key perfectly, or if we will
c877efb2
SH
1460 * have to count on finding a matching prefix further down,
1461 * because if we do, we would like to have some way of
1462 * verifying the existence of such a prefix at this point.
19baf839
RO
1463 */
1464
1465 /* The only thing we can do at this point is to verify that
1466 * any such matching prefix can indeed be a prefix to our
1467 * key, and if the bits in the node we are inspecting that
1468 * do not match our key are not ZERO, this cannot be true.
1469 * Thus, find out where there is a mismatch (before cn->pos)
1470 * and verify that all the mismatching bits are zero in the
1471 * new tnode's key.
1472 */
1473
c877efb2
SH
1474 /* Note: We aren't very concerned about the piece of the key
1475 * that precede pn->pos+pn->bits, since these have already been
1476 * checked. The bits after cn->pos aren't checked since these are
1477 * by definition "unknown" at this point. Thus, what we want to
1478 * see is if we are about to enter the "prefix matching" state,
1479 * and in that case verify that the skipped bits that will prevail
1480 * throughout this subtree are zero, as they have to be if we are
19baf839
RO
1481 * to find a matching prefix.
1482 */
1483
1484 node_prefix = MASK_PFX(cn->key, cn->pos);
c877efb2 1485 key_prefix = MASK_PFX(key, cn->pos);
19baf839
RO
1486 pref_mismatch = key_prefix^node_prefix;
1487 mp = 0;
1488
c877efb2 1489 /* In short: If skipped bits in this node do not match the search
19baf839
RO
1490 * key, enter the "prefix matching" state.directly.
1491 */
1492 if (pref_mismatch) {
1493 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1494 mp++;
1495 pref_mismatch = pref_mismatch <<1;
1496 }
1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
c877efb2 1498
19baf839
RO
1499 if (key_prefix != 0)
1500 goto backtrace;
1501
1502 if (current_prefix_length >= cn->pos)
1503 current_prefix_length=mp;
1504 }
1505#endif
1506 pn = (struct tnode *)n; /* Descend */
1507 chopped_off = 0;
1508 continue;
c877efb2
SH
1509 }
1510 if (IS_LEAF(n)) {
06c74270 1511 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1512 goto found;
1513 }
1514backtrace:
1515 chopped_off++;
1516
1517 /* As zero don't change the child key (cindex) */
1518 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1519 chopped_off++;
1520 }
1521
1522 /* Decrease current_... with bits chopped off */
1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
c877efb2 1525
19baf839 1526 /*
c877efb2 1527 * Either we do the actual chop off according or if we have
19baf839
RO
1528 * chopped off all bits in this tnode walk up to our parent.
1529 */
1530
c877efb2 1531 if (chopped_off <= pn->bits)
19baf839
RO
1532 cindex &= ~(1 << (chopped_off-1));
1533 else {
c877efb2 1534 if (NODE_PARENT(pn) == NULL)
19baf839 1535 goto failed;
c877efb2 1536
19baf839
RO
1537 /* Get Child's index */
1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1539 pn = NODE_PARENT(pn);
1540 chopped_off = 0;
1541
1542#ifdef CONFIG_IP_FIB_TRIE_STATS
1543 t->stats.backtrack++;
1544#endif
1545 goto backtrace;
c877efb2 1546 }
19baf839
RO
1547 }
1548failed:
c877efb2 1549 ret = 1;
19baf839
RO
1550found:
1551 read_unlock(&fib_lock);
1552 return ret;
1553}
1554
1555static int trie_leaf_remove(struct trie *t, t_key key)
1556{
1557 t_key cindex;
1558 struct tnode *tp = NULL;
1559 struct node *n = t->trie;
1560 struct leaf *l;
1561
c877efb2 1562 if (trie_debug)
19baf839
RO
1563 printk("entering trie_leaf_remove(%p)\n", n);
1564
1565 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1566 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1567 * T_LEAF may or may not match our key.
1568 */
1569
1570 while (n != NULL && IS_TNODE(n)) {
1571 struct tnode *tn = (struct tnode *) n;
1572 check_tnode(tn);
1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1574
c877efb2 1575 if (n && NODE_PARENT(n) != tn) {
19baf839
RO
1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1577 BUG();
1578 }
1579 }
1580 l = (struct leaf *) n;
1581
c877efb2 1582 if (!n || !tkey_equals(l->key, key))
19baf839 1583 return 0;
c877efb2
SH
1584
1585 /*
1586 * Key found.
1587 * Remove the leaf and rebalance the tree
19baf839
RO
1588 */
1589
1590 t->revision++;
1591 t->size--;
1592
1593 tp = NODE_PARENT(n);
1594 tnode_free((struct tnode *) n);
1595
c877efb2 1596 if (tp) {
19baf839
RO
1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1599 t->trie = trie_rebalance(t, tp);
1600 }
1601 else
1602 t->trie = NULL;
1603
1604 return 1;
1605}
1606
1607static int
1608fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1609 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1610{
1611 struct trie *t = (struct trie *) tb->tb_data;
1612 u32 key, mask;
1613 int plen = r->rtm_dst_len;
1614 u8 tos = r->rtm_tos;
1615 struct fib_alias *fa, *fa_to_delete;
1616 struct list_head *fa_head;
1617 struct leaf *l;
1618
c877efb2 1619 if (plen > 32)
19baf839
RO
1620 return -EINVAL;
1621
1622 key = 0;
c877efb2 1623 if (rta->rta_dst)
19baf839
RO
1624 memcpy(&key, rta->rta_dst, 4);
1625
1626 key = ntohl(key);
c877efb2 1627 mask = ntohl( inet_make_mask(plen) );
19baf839 1628
c877efb2 1629 if (key & ~mask)
19baf839
RO
1630 return -EINVAL;
1631
1632 key = key & mask;
1633 l = fib_find_node(t, key);
1634
c877efb2 1635 if (!l)
19baf839
RO
1636 return -ESRCH;
1637
1638 fa_head = get_fa_head(l, plen);
1639 fa = fib_find_alias(fa_head, tos, 0);
1640
1641 if (!fa)
1642 return -ESRCH;
1643
1644 if (trie_debug)
1645 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1646
1647 fa_to_delete = NULL;
1648 fa_head = fa->fa_list.prev;
1649 list_for_each_entry(fa, fa_head, fa_list) {
1650 struct fib_info *fi = fa->fa_info;
1651
1652 if (fa->fa_tos != tos)
1653 break;
1654
1655 if ((!r->rtm_type ||
1656 fa->fa_type == r->rtm_type) &&
1657 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1658 fa->fa_scope == r->rtm_scope) &&
1659 (!r->rtm_protocol ||
1660 fi->fib_protocol == r->rtm_protocol) &&
1661 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1662 fa_to_delete = fa;
1663 break;
1664 }
1665 }
1666
1667 if (fa_to_delete) {
1668 int kill_li = 0;
1669 struct leaf_info *li;
1670
1671 fa = fa_to_delete;
1672 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1673
1674 l = fib_find_node(t, key);
1675 li = find_leaf_info(&l->list, plen);
1676
1677 write_lock_bh(&fib_lock);
1678
1679 list_del(&fa->fa_list);
1680
c877efb2 1681 if (list_empty(fa_head)) {
19baf839
RO
1682 hlist_del(&li->hlist);
1683 kill_li = 1;
1684 }
1685 write_unlock_bh(&fib_lock);
c877efb2
SH
1686
1687 if (kill_li)
19baf839
RO
1688 free_leaf_info(li);
1689
c877efb2 1690 if (hlist_empty(&l->list))
19baf839
RO
1691 trie_leaf_remove(t, key);
1692
1693 if (fa->fa_state & FA_S_ACCESSED)
1694 rt_cache_flush(-1);
1695
1696 fn_free_alias(fa);
1697 return 0;
1698 }
1699 return -ESRCH;
1700}
1701
1702static int trie_flush_list(struct trie *t, struct list_head *head)
1703{
1704 struct fib_alias *fa, *fa_node;
1705 int found = 0;
1706
1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1708 struct fib_info *fi = fa->fa_info;
c877efb2 1709
19baf839
RO
1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1711
c877efb2 1712 write_lock_bh(&fib_lock);
19baf839 1713 list_del(&fa->fa_list);
c877efb2 1714 write_unlock_bh(&fib_lock);
19baf839
RO
1715
1716 fn_free_alias(fa);
1717 found++;
1718 }
1719 }
1720 return found;
1721}
1722
1723static int trie_flush_leaf(struct trie *t, struct leaf *l)
1724{
1725 int found = 0;
1726 struct hlist_head *lih = &l->list;
1727 struct hlist_node *node, *tmp;
1728 struct leaf_info *li = NULL;
1729
1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
c877efb2 1731
19baf839
RO
1732 found += trie_flush_list(t, &li->falh);
1733
1734 if (list_empty(&li->falh)) {
1735
c877efb2 1736 write_lock_bh(&fib_lock);
19baf839 1737 hlist_del(&li->hlist);
c877efb2 1738 write_unlock_bh(&fib_lock);
19baf839
RO
1739
1740 free_leaf_info(li);
1741 }
1742 }
1743 return found;
1744}
1745
1746static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1747{
1748 struct node *c = (struct node *) thisleaf;
1749 struct tnode *p;
1750 int idx;
1751
c877efb2
SH
1752 if (c == NULL) {
1753 if (t->trie == NULL)
19baf839
RO
1754 return NULL;
1755
1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1757 return (struct leaf *) t->trie;
1758
1759 p = (struct tnode*) t->trie; /* Start */
1760 }
c877efb2 1761 else
19baf839 1762 p = (struct tnode *) NODE_PARENT(c);
c877efb2 1763
19baf839
RO
1764 while (p) {
1765 int pos, last;
1766
1767 /* Find the next child of the parent */
c877efb2
SH
1768 if (c)
1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1770 else
19baf839
RO
1771 pos = 0;
1772
1773 last = 1 << p->bits;
1774 for(idx = pos; idx < last ; idx++) {
c877efb2 1775 if (p->child[idx]) {
19baf839
RO
1776
1777 /* Decend if tnode */
1778
1779 while (IS_TNODE(p->child[idx])) {
1780 p = (struct tnode*) p->child[idx];
1781 idx = 0;
c877efb2 1782
19baf839 1783 /* Rightmost non-NULL branch */
c877efb2
SH
1784 if (p && IS_TNODE(p))
1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
19baf839
RO
1786
1787 /* Done with this tnode? */
c877efb2 1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
19baf839
RO
1789 goto up;
1790 }
1791 return (struct leaf*) p->child[idx];
1792 }
1793 }
1794up:
1795 /* No more children go up one step */
1796 c = (struct node*) p;
1797 p = (struct tnode *) NODE_PARENT(p);
1798 }
1799 return NULL; /* Ready. Root of trie */
1800}
1801
1802static int fn_trie_flush(struct fib_table *tb)
1803{
1804 struct trie *t = (struct trie *) tb->tb_data;
1805 struct leaf *ll = NULL, *l = NULL;
1806 int found = 0, h;
1807
1808 t->revision++;
1809
1810 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1811 found += trie_flush_leaf(t, l);
1812
1813 if (ll && hlist_empty(&ll->list))
1814 trie_leaf_remove(t, ll->key);
1815 ll = l;
1816 }
1817
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll->key);
1820
c877efb2 1821 if (trie_debug)
19baf839
RO
1822 printk("trie_flush found=%d\n", found);
1823 return found;
1824}
1825
1826static int trie_last_dflt=-1;
1827
1828static void
1829fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1830{
1831 struct trie *t = (struct trie *) tb->tb_data;
1832 int order, last_idx;
1833 struct fib_info *fi = NULL;
1834 struct fib_info *last_resort;
1835 struct fib_alias *fa = NULL;
1836 struct list_head *fa_head;
1837 struct leaf *l;
1838
1839 last_idx = -1;
1840 last_resort = NULL;
1841 order = -1;
1842
1843 read_lock(&fib_lock);
c877efb2 1844
19baf839 1845 l = fib_find_node(t, 0);
c877efb2 1846 if (!l)
19baf839
RO
1847 goto out;
1848
1849 fa_head = get_fa_head(l, 0);
c877efb2 1850 if (!fa_head)
19baf839
RO
1851 goto out;
1852
c877efb2 1853 if (list_empty(fa_head))
19baf839
RO
1854 goto out;
1855
1856 list_for_each_entry(fa, fa_head, fa_list) {
1857 struct fib_info *next_fi = fa->fa_info;
c877efb2 1858
19baf839
RO
1859 if (fa->fa_scope != res->scope ||
1860 fa->fa_type != RTN_UNICAST)
1861 continue;
c877efb2 1862
19baf839
RO
1863 if (next_fi->fib_priority > res->fi->fib_priority)
1864 break;
1865 if (!next_fi->fib_nh[0].nh_gw ||
1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1867 continue;
1868 fa->fa_state |= FA_S_ACCESSED;
c877efb2 1869
19baf839
RO
1870 if (fi == NULL) {
1871 if (next_fi != res->fi)
1872 break;
1873 } else if (!fib_detect_death(fi, order, &last_resort,
1874 &last_idx, &trie_last_dflt)) {
1875 if (res->fi)
1876 fib_info_put(res->fi);
1877 res->fi = fi;
1878 atomic_inc(&fi->fib_clntref);
1879 trie_last_dflt = order;
1880 goto out;
1881 }
1882 fi = next_fi;
1883 order++;
1884 }
1885 if (order <= 0 || fi == NULL) {
1886 trie_last_dflt = -1;
1887 goto out;
1888 }
1889
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1891 if (res->fi)
1892 fib_info_put(res->fi);
1893 res->fi = fi;
1894 atomic_inc(&fi->fib_clntref);
1895 trie_last_dflt = order;
1896 goto out;
1897 }
1898 if (last_idx >= 0) {
1899 if (res->fi)
1900 fib_info_put(res->fi);
1901 res->fi = last_resort;
1902 if (last_resort)
1903 atomic_inc(&last_resort->fib_clntref);
1904 }
1905 trie_last_dflt = last_idx;
1906 out:;
c877efb2 1907 read_unlock(&fib_lock);
19baf839
RO
1908}
1909
c877efb2 1910static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1911 struct sk_buff *skb, struct netlink_callback *cb)
1912{
1913 int i, s_i;
1914 struct fib_alias *fa;
1915
1916 u32 xkey=htonl(key);
1917
1918 s_i=cb->args[3];
1919 i = 0;
1920
1921 list_for_each_entry(fa, fah, fa_list) {
1922 if (i < s_i) {
1923 i++;
1924 continue;
1925 }
1926 if (fa->fa_info->fib_nh == NULL) {
1927 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1928 i++;
1929 continue;
1930 }
1931 if (fa->fa_info == NULL) {
1932 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1933 i++;
1934 continue;
1935 }
1936
1937 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1938 cb->nlh->nlmsg_seq,
1939 RTM_NEWROUTE,
1940 tb->tb_id,
1941 fa->fa_type,
1942 fa->fa_scope,
1943 &xkey,
1944 plen,
1945 fa->fa_tos,
90f66914 1946 fa->fa_info, 0) < 0) {
19baf839
RO
1947 cb->args[3] = i;
1948 return -1;
1949 }
1950 i++;
1951 }
1952 cb->args[3]=i;
1953 return skb->len;
1954}
1955
c877efb2 1956static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1957 struct netlink_callback *cb)
1958{
1959 int h, s_h;
1960 struct list_head *fa_head;
1961 struct leaf *l = NULL;
1962 s_h=cb->args[2];
1963
1964 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1965
1966 if (h < s_h)
1967 continue;
1968 if (h > s_h)
1969 memset(&cb->args[3], 0,
1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1971
1972 fa_head = get_fa_head(l, plen);
c877efb2
SH
1973
1974 if (!fa_head)
19baf839
RO
1975 continue;
1976
c877efb2 1977 if (list_empty(fa_head))
19baf839
RO
1978 continue;
1979
1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1981 cb->args[2]=h;
1982 return -1;
1983 }
1984 }
1985 cb->args[2]=h;
1986 return skb->len;
1987}
1988
1989static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1990{
1991 int m, s_m;
1992 struct trie *t = (struct trie *) tb->tb_data;
1993
1994 s_m = cb->args[1];
1995
1996 read_lock(&fib_lock);
1997 for (m=0; m<=32; m++) {
1998
1999 if (m < s_m)
2000 continue;
2001 if (m > s_m)
2002 memset(&cb->args[2], 0,
2003 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2004
2005 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2006 cb->args[1] = m;
2007 goto out;
2008 }
2009 }
2010 read_unlock(&fib_lock);
2011 cb->args[1] = m;
2012 return skb->len;
2013 out:
2014 read_unlock(&fib_lock);
2015 return -1;
2016}
2017
2018/* Fix more generic FIB names for init later */
2019
2020#ifdef CONFIG_IP_MULTIPLE_TABLES
2021struct fib_table * fib_hash_init(int id)
2022#else
2023struct fib_table * __init fib_hash_init(int id)
2024#endif
2025{
2026 struct fib_table *tb;
2027 struct trie *t;
2028
2029 if (fn_alias_kmem == NULL)
2030 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2031 sizeof(struct fib_alias),
2032 0, SLAB_HWCACHE_ALIGN,
2033 NULL, NULL);
2034
2035 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2036 GFP_KERNEL);
2037 if (tb == NULL)
2038 return NULL;
2039
2040 tb->tb_id = id;
2041 tb->tb_lookup = fn_trie_lookup;
2042 tb->tb_insert = fn_trie_insert;
2043 tb->tb_delete = fn_trie_delete;
2044 tb->tb_flush = fn_trie_flush;
2045 tb->tb_select_default = fn_trie_select_default;
2046 tb->tb_dump = fn_trie_dump;
2047 memset(tb->tb_data, 0, sizeof(struct trie));
2048
2049 t = (struct trie *) tb->tb_data;
2050
2051 trie_init(t);
2052
c877efb2
SH
2053 if (id == RT_TABLE_LOCAL)
2054 trie_local = t;
2055 else if (id == RT_TABLE_MAIN)
2056 trie_main = t;
19baf839
RO
2057
2058 if (id == RT_TABLE_LOCAL)
2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2060
2061 return tb;
2062}
2063
2064/* Trie dump functions */
2065
2066static void putspace_seq(struct seq_file *seq, int n)
2067{
2068 while (n--) seq_printf(seq, " ");
2069}
2070
2071static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2072{
2073 while (bits--)
2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2075}
2076
c877efb2 2077static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
19baf839
RO
2078 int pend, int cindex, int bits)
2079{
2080 putspace_seq(seq, indent);
2081 if (IS_LEAF(n))
2082 seq_printf(seq, "|");
2083 else
2084 seq_printf(seq, "+");
2085 if (bits) {
2086 seq_printf(seq, "%d/", cindex);
2087 printbin_seq(seq, cindex, bits);
2088 seq_printf(seq, ": ");
2089 }
2090 else
2091 seq_printf(seq, "<root>: ");
2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2093
2094 if (IS_LEAF(n))
c877efb2 2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
19baf839
RO
2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2097 else {
c877efb2 2098 int plen = ((struct tnode *)n)->pos;
19baf839 2099 t_key prf=MASK_PFX(n->key, plen);
c877efb2 2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
19baf839
RO
2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2102 }
2103 if (IS_LEAF(n)) {
2104 struct leaf *l=(struct leaf *)n;
2105 struct fib_alias *fa;
2106 int i;
2107 for (i=32; i>=0; i--)
c877efb2
SH
2108 if (find_leaf_info(&l->list, i)) {
2109
19baf839 2110 struct list_head *fa_head = get_fa_head(l, i);
c877efb2
SH
2111
2112 if (!fa_head)
19baf839
RO
2113 continue;
2114
c877efb2 2115 if (list_empty(fa_head))
19baf839
RO
2116 continue;
2117
2118 putspace_seq(seq, indent+2);
2119 seq_printf(seq, "{/%d...dumping}\n", i);
2120
2121
2122 list_for_each_entry(fa, fa_head, fa_list) {
2123 putspace_seq(seq, indent+2);
2124 if (fa->fa_info->fib_nh == NULL) {
2125 seq_printf(seq, "Error _fib_nh=NULL\n");
2126 continue;
2127 }
2128 if (fa->fa_info == NULL) {
2129 seq_printf(seq, "Error fa_info=NULL\n");
2130 continue;
2131 }
2132
2133 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2134 fa->fa_type,
2135 fa->fa_scope,
2136 fa->fa_tos);
2137 }
2138 }
2139 }
2140 else if (IS_TNODE(n)) {
c877efb2 2141 struct tnode *tn = (struct tnode *)n;
19baf839
RO
2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2145 seq_printf(seq, "}\n");
2146 putspace_seq(seq, indent); seq_printf(seq, "| ");
2147 seq_printf(seq, "{pos=%d", tn->pos);
2148 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2149 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2150 putspace_seq(seq, indent); seq_printf(seq, "| ");
2151 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2152 }
2153}
2154
2155static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2156{
c877efb2 2157 struct node *n = t->trie;
19baf839
RO
2158 int cindex=0;
2159 int indent=1;
2160 int pend=0;
2161 int depth = 0;
2162
2163 read_lock(&fib_lock);
2164
2165 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2166 if (n) {
2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2168 if (IS_TNODE(n)) {
c877efb2 2169 struct tnode *tn = (struct tnode *)n;
19baf839
RO
2170 pend = tn->pos+tn->bits;
2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2172 indent += 3;
2173 depth++;
2174
2175 while (tn && cindex < (1 << tn->bits)) {
2176 if (tn->child[cindex]) {
c877efb2 2177
19baf839 2178 /* Got a child */
c877efb2 2179
19baf839 2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
c877efb2 2181 if (IS_LEAF(tn->child[cindex])) {
19baf839 2182 cindex++;
c877efb2 2183
19baf839
RO
2184 }
2185 else {
c877efb2
SH
2186 /*
2187 * New tnode. Decend one level
19baf839 2188 */
c877efb2 2189
19baf839 2190 depth++;
c877efb2
SH
2191 n = tn->child[cindex];
2192 tn = (struct tnode *)n;
2193 pend = tn->pos+tn->bits;
19baf839
RO
2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2195 indent+=3;
2196 cindex=0;
2197 }
2198 }
c877efb2 2199 else
19baf839
RO
2200 cindex++;
2201
2202 /*
c877efb2 2203 * Test if we are done
19baf839 2204 */
c877efb2 2205
19baf839
RO
2206 while (cindex >= (1 << tn->bits)) {
2207
2208 /*
2209 * Move upwards and test for root
2210 * pop off all traversed nodes
2211 */
c877efb2 2212
19baf839
RO
2213 if (NODE_PARENT(tn) == NULL) {
2214 tn = NULL;
2215 n = NULL;
2216 break;
2217 }
2218 else {
2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2220 tn = NODE_PARENT(tn);
2221 cindex++;
c877efb2
SH
2222 n = (struct node *)tn;
2223 pend = tn->pos+tn->bits;
19baf839
RO
2224 indent-=3;
2225 depth--;
2226 }
2227 }
2228 }
2229 }
2230 else n = NULL;
2231 }
2232 else seq_printf(seq, "------ trie is empty\n");
2233
2234 read_unlock(&fib_lock);
2235}
2236
2237static struct trie_stat *trie_stat_new(void)
2238{
2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2240 int i;
c877efb2
SH
2241
2242 if (s) {
19baf839
RO
2243 s->totdepth = 0;
2244 s->maxdepth = 0;
2245 s->tnodes = 0;
2246 s->leaves = 0;
2247 s->nullpointers = 0;
c877efb2 2248
19baf839
RO
2249 for(i=0; i< MAX_CHILDS; i++)
2250 s->nodesizes[i] = 0;
2251 }
2252 return s;
c877efb2 2253}
19baf839
RO
2254
2255static struct trie_stat *trie_collect_stats(struct trie *t)
2256{
c877efb2 2257 struct node *n = t->trie;
19baf839
RO
2258 struct trie_stat *s = trie_stat_new();
2259 int cindex = 0;
2260 int indent = 1;
2261 int pend = 0;
2262 int depth = 0;
2263
c877efb2 2264 read_lock(&fib_lock);
19baf839
RO
2265
2266 if (s) {
2267 if (n) {
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *)n;
c877efb2 2270 pend = tn->pos+tn->bits;
19baf839
RO
2271 indent += 3;
2272 s->nodesizes[tn->bits]++;
2273 depth++;
2274
2275 while (tn && cindex < (1 << tn->bits)) {
2276 if (tn->child[cindex]) {
2277 /* Got a child */
c877efb2
SH
2278
2279 if (IS_LEAF(tn->child[cindex])) {
19baf839 2280 cindex++;
c877efb2 2281
19baf839
RO
2282 /* stats */
2283 if (depth > s->maxdepth)
2284 s->maxdepth = depth;
2285 s->totdepth += depth;
2286 s->leaves++;
2287 }
c877efb2 2288
19baf839 2289 else {
c877efb2
SH
2290 /*
2291 * New tnode. Decend one level
19baf839 2292 */
c877efb2 2293
19baf839
RO
2294 s->tnodes++;
2295 s->nodesizes[tn->bits]++;
2296 depth++;
c877efb2 2297
19baf839
RO
2298 n = tn->child[cindex];
2299 tn = (struct tnode *)n;
2300 pend = tn->pos+tn->bits;
2301
2302 indent += 3;
2303 cindex = 0;
2304 }
2305 }
2306 else {
2307 cindex++;
c877efb2 2308 s->nullpointers++;
19baf839
RO
2309 }
2310
2311 /*
c877efb2 2312 * Test if we are done
19baf839 2313 */
c877efb2 2314
19baf839
RO
2315 while (cindex >= (1 << tn->bits)) {
2316
2317 /*
2318 * Move upwards and test for root
2319 * pop off all traversed nodes
2320 */
2321
c877efb2 2322
19baf839
RO
2323 if (NODE_PARENT(tn) == NULL) {
2324 tn = NULL;
2325 n = NULL;
2326 break;
2327 }
2328 else {
2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2330 tn = NODE_PARENT(tn);
c877efb2 2331 cindex++;
19baf839 2332 n = (struct node *)tn;
c877efb2 2333 pend = tn->pos+tn->bits;
19baf839
RO
2334 indent -= 3;
2335 depth--;
2336 }
2337 }
2338 }
2339 }
2340 else n = NULL;
2341 }
2342 }
2343
c877efb2 2344 read_unlock(&fib_lock);
19baf839
RO
2345 return s;
2346}
2347
2348#ifdef CONFIG_PROC_FS
2349
2350static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2351{
2352 return NULL;
2353}
2354
2355static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2356{
2357 return NULL;
2358}
2359
2360static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2361{
2362 void *v = NULL;
2363
2364 if (ip_fib_main_table)
2365 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2366 return v;
2367}
2368
2369static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2370{
2371 ++*pos;
2372 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2373}
2374
2375static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2376{
2377
2378}
2379
c877efb2 2380/*
19baf839
RO
2381 * This outputs /proc/net/fib_triestats
2382 *
2383 * It always works in backward compatibility mode.
2384 * The format of the file is not supposed to be changed.
2385 */
2386
2387static void collect_and_show(struct trie *t, struct seq_file *seq)
2388{
2389 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2390 int i, max, pointers;
2391 struct trie_stat *stat;
2392 int avdepth;
2393
2394 stat = trie_collect_stats(t);
2395
2396 bytes=0;
2397 seq_printf(seq, "trie=%p\n", t);
2398
2399 if (stat) {
2400 if (stat->leaves)
2401 avdepth=stat->totdepth*100 / stat->leaves;
2402 else
2403 avdepth=0;
2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
c877efb2 2406
19baf839
RO
2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2408 bytes += sizeof(struct leaf) * stat->leaves;
2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2410 bytes += sizeof(struct tnode) * stat->tnodes;
2411
2412 max = MAX_CHILDS-1;
2413
2414 while (max >= 0 && stat->nodesizes[max] == 0)
2415 max--;
2416 pointers = 0;
2417
c877efb2 2418 for (i = 1; i <= max; i++)
19baf839
RO
2419 if (stat->nodesizes[i] != 0) {
2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2421 pointers += (1<<i) * stat->nodesizes[i];
2422 }
2423 seq_printf(seq, "\n");
2424 seq_printf(seq, "Pointers: %d\n", pointers);
2425 bytes += sizeof(struct node *) * pointers;
2426 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2427 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2428
2429 kfree(stat);
2430 }
2431
2432#ifdef CONFIG_IP_FIB_TRIE_STATS
2433 seq_printf(seq, "Counters:\n---------\n");
2434 seq_printf(seq,"gets = %d\n", t->stats.gets);
2435 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2f36895a 2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
19baf839
RO
2440#ifdef CLEAR_STATS
2441 memset(&(t->stats), 0, sizeof(t->stats));
2442#endif
2443#endif /* CONFIG_IP_FIB_TRIE_STATS */
2444}
2445
2446static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2447{
2448 char bf[128];
c877efb2 2449
19baf839 2450 if (v == SEQ_START_TOKEN) {
c877efb2 2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
19baf839 2452 sizeof(struct leaf), sizeof(struct tnode));
c877efb2 2453 if (trie_local)
19baf839
RO
2454 collect_and_show(trie_local, seq);
2455
c877efb2 2456 if (trie_main)
19baf839
RO
2457 collect_and_show(trie_main, seq);
2458 }
2459 else {
2460 snprintf(bf, sizeof(bf),
2461 "*\t%08X\t%08X", 200, 400);
c877efb2 2462
19baf839
RO
2463 seq_printf(seq, "%-127s\n", bf);
2464 }
2465 return 0;
2466}
2467
2468static struct seq_operations fib_triestat_seq_ops = {
c877efb2
SH
2469 .start = fib_triestat_seq_start,
2470 .next = fib_triestat_seq_next,
2471 .stop = fib_triestat_seq_stop,
2472 .show = fib_triestat_seq_show,
19baf839
RO
2473};
2474
2475static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq;
2478 int rc = -ENOMEM;
2479
2480 rc = seq_open(file, &fib_triestat_seq_ops);
2481 if (rc)
2482 goto out_kfree;
2483
c877efb2 2484 seq = file->private_data;
19baf839
RO
2485out:
2486 return rc;
2487out_kfree:
2488 goto out;
2489}
2490
2491static struct file_operations fib_triestat_seq_fops = {
c877efb2
SH
2492 .owner = THIS_MODULE,
2493 .open = fib_triestat_seq_open,
2494 .read = seq_read,
2495 .llseek = seq_lseek,
2496 .release = seq_release_private,
19baf839
RO
2497};
2498
2499int __init fib_stat_proc_init(void)
2500{
2501 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2502 return -ENOMEM;
2503 return 0;
2504}
2505
2506void __init fib_stat_proc_exit(void)
2507{
2508 proc_net_remove("fib_triestat");
2509}
2510
2511static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2512{
2513 return NULL;
2514}
2515
2516static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2517{
2518 return NULL;
2519}
2520
2521static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2522{
2523 void *v = NULL;
2524
2525 if (ip_fib_main_table)
2526 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2527 return v;
2528}
2529
2530static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531{
2532 ++*pos;
2533 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2534}
2535
2536static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2537{
2538
2539}
2540
c877efb2 2541/*
19baf839
RO
2542 * This outputs /proc/net/fib_trie.
2543 *
2544 * It always works in backward compatibility mode.
2545 * The format of the file is not supposed to be changed.
2546 */
2547
2548static int fib_trie_seq_show(struct seq_file *seq, void *v)
2549{
2550 char bf[128];
2551
2552 if (v == SEQ_START_TOKEN) {
c877efb2 2553 if (trie_local)
19baf839
RO
2554 trie_dump_seq(seq, trie_local);
2555
c877efb2 2556 if (trie_main)
19baf839
RO
2557 trie_dump_seq(seq, trie_main);
2558 }
2559
2560 else {
2561 snprintf(bf, sizeof(bf),
2562 "*\t%08X\t%08X", 200, 400);
2563 seq_printf(seq, "%-127s\n", bf);
2564 }
2565
2566 return 0;
2567}
2568
2569static struct seq_operations fib_trie_seq_ops = {
c877efb2
SH
2570 .start = fib_trie_seq_start,
2571 .next = fib_trie_seq_next,
2572 .stop = fib_trie_seq_stop,
2573 .show = fib_trie_seq_show,
19baf839
RO
2574};
2575
2576static int fib_trie_seq_open(struct inode *inode, struct file *file)
2577{
2578 struct seq_file *seq;
2579 int rc = -ENOMEM;
2580
2581 rc = seq_open(file, &fib_trie_seq_ops);
2582 if (rc)
2583 goto out_kfree;
2584
c877efb2 2585 seq = file->private_data;
19baf839
RO
2586out:
2587 return rc;
2588out_kfree:
2589 goto out;
2590}
2591
2592static struct file_operations fib_trie_seq_fops = {
c877efb2
SH
2593 .owner = THIS_MODULE,
2594 .open = fib_trie_seq_open,
2595 .read = seq_read,
2596 .llseek = seq_lseek,
2597 .release= seq_release_private,
19baf839
RO
2598};
2599
2600int __init fib_proc_init(void)
2601{
2602 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2603 return -ENOMEM;
2604 return 0;
2605}
2606
2607void __init fib_proc_exit(void)
2608{
2609 proc_net_remove("fib_trie");
2610}
2611
2612#endif /* CONFIG_PROC_FS */
This page took 0.222795 seconds and 5 git commands to generate.