net: Rename FLOWI_FLAG_VRFSRC to FLOWI_FLAG_L3MDEV_SRC
[deliverable/linux.git] / drivers / net / vrf.c
1 /*
2 * vrf.c: device driver to encapsulate a VRF space
3 *
4 * Copyright (c) 2015 Cumulus Networks. All rights reserved.
5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
7 *
8 * Based on dummy, team and ipvlan drivers
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 */
15
16 #include <linux/module.h>
17 #include <linux/kernel.h>
18 #include <linux/netdevice.h>
19 #include <linux/etherdevice.h>
20 #include <linux/ip.h>
21 #include <linux/init.h>
22 #include <linux/moduleparam.h>
23 #include <linux/netfilter.h>
24 #include <linux/rtnetlink.h>
25 #include <net/rtnetlink.h>
26 #include <linux/u64_stats_sync.h>
27 #include <linux/hashtable.h>
28
29 #include <linux/inetdevice.h>
30 #include <net/arp.h>
31 #include <net/ip.h>
32 #include <net/ip_fib.h>
33 #include <net/ip6_route.h>
34 #include <net/rtnetlink.h>
35 #include <net/route.h>
36 #include <net/addrconf.h>
37 #include <net/l3mdev.h>
38
#define DRV_NAME	"vrf"
#define DRV_VERSION	"1.0"

/* Tests the IFF_SLAVE flag, which do_vrf_add_slave() sets on enslaved ports. */
#define vrf_is_slave(dev)	((dev)->flags & IFF_SLAVE)

/*
 * Fetch the pointer stored as rx_handler_data on a slave port (the VRF
 * device handed to netdev_rx_handler_register() in do_vrf_add_slave()).
 * The macro argument is parenthesized so that any pointer expression can
 * be passed safely.
 */
#define vrf_master_get_rcu(dev) \
	((struct net_device *)rcu_dereference((dev)->rx_handler_data))
46
47 struct slave {
48 struct list_head list;
49 struct net_device *dev;
50 };
51
52 struct slave_queue {
53 struct list_head all_slaves;
54 };
55
56 struct net_vrf {
57 struct slave_queue queue;
58 struct rtable *rth;
59 u32 tb_id;
60 };
61
62 struct pcpu_dstats {
63 u64 tx_pkts;
64 u64 tx_bytes;
65 u64 tx_drps;
66 u64 rx_pkts;
67 u64 rx_bytes;
68 struct u64_stats_sync syncp;
69 };
70
71 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie)
72 {
73 return dst;
74 }
75
/* .local_out hook of vrf_dst_ops: forwards straight to ip_local_out(). */
static int vrf_ip_local_out(struct sk_buff *skb)
{
	int rc = ip_local_out(skb);

	return rc;
}
80
81 static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
82 {
83 /* TO-DO: return max ethernet size? */
84 return dst->dev->mtu;
85 }
86
/*
 * .destroy hook of vrf_dst_ops: intentionally does nothing.
 * Our dst lives forever - or until the device is closed.
 */
static void vrf_dst_destroy(struct dst_entry *dst)
{
}
91
/*
 * .default_advmss hook of vrf_dst_ops: always reports the same fixed
 * value (65535 - 40) regardless of the dst passed in.
 */
static unsigned int vrf_default_advmss(const struct dst_entry *dst)
{
	const unsigned int advmss = 65535 - 40;

	return advmss;
}
96
97 static struct dst_ops vrf_dst_ops = {
98 .family = AF_INET,
99 .local_out = vrf_ip_local_out,
100 .check = vrf_ip_check,
101 .mtu = vrf_v4_mtu,
102 .destroy = vrf_dst_destroy,
103 .default_advmss = vrf_default_advmss,
104 };
105
106 static bool is_ip_rx_frame(struct sk_buff *skb)
107 {
108 switch (skb->protocol) {
109 case htons(ETH_P_IP):
110 case htons(ETH_P_IPV6):
111 return true;
112 }
113 return false;
114 }
115
116 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
117 {
118 vrf_dev->stats.tx_errors++;
119 kfree_skb(skb);
120 }
121
122 /* note: already called with rcu_read_lock */
123 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
124 {
125 struct sk_buff *skb = *pskb;
126
127 if (is_ip_rx_frame(skb)) {
128 struct net_device *dev = vrf_master_get_rcu(skb->dev);
129 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
130
131 u64_stats_update_begin(&dstats->syncp);
132 dstats->rx_pkts++;
133 dstats->rx_bytes += skb->len;
134 u64_stats_update_end(&dstats->syncp);
135
136 skb->dev = dev;
137
138 return RX_HANDLER_ANOTHER;
139 }
140 return RX_HANDLER_PASS;
141 }
142
143 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
144 struct rtnl_link_stats64 *stats)
145 {
146 int i;
147
148 for_each_possible_cpu(i) {
149 const struct pcpu_dstats *dstats;
150 u64 tbytes, tpkts, tdrops, rbytes, rpkts;
151 unsigned int start;
152
153 dstats = per_cpu_ptr(dev->dstats, i);
154 do {
155 start = u64_stats_fetch_begin_irq(&dstats->syncp);
156 tbytes = dstats->tx_bytes;
157 tpkts = dstats->tx_pkts;
158 tdrops = dstats->tx_drps;
159 rbytes = dstats->rx_bytes;
160 rpkts = dstats->rx_pkts;
161 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
162 stats->tx_bytes += tbytes;
163 stats->tx_packets += tpkts;
164 stats->tx_dropped += tdrops;
165 stats->rx_bytes += rbytes;
166 stats->rx_packets += rpkts;
167 }
168 return stats;
169 }
170
171 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
172 struct net_device *dev)
173 {
174 vrf_tx_error(dev, skb);
175 return NET_XMIT_DROP;
176 }
177
178 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
179 struct net_device *vrf_dev)
180 {
181 struct rtable *rt;
182 int err = 1;
183
184 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL);
185 if (IS_ERR(rt))
186 goto out;
187
188 /* TO-DO: what about broadcast ? */
189 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
190 ip_rt_put(rt);
191 goto out;
192 }
193
194 skb_dst_drop(skb);
195 skb_dst_set(skb, &rt->dst);
196 err = 0;
197 out:
198 return err;
199 }
200
201 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
202 struct net_device *vrf_dev)
203 {
204 struct iphdr *ip4h = ip_hdr(skb);
205 int ret = NET_XMIT_DROP;
206 struct flowi4 fl4 = {
207 /* needed to match OIF rule */
208 .flowi4_oif = vrf_dev->ifindex,
209 .flowi4_iif = LOOPBACK_IFINDEX,
210 .flowi4_tos = RT_TOS(ip4h->tos),
211 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC |
212 FLOWI_FLAG_SKIP_NH_OIF,
213 .daddr = ip4h->daddr,
214 };
215
216 if (vrf_send_v4_prep(skb, &fl4, vrf_dev))
217 goto err;
218
219 if (!ip4h->saddr) {
220 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
221 RT_SCOPE_LINK);
222 }
223
224 ret = ip_local_out(skb);
225 if (unlikely(net_xmit_eval(ret)))
226 vrf_dev->stats.tx_errors++;
227 else
228 ret = NET_XMIT_SUCCESS;
229
230 out:
231 return ret;
232 err:
233 vrf_tx_error(vrf_dev, skb);
234 goto out;
235 }
236
237 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
238 {
239 /* strip the ethernet header added for pass through VRF device */
240 __skb_pull(skb, skb_network_offset(skb));
241
242 switch (skb->protocol) {
243 case htons(ETH_P_IP):
244 return vrf_process_v4_outbound(skb, dev);
245 case htons(ETH_P_IPV6):
246 return vrf_process_v6_outbound(skb, dev);
247 default:
248 vrf_tx_error(dev, skb);
249 return NET_XMIT_DROP;
250 }
251 }
252
253 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
254 {
255 netdev_tx_t ret = is_ip_tx_frame(skb, dev);
256
257 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
258 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
259
260 u64_stats_update_begin(&dstats->syncp);
261 dstats->tx_pkts++;
262 dstats->tx_bytes += skb->len;
263 u64_stats_update_end(&dstats->syncp);
264 } else {
265 this_cpu_inc(dev->dstats->tx_drps);
266 }
267
268 return ret;
269 }
270
271 /* modelled after ip_finish_output2 */
272 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
273 {
274 struct dst_entry *dst = skb_dst(skb);
275 struct rtable *rt = (struct rtable *)dst;
276 struct net_device *dev = dst->dev;
277 unsigned int hh_len = LL_RESERVED_SPACE(dev);
278 struct neighbour *neigh;
279 u32 nexthop;
280 int ret = -EINVAL;
281
282 /* Be paranoid, rather than too clever. */
283 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
284 struct sk_buff *skb2;
285
286 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
287 if (!skb2) {
288 ret = -ENOMEM;
289 goto err;
290 }
291 if (skb->sk)
292 skb_set_owner_w(skb2, skb->sk);
293
294 consume_skb(skb);
295 skb = skb2;
296 }
297
298 rcu_read_lock_bh();
299
300 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
301 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
302 if (unlikely(!neigh))
303 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
304 if (!IS_ERR(neigh))
305 ret = dst_neigh_output(dst, neigh, skb);
306
307 rcu_read_unlock_bh();
308 err:
309 if (unlikely(ret < 0))
310 vrf_tx_error(skb->dev, skb);
311 return ret;
312 }
313
314 static int vrf_output(struct sock *sk, struct sk_buff *skb)
315 {
316 struct net_device *dev = skb_dst(skb)->dev;
317 struct net *net = dev_net(dev);
318
319 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
320
321 skb->dev = dev;
322 skb->protocol = htons(ETH_P_IP);
323
324 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
325 net, sk, skb, NULL, dev,
326 vrf_finish_output,
327 !(IPCB(skb)->flags & IPSKB_REROUTED));
328 }
329
330 static void vrf_rtable_destroy(struct net_vrf *vrf)
331 {
332 struct dst_entry *dst = (struct dst_entry *)vrf->rth;
333
334 dst_destroy(dst);
335 vrf->rth = NULL;
336 }
337
338 static struct rtable *vrf_rtable_create(struct net_device *dev)
339 {
340 struct net_vrf *vrf = netdev_priv(dev);
341 struct rtable *rth;
342
343 rth = dst_alloc(&vrf_dst_ops, dev, 2,
344 DST_OBSOLETE_NONE,
345 (DST_HOST | DST_NOPOLICY | DST_NOXFRM));
346 if (rth) {
347 rth->dst.output = vrf_output;
348 rth->rt_genid = rt_genid_ipv4(dev_net(dev));
349 rth->rt_flags = 0;
350 rth->rt_type = RTN_UNICAST;
351 rth->rt_is_input = 0;
352 rth->rt_iif = 0;
353 rth->rt_pmtu = 0;
354 rth->rt_gateway = 0;
355 rth->rt_uses_gateway = 0;
356 rth->rt_table_id = vrf->tb_id;
357 INIT_LIST_HEAD(&rth->rt_uncached);
358 rth->rt_uncached_list = NULL;
359 }
360
361 return rth;
362 }
363
364 /**************************** device handling ********************/
365
366 /* cycle interface to flush neighbor cache and move routes across tables */
367 static void cycle_netdev(struct net_device *dev)
368 {
369 unsigned int flags = dev->flags;
370 int ret;
371
372 if (!netif_running(dev))
373 return;
374
375 ret = dev_change_flags(dev, flags & ~IFF_UP);
376 if (ret >= 0)
377 ret = dev_change_flags(dev, flags);
378
379 if (ret < 0) {
380 netdev_err(dev,
381 "Failed to cycle device %s; route tables might be wrong!\n",
382 dev->name);
383 }
384 }
385
386 static struct slave *__vrf_find_slave_dev(struct slave_queue *queue,
387 struct net_device *dev)
388 {
389 struct list_head *head = &queue->all_slaves;
390 struct slave *slave;
391
392 list_for_each_entry(slave, head, list) {
393 if (slave->dev == dev)
394 return slave;
395 }
396
397 return NULL;
398 }
399
400 /* inverse of __vrf_insert_slave */
401 static void __vrf_remove_slave(struct slave_queue *queue, struct slave *slave)
402 {
403 list_del(&slave->list);
404 }
405
406 static void __vrf_insert_slave(struct slave_queue *queue, struct slave *slave)
407 {
408 list_add(&slave->list, &queue->all_slaves);
409 }
410
411 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
412 {
413 struct slave *slave = kzalloc(sizeof(*slave), GFP_KERNEL);
414 struct net_vrf *vrf = netdev_priv(dev);
415 struct slave_queue *queue = &vrf->queue;
416 int ret = -ENOMEM;
417
418 if (!slave)
419 goto out_fail;
420
421 slave->dev = port_dev;
422
423 /* register the packet handler for slave ports */
424 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
425 if (ret) {
426 netdev_err(port_dev,
427 "Device %s failed to register rx_handler\n",
428 port_dev->name);
429 goto out_fail;
430 }
431
432 ret = netdev_master_upper_dev_link(port_dev, dev);
433 if (ret < 0)
434 goto out_unregister;
435
436 port_dev->flags |= IFF_SLAVE;
437 __vrf_insert_slave(queue, slave);
438 cycle_netdev(port_dev);
439
440 return 0;
441
442 out_unregister:
443 netdev_rx_handler_unregister(port_dev);
444 out_fail:
445 kfree(slave);
446 return ret;
447 }
448
449 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
450 {
451 if (netif_is_l3_master(port_dev) || vrf_is_slave(port_dev))
452 return -EINVAL;
453
454 return do_vrf_add_slave(dev, port_dev);
455 }
456
457 /* inverse of do_vrf_add_slave */
458 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
459 {
460 struct net_vrf *vrf = netdev_priv(dev);
461 struct slave_queue *queue = &vrf->queue;
462 struct slave *slave;
463
464 netdev_upper_dev_unlink(port_dev, dev);
465 port_dev->flags &= ~IFF_SLAVE;
466
467 netdev_rx_handler_unregister(port_dev);
468
469 cycle_netdev(port_dev);
470
471 slave = __vrf_find_slave_dev(queue, port_dev);
472 if (slave)
473 __vrf_remove_slave(queue, slave);
474
475 kfree(slave);
476
477 return 0;
478 }
479
/* ndo_del_slave: thin wrapper around do_vrf_del_slave(). */
static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	int rc = do_vrf_del_slave(dev, port_dev);

	return rc;
}
484
485 static void vrf_dev_uninit(struct net_device *dev)
486 {
487 struct net_vrf *vrf = netdev_priv(dev);
488 struct slave_queue *queue = &vrf->queue;
489 struct list_head *head = &queue->all_slaves;
490 struct slave *slave, *next;
491
492 vrf_rtable_destroy(vrf);
493
494 list_for_each_entry_safe(slave, next, head, list)
495 vrf_del_slave(dev, slave->dev);
496
497 free_percpu(dev->dstats);
498 dev->dstats = NULL;
499 }
500
501 static int vrf_dev_init(struct net_device *dev)
502 {
503 struct net_vrf *vrf = netdev_priv(dev);
504
505 INIT_LIST_HEAD(&vrf->queue.all_slaves);
506
507 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
508 if (!dev->dstats)
509 goto out_nomem;
510
511 /* create the default dst which points back to us */
512 vrf->rth = vrf_rtable_create(dev);
513 if (!vrf->rth)
514 goto out_stats;
515
516 dev->flags = IFF_MASTER | IFF_NOARP;
517
518 return 0;
519
520 out_stats:
521 free_percpu(dev->dstats);
522 dev->dstats = NULL;
523 out_nomem:
524 return -ENOMEM;
525 }
526
527 static const struct net_device_ops vrf_netdev_ops = {
528 .ndo_init = vrf_dev_init,
529 .ndo_uninit = vrf_dev_uninit,
530 .ndo_start_xmit = vrf_xmit,
531 .ndo_get_stats64 = vrf_get_stats64,
532 .ndo_add_slave = vrf_add_slave,
533 .ndo_del_slave = vrf_del_slave,
534 };
535
536 static u32 vrf_fib_table(const struct net_device *dev)
537 {
538 struct net_vrf *vrf = netdev_priv(dev);
539
540 return vrf->tb_id;
541 }
542
543 static struct rtable *vrf_get_rtable(const struct net_device *dev,
544 const struct flowi4 *fl4)
545 {
546 struct rtable *rth = NULL;
547
548 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) {
549 struct net_vrf *vrf = netdev_priv(dev);
550
551 rth = vrf->rth;
552 atomic_inc(&rth->dst.__refcnt);
553 }
554
555 return rth;
556 }
557
558 static const struct l3mdev_ops vrf_l3mdev_ops = {
559 .l3mdev_fib_table = vrf_fib_table,
560 .l3mdev_get_rtable = vrf_get_rtable,
561 };
562
563 static void vrf_get_drvinfo(struct net_device *dev,
564 struct ethtool_drvinfo *info)
565 {
566 strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
567 strlcpy(info->version, DRV_VERSION, sizeof(info->version));
568 }
569
570 static const struct ethtool_ops vrf_ethtool_ops = {
571 .get_drvinfo = vrf_get_drvinfo,
572 };
573
574 static void vrf_setup(struct net_device *dev)
575 {
576 ether_setup(dev);
577
578 /* Initialize the device structure. */
579 dev->netdev_ops = &vrf_netdev_ops;
580 dev->l3mdev_ops = &vrf_l3mdev_ops;
581 dev->ethtool_ops = &vrf_ethtool_ops;
582 dev->destructor = free_netdev;
583
584 /* Fill in device structure with ethernet-generic values. */
585 eth_hw_addr_random(dev);
586
587 /* don't acquire vrf device's netif_tx_lock when transmitting */
588 dev->features |= NETIF_F_LLTX;
589
590 /* don't allow vrf devices to change network namespaces. */
591 dev->features |= NETIF_F_NETNS_LOCAL;
592 }
593
594 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[])
595 {
596 if (tb[IFLA_ADDRESS]) {
597 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
598 return -EINVAL;
599 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
600 return -EADDRNOTAVAIL;
601 }
602 return 0;
603 }
604
/* rtnl_link_ops->dellink: queue @dev on @head for unregistration. */
static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
	struct list_head *kill_list = head;

	unregister_netdevice_queue(dev, kill_list);
}
609
610 static int vrf_newlink(struct net *src_net, struct net_device *dev,
611 struct nlattr *tb[], struct nlattr *data[])
612 {
613 struct net_vrf *vrf = netdev_priv(dev);
614 int err;
615
616 if (!data || !data[IFLA_VRF_TABLE])
617 return -EINVAL;
618
619 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
620
621 dev->priv_flags |= IFF_L3MDEV_MASTER;
622
623 err = register_netdevice(dev);
624 if (err < 0)
625 goto out_fail;
626
627 return 0;
628
629 out_fail:
630 free_netdev(dev);
631 return err;
632 }
633
634 static size_t vrf_nl_getsize(const struct net_device *dev)
635 {
636 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */
637 }
638
639 static int vrf_fillinfo(struct sk_buff *skb,
640 const struct net_device *dev)
641 {
642 struct net_vrf *vrf = netdev_priv(dev);
643
644 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
645 }
646
647 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
648 [IFLA_VRF_TABLE] = { .type = NLA_U32 },
649 };
650
651 static struct rtnl_link_ops vrf_link_ops __read_mostly = {
652 .kind = DRV_NAME,
653 .priv_size = sizeof(struct net_vrf),
654
655 .get_size = vrf_nl_getsize,
656 .policy = vrf_nl_policy,
657 .validate = vrf_validate,
658 .fill_info = vrf_fillinfo,
659
660 .newlink = vrf_newlink,
661 .dellink = vrf_dellink,
662 .setup = vrf_setup,
663 .maxtype = IFLA_VRF_MAX,
664 };
665
666 static int vrf_device_event(struct notifier_block *unused,
667 unsigned long event, void *ptr)
668 {
669 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
670
671 /* only care about unregister events to drop slave references */
672 if (event == NETDEV_UNREGISTER) {
673 struct net_device *vrf_dev;
674
675 if (!vrf_is_slave(dev) || netif_is_l3_master(dev))
676 goto out;
677
678 vrf_dev = netdev_master_upper_dev_get(dev);
679 vrf_del_slave(vrf_dev, dev);
680 }
681 out:
682 return NOTIFY_DONE;
683 }
684
685 static struct notifier_block vrf_notifier_block __read_mostly = {
686 .notifier_call = vrf_device_event,
687 };
688
689 static int __init vrf_init_module(void)
690 {
691 int rc;
692
693 vrf_dst_ops.kmem_cachep =
694 kmem_cache_create("vrf_ip_dst_cache",
695 sizeof(struct rtable), 0,
696 SLAB_HWCACHE_ALIGN,
697 NULL);
698
699 if (!vrf_dst_ops.kmem_cachep)
700 return -ENOMEM;
701
702 register_netdevice_notifier(&vrf_notifier_block);
703
704 rc = rtnl_link_register(&vrf_link_ops);
705 if (rc < 0)
706 goto error;
707
708 return 0;
709
710 error:
711 unregister_netdevice_notifier(&vrf_notifier_block);
712 kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
713 return rc;
714 }
715
716 static void __exit vrf_cleanup_module(void)
717 {
718 rtnl_link_unregister(&vrf_link_ops);
719 unregister_netdevice_notifier(&vrf_notifier_block);
720 kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
721 }
722
723 module_init(vrf_init_module);
724 module_exit(vrf_cleanup_module);
725 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
726 MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
727 MODULE_LICENSE("GPL");
728 MODULE_ALIAS_RTNL_LINK(DRV_NAME);
729 MODULE_VERSION(DRV_VERSION);
This page took 0.048758 seconds and 6 git commands to generate.