packet: support extensible, 64 bit clean mmaped ring structure
net/packet/af_packet.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 *
43 * This program is free software; you can redistribute it and/or
44 * modify it under the terms of the GNU General Public License
45 * as published by the Free Software Foundation; either version
46 * 2 of the License, or (at your option) any later version.
47 *
48 */
49
50 #include <linux/types.h>
51 #include <linux/mm.h>
52 #include <linux/capability.h>
53 #include <linux/fcntl.h>
54 #include <linux/socket.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <linux/if_packet.h>
59 #include <linux/wireless.h>
60 #include <linux/kernel.h>
61 #include <linux/kmod.h>
62 #include <net/net_namespace.h>
63 #include <net/ip.h>
64 #include <net/protocol.h>
65 #include <linux/skbuff.h>
66 #include <net/sock.h>
67 #include <linux/errno.h>
68 #include <linux/timer.h>
69 #include <asm/system.h>
70 #include <asm/uaccess.h>
71 #include <asm/ioctls.h>
72 #include <asm/page.h>
73 #include <asm/cacheflush.h>
74 #include <asm/io.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80
81 #ifdef CONFIG_INET
82 #include <net/inet_common.h>
83 #endif
84
85 /*
86 Assumptions:
87 - if device has no dev->hard_header routine, it adds and removes ll header
88 inside itself. In this case ll header is invisible outside of device,
89 but higher levels still should reserve dev->hard_header_len.
90 Some devices are clever enough to reallocate the skb when the header
91 will not fit into the reserved space (tunnels); other ones are not so
92 clever (PPP).
93 - packet socket receives packets with pulled ll header,
94 so that SOCK_RAW should push it back.
95
96 On receive:
97 -----------
98
99 Incoming, dev->hard_header!=NULL
100 mac_header -> ll header
101 data -> data
102
103 Outgoing, dev->hard_header!=NULL
104 mac_header -> ll header
105 data -> ll header
106
107 Incoming, dev->hard_header==NULL
108 mac_header -> UNKNOWN position. It is very likely that it points to the ll
109 header. PPP does this, which is wrong, because it introduces
110 asymmetry between the rx and tx paths.
111 data -> data
112
113 Outgoing, dev->hard_header==NULL
114 mac_header -> data. ll header is still not built!
115 data -> data
116
117 In summary:
118 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
119
120
121 On transmit:
122 ------------
123
124 dev->hard_header != NULL
125 mac_header -> ll header
126 data -> ll header
127
128 dev->hard_header == NULL (ll header is added by device, we cannot control it)
129 mac_header -> data
130 data -> data
131
132 We should set nh.raw on output to the correct position;
133 the packet classifier depends on it.
134 */
135
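A minimal userspace sketch of the two delivery modes the comment above describes (illustrative only, not part of this file; it assumes the standard <linux/if_packet.h> and <linux/if_ether.h> definitions). A SOCK_RAW packet socket sees frames with the link-level header pushed back in place, while SOCK_DGRAM sees only the payload, with the header details reported through sockaddr_ll:

#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_capture_sockets(void)
{
	/* frames include the ll header (SOCK_RAW pushes it back on receive) */
	int raw_fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	/* frames start at the network header; ll info comes via sockaddr_ll */
	int dgram_fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));

	return (raw_fd < 0 || dgram_fd < 0) ? -1 : 0;
}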
136 /* Private packet socket structures. */
137
138 struct packet_mclist
139 {
140 struct packet_mclist *next;
141 int ifindex;
142 int count;
143 unsigned short type;
144 unsigned short alen;
145 unsigned char addr[MAX_ADDR_LEN];
146 };
147 /* identical to struct packet_mreq except it has
148 * a longer address field.
149 */
150 struct packet_mreq_max
151 {
152 int mr_ifindex;
153 unsigned short mr_type;
154 unsigned short mr_alen;
155 unsigned char mr_address[MAX_ADDR_LEN];
156 };
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
160 #endif
161
162 static void packet_flush_mclist(struct sock *sk);
163
164 struct packet_sock {
165 /* struct sock has to be the first member of packet_sock */
166 struct sock sk;
167 struct tpacket_stats stats;
168 #ifdef CONFIG_PACKET_MMAP
169 char * *pg_vec;
170 unsigned int head;
171 unsigned int frames_per_block;
172 unsigned int frame_size;
173 unsigned int frame_max;
174 int copy_thresh;
175 #endif
176 struct packet_type prot_hook;
177 spinlock_t bind_lock;
178 unsigned int running:1, /* prot_hook is attached*/
179 auxdata:1,
180 origdev:1;
181 int ifindex; /* bound device */
182 __be16 num;
183 struct packet_mclist *mclist;
184 #ifdef CONFIG_PACKET_MMAP
185 atomic_t mapped;
186 unsigned int pg_vec_order;
187 unsigned int pg_vec_pages;
188 unsigned int pg_vec_len;
189 enum tpacket_versions tp_version;
190 unsigned int tp_hdrlen;
191 #endif
192 };
193
194 struct packet_skb_cb {
195 unsigned int origlen;
196 union {
197 struct sockaddr_pkt pkt;
198 struct sockaddr_ll ll;
199 } sa;
200 };
201
202 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
203
204 #ifdef CONFIG_PACKET_MMAP
205
206 static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
207 int status)
208 {
209 unsigned int pg_vec_pos, frame_offset;
210 union {
211 struct tpacket_hdr *h1;
212 struct tpacket2_hdr *h2;
213 void *raw;
214 } h;
215
216 pg_vec_pos = position / po->frames_per_block;
217 frame_offset = position % po->frames_per_block;
218
219 h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
220 switch (po->tp_version) {
221 case TPACKET_V1:
222 if (status != (h.h1->tp_status ? TP_STATUS_USER :
223 TP_STATUS_KERNEL))
224 return NULL;
225 break;
226 case TPACKET_V2:
227 if (status != (h.h2->tp_status ? TP_STATUS_USER :
228 TP_STATUS_KERNEL))
229 return NULL;
230 break;
231 }
232 return h.raw;
233 }
234
235 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
236 {
237 union {
238 struct tpacket_hdr *h1;
239 struct tpacket2_hdr *h2;
240 void *raw;
241 } h;
242
243 h.raw = frame;
244 switch (po->tp_version) {
245 case TPACKET_V1:
246 h.h1->tp_status = status;
247 break;
248 case TPACKET_V2:
249 h.h2->tp_status = status;
250 break;
251 }
252 }
253 #endif
254
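packet_lookup_frame() and __packet_set_status() above are the kernel half of the tp_status handshake; the user half walks the same frame layout in the mmapped ring. A minimal TPACKET_V1 consumer sketch, assuming the ring has already been created with PACKET_RX_RING and mapped with mmap() (see packet_set_ring() and packet_mmap() further down); 'ring' and 'req' are illustrative names:

#include <linux/if_packet.h>

static void consume_ring_v1(char *ring, const struct tpacket_req *req)
{
	unsigned int frames_per_block = req->tp_block_size / req->tp_frame_size;
	unsigned int i = 0;

	for (;;) {
		/* same addressing as packet_lookup_frame(): block, then frame */
		unsigned int off = (i / frames_per_block) * req->tp_block_size +
				   (i % frames_per_block) * req->tp_frame_size;
		struct tpacket_hdr *hdr = (struct tpacket_hdr *)(ring + off);

		if (!(hdr->tp_status & TP_STATUS_USER))
			continue;	/* or poll() until the socket is readable */

		/* frame data starts tp_mac bytes into the frame:
		 * (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen
		 * (a real consumer also needs a memory barrier here) */

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
		i = (i + 1) % req->tp_frame_nr;
	}
}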
255 static inline struct packet_sock *pkt_sk(struct sock *sk)
256 {
257 return (struct packet_sock *)sk;
258 }
259
260 static void packet_sock_destruct(struct sock *sk)
261 {
262 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
263 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
264
265 if (!sock_flag(sk, SOCK_DEAD)) {
266 printk("Attempt to release alive packet socket: %p\n", sk);
267 return;
268 }
269
270 sk_refcnt_debug_dec(sk);
271 }
272
273
274 static const struct proto_ops packet_ops;
275
276 static const struct proto_ops packet_ops_spkt;
277
278 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
279 {
280 struct sock *sk;
281 struct sockaddr_pkt *spkt;
282
283 /*
284 * When we registered the protocol we saved the socket in the data
285 * field for just this event.
286 */
287
288 sk = pt->af_packet_priv;
289
290 /*
291 * Yank back the headers [hope the device set this
292 * right or kerboom...]
293 *
294 * Incoming packets have ll header pulled,
295 * push it back.
296 *
297 * For outgoing ones skb->data == skb_mac_header(skb)
298 * so that this procedure is a no-op.
299 */
300
301 if (skb->pkt_type == PACKET_LOOPBACK)
302 goto out;
303
304 if (dev_net(dev) != sock_net(sk))
305 goto out;
306
307 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
308 goto oom;
309
310 /* drop any routing info */
311 dst_release(skb->dst);
312 skb->dst = NULL;
313
314 /* drop conntrack reference */
315 nf_reset(skb);
316
317 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
318
319 skb_push(skb, skb->data - skb_mac_header(skb));
320
321 /*
322 * The SOCK_PACKET socket receives _all_ frames.
323 */
324
325 spkt->spkt_family = dev->type;
326 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
327 spkt->spkt_protocol = skb->protocol;
328
329 /*
330 * Charge the memory to the socket. This is done specifically
331 * to prevent sockets using all the memory up.
332 */
333
334 if (sock_queue_rcv_skb(sk,skb) == 0)
335 return 0;
336
337 out:
338 kfree_skb(skb);
339 oom:
340 return 0;
341 }
342
343
344 /*
345 * Output a raw packet to a device layer. This bypasses all the other
346 * protocol layers and you must therefore supply it with a complete frame
347 */
348
349 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
350 struct msghdr *msg, size_t len)
351 {
352 struct sock *sk = sock->sk;
353 struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
354 struct sk_buff *skb;
355 struct net_device *dev;
356 __be16 proto=0;
357 int err;
358
359 /*
360 * Get and verify the address.
361 */
362
363 if (saddr)
364 {
365 if (msg->msg_namelen < sizeof(struct sockaddr))
366 return(-EINVAL);
367 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
368 proto=saddr->spkt_protocol;
369 }
370 else
371 return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
372
373 /*
374 * Find the device first to size check it
375 */
376
377 saddr->spkt_device[13] = 0;
378 dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
379 err = -ENODEV;
380 if (dev == NULL)
381 goto out_unlock;
382
383 err = -ENETDOWN;
384 if (!(dev->flags & IFF_UP))
385 goto out_unlock;
386
387 /*
388 * You may not queue a frame bigger than the mtu. This is the lowest level
389 * raw protocol and you must do your own fragmentation at this level.
390 */
391
392 err = -EMSGSIZE;
393 if (len > dev->mtu + dev->hard_header_len)
394 goto out_unlock;
395
396 err = -ENOBUFS;
397 skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
398
399 /*
400 * If the write buffer is full, then tough. At this level the user gets to
401 * deal with the problem - do your own algorithmic backoffs. That's far
402 * more flexible.
403 */
404
405 if (skb == NULL)
406 goto out_unlock;
407
408 /*
409 * Fill it in
410 */
411
412 /* FIXME: Save some space for broken drivers that write a
413 * hard header at transmission time by themselves. PPP is the
414 * notable one here. This should really be fixed at the driver level.
415 */
416 skb_reserve(skb, LL_RESERVED_SPACE(dev));
417 skb_reset_network_header(skb);
418
419 /* Try to align data part correctly */
420 if (dev->header_ops) {
421 skb->data -= dev->hard_header_len;
422 skb->tail -= dev->hard_header_len;
423 if (len < dev->hard_header_len)
424 skb_reset_network_header(skb);
425 }
426
427 /* Returns -EFAULT on error */
428 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
429 skb->protocol = proto;
430 skb->dev = dev;
431 skb->priority = sk->sk_priority;
432 if (err)
433 goto out_free;
434
435 /*
436 * Now send it
437 */
438
439 dev_queue_xmit(skb);
440 dev_put(dev);
441 return(len);
442
443 out_free:
444 kfree_skb(skb);
445 out_unlock:
446 if (dev)
447 dev_put(dev);
448 return err;
449 }
450
451 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
452 unsigned int res)
453 {
454 struct sk_filter *filter;
455
456 rcu_read_lock_bh();
457 filter = rcu_dereference(sk->sk_filter);
458 if (filter != NULL)
459 res = sk_run_filter(skb, filter->insns, filter->len);
460 rcu_read_unlock_bh();
461
462 return res;
463 }
464
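run_filter() above executes whatever classic BPF program the socket owner attached; attaching one from userspace goes through the ordinary SO_ATTACH_FILTER socket option. A minimal sketch with a trivial accept-everything program (the 0xffff return value caps the snaplen and is an arbitrary choice):

#include <sys/socket.h>
#include <linux/filter.h>

static int attach_accept_all(int fd)
{
	struct sock_filter code[] = {
		/* return 0xffff: keep up to 64k of every packet */
		{ BPF_RET | BPF_K, 0, 0, 0x0000ffff },
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}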
465 /*
466 This function does lazy skb cloning in the hope that most packets
467 are discarded by BPF.
468
469 Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
470 and skb->cb are mangled. It works because (and until) packets
471 falling here are owned by the current CPU. Output packets are cloned
472 by dev_queue_xmit_nit(), input packets are processed by net_bh
473 sequentially, so if we return the skb to its original state on exit,
474 we will not harm anyone.
475 */
476
477 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
478 {
479 struct sock *sk;
480 struct sockaddr_ll *sll;
481 struct packet_sock *po;
482 u8 * skb_head = skb->data;
483 int skb_len = skb->len;
484 unsigned int snaplen, res;
485
486 if (skb->pkt_type == PACKET_LOOPBACK)
487 goto drop;
488
489 sk = pt->af_packet_priv;
490 po = pkt_sk(sk);
491
492 if (dev_net(dev) != sock_net(sk))
493 goto drop;
494
495 skb->dev = dev;
496
497 if (dev->header_ops) {
498 /* The device has an explicit notion of ll header,
499 exported to higher levels.
500
501 Otherwise, the device hides the details of its frame
502 structure, so that the corresponding packet head is
503 never delivered to the user.
504 */
505 if (sk->sk_type != SOCK_DGRAM)
506 skb_push(skb, skb->data - skb_mac_header(skb));
507 else if (skb->pkt_type == PACKET_OUTGOING) {
508 /* Special case: outgoing packets have ll header at head */
509 skb_pull(skb, skb_network_offset(skb));
510 }
511 }
512
513 snaplen = skb->len;
514
515 res = run_filter(skb, sk, snaplen);
516 if (!res)
517 goto drop_n_restore;
518 if (snaplen > res)
519 snaplen = res;
520
521 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
522 (unsigned)sk->sk_rcvbuf)
523 goto drop_n_acct;
524
525 if (skb_shared(skb)) {
526 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
527 if (nskb == NULL)
528 goto drop_n_acct;
529
530 if (skb_head != skb->data) {
531 skb->data = skb_head;
532 skb->len = skb_len;
533 }
534 kfree_skb(skb);
535 skb = nskb;
536 }
537
538 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
539 sizeof(skb->cb));
540
541 sll = &PACKET_SKB_CB(skb)->sa.ll;
542 sll->sll_family = AF_PACKET;
543 sll->sll_hatype = dev->type;
544 sll->sll_protocol = skb->protocol;
545 sll->sll_pkttype = skb->pkt_type;
546 if (unlikely(po->origdev))
547 sll->sll_ifindex = orig_dev->ifindex;
548 else
549 sll->sll_ifindex = dev->ifindex;
550
551 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
552
553 PACKET_SKB_CB(skb)->origlen = skb->len;
554
555 if (pskb_trim(skb, snaplen))
556 goto drop_n_acct;
557
558 skb_set_owner_r(skb, sk);
559 skb->dev = NULL;
560 dst_release(skb->dst);
561 skb->dst = NULL;
562
563 /* drop conntrack reference */
564 nf_reset(skb);
565
566 spin_lock(&sk->sk_receive_queue.lock);
567 po->stats.tp_packets++;
568 __skb_queue_tail(&sk->sk_receive_queue, skb);
569 spin_unlock(&sk->sk_receive_queue.lock);
570 sk->sk_data_ready(sk, skb->len);
571 return 0;
572
573 drop_n_acct:
574 spin_lock(&sk->sk_receive_queue.lock);
575 po->stats.tp_drops++;
576 spin_unlock(&sk->sk_receive_queue.lock);
577
578 drop_n_restore:
579 if (skb_head != skb->data && skb_shared(skb)) {
580 skb->data = skb_head;
581 skb->len = skb_len;
582 }
583 drop:
584 kfree_skb(skb);
585 return 0;
586 }
587
588 #ifdef CONFIG_PACKET_MMAP
589 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
590 {
591 struct sock *sk;
592 struct packet_sock *po;
593 struct sockaddr_ll *sll;
594 union {
595 struct tpacket_hdr *h1;
596 struct tpacket2_hdr *h2;
597 void *raw;
598 } h;
599 u8 * skb_head = skb->data;
600 int skb_len = skb->len;
601 unsigned int snaplen, res;
602 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
603 unsigned short macoff, netoff, hdrlen;
604 struct sk_buff *copy_skb = NULL;
605 struct timeval tv;
606 struct timespec ts;
607
608 if (skb->pkt_type == PACKET_LOOPBACK)
609 goto drop;
610
611 sk = pt->af_packet_priv;
612 po = pkt_sk(sk);
613
614 if (dev_net(dev) != sock_net(sk))
615 goto drop;
616
617 if (dev->header_ops) {
618 if (sk->sk_type != SOCK_DGRAM)
619 skb_push(skb, skb->data - skb_mac_header(skb));
620 else if (skb->pkt_type == PACKET_OUTGOING) {
621 /* Special case: outgoing packets have ll header at head */
622 skb_pull(skb, skb_network_offset(skb));
623 }
624 }
625
626 if (skb->ip_summed == CHECKSUM_PARTIAL)
627 status |= TP_STATUS_CSUMNOTREADY;
628
629 snaplen = skb->len;
630
631 res = run_filter(skb, sk, snaplen);
632 if (!res)
633 goto drop_n_restore;
634 if (snaplen > res)
635 snaplen = res;
636
637 if (sk->sk_type == SOCK_DGRAM) {
638 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16;
639 } else {
640 unsigned maclen = skb_network_offset(skb);
641 netoff = TPACKET_ALIGN(po->tp_hdrlen +
642 (maclen < 16 ? 16 : maclen));
643 macoff = netoff - maclen;
644 }
645
646 if (macoff + snaplen > po->frame_size) {
647 if (po->copy_thresh &&
648 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
649 (unsigned)sk->sk_rcvbuf) {
650 if (skb_shared(skb)) {
651 copy_skb = skb_clone(skb, GFP_ATOMIC);
652 } else {
653 copy_skb = skb_get(skb);
654 skb_head = skb->data;
655 }
656 if (copy_skb)
657 skb_set_owner_r(copy_skb, sk);
658 }
659 snaplen = po->frame_size - macoff;
660 if ((int)snaplen < 0)
661 snaplen = 0;
662 }
663
664 spin_lock(&sk->sk_receive_queue.lock);
665 h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
666 if (!h.raw)
667 goto ring_is_full;
668 po->head = po->head != po->frame_max ? po->head+1 : 0;
669 po->stats.tp_packets++;
670 if (copy_skb) {
671 status |= TP_STATUS_COPY;
672 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
673 }
674 if (!po->stats.tp_drops)
675 status &= ~TP_STATUS_LOSING;
676 spin_unlock(&sk->sk_receive_queue.lock);
677
678 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
679
680 switch (po->tp_version) {
681 case TPACKET_V1:
682 h.h1->tp_len = skb->len;
683 h.h1->tp_snaplen = snaplen;
684 h.h1->tp_mac = macoff;
685 h.h1->tp_net = netoff;
686 if (skb->tstamp.tv64)
687 tv = ktime_to_timeval(skb->tstamp);
688 else
689 do_gettimeofday(&tv);
690 h.h1->tp_sec = tv.tv_sec;
691 h.h1->tp_usec = tv.tv_usec;
692 hdrlen = sizeof(*h.h1);
693 break;
694 case TPACKET_V2:
695 h.h2->tp_len = skb->len;
696 h.h2->tp_snaplen = snaplen;
697 h.h2->tp_mac = macoff;
698 h.h2->tp_net = netoff;
699 if (skb->tstamp.tv64)
700 ts = ktime_to_timespec(skb->tstamp);
701 else
702 getnstimeofday(&ts);
703 h.h2->tp_sec = ts.tv_sec;
704 h.h2->tp_nsec = ts.tv_nsec;
705 hdrlen = sizeof(*h.h2);
706 break;
707 default:
708 BUG();
709 }
710
711 sll = h.raw + TPACKET_ALIGN(hdrlen);
712 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
713 sll->sll_family = AF_PACKET;
714 sll->sll_hatype = dev->type;
715 sll->sll_protocol = skb->protocol;
716 sll->sll_pkttype = skb->pkt_type;
717 if (unlikely(po->origdev))
718 sll->sll_ifindex = orig_dev->ifindex;
719 else
720 sll->sll_ifindex = dev->ifindex;
721
722 __packet_set_status(po, h.raw, status);
723 smp_mb();
724
725 {
726 struct page *p_start, *p_end;
727 u8 *h_end = h.raw + macoff + snaplen - 1;
728
729 p_start = virt_to_page(h.raw);
730 p_end = virt_to_page(h_end);
731 while (p_start <= p_end) {
732 flush_dcache_page(p_start);
733 p_start++;
734 }
735 }
736
737 sk->sk_data_ready(sk, 0);
738
739 drop_n_restore:
740 if (skb_head != skb->data && skb_shared(skb)) {
741 skb->data = skb_head;
742 skb->len = skb_len;
743 }
744 drop:
745 kfree_skb(skb);
746 return 0;
747
748 ring_is_full:
749 po->stats.tp_drops++;
750 spin_unlock(&sk->sk_receive_queue.lock);
751
752 sk->sk_data_ready(sk, 0);
753 if (copy_skb)
754 kfree_skb(copy_skb);
755 goto drop_n_restore;
756 }
757
758 #endif
759
760
761 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
762 struct msghdr *msg, size_t len)
763 {
764 struct sock *sk = sock->sk;
765 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
766 struct sk_buff *skb;
767 struct net_device *dev;
768 __be16 proto;
769 unsigned char *addr;
770 int ifindex, err, reserve = 0;
771
772 /*
773 * Get and verify the address.
774 */
775
776 if (saddr == NULL) {
777 struct packet_sock *po = pkt_sk(sk);
778
779 ifindex = po->ifindex;
780 proto = po->num;
781 addr = NULL;
782 } else {
783 err = -EINVAL;
784 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
785 goto out;
786 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
787 goto out;
788 ifindex = saddr->sll_ifindex;
789 proto = saddr->sll_protocol;
790 addr = saddr->sll_addr;
791 }
792
793
794 dev = dev_get_by_index(sock_net(sk), ifindex);
795 err = -ENXIO;
796 if (dev == NULL)
797 goto out_unlock;
798 if (sock->type == SOCK_RAW)
799 reserve = dev->hard_header_len;
800
801 err = -ENETDOWN;
802 if (!(dev->flags & IFF_UP))
803 goto out_unlock;
804
805 err = -EMSGSIZE;
806 if (len > dev->mtu+reserve)
807 goto out_unlock;
808
809 skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
810 msg->msg_flags & MSG_DONTWAIT, &err);
811 if (skb==NULL)
812 goto out_unlock;
813
814 skb_reserve(skb, LL_RESERVED_SPACE(dev));
815 skb_reset_network_header(skb);
816
817 err = -EINVAL;
818 if (sock->type == SOCK_DGRAM &&
819 dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
820 goto out_free;
821
822 /* Returns -EFAULT on error */
823 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
824 if (err)
825 goto out_free;
826
827 skb->protocol = proto;
828 skb->dev = dev;
829 skb->priority = sk->sk_priority;
830
831 /*
832 * Now send it
833 */
834
835 err = dev_queue_xmit(skb);
836 if (err > 0 && (err = net_xmit_errno(err)) != 0)
837 goto out_unlock;
838
839 dev_put(dev);
840
841 return(len);
842
843 out_free:
844 kfree_skb(skb);
845 out_unlock:
846 if (dev)
847 dev_put(dev);
848 out:
849 return err;
850 }
851
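The sendmsg() path above can be driven from userspace with sendto() and a sockaddr_ll destination; on a SOCK_DGRAM socket the kernel then builds the link-level header via dev_hard_header(). A sketch, where the interface name and protocol are illustrative assumptions:

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static ssize_t send_payload(int fd, const void *buf, size_t len,
			    const unsigned char dst_mac[ETH_ALEN])
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);		/* illustrative */
	sll.sll_ifindex  = if_nametoindex("eth0");	/* assumed device name */
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

	return sendto(fd, buf, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}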
852 /*
853 * Close a PACKET socket. This is fairly simple. We immediately go
854 * to 'closed' state and remove our protocol entry in the device list.
855 */
856
857 static int packet_release(struct socket *sock)
858 {
859 struct sock *sk = sock->sk;
860 struct packet_sock *po;
861 struct net *net;
862
863 if (!sk)
864 return 0;
865
866 net = sock_net(sk);
867 po = pkt_sk(sk);
868
869 write_lock_bh(&net->packet.sklist_lock);
870 sk_del_node_init(sk);
871 write_unlock_bh(&net->packet.sklist_lock);
872
873 /*
874 * Unhook packet receive handler.
875 */
876
877 if (po->running) {
878 /*
879 * Remove the protocol hook
880 */
881 dev_remove_pack(&po->prot_hook);
882 po->running = 0;
883 po->num = 0;
884 __sock_put(sk);
885 }
886
887 packet_flush_mclist(sk);
888
889 #ifdef CONFIG_PACKET_MMAP
890 if (po->pg_vec) {
891 struct tpacket_req req;
892 memset(&req, 0, sizeof(req));
893 packet_set_ring(sk, &req, 1);
894 }
895 #endif
896
897 /*
898 * Now the socket is dead. No more input will appear.
899 */
900
901 sock_orphan(sk);
902 sock->sk = NULL;
903
904 /* Purge queues */
905
906 skb_queue_purge(&sk->sk_receive_queue);
907 sk_refcnt_debug_release(sk);
908
909 sock_put(sk);
910 return 0;
911 }
912
913 /*
914 * Attach a packet hook.
915 */
916
917 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
918 {
919 struct packet_sock *po = pkt_sk(sk);
920 /*
921 * Detach an existing hook if present.
922 */
923
924 lock_sock(sk);
925
926 spin_lock(&po->bind_lock);
927 if (po->running) {
928 __sock_put(sk);
929 po->running = 0;
930 po->num = 0;
931 spin_unlock(&po->bind_lock);
932 dev_remove_pack(&po->prot_hook);
933 spin_lock(&po->bind_lock);
934 }
935
936 po->num = protocol;
937 po->prot_hook.type = protocol;
938 po->prot_hook.dev = dev;
939
940 po->ifindex = dev ? dev->ifindex : 0;
941
942 if (protocol == 0)
943 goto out_unlock;
944
945 if (!dev || (dev->flags & IFF_UP)) {
946 dev_add_pack(&po->prot_hook);
947 sock_hold(sk);
948 po->running = 1;
949 } else {
950 sk->sk_err = ENETDOWN;
951 if (!sock_flag(sk, SOCK_DEAD))
952 sk->sk_error_report(sk);
953 }
954
955 out_unlock:
956 spin_unlock(&po->bind_lock);
957 release_sock(sk);
958 return 0;
959 }
960
961 /*
962 * Bind a packet socket to a device
963 */
964
965 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
966 {
967 struct sock *sk=sock->sk;
968 char name[15];
969 struct net_device *dev;
970 int err = -ENODEV;
971
972 /*
973 * Check legality
974 */
975
976 if (addr_len != sizeof(struct sockaddr))
977 return -EINVAL;
978 strlcpy(name,uaddr->sa_data,sizeof(name));
979
980 dev = dev_get_by_name(sock_net(sk), name);
981 if (dev) {
982 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
983 dev_put(dev);
984 }
985 return err;
986 }
987
988 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
989 {
990 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
991 struct sock *sk=sock->sk;
992 struct net_device *dev = NULL;
993 int err;
994
995
996 /*
997 * Check legality
998 */
999
1000 if (addr_len < sizeof(struct sockaddr_ll))
1001 return -EINVAL;
1002 if (sll->sll_family != AF_PACKET)
1003 return -EINVAL;
1004
1005 if (sll->sll_ifindex) {
1006 err = -ENODEV;
1007 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1008 if (dev == NULL)
1009 goto out;
1010 }
1011 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1012 if (dev)
1013 dev_put(dev);
1014
1015 out:
1016 return err;
1017 }
1018
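The userspace counterpart of packet_bind() above is an ordinary bind() with a sockaddr_ll selecting the device and ethertype; a minimal sketch (ETH_P_ALL and the interface name are illustrative):

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int bind_to_device(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}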
1019 static struct proto packet_proto = {
1020 .name = "PACKET",
1021 .owner = THIS_MODULE,
1022 .obj_size = sizeof(struct packet_sock),
1023 };
1024
1025 /*
1026 * Create a packet of type SOCK_PACKET.
1027 */
1028
1029 static int packet_create(struct net *net, struct socket *sock, int protocol)
1030 {
1031 struct sock *sk;
1032 struct packet_sock *po;
1033 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1034 int err;
1035
1036 if (!capable(CAP_NET_RAW))
1037 return -EPERM;
1038 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1039 sock->type != SOCK_PACKET)
1040 return -ESOCKTNOSUPPORT;
1041
1042 sock->state = SS_UNCONNECTED;
1043
1044 err = -ENOBUFS;
1045 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1046 if (sk == NULL)
1047 goto out;
1048
1049 sock->ops = &packet_ops;
1050 if (sock->type == SOCK_PACKET)
1051 sock->ops = &packet_ops_spkt;
1052
1053 sock_init_data(sock, sk);
1054
1055 po = pkt_sk(sk);
1056 sk->sk_family = PF_PACKET;
1057 po->num = proto;
1058
1059 sk->sk_destruct = packet_sock_destruct;
1060 sk_refcnt_debug_inc(sk);
1061
1062 /*
1063 * Attach a protocol block
1064 */
1065
1066 spin_lock_init(&po->bind_lock);
1067 po->prot_hook.func = packet_rcv;
1068
1069 if (sock->type == SOCK_PACKET)
1070 po->prot_hook.func = packet_rcv_spkt;
1071
1072 po->prot_hook.af_packet_priv = sk;
1073
1074 if (proto) {
1075 po->prot_hook.type = proto;
1076 dev_add_pack(&po->prot_hook);
1077 sock_hold(sk);
1078 po->running = 1;
1079 }
1080
1081 write_lock_bh(&net->packet.sklist_lock);
1082 sk_add_node(sk, &net->packet.sklist);
1083 write_unlock_bh(&net->packet.sklist_lock);
1084 return(0);
1085 out:
1086 return err;
1087 }
1088
1089 /*
1090 * Pull a packet from our receive queue and hand it to the user.
1091 * If necessary we block.
1092 */
1093
1094 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1095 struct msghdr *msg, size_t len, int flags)
1096 {
1097 struct sock *sk = sock->sk;
1098 struct sk_buff *skb;
1099 int copied, err;
1100 struct sockaddr_ll *sll;
1101
1102 err = -EINVAL;
1103 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1104 goto out;
1105
1106 #if 0
1107 /* What error should we return now? EUNATTACH? */
1108 if (pkt_sk(sk)->ifindex < 0)
1109 return -ENODEV;
1110 #endif
1111
1112 /*
1113 * Call the generic datagram receiver. This handles all sorts
1114 * of horrible races and re-entrancy so we can forget about it
1115 * in the protocol layers.
1116 *
1117 * Now it will return ENETDOWN, if device have just gone down,
1118 * but then it will block.
1119 */
1120
1121 skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1122
1123 /*
1124 * An error occurred so return it. Because skb_recv_datagram()
1125 * handles the blocking we don't see and worry about blocking
1126 * retries.
1127 */
1128
1129 if (skb == NULL)
1130 goto out;
1131
1132 /*
1133 * If the address length field is there to be filled in, we fill
1134 * it in now.
1135 */
1136
1137 sll = &PACKET_SKB_CB(skb)->sa.ll;
1138 if (sock->type == SOCK_PACKET)
1139 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1140 else
1141 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1142
1143 /*
1144 * You lose any data beyond the buffer you gave. If it worries a
1145 * user program they can ask the device for its MTU anyway.
1146 */
1147
1148 copied = skb->len;
1149 if (copied > len)
1150 {
1151 copied=len;
1152 msg->msg_flags|=MSG_TRUNC;
1153 }
1154
1155 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1156 if (err)
1157 goto out_free;
1158
1159 sock_recv_timestamp(msg, sk, skb);
1160
1161 if (msg->msg_name)
1162 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1163 msg->msg_namelen);
1164
1165 if (pkt_sk(sk)->auxdata) {
1166 struct tpacket_auxdata aux;
1167
1168 aux.tp_status = TP_STATUS_USER;
1169 if (skb->ip_summed == CHECKSUM_PARTIAL)
1170 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1171 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1172 aux.tp_snaplen = skb->len;
1173 aux.tp_mac = 0;
1174 aux.tp_net = skb_network_offset(skb);
1175
1176 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1177 }
1178
1179 /*
1180 * Free or return the buffer as appropriate. Again this
1181 * hides all the races and re-entrancy issues from us.
1182 */
1183 err = (flags&MSG_TRUNC) ? skb->len : copied;
1184
1185 out_free:
1186 skb_free_datagram(sk, skb);
1187 out:
1188 return err;
1189 }
1190
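Once PACKET_AUXDATA is enabled, packet_recvmsg() above attaches a tpacket_auxdata control message to every datagram; a sketch of enabling the option and reading the cmsg (buffer sizes are illustrative):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, char *buf, size_t buflen)
{
	int one = 1;
	union {
		struct cmsghdr cm;
		char space[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} control;
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = &control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;
	ssize_t len;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	len = recvmsg(fd, &msg, 0);
	if (len < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_len is the wire length, aux.tp_snaplen what we got */
		}
	}
	return len;
}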
1191 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1192 int *uaddr_len, int peer)
1193 {
1194 struct net_device *dev;
1195 struct sock *sk = sock->sk;
1196
1197 if (peer)
1198 return -EOPNOTSUPP;
1199
1200 uaddr->sa_family = AF_PACKET;
1201 dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1202 if (dev) {
1203 strlcpy(uaddr->sa_data, dev->name, 15);
1204 dev_put(dev);
1205 } else
1206 memset(uaddr->sa_data, 0, 14);
1207 *uaddr_len = sizeof(*uaddr);
1208
1209 return 0;
1210 }
1211
1212 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1213 int *uaddr_len, int peer)
1214 {
1215 struct net_device *dev;
1216 struct sock *sk = sock->sk;
1217 struct packet_sock *po = pkt_sk(sk);
1218 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1219
1220 if (peer)
1221 return -EOPNOTSUPP;
1222
1223 sll->sll_family = AF_PACKET;
1224 sll->sll_ifindex = po->ifindex;
1225 sll->sll_protocol = po->num;
1226 dev = dev_get_by_index(sock_net(sk), po->ifindex);
1227 if (dev) {
1228 sll->sll_hatype = dev->type;
1229 sll->sll_halen = dev->addr_len;
1230 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1231 dev_put(dev);
1232 } else {
1233 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1234 sll->sll_halen = 0;
1235 }
1236 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1237
1238 return 0;
1239 }
1240
1241 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1242 int what)
1243 {
1244 switch (i->type) {
1245 case PACKET_MR_MULTICAST:
1246 if (what > 0)
1247 dev_mc_add(dev, i->addr, i->alen, 0);
1248 else
1249 dev_mc_delete(dev, i->addr, i->alen, 0);
1250 break;
1251 case PACKET_MR_PROMISC:
1252 return dev_set_promiscuity(dev, what);
1253 break;
1254 case PACKET_MR_ALLMULTI:
1255 return dev_set_allmulti(dev, what);
1256 break;
1257 default:;
1258 }
1259 return 0;
1260 }
1261
1262 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1263 {
1264 for ( ; i; i=i->next) {
1265 if (i->ifindex == dev->ifindex)
1266 packet_dev_mc(dev, i, what);
1267 }
1268 }
1269
1270 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1271 {
1272 struct packet_sock *po = pkt_sk(sk);
1273 struct packet_mclist *ml, *i;
1274 struct net_device *dev;
1275 int err;
1276
1277 rtnl_lock();
1278
1279 err = -ENODEV;
1280 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1281 if (!dev)
1282 goto done;
1283
1284 err = -EINVAL;
1285 if (mreq->mr_alen > dev->addr_len)
1286 goto done;
1287
1288 err = -ENOBUFS;
1289 i = kmalloc(sizeof(*i), GFP_KERNEL);
1290 if (i == NULL)
1291 goto done;
1292
1293 err = 0;
1294 for (ml = po->mclist; ml; ml = ml->next) {
1295 if (ml->ifindex == mreq->mr_ifindex &&
1296 ml->type == mreq->mr_type &&
1297 ml->alen == mreq->mr_alen &&
1298 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1299 ml->count++;
1300 /* Free the new element ... */
1301 kfree(i);
1302 goto done;
1303 }
1304 }
1305
1306 i->type = mreq->mr_type;
1307 i->ifindex = mreq->mr_ifindex;
1308 i->alen = mreq->mr_alen;
1309 memcpy(i->addr, mreq->mr_address, i->alen);
1310 i->count = 1;
1311 i->next = po->mclist;
1312 po->mclist = i;
1313 err = packet_dev_mc(dev, i, 1);
1314 if (err) {
1315 po->mclist = i->next;
1316 kfree(i);
1317 }
1318
1319 done:
1320 rtnl_unlock();
1321 return err;
1322 }
1323
1324 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1325 {
1326 struct packet_mclist *ml, **mlp;
1327
1328 rtnl_lock();
1329
1330 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1331 if (ml->ifindex == mreq->mr_ifindex &&
1332 ml->type == mreq->mr_type &&
1333 ml->alen == mreq->mr_alen &&
1334 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1335 if (--ml->count == 0) {
1336 struct net_device *dev;
1337 *mlp = ml->next;
1338 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1339 if (dev) {
1340 packet_dev_mc(dev, ml, -1);
1341 dev_put(dev);
1342 }
1343 kfree(ml);
1344 }
1345 rtnl_unlock();
1346 return 0;
1347 }
1348 }
1349 rtnl_unlock();
1350 return -EADDRNOTAVAIL;
1351 }
1352
1353 static void packet_flush_mclist(struct sock *sk)
1354 {
1355 struct packet_sock *po = pkt_sk(sk);
1356 struct packet_mclist *ml;
1357
1358 if (!po->mclist)
1359 return;
1360
1361 rtnl_lock();
1362 while ((ml = po->mclist) != NULL) {
1363 struct net_device *dev;
1364
1365 po->mclist = ml->next;
1366 if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1367 packet_dev_mc(dev, ml, -1);
1368 dev_put(dev);
1369 }
1370 kfree(ml);
1371 }
1372 rtnl_unlock();
1373 }
1374
1375 static int
1376 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1377 {
1378 struct sock *sk = sock->sk;
1379 struct packet_sock *po = pkt_sk(sk);
1380 int ret;
1381
1382 if (level != SOL_PACKET)
1383 return -ENOPROTOOPT;
1384
1385 switch(optname) {
1386 case PACKET_ADD_MEMBERSHIP:
1387 case PACKET_DROP_MEMBERSHIP:
1388 {
1389 struct packet_mreq_max mreq;
1390 int len = optlen;
1391 memset(&mreq, 0, sizeof(mreq));
1392 if (len < sizeof(struct packet_mreq))
1393 return -EINVAL;
1394 if (len > sizeof(mreq))
1395 len = sizeof(mreq);
1396 if (copy_from_user(&mreq,optval,len))
1397 return -EFAULT;
1398 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1399 return -EINVAL;
1400 if (optname == PACKET_ADD_MEMBERSHIP)
1401 ret = packet_mc_add(sk, &mreq);
1402 else
1403 ret = packet_mc_drop(sk, &mreq);
1404 return ret;
1405 }
1406
1407 #ifdef CONFIG_PACKET_MMAP
1408 case PACKET_RX_RING:
1409 {
1410 struct tpacket_req req;
1411
1412 if (optlen<sizeof(req))
1413 return -EINVAL;
1414 if (copy_from_user(&req,optval,sizeof(req)))
1415 return -EFAULT;
1416 return packet_set_ring(sk, &req, 0);
1417 }
1418 case PACKET_COPY_THRESH:
1419 {
1420 int val;
1421
1422 if (optlen!=sizeof(val))
1423 return -EINVAL;
1424 if (copy_from_user(&val,optval,sizeof(val)))
1425 return -EFAULT;
1426
1427 pkt_sk(sk)->copy_thresh = val;
1428 return 0;
1429 }
1430 case PACKET_VERSION:
1431 {
1432 int val;
1433
1434 if (optlen != sizeof(val))
1435 return -EINVAL;
1436 if (po->pg_vec)
1437 return -EBUSY;
1438 if (copy_from_user(&val, optval, sizeof(val)))
1439 return -EFAULT;
1440 switch (val) {
1441 case TPACKET_V1:
1442 case TPACKET_V2:
1443 po->tp_version = val;
1444 return 0;
1445 default:
1446 return -EINVAL;
1447 }
1448 }
1449 #endif
1450 case PACKET_AUXDATA:
1451 {
1452 int val;
1453
1454 if (optlen < sizeof(val))
1455 return -EINVAL;
1456 if (copy_from_user(&val, optval, sizeof(val)))
1457 return -EFAULT;
1458
1459 po->auxdata = !!val;
1460 return 0;
1461 }
1462 case PACKET_ORIGDEV:
1463 {
1464 int val;
1465
1466 if (optlen < sizeof(val))
1467 return -EINVAL;
1468 if (copy_from_user(&val, optval, sizeof(val)))
1469 return -EFAULT;
1470
1471 po->origdev = !!val;
1472 return 0;
1473 }
1474 default:
1475 return -ENOPROTOOPT;
1476 }
1477 }
1478
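The PACKET_VERSION and PACKET_RX_RING cases above are typically driven from userspace in that order, since packet_setsockopt() rejects a version change once the ring exists. A sketch with illustrative ring dimensions that satisfy the checks in packet_set_ring() below:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	/* must happen before the ring exists; once pg_vec is allocated,
	 * PACKET_VERSION returns -EBUSY */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = getpagesize();	/* one page per block (assumption) */
	req.tp_frame_size = 2048;		/* multiple of TPACKET_ALIGNMENT */
	req.tp_block_nr   = 64;
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}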
1479 static int packet_getsockopt(struct socket *sock, int level, int optname,
1480 char __user *optval, int __user *optlen)
1481 {
1482 int len;
1483 int val;
1484 struct sock *sk = sock->sk;
1485 struct packet_sock *po = pkt_sk(sk);
1486 void *data;
1487 struct tpacket_stats st;
1488
1489 if (level != SOL_PACKET)
1490 return -ENOPROTOOPT;
1491
1492 if (get_user(len, optlen))
1493 return -EFAULT;
1494
1495 if (len < 0)
1496 return -EINVAL;
1497
1498 switch(optname) {
1499 case PACKET_STATISTICS:
1500 if (len > sizeof(struct tpacket_stats))
1501 len = sizeof(struct tpacket_stats);
1502 spin_lock_bh(&sk->sk_receive_queue.lock);
1503 st = po->stats;
1504 memset(&po->stats, 0, sizeof(st));
1505 spin_unlock_bh(&sk->sk_receive_queue.lock);
1506 st.tp_packets += st.tp_drops;
1507
1508 data = &st;
1509 break;
1510 case PACKET_AUXDATA:
1511 if (len > sizeof(int))
1512 len = sizeof(int);
1513 val = po->auxdata;
1514
1515 data = &val;
1516 break;
1517 case PACKET_ORIGDEV:
1518 if (len > sizeof(int))
1519 len = sizeof(int);
1520 val = po->origdev;
1521
1522 data = &val;
1523 break;
1524 #ifdef CONFIG_PACKET_MMAP
1525 case PACKET_VERSION:
1526 if (len > sizeof(int))
1527 len = sizeof(int);
1528 val = po->tp_version;
1529 data = &val;
1530 break;
1531 case PACKET_HDRLEN:
1532 if (len > sizeof(int))
1533 len = sizeof(int);
1534 if (copy_from_user(&val, optval, len))
1535 return -EFAULT;
1536 switch (val) {
1537 case TPACKET_V1:
1538 val = sizeof(struct tpacket_hdr);
1539 break;
1540 case TPACKET_V2:
1541 val = sizeof(struct tpacket2_hdr);
1542 break;
1543 default:
1544 return -EINVAL;
1545 }
1546 data = &val;
1547 break;
1548 #endif
1549 default:
1550 return -ENOPROTOOPT;
1551 }
1552
1553 if (put_user(len, optlen))
1554 return -EFAULT;
1555 if (copy_to_user(optval, data, len))
1556 return -EFAULT;
1557 return 0;
1558 }
1559
1560
1561 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1562 {
1563 struct sock *sk;
1564 struct hlist_node *node;
1565 struct net_device *dev = data;
1566 struct net *net = dev_net(dev);
1567
1568 read_lock(&net->packet.sklist_lock);
1569 sk_for_each(sk, node, &net->packet.sklist) {
1570 struct packet_sock *po = pkt_sk(sk);
1571
1572 switch (msg) {
1573 case NETDEV_UNREGISTER:
1574 if (po->mclist)
1575 packet_dev_mclist(dev, po->mclist, -1);
1576 /* fallthrough */
1577
1578 case NETDEV_DOWN:
1579 if (dev->ifindex == po->ifindex) {
1580 spin_lock(&po->bind_lock);
1581 if (po->running) {
1582 __dev_remove_pack(&po->prot_hook);
1583 __sock_put(sk);
1584 po->running = 0;
1585 sk->sk_err = ENETDOWN;
1586 if (!sock_flag(sk, SOCK_DEAD))
1587 sk->sk_error_report(sk);
1588 }
1589 if (msg == NETDEV_UNREGISTER) {
1590 po->ifindex = -1;
1591 po->prot_hook.dev = NULL;
1592 }
1593 spin_unlock(&po->bind_lock);
1594 }
1595 break;
1596 case NETDEV_UP:
1597 spin_lock(&po->bind_lock);
1598 if (dev->ifindex == po->ifindex && po->num &&
1599 !po->running) {
1600 dev_add_pack(&po->prot_hook);
1601 sock_hold(sk);
1602 po->running = 1;
1603 }
1604 spin_unlock(&po->bind_lock);
1605 break;
1606 }
1607 }
1608 read_unlock(&net->packet.sklist_lock);
1609 return NOTIFY_DONE;
1610 }
1611
1612
1613 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1614 unsigned long arg)
1615 {
1616 struct sock *sk = sock->sk;
1617
1618 switch(cmd) {
1619 case SIOCOUTQ:
1620 {
1621 int amount = atomic_read(&sk->sk_wmem_alloc);
1622 return put_user(amount, (int __user *)arg);
1623 }
1624 case SIOCINQ:
1625 {
1626 struct sk_buff *skb;
1627 int amount = 0;
1628
1629 spin_lock_bh(&sk->sk_receive_queue.lock);
1630 skb = skb_peek(&sk->sk_receive_queue);
1631 if (skb)
1632 amount = skb->len;
1633 spin_unlock_bh(&sk->sk_receive_queue.lock);
1634 return put_user(amount, (int __user *)arg);
1635 }
1636 case SIOCGSTAMP:
1637 return sock_get_timestamp(sk, (struct timeval __user *)arg);
1638 case SIOCGSTAMPNS:
1639 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1640
1641 #ifdef CONFIG_INET
1642 case SIOCADDRT:
1643 case SIOCDELRT:
1644 case SIOCDARP:
1645 case SIOCGARP:
1646 case SIOCSARP:
1647 case SIOCGIFADDR:
1648 case SIOCSIFADDR:
1649 case SIOCGIFBRDADDR:
1650 case SIOCSIFBRDADDR:
1651 case SIOCGIFNETMASK:
1652 case SIOCSIFNETMASK:
1653 case SIOCGIFDSTADDR:
1654 case SIOCSIFDSTADDR:
1655 case SIOCSIFFLAGS:
1656 if (sock_net(sk) != &init_net)
1657 return -ENOIOCTLCMD;
1658 return inet_dgram_ops.ioctl(sock, cmd, arg);
1659 #endif
1660
1661 default:
1662 return -ENOIOCTLCMD;
1663 }
1664 return 0;
1665 }
1666
1667 #ifndef CONFIG_PACKET_MMAP
1668 #define packet_mmap sock_no_mmap
1669 #define packet_poll datagram_poll
1670 #else
1671
1672 static unsigned int packet_poll(struct file * file, struct socket *sock,
1673 poll_table *wait)
1674 {
1675 struct sock *sk = sock->sk;
1676 struct packet_sock *po = pkt_sk(sk);
1677 unsigned int mask = datagram_poll(file, sock, wait);
1678
1679 spin_lock_bh(&sk->sk_receive_queue.lock);
1680 if (po->pg_vec) {
1681 unsigned last = po->head ? po->head-1 : po->frame_max;
1682
1683 if (packet_lookup_frame(po, last, TP_STATUS_USER))
1684 mask |= POLLIN | POLLRDNORM;
1685 }
1686 spin_unlock_bh(&sk->sk_receive_queue.lock);
1687 return mask;
1688 }
1689
1690
1691 /* Dirty? Well, I still have not learned a better way to account
1692 * for user mmaps.
1693 */
1694
1695 static void packet_mm_open(struct vm_area_struct *vma)
1696 {
1697 struct file *file = vma->vm_file;
1698 struct socket * sock = file->private_data;
1699 struct sock *sk = sock->sk;
1700
1701 if (sk)
1702 atomic_inc(&pkt_sk(sk)->mapped);
1703 }
1704
1705 static void packet_mm_close(struct vm_area_struct *vma)
1706 {
1707 struct file *file = vma->vm_file;
1708 struct socket * sock = file->private_data;
1709 struct sock *sk = sock->sk;
1710
1711 if (sk)
1712 atomic_dec(&pkt_sk(sk)->mapped);
1713 }
1714
1715 static struct vm_operations_struct packet_mmap_ops = {
1716 .open = packet_mm_open,
1717 .close = packet_mm_close,
1718 };
1719
1720 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1721 {
1722 int i;
1723
1724 for (i = 0; i < len; i++) {
1725 if (likely(pg_vec[i]))
1726 free_pages((unsigned long) pg_vec[i], order);
1727 }
1728 kfree(pg_vec);
1729 }
1730
1731 static inline char *alloc_one_pg_vec_page(unsigned long order)
1732 {
1733 return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1734 order);
1735 }
1736
1737 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1738 {
1739 unsigned int block_nr = req->tp_block_nr;
1740 char **pg_vec;
1741 int i;
1742
1743 pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1744 if (unlikely(!pg_vec))
1745 goto out;
1746
1747 for (i = 0; i < block_nr; i++) {
1748 pg_vec[i] = alloc_one_pg_vec_page(order);
1749 if (unlikely(!pg_vec[i]))
1750 goto out_free_pgvec;
1751 }
1752
1753 out:
1754 return pg_vec;
1755
1756 out_free_pgvec:
1757 free_pg_vec(pg_vec, order, block_nr);
1758 pg_vec = NULL;
1759 goto out;
1760 }
1761
1762 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1763 {
1764 char **pg_vec = NULL;
1765 struct packet_sock *po = pkt_sk(sk);
1766 int was_running, order = 0;
1767 __be16 num;
1768 int err = 0;
1769
1770 if (req->tp_block_nr) {
1771 int i;
1772
1773 /* Sanity tests and some calculations */
1774
1775 if (unlikely(po->pg_vec))
1776 return -EBUSY;
1777
1778 switch (po->tp_version) {
1779 case TPACKET_V1:
1780 po->tp_hdrlen = TPACKET_HDRLEN;
1781 break;
1782 case TPACKET_V2:
1783 po->tp_hdrlen = TPACKET2_HDRLEN;
1784 break;
1785 }
1786
1787 if (unlikely((int)req->tp_block_size <= 0))
1788 return -EINVAL;
1789 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1790 return -EINVAL;
1791 if (unlikely(req->tp_frame_size < po->tp_hdrlen))
1792 return -EINVAL;
1793 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1794 return -EINVAL;
1795
1796 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1797 if (unlikely(po->frames_per_block <= 0))
1798 return -EINVAL;
1799 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1800 req->tp_frame_nr))
1801 return -EINVAL;
1802
1803 err = -ENOMEM;
1804 order = get_order(req->tp_block_size);
1805 pg_vec = alloc_pg_vec(req, order);
1806 if (unlikely(!pg_vec))
1807 goto out;
1808
1809 for (i = 0; i < req->tp_block_nr; i++) {
1810 void *ptr = pg_vec[i];
1811 int k;
1812
1813 for (k = 0; k < po->frames_per_block; k++) {
1814 __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1815 ptr += req->tp_frame_size;
1816 }
1817 }
1818 /* Done */
1819 } else {
1820 if (unlikely(req->tp_frame_nr))
1821 return -EINVAL;
1822 }
1823
1824 lock_sock(sk);
1825
1826 /* Detach socket from network */
1827 spin_lock(&po->bind_lock);
1828 was_running = po->running;
1829 num = po->num;
1830 if (was_running) {
1831 __dev_remove_pack(&po->prot_hook);
1832 po->num = 0;
1833 po->running = 0;
1834 __sock_put(sk);
1835 }
1836 spin_unlock(&po->bind_lock);
1837
1838 synchronize_net();
1839
1840 err = -EBUSY;
1841 if (closing || atomic_read(&po->mapped) == 0) {
1842 err = 0;
1843 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1844
1845 spin_lock_bh(&sk->sk_receive_queue.lock);
1846 pg_vec = XC(po->pg_vec, pg_vec);
1847 po->frame_max = (req->tp_frame_nr - 1);
1848 po->head = 0;
1849 po->frame_size = req->tp_frame_size;
1850 spin_unlock_bh(&sk->sk_receive_queue.lock);
1851
1852 order = XC(po->pg_vec_order, order);
1853 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1854
1855 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1856 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1857 skb_queue_purge(&sk->sk_receive_queue);
1858 #undef XC
1859 if (atomic_read(&po->mapped))
1860 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1861 }
1862
1863 spin_lock(&po->bind_lock);
1864 if (was_running && !po->running) {
1865 sock_hold(sk);
1866 po->running = 1;
1867 po->num = num;
1868 dev_add_pack(&po->prot_hook);
1869 }
1870 spin_unlock(&po->bind_lock);
1871
1872 release_sock(sk);
1873
1874 if (pg_vec)
1875 free_pg_vec(pg_vec, order, req->tp_block_nr);
1876 out:
1877 return err;
1878 }
1879
1880 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1881 {
1882 struct sock *sk = sock->sk;
1883 struct packet_sock *po = pkt_sk(sk);
1884 unsigned long size;
1885 unsigned long start;
1886 int err = -EINVAL;
1887 int i;
1888
1889 if (vma->vm_pgoff)
1890 return -EINVAL;
1891
1892 size = vma->vm_end - vma->vm_start;
1893
1894 lock_sock(sk);
1895 if (po->pg_vec == NULL)
1896 goto out;
1897 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1898 goto out;
1899
1900 start = vma->vm_start;
1901 for (i = 0; i < po->pg_vec_len; i++) {
1902 struct page *page = virt_to_page(po->pg_vec[i]);
1903 int pg_num;
1904
1905 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1906 err = vm_insert_page(vma, start, page);
1907 if (unlikely(err))
1908 goto out;
1909 start += PAGE_SIZE;
1910 }
1911 }
1912 atomic_inc(&po->mapped);
1913 vma->vm_ops = &packet_mmap_ops;
1914 err = 0;
1915
1916 out:
1917 release_sock(sk);
1918 return err;
1919 }
1920 #endif
1921
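packet_mmap() above exposes the whole pg_vec as one contiguous mapping at offset 0, and packet_poll() reports POLLIN when the next ring frame has been handed to userspace. A short sketch of mapping the ring and blocking until a frame is ready ('req' is assumed to be the tpacket_req passed to PACKET_RX_RING):

#include <poll.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *map_ring(int fd, const struct tpacket_req *req)
{
	size_t size = (size_t)req->tp_block_size * req->tp_block_nr;
	void *ring;

	/* the whole pg_vec is exposed as one contiguous mapping at offset 0 */
	ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return ring == MAP_FAILED ? NULL : ring;
}

static void wait_for_frame(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* packet_poll() reports POLLIN when the next frame is TP_STATUS_USER */
	poll(&pfd, 1, -1);
}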
1922
1923 static const struct proto_ops packet_ops_spkt = {
1924 .family = PF_PACKET,
1925 .owner = THIS_MODULE,
1926 .release = packet_release,
1927 .bind = packet_bind_spkt,
1928 .connect = sock_no_connect,
1929 .socketpair = sock_no_socketpair,
1930 .accept = sock_no_accept,
1931 .getname = packet_getname_spkt,
1932 .poll = datagram_poll,
1933 .ioctl = packet_ioctl,
1934 .listen = sock_no_listen,
1935 .shutdown = sock_no_shutdown,
1936 .setsockopt = sock_no_setsockopt,
1937 .getsockopt = sock_no_getsockopt,
1938 .sendmsg = packet_sendmsg_spkt,
1939 .recvmsg = packet_recvmsg,
1940 .mmap = sock_no_mmap,
1941 .sendpage = sock_no_sendpage,
1942 };
1943
1944 static const struct proto_ops packet_ops = {
1945 .family = PF_PACKET,
1946 .owner = THIS_MODULE,
1947 .release = packet_release,
1948 .bind = packet_bind,
1949 .connect = sock_no_connect,
1950 .socketpair = sock_no_socketpair,
1951 .accept = sock_no_accept,
1952 .getname = packet_getname,
1953 .poll = packet_poll,
1954 .ioctl = packet_ioctl,
1955 .listen = sock_no_listen,
1956 .shutdown = sock_no_shutdown,
1957 .setsockopt = packet_setsockopt,
1958 .getsockopt = packet_getsockopt,
1959 .sendmsg = packet_sendmsg,
1960 .recvmsg = packet_recvmsg,
1961 .mmap = packet_mmap,
1962 .sendpage = sock_no_sendpage,
1963 };
1964
1965 static struct net_proto_family packet_family_ops = {
1966 .family = PF_PACKET,
1967 .create = packet_create,
1968 .owner = THIS_MODULE,
1969 };
1970
1971 static struct notifier_block packet_netdev_notifier = {
1972 .notifier_call = packet_notifier,
1973 };
1974
1975 #ifdef CONFIG_PROC_FS
1976 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
1977 {
1978 struct sock *s;
1979 struct hlist_node *node;
1980
1981 sk_for_each(s, node, &net->packet.sklist) {
1982 if (!off--)
1983 return s;
1984 }
1985 return NULL;
1986 }
1987
1988 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1989 __acquires(seq_file_net(seq)->packet.sklist_lock)
1990 {
1991 struct net *net = seq_file_net(seq);
1992 read_lock(&net->packet.sklist_lock);
1993 return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
1994 }
1995
1996 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1997 {
1998 struct net *net = seq_file_net(seq);
1999 ++*pos;
2000 return (v == SEQ_START_TOKEN)
2001 ? sk_head(&net->packet.sklist)
2002 : sk_next((struct sock*)v) ;
2003 }
2004
2005 static void packet_seq_stop(struct seq_file *seq, void *v)
2006 __releases(seq_file_net(seq)->packet.sklist_lock)
2007 {
2008 struct net *net = seq_file_net(seq);
2009 read_unlock(&net->packet.sklist_lock);
2010 }
2011
2012 static int packet_seq_show(struct seq_file *seq, void *v)
2013 {
2014 if (v == SEQ_START_TOKEN)
2015 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
2016 else {
2017 struct sock *s = v;
2018 const struct packet_sock *po = pkt_sk(s);
2019
2020 seq_printf(seq,
2021 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
2022 s,
2023 atomic_read(&s->sk_refcnt),
2024 s->sk_type,
2025 ntohs(po->num),
2026 po->ifindex,
2027 po->running,
2028 atomic_read(&s->sk_rmem_alloc),
2029 sock_i_uid(s),
2030 sock_i_ino(s) );
2031 }
2032
2033 return 0;
2034 }
2035
2036 static const struct seq_operations packet_seq_ops = {
2037 .start = packet_seq_start,
2038 .next = packet_seq_next,
2039 .stop = packet_seq_stop,
2040 .show = packet_seq_show,
2041 };
2042
2043 static int packet_seq_open(struct inode *inode, struct file *file)
2044 {
2045 return seq_open_net(inode, file, &packet_seq_ops,
2046 sizeof(struct seq_net_private));
2047 }
2048
2049 static const struct file_operations packet_seq_fops = {
2050 .owner = THIS_MODULE,
2051 .open = packet_seq_open,
2052 .read = seq_read,
2053 .llseek = seq_lseek,
2054 .release = seq_release_net,
2055 };
2056
2057 #endif
2058
2059 static int packet_net_init(struct net *net)
2060 {
2061 rwlock_init(&net->packet.sklist_lock);
2062 INIT_HLIST_HEAD(&net->packet.sklist);
2063
2064 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2065 return -ENOMEM;
2066
2067 return 0;
2068 }
2069
2070 static void packet_net_exit(struct net *net)
2071 {
2072 proc_net_remove(net, "packet");
2073 }
2074
2075 static struct pernet_operations packet_net_ops = {
2076 .init = packet_net_init,
2077 .exit = packet_net_exit,
2078 };
2079
2080
2081 static void __exit packet_exit(void)
2082 {
2083 unregister_netdevice_notifier(&packet_netdev_notifier);
2084 unregister_pernet_subsys(&packet_net_ops);
2085 sock_unregister(PF_PACKET);
2086 proto_unregister(&packet_proto);
2087 }
2088
2089 static int __init packet_init(void)
2090 {
2091 int rc = proto_register(&packet_proto, 0);
2092
2093 if (rc != 0)
2094 goto out;
2095
2096 sock_register(&packet_family_ops);
2097 register_pernet_subsys(&packet_net_ops);
2098 register_netdevice_notifier(&packet_netdev_notifier);
2099 out:
2100 return rc;
2101 }
2102
2103 module_init(packet_init);
2104 module_exit(packet_exit);
2105 MODULE_LICENSE("GPL");
2106 MODULE_ALIAS_NETPROTO(PF_PACKET);