net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are not (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Summary
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
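/* For illustration only (user-space view, not part of this file): the
 * memberships tracked by packet_mclist are normally installed with
 * setsockopt(PACKET_ADD_MEMBERSHIP).  A minimal sketch, assuming an existing
 * AF_PACKET socket 'fd' and a resolved 'ifindex':
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */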

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);


#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

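/* Context for the TPACKET_V3 block machinery below -- an illustrative
 * user-space ring setup, not part of this file.  A sketch, assuming an
 * AF_PACKET socket 'fd'; all sizes are example values:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = ((1 << 20) / 2048) * 8,
 *		.tp_retire_blk_tov = 60,   -- in ms; 0 lets the kernel derive it
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */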
178/* kbdq - kernel block descriptor queue */
bc59ba39 179struct tpacket_kbdq_core {
f6fb8f10 180 struct pgv *pkbdq;
181 unsigned int feature_req_word;
182 unsigned int hdrlen;
183 unsigned char reset_pending_on_curr_blk;
184 unsigned char delete_blk_timer;
185 unsigned short kactive_blk_num;
186 unsigned short blk_sizeof_priv;
187
188 /* last_kactive_blk_num:
189 * trick to see if user-space has caught up
 190 * in order to avoid refreshing the timer on every single packet arrival.
191 */
192 unsigned short last_kactive_blk_num;
193
194 char *pkblk_start;
195 char *pkblk_end;
196 int kblk_size;
197 unsigned int knum_blocks;
198 uint64_t knxt_seq_num;
199 char *prev;
200 char *nxt_offset;
201 struct sk_buff *skb;
202
203 atomic_t blk_fill_in_prog;
204
205 /* Default is set to 8ms */
206#define DEFAULT_PRB_RETIRE_TOV (8)
207
208 unsigned short retire_blk_tov;
209 unsigned short version;
210 unsigned long tov_in_jiffies;
211
212 /* timer to retire an outstanding block */
213 struct timer_list retire_blk_timer;
214};
215
216#define PGV_FROM_VMALLOC 1
217struct pgv {
218 char *buffer;
219};
220
69e3c75f 221struct packet_ring_buffer {
0e3125c7 222 struct pgv *pg_vec;
223 unsigned int head;
224 unsigned int frames_per_block;
225 unsigned int frame_size;
226 unsigned int frame_max;
227
228 unsigned int pg_vec_order;
229 unsigned int pg_vec_pages;
230 unsigned int pg_vec_len;
231
bc59ba39 232 struct tpacket_kbdq_core prb_bdqc;
233 atomic_t pending;
234};
235
f6fb8f10 236#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
237#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
238#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
239#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
240#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
241#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
242#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
243
244struct packet_sock;
245static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4 246
f6fb8f10 247static void *packet_previous_frame(struct packet_sock *po,
248 struct packet_ring_buffer *rb,
249 int status);
250static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 251static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
252 struct tpacket_block_desc *);
253static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 254 struct packet_sock *);
bc59ba39 255static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 256 struct packet_sock *, unsigned int status);
bc59ba39 257static int prb_queue_frozen(struct tpacket_kbdq_core *);
258static void prb_open_block(struct tpacket_kbdq_core *,
259 struct tpacket_block_desc *);
f6fb8f10 260static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 261static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
262static void prb_init_blk_timer(struct packet_sock *,
263 struct tpacket_kbdq_core *,
264 void (*func) (unsigned long));
265static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
266static void prb_clear_rxhash(struct tpacket_kbdq_core *,
267 struct tpacket3_hdr *);
268static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
269 struct tpacket3_hdr *);
270static void packet_flush_mclist(struct sock *sk);
271
dc99f600 272struct packet_fanout;
273struct packet_sock {
274 /* struct sock has to be the first member of packet_sock */
275 struct sock sk;
dc99f600 276 struct packet_fanout *fanout;
1da177e4 277 struct tpacket_stats stats;
f6fb8f10 278 union tpacket_stats_u stats_u;
279 struct packet_ring_buffer rx_ring;
280 struct packet_ring_buffer tx_ring;
1da177e4 281 int copy_thresh;
1da177e4 282 spinlock_t bind_lock;
905db440 283 struct mutex pg_vec_lock;
8dc41944 284 unsigned int running:1, /* prot_hook is attached*/
80feaacb 285 auxdata:1,
286 origdev:1,
287 has_vnet_hdr:1;
1da177e4 288 int ifindex; /* bound device */
0e11c91e 289 __be16 num;
1da177e4 290 struct packet_mclist *mclist;
1da177e4 291 atomic_t mapped;
292 enum tpacket_versions tp_version;
293 unsigned int tp_hdrlen;
8913336a 294 unsigned int tp_reserve;
69e3c75f 295 unsigned int tp_loss:1;
614f60fa 296 unsigned int tp_tstamp;
94b05952 297 struct packet_type prot_hook ____cacheline_aligned_in_smp;
298};
299
300#define PACKET_FANOUT_MAX 256
301
302struct packet_fanout {
303#ifdef CONFIG_NET_NS
304 struct net *net;
305#endif
306 unsigned int num_members;
307 u16 id;
308 u8 type;
7736d33f 309 u8 defrag;
310 atomic_t rr_cur;
311 struct list_head list;
312 struct sock *arr[PACKET_FANOUT_MAX];
313 spinlock_t lock;
314 atomic_t sk_ref;
315 struct packet_type prot_hook ____cacheline_aligned_in_smp;
316};
317
318struct packet_skb_cb {
319 unsigned int origlen;
320 union {
321 struct sockaddr_pkt pkt;
322 struct sockaddr_ll ll;
323 } sa;
324};
325
326#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 327
bc59ba39 328#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 329#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 330 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 331#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 332 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 333#define GET_NEXT_PRB_BLK_NUM(x) \
334 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
335 ((x)->kactive_blk_num+1) : 0)
336
eea49cc9 337static struct packet_sock *pkt_sk(struct sock *sk)
338{
339 return (struct packet_sock *)sk;
340}
341
342static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
343static void __fanout_link(struct sock *sk, struct packet_sock *po);
344
345/* register_prot_hook must be invoked with the po->bind_lock held,
346 * or from a context in which asynchronous accesses to the packet
347 * socket is not possible (packet_create()).
348 */
349static void register_prot_hook(struct sock *sk)
350{
351 struct packet_sock *po = pkt_sk(sk);
352 if (!po->running) {
353 if (po->fanout)
354 __fanout_link(sk, po);
355 else
356 dev_add_pack(&po->prot_hook);
357 sock_hold(sk);
358 po->running = 1;
359 }
360}
361
362/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
363 * held. If the sync parameter is true, we will temporarily drop
364 * the po->bind_lock and do a synchronize_net to make sure no
365 * asynchronous packet processing paths still refer to the elements
366 * of po->prot_hook. If the sync parameter is false, it is the
367 * callers responsibility to take care of this.
368 */
369static void __unregister_prot_hook(struct sock *sk, bool sync)
370{
371 struct packet_sock *po = pkt_sk(sk);
372
373 po->running = 0;
374 if (po->fanout)
375 __fanout_unlink(sk, po);
376 else
377 __dev_remove_pack(&po->prot_hook);
378 __sock_put(sk);
379
380 if (sync) {
381 spin_unlock(&po->bind_lock);
382 synchronize_net();
383 spin_lock(&po->bind_lock);
384 }
385}
386
387static void unregister_prot_hook(struct sock *sk, bool sync)
388{
389 struct packet_sock *po = pkt_sk(sk);
390
391 if (po->running)
392 __unregister_prot_hook(sk, sync);
393}
394
f6dafa95 395static inline __pure struct page *pgv_to_page(void *addr)
396{
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
400}
401
69e3c75f 402static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 403{
404 union {
405 struct tpacket_hdr *h1;
406 struct tpacket2_hdr *h2;
407 void *raw;
408 } h;
1da177e4 409
69e3c75f 410 h.raw = frame;
411 switch (po->tp_version) {
412 case TPACKET_V1:
69e3c75f 413 h.h1->tp_status = status;
0af55bb5 414 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
415 break;
416 case TPACKET_V2:
69e3c75f 417 h.h2->tp_status = status;
0af55bb5 418 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 419 break;
f6fb8f10 420 case TPACKET_V3:
69e3c75f 421 default:
f6fb8f10 422 WARN(1, "TPACKET version not supported.\n");
69e3c75f 423 BUG();
bbd6ef87 424 }
425
426 smp_wmb();
427}
428
69e3c75f 429static int __packet_get_status(struct packet_sock *po, void *frame)
430{
431 union {
432 struct tpacket_hdr *h1;
433 struct tpacket2_hdr *h2;
434 void *raw;
435 } h;
436
437 smp_rmb();
438
439 h.raw = frame;
440 switch (po->tp_version) {
441 case TPACKET_V1:
0af55bb5 442 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 443 return h.h1->tp_status;
bbd6ef87 444 case TPACKET_V2:
0af55bb5 445 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 446 return h.h2->tp_status;
f6fb8f10 447 case TPACKET_V3:
69e3c75f 448 default:
f6fb8f10 449 WARN(1, "TPACKET version not supported.\n");
450 BUG();
451 return 0;
bbd6ef87 452 }
1da177e4 453}
454
455static void *packet_lookup_frame(struct packet_sock *po,
456 struct packet_ring_buffer *rb,
457 unsigned int position,
458 int status)
459{
460 unsigned int pg_vec_pos, frame_offset;
461 union {
462 struct tpacket_hdr *h1;
463 struct tpacket2_hdr *h2;
464 void *raw;
465 } h;
466
467 pg_vec_pos = position / rb->frames_per_block;
468 frame_offset = position % rb->frames_per_block;
469
470 h.raw = rb->pg_vec[pg_vec_pos].buffer +
471 (frame_offset * rb->frame_size);
472
473 if (status != __packet_get_status(po, h.raw))
474 return NULL;
475
476 return h.raw;
477}
478
eea49cc9 479static void *packet_current_frame(struct packet_sock *po,
480 struct packet_ring_buffer *rb,
481 int status)
482{
483 return packet_lookup_frame(po, rb, rb->head, status);
484}
485
bc59ba39 486static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 487{
488 del_timer_sync(&pkc->retire_blk_timer);
489}
490
491static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
492 int tx_ring,
493 struct sk_buff_head *rb_queue)
494{
bc59ba39 495 struct tpacket_kbdq_core *pkc;
f6fb8f10 496
497 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
498
499 spin_lock(&rb_queue->lock);
500 pkc->delete_blk_timer = 1;
501 spin_unlock(&rb_queue->lock);
502
503 prb_del_retire_blk_timer(pkc);
504}
505
506static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 507 struct tpacket_kbdq_core *pkc,
f6fb8f10 508 void (*func) (unsigned long))
509{
510 init_timer(&pkc->retire_blk_timer);
511 pkc->retire_blk_timer.data = (long)po;
512 pkc->retire_blk_timer.function = func;
513 pkc->retire_blk_timer.expires = jiffies;
514}
515
516static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
520 if (tx_ring)
521 BUG();
522
523 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
524 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
532 struct ethtool_cmd ecmd;
533 int err;
f6fb8f10 534
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
f6fb8f10 539 return DEFAULT_PRB_RETIRE_TOV;
540 }
541 err = __ethtool_get_settings(dev, &ecmd);
542 rtnl_unlock();
543 if (!err) {
544 switch (ecmd.speed) {
545 case SPEED_10000:
546 msec = 1;
547 div = 10000/1000;
548 break;
549 case SPEED_1000:
550 msec = 1;
551 div = 1000/1000;
552 break;
553 /*
554 * If the link speed is so slow you don't really
555 * need to worry about perf anyways
556 */
557 case SPEED_100:
558 case SPEED_10:
559 default:
560 return DEFAULT_PRB_RETIRE_TOV;
f6fb8f10 561 }
562 }
563
564 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
565
566 if (div)
567 mbits /= div;
568
569 tmo = mbits * msec;
570
571 if (div)
572 return tmo+1;
573 return tmo;
574}
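/* Added note (illustrative walk-through of prb_calc_retire_blk_tmo() above):
 * with a 1 MB block on a SPEED_1000 link, msec = 1 and div = 1, so
 * mbits = (1048576 * 8) / (1024 * 1024) = 8 and the function returns
 * tmo + 1 = 9, i.e. a ~9 ms retire timeout, later converted to jiffies by
 * the caller via msecs_to_jiffies().
 */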
575
bc59ba39 576static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 577 union tpacket_req_u *req_u)
578{
579 p1->feature_req_word = req_u->req3.tp_feature_req_word;
580}
581
582static void init_prb_bdqc(struct packet_sock *po,
583 struct packet_ring_buffer *rb,
584 struct pgv *pg_vec,
585 union tpacket_req_u *req_u, int tx_ring)
586{
bc59ba39 587 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
588 struct tpacket_block_desc *pbd;
f6fb8f10 589
590 memset(p1, 0x0, sizeof(*p1));
591
592 p1->knxt_seq_num = 1;
593 p1->pkbdq = pg_vec;
bc59ba39 594 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 595 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 596 p1->kblk_size = req_u->req3.tp_block_size;
597 p1->knum_blocks = req_u->req3.tp_block_nr;
598 p1->hdrlen = po->tp_hdrlen;
599 p1->version = po->tp_version;
600 p1->last_kactive_blk_num = 0;
601 po->stats_u.stats3.tp_freeze_q_cnt = 0;
602 if (req_u->req3.tp_retire_blk_tov)
603 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
604 else
605 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
606 req_u->req3.tp_block_size);
607 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
608 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
609
610 prb_init_ft_ops(p1, req_u);
611 prb_setup_retire_blk_timer(po, tx_ring);
612 prb_open_block(p1, pbd);
613}
614
615/* Do NOT update the last_blk_num first.
616 * Assumes sk_buff_head lock is held.
617 */
bc59ba39 618static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 619{
620 mod_timer(&pkc->retire_blk_timer,
621 jiffies + pkc->tov_in_jiffies);
622 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
623}
624
625/*
626 * Timer logic:
627 * 1) We refresh the timer only when we open a block.
628 * By doing this we don't waste cycles refreshing the timer
 629 * on a packet-by-packet basis.
630 *
631 * With a 1MB block-size, on a 1Gbps line, it will take
632 * i) ~8 ms to fill a block + ii) memcpy etc.
633 * In this cut we are not accounting for the memcpy time.
634 *
635 * So, if the user sets the 'tmo' to 10ms then the timer
636 * will never fire while the block is still getting filled
637 * (which is what we want). However, the user could choose
638 * to close a block early and that's fine.
639 *
640 * But when the timer does fire, we check whether or not to refresh it.
641 * Since the tmo granularity is in msecs, it is not too expensive
642 * to refresh the timer, lets say every '8' msecs.
643 * Either the user can set the 'tmo' or we can derive it based on
644 * a) line-speed and b) block-size.
645 * prb_calc_retire_blk_tmo() calculates the tmo.
646 *
647 */
648static void prb_retire_rx_blk_timer_expired(unsigned long data)
649{
650 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 651 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 652 unsigned int frozen;
bc59ba39 653 struct tpacket_block_desc *pbd;
f6fb8f10 654
655 spin_lock(&po->sk.sk_receive_queue.lock);
656
657 frozen = prb_queue_frozen(pkc);
658 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
659
660 if (unlikely(pkc->delete_blk_timer))
661 goto out;
662
663 /* We only need to plug the race when the block is partially filled.
664 * tpacket_rcv:
665 * lock(); increment BLOCK_NUM_PKTS; unlock()
666 * copy_bits() is in progress ...
667 * timer fires on other cpu:
668 * we can't retire the current block because copy_bits
669 * is in progress.
670 *
671 */
672 if (BLOCK_NUM_PKTS(pbd)) {
673 while (atomic_read(&pkc->blk_fill_in_prog)) {
674 /* Waiting for skb_copy_bits to finish... */
675 cpu_relax();
676 }
677 }
678
679 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
680 if (!frozen) {
681 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
682 if (!prb_dispatch_next_block(pkc, po))
683 goto refresh_timer;
684 else
685 goto out;
686 } else {
687 /* Case 1. Queue was frozen because user-space was
688 * lagging behind.
689 */
690 if (prb_curr_blk_in_use(pkc, pbd)) {
691 /*
692 * Ok, user-space is still behind.
693 * So just refresh the timer.
694 */
695 goto refresh_timer;
696 } else {
 697 /* Case 2. The queue was frozen, user-space caught up,
 698 * now the link went idle and the timer fired.
 699 * We don't have a block to close, so we open this
 700 * block and restart the timer.
 701 * Opening a block thaws the queue and restarts the timer;
 702 * the thaw/timer-refresh is a side effect.
703 */
704 prb_open_block(pkc, pbd);
705 goto out;
706 }
707 }
708 }
709
710refresh_timer:
711 _prb_refresh_rx_retire_blk_timer(pkc);
712
713out:
714 spin_unlock(&po->sk.sk_receive_queue.lock);
715}
716
eea49cc9 717static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 718 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 719{
720 /* Flush everything minus the block header */
721
722#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
723 u8 *start, *end;
724
725 start = (u8 *)pbd1;
726
 727 /* Skip the block header (we know the header WILL fit in 4K) */
728 start += PAGE_SIZE;
729
730 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
731 for (; start < end; start += PAGE_SIZE)
732 flush_dcache_page(pgv_to_page(start));
733
734 smp_wmb();
735#endif
736
737 /* Now update the block status. */
738
739 BLOCK_STATUS(pbd1) = status;
740
741 /* Flush the block header */
742
743#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
744 start = (u8 *)pbd1;
745 flush_dcache_page(pgv_to_page(start));
746
747 smp_wmb();
748#endif
749}
750
751/*
752 * Side effect:
753 *
754 * 1) flush the block
755 * 2) Increment active_blk_num
756 *
757 * Note:We DONT refresh the timer on purpose.
758 * Because almost always the next block will be opened.
759 */
bc59ba39 760static void prb_close_block(struct tpacket_kbdq_core *pkc1,
761 struct tpacket_block_desc *pbd1,
f6fb8f10 762 struct packet_sock *po, unsigned int stat)
763{
764 __u32 status = TP_STATUS_USER | stat;
765
766 struct tpacket3_hdr *last_pkt;
bc59ba39 767 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 768
769 if (po->stats.tp_drops)
770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775 /* Get the ts of the last pkt */
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
780 /* Ok, we tmo'd - so get the current time */
781 struct timespec ts;
782 getnstimeofday(&ts);
783 h1->ts_last_pkt.ts_sec = ts.tv_sec;
784 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
785 }
786
787 smp_wmb();
788
789 /* Flush the block */
790 prb_flush_block(pkc1, pbd1, status);
791
792 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
793}
794
eea49cc9 795static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 796{
797 pkc->reset_pending_on_curr_blk = 0;
798}
799
800/*
801 * Side effect of opening a block:
802 *
803 * 1) prb_queue is thawed.
804 * 2) retire_blk_timer is refreshed.
805 *
806 */
bc59ba39 807static void prb_open_block(struct tpacket_kbdq_core *pkc1,
808 struct tpacket_block_desc *pbd1)
f6fb8f10 809{
810 struct timespec ts;
bc59ba39 811 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 812
813 smp_rmb();
814
815 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
816
817 /* We could have just memset this but we will lose the
818 * flexibility of making the priv area sticky
819 */
820 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
821 BLOCK_NUM_PKTS(pbd1) = 0;
822 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
823 getnstimeofday(&ts);
824 h1->ts_first_pkt.ts_sec = ts.tv_sec;
825 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
826 pkc1->pkblk_start = (char *)pbd1;
e3192690 827 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 828 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
830 pbd1->version = pkc1->version;
831 pkc1->prev = pkc1->nxt_offset;
832 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
833 prb_thaw_queue(pkc1);
834 _prb_refresh_rx_retire_blk_timer(pkc1);
835
836 smp_wmb();
837
838 return;
839 }
840
841 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
842 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
843 dump_stack();
844 BUG();
845}
846
847/*
848 * Queue freeze logic:
849 * 1) Assume tp_block_nr = 8 blocks.
850 * 2) At time 't0', user opens Rx ring.
851 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
852 * 4) user-space is either sleeping or processing block '0'.
853 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 854 * it will close block-7, loop around, and try to fill block '0'.
855 * call-flow:
856 * __packet_lookup_frame_in_block
857 * prb_retire_current_block()
858 * prb_dispatch_next_block()
859 * |->(BLOCK_STATUS == USER) evaluates to true
860 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
861 * 6) Now there are two cases:
862 * 6.1) Link goes idle right after the queue is frozen.
863 * But remember, the last open_block() refreshed the timer.
864 * When this timer expires,it will refresh itself so that we can
865 * re-open block-0 in near future.
866 * 6.2) Link is busy and keeps on receiving packets. This is a simple
867 * case and __packet_lookup_frame_in_block will check if block-0
868 * is free and can now be re-used.
869 */
eea49cc9 870static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 871 struct packet_sock *po)
872{
873 pkc->reset_pending_on_curr_blk = 1;
874 po->stats_u.stats3.tp_freeze_q_cnt++;
875}
876
877#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
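/* Added note: for example, TOTAL_PKT_LEN_INCL_ALIGN(61) == 64 -- every packet
 * slot written into a V3 block is padded up to the 8-byte V3_ALIGNMENT.
 */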
878
879/*
880 * If the next block is free then we will dispatch it
881 * and return a good offset.
882 * Else, we will freeze the queue.
883 * So, caller must check the return value.
884 */
bc59ba39 885static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 886 struct packet_sock *po)
887{
bc59ba39 888 struct tpacket_block_desc *pbd;
f6fb8f10 889
890 smp_rmb();
891
892 /* 1. Get current block num */
893 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
894
895 /* 2. If this block is currently in_use then freeze the queue */
896 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
897 prb_freeze_queue(pkc, po);
898 return NULL;
899 }
900
901 /*
902 * 3.
903 * open this block and return the offset where the first packet
904 * needs to get stored.
905 */
906 prb_open_block(pkc, pbd);
907 return (void *)pkc->nxt_offset;
908}
909
bc59ba39 910static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 911 struct packet_sock *po, unsigned int status)
912{
bc59ba39 913 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 914
915 /* retire/close the current block */
916 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
917 /*
918 * Plug the case where copy_bits() is in progress on
919 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
920 * have space to copy the pkt in the current block and
921 * called prb_retire_current_block()
922 *
923 * We don't need to worry about the TMO case because
924 * the timer-handler already handled this case.
925 */
926 if (!(status & TP_STATUS_BLK_TMO)) {
927 while (atomic_read(&pkc->blk_fill_in_prog)) {
928 /* Waiting for skb_copy_bits to finish... */
929 cpu_relax();
930 }
931 }
932 prb_close_block(pkc, pbd, po, status);
933 return;
934 }
935
936 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
937 dump_stack();
938 BUG();
939}
940
eea49cc9 941static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 942 struct tpacket_block_desc *pbd)
f6fb8f10 943{
944 return TP_STATUS_USER & BLOCK_STATUS(pbd);
945}
946
eea49cc9 947static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 948{
949 return pkc->reset_pending_on_curr_blk;
950}
951
eea49cc9 952static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 953{
bc59ba39 954 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 955 atomic_dec(&pkc->blk_fill_in_prog);
956}
957
eea49cc9 958static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 959 struct tpacket3_hdr *ppd)
960{
961 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
962}
963
eea49cc9 964static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 965 struct tpacket3_hdr *ppd)
966{
967 ppd->hv1.tp_rxhash = 0;
968}
969
eea49cc9 970static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 971 struct tpacket3_hdr *ppd)
972{
973 if (vlan_tx_tag_present(pkc->skb)) {
974 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
975 ppd->tp_status = TP_STATUS_VLAN_VALID;
976 } else {
977 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
978 }
979}
980
bc59ba39 981static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 982 struct tpacket3_hdr *ppd)
983{
984 prb_fill_vlan_info(pkc, ppd);
985
986 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
987 prb_fill_rxhash(pkc, ppd);
988 else
989 prb_clear_rxhash(pkc, ppd);
990}
991
eea49cc9 992static void prb_fill_curr_block(char *curr,
bc59ba39 993 struct tpacket_kbdq_core *pkc,
994 struct tpacket_block_desc *pbd,
f6fb8f10 995 unsigned int len)
996{
997 struct tpacket3_hdr *ppd;
998
999 ppd = (struct tpacket3_hdr *)curr;
1000 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1001 pkc->prev = curr;
1002 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_NUM_PKTS(pbd) += 1;
1005 atomic_inc(&pkc->blk_fill_in_prog);
1006 prb_run_all_ft_ops(pkc, ppd);
1007}
1008
1009/* Assumes caller has the sk->rx_queue.lock */
1010static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1011 struct sk_buff *skb,
1012 int status,
1013 unsigned int len
1014 )
1015{
bc59ba39 1016 struct tpacket_kbdq_core *pkc;
1017 struct tpacket_block_desc *pbd;
f6fb8f10 1018 char *curr, *end;
1019
e3192690 1020 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1021 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1022
1023 /* Queue is frozen when user space is lagging behind */
1024 if (prb_queue_frozen(pkc)) {
1025 /*
1026 * Check if that last block which caused the queue to freeze,
1027 * is still in_use by user-space.
1028 */
1029 if (prb_curr_blk_in_use(pkc, pbd)) {
1030 /* Can't record this packet */
1031 return NULL;
1032 } else {
1033 /*
1034 * Ok, the block was released by user-space.
1035 * Now let's open that block.
1036 * opening a block also thaws the queue.
1037 * Thawing is a side effect.
1038 */
1039 prb_open_block(pkc, pbd);
1040 }
1041 }
1042
1043 smp_mb();
1044 curr = pkc->nxt_offset;
1045 pkc->skb = skb;
e3192690 1046 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1047
1048 /* first try the current block */
1049 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1050 prb_fill_curr_block(curr, pkc, pbd, len);
1051 return (void *)curr;
1052 }
1053
1054 /* Ok, close the current block */
1055 prb_retire_current_block(pkc, po, 0);
1056
1057 /* Now, try to dispatch the next block */
1058 curr = (char *)prb_dispatch_next_block(pkc, po);
1059 if (curr) {
1060 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1061 prb_fill_curr_block(curr, pkc, pbd, len);
1062 return (void *)curr;
1063 }
1064
1065 /*
 1066 * No free blocks are available. user_space hasn't caught up yet.
1067 * Queue was just frozen and now this packet will get dropped.
1068 */
1069 return NULL;
1070}
1071
eea49cc9 1072static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1073 struct sk_buff *skb,
1074 int status, unsigned int len)
1075{
1076 char *curr = NULL;
1077 switch (po->tp_version) {
1078 case TPACKET_V1:
1079 case TPACKET_V2:
1080 curr = packet_lookup_frame(po, &po->rx_ring,
1081 po->rx_ring.head, status);
1082 return curr;
1083 case TPACKET_V3:
1084 return __packet_lookup_frame_in_block(po, skb, status, len);
1085 default:
1086 WARN(1, "TPACKET version not supported\n");
1087 BUG();
1088 return 0;
1089 }
1090}
1091
eea49cc9 1092static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1093 struct packet_ring_buffer *rb,
1094 unsigned int previous,
1095 int status)
1096{
bc59ba39 1097 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1098 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
f6fb8f10 1099
1100 if (status != BLOCK_STATUS(pbd))
1101 return NULL;
1102 return pbd;
1103}
1104
eea49cc9 1105static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1106{
1107 unsigned int prev;
1108 if (rb->prb_bdqc.kactive_blk_num)
1109 prev = rb->prb_bdqc.kactive_blk_num-1;
1110 else
1111 prev = rb->prb_bdqc.knum_blocks-1;
1112 return prev;
1113}
1114
1115/* Assumes caller has held the rx_queue.lock */
eea49cc9 1116static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1117 struct packet_ring_buffer *rb,
1118 int status)
1119{
1120 unsigned int previous = prb_previous_blk_num(rb);
1121 return prb_lookup_block(po, rb, previous, status);
1122}
1123
eea49cc9 1124static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1125 struct packet_ring_buffer *rb,
1126 int status)
1127{
1128 if (po->tp_version <= TPACKET_V2)
1129 return packet_previous_frame(po, rb, status);
1130
1131 return __prb_previous_block(po, rb, status);
1132}
1133
eea49cc9 1134static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1135 struct packet_ring_buffer *rb)
1136{
1137 switch (po->tp_version) {
1138 case TPACKET_V1:
1139 case TPACKET_V2:
1140 return packet_increment_head(rb);
1141 case TPACKET_V3:
1142 default:
1143 WARN(1, "TPACKET version not supported.\n");
1144 BUG();
1145 return;
1146 }
1147}
1148
eea49cc9 1149static void *packet_previous_frame(struct packet_sock *po,
1150 struct packet_ring_buffer *rb,
1151 int status)
1152{
1153 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1154 return packet_lookup_frame(po, rb, previous, status);
1155}
1156
eea49cc9 1157static void packet_increment_head(struct packet_ring_buffer *buff)
1158{
1159 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1160}
1161
1162static void packet_sock_destruct(struct sock *sk)
1163{
1164 skb_queue_purge(&sk->sk_error_queue);
1165
1166 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1167 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1168
1169 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1170 pr_err("Attempt to release alive packet socket: %p\n", sk);
1171 return;
1172 }
1173
17ab56a2 1174 sk_refcnt_debug_dec(sk);
1175}
1176
1177static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1178{
1179 int x = atomic_read(&f->rr_cur) + 1;
1180
1181 if (x >= num)
1182 x = 0;
1183
1184 return x;
1185}
1186
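/* Added note: maps the 32-bit flow hash onto [0, num) without a modulo;
 * ((u64)hash * num) >> 32 scales the hash by num / 2^32.  For example, with
 * num == 4 and hash == 0xc0000000 the selected index is 3.
 */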
1187static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1188{
1189 u32 idx, hash = skb->rxhash;
1190
1191 idx = ((u64)hash * num) >> 32;
1192
1193 return f->arr[idx];
1194}
1195
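/* Added note: round-robin demux.  The cmpxchg loop atomically advances
 * f->rr_cur to fanout_rr_next(); if another CPU advanced it first, the
 * loop retries from the value that CPU installed.
 */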
1196static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1197{
1198 int cur, old;
1199
1200 cur = atomic_read(&f->rr_cur);
1201 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1202 fanout_rr_next(f, num))) != cur)
1203 cur = old;
1204 return f->arr[cur];
1205}
1206
1207static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1208{
1209 unsigned int cpu = smp_processor_id();
1210
1211 return f->arr[cpu % num];
1212}
1213
1214static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1215 struct packet_type *pt, struct net_device *orig_dev)
1216{
1217 struct packet_fanout *f = pt->af_packet_priv;
1218 unsigned int num = f->num_members;
1219 struct packet_sock *po;
1220 struct sock *sk;
1221
1222 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1223 !num) {
1224 kfree_skb(skb);
1225 return 0;
1226 }
1227
1228 switch (f->type) {
1229 case PACKET_FANOUT_HASH:
1230 default:
1231 if (f->defrag) {
bc416d97 1232 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1233 if (!skb)
1234 return 0;
1235 }
1236 skb_get_rxhash(skb);
1237 sk = fanout_demux_hash(f, skb, num);
1238 break;
1239 case PACKET_FANOUT_LB:
1240 sk = fanout_demux_lb(f, skb, num);
1241 break;
1242 case PACKET_FANOUT_CPU:
1243 sk = fanout_demux_cpu(f, skb, num);
1244 break;
1245 }
1246
1247 po = pkt_sk(sk);
1248
1249 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1250}
1251
1252static DEFINE_MUTEX(fanout_mutex);
1253static LIST_HEAD(fanout_list);
1254
1255static void __fanout_link(struct sock *sk, struct packet_sock *po)
1256{
1257 struct packet_fanout *f = po->fanout;
1258
1259 spin_lock(&f->lock);
1260 f->arr[f->num_members] = sk;
1261 smp_wmb();
1262 f->num_members++;
1263 spin_unlock(&f->lock);
1264}
1265
1266static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1267{
1268 struct packet_fanout *f = po->fanout;
1269 int i;
1270
1271 spin_lock(&f->lock);
1272 for (i = 0; i < f->num_members; i++) {
1273 if (f->arr[i] == sk)
1274 break;
1275 }
1276 BUG_ON(i >= f->num_members);
1277 f->arr[i] = f->arr[f->num_members - 1];
1278 f->num_members--;
1279 spin_unlock(&f->lock);
1280}
1281
7736d33f 1282static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1283{
1284 struct packet_sock *po = pkt_sk(sk);
1285 struct packet_fanout *f, *match;
1286 u8 type = type_flags & 0xff;
1287 u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
1288 int err;
1289
1290 switch (type) {
1291 case PACKET_FANOUT_HASH:
1292 case PACKET_FANOUT_LB:
95ec3eb4 1293 case PACKET_FANOUT_CPU:
1294 break;
1295 default:
1296 return -EINVAL;
1297 }
1298
1299 if (!po->running)
1300 return -EINVAL;
1301
1302 if (po->fanout)
1303 return -EALREADY;
1304
1305 mutex_lock(&fanout_mutex);
1306 match = NULL;
1307 list_for_each_entry(f, &fanout_list, list) {
1308 if (f->id == id &&
1309 read_pnet(&f->net) == sock_net(sk)) {
1310 match = f;
1311 break;
1312 }
1313 }
afe62c68 1314 err = -EINVAL;
7736d33f 1315 if (match && match->defrag != defrag)
afe62c68 1316 goto out;
dc99f600 1317 if (!match) {
afe62c68 1318 err = -ENOMEM;
dc99f600 1319 match = kzalloc(sizeof(*match), GFP_KERNEL);
1320 if (!match)
1321 goto out;
1322 write_pnet(&match->net, sock_net(sk));
1323 match->id = id;
1324 match->type = type;
1325 match->defrag = defrag;
1326 atomic_set(&match->rr_cur, 0);
1327 INIT_LIST_HEAD(&match->list);
1328 spin_lock_init(&match->lock);
1329 atomic_set(&match->sk_ref, 0);
1330 match->prot_hook.type = po->prot_hook.type;
1331 match->prot_hook.dev = po->prot_hook.dev;
1332 match->prot_hook.func = packet_rcv_fanout;
1333 match->prot_hook.af_packet_priv = match;
1334 dev_add_pack(&match->prot_hook);
1335 list_add(&match->list, &fanout_list);
dc99f600 1336 }
1337 err = -EINVAL;
1338 if (match->type == type &&
1339 match->prot_hook.type == po->prot_hook.type &&
1340 match->prot_hook.dev == po->prot_hook.dev) {
1341 err = -ENOSPC;
1342 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1343 __dev_remove_pack(&po->prot_hook);
1344 po->fanout = match;
1345 atomic_inc(&match->sk_ref);
1346 __fanout_link(sk, po);
1347 err = 0;
1348 }
1349 }
afe62c68 1350out:
1351 mutex_unlock(&fanout_mutex);
1352 return err;
1353}
1354
1355static void fanout_release(struct sock *sk)
1356{
1357 struct packet_sock *po = pkt_sk(sk);
1358 struct packet_fanout *f;
1359
1360 f = po->fanout;
1361 if (!f)
1362 return;
1363
1364 po->fanout = NULL;
1365
1366 mutex_lock(&fanout_mutex);
1367 if (atomic_dec_and_test(&f->sk_ref)) {
1368 list_del(&f->list);
1369 dev_remove_pack(&f->prot_hook);
1370 kfree(f);
1371 }
1372 mutex_unlock(&fanout_mutex);
1373}
1da177e4 1374
90ddc4f0 1375static const struct proto_ops packet_ops;
1da177e4 1376
90ddc4f0 1377static const struct proto_ops packet_ops_spkt;
1da177e4 1378
1379static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1380 struct packet_type *pt, struct net_device *orig_dev)
1381{
1382 struct sock *sk;
1383 struct sockaddr_pkt *spkt;
1384
1385 /*
1386 * When we registered the protocol we saved the socket in the data
1387 * field for just this event.
1388 */
1389
1390 sk = pt->af_packet_priv;
1ce4f28b 1391
1392 /*
1393 * Yank back the headers [hope the device set this
1394 * right or kerboom...]
1395 *
1396 * Incoming packets have ll header pulled,
1397 * push it back.
1398 *
98e399f8 1399 * For outgoing ones skb->data == skb_mac_header(skb)
1400 * so that this procedure is noop.
1401 */
1402
1403 if (skb->pkt_type == PACKET_LOOPBACK)
1404 goto out;
1405
09ad9bc7 1406 if (!net_eq(dev_net(dev), sock_net(sk)))
1407 goto out;
1408
1409 skb = skb_share_check(skb, GFP_ATOMIC);
1410 if (skb == NULL)
1411 goto oom;
1412
1413 /* drop any routing info */
adf30907 1414 skb_dst_drop(skb);
1da177e4 1415
1416 /* drop conntrack reference */
1417 nf_reset(skb);
1418
ffbc6111 1419 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1420
98e399f8 1421 skb_push(skb, skb->data - skb_mac_header(skb));
1422
1423 /*
1424 * The SOCK_PACKET socket receives _all_ frames.
1425 */
1426
1427 spkt->spkt_family = dev->type;
1428 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1429 spkt->spkt_protocol = skb->protocol;
1430
1431 /*
1432 * Charge the memory to the socket. This is done specifically
1433 * to prevent sockets using all the memory up.
1434 */
1435
40d4e3df 1436 if (sock_queue_rcv_skb(sk, skb) == 0)
1437 return 0;
1438
1439out:
1440 kfree_skb(skb);
1441oom:
1442 return 0;
1443}
1444
1445
1446/*
1447 * Output a raw packet to a device layer. This bypasses all the other
1448 * protocol layers and you must therefore supply it with a complete frame
1449 */
1ce4f28b 1450
1451static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1452 struct msghdr *msg, size_t len)
1453{
1454 struct sock *sk = sock->sk;
40d4e3df 1455 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1456 struct sk_buff *skb = NULL;
1da177e4 1457 struct net_device *dev;
40d4e3df 1458 __be16 proto = 0;
1da177e4 1459 int err;
3bdc0eba 1460 int extra_len = 0;
1ce4f28b 1461
1da177e4 1462 /*
1ce4f28b 1463 * Get and verify the address.
1464 */
1465
40d4e3df 1466 if (saddr) {
1da177e4 1467 if (msg->msg_namelen < sizeof(struct sockaddr))
1468 return -EINVAL;
1469 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1470 proto = saddr->spkt_protocol;
1471 } else
1472 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1473
1474 /*
1ce4f28b 1475 * Find the device first to size check it
1da177e4
LT
1476 */
1477
de74e92a 1478 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1479retry:
1480 rcu_read_lock();
1481 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1482 err = -ENODEV;
1483 if (dev == NULL)
1484 goto out_unlock;
1ce4f28b 1485
1486 err = -ENETDOWN;
1487 if (!(dev->flags & IFF_UP))
1488 goto out_unlock;
1489
1da177e4 1490 /*
1491 * You may not queue a frame bigger than the mtu. This is the lowest level
1492 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1493 */
1ce4f28b 1494
1495 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1496 if (!netif_supports_nofcs(dev)) {
1497 err = -EPROTONOSUPPORT;
1498 goto out_unlock;
1499 }
1500 extra_len = 4; /* We're doing our own CRC */
1501 }
1502
1da177e4 1503 err = -EMSGSIZE;
3bdc0eba 1504 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1505 goto out_unlock;
1506
1507 if (!skb) {
1508 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1509 int tlen = dev->needed_tailroom;
1510 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1511
1512 rcu_read_unlock();
4ce40912 1513 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1514 if (skb == NULL)
1515 return -ENOBUFS;
1516 /* FIXME: Save some space for broken drivers that write a hard
1517 * header at transmission time by themselves. PPP is the notable
1518 * one here. This should really be fixed at the driver level.
1519 */
1520 skb_reserve(skb, reserved);
1521 skb_reset_network_header(skb);
1522
1523 /* Try to align data part correctly */
1524 if (hhlen) {
1525 skb->data -= hhlen;
1526 skb->tail -= hhlen;
1527 if (len < hhlen)
1528 skb_reset_network_header(skb);
1529 }
1530 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1531 if (err)
1532 goto out_free;
1533 goto retry;
1534 }
1535
3bdc0eba 1536 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
1537 /* Earlier code assumed this would be a VLAN pkt,
1538 * double-check this now that we have the actual
1539 * packet in hand.
1540 */
1541 struct ethhdr *ehdr;
1542 skb_reset_mac_header(skb);
1543 ehdr = eth_hdr(skb);
1544 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1545 err = -EMSGSIZE;
1546 goto out_unlock;
1547 }
1548 }
1a35ca80 1549
1550 skb->protocol = proto;
1551 skb->dev = dev;
1552 skb->priority = sk->sk_priority;
2d37a186 1553 skb->mark = sk->sk_mark;
2244d07b 1554 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1555 if (err < 0)
1556 goto out_unlock;
1da177e4 1557
1558 if (unlikely(extra_len == 4))
1559 skb->no_fcs = 1;
1560
1da177e4 1561 dev_queue_xmit(skb);
654d1f8a 1562 rcu_read_unlock();
40d4e3df 1563 return len;
1da177e4 1564
1da177e4 1565out_unlock:
654d1f8a 1566 rcu_read_unlock();
1567out_free:
1568 kfree_skb(skb);
1569 return err;
1570}
1da177e4 1571
eea49cc9 1572static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1573 const struct sock *sk,
dbcb5855 1574 unsigned int res)
1575{
1576 struct sk_filter *filter;
fda9ef5d 1577
1578 rcu_read_lock();
1579 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1580 if (filter != NULL)
0a14842f 1581 res = SK_RUN_FILTER(filter, skb);
80f8f102 1582 rcu_read_unlock();
1da177e4 1583
dbcb5855 1584 return res;
1585}
1586
1587/*
1588 * This function makes lazy skb cloning in hope that most of packets
1589 * are discarded by BPF.
1590 *
1591 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1592 * and skb->cb are mangled. It works because (and until) packets
1593 * falling here are owned by current CPU. Output packets are cloned
1594 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 1595 * sequentially, so that if we return skb to original state on exit,
1596 * we will not harm anyone.
1597 */
1598
1599static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1600 struct packet_type *pt, struct net_device *orig_dev)
1601{
1602 struct sock *sk;
1603 struct sockaddr_ll *sll;
1604 struct packet_sock *po;
40d4e3df 1605 u8 *skb_head = skb->data;
1da177e4 1606 int skb_len = skb->len;
dbcb5855 1607 unsigned int snaplen, res;
1608
1609 if (skb->pkt_type == PACKET_LOOPBACK)
1610 goto drop;
1611
1612 sk = pt->af_packet_priv;
1613 po = pkt_sk(sk);
1614
09ad9bc7 1615 if (!net_eq(dev_net(dev), sock_net(sk)))
1616 goto drop;
1617
1618 skb->dev = dev;
1619
3b04ddde 1620 if (dev->header_ops) {
1da177e4 1621 /* The device has an explicit notion of ll header,
1622 * exported to higher levels.
1623 *
1624 * Otherwise, the device hides details of its frame
1625 * structure, so that corresponding packet head is
1626 * never delivered to user.
1627 */
1628 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1629 skb_push(skb, skb->data - skb_mac_header(skb));
1630 else if (skb->pkt_type == PACKET_OUTGOING) {
1631 /* Special case: outgoing packets have ll header at head */
bbe735e4 1632 skb_pull(skb, skb_network_offset(skb));
1633 }
1634 }
1635
1636 snaplen = skb->len;
1637
1638 res = run_filter(skb, sk, snaplen);
1639 if (!res)
fda9ef5d 1640 goto drop_n_restore;
1641 if (snaplen > res)
1642 snaplen = res;
1da177e4 1643
0fd7bac6 1644 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1645 goto drop_n_acct;
1646
1647 if (skb_shared(skb)) {
1648 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1649 if (nskb == NULL)
1650 goto drop_n_acct;
1651
1652 if (skb_head != skb->data) {
1653 skb->data = skb_head;
1654 skb->len = skb_len;
1655 }
abc4e4fa 1656 consume_skb(skb);
1657 skb = nskb;
1658 }
1659
1660 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1661 sizeof(skb->cb));
1662
1663 sll = &PACKET_SKB_CB(skb)->sa.ll;
1664 sll->sll_family = AF_PACKET;
1665 sll->sll_hatype = dev->type;
1666 sll->sll_protocol = skb->protocol;
1667 sll->sll_pkttype = skb->pkt_type;
8032b464 1668 if (unlikely(po->origdev))
1669 sll->sll_ifindex = orig_dev->ifindex;
1670 else
1671 sll->sll_ifindex = dev->ifindex;
1da177e4 1672
b95cce35 1673 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1674
ffbc6111 1675 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1676
1677 if (pskb_trim(skb, snaplen))
1678 goto drop_n_acct;
1679
1680 skb_set_owner_r(skb, sk);
1681 skb->dev = NULL;
adf30907 1682 skb_dst_drop(skb);
1da177e4 1683
1684 /* drop conntrack reference */
1685 nf_reset(skb);
1686
1687 spin_lock(&sk->sk_receive_queue.lock);
1688 po->stats.tp_packets++;
3b885787 1689 skb->dropcount = atomic_read(&sk->sk_drops);
1690 __skb_queue_tail(&sk->sk_receive_queue, skb);
1691 spin_unlock(&sk->sk_receive_queue.lock);
1692 sk->sk_data_ready(sk, skb->len);
1693 return 0;
1694
1695drop_n_acct:
1696 spin_lock(&sk->sk_receive_queue.lock);
1697 po->stats.tp_drops++;
1698 atomic_inc(&sk->sk_drops);
1699 spin_unlock(&sk->sk_receive_queue.lock);
1700
1701drop_n_restore:
1702 if (skb_head != skb->data && skb_shared(skb)) {
1703 skb->data = skb_head;
1704 skb->len = skb_len;
1705 }
1706drop:
ead2ceb0 1707 consume_skb(skb);
1708 return 0;
1709}
1710
1711static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1712 struct packet_type *pt, struct net_device *orig_dev)
1713{
1714 struct sock *sk;
1715 struct packet_sock *po;
1716 struct sockaddr_ll *sll;
1717 union {
1718 struct tpacket_hdr *h1;
1719 struct tpacket2_hdr *h2;
f6fb8f10 1720 struct tpacket3_hdr *h3;
1721 void *raw;
1722 } h;
40d4e3df 1723 u8 *skb_head = skb->data;
1da177e4 1724 int skb_len = skb->len;
dbcb5855 1725 unsigned int snaplen, res;
f6fb8f10 1726 unsigned long status = TP_STATUS_USER;
bbd6ef87 1727 unsigned short macoff, netoff, hdrlen;
1da177e4 1728 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1729 struct timeval tv;
bbd6ef87 1730 struct timespec ts;
614f60fa 1731 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1732
1733 if (skb->pkt_type == PACKET_LOOPBACK)
1734 goto drop;
1735
1736 sk = pt->af_packet_priv;
1737 po = pkt_sk(sk);
1738
09ad9bc7 1739 if (!net_eq(dev_net(dev), sock_net(sk)))
1740 goto drop;
1741
3b04ddde 1742 if (dev->header_ops) {
1da177e4 1743 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1744 skb_push(skb, skb->data - skb_mac_header(skb));
1745 else if (skb->pkt_type == PACKET_OUTGOING) {
1746 /* Special case: outgoing packets have ll header at head */
bbe735e4 1747 skb_pull(skb, skb_network_offset(skb));
1748 }
1749 }
1750
1751 if (skb->ip_summed == CHECKSUM_PARTIAL)
1752 status |= TP_STATUS_CSUMNOTREADY;
1753
1754 snaplen = skb->len;
1755
1756 res = run_filter(skb, sk, snaplen);
1757 if (!res)
fda9ef5d 1758 goto drop_n_restore;
1759 if (snaplen > res)
1760 snaplen = res;
1761
1762 if (sk->sk_type == SOCK_DGRAM) {
1763 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1764 po->tp_reserve;
1da177e4 1765 } else {
95c96174 1766 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1767 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1768 (maclen < 16 ? 16 : maclen)) +
1769 po->tp_reserve;
1770 macoff = netoff - maclen;
1771 }
f6fb8f10 1772 if (po->tp_version <= TPACKET_V2) {
1773 if (macoff + snaplen > po->rx_ring.frame_size) {
1774 if (po->copy_thresh &&
0fd7bac6 1775 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1776 if (skb_shared(skb)) {
1777 copy_skb = skb_clone(skb, GFP_ATOMIC);
1778 } else {
1779 copy_skb = skb_get(skb);
1780 skb_head = skb->data;
1781 }
1782 if (copy_skb)
1783 skb_set_owner_r(copy_skb, sk);
1da177e4 1784 }
f6fb8f10 1785 snaplen = po->rx_ring.frame_size - macoff;
1786 if ((int)snaplen < 0)
1787 snaplen = 0;
1da177e4 1788 }
1da177e4 1789 }
1da177e4 1790 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1791 h.raw = packet_current_rx_frame(po, skb,
1792 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1793 if (!h.raw)
1da177e4 1794 goto ring_is_full;
f6fb8f10 1795 if (po->tp_version <= TPACKET_V2) {
1796 packet_increment_rx_head(po, &po->rx_ring);
1797 /*
1798 * LOSING will be reported till you read the stats,
1799 * because it's COR - Clear On Read.
 1800 * Anyway, moving it for V1/V2 only as V3 doesn't need this
1801 * at packet level.
1802 */
1803 if (po->stats.tp_drops)
1804 status |= TP_STATUS_LOSING;
1805 }
1806 po->stats.tp_packets++;
1807 if (copy_skb) {
1808 status |= TP_STATUS_COPY;
1809 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1810 }
1811 spin_unlock(&sk->sk_receive_queue.lock);
1812
bbd6ef87 1813 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1814
1815 switch (po->tp_version) {
1816 case TPACKET_V1:
1817 h.h1->tp_len = skb->len;
1818 h.h1->tp_snaplen = snaplen;
1819 h.h1->tp_mac = macoff;
1820 h.h1->tp_net = netoff;
1821 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1822 && shhwtstamps->syststamp.tv64)
1823 tv = ktime_to_timeval(shhwtstamps->syststamp);
1824 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1825 && shhwtstamps->hwtstamp.tv64)
1826 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1827 else if (skb->tstamp.tv64)
1828 tv = ktime_to_timeval(skb->tstamp);
1829 else
1830 do_gettimeofday(&tv);
1831 h.h1->tp_sec = tv.tv_sec;
1832 h.h1->tp_usec = tv.tv_usec;
1833 hdrlen = sizeof(*h.h1);
1834 break;
1835 case TPACKET_V2:
1836 h.h2->tp_len = skb->len;
1837 h.h2->tp_snaplen = snaplen;
1838 h.h2->tp_mac = macoff;
1839 h.h2->tp_net = netoff;
1840 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1841 && shhwtstamps->syststamp.tv64)
1842 ts = ktime_to_timespec(shhwtstamps->syststamp);
1843 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1844 && shhwtstamps->hwtstamp.tv64)
1845 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1846 else if (skb->tstamp.tv64)
bbd6ef87
PM
1847 ts = ktime_to_timespec(skb->tstamp);
1848 else
1849 getnstimeofday(&ts);
1850 h.h2->tp_sec = ts.tv_sec;
1851 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1852 if (vlan_tx_tag_present(skb)) {
1853 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1854 status |= TP_STATUS_VLAN_VALID;
1855 } else {
1856 h.h2->tp_vlan_tci = 0;
1857 }
13fcb7bd 1858 h.h2->tp_padding = 0;
bbd6ef87
PM
1859 hdrlen = sizeof(*h.h2);
1860 break;
f6fb8f10 1861 case TPACKET_V3:
 1862 /* tp_next_offset and vlan are already populated above,
 1863 * so don't clear those fields here.
1864 */
1865 h.h3->tp_status |= status;
1866 h.h3->tp_len = skb->len;
1867 h.h3->tp_snaplen = snaplen;
1868 h.h3->tp_mac = macoff;
1869 h.h3->tp_net = netoff;
1870 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1871 && shhwtstamps->syststamp.tv64)
1872 ts = ktime_to_timespec(shhwtstamps->syststamp);
1873 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1874 && shhwtstamps->hwtstamp.tv64)
1875 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1876 else if (skb->tstamp.tv64)
1877 ts = ktime_to_timespec(skb->tstamp);
1878 else
1879 getnstimeofday(&ts);
1880 h.h3->tp_sec = ts.tv_sec;
1881 h.h3->tp_nsec = ts.tv_nsec;
1882 hdrlen = sizeof(*h.h3);
1883 break;
bbd6ef87
PM
1884 default:
1885 BUG();
1886 }
1da177e4 1887
bbd6ef87 1888 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1889 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1890 sll->sll_family = AF_PACKET;
1891 sll->sll_hatype = dev->type;
1892 sll->sll_protocol = skb->protocol;
1893 sll->sll_pkttype = skb->pkt_type;
8032b464 1894 if (unlikely(po->origdev))
80feaacb
PWJ
1895 sll->sll_ifindex = orig_dev->ifindex;
1896 else
1897 sll->sll_ifindex = dev->ifindex;
1da177e4 1898
e16aa207 1899 smp_mb();
f6dafa95 1900#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1901 {
0af55bb5
CG
1902 u8 *start, *end;
1903
f6fb8f10 1904 if (po->tp_version <= TPACKET_V2) {
1905 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1906 + macoff + snaplen);
1907 for (start = h.raw; start < end; start += PAGE_SIZE)
1908 flush_dcache_page(pgv_to_page(start));
1909 }
cc9f01b2 1910 smp_wmb();
1da177e4 1911 }
f6dafa95 1912#endif
f6fb8f10 1913 if (po->tp_version <= TPACKET_V2)
1914 __packet_set_status(po, h.raw, status);
1915 else
1916 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1917
1918 sk->sk_data_ready(sk, 0);
1919
1920drop_n_restore:
1921 if (skb_head != skb->data && skb_shared(skb)) {
1922 skb->data = skb_head;
1923 skb->len = skb_len;
1924 }
1925drop:
1ce4f28b 1926 kfree_skb(skb);
1da177e4
LT
1927 return 0;
1928
1929ring_is_full:
1930 po->stats.tp_drops++;
1931 spin_unlock(&sk->sk_receive_queue.lock);
1932
1933 sk->sk_data_ready(sk, 0);
acb5d75b 1934 kfree_skb(copy_skb);
1da177e4
LT
1935 goto drop_n_restore;
1936}
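
For context, a minimal userspace consumer of the mmap'd RX ring that tpacket_rcv() fills could look like the sketch below. This is illustrative only, not part of this file: it assumes a TPACKET_V2 ring whose tp_block_size is an exact multiple of tp_frame_size (as in the setup sketch further down), omits the memory barriers a production reader needs, and handle_packet() is a hypothetical helper.

#include <poll.h>
#include <linux/if_packet.h>

extern void handle_packet(const void *data, unsigned int len);	/* hypothetical */

static void rx_loop(int fd, char *ring, const struct tpacket_req *req)
{
	unsigned int frame = 0;

	for (;;) {
		/* valid only if tp_block_size is an exact multiple of tp_frame_size */
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + frame * req->tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* wait for tpacket_rcv() to publish a frame */
			continue;
		}

		/* packet data starts tp_mac bytes into the frame, tp_snaplen bytes long */
		handle_packet((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back to the kernel */
		frame = (frame + 1) % req->tp_frame_nr;
	}
}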
1937
69e3c75f
JB
1938static void tpacket_destruct_skb(struct sk_buff *skb)
1939{
1940 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1941 void *ph;
1da177e4 1942
69e3c75f
JB
1943 if (likely(po->tx_ring.pg_vec)) {
1944 ph = skb_shinfo(skb)->destructor_arg;
1945 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1946 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1947 atomic_dec(&po->tx_ring.pending);
1948 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1949 }
1950
1951 sock_wfree(skb);
1952}
1953
40d4e3df
ED
1954static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1955 void *frame, struct net_device *dev, int size_max,
ae641949 1956 __be16 proto, unsigned char *addr, int hlen)
69e3c75f
JB
1957{
1958 union {
1959 struct tpacket_hdr *h1;
1960 struct tpacket2_hdr *h2;
1961 void *raw;
1962 } ph;
1963 int to_write, offset, len, tp_len, nr_frags, len_max;
1964 struct socket *sock = po->sk.sk_socket;
1965 struct page *page;
1966 void *data;
1967 int err;
1968
1969 ph.raw = frame;
1970
1971 skb->protocol = proto;
1972 skb->dev = dev;
1973 skb->priority = po->sk.sk_priority;
2d37a186 1974 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1975 skb_shinfo(skb)->destructor_arg = ph.raw;
1976
1977 switch (po->tp_version) {
1978 case TPACKET_V2:
1979 tp_len = ph.h2->tp_len;
1980 break;
1981 default:
1982 tp_len = ph.h1->tp_len;
1983 break;
1984 }
1985 if (unlikely(tp_len > size_max)) {
40d4e3df 1986 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1987 return -EMSGSIZE;
1988 }
1989
ae641949 1990 skb_reserve(skb, hlen);
69e3c75f
JB
1991 skb_reset_network_header(skb);
1992
1993 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1994 to_write = tp_len;
1995
1996 if (sock->type == SOCK_DGRAM) {
1997 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1998 NULL, tp_len);
1999 if (unlikely(err < 0))
2000 return -EINVAL;
40d4e3df 2001 } else if (dev->hard_header_len) {
69e3c75f
JB
2002 /* net device doesn't like empty head */
2003 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2004 pr_err("packet size is too short (%d < %d)\n",
2005 tp_len, dev->hard_header_len);
69e3c75f
JB
2006 return -EINVAL;
2007 }
2008
2009 skb_push(skb, dev->hard_header_len);
2010 err = skb_store_bits(skb, 0, data,
2011 dev->hard_header_len);
2012 if (unlikely(err))
2013 return err;
2014
2015 data += dev->hard_header_len;
2016 to_write -= dev->hard_header_len;
2017 }
2018
2019 err = -EFAULT;
69e3c75f
JB
2020 offset = offset_in_page(data);
2021 len_max = PAGE_SIZE - offset;
2022 len = ((to_write > len_max) ? len_max : to_write);
2023
2024 skb->data_len = to_write;
2025 skb->len += to_write;
2026 skb->truesize += to_write;
2027 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2028
2029 while (likely(to_write)) {
2030 nr_frags = skb_shinfo(skb)->nr_frags;
2031
2032 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2033 pr_err("Packet exceed the number of skb frags(%lu)\n",
2034 MAX_SKB_FRAGS);
69e3c75f
JB
2035 return -EFAULT;
2036 }
2037
0af55bb5
CG
2038 page = pgv_to_page(data);
2039 data += len;
69e3c75f
JB
2040 flush_dcache_page(page);
2041 get_page(page);
0af55bb5 2042 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2043 to_write -= len;
2044 offset = 0;
2045 len_max = PAGE_SIZE;
2046 len = ((to_write > len_max) ? len_max : to_write);
2047 }
2048
2049 return tp_len;
2050}
2051
2052static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2053{
69e3c75f
JB
2054 struct sk_buff *skb;
2055 struct net_device *dev;
2056 __be16 proto;
827d9780
BG
2057 bool need_rls_dev = false;
2058 int err, reserve = 0;
40d4e3df
ED
2059 void *ph;
2060 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2061 int tp_len, size_max;
2062 unsigned char *addr;
2063 int len_sum = 0;
2064 int status = 0;
ae641949 2065 int hlen, tlen;
69e3c75f 2066
69e3c75f
JB
2067 mutex_lock(&po->pg_vec_lock);
2068
2069 err = -EBUSY;
2070 if (saddr == NULL) {
827d9780 2071 dev = po->prot_hook.dev;
69e3c75f
JB
2072 proto = po->num;
2073 addr = NULL;
2074 } else {
2075 err = -EINVAL;
2076 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2077 goto out;
2078 if (msg->msg_namelen < (saddr->sll_halen
2079 + offsetof(struct sockaddr_ll,
2080 sll_addr)))
2081 goto out;
69e3c75f
JB
2082 proto = saddr->sll_protocol;
2083 addr = saddr->sll_addr;
827d9780
BG
2084 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2085 need_rls_dev = true;
69e3c75f
JB
2086 }
2087
69e3c75f
JB
2088 err = -ENXIO;
2089 if (unlikely(dev == NULL))
2090 goto out;
2091
2092 reserve = dev->hard_header_len;
2093
2094 err = -ENETDOWN;
2095 if (unlikely(!(dev->flags & IFF_UP)))
2096 goto out_put;
2097
2098 size_max = po->tx_ring.frame_size
b5dd884e 2099 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2100
2101 if (size_max > dev->mtu + reserve)
2102 size_max = dev->mtu + reserve;
2103
2104 do {
2105 ph = packet_current_frame(po, &po->tx_ring,
2106 TP_STATUS_SEND_REQUEST);
2107
2108 if (unlikely(ph == NULL)) {
2109 schedule();
2110 continue;
2111 }
2112
2113 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2114 hlen = LL_RESERVED_SPACE(dev);
2115 tlen = dev->needed_tailroom;
69e3c75f 2116 skb = sock_alloc_send_skb(&po->sk,
ae641949 2117 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2118 0, &err);
2119
2120 if (unlikely(skb == NULL))
2121 goto out_status;
2122
2123 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2124 addr, hlen);
69e3c75f
JB
2125
2126 if (unlikely(tp_len < 0)) {
2127 if (po->tp_loss) {
2128 __packet_set_status(po, ph,
2129 TP_STATUS_AVAILABLE);
2130 packet_increment_head(&po->tx_ring);
2131 kfree_skb(skb);
2132 continue;
2133 } else {
2134 status = TP_STATUS_WRONG_FORMAT;
2135 err = tp_len;
2136 goto out_status;
2137 }
2138 }
2139
2140 skb->destructor = tpacket_destruct_skb;
2141 __packet_set_status(po, ph, TP_STATUS_SENDING);
2142 atomic_inc(&po->tx_ring.pending);
2143
2144 status = TP_STATUS_SEND_REQUEST;
2145 err = dev_queue_xmit(skb);
eb70df13
JP
2146 if (unlikely(err > 0)) {
2147 err = net_xmit_errno(err);
2148 if (err && __packet_get_status(po, ph) ==
2149 TP_STATUS_AVAILABLE) {
2150 /* skb was destructed already */
2151 skb = NULL;
2152 goto out_status;
2153 }
2154 /*
2155 * skb was dropped but not destructed yet;
2156 * let's treat it like congestion or err < 0
2157 */
2158 err = 0;
2159 }
69e3c75f
JB
2160 packet_increment_head(&po->tx_ring);
2161 len_sum += tp_len;
f64f9e71
JP
2162 } while (likely((ph != NULL) ||
2163 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2164 (atomic_read(&po->tx_ring.pending))))
2165 );
69e3c75f
JB
2166
2167 err = len_sum;
2168 goto out_put;
2169
69e3c75f
JB
2170out_status:
2171 __packet_set_status(po, ph, status);
2172 kfree_skb(skb);
2173out_put:
827d9780
BG
2174 if (need_rls_dev)
2175 dev_put(dev);
69e3c75f
JB
2176out:
2177 mutex_unlock(&po->pg_vec_lock);
2178 return err;
2179}
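
The userspace half of this TX-ring path could look roughly like the following sketch, with the same TPACKET_V2 frame-layout assumption as the RX sketch above. It assumes the socket is already bound, so the saddr == NULL branch applies, and that the frame carries a complete link-layer header on a SOCK_RAW socket.

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_one(int fd, char *ring, const struct tpacket_req *req,
		  unsigned int frame, const void *pkt, unsigned int len)
{
	struct tpacket2_hdr *hdr =
		(struct tpacket2_hdr *)(ring + frame * req->tp_frame_size);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* frame still owned by the kernel */

	/* payload goes where tpacket_fill_skb() expects it, right after the
	 * aligned tpacket2_hdr; for SOCK_RAW it must include the link-layer header */
	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll), pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* kick tpacket_snd(); MSG_DONTWAIT returns without waiting for completion */
	return send(fd, NULL, 0, MSG_DONTWAIT);
}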
69e3c75f 2180
eea49cc9
OJ
2181static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2182 size_t reserve, size_t len,
2183 size_t linear, int noblock,
2184 int *err)
bfd5f4a3
SS
2185{
2186 struct sk_buff *skb;
2187
2188 /* Under a page? Don't bother with paged skb. */
2189 if (prepad + len < PAGE_SIZE || !linear)
2190 linear = len;
2191
2192 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2193 err);
2194 if (!skb)
2195 return NULL;
2196
2197 skb_reserve(skb, reserve);
2198 skb_put(skb, linear);
2199 skb->data_len = len - linear;
2200 skb->len += len - linear;
2201
2202 return skb;
2203}
2204
69e3c75f 2205static int packet_snd(struct socket *sock,
1da177e4
LT
2206 struct msghdr *msg, size_t len)
2207{
2208 struct sock *sk = sock->sk;
40d4e3df 2209 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2210 struct sk_buff *skb;
2211 struct net_device *dev;
0e11c91e 2212 __be16 proto;
827d9780 2213 bool need_rls_dev = false;
1da177e4 2214 unsigned char *addr;
827d9780 2215 int err, reserve = 0;
bfd5f4a3
SS
2216 struct virtio_net_hdr vnet_hdr = { 0 };
2217 int offset = 0;
2218 int vnet_hdr_len;
2219 struct packet_sock *po = pkt_sk(sk);
2220 unsigned short gso_type = 0;
ae641949 2221 int hlen, tlen;
3bdc0eba 2222 int extra_len = 0;
1da177e4
LT
2223
2224 /*
1ce4f28b 2225 * Get and verify the address.
1da177e4 2226 */
1ce4f28b 2227
1da177e4 2228 if (saddr == NULL) {
827d9780 2229 dev = po->prot_hook.dev;
1da177e4
LT
2230 proto = po->num;
2231 addr = NULL;
2232 } else {
2233 err = -EINVAL;
2234 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2235 goto out;
0fb375fb
EB
2236 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2237 goto out;
1da177e4
LT
2238 proto = saddr->sll_protocol;
2239 addr = saddr->sll_addr;
827d9780
BG
2240 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2241 need_rls_dev = true;
1da177e4
LT
2242 }
2243
1da177e4
LT
2244 err = -ENXIO;
2245 if (dev == NULL)
2246 goto out_unlock;
2247 if (sock->type == SOCK_RAW)
2248 reserve = dev->hard_header_len;
2249
d5e76b0a
DM
2250 err = -ENETDOWN;
2251 if (!(dev->flags & IFF_UP))
2252 goto out_unlock;
2253
bfd5f4a3
SS
2254 if (po->has_vnet_hdr) {
2255 vnet_hdr_len = sizeof(vnet_hdr);
2256
2257 err = -EINVAL;
2258 if (len < vnet_hdr_len)
2259 goto out_unlock;
2260
2261 len -= vnet_hdr_len;
2262
2263 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2264 vnet_hdr_len);
2265 if (err < 0)
2266 goto out_unlock;
2267
2268 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2269 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2270 vnet_hdr.hdr_len))
2271 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2272 vnet_hdr.csum_offset + 2;
2273
2274 err = -EINVAL;
2275 if (vnet_hdr.hdr_len > len)
2276 goto out_unlock;
2277
2278 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2279 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2280 case VIRTIO_NET_HDR_GSO_TCPV4:
2281 gso_type = SKB_GSO_TCPV4;
2282 break;
2283 case VIRTIO_NET_HDR_GSO_TCPV6:
2284 gso_type = SKB_GSO_TCPV6;
2285 break;
2286 case VIRTIO_NET_HDR_GSO_UDP:
2287 gso_type = SKB_GSO_UDP;
2288 break;
2289 default:
2290 goto out_unlock;
2291 }
2292
2293 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2294 gso_type |= SKB_GSO_TCP_ECN;
2295
2296 if (vnet_hdr.gso_size == 0)
2297 goto out_unlock;
2298
2299 }
2300 }
2301
3bdc0eba
BG
2302 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2303 if (!netif_supports_nofcs(dev)) {
2304 err = -EPROTONOSUPPORT;
2305 goto out_unlock;
2306 }
2307 extra_len = 4; /* We're doing our own CRC */
2308 }
2309
1da177e4 2310 err = -EMSGSIZE;
3bdc0eba 2311 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2312 goto out_unlock;
2313
bfd5f4a3 2314 err = -ENOBUFS;
ae641949
HX
2315 hlen = LL_RESERVED_SPACE(dev);
2316 tlen = dev->needed_tailroom;
2317 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2318 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2319 if (skb == NULL)
1da177e4
LT
2320 goto out_unlock;
2321
bfd5f4a3 2322 skb_set_network_header(skb, reserve);
1da177e4 2323
0c4e8581
SH
2324 err = -EINVAL;
2325 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2326 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2327 goto out_free;
1da177e4
LT
2328
2329 /* Returns -EFAULT on error */
bfd5f4a3 2330 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2331 if (err)
2332 goto out_free;
2244d07b 2333 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2334 if (err < 0)
2335 goto out_free;
1da177e4 2336
3bdc0eba 2337 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2338 /* Earlier code assumed this would be a VLAN pkt,
2339 * double-check this now that we have the actual
2340 * packet in hand.
2341 */
2342 struct ethhdr *ehdr;
2343 skb_reset_mac_header(skb);
2344 ehdr = eth_hdr(skb);
2345 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2346 err = -EMSGSIZE;
2347 goto out_free;
2348 }
2349 }
2350
1da177e4
LT
2351 skb->protocol = proto;
2352 skb->dev = dev;
2353 skb->priority = sk->sk_priority;
2d37a186 2354 skb->mark = sk->sk_mark;
1da177e4 2355
bfd5f4a3
SS
2356 if (po->has_vnet_hdr) {
2357 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2358 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2359 vnet_hdr.csum_offset)) {
2360 err = -EINVAL;
2361 goto out_free;
2362 }
2363 }
2364
2365 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2366 skb_shinfo(skb)->gso_type = gso_type;
2367
2368 /* Header must be checked, and gso_segs computed. */
2369 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2370 skb_shinfo(skb)->gso_segs = 0;
2371
2372 len += vnet_hdr_len;
2373 }
2374
3bdc0eba
BG
2375 if (unlikely(extra_len == 4))
2376 skb->no_fcs = 1;
2377
1da177e4
LT
2378 /*
2379 * Now send it
2380 */
2381
2382 err = dev_queue_xmit(skb);
2383 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2384 goto out_unlock;
2385
827d9780
BG
2386 if (need_rls_dev)
2387 dev_put(dev);
1da177e4 2388
40d4e3df 2389 return len;
1da177e4
LT
2390
2391out_free:
2392 kfree_skb(skb);
2393out_unlock:
827d9780 2394 if (dev && need_rls_dev)
1da177e4
LT
2395 dev_put(dev);
2396out:
2397 return err;
2398}
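
For comparison, the non-ring path above is what a plain sendto() on the socket ends up in. A hedged userspace sketch, with the interface name and ETH_P_ALL as arbitrary example choices:

#include <sys/types.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static ssize_t send_frame(int fd, const char *ifname, const void *frame, size_t len)
{
	struct sockaddr_ll sll = {
		.sll_family   = AF_PACKET,
		.sll_protocol = htons(ETH_P_ALL),
		.sll_ifindex  = if_nametoindex(ifname),
	};

	/* on a SOCK_RAW socket the buffer must hold the complete link-layer frame */
	return sendto(fd, frame, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}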
2399
69e3c75f
JB
2400static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2401 struct msghdr *msg, size_t len)
2402{
69e3c75f
JB
2403 struct sock *sk = sock->sk;
2404 struct packet_sock *po = pkt_sk(sk);
2405 if (po->tx_ring.pg_vec)
2406 return tpacket_snd(po, msg);
2407 else
69e3c75f
JB
2408 return packet_snd(sock, msg, len);
2409}
2410
1da177e4
LT
2411/*
2412 * Close a PACKET socket. This is fairly simple. We immediately go
2413 * to 'closed' state and remove our protocol entry in the device list.
2414 */
2415
2416static int packet_release(struct socket *sock)
2417{
2418 struct sock *sk = sock->sk;
2419 struct packet_sock *po;
d12d01d6 2420 struct net *net;
f6fb8f10 2421 union tpacket_req_u req_u;
1da177e4
LT
2422
2423 if (!sk)
2424 return 0;
2425
3b1e0a65 2426 net = sock_net(sk);
1da177e4
LT
2427 po = pkt_sk(sk);
2428
808f5114 2429 spin_lock_bh(&net->packet.sklist_lock);
2430 sk_del_node_init_rcu(sk);
920de804 2431 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2432 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2433
808f5114 2434 spin_lock(&po->bind_lock);
ce06b03e 2435 unregister_prot_hook(sk, false);
160ff18a
BG
2436 if (po->prot_hook.dev) {
2437 dev_put(po->prot_hook.dev);
2438 po->prot_hook.dev = NULL;
2439 }
808f5114 2440 spin_unlock(&po->bind_lock);
1da177e4 2441
1da177e4 2442 packet_flush_mclist(sk);
1da177e4 2443
f6fb8f10 2444 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2445
2446 if (po->rx_ring.pg_vec)
f6fb8f10 2447 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2448
2449 if (po->tx_ring.pg_vec)
f6fb8f10 2450 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2451
dc99f600
DM
2452 fanout_release(sk);
2453
808f5114 2454 synchronize_net();
1da177e4
LT
2455 /*
2456 * Now the socket is dead. No more input will appear.
2457 */
1da177e4
LT
2458 sock_orphan(sk);
2459 sock->sk = NULL;
2460
2461 /* Purge queues */
2462
2463 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2464 sk_refcnt_debug_release(sk);
1da177e4
LT
2465
2466 sock_put(sk);
2467 return 0;
2468}
2469
2470/*
2471 * Attach a packet hook.
2472 */
2473
0e11c91e 2474static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2475{
2476 struct packet_sock *po = pkt_sk(sk);
dc99f600 2477
aef950b4
WY
2478 if (po->fanout) {
2479 if (dev)
2480 dev_put(dev);
2481
dc99f600 2482 return -EINVAL;
aef950b4 2483 }
1da177e4
LT
2484
2485 lock_sock(sk);
2486
2487 spin_lock(&po->bind_lock);
ce06b03e 2488 unregister_prot_hook(sk, true);
1da177e4
LT
2489 po->num = protocol;
2490 po->prot_hook.type = protocol;
160ff18a
BG
2491 if (po->prot_hook.dev)
2492 dev_put(po->prot_hook.dev);
1da177e4
LT
2493 po->prot_hook.dev = dev;
2494
2495 po->ifindex = dev ? dev->ifindex : 0;
2496
2497 if (protocol == 0)
2498 goto out_unlock;
2499
be85d4ad 2500 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2501 register_prot_hook(sk);
be85d4ad
UT
2502 } else {
2503 sk->sk_err = ENETDOWN;
2504 if (!sock_flag(sk, SOCK_DEAD))
2505 sk->sk_error_report(sk);
1da177e4
LT
2506 }
2507
2508out_unlock:
2509 spin_unlock(&po->bind_lock);
2510 release_sock(sk);
2511 return 0;
2512}
2513
2514/*
2515 * Bind a packet socket to a device
2516 */
2517
40d4e3df
ED
2518static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2519 int addr_len)
1da177e4 2520{
40d4e3df 2521 struct sock *sk = sock->sk;
1da177e4
LT
2522 char name[15];
2523 struct net_device *dev;
2524 int err = -ENODEV;
1ce4f28b 2525
1da177e4
LT
2526 /*
2527 * Check legality
2528 */
1ce4f28b 2529
8ae55f04 2530 if (addr_len != sizeof(struct sockaddr))
1da177e4 2531 return -EINVAL;
40d4e3df 2532 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2533
3b1e0a65 2534 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2535 if (dev)
1da177e4 2536 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2537 return err;
2538}
1da177e4
LT
2539
2540static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2541{
40d4e3df
ED
2542 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2543 struct sock *sk = sock->sk;
1da177e4
LT
2544 struct net_device *dev = NULL;
2545 int err;
2546
2547
2548 /*
2549 * Check legality
2550 */
1ce4f28b 2551
1da177e4
LT
2552 if (addr_len < sizeof(struct sockaddr_ll))
2553 return -EINVAL;
2554 if (sll->sll_family != AF_PACKET)
2555 return -EINVAL;
2556
2557 if (sll->sll_ifindex) {
2558 err = -ENODEV;
3b1e0a65 2559 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2560 if (dev == NULL)
2561 goto out;
2562 }
2563 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2564
2565out:
2566 return err;
2567}
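
A corresponding userspace bind might look like this sketch (illustrative; the interface name is an assumption):

#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static int bind_to_device(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* 0 would keep the protocol given to socket() */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}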
2568
2569static struct proto packet_proto = {
2570 .name = "PACKET",
2571 .owner = THIS_MODULE,
2572 .obj_size = sizeof(struct packet_sock),
2573};
2574
2575/*
1ce4f28b 2576 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2577 */
2578
3f378b68
EP
2579static int packet_create(struct net *net, struct socket *sock, int protocol,
2580 int kern)
1da177e4
LT
2581{
2582 struct sock *sk;
2583 struct packet_sock *po;
0e11c91e 2584 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2585 int err;
2586
2587 if (!capable(CAP_NET_RAW))
2588 return -EPERM;
be02097c
DM
2589 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2590 sock->type != SOCK_PACKET)
1da177e4
LT
2591 return -ESOCKTNOSUPPORT;
2592
2593 sock->state = SS_UNCONNECTED;
2594
2595 err = -ENOBUFS;
6257ff21 2596 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2597 if (sk == NULL)
2598 goto out;
2599
2600 sock->ops = &packet_ops;
1da177e4
LT
2601 if (sock->type == SOCK_PACKET)
2602 sock->ops = &packet_ops_spkt;
be02097c 2603
1da177e4
LT
2604 sock_init_data(sock, sk);
2605
2606 po = pkt_sk(sk);
2607 sk->sk_family = PF_PACKET;
0e11c91e 2608 po->num = proto;
1da177e4
LT
2609
2610 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2611 sk_refcnt_debug_inc(sk);
1da177e4
LT
2612
2613 /*
2614 * Attach a protocol block
2615 */
2616
2617 spin_lock_init(&po->bind_lock);
905db440 2618 mutex_init(&po->pg_vec_lock);
1da177e4 2619 po->prot_hook.func = packet_rcv;
be02097c 2620
1da177e4
LT
2621 if (sock->type == SOCK_PACKET)
2622 po->prot_hook.func = packet_rcv_spkt;
be02097c 2623
1da177e4
LT
2624 po->prot_hook.af_packet_priv = sk;
2625
0e11c91e
AV
2626 if (proto) {
2627 po->prot_hook.type = proto;
ce06b03e 2628 register_prot_hook(sk);
1da177e4
LT
2629 }
2630
808f5114 2631 spin_lock_bh(&net->packet.sklist_lock);
2632 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2633 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2634 spin_unlock_bh(&net->packet.sklist_lock);
2635
40d4e3df 2636 return 0;
1da177e4
LT
2637out:
2638 return err;
2639}
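
Creating such a socket from userspace is a single call; a small sketch for reference, which requires CAP_NET_RAW exactly as checked above:

#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static int open_packet_socket(void)
{
	/* SOCK_DGRAM instead of SOCK_RAW makes the kernel strip/build the
	 * device header (compare the SOCK_DGRAM branches elsewhere in this file) */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}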
2640
ed85b565
RC
2641static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2642{
2643 struct sock_exterr_skb *serr;
2644 struct sk_buff *skb, *skb2;
2645 int copied, err;
2646
2647 err = -EAGAIN;
2648 skb = skb_dequeue(&sk->sk_error_queue);
2649 if (skb == NULL)
2650 goto out;
2651
2652 copied = skb->len;
2653 if (copied > len) {
2654 msg->msg_flags |= MSG_TRUNC;
2655 copied = len;
2656 }
2657 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2658 if (err)
2659 goto out_free_skb;
2660
2661 sock_recv_timestamp(msg, sk, skb);
2662
2663 serr = SKB_EXT_ERR(skb);
2664 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2665 sizeof(serr->ee), &serr->ee);
2666
2667 msg->msg_flags |= MSG_ERRQUEUE;
2668 err = copied;
2669
2670 /* Reset and regenerate socket error */
2671 spin_lock_bh(&sk->sk_error_queue.lock);
2672 sk->sk_err = 0;
2673 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2674 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2675 spin_unlock_bh(&sk->sk_error_queue.lock);
2676 sk->sk_error_report(sk);
2677 } else
2678 spin_unlock_bh(&sk->sk_error_queue.lock);
2679
2680out_free_skb:
2681 kfree_skb(skb);
2682out:
2683 return err;
2684}
2685
1da177e4
LT
2686/*
2687 * Pull a packet from our receive queue and hand it to the user.
2688 * If necessary we block.
2689 */
2690
2691static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2692 struct msghdr *msg, size_t len, int flags)
2693{
2694 struct sock *sk = sock->sk;
2695 struct sk_buff *skb;
2696 int copied, err;
0fb375fb 2697 struct sockaddr_ll *sll;
bfd5f4a3 2698 int vnet_hdr_len = 0;
1da177e4
LT
2699
2700 err = -EINVAL;
ed85b565 2701 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2702 goto out;
2703
2704#if 0
2705 /* What error should we return now? EUNATTACH? */
2706 if (pkt_sk(sk)->ifindex < 0)
2707 return -ENODEV;
2708#endif
2709
ed85b565
RC
2710 if (flags & MSG_ERRQUEUE) {
2711 err = packet_recv_error(sk, msg, len);
2712 goto out;
2713 }
2714
1da177e4
LT
2715 /*
2716 * Call the generic datagram receiver. This handles all sorts
2717 * of horrible races and re-entrancy so we can forget about it
2718 * in the protocol layers.
2719 *
 2720 * Now it will return ENETDOWN if the device has just gone down,
2721 * but then it will block.
2722 */
2723
40d4e3df 2724 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2725
2726 /*
1ce4f28b 2727 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 2728 * handles the blocking for us, we don't need to see or worry about
 2729 * blocking retries.
2730 */
2731
8ae55f04 2732 if (skb == NULL)
1da177e4
LT
2733 goto out;
2734
bfd5f4a3
SS
2735 if (pkt_sk(sk)->has_vnet_hdr) {
2736 struct virtio_net_hdr vnet_hdr = { 0 };
2737
2738 err = -EINVAL;
2739 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2740 if (len < vnet_hdr_len)
bfd5f4a3
SS
2741 goto out_free;
2742
1f18b717
MK
2743 len -= vnet_hdr_len;
2744
bfd5f4a3
SS
2745 if (skb_is_gso(skb)) {
2746 struct skb_shared_info *sinfo = skb_shinfo(skb);
2747
2748 /* This is a hint as to how much should be linear. */
2749 vnet_hdr.hdr_len = skb_headlen(skb);
2750 vnet_hdr.gso_size = sinfo->gso_size;
2751 if (sinfo->gso_type & SKB_GSO_TCPV4)
2752 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2753 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2754 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2755 else if (sinfo->gso_type & SKB_GSO_UDP)
2756 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2757 else if (sinfo->gso_type & SKB_GSO_FCOE)
2758 goto out_free;
2759 else
2760 BUG();
2761 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2762 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2763 } else
2764 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2765
2766 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2767 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2768 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2769 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2770 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2771 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2772 } /* else everything is zero */
2773
2774 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2775 vnet_hdr_len);
2776 if (err < 0)
2777 goto out_free;
2778 }
2779
0fb375fb
EB
2780 /*
2781 * If the address length field is there to be filled in, we fill
2782 * it in now.
2783 */
2784
ffbc6111 2785 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2786 if (sock->type == SOCK_PACKET)
2787 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2788 else
2789 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2790
1da177e4
LT
2791 /*
2792 * You lose any data beyond the buffer you gave. If it worries a
2793 * user program they can ask the device for its MTU anyway.
2794 */
2795
2796 copied = skb->len;
40d4e3df
ED
2797 if (copied > len) {
2798 copied = len;
2799 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2800 }
2801
2802 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2803 if (err)
2804 goto out_free;
2805
3b885787 2806 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2807
2808 if (msg->msg_name)
ffbc6111
HX
2809 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2810 msg->msg_namelen);
1da177e4 2811
8dc41944 2812 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2813 struct tpacket_auxdata aux;
2814
2815 aux.tp_status = TP_STATUS_USER;
2816 if (skb->ip_summed == CHECKSUM_PARTIAL)
2817 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2818 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2819 aux.tp_snaplen = skb->len;
2820 aux.tp_mac = 0;
bbe735e4 2821 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2822 if (vlan_tx_tag_present(skb)) {
2823 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2824 aux.tp_status |= TP_STATUS_VLAN_VALID;
2825 } else {
2826 aux.tp_vlan_tci = 0;
2827 }
13fcb7bd 2828 aux.tp_padding = 0;
ffbc6111 2829 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2830 }
2831
1da177e4
LT
2832 /*
2833 * Free or return the buffer as appropriate. Again this
2834 * hides all the races and re-entrancy issues from us.
2835 */
bfd5f4a3 2836 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2837
2838out_free:
2839 skb_free_datagram(sk, skb);
2840out:
2841 return err;
2842}
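
A userspace caller only gets the tpacket_auxdata control message built above after enabling PACKET_AUXDATA via setsockopt(). A rough sketch of the receive side, illustrative only:

#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t len)
{
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	ssize_t n = recvmsg(fd, &msg, 0);
	struct cmsghdr *cmsg;

	if (n < 0)
		return n;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET && cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata *aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);

			(void)aux;	/* inspect aux->tp_status, tp_vlan_tci, tp_snaplen here */
		}
	}
	return n;
}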
2843
1da177e4
LT
2844static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2845 int *uaddr_len, int peer)
2846{
2847 struct net_device *dev;
2848 struct sock *sk = sock->sk;
2849
2850 if (peer)
2851 return -EOPNOTSUPP;
2852
2853 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2854 rcu_read_lock();
2855 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2856 if (dev)
67286640 2857 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2858 else
1da177e4 2859 memset(uaddr->sa_data, 0, 14);
654d1f8a 2860 rcu_read_unlock();
1da177e4
LT
2861 *uaddr_len = sizeof(*uaddr);
2862
2863 return 0;
2864}
1da177e4
LT
2865
2866static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2867 int *uaddr_len, int peer)
2868{
2869 struct net_device *dev;
2870 struct sock *sk = sock->sk;
2871 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2872 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2873
2874 if (peer)
2875 return -EOPNOTSUPP;
2876
2877 sll->sll_family = AF_PACKET;
2878 sll->sll_ifindex = po->ifindex;
2879 sll->sll_protocol = po->num;
67286640 2880 sll->sll_pkttype = 0;
654d1f8a
ED
2881 rcu_read_lock();
2882 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2883 if (dev) {
2884 sll->sll_hatype = dev->type;
2885 sll->sll_halen = dev->addr_len;
2886 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2887 } else {
2888 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2889 sll->sll_halen = 0;
2890 }
654d1f8a 2891 rcu_read_unlock();
0fb375fb 2892 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2893
2894 return 0;
2895}
2896
2aeb0b88
WC
2897static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2898 int what)
1da177e4
LT
2899{
2900 switch (i->type) {
2901 case PACKET_MR_MULTICAST:
1162563f
JP
2902 if (i->alen != dev->addr_len)
2903 return -EINVAL;
1da177e4 2904 if (what > 0)
22bedad3 2905 return dev_mc_add(dev, i->addr);
1da177e4 2906 else
22bedad3 2907 return dev_mc_del(dev, i->addr);
1da177e4
LT
2908 break;
2909 case PACKET_MR_PROMISC:
2aeb0b88 2910 return dev_set_promiscuity(dev, what);
1da177e4
LT
2911 break;
2912 case PACKET_MR_ALLMULTI:
2aeb0b88 2913 return dev_set_allmulti(dev, what);
1da177e4 2914 break;
d95ed927 2915 case PACKET_MR_UNICAST:
1162563f
JP
2916 if (i->alen != dev->addr_len)
2917 return -EINVAL;
d95ed927 2918 if (what > 0)
a748ee24 2919 return dev_uc_add(dev, i->addr);
d95ed927 2920 else
a748ee24 2921 return dev_uc_del(dev, i->addr);
d95ed927 2922 break;
40d4e3df
ED
2923 default:
2924 break;
1da177e4 2925 }
2aeb0b88 2926 return 0;
1da177e4
LT
2927}
2928
2929static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2930{
40d4e3df 2931 for ( ; i; i = i->next) {
1da177e4
LT
2932 if (i->ifindex == dev->ifindex)
2933 packet_dev_mc(dev, i, what);
2934 }
2935}
2936
0fb375fb 2937static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2938{
2939 struct packet_sock *po = pkt_sk(sk);
2940 struct packet_mclist *ml, *i;
2941 struct net_device *dev;
2942 int err;
2943
2944 rtnl_lock();
2945
2946 err = -ENODEV;
3b1e0a65 2947 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2948 if (!dev)
2949 goto done;
2950
2951 err = -EINVAL;
1162563f 2952 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2953 goto done;
2954
2955 err = -ENOBUFS;
8b3a7005 2956 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2957 if (i == NULL)
2958 goto done;
2959
2960 err = 0;
2961 for (ml = po->mclist; ml; ml = ml->next) {
2962 if (ml->ifindex == mreq->mr_ifindex &&
2963 ml->type == mreq->mr_type &&
2964 ml->alen == mreq->mr_alen &&
2965 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2966 ml->count++;
2967 /* Free the new element ... */
2968 kfree(i);
2969 goto done;
2970 }
2971 }
2972
2973 i->type = mreq->mr_type;
2974 i->ifindex = mreq->mr_ifindex;
2975 i->alen = mreq->mr_alen;
2976 memcpy(i->addr, mreq->mr_address, i->alen);
2977 i->count = 1;
2978 i->next = po->mclist;
2979 po->mclist = i;
2aeb0b88
WC
2980 err = packet_dev_mc(dev, i, 1);
2981 if (err) {
2982 po->mclist = i->next;
2983 kfree(i);
2984 }
1da177e4
LT
2985
2986done:
2987 rtnl_unlock();
2988 return err;
2989}
2990
0fb375fb 2991static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2992{
2993 struct packet_mclist *ml, **mlp;
2994
2995 rtnl_lock();
2996
2997 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2998 if (ml->ifindex == mreq->mr_ifindex &&
2999 ml->type == mreq->mr_type &&
3000 ml->alen == mreq->mr_alen &&
3001 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3002 if (--ml->count == 0) {
3003 struct net_device *dev;
3004 *mlp = ml->next;
ad959e76
ED
3005 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3006 if (dev)
1da177e4 3007 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3008 kfree(ml);
3009 }
3010 rtnl_unlock();
3011 return 0;
3012 }
3013 }
3014 rtnl_unlock();
3015 return -EADDRNOTAVAIL;
3016}
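
From userspace these memberships are driven through setsockopt(); for example, a sketch that puts an interface into promiscuous mode via PACKET_MR_PROMISC (interface name assumed):

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
}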
3017
3018static void packet_flush_mclist(struct sock *sk)
3019{
3020 struct packet_sock *po = pkt_sk(sk);
3021 struct packet_mclist *ml;
3022
3023 if (!po->mclist)
3024 return;
3025
3026 rtnl_lock();
3027 while ((ml = po->mclist) != NULL) {
3028 struct net_device *dev;
3029
3030 po->mclist = ml->next;
ad959e76
ED
3031 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3032 if (dev != NULL)
1da177e4 3033 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3034 kfree(ml);
3035 }
3036 rtnl_unlock();
3037}
1da177e4
LT
3038
3039static int
b7058842 3040packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3041{
3042 struct sock *sk = sock->sk;
8dc41944 3043 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3044 int ret;
3045
3046 if (level != SOL_PACKET)
3047 return -ENOPROTOOPT;
3048
69e3c75f 3049 switch (optname) {
1ce4f28b 3050 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3051 case PACKET_DROP_MEMBERSHIP:
3052 {
0fb375fb
EB
3053 struct packet_mreq_max mreq;
3054 int len = optlen;
3055 memset(&mreq, 0, sizeof(mreq));
3056 if (len < sizeof(struct packet_mreq))
1da177e4 3057 return -EINVAL;
0fb375fb
EB
3058 if (len > sizeof(mreq))
3059 len = sizeof(mreq);
40d4e3df 3060 if (copy_from_user(&mreq, optval, len))
1da177e4 3061 return -EFAULT;
0fb375fb
EB
3062 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3063 return -EINVAL;
1da177e4
LT
3064 if (optname == PACKET_ADD_MEMBERSHIP)
3065 ret = packet_mc_add(sk, &mreq);
3066 else
3067 ret = packet_mc_drop(sk, &mreq);
3068 return ret;
3069 }
a2efcfa0 3070
1da177e4 3071 case PACKET_RX_RING:
69e3c75f 3072 case PACKET_TX_RING:
1da177e4 3073 {
f6fb8f10 3074 union tpacket_req_u req_u;
3075 int len;
1da177e4 3076
f6fb8f10 3077 switch (po->tp_version) {
3078 case TPACKET_V1:
3079 case TPACKET_V2:
3080 len = sizeof(req_u.req);
3081 break;
3082 case TPACKET_V3:
3083 default:
3084 len = sizeof(req_u.req3);
3085 break;
3086 }
3087 if (optlen < len)
1da177e4 3088 return -EINVAL;
bfd5f4a3
SS
3089 if (pkt_sk(sk)->has_vnet_hdr)
3090 return -EINVAL;
f6fb8f10 3091 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3092 return -EFAULT;
f6fb8f10 3093 return packet_set_ring(sk, &req_u, 0,
3094 optname == PACKET_TX_RING);
1da177e4
LT
3095 }
3096 case PACKET_COPY_THRESH:
3097 {
3098 int val;
3099
40d4e3df 3100 if (optlen != sizeof(val))
1da177e4 3101 return -EINVAL;
40d4e3df 3102 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3103 return -EFAULT;
3104
3105 pkt_sk(sk)->copy_thresh = val;
3106 return 0;
3107 }
bbd6ef87
PM
3108 case PACKET_VERSION:
3109 {
3110 int val;
3111
3112 if (optlen != sizeof(val))
3113 return -EINVAL;
69e3c75f 3114 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3115 return -EBUSY;
3116 if (copy_from_user(&val, optval, sizeof(val)))
3117 return -EFAULT;
3118 switch (val) {
3119 case TPACKET_V1:
3120 case TPACKET_V2:
f6fb8f10 3121 case TPACKET_V3:
bbd6ef87
PM
3122 po->tp_version = val;
3123 return 0;
3124 default:
3125 return -EINVAL;
3126 }
3127 }
8913336a
PM
3128 case PACKET_RESERVE:
3129 {
3130 unsigned int val;
3131
3132 if (optlen != sizeof(val))
3133 return -EINVAL;
69e3c75f 3134 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3135 return -EBUSY;
3136 if (copy_from_user(&val, optval, sizeof(val)))
3137 return -EFAULT;
3138 po->tp_reserve = val;
3139 return 0;
3140 }
69e3c75f
JB
3141 case PACKET_LOSS:
3142 {
3143 unsigned int val;
3144
3145 if (optlen != sizeof(val))
3146 return -EINVAL;
3147 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3148 return -EBUSY;
3149 if (copy_from_user(&val, optval, sizeof(val)))
3150 return -EFAULT;
3151 po->tp_loss = !!val;
3152 return 0;
3153 }
8dc41944
HX
3154 case PACKET_AUXDATA:
3155 {
3156 int val;
3157
3158 if (optlen < sizeof(val))
3159 return -EINVAL;
3160 if (copy_from_user(&val, optval, sizeof(val)))
3161 return -EFAULT;
3162
3163 po->auxdata = !!val;
3164 return 0;
3165 }
80feaacb
PWJ
3166 case PACKET_ORIGDEV:
3167 {
3168 int val;
3169
3170 if (optlen < sizeof(val))
3171 return -EINVAL;
3172 if (copy_from_user(&val, optval, sizeof(val)))
3173 return -EFAULT;
3174
3175 po->origdev = !!val;
3176 return 0;
3177 }
bfd5f4a3
SS
3178 case PACKET_VNET_HDR:
3179 {
3180 int val;
3181
3182 if (sock->type != SOCK_RAW)
3183 return -EINVAL;
3184 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3185 return -EBUSY;
3186 if (optlen < sizeof(val))
3187 return -EINVAL;
3188 if (copy_from_user(&val, optval, sizeof(val)))
3189 return -EFAULT;
3190
3191 po->has_vnet_hdr = !!val;
3192 return 0;
3193 }
614f60fa
SM
3194 case PACKET_TIMESTAMP:
3195 {
3196 int val;
3197
3198 if (optlen != sizeof(val))
3199 return -EINVAL;
3200 if (copy_from_user(&val, optval, sizeof(val)))
3201 return -EFAULT;
3202
3203 po->tp_tstamp = val;
3204 return 0;
3205 }
dc99f600
DM
3206 case PACKET_FANOUT:
3207 {
3208 int val;
3209
3210 if (optlen != sizeof(val))
3211 return -EINVAL;
3212 if (copy_from_user(&val, optval, sizeof(val)))
3213 return -EFAULT;
3214
3215 return fanout_add(sk, val & 0xffff, val >> 16);
3216 }
1da177e4
LT
3217 default:
3218 return -ENOPROTOOPT;
3219 }
3220}
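
Tying the PACKET_VERSION and PACKET_RX_RING cases together, a userspace setup sketch might look like the following; the sizes are arbitrary example values, chosen only so that they satisfy the checks in packet_set_ring() further down.

#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	/* the version has to be chosen while no ring exists (see PACKET_VERSION above) */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0)
		return -1;

	req->tp_block_size = 1 << 16;	/* multiple of PAGE_SIZE, as packet_set_ring() requires */
	req->tp_frame_size = 1 << 11;	/* TPACKET_ALIGNMENT aligned, divides tp_block_size */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) * req->tp_block_nr;

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
}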
3221
3222static int packet_getsockopt(struct socket *sock, int level, int optname,
3223 char __user *optval, int __user *optlen)
3224{
3225 int len;
c06fff6e 3226 int val, lv = sizeof(val);
1da177e4
LT
3227 struct sock *sk = sock->sk;
3228 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3229 void *data = &val;
8dc41944 3230 struct tpacket_stats st;
f6fb8f10 3231 union tpacket_stats_u st_u;
1da177e4
LT
3232
3233 if (level != SOL_PACKET)
3234 return -ENOPROTOOPT;
3235
8ae55f04
KK
3236 if (get_user(len, optlen))
3237 return -EFAULT;
1da177e4
LT
3238
3239 if (len < 0)
3240 return -EINVAL;
1ce4f28b 3241
69e3c75f 3242 switch (optname) {
1da177e4 3243 case PACKET_STATISTICS:
1da177e4 3244 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3245 if (po->tp_version == TPACKET_V3) {
c06fff6e 3246 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3247 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3248 sizeof(struct tpacket_stats));
f6fb8f10 3249 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3250 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3251 st_u.stats3.tp_packets += po->stats.tp_drops;
3252 data = &st_u.stats3;
3253 } else {
c06fff6e 3254 lv = sizeof(struct tpacket_stats);
f6fb8f10 3255 st = po->stats;
3256 st.tp_packets += st.tp_drops;
3257 data = &st;
3258 }
1da177e4
LT
3259 memset(&po->stats, 0, sizeof(st));
3260 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3261 break;
3262 case PACKET_AUXDATA:
8dc41944 3263 val = po->auxdata;
80feaacb
PWJ
3264 break;
3265 case PACKET_ORIGDEV:
80feaacb 3266 val = po->origdev;
bfd5f4a3
SS
3267 break;
3268 case PACKET_VNET_HDR:
bfd5f4a3 3269 val = po->has_vnet_hdr;
1da177e4 3270 break;
bbd6ef87 3271 case PACKET_VERSION:
bbd6ef87 3272 val = po->tp_version;
bbd6ef87
PM
3273 break;
3274 case PACKET_HDRLEN:
3275 if (len > sizeof(int))
3276 len = sizeof(int);
3277 if (copy_from_user(&val, optval, len))
3278 return -EFAULT;
3279 switch (val) {
3280 case TPACKET_V1:
3281 val = sizeof(struct tpacket_hdr);
3282 break;
3283 case TPACKET_V2:
3284 val = sizeof(struct tpacket2_hdr);
3285 break;
f6fb8f10 3286 case TPACKET_V3:
3287 val = sizeof(struct tpacket3_hdr);
3288 break;
bbd6ef87
PM
3289 default:
3290 return -EINVAL;
3291 }
bbd6ef87 3292 break;
8913336a 3293 case PACKET_RESERVE:
8913336a 3294 val = po->tp_reserve;
8913336a 3295 break;
69e3c75f 3296 case PACKET_LOSS:
69e3c75f 3297 val = po->tp_loss;
69e3c75f 3298 break;
614f60fa 3299 case PACKET_TIMESTAMP:
614f60fa 3300 val = po->tp_tstamp;
614f60fa 3301 break;
dc99f600 3302 case PACKET_FANOUT:
dc99f600
DM
3303 val = (po->fanout ?
3304 ((u32)po->fanout->id |
3305 ((u32)po->fanout->type << 16)) :
3306 0);
dc99f600 3307 break;
1da177e4
LT
3308 default:
3309 return -ENOPROTOOPT;
3310 }
3311
c06fff6e
ED
3312 if (len > lv)
3313 len = lv;
8ae55f04
KK
3314 if (put_user(len, optlen))
3315 return -EFAULT;
8dc41944
HX
3316 if (copy_to_user(optval, data, len))
3317 return -EFAULT;
8ae55f04 3318 return 0;
1da177e4
LT
3319}
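
Reading PACKET_STATISTICS from userspace is symmetric; a short sketch assuming the V1/V2 layout:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	/* counters are cleared on read, and tp_packets already includes tp_drops */
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets: %u, drops: %u\n", st.tp_packets, st.tp_drops);
}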
3320
3321
3322static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3323{
3324 struct sock *sk;
3325 struct hlist_node *node;
ad930650 3326 struct net_device *dev = data;
c346dca1 3327 struct net *net = dev_net(dev);
1da177e4 3328
808f5114 3329 rcu_read_lock();
3330 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3331 struct packet_sock *po = pkt_sk(sk);
3332
3333 switch (msg) {
3334 case NETDEV_UNREGISTER:
1da177e4
LT
3335 if (po->mclist)
3336 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3337 /* fallthrough */
3338
1da177e4
LT
3339 case NETDEV_DOWN:
3340 if (dev->ifindex == po->ifindex) {
3341 spin_lock(&po->bind_lock);
3342 if (po->running) {
ce06b03e 3343 __unregister_prot_hook(sk, false);
1da177e4
LT
3344 sk->sk_err = ENETDOWN;
3345 if (!sock_flag(sk, SOCK_DEAD))
3346 sk->sk_error_report(sk);
3347 }
3348 if (msg == NETDEV_UNREGISTER) {
3349 po->ifindex = -1;
160ff18a
BG
3350 if (po->prot_hook.dev)
3351 dev_put(po->prot_hook.dev);
1da177e4
LT
3352 po->prot_hook.dev = NULL;
3353 }
3354 spin_unlock(&po->bind_lock);
3355 }
3356 break;
3357 case NETDEV_UP:
808f5114 3358 if (dev->ifindex == po->ifindex) {
3359 spin_lock(&po->bind_lock);
ce06b03e
DM
3360 if (po->num)
3361 register_prot_hook(sk);
808f5114 3362 spin_unlock(&po->bind_lock);
1da177e4 3363 }
1da177e4
LT
3364 break;
3365 }
3366 }
808f5114 3367 rcu_read_unlock();
1da177e4
LT
3368 return NOTIFY_DONE;
3369}
3370
3371
3372static int packet_ioctl(struct socket *sock, unsigned int cmd,
3373 unsigned long arg)
3374{
3375 struct sock *sk = sock->sk;
3376
69e3c75f 3377 switch (cmd) {
40d4e3df
ED
3378 case SIOCOUTQ:
3379 {
3380 int amount = sk_wmem_alloc_get(sk);
31e6d363 3381
40d4e3df
ED
3382 return put_user(amount, (int __user *)arg);
3383 }
3384 case SIOCINQ:
3385 {
3386 struct sk_buff *skb;
3387 int amount = 0;
3388
3389 spin_lock_bh(&sk->sk_receive_queue.lock);
3390 skb = skb_peek(&sk->sk_receive_queue);
3391 if (skb)
3392 amount = skb->len;
3393 spin_unlock_bh(&sk->sk_receive_queue.lock);
3394 return put_user(amount, (int __user *)arg);
3395 }
3396 case SIOCGSTAMP:
3397 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3398 case SIOCGSTAMPNS:
3399 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3400
1da177e4 3401#ifdef CONFIG_INET
40d4e3df
ED
3402 case SIOCADDRT:
3403 case SIOCDELRT:
3404 case SIOCDARP:
3405 case SIOCGARP:
3406 case SIOCSARP:
3407 case SIOCGIFADDR:
3408 case SIOCSIFADDR:
3409 case SIOCGIFBRDADDR:
3410 case SIOCSIFBRDADDR:
3411 case SIOCGIFNETMASK:
3412 case SIOCSIFNETMASK:
3413 case SIOCGIFDSTADDR:
3414 case SIOCSIFDSTADDR:
3415 case SIOCSIFFLAGS:
40d4e3df 3416 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3417#endif
3418
40d4e3df
ED
3419 default:
3420 return -ENOIOCTLCMD;
1da177e4
LT
3421 }
3422 return 0;
3423}
3424
40d4e3df 3425static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3426 poll_table *wait)
3427{
3428 struct sock *sk = sock->sk;
3429 struct packet_sock *po = pkt_sk(sk);
3430 unsigned int mask = datagram_poll(file, sock, wait);
3431
3432 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3433 if (po->rx_ring.pg_vec) {
f6fb8f10 3434 if (!packet_previous_rx_frame(po, &po->rx_ring,
3435 TP_STATUS_KERNEL))
1da177e4
LT
3436 mask |= POLLIN | POLLRDNORM;
3437 }
3438 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3439 spin_lock_bh(&sk->sk_write_queue.lock);
3440 if (po->tx_ring.pg_vec) {
3441 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3442 mask |= POLLOUT | POLLWRNORM;
3443 }
3444 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3445 return mask;
3446}
3447
3448
3449/* Dirty? Well, I still have not found a better way to account
 3450 * for user mmaps.
3451 */
3452
3453static void packet_mm_open(struct vm_area_struct *vma)
3454{
3455 struct file *file = vma->vm_file;
40d4e3df 3456 struct socket *sock = file->private_data;
1da177e4 3457 struct sock *sk = sock->sk;
1ce4f28b 3458
1da177e4
LT
3459 if (sk)
3460 atomic_inc(&pkt_sk(sk)->mapped);
3461}
3462
3463static void packet_mm_close(struct vm_area_struct *vma)
3464{
3465 struct file *file = vma->vm_file;
40d4e3df 3466 struct socket *sock = file->private_data;
1da177e4 3467 struct sock *sk = sock->sk;
1ce4f28b 3468
1da177e4
LT
3469 if (sk)
3470 atomic_dec(&pkt_sk(sk)->mapped);
3471}
3472
f0f37e2f 3473static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3474 .open = packet_mm_open,
3475 .close = packet_mm_close,
1da177e4
LT
3476};
3477
0e3125c7
NH
3478static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3479 unsigned int len)
1da177e4
LT
3480{
3481 int i;
3482
4ebf0ae2 3483 for (i = 0; i < len; i++) {
0e3125c7 3484 if (likely(pg_vec[i].buffer)) {
c56b4d90 3485 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3486 vfree(pg_vec[i].buffer);
3487 else
3488 free_pages((unsigned long)pg_vec[i].buffer,
3489 order);
3490 pg_vec[i].buffer = NULL;
3491 }
1da177e4
LT
3492 }
3493 kfree(pg_vec);
3494}
3495
eea49cc9 3496static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3497{
0e3125c7
NH
3498 char *buffer = NULL;
3499 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3500 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3501
3502 buffer = (char *) __get_free_pages(gfp_flags, order);
3503
3504 if (buffer)
3505 return buffer;
3506
3507 /*
3508 * __get_free_pages failed, fall back to vmalloc
3509 */
bbce5a59 3510 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3511
0e3125c7
NH
3512 if (buffer)
3513 return buffer;
3514
3515 /*
 3516 * vmalloc failed, let's dig into swap here
3517 */
0e3125c7
NH
3518 gfp_flags &= ~__GFP_NORETRY;
3519 buffer = (char *)__get_free_pages(gfp_flags, order);
3520 if (buffer)
3521 return buffer;
3522
3523 /*
3524 * complete and utter failure
3525 */
3526 return NULL;
4ebf0ae2
DM
3527}
3528
0e3125c7 3529static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3530{
3531 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3532 struct pgv *pg_vec;
4ebf0ae2
DM
3533 int i;
3534
0e3125c7 3535 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3536 if (unlikely(!pg_vec))
3537 goto out;
3538
3539 for (i = 0; i < block_nr; i++) {
c56b4d90 3540 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3541 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3542 goto out_free_pgvec;
3543 }
3544
3545out:
3546 return pg_vec;
3547
3548out_free_pgvec:
3549 free_pg_vec(pg_vec, order, block_nr);
3550 pg_vec = NULL;
3551 goto out;
3552}
1da177e4 3553
f6fb8f10 3554static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3555 int closing, int tx_ring)
1da177e4 3556{
0e3125c7 3557 struct pgv *pg_vec = NULL;
1da177e4 3558 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3559 int was_running, order = 0;
69e3c75f
JB
3560 struct packet_ring_buffer *rb;
3561 struct sk_buff_head *rb_queue;
0e11c91e 3562 __be16 num;
f6fb8f10 3563 int err = -EINVAL;
3564 /* Added to avoid minimal code churn */
3565 struct tpacket_req *req = &req_u->req;
3566
3567 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3568 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3569 WARN(1, "Tx-ring is not supported.\n");
3570 goto out;
3571 }
1ce4f28b 3572
69e3c75f
JB
3573 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3574 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3575
69e3c75f
JB
3576 err = -EBUSY;
3577 if (!closing) {
3578 if (atomic_read(&po->mapped))
3579 goto out;
3580 if (atomic_read(&rb->pending))
3581 goto out;
3582 }
1da177e4 3583
69e3c75f
JB
3584 if (req->tp_block_nr) {
3585 /* Sanity tests and some calculations */
3586 err = -EBUSY;
3587 if (unlikely(rb->pg_vec))
3588 goto out;
1da177e4 3589
bbd6ef87
PM
3590 switch (po->tp_version) {
3591 case TPACKET_V1:
3592 po->tp_hdrlen = TPACKET_HDRLEN;
3593 break;
3594 case TPACKET_V2:
3595 po->tp_hdrlen = TPACKET2_HDRLEN;
3596 break;
f6fb8f10 3597 case TPACKET_V3:
3598 po->tp_hdrlen = TPACKET3_HDRLEN;
3599 break;
bbd6ef87
PM
3600 }
3601
69e3c75f 3602 err = -EINVAL;
4ebf0ae2 3603 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3604 goto out;
4ebf0ae2 3605 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3606 goto out;
8913336a 3607 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3608 po->tp_reserve))
3609 goto out;
4ebf0ae2 3610 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3611 goto out;
1da177e4 3612
69e3c75f
JB
3613 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3614 if (unlikely(rb->frames_per_block <= 0))
3615 goto out;
3616 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3617 req->tp_frame_nr))
3618 goto out;
1da177e4
LT
3619
3620 err = -ENOMEM;
4ebf0ae2
DM
3621 order = get_order(req->tp_block_size);
3622 pg_vec = alloc_pg_vec(req, order);
3623 if (unlikely(!pg_vec))
1da177e4 3624 goto out;
f6fb8f10 3625 switch (po->tp_version) {
3626 case TPACKET_V3:
3627 /* Transmit path is not supported. We checked
3628 * it above but just being paranoid
3629 */
3630 if (!tx_ring)
3631 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3632 break;
3633 default:
3634 break;
3635 }
69e3c75f
JB
3636 }
3637 /* Done */
3638 else {
3639 err = -EINVAL;
4ebf0ae2 3640 if (unlikely(req->tp_frame_nr))
69e3c75f 3641 goto out;
1da177e4
LT
3642 }
3643
3644 lock_sock(sk);
3645
3646 /* Detach socket from network */
3647 spin_lock(&po->bind_lock);
3648 was_running = po->running;
3649 num = po->num;
3650 if (was_running) {
1da177e4 3651 po->num = 0;
ce06b03e 3652 __unregister_prot_hook(sk, false);
1da177e4
LT
3653 }
3654 spin_unlock(&po->bind_lock);
1ce4f28b 3655
1da177e4
LT
3656 synchronize_net();
3657
3658 err = -EBUSY;
905db440 3659 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3660 if (closing || atomic_read(&po->mapped) == 0) {
3661 err = 0;
69e3c75f 3662 spin_lock_bh(&rb_queue->lock);
c053fd96 3663 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3664 rb->frame_max = (req->tp_frame_nr - 1);
3665 rb->head = 0;
3666 rb->frame_size = req->tp_frame_size;
3667 spin_unlock_bh(&rb_queue->lock);
3668
c053fd96
CG
3669 swap(rb->pg_vec_order, order);
3670 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3671
3672 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3673 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3674 tpacket_rcv : packet_rcv;
3675 skb_queue_purge(rb_queue);
1da177e4 3676 if (atomic_read(&po->mapped))
40d4e3df
ED
3677 pr_err("packet_mmap: vma is busy: %d\n",
3678 atomic_read(&po->mapped));
1da177e4 3679 }
905db440 3680 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3681
3682 spin_lock(&po->bind_lock);
ce06b03e 3683 if (was_running) {
1da177e4 3684 po->num = num;
ce06b03e 3685 register_prot_hook(sk);
1da177e4
LT
3686 }
3687 spin_unlock(&po->bind_lock);
f6fb8f10 3688 if (closing && (po->tp_version > TPACKET_V2)) {
3689 /* Because we don't support block-based V3 on tx-ring */
3690 if (!tx_ring)
3691 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3692 }
1da177e4
LT
3693 release_sock(sk);
3694
1da177e4
LT
3695 if (pg_vec)
3696 free_pg_vec(pg_vec, order, req->tp_block_nr);
3697out:
3698 return err;
3699}
3700
69e3c75f
JB
3701static int packet_mmap(struct file *file, struct socket *sock,
3702 struct vm_area_struct *vma)
1da177e4
LT
3703{
3704 struct sock *sk = sock->sk;
3705 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3706 unsigned long size, expected_size;
3707 struct packet_ring_buffer *rb;
1da177e4
LT
3708 unsigned long start;
3709 int err = -EINVAL;
3710 int i;
3711
3712 if (vma->vm_pgoff)
3713 return -EINVAL;
3714
905db440 3715 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3716
3717 expected_size = 0;
3718 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3719 if (rb->pg_vec) {
3720 expected_size += rb->pg_vec_len
3721 * rb->pg_vec_pages
3722 * PAGE_SIZE;
3723 }
3724 }
3725
3726 if (expected_size == 0)
1da177e4 3727 goto out;
69e3c75f
JB
3728
3729 size = vma->vm_end - vma->vm_start;
3730 if (size != expected_size)
1da177e4
LT
3731 goto out;
3732
1da177e4 3733 start = vma->vm_start;
69e3c75f
JB
3734 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3735 if (rb->pg_vec == NULL)
3736 continue;
3737
3738 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3739 struct page *page;
3740 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3741 int pg_num;
3742
c56b4d90
CG
3743 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3744 page = pgv_to_page(kaddr);
69e3c75f
JB
3745 err = vm_insert_page(vma, start, page);
3746 if (unlikely(err))
3747 goto out;
3748 start += PAGE_SIZE;
0e3125c7 3749 kaddr += PAGE_SIZE;
69e3c75f 3750 }
4ebf0ae2 3751 }
1da177e4 3752 }
69e3c75f 3753
4ebf0ae2 3754 atomic_inc(&po->mapped);
1da177e4
LT
3755 vma->vm_ops = &packet_mmap_ops;
3756 err = 0;
3757
3758out:
905db440 3759 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3760 return err;
3761}
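
The matching userspace mmap() call might look like the following sketch; rx and tx are the tpacket_req structures used when the rings were configured (tx may be NULL if no TX ring exists).

#include <stddef.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *map_rings(int fd, const struct tpacket_req *rx, const struct tpacket_req *tx)
{
	size_t len = (size_t)rx->tp_block_size * rx->tp_block_nr;

	if (tx)
		len += (size_t)tx->tp_block_size * tx->tp_block_nr;

	/* len has to match expected_size above; the RX ring is mapped first,
	 * the TX ring (if configured) follows directly after it */
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}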
1da177e4 3762
90ddc4f0 3763static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3764 .family = PF_PACKET,
3765 .owner = THIS_MODULE,
3766 .release = packet_release,
3767 .bind = packet_bind_spkt,
3768 .connect = sock_no_connect,
3769 .socketpair = sock_no_socketpair,
3770 .accept = sock_no_accept,
3771 .getname = packet_getname_spkt,
3772 .poll = datagram_poll,
3773 .ioctl = packet_ioctl,
3774 .listen = sock_no_listen,
3775 .shutdown = sock_no_shutdown,
3776 .setsockopt = sock_no_setsockopt,
3777 .getsockopt = sock_no_getsockopt,
3778 .sendmsg = packet_sendmsg_spkt,
3779 .recvmsg = packet_recvmsg,
3780 .mmap = sock_no_mmap,
3781 .sendpage = sock_no_sendpage,
3782};
1da177e4 3783
90ddc4f0 3784static const struct proto_ops packet_ops = {
1da177e4
LT
3785 .family = PF_PACKET,
3786 .owner = THIS_MODULE,
3787 .release = packet_release,
3788 .bind = packet_bind,
3789 .connect = sock_no_connect,
3790 .socketpair = sock_no_socketpair,
3791 .accept = sock_no_accept,
1ce4f28b 3792 .getname = packet_getname,
1da177e4
LT
3793 .poll = packet_poll,
3794 .ioctl = packet_ioctl,
3795 .listen = sock_no_listen,
3796 .shutdown = sock_no_shutdown,
3797 .setsockopt = packet_setsockopt,
3798 .getsockopt = packet_getsockopt,
3799 .sendmsg = packet_sendmsg,
3800 .recvmsg = packet_recvmsg,
3801 .mmap = packet_mmap,
3802 .sendpage = sock_no_sendpage,
3803};
3804
ec1b4cf7 3805static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3806 .family = PF_PACKET,
3807 .create = packet_create,
3808 .owner = THIS_MODULE,
3809};
3810
3811static struct notifier_block packet_netdev_notifier = {
40d4e3df 3812 .notifier_call = packet_notifier,
1da177e4
LT
3813};
3814
3815#ifdef CONFIG_PROC_FS
1da177e4
LT
3816
3817static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3818 __acquires(RCU)
1da177e4 3819{
e372c414 3820 struct net *net = seq_file_net(seq);
808f5114 3821
3822 rcu_read_lock();
3823 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3824}
3825
3826static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3827{
1bf40954 3828 struct net *net = seq_file_net(seq);
808f5114 3829 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3830}
3831
3832static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3833 __releases(RCU)
1da177e4 3834{
808f5114 3835 rcu_read_unlock();
1da177e4
LT
3836}
3837
1ce4f28b 3838static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3839{
3840 if (v == SEQ_START_TOKEN)
3841 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3842 else {
b7ceabd9 3843 struct sock *s = sk_entry(v);
1da177e4
LT
3844 const struct packet_sock *po = pkt_sk(s);
3845
3846 seq_printf(seq,
71338aa7 3847 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3848 s,
3849 atomic_read(&s->sk_refcnt),
3850 s->sk_type,
3851 ntohs(po->num),
3852 po->ifindex,
3853 po->running,
3854 atomic_read(&s->sk_rmem_alloc),
3855 sock_i_uid(s),
40d4e3df 3856 sock_i_ino(s));
1da177e4
LT
3857 }
3858
3859 return 0;
3860}
3861
56b3d975 3862static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3863 .start = packet_seq_start,
3864 .next = packet_seq_next,
3865 .stop = packet_seq_stop,
3866 .show = packet_seq_show,
3867};
3868
3869static int packet_seq_open(struct inode *inode, struct file *file)
3870{
e372c414
DL
3871 return seq_open_net(inode, file, &packet_seq_ops,
3872 sizeof(struct seq_net_private));
1da177e4
LT
3873}
3874
da7071d7 3875static const struct file_operations packet_seq_fops = {
1da177e4
LT
3876 .owner = THIS_MODULE,
3877 .open = packet_seq_open,
3878 .read = seq_read,
3879 .llseek = seq_lseek,
e372c414 3880 .release = seq_release_net,
1da177e4
LT
3881};
3882
3883#endif
3884
2c8c1e72 3885static int __net_init packet_net_init(struct net *net)
d12d01d6 3886{
808f5114 3887 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3888 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3889
3890 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3891 return -ENOMEM;
3892
3893 return 0;
3894}
3895
2c8c1e72 3896static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3897{
3898 proc_net_remove(net, "packet");
3899}
3900
3901static struct pernet_operations packet_net_ops = {
3902 .init = packet_net_init,
3903 .exit = packet_net_exit,
3904};
3905
3906
1da177e4
LT
3907static void __exit packet_exit(void)
3908{
1da177e4 3909 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3910 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3911 sock_unregister(PF_PACKET);
3912 proto_unregister(&packet_proto);
3913}
3914
3915static int __init packet_init(void)
3916{
3917 int rc = proto_register(&packet_proto, 0);
3918
3919 if (rc != 0)
3920 goto out;
3921
3922 sock_register(&packet_family_ops);
d12d01d6 3923 register_pernet_subsys(&packet_net_ops);
1da177e4 3924 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3925out:
3926 return rc;
3927}
3928
3929module_init(packet_init);
3930module_exit(packet_exit);
3931MODULE_LICENSE("GPL");
3932MODULE_ALIAS_NETPROTO(PF_PACKET);