net: phy: mdio: add missing __iomem annotation
[deliverable/linux.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
0f75b09c 91#include <linux/if_arp.h>
1da177e4
LT
92
93#ifdef CONFIG_INET
94#include <net/inet_common.h>
95#endif
96
2787b04b
PE
97#include "internal.h"
98
1da177e4
LT
99/*
100 Assumptions:
101 - if device has no dev->hard_header routine, it adds and removes ll header
102 inside itself. In this case ll header is invisible outside of device,
103 but higher levels still should reserve dev->hard_header_len.
104 Some devices are enough clever to reallocate skb, when header
105 will not fit to reserved space (tunnel), another ones are silly
106 (PPP).
107 - packet socket receives packets with pulled ll header,
108 so that SOCK_RAW should push it back.
109
110On receive:
111-----------
112
113Incoming, dev->hard_header!=NULL
b0e380b1
ACM
114 mac_header -> ll header
115 data -> data
1da177e4
LT
116
117Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
118 mac_header -> ll header
119 data -> ll header
1da177e4
LT
120
121Incoming, dev->hard_header==NULL
b0e380b1
ACM
122 mac_header -> UNKNOWN position. It is very likely, that it points to ll
123 header. PPP makes it, that is wrong, because introduce
db0c58f9 124 assymetry between rx and tx paths.
b0e380b1 125 data -> data
1da177e4
LT
126
127Outgoing, dev->hard_header==NULL
b0e380b1
ACM
128 mac_header -> data. ll header is still not built!
129 data -> data
1da177e4
LT
130
131Resume
132 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
133
134
135On transmit:
136------------
137
138dev->hard_header != NULL
b0e380b1
ACM
139 mac_header -> ll header
140 data -> ll header
1da177e4
LT
141
142dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
143 mac_header -> data
144 data -> data
1da177e4
LT
145
146 We should set nh.raw on output to correct posistion,
147 packet classifier depends on it.
148 */
149
1da177e4
LT
150/* Private packet socket structures. */
151
0fb375fb
EB
152/* identical to struct packet_mreq except it has
153 * a longer address field.
154 */
40d4e3df 155struct packet_mreq_max {
0fb375fb
EB
156 int mr_ifindex;
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 160};
a2efcfa0 161
184f489e
DB
162union tpacket_uhdr {
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
166 void *raw;
167};
168
f6fb8f10 169static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
170 int closing, int tx_ring);
171
f6fb8f10 172#define V3_ALIGNMENT (8)
173
bc59ba39 174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
f6fb8f10 179#define PGV_FROM_VMALLOC 1
69e3c75f 180
f6fb8f10 181#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
182#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
183#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
184#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
188
69e3c75f
JB
189struct packet_sock;
190static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
191static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
192 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 193
f6fb8f10 194static void *packet_previous_frame(struct packet_sock *po,
195 struct packet_ring_buffer *rb,
196 int status);
197static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 198static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
199 struct tpacket_block_desc *);
200static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 201 struct packet_sock *);
bc59ba39 202static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 203 struct packet_sock *, unsigned int status);
bc59ba39 204static int prb_queue_frozen(struct tpacket_kbdq_core *);
205static void prb_open_block(struct tpacket_kbdq_core *,
206 struct tpacket_block_desc *);
f6fb8f10 207static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 208static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
209static void prb_init_blk_timer(struct packet_sock *,
210 struct tpacket_kbdq_core *,
211 void (*func) (unsigned long));
212static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
213static void prb_clear_rxhash(struct tpacket_kbdq_core *,
214 struct tpacket3_hdr *);
215static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
216 struct tpacket3_hdr *);
1da177e4
LT
217static void packet_flush_mclist(struct sock *sk);
218
ffbc6111
HX
219struct packet_skb_cb {
220 unsigned int origlen;
221 union {
222 struct sockaddr_pkt pkt;
223 struct sockaddr_ll ll;
224 } sa;
225};
226
227#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 228
bc59ba39 229#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 230#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 231 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 232#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 233 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 234#define GET_NEXT_PRB_BLK_NUM(x) \
235 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
236 ((x)->kactive_blk_num+1) : 0)
237
dc99f600
DM
238static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
239static void __fanout_link(struct sock *sk, struct packet_sock *po);
240
ce06b03e
DM
241/* register_prot_hook must be invoked with the po->bind_lock held,
242 * or from a context in which asynchronous accesses to the packet
243 * socket is not possible (packet_create()).
244 */
245static void register_prot_hook(struct sock *sk)
246{
247 struct packet_sock *po = pkt_sk(sk);
248 if (!po->running) {
dc99f600
DM
249 if (po->fanout)
250 __fanout_link(sk, po);
251 else
252 dev_add_pack(&po->prot_hook);
ce06b03e
DM
253 sock_hold(sk);
254 po->running = 1;
255 }
256}
257
258/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
259 * held. If the sync parameter is true, we will temporarily drop
260 * the po->bind_lock and do a synchronize_net to make sure no
261 * asynchronous packet processing paths still refer to the elements
262 * of po->prot_hook. If the sync parameter is false, it is the
263 * callers responsibility to take care of this.
264 */
265static void __unregister_prot_hook(struct sock *sk, bool sync)
266{
267 struct packet_sock *po = pkt_sk(sk);
268
269 po->running = 0;
dc99f600
DM
270 if (po->fanout)
271 __fanout_unlink(sk, po);
272 else
273 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
274 __sock_put(sk);
275
276 if (sync) {
277 spin_unlock(&po->bind_lock);
278 synchronize_net();
279 spin_lock(&po->bind_lock);
280 }
281}
282
283static void unregister_prot_hook(struct sock *sk, bool sync)
284{
285 struct packet_sock *po = pkt_sk(sk);
286
287 if (po->running)
288 __unregister_prot_hook(sk, sync);
289}
290
f6dafa95 291static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
292{
293 if (is_vmalloc_addr(addr))
294 return vmalloc_to_page(addr);
295 return virt_to_page(addr);
296}
297
69e3c75f 298static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 299{
184f489e 300 union tpacket_uhdr h;
1da177e4 301
69e3c75f 302 h.raw = frame;
bbd6ef87
PM
303 switch (po->tp_version) {
304 case TPACKET_V1:
69e3c75f 305 h.h1->tp_status = status;
0af55bb5 306 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
307 break;
308 case TPACKET_V2:
69e3c75f 309 h.h2->tp_status = status;
0af55bb5 310 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 311 break;
f6fb8f10 312 case TPACKET_V3:
69e3c75f 313 default:
f6fb8f10 314 WARN(1, "TPACKET version not supported.\n");
69e3c75f 315 BUG();
bbd6ef87 316 }
69e3c75f
JB
317
318 smp_wmb();
bbd6ef87
PM
319}
320
69e3c75f 321static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 322{
184f489e 323 union tpacket_uhdr h;
bbd6ef87 324
69e3c75f
JB
325 smp_rmb();
326
bbd6ef87
PM
327 h.raw = frame;
328 switch (po->tp_version) {
329 case TPACKET_V1:
0af55bb5 330 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 331 return h.h1->tp_status;
bbd6ef87 332 case TPACKET_V2:
0af55bb5 333 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 334 return h.h2->tp_status;
f6fb8f10 335 case TPACKET_V3:
69e3c75f 336 default:
f6fb8f10 337 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
338 BUG();
339 return 0;
bbd6ef87 340 }
1da177e4 341}
69e3c75f 342
b9c32fb2
DB
343static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
344 unsigned int flags)
7a51384c
DB
345{
346 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
347
348 if (shhwtstamps) {
349 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
350 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 351 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
352 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
353 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 354 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
355 }
356
357 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 358 return TP_STATUS_TS_SOFTWARE;
7a51384c 359
b9c32fb2 360 return 0;
7a51384c
DB
361}
362
b9c32fb2
DB
363static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
364 struct sk_buff *skb)
2e31396f
WB
365{
366 union tpacket_uhdr h;
367 struct timespec ts;
b9c32fb2 368 __u32 ts_status;
2e31396f 369
b9c32fb2
DB
370 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
371 return 0;
2e31396f
WB
372
373 h.raw = frame;
374 switch (po->tp_version) {
375 case TPACKET_V1:
376 h.h1->tp_sec = ts.tv_sec;
377 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
378 break;
379 case TPACKET_V2:
380 h.h2->tp_sec = ts.tv_sec;
381 h.h2->tp_nsec = ts.tv_nsec;
382 break;
383 case TPACKET_V3:
384 default:
385 WARN(1, "TPACKET version not supported.\n");
386 BUG();
387 }
388
389 /* one flush is safe, as both fields always lie on the same cacheline */
390 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
391 smp_wmb();
b9c32fb2
DB
392
393 return ts_status;
2e31396f
WB
394}
395
69e3c75f
JB
396static void *packet_lookup_frame(struct packet_sock *po,
397 struct packet_ring_buffer *rb,
398 unsigned int position,
399 int status)
400{
401 unsigned int pg_vec_pos, frame_offset;
184f489e 402 union tpacket_uhdr h;
69e3c75f
JB
403
404 pg_vec_pos = position / rb->frames_per_block;
405 frame_offset = position % rb->frames_per_block;
406
0e3125c7
NH
407 h.raw = rb->pg_vec[pg_vec_pos].buffer +
408 (frame_offset * rb->frame_size);
69e3c75f
JB
409
410 if (status != __packet_get_status(po, h.raw))
411 return NULL;
412
413 return h.raw;
414}
415
eea49cc9 416static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
417 struct packet_ring_buffer *rb,
418 int status)
419{
420 return packet_lookup_frame(po, rb, rb->head, status);
421}
422
bc59ba39 423static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 424{
425 del_timer_sync(&pkc->retire_blk_timer);
426}
427
428static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
429 int tx_ring,
430 struct sk_buff_head *rb_queue)
431{
bc59ba39 432 struct tpacket_kbdq_core *pkc;
f6fb8f10 433
434 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
435
436 spin_lock(&rb_queue->lock);
437 pkc->delete_blk_timer = 1;
438 spin_unlock(&rb_queue->lock);
439
440 prb_del_retire_blk_timer(pkc);
441}
442
443static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 444 struct tpacket_kbdq_core *pkc,
f6fb8f10 445 void (*func) (unsigned long))
446{
447 init_timer(&pkc->retire_blk_timer);
448 pkc->retire_blk_timer.data = (long)po;
449 pkc->retire_blk_timer.function = func;
450 pkc->retire_blk_timer.expires = jiffies;
451}
452
453static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
454{
bc59ba39 455 struct tpacket_kbdq_core *pkc;
f6fb8f10 456
457 if (tx_ring)
458 BUG();
459
460 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
461 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
462}
463
464static int prb_calc_retire_blk_tmo(struct packet_sock *po,
465 int blk_size_in_bytes)
466{
467 struct net_device *dev;
468 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
469 struct ethtool_cmd ecmd;
470 int err;
e440cf2c 471 u32 speed;
f6fb8f10 472
4bc71cb9
JP
473 rtnl_lock();
474 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
475 if (unlikely(!dev)) {
476 rtnl_unlock();
f6fb8f10 477 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
478 }
479 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 480 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
481 rtnl_unlock();
482 if (!err) {
4bc71cb9
JP
483 /*
484 * If the link speed is so slow you don't really
485 * need to worry about perf anyways
486 */
e440cf2c 487 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 488 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 489 } else {
490 msec = 1;
491 div = speed / 1000;
f6fb8f10 492 }
493 }
494
495 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
496
497 if (div)
498 mbits /= div;
499
500 tmo = mbits * msec;
501
502 if (div)
503 return tmo+1;
504 return tmo;
505}
506
bc59ba39 507static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 508 union tpacket_req_u *req_u)
509{
510 p1->feature_req_word = req_u->req3.tp_feature_req_word;
511}
512
513static void init_prb_bdqc(struct packet_sock *po,
514 struct packet_ring_buffer *rb,
515 struct pgv *pg_vec,
516 union tpacket_req_u *req_u, int tx_ring)
517{
bc59ba39 518 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
519 struct tpacket_block_desc *pbd;
f6fb8f10 520
521 memset(p1, 0x0, sizeof(*p1));
522
523 p1->knxt_seq_num = 1;
524 p1->pkbdq = pg_vec;
bc59ba39 525 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 526 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 527 p1->kblk_size = req_u->req3.tp_block_size;
528 p1->knum_blocks = req_u->req3.tp_block_nr;
529 p1->hdrlen = po->tp_hdrlen;
530 p1->version = po->tp_version;
531 p1->last_kactive_blk_num = 0;
ee80fbf3 532 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 533 if (req_u->req3.tp_retire_blk_tov)
534 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
535 else
536 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
537 req_u->req3.tp_block_size);
538 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
539 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
540
541 prb_init_ft_ops(p1, req_u);
542 prb_setup_retire_blk_timer(po, tx_ring);
543 prb_open_block(p1, pbd);
544}
545
546/* Do NOT update the last_blk_num first.
547 * Assumes sk_buff_head lock is held.
548 */
bc59ba39 549static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 550{
551 mod_timer(&pkc->retire_blk_timer,
552 jiffies + pkc->tov_in_jiffies);
553 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
554}
555
556/*
557 * Timer logic:
558 * 1) We refresh the timer only when we open a block.
559 * By doing this we don't waste cycles refreshing the timer
560 * on packet-by-packet basis.
561 *
562 * With a 1MB block-size, on a 1Gbps line, it will take
563 * i) ~8 ms to fill a block + ii) memcpy etc.
564 * In this cut we are not accounting for the memcpy time.
565 *
566 * So, if the user sets the 'tmo' to 10ms then the timer
567 * will never fire while the block is still getting filled
568 * (which is what we want). However, the user could choose
569 * to close a block early and that's fine.
570 *
571 * But when the timer does fire, we check whether or not to refresh it.
572 * Since the tmo granularity is in msecs, it is not too expensive
573 * to refresh the timer, lets say every '8' msecs.
574 * Either the user can set the 'tmo' or we can derive it based on
575 * a) line-speed and b) block-size.
576 * prb_calc_retire_blk_tmo() calculates the tmo.
577 *
578 */
579static void prb_retire_rx_blk_timer_expired(unsigned long data)
580{
581 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 582 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 583 unsigned int frozen;
bc59ba39 584 struct tpacket_block_desc *pbd;
f6fb8f10 585
586 spin_lock(&po->sk.sk_receive_queue.lock);
587
588 frozen = prb_queue_frozen(pkc);
589 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
590
591 if (unlikely(pkc->delete_blk_timer))
592 goto out;
593
594 /* We only need to plug the race when the block is partially filled.
595 * tpacket_rcv:
596 * lock(); increment BLOCK_NUM_PKTS; unlock()
597 * copy_bits() is in progress ...
598 * timer fires on other cpu:
599 * we can't retire the current block because copy_bits
600 * is in progress.
601 *
602 */
603 if (BLOCK_NUM_PKTS(pbd)) {
604 while (atomic_read(&pkc->blk_fill_in_prog)) {
605 /* Waiting for skb_copy_bits to finish... */
606 cpu_relax();
607 }
608 }
609
610 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
611 if (!frozen) {
612 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
613 if (!prb_dispatch_next_block(pkc, po))
614 goto refresh_timer;
615 else
616 goto out;
617 } else {
618 /* Case 1. Queue was frozen because user-space was
619 * lagging behind.
620 */
621 if (prb_curr_blk_in_use(pkc, pbd)) {
622 /*
623 * Ok, user-space is still behind.
624 * So just refresh the timer.
625 */
626 goto refresh_timer;
627 } else {
628 /* Case 2. queue was frozen,user-space caught up,
629 * now the link went idle && the timer fired.
630 * We don't have a block to close.So we open this
631 * block and restart the timer.
632 * opening a block thaws the queue,restarts timer
633 * Thawing/timer-refresh is a side effect.
634 */
635 prb_open_block(pkc, pbd);
636 goto out;
637 }
638 }
639 }
640
641refresh_timer:
642 _prb_refresh_rx_retire_blk_timer(pkc);
643
644out:
645 spin_unlock(&po->sk.sk_receive_queue.lock);
646}
647
eea49cc9 648static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 649 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 650{
651 /* Flush everything minus the block header */
652
653#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
654 u8 *start, *end;
655
656 start = (u8 *)pbd1;
657
658 /* Skip the block header(we know header WILL fit in 4K) */
659 start += PAGE_SIZE;
660
661 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
662 for (; start < end; start += PAGE_SIZE)
663 flush_dcache_page(pgv_to_page(start));
664
665 smp_wmb();
666#endif
667
668 /* Now update the block status. */
669
670 BLOCK_STATUS(pbd1) = status;
671
672 /* Flush the block header */
673
674#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
675 start = (u8 *)pbd1;
676 flush_dcache_page(pgv_to_page(start));
677
678 smp_wmb();
679#endif
680}
681
682/*
683 * Side effect:
684 *
685 * 1) flush the block
686 * 2) Increment active_blk_num
687 *
688 * Note:We DONT refresh the timer on purpose.
689 * Because almost always the next block will be opened.
690 */
bc59ba39 691static void prb_close_block(struct tpacket_kbdq_core *pkc1,
692 struct tpacket_block_desc *pbd1,
f6fb8f10 693 struct packet_sock *po, unsigned int stat)
694{
695 __u32 status = TP_STATUS_USER | stat;
696
697 struct tpacket3_hdr *last_pkt;
bc59ba39 698 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 699
ee80fbf3 700 if (po->stats.stats3.tp_drops)
f6fb8f10 701 status |= TP_STATUS_LOSING;
702
703 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
704 last_pkt->tp_next_offset = 0;
705
706 /* Get the ts of the last pkt */
707 if (BLOCK_NUM_PKTS(pbd1)) {
708 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
709 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
710 } else {
711 /* Ok, we tmo'd - so get the current time */
712 struct timespec ts;
713 getnstimeofday(&ts);
714 h1->ts_last_pkt.ts_sec = ts.tv_sec;
715 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
716 }
717
718 smp_wmb();
719
720 /* Flush the block */
721 prb_flush_block(pkc1, pbd1, status);
722
723 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
724}
725
eea49cc9 726static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 727{
728 pkc->reset_pending_on_curr_blk = 0;
729}
730
731/*
732 * Side effect of opening a block:
733 *
734 * 1) prb_queue is thawed.
735 * 2) retire_blk_timer is refreshed.
736 *
737 */
bc59ba39 738static void prb_open_block(struct tpacket_kbdq_core *pkc1,
739 struct tpacket_block_desc *pbd1)
f6fb8f10 740{
741 struct timespec ts;
bc59ba39 742 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 743
744 smp_rmb();
745
8da3056c
DB
746 /* We could have just memset this but we will lose the
747 * flexibility of making the priv area sticky
748 */
f6fb8f10 749
8da3056c
DB
750 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
751 BLOCK_NUM_PKTS(pbd1) = 0;
752 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 753
8da3056c
DB
754 getnstimeofday(&ts);
755
756 h1->ts_first_pkt.ts_sec = ts.tv_sec;
757 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 758
8da3056c
DB
759 pkc1->pkblk_start = (char *)pbd1;
760 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
761
762 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
763 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
764
765 pbd1->version = pkc1->version;
766 pkc1->prev = pkc1->nxt_offset;
767 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
768
769 prb_thaw_queue(pkc1);
770 _prb_refresh_rx_retire_blk_timer(pkc1);
771
772 smp_wmb();
f6fb8f10 773}
774
775/*
776 * Queue freeze logic:
777 * 1) Assume tp_block_nr = 8 blocks.
778 * 2) At time 't0', user opens Rx ring.
779 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
780 * 4) user-space is either sleeping or processing block '0'.
781 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
782 * it will close block-7,loop around and try to fill block '0'.
783 * call-flow:
784 * __packet_lookup_frame_in_block
785 * prb_retire_current_block()
786 * prb_dispatch_next_block()
787 * |->(BLOCK_STATUS == USER) evaluates to true
788 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
789 * 6) Now there are two cases:
790 * 6.1) Link goes idle right after the queue is frozen.
791 * But remember, the last open_block() refreshed the timer.
792 * When this timer expires,it will refresh itself so that we can
793 * re-open block-0 in near future.
794 * 6.2) Link is busy and keeps on receiving packets. This is a simple
795 * case and __packet_lookup_frame_in_block will check if block-0
796 * is free and can now be re-used.
797 */
eea49cc9 798static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 799 struct packet_sock *po)
800{
801 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 802 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 803}
804
805#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
806
807/*
808 * If the next block is free then we will dispatch it
809 * and return a good offset.
810 * Else, we will freeze the queue.
811 * So, caller must check the return value.
812 */
bc59ba39 813static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 814 struct packet_sock *po)
815{
bc59ba39 816 struct tpacket_block_desc *pbd;
f6fb8f10 817
818 smp_rmb();
819
820 /* 1. Get current block num */
821 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
822
823 /* 2. If this block is currently in_use then freeze the queue */
824 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
825 prb_freeze_queue(pkc, po);
826 return NULL;
827 }
828
829 /*
830 * 3.
831 * open this block and return the offset where the first packet
832 * needs to get stored.
833 */
834 prb_open_block(pkc, pbd);
835 return (void *)pkc->nxt_offset;
836}
837
bc59ba39 838static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 839 struct packet_sock *po, unsigned int status)
840{
bc59ba39 841 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 842
843 /* retire/close the current block */
844 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
845 /*
846 * Plug the case where copy_bits() is in progress on
847 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
848 * have space to copy the pkt in the current block and
849 * called prb_retire_current_block()
850 *
851 * We don't need to worry about the TMO case because
852 * the timer-handler already handled this case.
853 */
854 if (!(status & TP_STATUS_BLK_TMO)) {
855 while (atomic_read(&pkc->blk_fill_in_prog)) {
856 /* Waiting for skb_copy_bits to finish... */
857 cpu_relax();
858 }
859 }
860 prb_close_block(pkc, pbd, po, status);
861 return;
862 }
f6fb8f10 863}
864
eea49cc9 865static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 866 struct tpacket_block_desc *pbd)
f6fb8f10 867{
868 return TP_STATUS_USER & BLOCK_STATUS(pbd);
869}
870
eea49cc9 871static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 872{
873 return pkc->reset_pending_on_curr_blk;
874}
875
eea49cc9 876static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 877{
bc59ba39 878 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 879 atomic_dec(&pkc->blk_fill_in_prog);
880}
881
eea49cc9 882static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 883 struct tpacket3_hdr *ppd)
884{
885 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
886}
887
eea49cc9 888static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 889 struct tpacket3_hdr *ppd)
890{
891 ppd->hv1.tp_rxhash = 0;
892}
893
eea49cc9 894static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 895 struct tpacket3_hdr *ppd)
896{
897 if (vlan_tx_tag_present(pkc->skb)) {
898 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
899 ppd->tp_status = TP_STATUS_VLAN_VALID;
900 } else {
9e67030a 901 ppd->hv1.tp_vlan_tci = 0;
902 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 903 }
904}
905
bc59ba39 906static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 907 struct tpacket3_hdr *ppd)
908{
909 prb_fill_vlan_info(pkc, ppd);
910
911 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
912 prb_fill_rxhash(pkc, ppd);
913 else
914 prb_clear_rxhash(pkc, ppd);
915}
916
eea49cc9 917static void prb_fill_curr_block(char *curr,
bc59ba39 918 struct tpacket_kbdq_core *pkc,
919 struct tpacket_block_desc *pbd,
f6fb8f10 920 unsigned int len)
921{
922 struct tpacket3_hdr *ppd;
923
924 ppd = (struct tpacket3_hdr *)curr;
925 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
926 pkc->prev = curr;
927 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
928 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
929 BLOCK_NUM_PKTS(pbd) += 1;
930 atomic_inc(&pkc->blk_fill_in_prog);
931 prb_run_all_ft_ops(pkc, ppd);
932}
933
934/* Assumes caller has the sk->rx_queue.lock */
935static void *__packet_lookup_frame_in_block(struct packet_sock *po,
936 struct sk_buff *skb,
937 int status,
938 unsigned int len
939 )
940{
bc59ba39 941 struct tpacket_kbdq_core *pkc;
942 struct tpacket_block_desc *pbd;
f6fb8f10 943 char *curr, *end;
944
e3192690 945 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 946 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
947
948 /* Queue is frozen when user space is lagging behind */
949 if (prb_queue_frozen(pkc)) {
950 /*
951 * Check if that last block which caused the queue to freeze,
952 * is still in_use by user-space.
953 */
954 if (prb_curr_blk_in_use(pkc, pbd)) {
955 /* Can't record this packet */
956 return NULL;
957 } else {
958 /*
959 * Ok, the block was released by user-space.
960 * Now let's open that block.
961 * opening a block also thaws the queue.
962 * Thawing is a side effect.
963 */
964 prb_open_block(pkc, pbd);
965 }
966 }
967
968 smp_mb();
969 curr = pkc->nxt_offset;
970 pkc->skb = skb;
e3192690 971 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 972
973 /* first try the current block */
974 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
975 prb_fill_curr_block(curr, pkc, pbd, len);
976 return (void *)curr;
977 }
978
979 /* Ok, close the current block */
980 prb_retire_current_block(pkc, po, 0);
981
982 /* Now, try to dispatch the next block */
983 curr = (char *)prb_dispatch_next_block(pkc, po);
984 if (curr) {
985 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
986 prb_fill_curr_block(curr, pkc, pbd, len);
987 return (void *)curr;
988 }
989
990 /*
991 * No free blocks are available.user_space hasn't caught up yet.
992 * Queue was just frozen and now this packet will get dropped.
993 */
994 return NULL;
995}
996
eea49cc9 997static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 998 struct sk_buff *skb,
999 int status, unsigned int len)
1000{
1001 char *curr = NULL;
1002 switch (po->tp_version) {
1003 case TPACKET_V1:
1004 case TPACKET_V2:
1005 curr = packet_lookup_frame(po, &po->rx_ring,
1006 po->rx_ring.head, status);
1007 return curr;
1008 case TPACKET_V3:
1009 return __packet_lookup_frame_in_block(po, skb, status, len);
1010 default:
1011 WARN(1, "TPACKET version not supported\n");
1012 BUG();
99aa3473 1013 return NULL;
f6fb8f10 1014 }
1015}
1016
eea49cc9 1017static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1018 struct packet_ring_buffer *rb,
77f65ebd 1019 unsigned int idx,
f6fb8f10 1020 int status)
1021{
bc59ba39 1022 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1023 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1024
1025 if (status != BLOCK_STATUS(pbd))
1026 return NULL;
1027 return pbd;
1028}
1029
eea49cc9 1030static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1031{
1032 unsigned int prev;
1033 if (rb->prb_bdqc.kactive_blk_num)
1034 prev = rb->prb_bdqc.kactive_blk_num-1;
1035 else
1036 prev = rb->prb_bdqc.knum_blocks-1;
1037 return prev;
1038}
1039
1040/* Assumes caller has held the rx_queue.lock */
eea49cc9 1041static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1042 struct packet_ring_buffer *rb,
1043 int status)
1044{
1045 unsigned int previous = prb_previous_blk_num(rb);
1046 return prb_lookup_block(po, rb, previous, status);
1047}
1048
eea49cc9 1049static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1050 struct packet_ring_buffer *rb,
1051 int status)
1052{
1053 if (po->tp_version <= TPACKET_V2)
1054 return packet_previous_frame(po, rb, status);
1055
1056 return __prb_previous_block(po, rb, status);
1057}
1058
eea49cc9 1059static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1060 struct packet_ring_buffer *rb)
1061{
1062 switch (po->tp_version) {
1063 case TPACKET_V1:
1064 case TPACKET_V2:
1065 return packet_increment_head(rb);
1066 case TPACKET_V3:
1067 default:
1068 WARN(1, "TPACKET version not supported.\n");
1069 BUG();
1070 return;
1071 }
1072}
1073
eea49cc9 1074static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1075 struct packet_ring_buffer *rb,
1076 int status)
1077{
1078 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1079 return packet_lookup_frame(po, rb, previous, status);
1080}
1081
eea49cc9 1082static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1083{
1084 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1085}
1086
77f65ebd
WB
1087static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1088{
1089 struct sock *sk = &po->sk;
1090 bool has_room;
1091
1092 if (po->prot_hook.func != tpacket_rcv)
1093 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1094 <= sk->sk_rcvbuf;
1095
1096 spin_lock(&sk->sk_receive_queue.lock);
1097 if (po->tp_version == TPACKET_V3)
1098 has_room = prb_lookup_block(po, &po->rx_ring,
1099 po->rx_ring.prb_bdqc.kactive_blk_num,
1100 TP_STATUS_KERNEL);
1101 else
1102 has_room = packet_lookup_frame(po, &po->rx_ring,
1103 po->rx_ring.head,
1104 TP_STATUS_KERNEL);
1105 spin_unlock(&sk->sk_receive_queue.lock);
1106
1107 return has_room;
1108}
1109
1da177e4
LT
1110static void packet_sock_destruct(struct sock *sk)
1111{
ed85b565
RC
1112 skb_queue_purge(&sk->sk_error_queue);
1113
547b792c
IJ
1114 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1115 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1116
1117 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1118 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1119 return;
1120 }
1121
17ab56a2 1122 sk_refcnt_debug_dec(sk);
1da177e4
LT
1123}
1124
dc99f600
DM
1125static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1126{
1127 int x = atomic_read(&f->rr_cur) + 1;
1128
1129 if (x >= num)
1130 x = 0;
1131
1132 return x;
1133}
1134
77f65ebd
WB
1135static unsigned int fanout_demux_hash(struct packet_fanout *f,
1136 struct sk_buff *skb,
1137 unsigned int num)
dc99f600 1138{
77f65ebd 1139 return (((u64)skb->rxhash) * num) >> 32;
dc99f600
DM
1140}
1141
77f65ebd
WB
1142static unsigned int fanout_demux_lb(struct packet_fanout *f,
1143 struct sk_buff *skb,
1144 unsigned int num)
dc99f600
DM
1145{
1146 int cur, old;
1147
1148 cur = atomic_read(&f->rr_cur);
1149 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1150 fanout_rr_next(f, num))) != cur)
1151 cur = old;
77f65ebd
WB
1152 return cur;
1153}
1154
1155static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1156 struct sk_buff *skb,
1157 unsigned int num)
1158{
1159 return smp_processor_id() % num;
dc99f600
DM
1160}
1161
77f65ebd
WB
1162static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1163 struct sk_buff *skb,
1164 unsigned int idx, unsigned int skip,
1165 unsigned int num)
95ec3eb4 1166{
77f65ebd 1167 unsigned int i, j;
95ec3eb4 1168
77f65ebd
WB
1169 i = j = min_t(int, f->next[idx], num - 1);
1170 do {
1171 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1172 if (i != j)
1173 f->next[idx] = i;
1174 return i;
1175 }
1176 if (++i == num)
1177 i = 0;
1178 } while (i != j);
1179
1180 return idx;
1181}
1182
1183static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1184{
1185 return f->flags & (flag >> 8);
95ec3eb4
DM
1186}
1187
95ec3eb4
DM
1188static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1189 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1190{
1191 struct packet_fanout *f = pt->af_packet_priv;
1192 unsigned int num = f->num_members;
1193 struct packet_sock *po;
77f65ebd 1194 unsigned int idx;
dc99f600
DM
1195
1196 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1197 !num) {
1198 kfree_skb(skb);
1199 return 0;
1200 }
1201
95ec3eb4
DM
1202 switch (f->type) {
1203 case PACKET_FANOUT_HASH:
1204 default:
77f65ebd 1205 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1206 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1207 if (!skb)
1208 return 0;
1209 }
1210 skb_get_rxhash(skb);
77f65ebd 1211 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1212 break;
1213 case PACKET_FANOUT_LB:
77f65ebd 1214 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1215 break;
1216 case PACKET_FANOUT_CPU:
77f65ebd
WB
1217 idx = fanout_demux_cpu(f, skb, num);
1218 break;
1219 case PACKET_FANOUT_ROLLOVER:
1220 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1221 break;
dc99f600
DM
1222 }
1223
77f65ebd
WB
1224 po = pkt_sk(f->arr[idx]);
1225 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1226 unlikely(!packet_rcv_has_room(po, skb))) {
1227 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1228 po = pkt_sk(f->arr[idx]);
1229 }
dc99f600
DM
1230
1231 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1232}
1233
fff3321d
PE
1234DEFINE_MUTEX(fanout_mutex);
1235EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1236static LIST_HEAD(fanout_list);
1237
1238static void __fanout_link(struct sock *sk, struct packet_sock *po)
1239{
1240 struct packet_fanout *f = po->fanout;
1241
1242 spin_lock(&f->lock);
1243 f->arr[f->num_members] = sk;
1244 smp_wmb();
1245 f->num_members++;
1246 spin_unlock(&f->lock);
1247}
1248
1249static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1250{
1251 struct packet_fanout *f = po->fanout;
1252 int i;
1253
1254 spin_lock(&f->lock);
1255 for (i = 0; i < f->num_members; i++) {
1256 if (f->arr[i] == sk)
1257 break;
1258 }
1259 BUG_ON(i >= f->num_members);
1260 f->arr[i] = f->arr[f->num_members - 1];
1261 f->num_members--;
1262 spin_unlock(&f->lock);
1263}
1264
a0dfb263 1265static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1266{
1267 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1268 return true;
1269
1270 return false;
1271}
1272
7736d33f 1273static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1274{
1275 struct packet_sock *po = pkt_sk(sk);
1276 struct packet_fanout *f, *match;
7736d33f 1277 u8 type = type_flags & 0xff;
77f65ebd 1278 u8 flags = type_flags >> 8;
dc99f600
DM
1279 int err;
1280
1281 switch (type) {
77f65ebd
WB
1282 case PACKET_FANOUT_ROLLOVER:
1283 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1284 return -EINVAL;
dc99f600
DM
1285 case PACKET_FANOUT_HASH:
1286 case PACKET_FANOUT_LB:
95ec3eb4 1287 case PACKET_FANOUT_CPU:
dc99f600
DM
1288 break;
1289 default:
1290 return -EINVAL;
1291 }
1292
1293 if (!po->running)
1294 return -EINVAL;
1295
1296 if (po->fanout)
1297 return -EALREADY;
1298
1299 mutex_lock(&fanout_mutex);
1300 match = NULL;
1301 list_for_each_entry(f, &fanout_list, list) {
1302 if (f->id == id &&
1303 read_pnet(&f->net) == sock_net(sk)) {
1304 match = f;
1305 break;
1306 }
1307 }
afe62c68 1308 err = -EINVAL;
77f65ebd 1309 if (match && match->flags != flags)
afe62c68 1310 goto out;
dc99f600 1311 if (!match) {
afe62c68 1312 err = -ENOMEM;
dc99f600 1313 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1314 if (!match)
1315 goto out;
1316 write_pnet(&match->net, sock_net(sk));
1317 match->id = id;
1318 match->type = type;
77f65ebd 1319 match->flags = flags;
afe62c68
ED
1320 atomic_set(&match->rr_cur, 0);
1321 INIT_LIST_HEAD(&match->list);
1322 spin_lock_init(&match->lock);
1323 atomic_set(&match->sk_ref, 0);
1324 match->prot_hook.type = po->prot_hook.type;
1325 match->prot_hook.dev = po->prot_hook.dev;
1326 match->prot_hook.func = packet_rcv_fanout;
1327 match->prot_hook.af_packet_priv = match;
c0de08d0 1328 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1329 dev_add_pack(&match->prot_hook);
1330 list_add(&match->list, &fanout_list);
dc99f600 1331 }
afe62c68
ED
1332 err = -EINVAL;
1333 if (match->type == type &&
1334 match->prot_hook.type == po->prot_hook.type &&
1335 match->prot_hook.dev == po->prot_hook.dev) {
1336 err = -ENOSPC;
1337 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1338 __dev_remove_pack(&po->prot_hook);
1339 po->fanout = match;
1340 atomic_inc(&match->sk_ref);
1341 __fanout_link(sk, po);
1342 err = 0;
dc99f600
DM
1343 }
1344 }
afe62c68 1345out:
dc99f600
DM
1346 mutex_unlock(&fanout_mutex);
1347 return err;
1348}
1349
1350static void fanout_release(struct sock *sk)
1351{
1352 struct packet_sock *po = pkt_sk(sk);
1353 struct packet_fanout *f;
1354
1355 f = po->fanout;
1356 if (!f)
1357 return;
1358
fff3321d 1359 mutex_lock(&fanout_mutex);
dc99f600
DM
1360 po->fanout = NULL;
1361
dc99f600
DM
1362 if (atomic_dec_and_test(&f->sk_ref)) {
1363 list_del(&f->list);
1364 dev_remove_pack(&f->prot_hook);
1365 kfree(f);
1366 }
1367 mutex_unlock(&fanout_mutex);
1368}
1da177e4 1369
90ddc4f0 1370static const struct proto_ops packet_ops;
1da177e4 1371
90ddc4f0 1372static const struct proto_ops packet_ops_spkt;
1da177e4 1373
40d4e3df
ED
1374static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1375 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1376{
1377 struct sock *sk;
1378 struct sockaddr_pkt *spkt;
1379
1380 /*
1381 * When we registered the protocol we saved the socket in the data
1382 * field for just this event.
1383 */
1384
1385 sk = pt->af_packet_priv;
1ce4f28b 1386
1da177e4
LT
1387 /*
1388 * Yank back the headers [hope the device set this
1389 * right or kerboom...]
1390 *
1391 * Incoming packets have ll header pulled,
1392 * push it back.
1393 *
98e399f8 1394 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1395 * so that this procedure is noop.
1396 */
1397
1398 if (skb->pkt_type == PACKET_LOOPBACK)
1399 goto out;
1400
09ad9bc7 1401 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1402 goto out;
1403
40d4e3df
ED
1404 skb = skb_share_check(skb, GFP_ATOMIC);
1405 if (skb == NULL)
1da177e4
LT
1406 goto oom;
1407
1408 /* drop any routing info */
adf30907 1409 skb_dst_drop(skb);
1da177e4 1410
84531c24
PO
1411 /* drop conntrack reference */
1412 nf_reset(skb);
1413
ffbc6111 1414 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1415
98e399f8 1416 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1417
1418 /*
1419 * The SOCK_PACKET socket receives _all_ frames.
1420 */
1421
1422 spkt->spkt_family = dev->type;
1423 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1424 spkt->spkt_protocol = skb->protocol;
1425
1426 /*
1427 * Charge the memory to the socket. This is done specifically
1428 * to prevent sockets using all the memory up.
1429 */
1430
40d4e3df 1431 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1432 return 0;
1433
1434out:
1435 kfree_skb(skb);
1436oom:
1437 return 0;
1438}
1439
1440
1441/*
1442 * Output a raw packet to a device layer. This bypasses all the other
1443 * protocol layers and you must therefore supply it with a complete frame
1444 */
1ce4f28b 1445
1da177e4
LT
1446static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1447 struct msghdr *msg, size_t len)
1448{
1449 struct sock *sk = sock->sk;
40d4e3df 1450 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1451 struct sk_buff *skb = NULL;
1da177e4 1452 struct net_device *dev;
40d4e3df 1453 __be16 proto = 0;
1da177e4 1454 int err;
3bdc0eba 1455 int extra_len = 0;
1ce4f28b 1456
1da177e4 1457 /*
1ce4f28b 1458 * Get and verify the address.
1da177e4
LT
1459 */
1460
40d4e3df 1461 if (saddr) {
1da177e4 1462 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1463 return -EINVAL;
1464 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1465 proto = saddr->spkt_protocol;
1466 } else
1467 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1468
1469 /*
1ce4f28b 1470 * Find the device first to size check it
1da177e4
LT
1471 */
1472
de74e92a 1473 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1474retry:
654d1f8a
ED
1475 rcu_read_lock();
1476 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1477 err = -ENODEV;
1478 if (dev == NULL)
1479 goto out_unlock;
1ce4f28b 1480
d5e76b0a
DM
1481 err = -ENETDOWN;
1482 if (!(dev->flags & IFF_UP))
1483 goto out_unlock;
1484
1da177e4 1485 /*
40d4e3df
ED
1486 * You may not queue a frame bigger than the mtu. This is the lowest level
1487 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1488 */
1ce4f28b 1489
3bdc0eba
BG
1490 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1491 if (!netif_supports_nofcs(dev)) {
1492 err = -EPROTONOSUPPORT;
1493 goto out_unlock;
1494 }
1495 extra_len = 4; /* We're doing our own CRC */
1496 }
1497
1da177e4 1498 err = -EMSGSIZE;
3bdc0eba 1499 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1500 goto out_unlock;
1501
1a35ca80
ED
1502 if (!skb) {
1503 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1504 int tlen = dev->needed_tailroom;
1a35ca80
ED
1505 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1506
1507 rcu_read_unlock();
4ce40912 1508 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1509 if (skb == NULL)
1510 return -ENOBUFS;
1511 /* FIXME: Save some space for broken drivers that write a hard
1512 * header at transmission time by themselves. PPP is the notable
1513 * one here. This should really be fixed at the driver level.
1514 */
1515 skb_reserve(skb, reserved);
1516 skb_reset_network_header(skb);
1517
1518 /* Try to align data part correctly */
1519 if (hhlen) {
1520 skb->data -= hhlen;
1521 skb->tail -= hhlen;
1522 if (len < hhlen)
1523 skb_reset_network_header(skb);
1524 }
1525 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1526 if (err)
1527 goto out_free;
1528 goto retry;
1da177e4
LT
1529 }
1530
3bdc0eba 1531 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1532 /* Earlier code assumed this would be a VLAN pkt,
1533 * double-check this now that we have the actual
1534 * packet in hand.
1535 */
1536 struct ethhdr *ehdr;
1537 skb_reset_mac_header(skb);
1538 ehdr = eth_hdr(skb);
1539 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1540 err = -EMSGSIZE;
1541 goto out_unlock;
1542 }
1543 }
1a35ca80 1544
1da177e4
LT
1545 skb->protocol = proto;
1546 skb->dev = dev;
1547 skb->priority = sk->sk_priority;
2d37a186 1548 skb->mark = sk->sk_mark;
bf84a010
DB
1549
1550 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1551
3bdc0eba
BG
1552 if (unlikely(extra_len == 4))
1553 skb->no_fcs = 1;
1554
40893fd0 1555 skb_probe_transport_header(skb, 0);
c1aad275 1556
1da177e4 1557 dev_queue_xmit(skb);
654d1f8a 1558 rcu_read_unlock();
40d4e3df 1559 return len;
1da177e4 1560
1da177e4 1561out_unlock:
654d1f8a 1562 rcu_read_unlock();
1a35ca80
ED
1563out_free:
1564 kfree_skb(skb);
1da177e4
LT
1565 return err;
1566}
1da177e4 1567
eea49cc9 1568static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1569 const struct sock *sk,
dbcb5855 1570 unsigned int res)
1da177e4
LT
1571{
1572 struct sk_filter *filter;
fda9ef5d 1573
80f8f102
ED
1574 rcu_read_lock();
1575 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1576 if (filter != NULL)
0a14842f 1577 res = SK_RUN_FILTER(filter, skb);
80f8f102 1578 rcu_read_unlock();
1da177e4 1579
dbcb5855 1580 return res;
1da177e4
LT
1581}
1582
1583/*
62ab0812
ED
1584 * This function makes lazy skb cloning in hope that most of packets
1585 * are discarded by BPF.
1586 *
1587 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1588 * and skb->cb are mangled. It works because (and until) packets
1589 * falling here are owned by current CPU. Output packets are cloned
1590 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1591 * sequencially, so that if we return skb to original state on exit,
1592 * we will not harm anyone.
1da177e4
LT
1593 */
1594
40d4e3df
ED
1595static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1596 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1597{
1598 struct sock *sk;
1599 struct sockaddr_ll *sll;
1600 struct packet_sock *po;
40d4e3df 1601 u8 *skb_head = skb->data;
1da177e4 1602 int skb_len = skb->len;
dbcb5855 1603 unsigned int snaplen, res;
1da177e4
LT
1604
1605 if (skb->pkt_type == PACKET_LOOPBACK)
1606 goto drop;
1607
1608 sk = pt->af_packet_priv;
1609 po = pkt_sk(sk);
1610
09ad9bc7 1611 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1612 goto drop;
1613
1da177e4
LT
1614 skb->dev = dev;
1615
3b04ddde 1616 if (dev->header_ops) {
1da177e4 1617 /* The device has an explicit notion of ll header,
62ab0812
ED
1618 * exported to higher levels.
1619 *
1620 * Otherwise, the device hides details of its frame
1621 * structure, so that corresponding packet head is
1622 * never delivered to user.
1da177e4
LT
1623 */
1624 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1625 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1626 else if (skb->pkt_type == PACKET_OUTGOING) {
1627 /* Special case: outgoing packets have ll header at head */
bbe735e4 1628 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1629 }
1630 }
1631
1632 snaplen = skb->len;
1633
dbcb5855
DM
1634 res = run_filter(skb, sk, snaplen);
1635 if (!res)
fda9ef5d 1636 goto drop_n_restore;
dbcb5855
DM
1637 if (snaplen > res)
1638 snaplen = res;
1da177e4 1639
0fd7bac6 1640 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1641 goto drop_n_acct;
1642
1643 if (skb_shared(skb)) {
1644 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1645 if (nskb == NULL)
1646 goto drop_n_acct;
1647
1648 if (skb_head != skb->data) {
1649 skb->data = skb_head;
1650 skb->len = skb_len;
1651 }
abc4e4fa 1652 consume_skb(skb);
1da177e4
LT
1653 skb = nskb;
1654 }
1655
ffbc6111
HX
1656 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1657 sizeof(skb->cb));
1658
1659 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1660 sll->sll_family = AF_PACKET;
1661 sll->sll_hatype = dev->type;
1662 sll->sll_protocol = skb->protocol;
1663 sll->sll_pkttype = skb->pkt_type;
8032b464 1664 if (unlikely(po->origdev))
80feaacb
PWJ
1665 sll->sll_ifindex = orig_dev->ifindex;
1666 else
1667 sll->sll_ifindex = dev->ifindex;
1da177e4 1668
b95cce35 1669 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1670
ffbc6111 1671 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1672
1da177e4
LT
1673 if (pskb_trim(skb, snaplen))
1674 goto drop_n_acct;
1675
1676 skb_set_owner_r(skb, sk);
1677 skb->dev = NULL;
adf30907 1678 skb_dst_drop(skb);
1da177e4 1679
84531c24
PO
1680 /* drop conntrack reference */
1681 nf_reset(skb);
1682
1da177e4 1683 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1684 po->stats.stats1.tp_packets++;
3b885787 1685 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1686 __skb_queue_tail(&sk->sk_receive_queue, skb);
1687 spin_unlock(&sk->sk_receive_queue.lock);
1688 sk->sk_data_ready(sk, skb->len);
1689 return 0;
1690
1691drop_n_acct:
7091fbd8 1692 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1693 po->stats.stats1.tp_drops++;
7091fbd8
WB
1694 atomic_inc(&sk->sk_drops);
1695 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1696
1697drop_n_restore:
1698 if (skb_head != skb->data && skb_shared(skb)) {
1699 skb->data = skb_head;
1700 skb->len = skb_len;
1701 }
1702drop:
ead2ceb0 1703 consume_skb(skb);
1da177e4
LT
1704 return 0;
1705}
1706
40d4e3df
ED
1707static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1708 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1709{
1710 struct sock *sk;
1711 struct packet_sock *po;
1712 struct sockaddr_ll *sll;
184f489e 1713 union tpacket_uhdr h;
40d4e3df 1714 u8 *skb_head = skb->data;
1da177e4 1715 int skb_len = skb->len;
dbcb5855 1716 unsigned int snaplen, res;
f6fb8f10 1717 unsigned long status = TP_STATUS_USER;
bbd6ef87 1718 unsigned short macoff, netoff, hdrlen;
1da177e4 1719 struct sk_buff *copy_skb = NULL;
bbd6ef87 1720 struct timespec ts;
b9c32fb2 1721 __u32 ts_status;
1da177e4
LT
1722
1723 if (skb->pkt_type == PACKET_LOOPBACK)
1724 goto drop;
1725
1726 sk = pt->af_packet_priv;
1727 po = pkt_sk(sk);
1728
09ad9bc7 1729 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1730 goto drop;
1731
3b04ddde 1732 if (dev->header_ops) {
1da177e4 1733 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1734 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1735 else if (skb->pkt_type == PACKET_OUTGOING) {
1736 /* Special case: outgoing packets have ll header at head */
bbe735e4 1737 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1738 }
1739 }
1740
8dc41944
HX
1741 if (skb->ip_summed == CHECKSUM_PARTIAL)
1742 status |= TP_STATUS_CSUMNOTREADY;
1743
1da177e4
LT
1744 snaplen = skb->len;
1745
dbcb5855
DM
1746 res = run_filter(skb, sk, snaplen);
1747 if (!res)
fda9ef5d 1748 goto drop_n_restore;
dbcb5855
DM
1749 if (snaplen > res)
1750 snaplen = res;
1da177e4
LT
1751
1752 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1753 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1754 po->tp_reserve;
1da177e4 1755 } else {
95c96174 1756 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1757 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1758 (maclen < 16 ? 16 : maclen)) +
1759 po->tp_reserve;
1da177e4
LT
1760 macoff = netoff - maclen;
1761 }
f6fb8f10 1762 if (po->tp_version <= TPACKET_V2) {
1763 if (macoff + snaplen > po->rx_ring.frame_size) {
1764 if (po->copy_thresh &&
0fd7bac6 1765 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1766 if (skb_shared(skb)) {
1767 copy_skb = skb_clone(skb, GFP_ATOMIC);
1768 } else {
1769 copy_skb = skb_get(skb);
1770 skb_head = skb->data;
1771 }
1772 if (copy_skb)
1773 skb_set_owner_r(copy_skb, sk);
1da177e4 1774 }
f6fb8f10 1775 snaplen = po->rx_ring.frame_size - macoff;
1776 if ((int)snaplen < 0)
1777 snaplen = 0;
1da177e4 1778 }
1da177e4 1779 }
1da177e4 1780 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1781 h.raw = packet_current_rx_frame(po, skb,
1782 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1783 if (!h.raw)
1da177e4 1784 goto ring_is_full;
f6fb8f10 1785 if (po->tp_version <= TPACKET_V2) {
1786 packet_increment_rx_head(po, &po->rx_ring);
1787 /*
1788 * LOSING will be reported till you read the stats,
1789 * because it's COR - Clear On Read.
1790 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1791 * at packet level.
1792 */
ee80fbf3 1793 if (po->stats.stats1.tp_drops)
f6fb8f10 1794 status |= TP_STATUS_LOSING;
1795 }
ee80fbf3 1796 po->stats.stats1.tp_packets++;
1da177e4
LT
1797 if (copy_skb) {
1798 status |= TP_STATUS_COPY;
1799 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1800 }
1da177e4
LT
1801 spin_unlock(&sk->sk_receive_queue.lock);
1802
bbd6ef87 1803 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1804
1805 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1806 getnstimeofday(&ts);
1da177e4 1807
b9c32fb2
DB
1808 status |= ts_status;
1809
bbd6ef87
PM
1810 switch (po->tp_version) {
1811 case TPACKET_V1:
1812 h.h1->tp_len = skb->len;
1813 h.h1->tp_snaplen = snaplen;
1814 h.h1->tp_mac = macoff;
1815 h.h1->tp_net = netoff;
4b457bdf
DB
1816 h.h1->tp_sec = ts.tv_sec;
1817 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1818 hdrlen = sizeof(*h.h1);
1819 break;
1820 case TPACKET_V2:
1821 h.h2->tp_len = skb->len;
1822 h.h2->tp_snaplen = snaplen;
1823 h.h2->tp_mac = macoff;
1824 h.h2->tp_net = netoff;
bbd6ef87
PM
1825 h.h2->tp_sec = ts.tv_sec;
1826 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1827 if (vlan_tx_tag_present(skb)) {
1828 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1829 status |= TP_STATUS_VLAN_VALID;
1830 } else {
1831 h.h2->tp_vlan_tci = 0;
1832 }
13fcb7bd 1833 h.h2->tp_padding = 0;
bbd6ef87
PM
1834 hdrlen = sizeof(*h.h2);
1835 break;
f6fb8f10 1836 case TPACKET_V3:
 1837 /* tp_nxt_offset and vlan are already populated above,
 1838 * so DON'T clear those fields here.
1839 */
1840 h.h3->tp_status |= status;
1841 h.h3->tp_len = skb->len;
1842 h.h3->tp_snaplen = snaplen;
1843 h.h3->tp_mac = macoff;
1844 h.h3->tp_net = netoff;
f6fb8f10 1845 h.h3->tp_sec = ts.tv_sec;
1846 h.h3->tp_nsec = ts.tv_nsec;
1847 hdrlen = sizeof(*h.h3);
1848 break;
bbd6ef87
PM
1849 default:
1850 BUG();
1851 }
1da177e4 1852
bbd6ef87 1853 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1854 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1855 sll->sll_family = AF_PACKET;
1856 sll->sll_hatype = dev->type;
1857 sll->sll_protocol = skb->protocol;
1858 sll->sll_pkttype = skb->pkt_type;
8032b464 1859 if (unlikely(po->origdev))
80feaacb
PWJ
1860 sll->sll_ifindex = orig_dev->ifindex;
1861 else
1862 sll->sll_ifindex = dev->ifindex;
1da177e4 1863
e16aa207 1864 smp_mb();
f6dafa95 1865#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1866 {
0af55bb5
CG
1867 u8 *start, *end;
1868
f6fb8f10 1869 if (po->tp_version <= TPACKET_V2) {
1870 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1871 + macoff + snaplen);
1872 for (start = h.raw; start < end; start += PAGE_SIZE)
1873 flush_dcache_page(pgv_to_page(start));
1874 }
cc9f01b2 1875 smp_wmb();
1da177e4 1876 }
f6dafa95 1877#endif
f6fb8f10 1878 if (po->tp_version <= TPACKET_V2)
1879 __packet_set_status(po, h.raw, status);
1880 else
1881 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1882
1883 sk->sk_data_ready(sk, 0);
1884
1885drop_n_restore:
1886 if (skb_head != skb->data && skb_shared(skb)) {
1887 skb->data = skb_head;
1888 skb->len = skb_len;
1889 }
1890drop:
1ce4f28b 1891 kfree_skb(skb);
1da177e4
LT
1892 return 0;
1893
1894ring_is_full:
ee80fbf3 1895 po->stats.stats1.tp_drops++;
1da177e4
LT
1896 spin_unlock(&sk->sk_receive_queue.lock);
1897
1898 sk->sk_data_ready(sk, 0);
acb5d75b 1899 kfree_skb(copy_skb);
1da177e4
LT
1900 goto drop_n_restore;
1901}
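/*
 * The function above, tpacket_rcv(), is the producer side of the mmap()ed
 * RX ring: it copies the packet into the next TP_STATUS_KERNEL slot and
 * flips it to TP_STATUS_USER.  Below is a minimal user-space consumer
 * sketch for TPACKET_V2; the ring geometry and the helper name are
 * illustrative assumptions, and the ring itself is assumed to have been
 * set up as in the sketch after packet_setsockopt() further down.
 */
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <linux/if_packet.h>

static void rx_ring_loop(int fd, uint8_t *ring, unsigned int frame_nr,
			 unsigned int frame_size)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	unsigned int idx = 0;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			poll(&pfd, 1, -1);	/* wait for tpacket_rcv() to fill a slot */
			continue;
		}
		/* packet bytes start hdr->tp_mac into the frame */
		printf("frame %u: %u bytes captured of %u on the wire\n",
		       idx, hdr->tp_snaplen, hdr->tp_len);

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
		idx = (idx + 1) % frame_nr;
	}
}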
1902
69e3c75f
JB
1903static void tpacket_destruct_skb(struct sk_buff *skb)
1904{
1905 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1906 void *ph;
1da177e4 1907
69e3c75f 1908 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1909 __u32 ts;
1910
69e3c75f 1911 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1912 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1913 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1914
1915 ts = __packet_set_timestamp(po, ph, skb);
1916 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1917 }
1918
1919 sock_wfree(skb);
1920}
1921
40d4e3df
ED
1922static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1923 void *frame, struct net_device *dev, int size_max,
ae641949 1924 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1925{
184f489e 1926 union tpacket_uhdr ph;
cbd89acb 1927 int to_write, offset, len, tp_len, nr_frags, len_max, max_frame_len;
69e3c75f
JB
1928 struct socket *sock = po->sk.sk_socket;
1929 struct page *page;
1930 void *data;
1931 int err;
1932
1933 ph.raw = frame;
1934
1935 skb->protocol = proto;
1936 skb->dev = dev;
1937 skb->priority = po->sk.sk_priority;
2d37a186 1938 skb->mark = po->sk.sk_mark;
2e31396f 1939 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1940 skb_shinfo(skb)->destructor_arg = ph.raw;
1941
1942 switch (po->tp_version) {
1943 case TPACKET_V2:
1944 tp_len = ph.h2->tp_len;
1945 break;
1946 default:
1947 tp_len = ph.h1->tp_len;
1948 break;
1949 }
69e3c75f 1950
ae641949 1951 skb_reserve(skb, hlen);
69e3c75f 1952 skb_reset_network_header(skb);
40893fd0 1953 skb_probe_transport_header(skb, 0);
c1aad275 1954
5920cd3a
PC
1955 if (po->tp_tx_has_off) {
1956 int off_min, off_max, off;
1957 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1958 off_max = po->tx_ring.frame_size - tp_len;
1959 if (sock->type == SOCK_DGRAM) {
1960 switch (po->tp_version) {
1961 case TPACKET_V2:
1962 off = ph.h2->tp_net;
1963 break;
1964 default:
1965 off = ph.h1->tp_net;
1966 break;
1967 }
1968 } else {
1969 switch (po->tp_version) {
1970 case TPACKET_V2:
1971 off = ph.h2->tp_mac;
1972 break;
1973 default:
1974 off = ph.h1->tp_mac;
1975 break;
1976 }
1977 }
1978 if (unlikely((off < off_min) || (off_max < off)))
1979 return -EINVAL;
1980 data = ph.raw + off;
1981 } else {
1982 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1983 }
69e3c75f
JB
1984 to_write = tp_len;
1985
1986 if (sock->type == SOCK_DGRAM) {
1987 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1988 NULL, tp_len);
1989 if (unlikely(err < 0))
1990 return -EINVAL;
40d4e3df 1991 } else if (dev->hard_header_len) {
69e3c75f
JB
1992 /* net device doesn't like empty head */
1993 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1994 pr_err("packet size is too short (%d < %d)\n",
1995 tp_len, dev->hard_header_len);
69e3c75f
JB
1996 return -EINVAL;
1997 }
1998
1999 skb_push(skb, dev->hard_header_len);
2000 err = skb_store_bits(skb, 0, data,
2001 dev->hard_header_len);
2002 if (unlikely(err))
2003 return err;
2004
0f75b09c
PS
2005 if (dev->type == ARPHRD_ETHER)
2006 skb->protocol = eth_type_trans(skb, dev);
2007
69e3c75f
JB
2008 data += dev->hard_header_len;
2009 to_write -= dev->hard_header_len;
2010 }
2011
cbd89acb
PS
2012 max_frame_len = dev->mtu + dev->hard_header_len;
2013 if (skb->protocol == htons(ETH_P_8021Q))
2014 max_frame_len += VLAN_HLEN;
2015
2016 if (size_max > max_frame_len)
2017 size_max = max_frame_len;
2018
2019 if (unlikely(tp_len > size_max)) {
2020 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2021 return -EMSGSIZE;
2022 }
2023
69e3c75f
JB
2024 offset = offset_in_page(data);
2025 len_max = PAGE_SIZE - offset;
2026 len = ((to_write > len_max) ? len_max : to_write);
2027
2028 skb->data_len = to_write;
2029 skb->len += to_write;
2030 skb->truesize += to_write;
2031 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2032
2033 while (likely(to_write)) {
2034 nr_frags = skb_shinfo(skb)->nr_frags;
2035
2036 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2037 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2038 MAX_SKB_FRAGS);
69e3c75f
JB
2039 return -EFAULT;
2040 }
2041
0af55bb5
CG
2042 page = pgv_to_page(data);
2043 data += len;
69e3c75f
JB
2044 flush_dcache_page(page);
2045 get_page(page);
0af55bb5 2046 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2047 to_write -= len;
2048 offset = 0;
2049 len_max = PAGE_SIZE;
2050 len = ((to_write > len_max) ? len_max : to_write);
2051 }
2052
2053 return tp_len;
2054}
2055
2056static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2057{
69e3c75f
JB
2058 struct sk_buff *skb;
2059 struct net_device *dev;
2060 __be16 proto;
827d9780 2061 bool need_rls_dev = false;
cbd89acb 2062 int err;
40d4e3df
ED
2063 void *ph;
2064 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2065 int tp_len, size_max;
2066 unsigned char *addr;
2067 int len_sum = 0;
9e67030a 2068 int status = TP_STATUS_AVAILABLE;
ae641949 2069 int hlen, tlen;
69e3c75f 2070
69e3c75f
JB
2071 mutex_lock(&po->pg_vec_lock);
2072
69e3c75f 2073 if (saddr == NULL) {
827d9780 2074 dev = po->prot_hook.dev;
69e3c75f
JB
2075 proto = po->num;
2076 addr = NULL;
2077 } else {
2078 err = -EINVAL;
2079 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2080 goto out;
2081 if (msg->msg_namelen < (saddr->sll_halen
2082 + offsetof(struct sockaddr_ll,
2083 sll_addr)))
2084 goto out;
69e3c75f
JB
2085 proto = saddr->sll_protocol;
2086 addr = saddr->sll_addr;
827d9780
BG
2087 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2088 need_rls_dev = true;
69e3c75f
JB
2089 }
2090
69e3c75f
JB
2091 err = -ENXIO;
2092 if (unlikely(dev == NULL))
2093 goto out;
2094
69e3c75f
JB
2095 err = -ENETDOWN;
2096 if (unlikely(!(dev->flags & IFF_UP)))
2097 goto out_put;
2098
2099 size_max = po->tx_ring.frame_size
b5dd884e 2100 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2101
69e3c75f
JB
2102 do {
2103 ph = packet_current_frame(po, &po->tx_ring,
2104 TP_STATUS_SEND_REQUEST);
2105
2106 if (unlikely(ph == NULL)) {
2107 schedule();
2108 continue;
2109 }
2110
2111 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2112 hlen = LL_RESERVED_SPACE(dev);
2113 tlen = dev->needed_tailroom;
69e3c75f 2114 skb = sock_alloc_send_skb(&po->sk,
ae641949 2115 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2116 0, &err);
2117
2118 if (unlikely(skb == NULL))
2119 goto out_status;
2120
2121 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2122 addr, hlen);
69e3c75f
JB
2123
2124 if (unlikely(tp_len < 0)) {
2125 if (po->tp_loss) {
2126 __packet_set_status(po, ph,
2127 TP_STATUS_AVAILABLE);
2128 packet_increment_head(&po->tx_ring);
2129 kfree_skb(skb);
2130 continue;
2131 } else {
2132 status = TP_STATUS_WRONG_FORMAT;
2133 err = tp_len;
2134 goto out_status;
2135 }
2136 }
2137
2138 skb->destructor = tpacket_destruct_skb;
2139 __packet_set_status(po, ph, TP_STATUS_SENDING);
2140 atomic_inc(&po->tx_ring.pending);
2141
2142 status = TP_STATUS_SEND_REQUEST;
2143 err = dev_queue_xmit(skb);
eb70df13
JP
2144 if (unlikely(err > 0)) {
2145 err = net_xmit_errno(err);
2146 if (err && __packet_get_status(po, ph) ==
2147 TP_STATUS_AVAILABLE) {
2148 /* skb was destructed already */
2149 skb = NULL;
2150 goto out_status;
2151 }
2152 /*
2153 * skb was dropped but not destructed yet;
2154 * let's treat it like congestion or err < 0
2155 */
2156 err = 0;
2157 }
69e3c75f
JB
2158 packet_increment_head(&po->tx_ring);
2159 len_sum += tp_len;
f64f9e71
JP
2160 } while (likely((ph != NULL) ||
2161 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2162 (atomic_read(&po->tx_ring.pending))))
2163 );
69e3c75f
JB
2164
2165 err = len_sum;
2166 goto out_put;
2167
69e3c75f
JB
2168out_status:
2169 __packet_set_status(po, ph, status);
2170 kfree_skb(skb);
2171out_put:
827d9780
BG
2172 if (need_rls_dev)
2173 dev_put(dev);
69e3c75f
JB
2174out:
2175 mutex_unlock(&po->pg_vec_lock);
2176 return err;
2177}
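/*
 * tpacket_snd() above walks the TX ring and transmits every frame that
 * user space has marked TP_STATUS_SEND_REQUEST.  A hedged sketch of
 * filling one TPACKET_V2 TX-ring frame and kicking the send path; the
 * payload and the helper name are assumptions for illustration.
 */
#include <string.h>
#include <stdint.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_ring_send_one(int fd, uint8_t *frame, const void *pkt,
			    unsigned int len)
{
	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)frame;
	/* payload offset mirrors tpacket_fill_skb() when tp_tx_has_off is unset */
	uint8_t *data = frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* slot still owned by the kernel */

	memcpy(data, pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* send() with a NULL buffer triggers tpacket_snd() via packet_sendmsg() */
	return (int)send(fd, NULL, 0, MSG_DONTWAIT);
}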
69e3c75f 2178
eea49cc9
OJ
2179static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2180 size_t reserve, size_t len,
2181 size_t linear, int noblock,
2182 int *err)
bfd5f4a3
SS
2183{
2184 struct sk_buff *skb;
2185
2186 /* Under a page? Don't bother with paged skb. */
2187 if (prepad + len < PAGE_SIZE || !linear)
2188 linear = len;
2189
2190 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2191 err);
2192 if (!skb)
2193 return NULL;
2194
2195 skb_reserve(skb, reserve);
2196 skb_put(skb, linear);
2197 skb->data_len = len - linear;
2198 skb->len += len - linear;
2199
2200 return skb;
2201}
2202
69e3c75f 2203static int packet_snd(struct socket *sock,
1da177e4
LT
2204 struct msghdr *msg, size_t len)
2205{
2206 struct sock *sk = sock->sk;
40d4e3df 2207 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2208 struct sk_buff *skb;
2209 struct net_device *dev;
0e11c91e 2210 __be16 proto;
827d9780 2211 bool need_rls_dev = false;
1da177e4 2212 unsigned char *addr;
827d9780 2213 int err, reserve = 0;
bfd5f4a3
SS
2214 struct virtio_net_hdr vnet_hdr = { 0 };
2215 int offset = 0;
2216 int vnet_hdr_len;
2217 struct packet_sock *po = pkt_sk(sk);
2218 unsigned short gso_type = 0;
ae641949 2219 int hlen, tlen;
3bdc0eba 2220 int extra_len = 0;
1da177e4
LT
2221
2222 /*
1ce4f28b 2223 * Get and verify the address.
1da177e4 2224 */
1ce4f28b 2225
1da177e4 2226 if (saddr == NULL) {
827d9780 2227 dev = po->prot_hook.dev;
1da177e4
LT
2228 proto = po->num;
2229 addr = NULL;
2230 } else {
2231 err = -EINVAL;
2232 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2233 goto out;
0fb375fb
EB
2234 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2235 goto out;
1da177e4
LT
2236 proto = saddr->sll_protocol;
2237 addr = saddr->sll_addr;
827d9780
BG
2238 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2239 need_rls_dev = true;
1da177e4
LT
2240 }
2241
1da177e4
LT
2242 err = -ENXIO;
2243 if (dev == NULL)
2244 goto out_unlock;
2245 if (sock->type == SOCK_RAW)
2246 reserve = dev->hard_header_len;
2247
d5e76b0a
DM
2248 err = -ENETDOWN;
2249 if (!(dev->flags & IFF_UP))
2250 goto out_unlock;
2251
bfd5f4a3
SS
2252 if (po->has_vnet_hdr) {
2253 vnet_hdr_len = sizeof(vnet_hdr);
2254
2255 err = -EINVAL;
2256 if (len < vnet_hdr_len)
2257 goto out_unlock;
2258
2259 len -= vnet_hdr_len;
2260
2261 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2262 vnet_hdr_len);
2263 if (err < 0)
2264 goto out_unlock;
2265
2266 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2267 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2268 vnet_hdr.hdr_len))
2269 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2270 vnet_hdr.csum_offset + 2;
2271
2272 err = -EINVAL;
2273 if (vnet_hdr.hdr_len > len)
2274 goto out_unlock;
2275
2276 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2277 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2278 case VIRTIO_NET_HDR_GSO_TCPV4:
2279 gso_type = SKB_GSO_TCPV4;
2280 break;
2281 case VIRTIO_NET_HDR_GSO_TCPV6:
2282 gso_type = SKB_GSO_TCPV6;
2283 break;
2284 case VIRTIO_NET_HDR_GSO_UDP:
2285 gso_type = SKB_GSO_UDP;
2286 break;
2287 default:
2288 goto out_unlock;
2289 }
2290
2291 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2292 gso_type |= SKB_GSO_TCP_ECN;
2293
2294 if (vnet_hdr.gso_size == 0)
2295 goto out_unlock;
2296
2297 }
2298 }
2299
3bdc0eba
BG
2300 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2301 if (!netif_supports_nofcs(dev)) {
2302 err = -EPROTONOSUPPORT;
2303 goto out_unlock;
2304 }
2305 extra_len = 4; /* We're doing our own CRC */
2306 }
2307
1da177e4 2308 err = -EMSGSIZE;
3bdc0eba 2309 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2310 goto out_unlock;
2311
bfd5f4a3 2312 err = -ENOBUFS;
ae641949
HX
2313 hlen = LL_RESERVED_SPACE(dev);
2314 tlen = dev->needed_tailroom;
2315 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2316 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2317 if (skb == NULL)
1da177e4
LT
2318 goto out_unlock;
2319
bfd5f4a3 2320 skb_set_network_header(skb, reserve);
1da177e4 2321
0c4e8581
SH
2322 err = -EINVAL;
2323 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2324 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2325 goto out_free;
1da177e4
LT
2326
2327 /* Returns -EFAULT on error */
bfd5f4a3 2328 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2329 if (err)
2330 goto out_free;
bf84a010
DB
2331
2332 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2333
0f75b09c
PS
2334 if (dev->type == ARPHRD_ETHER) {
2335 skb->protocol = eth_type_trans(skb, dev);
c483e026
PS
2336 if (skb->protocol == htons(ETH_P_8021Q))
2337 reserve += VLAN_HLEN;
0f75b09c
PS
2338 } else {
2339 skb->protocol = proto;
2340 skb->dev = dev;
2341 }
2342
3bdc0eba 2343 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
c483e026
PS
2344 err = -EMSGSIZE;
2345 goto out_free;
57f89bfa
BG
2346 }
2347
1da177e4 2348 skb->priority = sk->sk_priority;
2d37a186 2349 skb->mark = sk->sk_mark;
1da177e4 2350
bfd5f4a3
SS
2351 if (po->has_vnet_hdr) {
2352 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2353 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2354 vnet_hdr.csum_offset)) {
2355 err = -EINVAL;
2356 goto out_free;
2357 }
2358 }
2359
2360 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2361 skb_shinfo(skb)->gso_type = gso_type;
2362
2363 /* Header must be checked, and gso_segs computed. */
2364 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2365 skb_shinfo(skb)->gso_segs = 0;
2366
2367 len += vnet_hdr_len;
2368 }
2369
40893fd0 2370 skb_probe_transport_header(skb, reserve);
c1aad275 2371
3bdc0eba
BG
2372 if (unlikely(extra_len == 4))
2373 skb->no_fcs = 1;
2374
1da177e4
LT
2375 /*
2376 * Now send it
2377 */
2378
2379 err = dev_queue_xmit(skb);
2380 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2381 goto out_unlock;
2382
827d9780
BG
2383 if (need_rls_dev)
2384 dev_put(dev);
1da177e4 2385
40d4e3df 2386 return len;
1da177e4
LT
2387
2388out_free:
2389 kfree_skb(skb);
2390out_unlock:
827d9780 2391 if (dev && need_rls_dev)
1da177e4
LT
2392 dev_put(dev);
2393out:
2394 return err;
2395}
2396
69e3c75f
JB
2397static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2398 struct msghdr *msg, size_t len)
2399{
69e3c75f
JB
2400 struct sock *sk = sock->sk;
2401 struct packet_sock *po = pkt_sk(sk);
2402 if (po->tx_ring.pg_vec)
2403 return tpacket_snd(po, msg);
2404 else
69e3c75f
JB
2405 return packet_snd(sock, msg, len);
2406}
2407
1da177e4
LT
2408/*
2409 * Close a PACKET socket. This is fairly simple. We immediately go
2410 * to 'closed' state and remove our protocol entry in the device list.
2411 */
2412
2413static int packet_release(struct socket *sock)
2414{
2415 struct sock *sk = sock->sk;
2416 struct packet_sock *po;
d12d01d6 2417 struct net *net;
f6fb8f10 2418 union tpacket_req_u req_u;
1da177e4
LT
2419
2420 if (!sk)
2421 return 0;
2422
3b1e0a65 2423 net = sock_net(sk);
1da177e4
LT
2424 po = pkt_sk(sk);
2425
0fa7fa98 2426 mutex_lock(&net->packet.sklist_lock);
808f5114 2427 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2428 mutex_unlock(&net->packet.sklist_lock);
2429
2430 preempt_disable();
920de804 2431 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2432 preempt_enable();
1da177e4 2433
808f5114 2434 spin_lock(&po->bind_lock);
ce06b03e 2435 unregister_prot_hook(sk, false);
160ff18a
BG
2436 if (po->prot_hook.dev) {
2437 dev_put(po->prot_hook.dev);
2438 po->prot_hook.dev = NULL;
2439 }
808f5114 2440 spin_unlock(&po->bind_lock);
1da177e4 2441
1da177e4 2442 packet_flush_mclist(sk);
1da177e4 2443
9665d5d6
PS
2444 if (po->rx_ring.pg_vec) {
2445 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2446 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2447 }
69e3c75f 2448
9665d5d6
PS
2449 if (po->tx_ring.pg_vec) {
2450 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2451 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2452 }
1da177e4 2453
dc99f600
DM
2454 fanout_release(sk);
2455
808f5114 2456 synchronize_net();
1da177e4
LT
2457 /*
2458 * Now the socket is dead. No more input will appear.
2459 */
1da177e4
LT
2460 sock_orphan(sk);
2461 sock->sk = NULL;
2462
2463 /* Purge queues */
2464
2465 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2466 sk_refcnt_debug_release(sk);
1da177e4
LT
2467
2468 sock_put(sk);
2469 return 0;
2470}
2471
2472/*
2473 * Attach a packet hook.
2474 */
2475
0e11c91e 2476static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2477{
2478 struct packet_sock *po = pkt_sk(sk);
dc99f600 2479
aef950b4
WY
2480 if (po->fanout) {
2481 if (dev)
2482 dev_put(dev);
2483
dc99f600 2484 return -EINVAL;
aef950b4 2485 }
1da177e4
LT
2486
2487 lock_sock(sk);
2488
2489 spin_lock(&po->bind_lock);
ce06b03e 2490 unregister_prot_hook(sk, true);
1da177e4
LT
2491 po->num = protocol;
2492 po->prot_hook.type = protocol;
160ff18a
BG
2493 if (po->prot_hook.dev)
2494 dev_put(po->prot_hook.dev);
1da177e4
LT
2495 po->prot_hook.dev = dev;
2496
2497 po->ifindex = dev ? dev->ifindex : 0;
2498
2499 if (protocol == 0)
2500 goto out_unlock;
2501
be85d4ad 2502 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2503 register_prot_hook(sk);
be85d4ad
UT
2504 } else {
2505 sk->sk_err = ENETDOWN;
2506 if (!sock_flag(sk, SOCK_DEAD))
2507 sk->sk_error_report(sk);
1da177e4
LT
2508 }
2509
2510out_unlock:
2511 spin_unlock(&po->bind_lock);
2512 release_sock(sk);
2513 return 0;
2514}
2515
2516/*
2517 * Bind a packet socket to a device
2518 */
2519
40d4e3df
ED
2520static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2521 int addr_len)
1da177e4 2522{
40d4e3df 2523 struct sock *sk = sock->sk;
1da177e4
LT
2524 char name[15];
2525 struct net_device *dev;
2526 int err = -ENODEV;
1ce4f28b 2527
1da177e4
LT
2528 /*
2529 * Check legality
2530 */
1ce4f28b 2531
8ae55f04 2532 if (addr_len != sizeof(struct sockaddr))
1da177e4 2533 return -EINVAL;
40d4e3df 2534 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2535
3b1e0a65 2536 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2537 if (dev)
1da177e4 2538 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2539 return err;
2540}
1da177e4
LT
2541
2542static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2543{
40d4e3df
ED
2544 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2545 struct sock *sk = sock->sk;
1da177e4
LT
2546 struct net_device *dev = NULL;
2547 int err;
2548
2549
2550 /*
2551 * Check legality
2552 */
1ce4f28b 2553
1da177e4
LT
2554 if (addr_len < sizeof(struct sockaddr_ll))
2555 return -EINVAL;
2556 if (sll->sll_family != AF_PACKET)
2557 return -EINVAL;
2558
2559 if (sll->sll_ifindex) {
2560 err = -ENODEV;
3b1e0a65 2561 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2562 if (dev == NULL)
2563 goto out;
2564 }
2565 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2566
2567out:
2568 return err;
2569}
2570
2571static struct proto packet_proto = {
2572 .name = "PACKET",
2573 .owner = THIS_MODULE,
2574 .obj_size = sizeof(struct packet_sock),
2575};
2576
2577/*
1ce4f28b 2578 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2579 */
2580
3f378b68
EP
2581static int packet_create(struct net *net, struct socket *sock, int protocol,
2582 int kern)
1da177e4
LT
2583{
2584 struct sock *sk;
2585 struct packet_sock *po;
0e11c91e 2586 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2587 int err;
2588
df008c91 2589 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2590 return -EPERM;
be02097c
DM
2591 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2592 sock->type != SOCK_PACKET)
1da177e4
LT
2593 return -ESOCKTNOSUPPORT;
2594
2595 sock->state = SS_UNCONNECTED;
2596
2597 err = -ENOBUFS;
6257ff21 2598 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2599 if (sk == NULL)
2600 goto out;
2601
2602 sock->ops = &packet_ops;
1da177e4
LT
2603 if (sock->type == SOCK_PACKET)
2604 sock->ops = &packet_ops_spkt;
be02097c 2605
1da177e4
LT
2606 sock_init_data(sock, sk);
2607
2608 po = pkt_sk(sk);
2609 sk->sk_family = PF_PACKET;
0e11c91e 2610 po->num = proto;
1da177e4
LT
2611
2612 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2613 sk_refcnt_debug_inc(sk);
1da177e4
LT
2614
2615 /*
2616 * Attach a protocol block
2617 */
2618
2619 spin_lock_init(&po->bind_lock);
905db440 2620 mutex_init(&po->pg_vec_lock);
1da177e4 2621 po->prot_hook.func = packet_rcv;
be02097c 2622
1da177e4
LT
2623 if (sock->type == SOCK_PACKET)
2624 po->prot_hook.func = packet_rcv_spkt;
be02097c 2625
1da177e4
LT
2626 po->prot_hook.af_packet_priv = sk;
2627
0e11c91e
AV
2628 if (proto) {
2629 po->prot_hook.type = proto;
ce06b03e 2630 register_prot_hook(sk);
1da177e4
LT
2631 }
2632
0fa7fa98 2633 mutex_lock(&net->packet.sklist_lock);
808f5114 2634 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2635 mutex_unlock(&net->packet.sklist_lock);
2636
2637 preempt_disable();
3680453c 2638 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2639 preempt_enable();
808f5114 2640
40d4e3df 2641 return 0;
1da177e4
LT
2642out:
2643 return err;
2644}
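/*
 * packet_create() above is what a socket(AF_PACKET, ...) call ends up in,
 * and a subsequent bind(2) lands in packet_bind() just above it.  A minimal
 * user-space sketch; the interface name is an assumption and the caller
 * needs CAP_NET_RAW.
 */
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int open_bound_packet_socket(const char *ifname)	/* e.g. "eth0" */
{
	struct sockaddr_ll sll;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return -1;
	return fd;
}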
2645
2646/*
2647 * Pull a packet from our receive queue and hand it to the user.
2648 * If necessary we block.
2649 */
2650
2651static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2652 struct msghdr *msg, size_t len, int flags)
2653{
2654 struct sock *sk = sock->sk;
2655 struct sk_buff *skb;
2656 int copied, err;
0fb375fb 2657 struct sockaddr_ll *sll;
bfd5f4a3 2658 int vnet_hdr_len = 0;
1da177e4
LT
2659
2660 err = -EINVAL;
ed85b565 2661 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2662 goto out;
2663
2664#if 0
2665 /* What error should we return now? EUNATTACH? */
2666 if (pkt_sk(sk)->ifindex < 0)
2667 return -ENODEV;
2668#endif
2669
ed85b565 2670 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2671 err = sock_recv_errqueue(sk, msg, len,
2672 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2673 goto out;
2674 }
2675
1da177e4
LT
2676 /*
2677 * Call the generic datagram receiver. This handles all sorts
2678 * of horrible races and re-entrancy so we can forget about it
2679 * in the protocol layers.
2680 *
 2681 * Now it will return ENETDOWN if the device has just gone down,
2682 * but then it will block.
2683 */
2684
40d4e3df 2685 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2686
2687 /*
1ce4f28b 2688 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 2689 * handles the blocking, we don't have to see or worry about
 2690 * blocking retries.
2691 */
2692
8ae55f04 2693 if (skb == NULL)
1da177e4
LT
2694 goto out;
2695
bfd5f4a3
SS
2696 if (pkt_sk(sk)->has_vnet_hdr) {
2697 struct virtio_net_hdr vnet_hdr = { 0 };
2698
2699 err = -EINVAL;
2700 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2701 if (len < vnet_hdr_len)
bfd5f4a3
SS
2702 goto out_free;
2703
1f18b717
MK
2704 len -= vnet_hdr_len;
2705
bfd5f4a3
SS
2706 if (skb_is_gso(skb)) {
2707 struct skb_shared_info *sinfo = skb_shinfo(skb);
2708
2709 /* This is a hint as to how much should be linear. */
2710 vnet_hdr.hdr_len = skb_headlen(skb);
2711 vnet_hdr.gso_size = sinfo->gso_size;
2712 if (sinfo->gso_type & SKB_GSO_TCPV4)
2713 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2714 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2715 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2716 else if (sinfo->gso_type & SKB_GSO_UDP)
2717 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2718 else if (sinfo->gso_type & SKB_GSO_FCOE)
2719 goto out_free;
2720 else
2721 BUG();
2722 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2723 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2724 } else
2725 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2726
2727 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2728 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2729 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2730 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2731 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2732 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2733 } /* else everything is zero */
2734
2735 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2736 vnet_hdr_len);
2737 if (err < 0)
2738 goto out_free;
2739 }
2740
0fb375fb
EB
2741 /*
2742 * If the address length field is there to be filled in, we fill
2743 * it in now.
2744 */
2745
ffbc6111 2746 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2747 if (sock->type == SOCK_PACKET)
2748 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2749 else
2750 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2751
1da177e4
LT
2752 /*
2753 * You lose any data beyond the buffer you gave. If it worries a
 2754 * user program, it can ask the device for its MTU anyway.
2755 */
2756
2757 copied = skb->len;
40d4e3df
ED
2758 if (copied > len) {
2759 copied = len;
2760 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2761 }
2762
2763 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2764 if (err)
2765 goto out_free;
2766
3b885787 2767 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2768
2769 if (msg->msg_name)
ffbc6111
HX
2770 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2771 msg->msg_namelen);
1da177e4 2772
8dc41944 2773 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2774 struct tpacket_auxdata aux;
2775
2776 aux.tp_status = TP_STATUS_USER;
2777 if (skb->ip_summed == CHECKSUM_PARTIAL)
2778 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2779 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2780 aux.tp_snaplen = skb->len;
2781 aux.tp_mac = 0;
bbe735e4 2782 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2783 if (vlan_tx_tag_present(skb)) {
2784 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2785 aux.tp_status |= TP_STATUS_VLAN_VALID;
2786 } else {
2787 aux.tp_vlan_tci = 0;
2788 }
13fcb7bd 2789 aux.tp_padding = 0;
ffbc6111 2790 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2791 }
2792
1da177e4
LT
2793 /*
2794 * Free or return the buffer as appropriate. Again this
2795 * hides all the races and re-entrancy issues from us.
2796 */
bfd5f4a3 2797 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2798
2799out_free:
2800 skb_free_datagram(sk, skb);
2801out:
2802 return err;
2803}
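/*
 * When the PACKET_AUXDATA option is set, packet_recvmsg() above attaches
 * a struct tpacket_auxdata control message to every datagram.  A sketch
 * of reading it from user space; buffer sizes are illustrative.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static void recv_one_with_auxdata(int fd)
{
	char data[2048];
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci, ...
			 * as filled in by packet_recvmsg() above
			 */
		}
	}
}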
2804
1da177e4
LT
2805static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2806 int *uaddr_len, int peer)
2807{
2808 struct net_device *dev;
2809 struct sock *sk = sock->sk;
2810
2811 if (peer)
2812 return -EOPNOTSUPP;
2813
2814 uaddr->sa_family = AF_PACKET;
2dc85bf3 2815 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2816 rcu_read_lock();
2817 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2818 if (dev)
2dc85bf3 2819 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2820 rcu_read_unlock();
1da177e4
LT
2821 *uaddr_len = sizeof(*uaddr);
2822
2823 return 0;
2824}
1da177e4
LT
2825
2826static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2827 int *uaddr_len, int peer)
2828{
2829 struct net_device *dev;
2830 struct sock *sk = sock->sk;
2831 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2832 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2833
2834 if (peer)
2835 return -EOPNOTSUPP;
2836
2837 sll->sll_family = AF_PACKET;
2838 sll->sll_ifindex = po->ifindex;
2839 sll->sll_protocol = po->num;
67286640 2840 sll->sll_pkttype = 0;
654d1f8a
ED
2841 rcu_read_lock();
2842 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2843 if (dev) {
2844 sll->sll_hatype = dev->type;
2845 sll->sll_halen = dev->addr_len;
2846 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2847 } else {
2848 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2849 sll->sll_halen = 0;
2850 }
654d1f8a 2851 rcu_read_unlock();
0fb375fb 2852 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2853
2854 return 0;
2855}
2856
2aeb0b88
WC
2857static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2858 int what)
1da177e4
LT
2859{
2860 switch (i->type) {
2861 case PACKET_MR_MULTICAST:
1162563f
JP
2862 if (i->alen != dev->addr_len)
2863 return -EINVAL;
1da177e4 2864 if (what > 0)
22bedad3 2865 return dev_mc_add(dev, i->addr);
1da177e4 2866 else
22bedad3 2867 return dev_mc_del(dev, i->addr);
1da177e4
LT
2868 break;
2869 case PACKET_MR_PROMISC:
2aeb0b88 2870 return dev_set_promiscuity(dev, what);
1da177e4
LT
2871 break;
2872 case PACKET_MR_ALLMULTI:
2aeb0b88 2873 return dev_set_allmulti(dev, what);
1da177e4 2874 break;
d95ed927 2875 case PACKET_MR_UNICAST:
1162563f
JP
2876 if (i->alen != dev->addr_len)
2877 return -EINVAL;
d95ed927 2878 if (what > 0)
a748ee24 2879 return dev_uc_add(dev, i->addr);
d95ed927 2880 else
a748ee24 2881 return dev_uc_del(dev, i->addr);
d95ed927 2882 break;
40d4e3df
ED
2883 default:
2884 break;
1da177e4 2885 }
2aeb0b88 2886 return 0;
1da177e4
LT
2887}
2888
2889static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2890{
40d4e3df 2891 for ( ; i; i = i->next) {
1da177e4
LT
2892 if (i->ifindex == dev->ifindex)
2893 packet_dev_mc(dev, i, what);
2894 }
2895}
2896
0fb375fb 2897static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2898{
2899 struct packet_sock *po = pkt_sk(sk);
2900 struct packet_mclist *ml, *i;
2901 struct net_device *dev;
2902 int err;
2903
2904 rtnl_lock();
2905
2906 err = -ENODEV;
3b1e0a65 2907 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2908 if (!dev)
2909 goto done;
2910
2911 err = -EINVAL;
1162563f 2912 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2913 goto done;
2914
2915 err = -ENOBUFS;
8b3a7005 2916 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2917 if (i == NULL)
2918 goto done;
2919
2920 err = 0;
2921 for (ml = po->mclist; ml; ml = ml->next) {
2922 if (ml->ifindex == mreq->mr_ifindex &&
2923 ml->type == mreq->mr_type &&
2924 ml->alen == mreq->mr_alen &&
2925 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2926 ml->count++;
2927 /* Free the new element ... */
2928 kfree(i);
2929 goto done;
2930 }
2931 }
2932
2933 i->type = mreq->mr_type;
2934 i->ifindex = mreq->mr_ifindex;
2935 i->alen = mreq->mr_alen;
2936 memcpy(i->addr, mreq->mr_address, i->alen);
2937 i->count = 1;
2938 i->next = po->mclist;
2939 po->mclist = i;
2aeb0b88
WC
2940 err = packet_dev_mc(dev, i, 1);
2941 if (err) {
2942 po->mclist = i->next;
2943 kfree(i);
2944 }
1da177e4
LT
2945
2946done:
2947 rtnl_unlock();
2948 return err;
2949}
2950
0fb375fb 2951static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2952{
2953 struct packet_mclist *ml, **mlp;
2954
2955 rtnl_lock();
2956
2957 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2958 if (ml->ifindex == mreq->mr_ifindex &&
2959 ml->type == mreq->mr_type &&
2960 ml->alen == mreq->mr_alen &&
2961 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2962 if (--ml->count == 0) {
2963 struct net_device *dev;
2964 *mlp = ml->next;
ad959e76
ED
2965 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2966 if (dev)
1da177e4 2967 packet_dev_mc(dev, ml, -1);
1da177e4
LT
2968 kfree(ml);
2969 }
2970 rtnl_unlock();
2971 return 0;
2972 }
2973 }
2974 rtnl_unlock();
2975 return -EADDRNOTAVAIL;
2976}
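/*
 * packet_mc_add()/packet_mc_drop() above back the PACKET_ADD_MEMBERSHIP
 * and PACKET_DROP_MEMBERSHIP socket options.  A sketch of toggling
 * promiscuous mode through this path; the ifindex is assumed to come
 * from if_nametoindex().
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int packet_set_promisc(int fd, int ifindex, int on)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type    = PACKET_MR_PROMISC;	/* ends up in dev_set_promiscuity() */

	return setsockopt(fd, SOL_PACKET,
			  on ? PACKET_ADD_MEMBERSHIP : PACKET_DROP_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}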
2977
2978static void packet_flush_mclist(struct sock *sk)
2979{
2980 struct packet_sock *po = pkt_sk(sk);
2981 struct packet_mclist *ml;
2982
2983 if (!po->mclist)
2984 return;
2985
2986 rtnl_lock();
2987 while ((ml = po->mclist) != NULL) {
2988 struct net_device *dev;
2989
2990 po->mclist = ml->next;
ad959e76
ED
2991 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2992 if (dev != NULL)
1da177e4 2993 packet_dev_mc(dev, ml, -1);
1da177e4
LT
2994 kfree(ml);
2995 }
2996 rtnl_unlock();
2997}
1da177e4
LT
2998
2999static int
b7058842 3000packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3001{
3002 struct sock *sk = sock->sk;
8dc41944 3003 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3004 int ret;
3005
3006 if (level != SOL_PACKET)
3007 return -ENOPROTOOPT;
3008
69e3c75f 3009 switch (optname) {
1ce4f28b 3010 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3011 case PACKET_DROP_MEMBERSHIP:
3012 {
0fb375fb
EB
3013 struct packet_mreq_max mreq;
3014 int len = optlen;
3015 memset(&mreq, 0, sizeof(mreq));
3016 if (len < sizeof(struct packet_mreq))
1da177e4 3017 return -EINVAL;
0fb375fb
EB
3018 if (len > sizeof(mreq))
3019 len = sizeof(mreq);
40d4e3df 3020 if (copy_from_user(&mreq, optval, len))
1da177e4 3021 return -EFAULT;
0fb375fb
EB
3022 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3023 return -EINVAL;
1da177e4
LT
3024 if (optname == PACKET_ADD_MEMBERSHIP)
3025 ret = packet_mc_add(sk, &mreq);
3026 else
3027 ret = packet_mc_drop(sk, &mreq);
3028 return ret;
3029 }
a2efcfa0 3030
1da177e4 3031 case PACKET_RX_RING:
69e3c75f 3032 case PACKET_TX_RING:
1da177e4 3033 {
f6fb8f10 3034 union tpacket_req_u req_u;
3035 int len;
1da177e4 3036
f6fb8f10 3037 switch (po->tp_version) {
3038 case TPACKET_V1:
3039 case TPACKET_V2:
3040 len = sizeof(req_u.req);
3041 break;
3042 case TPACKET_V3:
3043 default:
3044 len = sizeof(req_u.req3);
3045 break;
3046 }
3047 if (optlen < len)
1da177e4 3048 return -EINVAL;
bfd5f4a3
SS
3049 if (pkt_sk(sk)->has_vnet_hdr)
3050 return -EINVAL;
f6fb8f10 3051 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3052 return -EFAULT;
f6fb8f10 3053 return packet_set_ring(sk, &req_u, 0,
3054 optname == PACKET_TX_RING);
1da177e4
LT
3055 }
3056 case PACKET_COPY_THRESH:
3057 {
3058 int val;
3059
40d4e3df 3060 if (optlen != sizeof(val))
1da177e4 3061 return -EINVAL;
40d4e3df 3062 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3063 return -EFAULT;
3064
3065 pkt_sk(sk)->copy_thresh = val;
3066 return 0;
3067 }
bbd6ef87
PM
3068 case PACKET_VERSION:
3069 {
3070 int val;
3071
3072 if (optlen != sizeof(val))
3073 return -EINVAL;
69e3c75f 3074 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3075 return -EBUSY;
3076 if (copy_from_user(&val, optval, sizeof(val)))
3077 return -EFAULT;
3078 switch (val) {
3079 case TPACKET_V1:
3080 case TPACKET_V2:
f6fb8f10 3081 case TPACKET_V3:
bbd6ef87
PM
3082 po->tp_version = val;
3083 return 0;
3084 default:
3085 return -EINVAL;
3086 }
3087 }
8913336a
PM
3088 case PACKET_RESERVE:
3089 {
3090 unsigned int val;
3091
3092 if (optlen != sizeof(val))
3093 return -EINVAL;
69e3c75f 3094 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3095 return -EBUSY;
3096 if (copy_from_user(&val, optval, sizeof(val)))
3097 return -EFAULT;
3098 po->tp_reserve = val;
3099 return 0;
3100 }
69e3c75f
JB
3101 case PACKET_LOSS:
3102 {
3103 unsigned int val;
3104
3105 if (optlen != sizeof(val))
3106 return -EINVAL;
3107 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3108 return -EBUSY;
3109 if (copy_from_user(&val, optval, sizeof(val)))
3110 return -EFAULT;
3111 po->tp_loss = !!val;
3112 return 0;
3113 }
8dc41944
HX
3114 case PACKET_AUXDATA:
3115 {
3116 int val;
3117
3118 if (optlen < sizeof(val))
3119 return -EINVAL;
3120 if (copy_from_user(&val, optval, sizeof(val)))
3121 return -EFAULT;
3122
3123 po->auxdata = !!val;
3124 return 0;
3125 }
80feaacb
PWJ
3126 case PACKET_ORIGDEV:
3127 {
3128 int val;
3129
3130 if (optlen < sizeof(val))
3131 return -EINVAL;
3132 if (copy_from_user(&val, optval, sizeof(val)))
3133 return -EFAULT;
3134
3135 po->origdev = !!val;
3136 return 0;
3137 }
bfd5f4a3
SS
3138 case PACKET_VNET_HDR:
3139 {
3140 int val;
3141
3142 if (sock->type != SOCK_RAW)
3143 return -EINVAL;
3144 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3145 return -EBUSY;
3146 if (optlen < sizeof(val))
3147 return -EINVAL;
3148 if (copy_from_user(&val, optval, sizeof(val)))
3149 return -EFAULT;
3150
3151 po->has_vnet_hdr = !!val;
3152 return 0;
3153 }
614f60fa
SM
3154 case PACKET_TIMESTAMP:
3155 {
3156 int val;
3157
3158 if (optlen != sizeof(val))
3159 return -EINVAL;
3160 if (copy_from_user(&val, optval, sizeof(val)))
3161 return -EFAULT;
3162
3163 po->tp_tstamp = val;
3164 return 0;
3165 }
dc99f600
DM
3166 case PACKET_FANOUT:
3167 {
3168 int val;
3169
3170 if (optlen != sizeof(val))
3171 return -EINVAL;
3172 if (copy_from_user(&val, optval, sizeof(val)))
3173 return -EFAULT;
3174
3175 return fanout_add(sk, val & 0xffff, val >> 16);
3176 }
5920cd3a
PC
3177 case PACKET_TX_HAS_OFF:
3178 {
3179 unsigned int val;
3180
3181 if (optlen != sizeof(val))
3182 return -EINVAL;
3183 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3184 return -EBUSY;
3185 if (copy_from_user(&val, optval, sizeof(val)))
3186 return -EFAULT;
3187 po->tp_tx_has_off = !!val;
3188 return 0;
3189 }
1da177e4
LT
3190 default:
3191 return -ENOPROTOOPT;
3192 }
3193}
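/*
 * The PACKET_VERSION and PACKET_RX_RING cases above are normally used
 * together before mmap()ing the ring.  A minimal TPACKET_V2 setup sketch;
 * the block/frame geometry below is an assumption chosen to satisfy the
 * checks in packet_set_ring() (block size a multiple of PAGE_SIZE,
 * tp_frame_nr equal to frames-per-block times tp_block_nr).
 */
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *setup_rx_ring(int fd, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	req->tp_block_size = 4096;	/* one page per block (assumes 4K pages) */
	req->tp_frame_size = 2048;	/* two frames per block */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = 128;	/* frames_per_block * tp_block_nr */

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version,
		       sizeof(version)) < 0)
		return MAP_FAILED;
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
		return MAP_FAILED;

	/* the mapping itself is served by packet_mmap() further down */
	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}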
3194
3195static int packet_getsockopt(struct socket *sock, int level, int optname,
3196 char __user *optval, int __user *optlen)
3197{
3198 int len;
c06fff6e 3199 int val, lv = sizeof(val);
1da177e4
LT
3200 struct sock *sk = sock->sk;
3201 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3202 void *data = &val;
ee80fbf3 3203 union tpacket_stats_u st;
1da177e4
LT
3204
3205 if (level != SOL_PACKET)
3206 return -ENOPROTOOPT;
3207
8ae55f04
KK
3208 if (get_user(len, optlen))
3209 return -EFAULT;
1da177e4
LT
3210
3211 if (len < 0)
3212 return -EINVAL;
1ce4f28b 3213
69e3c75f 3214 switch (optname) {
1da177e4 3215 case PACKET_STATISTICS:
1da177e4 3216 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3217 memcpy(&st, &po->stats, sizeof(st));
3218 memset(&po->stats, 0, sizeof(po->stats));
3219 spin_unlock_bh(&sk->sk_receive_queue.lock);
3220
f6fb8f10 3221 if (po->tp_version == TPACKET_V3) {
c06fff6e 3222 lv = sizeof(struct tpacket_stats_v3);
ee80fbf3 3223 data = &st.stats3;
f6fb8f10 3224 } else {
c06fff6e 3225 lv = sizeof(struct tpacket_stats);
ee80fbf3 3226 data = &st.stats1;
f6fb8f10 3227 }
ee80fbf3 3228
8dc41944
HX
3229 break;
3230 case PACKET_AUXDATA:
8dc41944 3231 val = po->auxdata;
80feaacb
PWJ
3232 break;
3233 case PACKET_ORIGDEV:
80feaacb 3234 val = po->origdev;
bfd5f4a3
SS
3235 break;
3236 case PACKET_VNET_HDR:
bfd5f4a3 3237 val = po->has_vnet_hdr;
1da177e4 3238 break;
bbd6ef87 3239 case PACKET_VERSION:
bbd6ef87 3240 val = po->tp_version;
bbd6ef87
PM
3241 break;
3242 case PACKET_HDRLEN:
3243 if (len > sizeof(int))
3244 len = sizeof(int);
3245 if (copy_from_user(&val, optval, len))
3246 return -EFAULT;
3247 switch (val) {
3248 case TPACKET_V1:
3249 val = sizeof(struct tpacket_hdr);
3250 break;
3251 case TPACKET_V2:
3252 val = sizeof(struct tpacket2_hdr);
3253 break;
f6fb8f10 3254 case TPACKET_V3:
3255 val = sizeof(struct tpacket3_hdr);
3256 break;
bbd6ef87
PM
3257 default:
3258 return -EINVAL;
3259 }
bbd6ef87 3260 break;
8913336a 3261 case PACKET_RESERVE:
8913336a 3262 val = po->tp_reserve;
8913336a 3263 break;
69e3c75f 3264 case PACKET_LOSS:
69e3c75f 3265 val = po->tp_loss;
69e3c75f 3266 break;
614f60fa 3267 case PACKET_TIMESTAMP:
614f60fa 3268 val = po->tp_tstamp;
614f60fa 3269 break;
dc99f600 3270 case PACKET_FANOUT:
dc99f600
DM
3271 val = (po->fanout ?
3272 ((u32)po->fanout->id |
77f65ebd
WB
3273 ((u32)po->fanout->type << 16) |
3274 ((u32)po->fanout->flags << 24)) :
dc99f600 3275 0);
dc99f600 3276 break;
5920cd3a
PC
3277 case PACKET_TX_HAS_OFF:
3278 val = po->tp_tx_has_off;
3279 break;
1da177e4
LT
3280 default:
3281 return -ENOPROTOOPT;
3282 }
3283
c06fff6e
ED
3284 if (len > lv)
3285 len = lv;
8ae55f04
KK
3286 if (put_user(len, optlen))
3287 return -EFAULT;
8dc41944
HX
3288 if (copy_to_user(optval, data, len))
3289 return -EFAULT;
8ae55f04 3290 return 0;
1da177e4
LT
3291}
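/*
 * PACKET_STATISTICS above is clear-on-read: the counters are zeroed under
 * the receive-queue lock while they are copied out.  A sketch of reading
 * them for a TPACKET_V1/V2 socket.
 */
#include <sys/socket.h>
#include <stdio.h>
#include <linux/if_packet.h>

static void packet_dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u (counters reset by this read)\n",
		       st.tp_packets, st.tp_drops);
}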
3292
3293
351638e7
JP
3294static int packet_notifier(struct notifier_block *this,
3295 unsigned long msg, void *ptr)
1da177e4
LT
3296{
3297 struct sock *sk;
351638e7 3298 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3299 struct net *net = dev_net(dev);
1da177e4 3300
808f5114 3301 rcu_read_lock();
b67bfe0d 3302 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3303 struct packet_sock *po = pkt_sk(sk);
3304
3305 switch (msg) {
3306 case NETDEV_UNREGISTER:
1da177e4
LT
3307 if (po->mclist)
3308 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3309 /* fallthrough */
3310
1da177e4
LT
3311 case NETDEV_DOWN:
3312 if (dev->ifindex == po->ifindex) {
3313 spin_lock(&po->bind_lock);
3314 if (po->running) {
ce06b03e 3315 __unregister_prot_hook(sk, false);
1da177e4
LT
3316 sk->sk_err = ENETDOWN;
3317 if (!sock_flag(sk, SOCK_DEAD))
3318 sk->sk_error_report(sk);
3319 }
3320 if (msg == NETDEV_UNREGISTER) {
3321 po->ifindex = -1;
160ff18a
BG
3322 if (po->prot_hook.dev)
3323 dev_put(po->prot_hook.dev);
1da177e4
LT
3324 po->prot_hook.dev = NULL;
3325 }
3326 spin_unlock(&po->bind_lock);
3327 }
3328 break;
3329 case NETDEV_UP:
808f5114 3330 if (dev->ifindex == po->ifindex) {
3331 spin_lock(&po->bind_lock);
ce06b03e
DM
3332 if (po->num)
3333 register_prot_hook(sk);
808f5114 3334 spin_unlock(&po->bind_lock);
1da177e4 3335 }
1da177e4
LT
3336 break;
3337 }
3338 }
808f5114 3339 rcu_read_unlock();
1da177e4
LT
3340 return NOTIFY_DONE;
3341}
3342
3343
3344static int packet_ioctl(struct socket *sock, unsigned int cmd,
3345 unsigned long arg)
3346{
3347 struct sock *sk = sock->sk;
3348
69e3c75f 3349 switch (cmd) {
40d4e3df
ED
3350 case SIOCOUTQ:
3351 {
3352 int amount = sk_wmem_alloc_get(sk);
31e6d363 3353
40d4e3df
ED
3354 return put_user(amount, (int __user *)arg);
3355 }
3356 case SIOCINQ:
3357 {
3358 struct sk_buff *skb;
3359 int amount = 0;
3360
3361 spin_lock_bh(&sk->sk_receive_queue.lock);
3362 skb = skb_peek(&sk->sk_receive_queue);
3363 if (skb)
3364 amount = skb->len;
3365 spin_unlock_bh(&sk->sk_receive_queue.lock);
3366 return put_user(amount, (int __user *)arg);
3367 }
3368 case SIOCGSTAMP:
3369 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3370 case SIOCGSTAMPNS:
3371 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3372
1da177e4 3373#ifdef CONFIG_INET
40d4e3df
ED
3374 case SIOCADDRT:
3375 case SIOCDELRT:
3376 case SIOCDARP:
3377 case SIOCGARP:
3378 case SIOCSARP:
3379 case SIOCGIFADDR:
3380 case SIOCSIFADDR:
3381 case SIOCGIFBRDADDR:
3382 case SIOCSIFBRDADDR:
3383 case SIOCGIFNETMASK:
3384 case SIOCSIFNETMASK:
3385 case SIOCGIFDSTADDR:
3386 case SIOCSIFDSTADDR:
3387 case SIOCSIFFLAGS:
40d4e3df 3388 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3389#endif
3390
40d4e3df
ED
3391 default:
3392 return -ENOIOCTLCMD;
1da177e4
LT
3393 }
3394 return 0;
3395}
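/*
 * packet_ioctl() above forwards SIOCGSTAMP/SIOCGSTAMPNS to the generic
 * socket timestamp helpers.  A sketch of fetching the receive time of the
 * last packet read from the socket.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>

static void print_last_rx_time(int fd)
{
	struct timeval tv;

	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)	/* sock_get_timestamp() */
		printf("last packet received at %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
}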
3396
40d4e3df 3397static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3398 poll_table *wait)
3399{
3400 struct sock *sk = sock->sk;
3401 struct packet_sock *po = pkt_sk(sk);
3402 unsigned int mask = datagram_poll(file, sock, wait);
3403
3404 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3405 if (po->rx_ring.pg_vec) {
f6fb8f10 3406 if (!packet_previous_rx_frame(po, &po->rx_ring,
3407 TP_STATUS_KERNEL))
1da177e4
LT
3408 mask |= POLLIN | POLLRDNORM;
3409 }
3410 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3411 spin_lock_bh(&sk->sk_write_queue.lock);
3412 if (po->tx_ring.pg_vec) {
3413 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3414 mask |= POLLOUT | POLLWRNORM;
3415 }
3416 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3417 return mask;
3418}
3419
3420
3421/* Dirty? Well, I still have not found a better way to account
3422 * for user mmaps.
3423 */
3424
3425static void packet_mm_open(struct vm_area_struct *vma)
3426{
3427 struct file *file = vma->vm_file;
40d4e3df 3428 struct socket *sock = file->private_data;
1da177e4 3429 struct sock *sk = sock->sk;
1ce4f28b 3430
1da177e4
LT
3431 if (sk)
3432 atomic_inc(&pkt_sk(sk)->mapped);
3433}
3434
3435static void packet_mm_close(struct vm_area_struct *vma)
3436{
3437 struct file *file = vma->vm_file;
40d4e3df 3438 struct socket *sock = file->private_data;
1da177e4 3439 struct sock *sk = sock->sk;
1ce4f28b 3440
1da177e4
LT
3441 if (sk)
3442 atomic_dec(&pkt_sk(sk)->mapped);
3443}
3444
f0f37e2f 3445static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3446 .open = packet_mm_open,
3447 .close = packet_mm_close,
1da177e4
LT
3448};
3449
0e3125c7
NH
3450static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3451 unsigned int len)
1da177e4
LT
3452{
3453 int i;
3454
4ebf0ae2 3455 for (i = 0; i < len; i++) {
0e3125c7 3456 if (likely(pg_vec[i].buffer)) {
c56b4d90 3457 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3458 vfree(pg_vec[i].buffer);
3459 else
3460 free_pages((unsigned long)pg_vec[i].buffer,
3461 order);
3462 pg_vec[i].buffer = NULL;
3463 }
1da177e4
LT
3464 }
3465 kfree(pg_vec);
3466}
3467
eea49cc9 3468static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3469{
0e3125c7
NH
3470 char *buffer = NULL;
3471 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3472 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3473
3474 buffer = (char *) __get_free_pages(gfp_flags, order);
3475
3476 if (buffer)
3477 return buffer;
3478
3479 /*
3480 * __get_free_pages failed, fall back to vmalloc
3481 */
bbce5a59 3482 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3483
0e3125c7
NH
3484 if (buffer)
3485 return buffer;
3486
3487 /*
 3488 * vmalloc failed, let's dig into swap here
3489 */
0e3125c7
NH
3490 gfp_flags &= ~__GFP_NORETRY;
3491 buffer = (char *)__get_free_pages(gfp_flags, order);
3492 if (buffer)
3493 return buffer;
3494
3495 /*
3496 * complete and utter failure
3497 */
3498 return NULL;
4ebf0ae2
DM
3499}
3500
0e3125c7 3501static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3502{
3503 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3504 struct pgv *pg_vec;
4ebf0ae2
DM
3505 int i;
3506
0e3125c7 3507 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3508 if (unlikely(!pg_vec))
3509 goto out;
3510
3511 for (i = 0; i < block_nr; i++) {
c56b4d90 3512 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3513 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3514 goto out_free_pgvec;
3515 }
3516
3517out:
3518 return pg_vec;
3519
3520out_free_pgvec:
3521 free_pg_vec(pg_vec, order, block_nr);
3522 pg_vec = NULL;
3523 goto out;
3524}
1da177e4 3525
f6fb8f10 3526static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3527 int closing, int tx_ring)
1da177e4 3528{
0e3125c7 3529 struct pgv *pg_vec = NULL;
1da177e4 3530 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3531 int was_running, order = 0;
69e3c75f
JB
3532 struct packet_ring_buffer *rb;
3533 struct sk_buff_head *rb_queue;
0e11c91e 3534 __be16 num;
f6fb8f10 3535 int err = -EINVAL;
3536 /* Added to avoid minimal code churn */
3537 struct tpacket_req *req = &req_u->req;
3538
3539 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3540 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3541 WARN(1, "Tx-ring is not supported.\n");
3542 goto out;
3543 }
1ce4f28b 3544
69e3c75f
JB
3545 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3546 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3547
69e3c75f
JB
3548 err = -EBUSY;
3549 if (!closing) {
3550 if (atomic_read(&po->mapped))
3551 goto out;
3552 if (atomic_read(&rb->pending))
3553 goto out;
3554 }
1da177e4 3555
69e3c75f
JB
3556 if (req->tp_block_nr) {
3557 /* Sanity tests and some calculations */
3558 err = -EBUSY;
3559 if (unlikely(rb->pg_vec))
3560 goto out;
1da177e4 3561
bbd6ef87
PM
3562 switch (po->tp_version) {
3563 case TPACKET_V1:
3564 po->tp_hdrlen = TPACKET_HDRLEN;
3565 break;
3566 case TPACKET_V2:
3567 po->tp_hdrlen = TPACKET2_HDRLEN;
3568 break;
f6fb8f10 3569 case TPACKET_V3:
3570 po->tp_hdrlen = TPACKET3_HDRLEN;
3571 break;
bbd6ef87
PM
3572 }
3573
69e3c75f 3574 err = -EINVAL;
4ebf0ae2 3575 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3576 goto out;
4ebf0ae2 3577 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3578 goto out;
8913336a 3579 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3580 po->tp_reserve))
3581 goto out;
4ebf0ae2 3582 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3583 goto out;
1da177e4 3584
69e3c75f
JB
3585 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3586 if (unlikely(rb->frames_per_block <= 0))
3587 goto out;
3588 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3589 req->tp_frame_nr))
3590 goto out;
1da177e4
LT
3591
3592 err = -ENOMEM;
4ebf0ae2
DM
3593 order = get_order(req->tp_block_size);
3594 pg_vec = alloc_pg_vec(req, order);
3595 if (unlikely(!pg_vec))
1da177e4 3596 goto out;
f6fb8f10 3597 switch (po->tp_version) {
3598 case TPACKET_V3:
3599 /* Transmit path is not supported. We checked
 3600 * it above, but be paranoid anyway.
3601 */
3602 if (!tx_ring)
3603 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3604 break;
3605 default:
3606 break;
3607 }
69e3c75f
JB
3608 }
3609 /* Done */
3610 else {
3611 err = -EINVAL;
4ebf0ae2 3612 if (unlikely(req->tp_frame_nr))
69e3c75f 3613 goto out;
1da177e4
LT
3614 }
3615
3616 lock_sock(sk);
3617
3618 /* Detach socket from network */
3619 spin_lock(&po->bind_lock);
3620 was_running = po->running;
3621 num = po->num;
3622 if (was_running) {
1da177e4 3623 po->num = 0;
ce06b03e 3624 __unregister_prot_hook(sk, false);
1da177e4
LT
3625 }
3626 spin_unlock(&po->bind_lock);
1ce4f28b 3627
1da177e4
LT
3628 synchronize_net();
3629
3630 err = -EBUSY;
905db440 3631 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3632 if (closing || atomic_read(&po->mapped) == 0) {
3633 err = 0;
69e3c75f 3634 spin_lock_bh(&rb_queue->lock);
c053fd96 3635 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3636 rb->frame_max = (req->tp_frame_nr - 1);
3637 rb->head = 0;
3638 rb->frame_size = req->tp_frame_size;
3639 spin_unlock_bh(&rb_queue->lock);
3640
c053fd96
CG
3641 swap(rb->pg_vec_order, order);
3642 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3643
3644 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3645 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3646 tpacket_rcv : packet_rcv;
3647 skb_queue_purge(rb_queue);
1da177e4 3648 if (atomic_read(&po->mapped))
40d4e3df
ED
3649 pr_err("packet_mmap: vma is busy: %d\n",
3650 atomic_read(&po->mapped));
1da177e4 3651 }
905db440 3652 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3653
3654 spin_lock(&po->bind_lock);
ce06b03e 3655 if (was_running) {
1da177e4 3656 po->num = num;
ce06b03e 3657 register_prot_hook(sk);
1da177e4
LT
3658 }
3659 spin_unlock(&po->bind_lock);
f6fb8f10 3660 if (closing && (po->tp_version > TPACKET_V2)) {
3661 /* Because we don't support block-based V3 on tx-ring */
3662 if (!tx_ring)
3663 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3664 }
1da177e4
LT
3665 release_sock(sk);
3666
1da177e4
LT
3667 if (pg_vec)
3668 free_pg_vec(pg_vec, order, req->tp_block_nr);
3669out:
3670 return err;
3671}
3672
69e3c75f
JB
3673static int packet_mmap(struct file *file, struct socket *sock,
3674 struct vm_area_struct *vma)
1da177e4
LT
3675{
3676 struct sock *sk = sock->sk;
3677 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3678 unsigned long size, expected_size;
3679 struct packet_ring_buffer *rb;
1da177e4
LT
3680 unsigned long start;
3681 int err = -EINVAL;
3682 int i;
3683
3684 if (vma->vm_pgoff)
3685 return -EINVAL;
3686
905db440 3687 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3688
3689 expected_size = 0;
3690 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3691 if (rb->pg_vec) {
3692 expected_size += rb->pg_vec_len
3693 * rb->pg_vec_pages
3694 * PAGE_SIZE;
3695 }
3696 }
3697
3698 if (expected_size == 0)
1da177e4 3699 goto out;
69e3c75f
JB
3700
3701 size = vma->vm_end - vma->vm_start;
3702 if (size != expected_size)
1da177e4
LT
3703 goto out;
3704
1da177e4 3705 start = vma->vm_start;
69e3c75f
JB
3706 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3707 if (rb->pg_vec == NULL)
3708 continue;
3709
3710 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3711 struct page *page;
3712 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3713 int pg_num;
3714
c56b4d90
CG
3715 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3716 page = pgv_to_page(kaddr);
69e3c75f
JB
3717 err = vm_insert_page(vma, start, page);
3718 if (unlikely(err))
3719 goto out;
3720 start += PAGE_SIZE;
0e3125c7 3721 kaddr += PAGE_SIZE;
69e3c75f 3722 }
4ebf0ae2 3723 }
1da177e4 3724 }
69e3c75f 3725
4ebf0ae2 3726 atomic_inc(&po->mapped);
1da177e4
LT
3727 vma->vm_ops = &packet_mmap_ops;
3728 err = 0;
3729
3730out:
905db440 3731 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3732 return err;
3733}
1da177e4 3734
90ddc4f0 3735static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3736 .family = PF_PACKET,
3737 .owner = THIS_MODULE,
3738 .release = packet_release,
3739 .bind = packet_bind_spkt,
3740 .connect = sock_no_connect,
3741 .socketpair = sock_no_socketpair,
3742 .accept = sock_no_accept,
3743 .getname = packet_getname_spkt,
3744 .poll = datagram_poll,
3745 .ioctl = packet_ioctl,
3746 .listen = sock_no_listen,
3747 .shutdown = sock_no_shutdown,
3748 .setsockopt = sock_no_setsockopt,
3749 .getsockopt = sock_no_getsockopt,
3750 .sendmsg = packet_sendmsg_spkt,
3751 .recvmsg = packet_recvmsg,
3752 .mmap = sock_no_mmap,
3753 .sendpage = sock_no_sendpage,
3754};
1da177e4 3755
90ddc4f0 3756static const struct proto_ops packet_ops = {
1da177e4
LT
3757 .family = PF_PACKET,
3758 .owner = THIS_MODULE,
3759 .release = packet_release,
3760 .bind = packet_bind,
3761 .connect = sock_no_connect,
3762 .socketpair = sock_no_socketpair,
3763 .accept = sock_no_accept,
1ce4f28b 3764 .getname = packet_getname,
1da177e4
LT
3765 .poll = packet_poll,
3766 .ioctl = packet_ioctl,
3767 .listen = sock_no_listen,
3768 .shutdown = sock_no_shutdown,
3769 .setsockopt = packet_setsockopt,
3770 .getsockopt = packet_getsockopt,
3771 .sendmsg = packet_sendmsg,
3772 .recvmsg = packet_recvmsg,
3773 .mmap = packet_mmap,
3774 .sendpage = sock_no_sendpage,
3775};
3776
ec1b4cf7 3777static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3778 .family = PF_PACKET,
3779 .create = packet_create,
3780 .owner = THIS_MODULE,
3781};
3782
3783static struct notifier_block packet_netdev_notifier = {
40d4e3df 3784 .notifier_call = packet_notifier,
1da177e4
LT
3785};
3786
3787#ifdef CONFIG_PROC_FS
1da177e4
LT
3788
3789static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3790 __acquires(RCU)
1da177e4 3791{
e372c414 3792 struct net *net = seq_file_net(seq);
808f5114 3793
3794 rcu_read_lock();
3795 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3796}
3797
3798static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3799{
1bf40954 3800 struct net *net = seq_file_net(seq);
808f5114 3801 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3802}
3803
3804static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3805 __releases(RCU)
1da177e4 3806{
808f5114 3807 rcu_read_unlock();
1da177e4
LT
3808}
3809
1ce4f28b 3810static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3811{
3812 if (v == SEQ_START_TOKEN)
3813 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3814 else {
b7ceabd9 3815 struct sock *s = sk_entry(v);
1da177e4
LT
3816 const struct packet_sock *po = pkt_sk(s);
3817
3818 seq_printf(seq,
71338aa7 3819 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3820 s,
3821 atomic_read(&s->sk_refcnt),
3822 s->sk_type,
3823 ntohs(po->num),
3824 po->ifindex,
3825 po->running,
3826 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3827 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3828 sock_i_ino(s));
1da177e4
LT
3829 }
3830
3831 return 0;
3832}
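/*
 * packet_seq_show() above produces one line per packet socket in
 * /proc/net/packet.  A trivial sketch that just echoes the table, whose
 * columns match the seq_printf() format string.
 */
#include <stdio.h>

static void dump_proc_net_packet(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* sk RefCnt Type Proto Iface R Rmem User Inode */
	fclose(f);
}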
3833
56b3d975 3834static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3835 .start = packet_seq_start,
3836 .next = packet_seq_next,
3837 .stop = packet_seq_stop,
3838 .show = packet_seq_show,
3839};
3840
3841static int packet_seq_open(struct inode *inode, struct file *file)
3842{
e372c414
DL
3843 return seq_open_net(inode, file, &packet_seq_ops,
3844 sizeof(struct seq_net_private));
1da177e4
LT
3845}
3846
da7071d7 3847static const struct file_operations packet_seq_fops = {
1da177e4
LT
3848 .owner = THIS_MODULE,
3849 .open = packet_seq_open,
3850 .read = seq_read,
3851 .llseek = seq_lseek,
e372c414 3852 .release = seq_release_net,
1da177e4
LT
3853};
3854
3855#endif
3856
2c8c1e72 3857static int __net_init packet_net_init(struct net *net)
d12d01d6 3858{
0fa7fa98 3859 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3860 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3861
d4beaa66 3862 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
3863 return -ENOMEM;
3864
3865 return 0;
3866}
3867
2c8c1e72 3868static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3869{
ece31ffd 3870 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
3871}
3872
3873static struct pernet_operations packet_net_ops = {
3874 .init = packet_net_init,
3875 .exit = packet_net_exit,
3876};
3877
3878
1da177e4
LT
3879static void __exit packet_exit(void)
3880{
1da177e4 3881 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3882 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3883 sock_unregister(PF_PACKET);
3884 proto_unregister(&packet_proto);
3885}
3886
3887static int __init packet_init(void)
3888{
3889 int rc = proto_register(&packet_proto, 0);
3890
3891 if (rc != 0)
3892 goto out;
3893
3894 sock_register(&packet_family_ops);
d12d01d6 3895 register_pernet_subsys(&packet_net_ops);
1da177e4 3896 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3897out:
3898 return rc;
3899}
3900
3901module_init(packet_init);
3902module_exit(packet_exit);
3903MODULE_LICENSE("GPL");
3904MODULE_ALIAS_NETPROTO(PF_PACKET);