packet: minor: convert status bits into shifting format
net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP makes it so, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
148
149/* Private packet socket structures. */
150
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
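/* struct packet_mreq_max only widens the address field; user space still
 * passes the regular struct packet_mreq from <linux/if_packet.h>. Below is a
 * minimal sketch of the matching setsockopt() call, assuming an AF_PACKET
 * socket and an arbitrary interface name ("eth0"); illustrative user-space
 * code, not part of this file.
 */
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd)
{
	struct packet_mreq mreq = {0};

	mreq.mr_ifindex = if_nametoindex("eth0");	/* assumed interface */
	mreq.mr_type = PACKET_MR_PROMISC;
	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}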
a2efcfa0 160
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
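/* BLK_HDR_LEN and BLK_PLUS_PRIV fix the TPACKET_V3 block layout: the per-block
 * private area starts right after the aligned block descriptor, and the first
 * frame starts after the (aligned) private area; prb_open_block() below stores
 * exactly these two offsets in the descriptor. A small sketch of the same
 * arithmetic for an assumed private-area size; illustrative user-space code,
 * not part of this file.
 */
#include <stdio.h>
#include <linux/if_packet.h>

#define EX_V3_ALIGNMENT		8u
#define EX_ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static void show_v3_block_layout(unsigned int priv_sz)
{
	unsigned int hdr = EX_ALIGN_UP((unsigned int)sizeof(struct tpacket_block_desc),
				       EX_V3_ALIGNMENT);

	printf("offset_to_priv      = %u\n", hdr);		/* BLOCK_O2PRIV */
	printf("offset_to_first_pkt = %u\n",
	       hdr + EX_ALIGN_UP(priv_sz, EX_V3_ALIGNMENT));	/* BLOCK_O2FP */
}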
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
216static void packet_flush_mclist(struct sock *sk);
217
218struct packet_skb_cb {
219 unsigned int origlen;
220 union {
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
223 } sa;
224};
225
226#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 227
bc59ba39 228#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 229#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 231#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 233#define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
236
237static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238static void __fanout_link(struct sock *sk, struct packet_sock *po);
239
240/* register_prot_hook must be invoked with the po->bind_lock held,
241 * or from a context in which asynchronous accesses to the packet
242 * socket is not possible (packet_create()).
243 */
244static void register_prot_hook(struct sock *sk)
245{
246 struct packet_sock *po = pkt_sk(sk);
247 if (!po->running) {
248 if (po->fanout)
249 __fanout_link(sk, po);
250 else
251 dev_add_pack(&po->prot_hook);
252 sock_hold(sk);
253 po->running = 1;
254 }
255}
256
257/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
258 * held. If the sync parameter is true, we will temporarily drop
259 * the po->bind_lock and do a synchronize_net to make sure no
260 * asynchronous packet processing paths still refer to the elements
261 * of po->prot_hook. If the sync parameter is false, it is the
262 * callers responsibility to take care of this.
263 */
264static void __unregister_prot_hook(struct sock *sk, bool sync)
265{
266 struct packet_sock *po = pkt_sk(sk);
267
268 po->running = 0;
269 if (po->fanout)
270 __fanout_unlink(sk, po);
271 else
272 __dev_remove_pack(&po->prot_hook);
273 __sock_put(sk);
274
275 if (sync) {
276 spin_unlock(&po->bind_lock);
277 synchronize_net();
278 spin_lock(&po->bind_lock);
279 }
280}
281
282static void unregister_prot_hook(struct sock *sk, bool sync)
283{
284 struct packet_sock *po = pkt_sk(sk);
285
286 if (po->running)
287 __unregister_prot_hook(sk, sync);
288}
289
f6dafa95 290static inline __pure struct page *pgv_to_page(void *addr)
291{
292 if (is_vmalloc_addr(addr))
293 return vmalloc_to_page(addr);
294 return virt_to_page(addr);
295}
296
69e3c75f 297static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 298{
184f489e 299 union tpacket_uhdr h;
1da177e4 300
69e3c75f 301 h.raw = frame;
302 switch (po->tp_version) {
303 case TPACKET_V1:
69e3c75f 304 h.h1->tp_status = status;
0af55bb5 305 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
306 break;
307 case TPACKET_V2:
69e3c75f 308 h.h2->tp_status = status;
0af55bb5 309 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 310 break;
f6fb8f10 311 case TPACKET_V3:
69e3c75f 312 default:
f6fb8f10 313 WARN(1, "TPACKET version not supported.\n");
69e3c75f 314 BUG();
bbd6ef87 315 }
316
317 smp_wmb();
318}
319
69e3c75f 320static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 321{
184f489e 322 union tpacket_uhdr h;
bbd6ef87 323
324 smp_rmb();
325
326 h.raw = frame;
327 switch (po->tp_version) {
328 case TPACKET_V1:
0af55bb5 329 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 330 return h.h1->tp_status;
bbd6ef87 331 case TPACKET_V2:
0af55bb5 332 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 333 return h.h2->tp_status;
f6fb8f10 334 case TPACKET_V3:
69e3c75f 335 default:
f6fb8f10 336 WARN(1, "TPACKET version not supported.\n");
337 BUG();
338 return 0;
bbd6ef87 339 }
1da177e4 340}
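/* __packet_set_status()/__packet_get_status() are the kernel half of the
 * tp_status handshake with the mmap()ed ring; user space flips the same word
 * back once it is done with a frame. Below is a minimal consumer loop for one
 * TPACKET_V1/V2 RX-ring slot (busy-waiting only to keep the sketch short;
 * poll() on the socket is the usual approach); illustrative user-space code,
 * not part of this file.
 */
#include <linux/if_packet.h>

static void consume_frame(volatile struct tpacket_hdr *hdr)
{
	while (!(hdr->tp_status & TP_STATUS_USER))
		;	/* frame still owned by the kernel */

	/* packet data starts at (const char *)hdr + hdr->tp_mac ... */

	__sync_synchronize();			/* pairs with the kernel's barriers */
	hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
}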
69e3c75f 341
342static bool tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
343 unsigned int flags)
344{
345 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
346
347 if (shhwtstamps) {
348 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
349 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
350 return true;
351 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
352 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
353 return true;
354 }
355
356 if (ktime_to_timespec_cond(skb->tstamp, ts))
357 return true;
358
359 return false;
360}
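/* tpacket_get_timestamp() only reports hardware stamps when the corresponding
 * SOF_TIMESTAMPING_* flags were requested through the PACKET_TIMESTAMP socket
 * option (and the driver actually produces them, e.g. after SIOCSHWTSTAMP).
 * Below is a minimal sketch requesting raw hardware timestamps; illustrative
 * user-space code, not part of this file.
 */
#include <sys/socket.h>
#include <linux/net_tstamp.h>
#include <linux/if_packet.h>

static int request_hw_timestamps(int fd)
{
	int req = SOF_TIMESTAMPING_RAW_HARDWARE;

	return setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
}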
361
2e31396f 362static void __packet_set_timestamp(struct packet_sock *po, void *frame,
7a51384c 363 struct sk_buff *skb)
364{
365 union tpacket_uhdr h;
366 struct timespec ts;
367
7a51384c 368 if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
369 return;
370
371 h.raw = frame;
372 switch (po->tp_version) {
373 case TPACKET_V1:
374 h.h1->tp_sec = ts.tv_sec;
375 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
376 break;
377 case TPACKET_V2:
378 h.h2->tp_sec = ts.tv_sec;
379 h.h2->tp_nsec = ts.tv_nsec;
380 break;
381 case TPACKET_V3:
382 default:
383 WARN(1, "TPACKET version not supported.\n");
384 BUG();
385 }
386
387 /* one flush is safe, as both fields always lie on the same cacheline */
388 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
389 smp_wmb();
390}
391
392static void *packet_lookup_frame(struct packet_sock *po,
393 struct packet_ring_buffer *rb,
394 unsigned int position,
395 int status)
396{
397 unsigned int pg_vec_pos, frame_offset;
184f489e 398 union tpacket_uhdr h;
399
400 pg_vec_pos = position / rb->frames_per_block;
401 frame_offset = position % rb->frames_per_block;
402
403 h.raw = rb->pg_vec[pg_vec_pos].buffer +
404 (frame_offset * rb->frame_size);
405
406 if (status != __packet_get_status(po, h.raw))
407 return NULL;
408
409 return h.raw;
410}
411
eea49cc9 412static void *packet_current_frame(struct packet_sock *po,
413 struct packet_ring_buffer *rb,
414 int status)
415{
416 return packet_lookup_frame(po, rb, rb->head, status);
417}
418
bc59ba39 419static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 420{
421 del_timer_sync(&pkc->retire_blk_timer);
422}
423
424static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
425 int tx_ring,
426 struct sk_buff_head *rb_queue)
427{
bc59ba39 428 struct tpacket_kbdq_core *pkc;
f6fb8f10 429
430 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
431
432 spin_lock(&rb_queue->lock);
433 pkc->delete_blk_timer = 1;
434 spin_unlock(&rb_queue->lock);
435
436 prb_del_retire_blk_timer(pkc);
437}
438
439static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 440 struct tpacket_kbdq_core *pkc,
f6fb8f10 441 void (*func) (unsigned long))
442{
443 init_timer(&pkc->retire_blk_timer);
444 pkc->retire_blk_timer.data = (long)po;
445 pkc->retire_blk_timer.function = func;
446 pkc->retire_blk_timer.expires = jiffies;
447}
448
449static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
450{
bc59ba39 451 struct tpacket_kbdq_core *pkc;
f6fb8f10 452
453 if (tx_ring)
454 BUG();
455
456 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
457 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
458}
459
460static int prb_calc_retire_blk_tmo(struct packet_sock *po,
461 int blk_size_in_bytes)
462{
463 struct net_device *dev;
464 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
465 struct ethtool_cmd ecmd;
466 int err;
e440cf2c 467 u32 speed;
f6fb8f10 468
469 rtnl_lock();
470 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
471 if (unlikely(!dev)) {
472 rtnl_unlock();
f6fb8f10 473 return DEFAULT_PRB_RETIRE_TOV;
474 }
475 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 476 speed = ethtool_cmd_speed(&ecmd);
477 rtnl_unlock();
478 if (!err) {
479 /*
480 * If the link speed is so slow you don't really
481 * need to worry about perf anyways
482 */
e440cf2c 483 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 484 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 485 } else {
486 msec = 1;
487 div = speed / 1000;
f6fb8f10 488 }
489 }
490
491 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
492
493 if (div)
494 mbits /= div;
495
496 tmo = mbits * msec;
497
498 if (div)
499 return tmo+1;
500 return tmo;
501}
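/* The default retire timeout is, roughly, the time it takes to fill one block
 * at line rate, plus one millisecond: e.g. a 1 MiB block on a 1 Gb/s link
 * gives 8 + 1 = 9 ms. Below is a user-space re-derivation of the same
 * arithmetic, assuming the link speed is already known in Mb/s; illustrative
 * code, not part of this file.
 */
static unsigned int retire_blk_tmo_msec(unsigned int blk_size_bytes,
					unsigned int speed_mbps)
{
	unsigned int mbits = (blk_size_bytes * 8) / (1024 * 1024);
	unsigned int div = speed_mbps / 1000;	/* whole gigabits per second */

	if (!div)	/* < 1 Gb/s: the kernel falls back to DEFAULT_PRB_RETIRE_TOV */
		return 0;
	return mbits / div + 1;
}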
502
bc59ba39 503static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 504 union tpacket_req_u *req_u)
505{
506 p1->feature_req_word = req_u->req3.tp_feature_req_word;
507}
508
509static void init_prb_bdqc(struct packet_sock *po,
510 struct packet_ring_buffer *rb,
511 struct pgv *pg_vec,
512 union tpacket_req_u *req_u, int tx_ring)
513{
bc59ba39 514 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
515 struct tpacket_block_desc *pbd;
f6fb8f10 516
517 memset(p1, 0x0, sizeof(*p1));
518
519 p1->knxt_seq_num = 1;
520 p1->pkbdq = pg_vec;
bc59ba39 521 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 522 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 523 p1->kblk_size = req_u->req3.tp_block_size;
524 p1->knum_blocks = req_u->req3.tp_block_nr;
525 p1->hdrlen = po->tp_hdrlen;
526 p1->version = po->tp_version;
527 p1->last_kactive_blk_num = 0;
528 po->stats_u.stats3.tp_freeze_q_cnt = 0;
529 if (req_u->req3.tp_retire_blk_tov)
530 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
531 else
532 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
533 req_u->req3.tp_block_size);
534 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
535 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
536
537 prb_init_ft_ops(p1, req_u);
538 prb_setup_retire_blk_timer(po, tx_ring);
539 prb_open_block(p1, pbd);
540}
541
542/* Do NOT update the last_blk_num first.
543 * Assumes sk_buff_head lock is held.
544 */
bc59ba39 545static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 546{
547 mod_timer(&pkc->retire_blk_timer,
548 jiffies + pkc->tov_in_jiffies);
549 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
550}
551
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
575static void prb_retire_rx_blk_timer_expired(unsigned long data)
576{
577 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 578 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 579 unsigned int frozen;
bc59ba39 580 struct tpacket_block_desc *pbd;
f6fb8f10 581
582 spin_lock(&po->sk.sk_receive_queue.lock);
583
584 frozen = prb_queue_frozen(pkc);
585 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
586
587 if (unlikely(pkc->delete_blk_timer))
588 goto out;
589
590 /* We only need to plug the race when the block is partially filled.
591 * tpacket_rcv:
592 * lock(); increment BLOCK_NUM_PKTS; unlock()
593 * copy_bits() is in progress ...
594 * timer fires on other cpu:
595 * we can't retire the current block because copy_bits
596 * is in progress.
597 *
598 */
599 if (BLOCK_NUM_PKTS(pbd)) {
600 while (atomic_read(&pkc->blk_fill_in_prog)) {
601 /* Waiting for skb_copy_bits to finish... */
602 cpu_relax();
603 }
604 }
605
606 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
607 if (!frozen) {
608 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
609 if (!prb_dispatch_next_block(pkc, po))
610 goto refresh_timer;
611 else
612 goto out;
613 } else {
614 /* Case 1. Queue was frozen because user-space was
615 * lagging behind.
616 */
617 if (prb_curr_blk_in_use(pkc, pbd)) {
618 /*
619 * Ok, user-space is still behind.
620 * So just refresh the timer.
621 */
622 goto refresh_timer;
623 } else {
			/* Case 2. The queue was frozen, user-space caught up,
			 * now the link went idle && the timer fired.
			 * We don't have a block to close, so we open this
			 * block and restart the timer.
			 * Opening a block thaws the queue and restarts the timer.
			 * Thawing/timer-refresh is a side effect.
			 */
631 prb_open_block(pkc, pbd);
632 goto out;
633 }
634 }
635 }
636
637refresh_timer:
638 _prb_refresh_rx_retire_blk_timer(pkc);
639
640out:
641 spin_unlock(&po->sk.sk_receive_queue.lock);
642}
643
eea49cc9 644static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 645 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 646{
647 /* Flush everything minus the block header */
648
649#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
650 u8 *start, *end;
651
652 start = (u8 *)pbd1;
653
654 /* Skip the block header(we know header WILL fit in 4K) */
655 start += PAGE_SIZE;
656
657 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
658 for (; start < end; start += PAGE_SIZE)
659 flush_dcache_page(pgv_to_page(start));
660
661 smp_wmb();
662#endif
663
664 /* Now update the block status. */
665
666 BLOCK_STATUS(pbd1) = status;
667
668 /* Flush the block header */
669
670#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
671 start = (u8 *)pbd1;
672 flush_dcache_page(pgv_to_page(start));
673
674 smp_wmb();
675#endif
676}
677
678/*
679 * Side effect:
680 *
681 * 1) flush the block
682 * 2) Increment active_blk_num
683 *
684 * Note:We DONT refresh the timer on purpose.
685 * Because almost always the next block will be opened.
686 */
bc59ba39 687static void prb_close_block(struct tpacket_kbdq_core *pkc1,
688 struct tpacket_block_desc *pbd1,
f6fb8f10 689 struct packet_sock *po, unsigned int stat)
690{
691 __u32 status = TP_STATUS_USER | stat;
692
693 struct tpacket3_hdr *last_pkt;
bc59ba39 694 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 695
696 if (po->stats.tp_drops)
697 status |= TP_STATUS_LOSING;
698
699 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
700 last_pkt->tp_next_offset = 0;
701
702 /* Get the ts of the last pkt */
703 if (BLOCK_NUM_PKTS(pbd1)) {
704 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
705 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
706 } else {
707 /* Ok, we tmo'd - so get the current time */
708 struct timespec ts;
709 getnstimeofday(&ts);
710 h1->ts_last_pkt.ts_sec = ts.tv_sec;
711 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
712 }
713
714 smp_wmb();
715
716 /* Flush the block */
717 prb_flush_block(pkc1, pbd1, status);
718
719 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
720}
721
eea49cc9 722static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 723{
724 pkc->reset_pending_on_curr_blk = 0;
725}
726
727/*
728 * Side effect of opening a block:
729 *
730 * 1) prb_queue is thawed.
731 * 2) retire_blk_timer is refreshed.
732 *
733 */
bc59ba39 734static void prb_open_block(struct tpacket_kbdq_core *pkc1,
735 struct tpacket_block_desc *pbd1)
f6fb8f10 736{
737 struct timespec ts;
bc59ba39 738 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 739
740 smp_rmb();
741
742 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
743
744 /* We could have just memset this but we will lose the
745 * flexibility of making the priv area sticky
746 */
747 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
748 BLOCK_NUM_PKTS(pbd1) = 0;
749 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
750 getnstimeofday(&ts);
751 h1->ts_first_pkt.ts_sec = ts.tv_sec;
752 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
753 pkc1->pkblk_start = (char *)pbd1;
e3192690 754 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 755 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
756 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
757 pbd1->version = pkc1->version;
758 pkc1->prev = pkc1->nxt_offset;
759 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
760 prb_thaw_queue(pkc1);
761 _prb_refresh_rx_retire_blk_timer(pkc1);
762
763 smp_wmb();
764
765 return;
766 }
767
768 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
769 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
770 dump_stack();
771 BUG();
772}
773
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens the Rx ring.
 * 3) Some time past 't0', the kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) The link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) The link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
eea49cc9 797static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 798 struct packet_sock *po)
799{
800 pkc->reset_pending_on_curr_blk = 1;
801 po->stats_u.stats3.tp_freeze_q_cnt++;
802}
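/* The queue stays frozen until user space returns the blocking block: the
 * TPACKET_V3 contract is to walk the frames of a block and then write
 * TP_STATUS_KERNEL back into its descriptor. Below is a minimal consumer
 * sketch with the per-frame processing elided; illustrative user-space code,
 * not part of this file.
 */
#include <linux/if_packet.h>

static void consume_and_release_block(struct tpacket_block_desc *pbd)
{
	struct tpacket3_hdr *ppd;
	unsigned int i, num = pbd->hdr.bh1.num_pkts;

	ppd = (struct tpacket3_hdr *)((char *)pbd +
				      pbd->hdr.bh1.offset_to_first_pkt);
	for (i = 0; i < num; i++) {
		/* frame data starts at (char *)ppd + ppd->tp_mac ... */
		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
	}

	__sync_synchronize();
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* un-freezes the queue */
}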
803
804#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
805
806/*
807 * If the next block is free then we will dispatch it
808 * and return a good offset.
809 * Else, we will freeze the queue.
810 * So, caller must check the return value.
811 */
bc59ba39 812static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 813 struct packet_sock *po)
814{
bc59ba39 815 struct tpacket_block_desc *pbd;
f6fb8f10 816
817 smp_rmb();
818
819 /* 1. Get current block num */
820 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
821
822 /* 2. If this block is currently in_use then freeze the queue */
823 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
824 prb_freeze_queue(pkc, po);
825 return NULL;
826 }
827
828 /*
829 * 3.
830 * open this block and return the offset where the first packet
831 * needs to get stored.
832 */
833 prb_open_block(pkc, pbd);
834 return (void *)pkc->nxt_offset;
835}
836
bc59ba39 837static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 838 struct packet_sock *po, unsigned int status)
839{
bc59ba39 840 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 841
842 /* retire/close the current block */
843 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
844 /*
845 * Plug the case where copy_bits() is in progress on
846 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
847 * have space to copy the pkt in the current block and
848 * called prb_retire_current_block()
849 *
850 * We don't need to worry about the TMO case because
851 * the timer-handler already handled this case.
852 */
853 if (!(status & TP_STATUS_BLK_TMO)) {
854 while (atomic_read(&pkc->blk_fill_in_prog)) {
855 /* Waiting for skb_copy_bits to finish... */
856 cpu_relax();
857 }
858 }
859 prb_close_block(pkc, pbd, po, status);
860 return;
861 }
862
863 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
864 dump_stack();
865 BUG();
866}
867
eea49cc9 868static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 869 struct tpacket_block_desc *pbd)
f6fb8f10 870{
871 return TP_STATUS_USER & BLOCK_STATUS(pbd);
872}
873
eea49cc9 874static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 875{
876 return pkc->reset_pending_on_curr_blk;
877}
878
eea49cc9 879static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 880{
bc59ba39 881 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 882 atomic_dec(&pkc->blk_fill_in_prog);
883}
884
eea49cc9 885static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 886 struct tpacket3_hdr *ppd)
887{
888 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
889}
890
eea49cc9 891static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 892 struct tpacket3_hdr *ppd)
893{
894 ppd->hv1.tp_rxhash = 0;
895}
896
eea49cc9 897static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 898 struct tpacket3_hdr *ppd)
899{
900 if (vlan_tx_tag_present(pkc->skb)) {
901 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
902 ppd->tp_status = TP_STATUS_VLAN_VALID;
903 } else {
9e67030a 904 ppd->hv1.tp_vlan_tci = 0;
905 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 906 }
907}
908
bc59ba39 909static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 910 struct tpacket3_hdr *ppd)
911{
912 prb_fill_vlan_info(pkc, ppd);
913
914 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
915 prb_fill_rxhash(pkc, ppd);
916 else
917 prb_clear_rxhash(pkc, ppd);
918}
919
eea49cc9 920static void prb_fill_curr_block(char *curr,
bc59ba39 921 struct tpacket_kbdq_core *pkc,
922 struct tpacket_block_desc *pbd,
f6fb8f10 923 unsigned int len)
924{
925 struct tpacket3_hdr *ppd;
926
927 ppd = (struct tpacket3_hdr *)curr;
928 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
929 pkc->prev = curr;
930 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
931 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
932 BLOCK_NUM_PKTS(pbd) += 1;
933 atomic_inc(&pkc->blk_fill_in_prog);
934 prb_run_all_ft_ops(pkc, ppd);
935}
936
937/* Assumes caller has the sk->rx_queue.lock */
938static void *__packet_lookup_frame_in_block(struct packet_sock *po,
939 struct sk_buff *skb,
940 int status,
941 unsigned int len
942 )
943{
bc59ba39 944 struct tpacket_kbdq_core *pkc;
945 struct tpacket_block_desc *pbd;
f6fb8f10 946 char *curr, *end;
947
e3192690 948 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 949 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
950
951 /* Queue is frozen when user space is lagging behind */
952 if (prb_queue_frozen(pkc)) {
953 /*
954 * Check if that last block which caused the queue to freeze,
955 * is still in_use by user-space.
956 */
957 if (prb_curr_blk_in_use(pkc, pbd)) {
958 /* Can't record this packet */
959 return NULL;
960 } else {
961 /*
962 * Ok, the block was released by user-space.
963 * Now let's open that block.
964 * opening a block also thaws the queue.
965 * Thawing is a side effect.
966 */
967 prb_open_block(pkc, pbd);
968 }
969 }
970
971 smp_mb();
972 curr = pkc->nxt_offset;
973 pkc->skb = skb;
e3192690 974 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 975
976 /* first try the current block */
977 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
978 prb_fill_curr_block(curr, pkc, pbd, len);
979 return (void *)curr;
980 }
981
982 /* Ok, close the current block */
983 prb_retire_current_block(pkc, po, 0);
984
985 /* Now, try to dispatch the next block */
986 curr = (char *)prb_dispatch_next_block(pkc, po);
987 if (curr) {
988 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
989 prb_fill_curr_block(curr, pkc, pbd, len);
990 return (void *)curr;
991 }
992
993 /*
	 * No free blocks are available. user_space hasn't caught up yet.
995 * Queue was just frozen and now this packet will get dropped.
996 */
997 return NULL;
998}
999
eea49cc9 1000static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1001 struct sk_buff *skb,
1002 int status, unsigned int len)
1003{
1004 char *curr = NULL;
1005 switch (po->tp_version) {
1006 case TPACKET_V1:
1007 case TPACKET_V2:
1008 curr = packet_lookup_frame(po, &po->rx_ring,
1009 po->rx_ring.head, status);
1010 return curr;
1011 case TPACKET_V3:
1012 return __packet_lookup_frame_in_block(po, skb, status, len);
1013 default:
1014 WARN(1, "TPACKET version not supported\n");
1015 BUG();
99aa3473 1016 return NULL;
f6fb8f10 1017 }
1018}
1019
eea49cc9 1020static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1021 struct packet_ring_buffer *rb,
77f65ebd 1022 unsigned int idx,
f6fb8f10 1023 int status)
1024{
bc59ba39 1025 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1026 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1027
1028 if (status != BLOCK_STATUS(pbd))
1029 return NULL;
1030 return pbd;
1031}
1032
eea49cc9 1033static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1034{
1035 unsigned int prev;
1036 if (rb->prb_bdqc.kactive_blk_num)
1037 prev = rb->prb_bdqc.kactive_blk_num-1;
1038 else
1039 prev = rb->prb_bdqc.knum_blocks-1;
1040 return prev;
1041}
1042
1043/* Assumes caller has held the rx_queue.lock */
eea49cc9 1044static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1045 struct packet_ring_buffer *rb,
1046 int status)
1047{
1048 unsigned int previous = prb_previous_blk_num(rb);
1049 return prb_lookup_block(po, rb, previous, status);
1050}
1051
eea49cc9 1052static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1053 struct packet_ring_buffer *rb,
1054 int status)
1055{
1056 if (po->tp_version <= TPACKET_V2)
1057 return packet_previous_frame(po, rb, status);
1058
1059 return __prb_previous_block(po, rb, status);
1060}
1061
eea49cc9 1062static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1063 struct packet_ring_buffer *rb)
1064{
1065 switch (po->tp_version) {
1066 case TPACKET_V1:
1067 case TPACKET_V2:
1068 return packet_increment_head(rb);
1069 case TPACKET_V3:
1070 default:
1071 WARN(1, "TPACKET version not supported.\n");
1072 BUG();
1073 return;
1074 }
1075}
1076
eea49cc9 1077static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1078 struct packet_ring_buffer *rb,
1079 int status)
1080{
1081 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1082 return packet_lookup_frame(po, rb, previous, status);
1083}
1084
eea49cc9 1085static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1086{
1087 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1088}
1089
77f65ebd
WB
1090static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1091{
1092 struct sock *sk = &po->sk;
1093 bool has_room;
1094
1095 if (po->prot_hook.func != tpacket_rcv)
1096 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1097 <= sk->sk_rcvbuf;
1098
1099 spin_lock(&sk->sk_receive_queue.lock);
1100 if (po->tp_version == TPACKET_V3)
1101 has_room = prb_lookup_block(po, &po->rx_ring,
1102 po->rx_ring.prb_bdqc.kactive_blk_num,
1103 TP_STATUS_KERNEL);
1104 else
1105 has_room = packet_lookup_frame(po, &po->rx_ring,
1106 po->rx_ring.head,
1107 TP_STATUS_KERNEL);
1108 spin_unlock(&sk->sk_receive_queue.lock);
1109
1110 return has_room;
1111}
1112
1da177e4
LT
1113static void packet_sock_destruct(struct sock *sk)
1114{
ed85b565
RC
1115 skb_queue_purge(&sk->sk_error_queue);
1116
547b792c
IJ
1117 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1118 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1119
1120 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1121 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1122 return;
1123 }
1124
17ab56a2 1125 sk_refcnt_debug_dec(sk);
1da177e4
LT
1126}
1127
dc99f600
DM
1128static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1129{
1130 int x = atomic_read(&f->rr_cur) + 1;
1131
1132 if (x >= num)
1133 x = 0;
1134
1135 return x;
1136}
1137
77f65ebd
WB
1138static unsigned int fanout_demux_hash(struct packet_fanout *f,
1139 struct sk_buff *skb,
1140 unsigned int num)
dc99f600 1141{
77f65ebd 1142 return (((u64)skb->rxhash) * num) >> 32;
dc99f600
DM
1143}
1144
77f65ebd
WB
1145static unsigned int fanout_demux_lb(struct packet_fanout *f,
1146 struct sk_buff *skb,
1147 unsigned int num)
dc99f600
DM
1148{
1149 int cur, old;
1150
1151 cur = atomic_read(&f->rr_cur);
1152 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1153 fanout_rr_next(f, num))) != cur)
1154 cur = old;
77f65ebd
WB
1155 return cur;
1156}
1157
1158static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1159 struct sk_buff *skb,
1160 unsigned int num)
1161{
1162 return smp_processor_id() % num;
dc99f600
DM
1163}
1164
77f65ebd
WB
1165static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1166 struct sk_buff *skb,
1167 unsigned int idx, unsigned int skip,
1168 unsigned int num)
95ec3eb4 1169{
77f65ebd 1170 unsigned int i, j;
95ec3eb4 1171
77f65ebd
WB
1172 i = j = min_t(int, f->next[idx], num - 1);
1173 do {
1174 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1175 if (i != j)
1176 f->next[idx] = i;
1177 return i;
1178 }
1179 if (++i == num)
1180 i = 0;
1181 } while (i != j);
1182
1183 return idx;
1184}
1185
1186static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1187{
1188 return f->flags & (flag >> 8);
95ec3eb4
DM
1189}
1190
95ec3eb4
DM
1191static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1192 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1193{
1194 struct packet_fanout *f = pt->af_packet_priv;
1195 unsigned int num = f->num_members;
1196 struct packet_sock *po;
77f65ebd 1197 unsigned int idx;
dc99f600
DM
1198
1199 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1200 !num) {
1201 kfree_skb(skb);
1202 return 0;
1203 }
1204
95ec3eb4
DM
1205 switch (f->type) {
1206 case PACKET_FANOUT_HASH:
1207 default:
77f65ebd 1208 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1209 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1210 if (!skb)
1211 return 0;
1212 }
1213 skb_get_rxhash(skb);
77f65ebd 1214 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1215 break;
1216 case PACKET_FANOUT_LB:
77f65ebd 1217 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1218 break;
1219 case PACKET_FANOUT_CPU:
77f65ebd
WB
1220 idx = fanout_demux_cpu(f, skb, num);
1221 break;
1222 case PACKET_FANOUT_ROLLOVER:
1223 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1224 break;
1225 }
1226
1227 po = pkt_sk(f->arr[idx]);
1228 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1229 unlikely(!packet_rcv_has_room(po, skb))) {
1230 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1231 po = pkt_sk(f->arr[idx]);
1232 }
1233
1234 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1235}
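/* Sockets join a fanout group with the PACKET_FANOUT socket option: the low
 * 16 bits carry the group id, the high 16 bits the mode and flags that
 * packet_rcv_fanout() switches on above. Below is a minimal sketch using flow
 * hashing plus IP defragmentation (group id 42 is an arbitrary assumption);
 * illustrative user-space code, not part of this file.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout_group(int fd)
{
	int arg = 42 | ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}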
1236
fff3321d
PE
1237DEFINE_MUTEX(fanout_mutex);
1238EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1239static LIST_HEAD(fanout_list);
1240
1241static void __fanout_link(struct sock *sk, struct packet_sock *po)
1242{
1243 struct packet_fanout *f = po->fanout;
1244
1245 spin_lock(&f->lock);
1246 f->arr[f->num_members] = sk;
1247 smp_wmb();
1248 f->num_members++;
1249 spin_unlock(&f->lock);
1250}
1251
1252static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1253{
1254 struct packet_fanout *f = po->fanout;
1255 int i;
1256
1257 spin_lock(&f->lock);
1258 for (i = 0; i < f->num_members; i++) {
1259 if (f->arr[i] == sk)
1260 break;
1261 }
1262 BUG_ON(i >= f->num_members);
1263 f->arr[i] = f->arr[f->num_members - 1];
1264 f->num_members--;
1265 spin_unlock(&f->lock);
1266}
1267
a0dfb263 1268static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1269{
1270 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1271 return true;
1272
1273 return false;
1274}
1275
7736d33f 1276static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1277{
1278 struct packet_sock *po = pkt_sk(sk);
1279 struct packet_fanout *f, *match;
7736d33f 1280 u8 type = type_flags & 0xff;
77f65ebd 1281 u8 flags = type_flags >> 8;
dc99f600
DM
1282 int err;
1283
1284 switch (type) {
77f65ebd
WB
1285 case PACKET_FANOUT_ROLLOVER:
1286 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1287 return -EINVAL;
dc99f600
DM
1288 case PACKET_FANOUT_HASH:
1289 case PACKET_FANOUT_LB:
95ec3eb4 1290 case PACKET_FANOUT_CPU:
dc99f600
DM
1291 break;
1292 default:
1293 return -EINVAL;
1294 }
1295
1296 if (!po->running)
1297 return -EINVAL;
1298
1299 if (po->fanout)
1300 return -EALREADY;
1301
1302 mutex_lock(&fanout_mutex);
1303 match = NULL;
1304 list_for_each_entry(f, &fanout_list, list) {
1305 if (f->id == id &&
1306 read_pnet(&f->net) == sock_net(sk)) {
1307 match = f;
1308 break;
1309 }
1310 }
afe62c68 1311 err = -EINVAL;
77f65ebd 1312 if (match && match->flags != flags)
afe62c68 1313 goto out;
dc99f600 1314 if (!match) {
afe62c68 1315 err = -ENOMEM;
dc99f600 1316 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1317 if (!match)
1318 goto out;
1319 write_pnet(&match->net, sock_net(sk));
1320 match->id = id;
1321 match->type = type;
77f65ebd 1322 match->flags = flags;
afe62c68
ED
1323 atomic_set(&match->rr_cur, 0);
1324 INIT_LIST_HEAD(&match->list);
1325 spin_lock_init(&match->lock);
1326 atomic_set(&match->sk_ref, 0);
1327 match->prot_hook.type = po->prot_hook.type;
1328 match->prot_hook.dev = po->prot_hook.dev;
1329 match->prot_hook.func = packet_rcv_fanout;
1330 match->prot_hook.af_packet_priv = match;
c0de08d0 1331 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1332 dev_add_pack(&match->prot_hook);
1333 list_add(&match->list, &fanout_list);
dc99f600 1334 }
afe62c68
ED
1335 err = -EINVAL;
1336 if (match->type == type &&
1337 match->prot_hook.type == po->prot_hook.type &&
1338 match->prot_hook.dev == po->prot_hook.dev) {
1339 err = -ENOSPC;
1340 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1341 __dev_remove_pack(&po->prot_hook);
1342 po->fanout = match;
1343 atomic_inc(&match->sk_ref);
1344 __fanout_link(sk, po);
1345 err = 0;
dc99f600
DM
1346 }
1347 }
afe62c68 1348out:
dc99f600
DM
1349 mutex_unlock(&fanout_mutex);
1350 return err;
1351}
1352
1353static void fanout_release(struct sock *sk)
1354{
1355 struct packet_sock *po = pkt_sk(sk);
1356 struct packet_fanout *f;
1357
1358 f = po->fanout;
1359 if (!f)
1360 return;
1361
fff3321d 1362 mutex_lock(&fanout_mutex);
dc99f600
DM
1363 po->fanout = NULL;
1364
dc99f600
DM
1365 if (atomic_dec_and_test(&f->sk_ref)) {
1366 list_del(&f->list);
1367 dev_remove_pack(&f->prot_hook);
1368 kfree(f);
1369 }
1370 mutex_unlock(&fanout_mutex);
1371}
1da177e4 1372
90ddc4f0 1373static const struct proto_ops packet_ops;
1da177e4 1374
90ddc4f0 1375static const struct proto_ops packet_ops_spkt;
1da177e4 1376
40d4e3df
ED
1377static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1378 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1379{
1380 struct sock *sk;
1381 struct sockaddr_pkt *spkt;
1382
1383 /*
1384 * When we registered the protocol we saved the socket in the data
1385 * field for just this event.
1386 */
1387
1388 sk = pt->af_packet_priv;
1ce4f28b 1389
1da177e4
LT
1390 /*
1391 * Yank back the headers [hope the device set this
1392 * right or kerboom...]
1393 *
1394 * Incoming packets have ll header pulled,
1395 * push it back.
1396 *
98e399f8 1397 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1398 * so that this procedure is noop.
1399 */
1400
1401 if (skb->pkt_type == PACKET_LOOPBACK)
1402 goto out;
1403
09ad9bc7 1404 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1405 goto out;
1406
40d4e3df
ED
1407 skb = skb_share_check(skb, GFP_ATOMIC);
1408 if (skb == NULL)
1da177e4
LT
1409 goto oom;
1410
1411 /* drop any routing info */
adf30907 1412 skb_dst_drop(skb);
1da177e4 1413
84531c24
PO
1414 /* drop conntrack reference */
1415 nf_reset(skb);
1416
ffbc6111 1417 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1418
98e399f8 1419 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1420
1421 /*
1422 * The SOCK_PACKET socket receives _all_ frames.
1423 */
1424
1425 spkt->spkt_family = dev->type;
1426 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1427 spkt->spkt_protocol = skb->protocol;
1428
1429 /*
1430 * Charge the memory to the socket. This is done specifically
1431 * to prevent sockets using all the memory up.
1432 */
1433
40d4e3df 1434 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1435 return 0;
1436
1437out:
1438 kfree_skb(skb);
1439oom:
1440 return 0;
1441}
1442
1443
1444/*
1445 * Output a raw packet to a device layer. This bypasses all the other
1446 * protocol layers and you must therefore supply it with a complete frame
1447 */
1ce4f28b 1448
1da177e4
LT
1449static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1450 struct msghdr *msg, size_t len)
1451{
1452 struct sock *sk = sock->sk;
40d4e3df 1453 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1454 struct sk_buff *skb = NULL;
1da177e4 1455 struct net_device *dev;
40d4e3df 1456 __be16 proto = 0;
1da177e4 1457 int err;
3bdc0eba 1458 int extra_len = 0;
1ce4f28b 1459
1da177e4 1460 /*
1ce4f28b 1461 * Get and verify the address.
1da177e4
LT
1462 */
1463
40d4e3df 1464 if (saddr) {
1da177e4 1465 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1466 return -EINVAL;
1467 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1468 proto = saddr->spkt_protocol;
1469 } else
1470 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1471
1472 /*
1ce4f28b 1473 * Find the device first to size check it
1da177e4
LT
1474 */
1475
de74e92a 1476 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1477retry:
654d1f8a
ED
1478 rcu_read_lock();
1479 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1480 err = -ENODEV;
1481 if (dev == NULL)
1482 goto out_unlock;
1ce4f28b 1483
d5e76b0a
DM
1484 err = -ENETDOWN;
1485 if (!(dev->flags & IFF_UP))
1486 goto out_unlock;
1487
1da177e4 1488 /*
40d4e3df
ED
1489 * You may not queue a frame bigger than the mtu. This is the lowest level
1490 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1491 */
1ce4f28b 1492
3bdc0eba
BG
1493 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1494 if (!netif_supports_nofcs(dev)) {
1495 err = -EPROTONOSUPPORT;
1496 goto out_unlock;
1497 }
1498 extra_len = 4; /* We're doing our own CRC */
1499 }
1500
1da177e4 1501 err = -EMSGSIZE;
3bdc0eba 1502 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1503 goto out_unlock;
1504
1a35ca80
ED
1505 if (!skb) {
1506 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1507 int tlen = dev->needed_tailroom;
1a35ca80
ED
1508 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1509
1510 rcu_read_unlock();
4ce40912 1511 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1512 if (skb == NULL)
1513 return -ENOBUFS;
1514 /* FIXME: Save some space for broken drivers that write a hard
1515 * header at transmission time by themselves. PPP is the notable
1516 * one here. This should really be fixed at the driver level.
1517 */
1518 skb_reserve(skb, reserved);
1519 skb_reset_network_header(skb);
1520
1521 /* Try to align data part correctly */
1522 if (hhlen) {
1523 skb->data -= hhlen;
1524 skb->tail -= hhlen;
1525 if (len < hhlen)
1526 skb_reset_network_header(skb);
1527 }
1528 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1529 if (err)
1530 goto out_free;
1531 goto retry;
1da177e4
LT
1532 }
1533
3bdc0eba 1534 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1535 /* Earlier code assumed this would be a VLAN pkt,
1536 * double-check this now that we have the actual
1537 * packet in hand.
1538 */
1539 struct ethhdr *ehdr;
1540 skb_reset_mac_header(skb);
1541 ehdr = eth_hdr(skb);
1542 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1543 err = -EMSGSIZE;
1544 goto out_unlock;
1545 }
1546 }
1a35ca80 1547
1da177e4
LT
1548 skb->protocol = proto;
1549 skb->dev = dev;
1550 skb->priority = sk->sk_priority;
2d37a186 1551 skb->mark = sk->sk_mark;
bf84a010
DB
1552
1553 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1554
3bdc0eba
BG
1555 if (unlikely(extra_len == 4))
1556 skb->no_fcs = 1;
1557
40893fd0 1558 skb_probe_transport_header(skb, 0);
c1aad275 1559
1da177e4 1560 dev_queue_xmit(skb);
654d1f8a 1561 rcu_read_unlock();
40d4e3df 1562 return len;
1da177e4 1563
1da177e4 1564out_unlock:
654d1f8a 1565 rcu_read_unlock();
1a35ca80
ED
1566out_free:
1567 kfree_skb(skb);
1da177e4
LT
1568 return err;
1569}
1da177e4 1570
eea49cc9 1571static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1572 const struct sock *sk,
dbcb5855 1573 unsigned int res)
1da177e4
LT
1574{
1575 struct sk_filter *filter;
fda9ef5d 1576
1577 rcu_read_lock();
1578 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1579 if (filter != NULL)
0a14842f 1580 res = SK_RUN_FILTER(filter, skb);
80f8f102 1581 rcu_read_unlock();
1da177e4 1582
dbcb5855 1583 return res;
1584}
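/* run_filter() executes whatever classic BPF program is attached to the
 * socket; its return value then clamps snaplen in the callers below. A
 * minimal sketch attaching a one-instruction filter that accepts every packet
 * but truncates it to 96 bytes (the length is an arbitrary assumption);
 * illustrative user-space code, not part of this file.
 */
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_truncating_filter(int fd)
{
	struct sock_filter insns[] = {
		{ 0x06, 0, 0, 96 },	/* BPF_RET | BPF_K: accept 96 bytes */
	};
	struct sock_fprog prog = { .len = 1, .filter = insns };

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}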
1585
/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
1597
1598static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1599 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1600{
1601 struct sock *sk;
1602 struct sockaddr_ll *sll;
1603 struct packet_sock *po;
40d4e3df 1604 u8 *skb_head = skb->data;
1da177e4 1605 int skb_len = skb->len;
dbcb5855 1606 unsigned int snaplen, res;
1da177e4
LT
1607
1608 if (skb->pkt_type == PACKET_LOOPBACK)
1609 goto drop;
1610
1611 sk = pt->af_packet_priv;
1612 po = pkt_sk(sk);
1613
09ad9bc7 1614 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1615 goto drop;
1616
1da177e4
LT
1617 skb->dev = dev;
1618
3b04ddde 1619 if (dev->header_ops) {
1da177e4 1620 /* The device has an explicit notion of ll header,
62ab0812
ED
1621 * exported to higher levels.
1622 *
1623 * Otherwise, the device hides details of its frame
1624 * structure, so that corresponding packet head is
1625 * never delivered to user.
1da177e4
LT
1626 */
1627 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1628 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1629 else if (skb->pkt_type == PACKET_OUTGOING) {
1630 /* Special case: outgoing packets have ll header at head */
bbe735e4 1631 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1632 }
1633 }
1634
1635 snaplen = skb->len;
1636
dbcb5855
DM
1637 res = run_filter(skb, sk, snaplen);
1638 if (!res)
fda9ef5d 1639 goto drop_n_restore;
dbcb5855
DM
1640 if (snaplen > res)
1641 snaplen = res;
1da177e4 1642
0fd7bac6 1643 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1644 goto drop_n_acct;
1645
1646 if (skb_shared(skb)) {
1647 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1648 if (nskb == NULL)
1649 goto drop_n_acct;
1650
1651 if (skb_head != skb->data) {
1652 skb->data = skb_head;
1653 skb->len = skb_len;
1654 }
abc4e4fa 1655 consume_skb(skb);
1da177e4
LT
1656 skb = nskb;
1657 }
1658
ffbc6111
HX
1659 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1660 sizeof(skb->cb));
1661
1662 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1663 sll->sll_family = AF_PACKET;
1664 sll->sll_hatype = dev->type;
1665 sll->sll_protocol = skb->protocol;
1666 sll->sll_pkttype = skb->pkt_type;
8032b464 1667 if (unlikely(po->origdev))
80feaacb
PWJ
1668 sll->sll_ifindex = orig_dev->ifindex;
1669 else
1670 sll->sll_ifindex = dev->ifindex;
1da177e4 1671
b95cce35 1672 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1673
ffbc6111 1674 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1675
1da177e4
LT
1676 if (pskb_trim(skb, snaplen))
1677 goto drop_n_acct;
1678
1679 skb_set_owner_r(skb, sk);
1680 skb->dev = NULL;
adf30907 1681 skb_dst_drop(skb);
1da177e4 1682
84531c24
PO
1683 /* drop conntrack reference */
1684 nf_reset(skb);
1685
1da177e4
LT
1686 spin_lock(&sk->sk_receive_queue.lock);
1687 po->stats.tp_packets++;
3b885787 1688 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1689 __skb_queue_tail(&sk->sk_receive_queue, skb);
1690 spin_unlock(&sk->sk_receive_queue.lock);
1691 sk->sk_data_ready(sk, skb->len);
1692 return 0;
1693
1694drop_n_acct:
1695 spin_lock(&sk->sk_receive_queue.lock);
1696 po->stats.tp_drops++;
1697 atomic_inc(&sk->sk_drops);
1698 spin_unlock(&sk->sk_receive_queue.lock);
1699
1700drop_n_restore:
1701 if (skb_head != skb->data && skb_shared(skb)) {
1702 skb->data = skb_head;
1703 skb->len = skb_len;
1704 }
1705drop:
ead2ceb0 1706 consume_skb(skb);
1707 return 0;
1708}
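/* packet_rcv() is the non-mmap receive path: accepted skbs are queued on
 * sk_receive_queue with the sockaddr_ll filled in above, and a plain
 * recvfrom() picks them up one at a time. Below is a minimal sketch of that
 * consumer side; illustrative user-space code, not part of this file.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static ssize_t read_one_packet(int fd, void *buf, size_t len)
{
	struct sockaddr_ll sll;
	socklen_t slen = sizeof(sll);

	/* sll.sll_ifindex, sll_pkttype, sll_halen/sll_addr etc. mirror the
	 * fields packet_rcv() stored in the skb's control block.
	 */
	return recvfrom(fd, buf, len, 0, (struct sockaddr *)&sll, &slen);
}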
1709
40d4e3df
ED
1710static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1711 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1712{
1713 struct sock *sk;
1714 struct packet_sock *po;
1715 struct sockaddr_ll *sll;
184f489e 1716 union tpacket_uhdr h;
40d4e3df 1717 u8 *skb_head = skb->data;
1da177e4 1718 int skb_len = skb->len;
dbcb5855 1719 unsigned int snaplen, res;
f6fb8f10 1720 unsigned long status = TP_STATUS_USER;
bbd6ef87 1721 unsigned short macoff, netoff, hdrlen;
1da177e4 1722 struct sk_buff *copy_skb = NULL;
bbd6ef87 1723 struct timespec ts;
1da177e4
LT
1724
1725 if (skb->pkt_type == PACKET_LOOPBACK)
1726 goto drop;
1727
1728 sk = pt->af_packet_priv;
1729 po = pkt_sk(sk);
1730
09ad9bc7 1731 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1732 goto drop;
1733
3b04ddde 1734 if (dev->header_ops) {
1da177e4 1735 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1736 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1737 else if (skb->pkt_type == PACKET_OUTGOING) {
1738 /* Special case: outgoing packets have ll header at head */
bbe735e4 1739 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1740 }
1741 }
1742
8dc41944
HX
1743 if (skb->ip_summed == CHECKSUM_PARTIAL)
1744 status |= TP_STATUS_CSUMNOTREADY;
1745
1da177e4
LT
1746 snaplen = skb->len;
1747
dbcb5855
DM
1748 res = run_filter(skb, sk, snaplen);
1749 if (!res)
fda9ef5d 1750 goto drop_n_restore;
dbcb5855
DM
1751 if (snaplen > res)
1752 snaplen = res;
1da177e4
LT
1753
1754 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1755 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1756 po->tp_reserve;
1da177e4 1757 } else {
95c96174 1758 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1759 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1760 (maclen < 16 ? 16 : maclen)) +
1761 po->tp_reserve;
1da177e4
LT
1762 macoff = netoff - maclen;
1763 }
f6fb8f10 1764 if (po->tp_version <= TPACKET_V2) {
1765 if (macoff + snaplen > po->rx_ring.frame_size) {
1766 if (po->copy_thresh &&
0fd7bac6 1767 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1768 if (skb_shared(skb)) {
1769 copy_skb = skb_clone(skb, GFP_ATOMIC);
1770 } else {
1771 copy_skb = skb_get(skb);
1772 skb_head = skb->data;
1773 }
1774 if (copy_skb)
1775 skb_set_owner_r(copy_skb, sk);
1da177e4 1776 }
f6fb8f10 1777 snaplen = po->rx_ring.frame_size - macoff;
1778 if ((int)snaplen < 0)
1779 snaplen = 0;
1da177e4 1780 }
1da177e4 1781 }
1da177e4 1782 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1783 h.raw = packet_current_rx_frame(po, skb,
1784 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1785 if (!h.raw)
1da177e4 1786 goto ring_is_full;
f6fb8f10 1787 if (po->tp_version <= TPACKET_V2) {
1788 packet_increment_rx_head(po, &po->rx_ring);
1789 /*
1790 * LOSING will be reported till you read the stats,
1791 * because it's COR - Clear On Read.
1792 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1793 * at packet level.
1794 */
1795 if (po->stats.tp_drops)
1796 status |= TP_STATUS_LOSING;
1797 }
1da177e4
LT
1798 po->stats.tp_packets++;
1799 if (copy_skb) {
1800 status |= TP_STATUS_COPY;
1801 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1802 }
1da177e4
LT
1803 spin_unlock(&sk->sk_receive_queue.lock);
1804
bbd6ef87 1805 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
7a51384c
DB
1806 if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
1807 getnstimeofday(&ts);
1da177e4 1808
bbd6ef87
PM
1809 switch (po->tp_version) {
1810 case TPACKET_V1:
1811 h.h1->tp_len = skb->len;
1812 h.h1->tp_snaplen = snaplen;
1813 h.h1->tp_mac = macoff;
1814 h.h1->tp_net = netoff;
4b457bdf
DB
1815 h.h1->tp_sec = ts.tv_sec;
1816 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1817 hdrlen = sizeof(*h.h1);
1818 break;
1819 case TPACKET_V2:
1820 h.h2->tp_len = skb->len;
1821 h.h2->tp_snaplen = snaplen;
1822 h.h2->tp_mac = macoff;
1823 h.h2->tp_net = netoff;
bbd6ef87
PM
1824 h.h2->tp_sec = ts.tv_sec;
1825 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1826 if (vlan_tx_tag_present(skb)) {
1827 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1828 status |= TP_STATUS_VLAN_VALID;
1829 } else {
1830 h.h2->tp_vlan_tci = 0;
1831 }
13fcb7bd 1832 h.h2->tp_padding = 0;
bbd6ef87
PM
1833 hdrlen = sizeof(*h.h2);
1834 break;
f6fb8f10 1835 case TPACKET_V3:
1836 /* tp_nxt_offset and vlan are already populated above,
1837 * so don't clear those fields here.
1838 */
1839 h.h3->tp_status |= status;
1840 h.h3->tp_len = skb->len;
1841 h.h3->tp_snaplen = snaplen;
1842 h.h3->tp_mac = macoff;
1843 h.h3->tp_net = netoff;
f6fb8f10 1844 h.h3->tp_sec = ts.tv_sec;
1845 h.h3->tp_nsec = ts.tv_nsec;
1846 hdrlen = sizeof(*h.h3);
1847 break;
bbd6ef87
PM
1848 default:
1849 BUG();
1850 }
1da177e4 1851
bbd6ef87 1852 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1853 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1854 sll->sll_family = AF_PACKET;
1855 sll->sll_hatype = dev->type;
1856 sll->sll_protocol = skb->protocol;
1857 sll->sll_pkttype = skb->pkt_type;
8032b464 1858 if (unlikely(po->origdev))
80feaacb
PWJ
1859 sll->sll_ifindex = orig_dev->ifindex;
1860 else
1861 sll->sll_ifindex = dev->ifindex;
1da177e4 1862
e16aa207 1863 smp_mb();
f6dafa95 1864#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1865 {
0af55bb5
CG
1866 u8 *start, *end;
1867
f6fb8f10 1868 if (po->tp_version <= TPACKET_V2) {
1869 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1870 + macoff + snaplen);
1871 for (start = h.raw; start < end; start += PAGE_SIZE)
1872 flush_dcache_page(pgv_to_page(start));
1873 }
cc9f01b2 1874 smp_wmb();
1da177e4 1875 }
f6dafa95 1876#endif
f6fb8f10 1877 if (po->tp_version <= TPACKET_V2)
1878 __packet_set_status(po, h.raw, status);
1879 else
1880 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1881
1882 sk->sk_data_ready(sk, 0);
1883
1884drop_n_restore:
1885 if (skb_head != skb->data && skb_shared(skb)) {
1886 skb->data = skb_head;
1887 skb->len = skb_len;
1888 }
1889drop:
1ce4f28b 1890 kfree_skb(skb);
1da177e4
LT
1891 return 0;
1892
1893ring_is_full:
1894 po->stats.tp_drops++;
1895 spin_unlock(&sk->sk_receive_queue.lock);
1896
1897 sk->sk_data_ready(sk, 0);
acb5d75b 1898 kfree_skb(copy_skb);
1da177e4
LT
1899 goto drop_n_restore;
1900}
1901
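/* TX-ring completion handler, installed as skb->destructor by
 * tpacket_snd(): once the driver releases the skb, store the TX
 * timestamp in the ring frame, flip its status back to
 * TP_STATUS_AVAILABLE and drop the pending count so the slot can be
 * reused by user space.
 */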
69e3c75f
JB
1902static void tpacket_destruct_skb(struct sk_buff *skb)
1903{
1904 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1905 void *ph;
1da177e4 1906
69e3c75f
JB
1907 if (likely(po->tx_ring.pg_vec)) {
1908 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1909 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1910 atomic_dec(&po->tx_ring.pending);
7a51384c 1911 __packet_set_timestamp(po, ph, skb);
69e3c75f
JB
1912 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1913 }
1914
1915 sock_wfree(skb);
1916}
1917
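/* Build an skb from one TX-ring frame: the link-layer header is either
 * constructed with dev_hard_header() (SOCK_DGRAM) or copied from the
 * frame into the skb head (SOCK_RAW), and the remaining frame data is
 * attached page by page as fragments.  Returns tp_len or a negative
 * error code.
 */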
40d4e3df
ED
1918static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1919 void *frame, struct net_device *dev, int size_max,
ae641949 1920 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1921{
184f489e 1922 union tpacket_uhdr ph;
69e3c75f
JB
1923 int to_write, offset, len, tp_len, nr_frags, len_max;
1924 struct socket *sock = po->sk.sk_socket;
1925 struct page *page;
1926 void *data;
1927 int err;
1928
1929 ph.raw = frame;
1930
1931 skb->protocol = proto;
1932 skb->dev = dev;
1933 skb->priority = po->sk.sk_priority;
2d37a186 1934 skb->mark = po->sk.sk_mark;
2e31396f 1935 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1936 skb_shinfo(skb)->destructor_arg = ph.raw;
1937
1938 switch (po->tp_version) {
1939 case TPACKET_V2:
1940 tp_len = ph.h2->tp_len;
1941 break;
1942 default:
1943 tp_len = ph.h1->tp_len;
1944 break;
1945 }
1946 if (unlikely(tp_len > size_max)) {
40d4e3df 1947 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1948 return -EMSGSIZE;
1949 }
1950
ae641949 1951 skb_reserve(skb, hlen);
69e3c75f 1952 skb_reset_network_header(skb);
40893fd0 1953 skb_probe_transport_header(skb, 0);
c1aad275 1954
5920cd3a
PC
1955 if (po->tp_tx_has_off) {
1956 int off_min, off_max, off;
1957 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1958 off_max = po->tx_ring.frame_size - tp_len;
1959 if (sock->type == SOCK_DGRAM) {
1960 switch (po->tp_version) {
1961 case TPACKET_V2:
1962 off = ph.h2->tp_net;
1963 break;
1964 default:
1965 off = ph.h1->tp_net;
1966 break;
1967 }
1968 } else {
1969 switch (po->tp_version) {
1970 case TPACKET_V2:
1971 off = ph.h2->tp_mac;
1972 break;
1973 default:
1974 off = ph.h1->tp_mac;
1975 break;
1976 }
1977 }
1978 if (unlikely((off < off_min) || (off_max < off)))
1979 return -EINVAL;
1980 data = ph.raw + off;
1981 } else {
1982 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1983 }
69e3c75f
JB
1984 to_write = tp_len;
1985
1986 if (sock->type == SOCK_DGRAM) {
1987 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1988 NULL, tp_len);
1989 if (unlikely(err < 0))
1990 return -EINVAL;
40d4e3df 1991 } else if (dev->hard_header_len) {
69e3c75f
JB
1992 /* net device doesn't like empty head */
1993 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1994 pr_err("packet size is too short (%d < %d)\n",
1995 tp_len, dev->hard_header_len);
69e3c75f
JB
1996 return -EINVAL;
1997 }
1998
1999 skb_push(skb, dev->hard_header_len);
2000 err = skb_store_bits(skb, 0, data,
2001 dev->hard_header_len);
2002 if (unlikely(err))
2003 return err;
2004
2005 data += dev->hard_header_len;
2006 to_write -= dev->hard_header_len;
2007 }
2008
69e3c75f
JB
2009 offset = offset_in_page(data);
2010 len_max = PAGE_SIZE - offset;
2011 len = ((to_write > len_max) ? len_max : to_write);
2012
2013 skb->data_len = to_write;
2014 skb->len += to_write;
2015 skb->truesize += to_write;
2016 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2017
2018 while (likely(to_write)) {
2019 nr_frags = skb_shinfo(skb)->nr_frags;
2020
2021 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2022 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2023 MAX_SKB_FRAGS);
69e3c75f
JB
2024 return -EFAULT;
2025 }
2026
0af55bb5
CG
2027 page = pgv_to_page(data);
2028 data += len;
69e3c75f
JB
2029 flush_dcache_page(page);
2030 get_page(page);
0af55bb5 2031 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2032 to_write -= len;
2033 offset = 0;
2034 len_max = PAGE_SIZE;
2035 len = ((to_write > len_max) ? len_max : to_write);
2036 }
2037
2038 return tp_len;
2039}
2040
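/* TX-ring send loop: walk frames marked TP_STATUS_SEND_REQUEST, build
 * an skb for each with tpacket_fill_skb(), mark the frame
 * TP_STATUS_SENDING and hand it to dev_queue_xmit(); completion is
 * reported asynchronously via tpacket_destruct_skb().
 */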
2041static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2042{
69e3c75f
JB
2043 struct sk_buff *skb;
2044 struct net_device *dev;
2045 __be16 proto;
827d9780
BG
2046 bool need_rls_dev = false;
2047 int err, reserve = 0;
40d4e3df
ED
2048 void *ph;
2049 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2050 int tp_len, size_max;
2051 unsigned char *addr;
2052 int len_sum = 0;
9e67030a 2053 int status = TP_STATUS_AVAILABLE;
ae641949 2054 int hlen, tlen;
69e3c75f 2055
69e3c75f
JB
2056 mutex_lock(&po->pg_vec_lock);
2057
69e3c75f 2058 if (saddr == NULL) {
827d9780 2059 dev = po->prot_hook.dev;
69e3c75f
JB
2060 proto = po->num;
2061 addr = NULL;
2062 } else {
2063 err = -EINVAL;
2064 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2065 goto out;
2066 if (msg->msg_namelen < (saddr->sll_halen
2067 + offsetof(struct sockaddr_ll,
2068 sll_addr)))
2069 goto out;
69e3c75f
JB
2070 proto = saddr->sll_protocol;
2071 addr = saddr->sll_addr;
827d9780
BG
2072 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2073 need_rls_dev = true;
69e3c75f
JB
2074 }
2075
69e3c75f
JB
2076 err = -ENXIO;
2077 if (unlikely(dev == NULL))
2078 goto out;
2079
2080 reserve = dev->hard_header_len;
2081
2082 err = -ENETDOWN;
2083 if (unlikely(!(dev->flags & IFF_UP)))
2084 goto out_put;
2085
2086 size_max = po->tx_ring.frame_size
b5dd884e 2087 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2088
2089 if (size_max > dev->mtu + reserve)
2090 size_max = dev->mtu + reserve;
2091
2092 do {
2093 ph = packet_current_frame(po, &po->tx_ring,
2094 TP_STATUS_SEND_REQUEST);
2095
2096 if (unlikely(ph == NULL)) {
2097 schedule();
2098 continue;
2099 }
2100
2101 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2102 hlen = LL_RESERVED_SPACE(dev);
2103 tlen = dev->needed_tailroom;
69e3c75f 2104 skb = sock_alloc_send_skb(&po->sk,
ae641949 2105 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2106 0, &err);
2107
2108 if (unlikely(skb == NULL))
2109 goto out_status;
2110
2111 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2112 addr, hlen);
69e3c75f
JB
2113
2114 if (unlikely(tp_len < 0)) {
2115 if (po->tp_loss) {
2116 __packet_set_status(po, ph,
2117 TP_STATUS_AVAILABLE);
2118 packet_increment_head(&po->tx_ring);
2119 kfree_skb(skb);
2120 continue;
2121 } else {
2122 status = TP_STATUS_WRONG_FORMAT;
2123 err = tp_len;
2124 goto out_status;
2125 }
2126 }
2127
2128 skb->destructor = tpacket_destruct_skb;
2129 __packet_set_status(po, ph, TP_STATUS_SENDING);
2130 atomic_inc(&po->tx_ring.pending);
2131
2132 status = TP_STATUS_SEND_REQUEST;
2133 err = dev_queue_xmit(skb);
eb70df13
JP
2134 if (unlikely(err > 0)) {
2135 err = net_xmit_errno(err);
2136 if (err && __packet_get_status(po, ph) ==
2137 TP_STATUS_AVAILABLE) {
2138 /* skb was destructed already */
2139 skb = NULL;
2140 goto out_status;
2141 }
2142 /*
2143 * skb was dropped but not destructed yet;
2144 * let's treat it like congestion or err < 0
2145 */
2146 err = 0;
2147 }
69e3c75f
JB
2148 packet_increment_head(&po->tx_ring);
2149 len_sum += tp_len;
f64f9e71
JP
2150 } while (likely((ph != NULL) ||
2151 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2152 (atomic_read(&po->tx_ring.pending))))
2153 );
69e3c75f
JB
2154
2155 err = len_sum;
2156 goto out_put;
2157
69e3c75f
JB
2158out_status:
2159 __packet_set_status(po, ph, status);
2160 kfree_skb(skb);
2161out_put:
827d9780
BG
2162 if (need_rls_dev)
2163 dev_put(dev);
69e3c75f
JB
2164out:
2165 mutex_unlock(&po->pg_vec_lock);
2166 return err;
2167}
69e3c75f 2168
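/* Allocate an skb for the non-ring send path: keep at most 'linear'
 * bytes (the vnet header's hdr_len hint) in the linear area and leave
 * the remainder as paged data.
 */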
eea49cc9
OJ
2169static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2170 size_t reserve, size_t len,
2171 size_t linear, int noblock,
2172 int *err)
bfd5f4a3
SS
2173{
2174 struct sk_buff *skb;
2175
2176 /* Under a page? Don't bother with paged skb. */
2177 if (prepad + len < PAGE_SIZE || !linear)
2178 linear = len;
2179
2180 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2181 err);
2182 if (!skb)
2183 return NULL;
2184
2185 skb_reserve(skb, reserve);
2186 skb_put(skb, linear);
2187 skb->data_len = len - linear;
2188 skb->len += len - linear;
2189
2190 return skb;
2191}
2192
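/* Non-ring transmit path: build one skb per sendmsg() call, optionally
 * parsing a leading virtio_net_hdr for checksum and GSO offload when
 * PACKET_VNET_HDR is enabled on the socket.
 */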
69e3c75f 2193static int packet_snd(struct socket *sock,
1da177e4
LT
2194 struct msghdr *msg, size_t len)
2195{
2196 struct sock *sk = sock->sk;
40d4e3df 2197 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2198 struct sk_buff *skb;
2199 struct net_device *dev;
0e11c91e 2200 __be16 proto;
827d9780 2201 bool need_rls_dev = false;
1da177e4 2202 unsigned char *addr;
827d9780 2203 int err, reserve = 0;
bfd5f4a3
SS
2204 struct virtio_net_hdr vnet_hdr = { 0 };
2205 int offset = 0;
2206 int vnet_hdr_len;
2207 struct packet_sock *po = pkt_sk(sk);
2208 unsigned short gso_type = 0;
ae641949 2209 int hlen, tlen;
3bdc0eba 2210 int extra_len = 0;
1da177e4
LT
2211
2212 /*
1ce4f28b 2213 * Get and verify the address.
1da177e4 2214 */
1ce4f28b 2215
1da177e4 2216 if (saddr == NULL) {
827d9780 2217 dev = po->prot_hook.dev;
1da177e4
LT
2218 proto = po->num;
2219 addr = NULL;
2220 } else {
2221 err = -EINVAL;
2222 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2223 goto out;
0fb375fb
EB
2224 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2225 goto out;
1da177e4
LT
2226 proto = saddr->sll_protocol;
2227 addr = saddr->sll_addr;
827d9780
BG
2228 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2229 need_rls_dev = true;
1da177e4
LT
2230 }
2231
1da177e4
LT
2232 err = -ENXIO;
2233 if (dev == NULL)
2234 goto out_unlock;
2235 if (sock->type == SOCK_RAW)
2236 reserve = dev->hard_header_len;
2237
d5e76b0a
DM
2238 err = -ENETDOWN;
2239 if (!(dev->flags & IFF_UP))
2240 goto out_unlock;
2241
bfd5f4a3
SS
2242 if (po->has_vnet_hdr) {
2243 vnet_hdr_len = sizeof(vnet_hdr);
2244
2245 err = -EINVAL;
2246 if (len < vnet_hdr_len)
2247 goto out_unlock;
2248
2249 len -= vnet_hdr_len;
2250
2251 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2252 vnet_hdr_len);
2253 if (err < 0)
2254 goto out_unlock;
2255
2256 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2257 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2258 vnet_hdr.hdr_len))
2259 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2260 vnet_hdr.csum_offset + 2;
2261
2262 err = -EINVAL;
2263 if (vnet_hdr.hdr_len > len)
2264 goto out_unlock;
2265
2266 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2267 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2268 case VIRTIO_NET_HDR_GSO_TCPV4:
2269 gso_type = SKB_GSO_TCPV4;
2270 break;
2271 case VIRTIO_NET_HDR_GSO_TCPV6:
2272 gso_type = SKB_GSO_TCPV6;
2273 break;
2274 case VIRTIO_NET_HDR_GSO_UDP:
2275 gso_type = SKB_GSO_UDP;
2276 break;
2277 default:
2278 goto out_unlock;
2279 }
2280
2281 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2282 gso_type |= SKB_GSO_TCP_ECN;
2283
2284 if (vnet_hdr.gso_size == 0)
2285 goto out_unlock;
2286
2287 }
2288 }
2289
3bdc0eba
BG
2290 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2291 if (!netif_supports_nofcs(dev)) {
2292 err = -EPROTONOSUPPORT;
2293 goto out_unlock;
2294 }
2295 extra_len = 4; /* We're doing our own CRC */
2296 }
2297
1da177e4 2298 err = -EMSGSIZE;
3bdc0eba 2299 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2300 goto out_unlock;
2301
bfd5f4a3 2302 err = -ENOBUFS;
ae641949
HX
2303 hlen = LL_RESERVED_SPACE(dev);
2304 tlen = dev->needed_tailroom;
2305 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2306 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2307 if (skb == NULL)
1da177e4
LT
2308 goto out_unlock;
2309
bfd5f4a3 2310 skb_set_network_header(skb, reserve);
1da177e4 2311
0c4e8581
SH
2312 err = -EINVAL;
2313 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2314 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2315 goto out_free;
1da177e4
LT
2316
2317 /* Returns -EFAULT on error */
bfd5f4a3 2318 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2319 if (err)
2320 goto out_free;
bf84a010
DB
2321
2322 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2323
3bdc0eba 2324 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2325 /* Earlier code assumed this would be a VLAN pkt,
2326 * double-check this now that we have the actual
2327 * packet in hand.
2328 */
2329 struct ethhdr *ehdr;
2330 skb_reset_mac_header(skb);
2331 ehdr = eth_hdr(skb);
2332 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2333 err = -EMSGSIZE;
2334 goto out_free;
2335 }
2336 }
2337
1da177e4
LT
2338 skb->protocol = proto;
2339 skb->dev = dev;
2340 skb->priority = sk->sk_priority;
2d37a186 2341 skb->mark = sk->sk_mark;
1da177e4 2342
bfd5f4a3
SS
2343 if (po->has_vnet_hdr) {
2344 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2345 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2346 vnet_hdr.csum_offset)) {
2347 err = -EINVAL;
2348 goto out_free;
2349 }
2350 }
2351
2352 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2353 skb_shinfo(skb)->gso_type = gso_type;
2354
2355 /* Header must be checked, and gso_segs computed. */
2356 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2357 skb_shinfo(skb)->gso_segs = 0;
2358
2359 len += vnet_hdr_len;
2360 }
2361
40893fd0 2362 skb_probe_transport_header(skb, reserve);
c1aad275 2363
3bdc0eba
BG
2364 if (unlikely(extra_len == 4))
2365 skb->no_fcs = 1;
2366
1da177e4
LT
2367 /*
2368 * Now send it
2369 */
2370
2371 err = dev_queue_xmit(skb);
2372 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2373 goto out_unlock;
2374
827d9780
BG
2375 if (need_rls_dev)
2376 dev_put(dev);
1da177e4 2377
40d4e3df 2378 return len;
1da177e4
LT
2379
2380out_free:
2381 kfree_skb(skb);
2382out_unlock:
827d9780 2383 if (dev && need_rls_dev)
1da177e4
LT
2384 dev_put(dev);
2385out:
2386 return err;
2387}
2388
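/* sendmsg() entry point: use the TX ring if one was configured via
 * PACKET_TX_RING, otherwise fall back to the per-call packet_snd()
 * path.
 */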
69e3c75f
JB
2389static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2390 struct msghdr *msg, size_t len)
2391{
69e3c75f
JB
2392 struct sock *sk = sock->sk;
2393 struct packet_sock *po = pkt_sk(sk);
2394 if (po->tx_ring.pg_vec)
2395 return tpacket_snd(po, msg);
2396 else
69e3c75f
JB
2397 return packet_snd(sock, msg, len);
2398}
2399
1da177e4
LT
2400/*
2401 * Close a PACKET socket. This is fairly simple. We immediately go
2402 * to 'closed' state and remove our protocol entry in the device list.
2403 */
2404
2405static int packet_release(struct socket *sock)
2406{
2407 struct sock *sk = sock->sk;
2408 struct packet_sock *po;
d12d01d6 2409 struct net *net;
f6fb8f10 2410 union tpacket_req_u req_u;
1da177e4
LT
2411
2412 if (!sk)
2413 return 0;
2414
3b1e0a65 2415 net = sock_net(sk);
1da177e4
LT
2416 po = pkt_sk(sk);
2417
0fa7fa98 2418 mutex_lock(&net->packet.sklist_lock);
808f5114 2419 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2420 mutex_unlock(&net->packet.sklist_lock);
2421
2422 preempt_disable();
920de804 2423 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2424 preempt_enable();
1da177e4 2425
808f5114 2426 spin_lock(&po->bind_lock);
ce06b03e 2427 unregister_prot_hook(sk, false);
160ff18a
BG
2428 if (po->prot_hook.dev) {
2429 dev_put(po->prot_hook.dev);
2430 po->prot_hook.dev = NULL;
2431 }
808f5114 2432 spin_unlock(&po->bind_lock);
1da177e4 2433
1da177e4 2434 packet_flush_mclist(sk);
1da177e4 2435
9665d5d6
PS
2436 if (po->rx_ring.pg_vec) {
2437 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2438 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2439 }
69e3c75f 2440
9665d5d6
PS
2441 if (po->tx_ring.pg_vec) {
2442 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2443 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2444 }
1da177e4 2445
dc99f600
DM
2446 fanout_release(sk);
2447
808f5114 2448 synchronize_net();
1da177e4
LT
2449 /*
2450 * Now the socket is dead. No more input will appear.
2451 */
1da177e4
LT
2452 sock_orphan(sk);
2453 sock->sk = NULL;
2454
2455 /* Purge queues */
2456
2457 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2458 sk_refcnt_debug_release(sk);
1da177e4
LT
2459
2460 sock_put(sk);
2461 return 0;
2462}
2463
2464/*
2465 * Attach a packet hook.
2466 */
2467
0e11c91e 2468static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2469{
2470 struct packet_sock *po = pkt_sk(sk);
dc99f600 2471
aef950b4
WY
2472 if (po->fanout) {
2473 if (dev)
2474 dev_put(dev);
2475
dc99f600 2476 return -EINVAL;
aef950b4 2477 }
1da177e4
LT
2478
2479 lock_sock(sk);
2480
2481 spin_lock(&po->bind_lock);
ce06b03e 2482 unregister_prot_hook(sk, true);
1da177e4
LT
2483 po->num = protocol;
2484 po->prot_hook.type = protocol;
160ff18a
BG
2485 if (po->prot_hook.dev)
2486 dev_put(po->prot_hook.dev);
1da177e4
LT
2487 po->prot_hook.dev = dev;
2488
2489 po->ifindex = dev ? dev->ifindex : 0;
2490
2491 if (protocol == 0)
2492 goto out_unlock;
2493
be85d4ad 2494 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2495 register_prot_hook(sk);
be85d4ad
UT
2496 } else {
2497 sk->sk_err = ENETDOWN;
2498 if (!sock_flag(sk, SOCK_DEAD))
2499 sk->sk_error_report(sk);
1da177e4
LT
2500 }
2501
2502out_unlock:
2503 spin_unlock(&po->bind_lock);
2504 release_sock(sk);
2505 return 0;
2506}
2507
2508/*
2509 * Bind a packet socket to a device
2510 */
2511
40d4e3df
ED
2512static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2513 int addr_len)
1da177e4 2514{
40d4e3df 2515 struct sock *sk = sock->sk;
1da177e4
LT
2516 char name[15];
2517 struct net_device *dev;
2518 int err = -ENODEV;
1ce4f28b 2519
1da177e4
LT
2520 /*
2521 * Check legality
2522 */
1ce4f28b 2523
8ae55f04 2524 if (addr_len != sizeof(struct sockaddr))
1da177e4 2525 return -EINVAL;
40d4e3df 2526 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2527
3b1e0a65 2528 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2529 if (dev)
1da177e4 2530 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2531 return err;
2532}
1da177e4
LT
2533
2534static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2535{
40d4e3df
ED
2536 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2537 struct sock *sk = sock->sk;
1da177e4
LT
2538 struct net_device *dev = NULL;
2539 int err;
2540
2541
2542 /*
2543 * Check legality
2544 */
1ce4f28b 2545
1da177e4
LT
2546 if (addr_len < sizeof(struct sockaddr_ll))
2547 return -EINVAL;
2548 if (sll->sll_family != AF_PACKET)
2549 return -EINVAL;
2550
2551 if (sll->sll_ifindex) {
2552 err = -ENODEV;
3b1e0a65 2553 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2554 if (dev == NULL)
2555 goto out;
2556 }
2557 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2558
2559out:
2560 return err;
2561}
2562
2563static struct proto packet_proto = {
2564 .name = "PACKET",
2565 .owner = THIS_MODULE,
2566 .obj_size = sizeof(struct packet_sock),
2567};
2568
2569/*
1ce4f28b 2570 * Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
1da177e4
LT
2571 */
2572
3f378b68
EP
2573static int packet_create(struct net *net, struct socket *sock, int protocol,
2574 int kern)
1da177e4
LT
2575{
2576 struct sock *sk;
2577 struct packet_sock *po;
0e11c91e 2578 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2579 int err;
2580
df008c91 2581 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2582 return -EPERM;
be02097c
DM
2583 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2584 sock->type != SOCK_PACKET)
1da177e4
LT
2585 return -ESOCKTNOSUPPORT;
2586
2587 sock->state = SS_UNCONNECTED;
2588
2589 err = -ENOBUFS;
6257ff21 2590 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2591 if (sk == NULL)
2592 goto out;
2593
2594 sock->ops = &packet_ops;
1da177e4
LT
2595 if (sock->type == SOCK_PACKET)
2596 sock->ops = &packet_ops_spkt;
be02097c 2597
1da177e4
LT
2598 sock_init_data(sock, sk);
2599
2600 po = pkt_sk(sk);
2601 sk->sk_family = PF_PACKET;
0e11c91e 2602 po->num = proto;
1da177e4
LT
2603
2604 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2605 sk_refcnt_debug_inc(sk);
1da177e4
LT
2606
2607 /*
2608 * Attach a protocol block
2609 */
2610
2611 spin_lock_init(&po->bind_lock);
905db440 2612 mutex_init(&po->pg_vec_lock);
1da177e4 2613 po->prot_hook.func = packet_rcv;
be02097c 2614
1da177e4
LT
2615 if (sock->type == SOCK_PACKET)
2616 po->prot_hook.func = packet_rcv_spkt;
be02097c 2617
1da177e4
LT
2618 po->prot_hook.af_packet_priv = sk;
2619
0e11c91e
AV
2620 if (proto) {
2621 po->prot_hook.type = proto;
ce06b03e 2622 register_prot_hook(sk);
1da177e4
LT
2623 }
2624
0fa7fa98 2625 mutex_lock(&net->packet.sklist_lock);
808f5114 2626 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2627 mutex_unlock(&net->packet.sklist_lock);
2628
2629 preempt_disable();
3680453c 2630 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2631 preempt_enable();
808f5114 2632
40d4e3df 2633 return 0;
1da177e4
LT
2634out:
2635 return err;
2636}
2637
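/* Dequeue one skb from the socket error queue for MSG_ERRQUEUE reads;
 * this is how TX timestamps are delivered to user space as a
 * PACKET_TX_TIMESTAMP control message.
 */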
ed85b565
RC
2638static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2639{
2640 struct sock_exterr_skb *serr;
2641 struct sk_buff *skb, *skb2;
2642 int copied, err;
2643
2644 err = -EAGAIN;
2645 skb = skb_dequeue(&sk->sk_error_queue);
2646 if (skb == NULL)
2647 goto out;
2648
2649 copied = skb->len;
2650 if (copied > len) {
2651 msg->msg_flags |= MSG_TRUNC;
2652 copied = len;
2653 }
2654 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2655 if (err)
2656 goto out_free_skb;
2657
2658 sock_recv_timestamp(msg, sk, skb);
2659
2660 serr = SKB_EXT_ERR(skb);
2661 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2662 sizeof(serr->ee), &serr->ee);
2663
2664 msg->msg_flags |= MSG_ERRQUEUE;
2665 err = copied;
2666
2667 /* Reset and regenerate socket error */
2668 spin_lock_bh(&sk->sk_error_queue.lock);
2669 sk->sk_err = 0;
2670 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2671 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2672 spin_unlock_bh(&sk->sk_error_queue.lock);
2673 sk->sk_error_report(sk);
2674 } else
2675 spin_unlock_bh(&sk->sk_error_queue.lock);
2676
2677out_free_skb:
2678 kfree_skb(skb);
2679out:
2680 return err;
2681}
2682
1da177e4
LT
2683/*
2684 * Pull a packet from our receive queue and hand it to the user.
2685 * If necessary we block.
2686 */
2687
2688static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2689 struct msghdr *msg, size_t len, int flags)
2690{
2691 struct sock *sk = sock->sk;
2692 struct sk_buff *skb;
2693 int copied, err;
0fb375fb 2694 struct sockaddr_ll *sll;
bfd5f4a3 2695 int vnet_hdr_len = 0;
1da177e4
LT
2696
2697 err = -EINVAL;
ed85b565 2698 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2699 goto out;
2700
2701#if 0
2702 /* What error should we return now? EUNATTACH? */
2703 if (pkt_sk(sk)->ifindex < 0)
2704 return -ENODEV;
2705#endif
2706
ed85b565
RC
2707 if (flags & MSG_ERRQUEUE) {
2708 err = packet_recv_error(sk, msg, len);
2709 goto out;
2710 }
2711
1da177e4
LT
2712 /*
2713 * Call the generic datagram receiver. This handles all sorts
2714 * of horrible races and re-entrancy so we can forget about it
2715 * in the protocol layers.
2716 *
2717 * Now it will return ENETDOWN if the device has just gone down,
2718 * but then it will block.
2719 */
2720
40d4e3df 2721 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2722
2723 /*
1ce4f28b 2724 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2725 * handles the blocking, we don't have to see or worry about
2726 * blocking retries.
2727 */
2728
8ae55f04 2729 if (skb == NULL)
1da177e4
LT
2730 goto out;
2731
bfd5f4a3
SS
2732 if (pkt_sk(sk)->has_vnet_hdr) {
2733 struct virtio_net_hdr vnet_hdr = { 0 };
2734
2735 err = -EINVAL;
2736 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2737 if (len < vnet_hdr_len)
bfd5f4a3
SS
2738 goto out_free;
2739
1f18b717
MK
2740 len -= vnet_hdr_len;
2741
bfd5f4a3
SS
2742 if (skb_is_gso(skb)) {
2743 struct skb_shared_info *sinfo = skb_shinfo(skb);
2744
2745 /* This is a hint as to how much should be linear. */
2746 vnet_hdr.hdr_len = skb_headlen(skb);
2747 vnet_hdr.gso_size = sinfo->gso_size;
2748 if (sinfo->gso_type & SKB_GSO_TCPV4)
2749 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2750 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2751 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2752 else if (sinfo->gso_type & SKB_GSO_UDP)
2753 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2754 else if (sinfo->gso_type & SKB_GSO_FCOE)
2755 goto out_free;
2756 else
2757 BUG();
2758 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2759 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2760 } else
2761 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2762
2763 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2764 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2765 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2766 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2767 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2768 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2769 } /* else everything is zero */
2770
2771 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2772 vnet_hdr_len);
2773 if (err < 0)
2774 goto out_free;
2775 }
2776
0fb375fb
EB
2777 /*
2778 * If the address length field is there to be filled in, we fill
2779 * it in now.
2780 */
2781
ffbc6111 2782 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2783 if (sock->type == SOCK_PACKET)
2784 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2785 else
2786 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2787
1da177e4
LT
2788 /*
2789 * You lose any data beyond the buffer you gave. If this worries a
2790 * user program, it can ask the device for its MTU anyway.
2791 */
2792
2793 copied = skb->len;
40d4e3df
ED
2794 if (copied > len) {
2795 copied = len;
2796 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2797 }
2798
2799 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2800 if (err)
2801 goto out_free;
2802
3b885787 2803 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2804
2805 if (msg->msg_name)
ffbc6111
HX
2806 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2807 msg->msg_namelen);
1da177e4 2808
8dc41944 2809 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2810 struct tpacket_auxdata aux;
2811
2812 aux.tp_status = TP_STATUS_USER;
2813 if (skb->ip_summed == CHECKSUM_PARTIAL)
2814 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2815 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2816 aux.tp_snaplen = skb->len;
2817 aux.tp_mac = 0;
bbe735e4 2818 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2819 if (vlan_tx_tag_present(skb)) {
2820 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2821 aux.tp_status |= TP_STATUS_VLAN_VALID;
2822 } else {
2823 aux.tp_vlan_tci = 0;
2824 }
13fcb7bd 2825 aux.tp_padding = 0;
ffbc6111 2826 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2827 }
2828
1da177e4
LT
2829 /*
2830 * Free or return the buffer as appropriate. Again this
2831 * hides all the races and re-entrancy issues from us.
2832 */
bfd5f4a3 2833 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2834
2835out_free:
2836 skb_free_datagram(sk, skb);
2837out:
2838 return err;
2839}
2840
1da177e4
LT
2841static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2842 int *uaddr_len, int peer)
2843{
2844 struct net_device *dev;
2845 struct sock *sk = sock->sk;
2846
2847 if (peer)
2848 return -EOPNOTSUPP;
2849
2850 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2851 rcu_read_lock();
2852 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2853 if (dev)
67286640 2854 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2855 else
1da177e4 2856 memset(uaddr->sa_data, 0, 14);
654d1f8a 2857 rcu_read_unlock();
1da177e4
LT
2858 *uaddr_len = sizeof(*uaddr);
2859
2860 return 0;
2861}
1da177e4
LT
2862
2863static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2864 int *uaddr_len, int peer)
2865{
2866 struct net_device *dev;
2867 struct sock *sk = sock->sk;
2868 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2869 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2870
2871 if (peer)
2872 return -EOPNOTSUPP;
2873
2874 sll->sll_family = AF_PACKET;
2875 sll->sll_ifindex = po->ifindex;
2876 sll->sll_protocol = po->num;
67286640 2877 sll->sll_pkttype = 0;
654d1f8a
ED
2878 rcu_read_lock();
2879 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2880 if (dev) {
2881 sll->sll_hatype = dev->type;
2882 sll->sll_halen = dev->addr_len;
2883 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2884 } else {
2885 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2886 sll->sll_halen = 0;
2887 }
654d1f8a 2888 rcu_read_unlock();
0fb375fb 2889 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2890
2891 return 0;
2892}
2893
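/* Apply (what > 0) or remove (otherwise) a single packet_mclist entry
 * on a device: a multicast or unicast address filter, or promiscuous /
 * allmulti mode.
 */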
2aeb0b88
WC
2894static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2895 int what)
1da177e4
LT
2896{
2897 switch (i->type) {
2898 case PACKET_MR_MULTICAST:
1162563f
JP
2899 if (i->alen != dev->addr_len)
2900 return -EINVAL;
1da177e4 2901 if (what > 0)
22bedad3 2902 return dev_mc_add(dev, i->addr);
1da177e4 2903 else
22bedad3 2904 return dev_mc_del(dev, i->addr);
1da177e4
LT
2905 break;
2906 case PACKET_MR_PROMISC:
2aeb0b88 2907 return dev_set_promiscuity(dev, what);
1da177e4
LT
2908 break;
2909 case PACKET_MR_ALLMULTI:
2aeb0b88 2910 return dev_set_allmulti(dev, what);
1da177e4 2911 break;
d95ed927 2912 case PACKET_MR_UNICAST:
1162563f
JP
2913 if (i->alen != dev->addr_len)
2914 return -EINVAL;
d95ed927 2915 if (what > 0)
a748ee24 2916 return dev_uc_add(dev, i->addr);
d95ed927 2917 else
a748ee24 2918 return dev_uc_del(dev, i->addr);
d95ed927 2919 break;
40d4e3df
ED
2920 default:
2921 break;
1da177e4 2922 }
2aeb0b88 2923 return 0;
1da177e4
LT
2924}
2925
2926static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2927{
40d4e3df 2928 for ( ; i; i = i->next) {
1da177e4
LT
2929 if (i->ifindex == dev->ifindex)
2930 packet_dev_mc(dev, i, what);
2931 }
2932}
2933
0fb375fb 2934static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2935{
2936 struct packet_sock *po = pkt_sk(sk);
2937 struct packet_mclist *ml, *i;
2938 struct net_device *dev;
2939 int err;
2940
2941 rtnl_lock();
2942
2943 err = -ENODEV;
3b1e0a65 2944 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2945 if (!dev)
2946 goto done;
2947
2948 err = -EINVAL;
1162563f 2949 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2950 goto done;
2951
2952 err = -ENOBUFS;
8b3a7005 2953 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2954 if (i == NULL)
2955 goto done;
2956
2957 err = 0;
2958 for (ml = po->mclist; ml; ml = ml->next) {
2959 if (ml->ifindex == mreq->mr_ifindex &&
2960 ml->type == mreq->mr_type &&
2961 ml->alen == mreq->mr_alen &&
2962 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2963 ml->count++;
2964 /* Free the new element ... */
2965 kfree(i);
2966 goto done;
2967 }
2968 }
2969
2970 i->type = mreq->mr_type;
2971 i->ifindex = mreq->mr_ifindex;
2972 i->alen = mreq->mr_alen;
2973 memcpy(i->addr, mreq->mr_address, i->alen);
2974 i->count = 1;
2975 i->next = po->mclist;
2976 po->mclist = i;
2aeb0b88
WC
2977 err = packet_dev_mc(dev, i, 1);
2978 if (err) {
2979 po->mclist = i->next;
2980 kfree(i);
2981 }
1da177e4
LT
2982
2983done:
2984 rtnl_unlock();
2985 return err;
2986}
2987
0fb375fb 2988static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2989{
2990 struct packet_mclist *ml, **mlp;
2991
2992 rtnl_lock();
2993
2994 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2995 if (ml->ifindex == mreq->mr_ifindex &&
2996 ml->type == mreq->mr_type &&
2997 ml->alen == mreq->mr_alen &&
2998 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2999 if (--ml->count == 0) {
3000 struct net_device *dev;
3001 *mlp = ml->next;
ad959e76
ED
3002 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3003 if (dev)
1da177e4 3004 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3005 kfree(ml);
3006 }
3007 rtnl_unlock();
3008 return 0;
3009 }
3010 }
3011 rtnl_unlock();
3012 return -EADDRNOTAVAIL;
3013}
3014
3015static void packet_flush_mclist(struct sock *sk)
3016{
3017 struct packet_sock *po = pkt_sk(sk);
3018 struct packet_mclist *ml;
3019
3020 if (!po->mclist)
3021 return;
3022
3023 rtnl_lock();
3024 while ((ml = po->mclist) != NULL) {
3025 struct net_device *dev;
3026
3027 po->mclist = ml->next;
ad959e76
ED
3028 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3029 if (dev != NULL)
1da177e4 3030 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3031 kfree(ml);
3032 }
3033 rtnl_unlock();
3034}
1da177e4
LT
3035
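/* setsockopt() handler for SOL_PACKET: membership changes, RX/TX ring
 * setup, TPACKET version/reserve/loss settings, auxdata, origdev,
 * vnet headers, timestamping, fanout and tp_tx_has_off.
 */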
3036static int
b7058842 3037packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3038{
3039 struct sock *sk = sock->sk;
8dc41944 3040 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3041 int ret;
3042
3043 if (level != SOL_PACKET)
3044 return -ENOPROTOOPT;
3045
69e3c75f 3046 switch (optname) {
1ce4f28b 3047 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3048 case PACKET_DROP_MEMBERSHIP:
3049 {
0fb375fb
EB
3050 struct packet_mreq_max mreq;
3051 int len = optlen;
3052 memset(&mreq, 0, sizeof(mreq));
3053 if (len < sizeof(struct packet_mreq))
1da177e4 3054 return -EINVAL;
0fb375fb
EB
3055 if (len > sizeof(mreq))
3056 len = sizeof(mreq);
40d4e3df 3057 if (copy_from_user(&mreq, optval, len))
1da177e4 3058 return -EFAULT;
0fb375fb
EB
3059 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3060 return -EINVAL;
1da177e4
LT
3061 if (optname == PACKET_ADD_MEMBERSHIP)
3062 ret = packet_mc_add(sk, &mreq);
3063 else
3064 ret = packet_mc_drop(sk, &mreq);
3065 return ret;
3066 }
a2efcfa0 3067
1da177e4 3068 case PACKET_RX_RING:
69e3c75f 3069 case PACKET_TX_RING:
1da177e4 3070 {
f6fb8f10 3071 union tpacket_req_u req_u;
3072 int len;
1da177e4 3073
f6fb8f10 3074 switch (po->tp_version) {
3075 case TPACKET_V1:
3076 case TPACKET_V2:
3077 len = sizeof(req_u.req);
3078 break;
3079 case TPACKET_V3:
3080 default:
3081 len = sizeof(req_u.req3);
3082 break;
3083 }
3084 if (optlen < len)
1da177e4 3085 return -EINVAL;
bfd5f4a3
SS
3086 if (pkt_sk(sk)->has_vnet_hdr)
3087 return -EINVAL;
f6fb8f10 3088 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3089 return -EFAULT;
f6fb8f10 3090 return packet_set_ring(sk, &req_u, 0,
3091 optname == PACKET_TX_RING);
1da177e4
LT
3092 }
3093 case PACKET_COPY_THRESH:
3094 {
3095 int val;
3096
40d4e3df 3097 if (optlen != sizeof(val))
1da177e4 3098 return -EINVAL;
40d4e3df 3099 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3100 return -EFAULT;
3101
3102 pkt_sk(sk)->copy_thresh = val;
3103 return 0;
3104 }
bbd6ef87
PM
3105 case PACKET_VERSION:
3106 {
3107 int val;
3108
3109 if (optlen != sizeof(val))
3110 return -EINVAL;
69e3c75f 3111 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3112 return -EBUSY;
3113 if (copy_from_user(&val, optval, sizeof(val)))
3114 return -EFAULT;
3115 switch (val) {
3116 case TPACKET_V1:
3117 case TPACKET_V2:
f6fb8f10 3118 case TPACKET_V3:
bbd6ef87
PM
3119 po->tp_version = val;
3120 return 0;
3121 default:
3122 return -EINVAL;
3123 }
3124 }
8913336a
PM
3125 case PACKET_RESERVE:
3126 {
3127 unsigned int val;
3128
3129 if (optlen != sizeof(val))
3130 return -EINVAL;
69e3c75f 3131 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3132 return -EBUSY;
3133 if (copy_from_user(&val, optval, sizeof(val)))
3134 return -EFAULT;
3135 po->tp_reserve = val;
3136 return 0;
3137 }
69e3c75f
JB
3138 case PACKET_LOSS:
3139 {
3140 unsigned int val;
3141
3142 if (optlen != sizeof(val))
3143 return -EINVAL;
3144 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3145 return -EBUSY;
3146 if (copy_from_user(&val, optval, sizeof(val)))
3147 return -EFAULT;
3148 po->tp_loss = !!val;
3149 return 0;
3150 }
8dc41944
HX
3151 case PACKET_AUXDATA:
3152 {
3153 int val;
3154
3155 if (optlen < sizeof(val))
3156 return -EINVAL;
3157 if (copy_from_user(&val, optval, sizeof(val)))
3158 return -EFAULT;
3159
3160 po->auxdata = !!val;
3161 return 0;
3162 }
80feaacb
PWJ
3163 case PACKET_ORIGDEV:
3164 {
3165 int val;
3166
3167 if (optlen < sizeof(val))
3168 return -EINVAL;
3169 if (copy_from_user(&val, optval, sizeof(val)))
3170 return -EFAULT;
3171
3172 po->origdev = !!val;
3173 return 0;
3174 }
bfd5f4a3
SS
3175 case PACKET_VNET_HDR:
3176 {
3177 int val;
3178
3179 if (sock->type != SOCK_RAW)
3180 return -EINVAL;
3181 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3182 return -EBUSY;
3183 if (optlen < sizeof(val))
3184 return -EINVAL;
3185 if (copy_from_user(&val, optval, sizeof(val)))
3186 return -EFAULT;
3187
3188 po->has_vnet_hdr = !!val;
3189 return 0;
3190 }
614f60fa
SM
3191 case PACKET_TIMESTAMP:
3192 {
3193 int val;
3194
3195 if (optlen != sizeof(val))
3196 return -EINVAL;
3197 if (copy_from_user(&val, optval, sizeof(val)))
3198 return -EFAULT;
3199
3200 po->tp_tstamp = val;
3201 return 0;
3202 }
dc99f600
DM
3203 case PACKET_FANOUT:
3204 {
3205 int val;
3206
3207 if (optlen != sizeof(val))
3208 return -EINVAL;
3209 if (copy_from_user(&val, optval, sizeof(val)))
3210 return -EFAULT;
3211
3212 return fanout_add(sk, val & 0xffff, val >> 16);
3213 }
5920cd3a
PC
3214 case PACKET_TX_HAS_OFF:
3215 {
3216 unsigned int val;
3217
3218 if (optlen != sizeof(val))
3219 return -EINVAL;
3220 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3221 return -EBUSY;
3222 if (copy_from_user(&val, optval, sizeof(val)))
3223 return -EFAULT;
3224 po->tp_tx_has_off = !!val;
3225 return 0;
3226 }
1da177e4
LT
3227 default:
3228 return -ENOPROTOOPT;
3229 }
3230}
3231
3232static int packet_getsockopt(struct socket *sock, int level, int optname,
3233 char __user *optval, int __user *optlen)
3234{
3235 int len;
c06fff6e 3236 int val, lv = sizeof(val);
1da177e4
LT
3237 struct sock *sk = sock->sk;
3238 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3239 void *data = &val;
8dc41944 3240 struct tpacket_stats st;
f6fb8f10 3241 union tpacket_stats_u st_u;
1da177e4
LT
3242
3243 if (level != SOL_PACKET)
3244 return -ENOPROTOOPT;
3245
8ae55f04
KK
3246 if (get_user(len, optlen))
3247 return -EFAULT;
1da177e4
LT
3248
3249 if (len < 0)
3250 return -EINVAL;
1ce4f28b 3251
69e3c75f 3252 switch (optname) {
1da177e4 3253 case PACKET_STATISTICS:
1da177e4 3254 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3255 if (po->tp_version == TPACKET_V3) {
c06fff6e 3256 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3257 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3258 sizeof(struct tpacket_stats));
f6fb8f10 3259 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3260 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3261 st_u.stats3.tp_packets += po->stats.tp_drops;
3262 data = &st_u.stats3;
3263 } else {
c06fff6e 3264 lv = sizeof(struct tpacket_stats);
f6fb8f10 3265 st = po->stats;
3266 st.tp_packets += st.tp_drops;
3267 data = &st;
3268 }
1da177e4
LT
3269 memset(&po->stats, 0, sizeof(st));
3270 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3271 break;
3272 case PACKET_AUXDATA:
8dc41944 3273 val = po->auxdata;
80feaacb
PWJ
3274 break;
3275 case PACKET_ORIGDEV:
80feaacb 3276 val = po->origdev;
bfd5f4a3
SS
3277 break;
3278 case PACKET_VNET_HDR:
bfd5f4a3 3279 val = po->has_vnet_hdr;
1da177e4 3280 break;
bbd6ef87 3281 case PACKET_VERSION:
bbd6ef87 3282 val = po->tp_version;
bbd6ef87
PM
3283 break;
3284 case PACKET_HDRLEN:
3285 if (len > sizeof(int))
3286 len = sizeof(int);
3287 if (copy_from_user(&val, optval, len))
3288 return -EFAULT;
3289 switch (val) {
3290 case TPACKET_V1:
3291 val = sizeof(struct tpacket_hdr);
3292 break;
3293 case TPACKET_V2:
3294 val = sizeof(struct tpacket2_hdr);
3295 break;
f6fb8f10 3296 case TPACKET_V3:
3297 val = sizeof(struct tpacket3_hdr);
3298 break;
bbd6ef87
PM
3299 default:
3300 return -EINVAL;
3301 }
bbd6ef87 3302 break;
8913336a 3303 case PACKET_RESERVE:
8913336a 3304 val = po->tp_reserve;
8913336a 3305 break;
69e3c75f 3306 case PACKET_LOSS:
69e3c75f 3307 val = po->tp_loss;
69e3c75f 3308 break;
614f60fa 3309 case PACKET_TIMESTAMP:
614f60fa 3310 val = po->tp_tstamp;
614f60fa 3311 break;
dc99f600 3312 case PACKET_FANOUT:
dc99f600
DM
3313 val = (po->fanout ?
3314 ((u32)po->fanout->id |
77f65ebd
WB
3315 ((u32)po->fanout->type << 16) |
3316 ((u32)po->fanout->flags << 24)) :
dc99f600 3317 0);
dc99f600 3318 break;
5920cd3a
PC
3319 case PACKET_TX_HAS_OFF:
3320 val = po->tp_tx_has_off;
3321 break;
1da177e4
LT
3322 default:
3323 return -ENOPROTOOPT;
3324 }
3325
c06fff6e
ED
3326 if (len > lv)
3327 len = lv;
8ae55f04
KK
3328 if (put_user(len, optlen))
3329 return -EFAULT;
8dc41944
HX
3330 if (copy_to_user(optval, data, len))
3331 return -EFAULT;
8ae55f04 3332 return 0;
1da177e4
LT
3333}
3334
3335
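/* netdevice notifier: on NETDEV_UNREGISTER drop multicast entries and,
 * as for NETDEV_DOWN, detach any socket bound to the device (waking it
 * with ENETDOWN); on NETDEV_UP re-register the protocol hook for
 * sockets bound to that ifindex.
 */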
3336static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3337{
3338 struct sock *sk;
ad930650 3339 struct net_device *dev = data;
c346dca1 3340 struct net *net = dev_net(dev);
1da177e4 3341
808f5114 3342 rcu_read_lock();
b67bfe0d 3343 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3344 struct packet_sock *po = pkt_sk(sk);
3345
3346 switch (msg) {
3347 case NETDEV_UNREGISTER:
1da177e4
LT
3348 if (po->mclist)
3349 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3350 /* fallthrough */
3351
1da177e4
LT
3352 case NETDEV_DOWN:
3353 if (dev->ifindex == po->ifindex) {
3354 spin_lock(&po->bind_lock);
3355 if (po->running) {
ce06b03e 3356 __unregister_prot_hook(sk, false);
1da177e4
LT
3357 sk->sk_err = ENETDOWN;
3358 if (!sock_flag(sk, SOCK_DEAD))
3359 sk->sk_error_report(sk);
3360 }
3361 if (msg == NETDEV_UNREGISTER) {
3362 po->ifindex = -1;
160ff18a
BG
3363 if (po->prot_hook.dev)
3364 dev_put(po->prot_hook.dev);
1da177e4
LT
3365 po->prot_hook.dev = NULL;
3366 }
3367 spin_unlock(&po->bind_lock);
3368 }
3369 break;
3370 case NETDEV_UP:
808f5114 3371 if (dev->ifindex == po->ifindex) {
3372 spin_lock(&po->bind_lock);
ce06b03e
DM
3373 if (po->num)
3374 register_prot_hook(sk);
808f5114 3375 spin_unlock(&po->bind_lock);
1da177e4 3376 }
1da177e4
LT
3377 break;
3378 }
3379 }
808f5114 3380 rcu_read_unlock();
1da177e4
LT
3381 return NOTIFY_DONE;
3382}
3383
3384
3385static int packet_ioctl(struct socket *sock, unsigned int cmd,
3386 unsigned long arg)
3387{
3388 struct sock *sk = sock->sk;
3389
69e3c75f 3390 switch (cmd) {
40d4e3df
ED
3391 case SIOCOUTQ:
3392 {
3393 int amount = sk_wmem_alloc_get(sk);
31e6d363 3394
40d4e3df
ED
3395 return put_user(amount, (int __user *)arg);
3396 }
3397 case SIOCINQ:
3398 {
3399 struct sk_buff *skb;
3400 int amount = 0;
3401
3402 spin_lock_bh(&sk->sk_receive_queue.lock);
3403 skb = skb_peek(&sk->sk_receive_queue);
3404 if (skb)
3405 amount = skb->len;
3406 spin_unlock_bh(&sk->sk_receive_queue.lock);
3407 return put_user(amount, (int __user *)arg);
3408 }
3409 case SIOCGSTAMP:
3410 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3411 case SIOCGSTAMPNS:
3412 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3413
1da177e4 3414#ifdef CONFIG_INET
40d4e3df
ED
3415 case SIOCADDRT:
3416 case SIOCDELRT:
3417 case SIOCDARP:
3418 case SIOCGARP:
3419 case SIOCSARP:
3420 case SIOCGIFADDR:
3421 case SIOCSIFADDR:
3422 case SIOCGIFBRDADDR:
3423 case SIOCSIFBRDADDR:
3424 case SIOCGIFNETMASK:
3425 case SIOCSIFNETMASK:
3426 case SIOCGIFDSTADDR:
3427 case SIOCSIFDSTADDR:
3428 case SIOCSIFFLAGS:
40d4e3df 3429 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3430#endif
3431
40d4e3df
ED
3432 default:
3433 return -ENOIOCTLCMD;
1da177e4
LT
3434 }
3435 return 0;
3436}
3437
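/* poll(): on top of the usual datagram semantics, report POLLIN when
 * RX-ring data is ready for user space and POLLOUT when a TX-ring frame
 * is available for filling.
 */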
40d4e3df 3438static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3439 poll_table *wait)
3440{
3441 struct sock *sk = sock->sk;
3442 struct packet_sock *po = pkt_sk(sk);
3443 unsigned int mask = datagram_poll(file, sock, wait);
3444
3445 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3446 if (po->rx_ring.pg_vec) {
f6fb8f10 3447 if (!packet_previous_rx_frame(po, &po->rx_ring,
3448 TP_STATUS_KERNEL))
1da177e4
LT
3449 mask |= POLLIN | POLLRDNORM;
3450 }
3451 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3452 spin_lock_bh(&sk->sk_write_queue.lock);
3453 if (po->tx_ring.pg_vec) {
3454 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3455 mask |= POLLOUT | POLLWRNORM;
3456 }
3457 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3458 return mask;
3459}
3460
3461
3462/* Dirty? Well, I still have not found a better way to account
3463 * for user mmaps.
3464 */
3465
3466static void packet_mm_open(struct vm_area_struct *vma)
3467{
3468 struct file *file = vma->vm_file;
40d4e3df 3469 struct socket *sock = file->private_data;
1da177e4 3470 struct sock *sk = sock->sk;
1ce4f28b 3471
1da177e4
LT
3472 if (sk)
3473 atomic_inc(&pkt_sk(sk)->mapped);
3474}
3475
3476static void packet_mm_close(struct vm_area_struct *vma)
3477{
3478 struct file *file = vma->vm_file;
40d4e3df 3479 struct socket *sock = file->private_data;
1da177e4 3480 struct sock *sk = sock->sk;
1ce4f28b 3481
1da177e4
LT
3482 if (sk)
3483 atomic_dec(&pkt_sk(sk)->mapped);
3484}
3485
f0f37e2f 3486static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3487 .open = packet_mm_open,
3488 .close = packet_mm_close,
1da177e4
LT
3489};
3490
0e3125c7
NH
3491static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3492 unsigned int len)
1da177e4
LT
3493{
3494 int i;
3495
4ebf0ae2 3496 for (i = 0; i < len; i++) {
0e3125c7 3497 if (likely(pg_vec[i].buffer)) {
c56b4d90 3498 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3499 vfree(pg_vec[i].buffer);
3500 else
3501 free_pages((unsigned long)pg_vec[i].buffer,
3502 order);
3503 pg_vec[i].buffer = NULL;
3504 }
1da177e4
LT
3505 }
3506 kfree(pg_vec);
3507}
3508
eea49cc9 3509static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3510{
0e3125c7
NH
3511 char *buffer = NULL;
3512 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3513 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3514
3515 buffer = (char *) __get_free_pages(gfp_flags, order);
3516
3517 if (buffer)
3518 return buffer;
3519
3520 /*
3521 * __get_free_pages failed, fall back to vmalloc
3522 */
bbce5a59 3523 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3524
0e3125c7
NH
3525 if (buffer)
3526 return buffer;
3527
3528 /*
3529 * vmalloc failed, let's dig into swap here
3530 */
0e3125c7
NH
3531 gfp_flags &= ~__GFP_NORETRY;
3532 buffer = (char *)__get_free_pages(gfp_flags, order);
3533 if (buffer)
3534 return buffer;
3535
3536 /*
3537 * complete and utter failure
3538 */
3539 return NULL;
4ebf0ae2
DM
3540}
3541
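/* Allocate the page vector for a ring: one buffer per block, each
 * obtained from alloc_one_pg_vec_page() above (page allocator first,
 * then vmalloc as a fallback).
 */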
0e3125c7 3542static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3543{
3544 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3545 struct pgv *pg_vec;
4ebf0ae2
DM
3546 int i;
3547
0e3125c7 3548 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3549 if (unlikely(!pg_vec))
3550 goto out;
3551
3552 for (i = 0; i < block_nr; i++) {
c56b4d90 3553 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3554 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3555 goto out_free_pgvec;
3556 }
3557
3558out:
3559 return pg_vec;
3560
3561out_free_pgvec:
3562 free_pg_vec(pg_vec, order, block_nr);
3563 pg_vec = NULL;
3564 goto out;
3565}
1da177e4 3566
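/* Configure or tear down an RX/TX ring: validate the block/frame
 * geometry, allocate the page vector, then swap it in under
 * pg_vec_lock with the protocol hook temporarily unregistered.
 */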
f6fb8f10 3567static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3568 int closing, int tx_ring)
1da177e4 3569{
0e3125c7 3570 struct pgv *pg_vec = NULL;
1da177e4 3571 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3572 int was_running, order = 0;
69e3c75f
JB
3573 struct packet_ring_buffer *rb;
3574 struct sk_buff_head *rb_queue;
0e11c91e 3575 __be16 num;
f6fb8f10 3576 int err = -EINVAL;
3577 /* Added to avoid minimal code churn */
3578 struct tpacket_req *req = &req_u->req;
3579
3580 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3581 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3582 WARN(1, "Tx-ring is not supported.\n");
3583 goto out;
3584 }
1ce4f28b 3585
69e3c75f
JB
3586 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3587 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3588
69e3c75f
JB
3589 err = -EBUSY;
3590 if (!closing) {
3591 if (atomic_read(&po->mapped))
3592 goto out;
3593 if (atomic_read(&rb->pending))
3594 goto out;
3595 }
1da177e4 3596
69e3c75f
JB
3597 if (req->tp_block_nr) {
3598 /* Sanity tests and some calculations */
3599 err = -EBUSY;
3600 if (unlikely(rb->pg_vec))
3601 goto out;
1da177e4 3602
bbd6ef87
PM
3603 switch (po->tp_version) {
3604 case TPACKET_V1:
3605 po->tp_hdrlen = TPACKET_HDRLEN;
3606 break;
3607 case TPACKET_V2:
3608 po->tp_hdrlen = TPACKET2_HDRLEN;
3609 break;
f6fb8f10 3610 case TPACKET_V3:
3611 po->tp_hdrlen = TPACKET3_HDRLEN;
3612 break;
bbd6ef87
PM
3613 }
3614
69e3c75f 3615 err = -EINVAL;
4ebf0ae2 3616 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3617 goto out;
4ebf0ae2 3618 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3619 goto out;
8913336a 3620 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3621 po->tp_reserve))
3622 goto out;
4ebf0ae2 3623 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3624 goto out;
1da177e4 3625
69e3c75f
JB
3626 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3627 if (unlikely(rb->frames_per_block <= 0))
3628 goto out;
3629 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3630 req->tp_frame_nr))
3631 goto out;
1da177e4
LT
3632
3633 err = -ENOMEM;
4ebf0ae2
DM
3634 order = get_order(req->tp_block_size);
3635 pg_vec = alloc_pg_vec(req, order);
3636 if (unlikely(!pg_vec))
1da177e4 3637 goto out;
f6fb8f10 3638 switch (po->tp_version) {
3639 case TPACKET_V3:
3640 /* Transmit path is not supported. We checked
3641 * it above, but just being paranoid.
3642 */
3643 if (!tx_ring)
3644 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3645 break;
3646 default:
3647 break;
3648 }
69e3c75f
JB
3649 }
3650 /* Done */
3651 else {
3652 err = -EINVAL;
4ebf0ae2 3653 if (unlikely(req->tp_frame_nr))
69e3c75f 3654 goto out;
1da177e4
LT
3655 }
3656
3657 lock_sock(sk);
3658
3659 /* Detach socket from network */
3660 spin_lock(&po->bind_lock);
3661 was_running = po->running;
3662 num = po->num;
3663 if (was_running) {
1da177e4 3664 po->num = 0;
ce06b03e 3665 __unregister_prot_hook(sk, false);
1da177e4
LT
3666 }
3667 spin_unlock(&po->bind_lock);
1ce4f28b 3668
1da177e4
LT
3669 synchronize_net();
3670
3671 err = -EBUSY;
905db440 3672 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3673 if (closing || atomic_read(&po->mapped) == 0) {
3674 err = 0;
69e3c75f 3675 spin_lock_bh(&rb_queue->lock);
c053fd96 3676 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3677 rb->frame_max = (req->tp_frame_nr - 1);
3678 rb->head = 0;
3679 rb->frame_size = req->tp_frame_size;
3680 spin_unlock_bh(&rb_queue->lock);
3681
c053fd96
CG
3682 swap(rb->pg_vec_order, order);
3683 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3684
3685 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3686 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3687 tpacket_rcv : packet_rcv;
3688 skb_queue_purge(rb_queue);
1da177e4 3689 if (atomic_read(&po->mapped))
40d4e3df
ED
3690 pr_err("packet_mmap: vma is busy: %d\n",
3691 atomic_read(&po->mapped));
1da177e4 3692 }
905db440 3693 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3694
3695 spin_lock(&po->bind_lock);
ce06b03e 3696 if (was_running) {
1da177e4 3697 po->num = num;
ce06b03e 3698 register_prot_hook(sk);
1da177e4
LT
3699 }
3700 spin_unlock(&po->bind_lock);
f6fb8f10 3701 if (closing && (po->tp_version > TPACKET_V2)) {
3702 /* Because we don't support block-based V3 on tx-ring */
3703 if (!tx_ring)
3704 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3705 }
1da177e4
LT
3706 release_sock(sk);
3707
1da177e4
LT
3708 if (pg_vec)
3709 free_pg_vec(pg_vec, order, req->tp_block_nr);
3710out:
3711 return err;
3712}
3713
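/* mmap() both rings as one contiguous user mapping, RX ring first and
 * TX ring second, inserting every ring page with vm_insert_page().
 */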
69e3c75f
JB
3714static int packet_mmap(struct file *file, struct socket *sock,
3715 struct vm_area_struct *vma)
1da177e4
LT
3716{
3717 struct sock *sk = sock->sk;
3718 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3719 unsigned long size, expected_size;
3720 struct packet_ring_buffer *rb;
1da177e4
LT
3721 unsigned long start;
3722 int err = -EINVAL;
3723 int i;
3724
3725 if (vma->vm_pgoff)
3726 return -EINVAL;
3727
905db440 3728 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3729
3730 expected_size = 0;
3731 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3732 if (rb->pg_vec) {
3733 expected_size += rb->pg_vec_len
3734 * rb->pg_vec_pages
3735 * PAGE_SIZE;
3736 }
3737 }
3738
3739 if (expected_size == 0)
1da177e4 3740 goto out;
69e3c75f
JB
3741
3742 size = vma->vm_end - vma->vm_start;
3743 if (size != expected_size)
1da177e4
LT
3744 goto out;
3745
1da177e4 3746 start = vma->vm_start;
69e3c75f
JB
3747 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3748 if (rb->pg_vec == NULL)
3749 continue;
3750
3751 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3752 struct page *page;
3753 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3754 int pg_num;
3755
c56b4d90
CG
3756 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3757 page = pgv_to_page(kaddr);
69e3c75f
JB
3758 err = vm_insert_page(vma, start, page);
3759 if (unlikely(err))
3760 goto out;
3761 start += PAGE_SIZE;
0e3125c7 3762 kaddr += PAGE_SIZE;
69e3c75f 3763 }
4ebf0ae2 3764 }
1da177e4 3765 }
69e3c75f 3766
4ebf0ae2 3767 atomic_inc(&po->mapped);
1da177e4
LT
3768 vma->vm_ops = &packet_mmap_ops;
3769 err = 0;
3770
3771out:
905db440 3772 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3773 return err;
3774}
1da177e4 3775
90ddc4f0 3776static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3777 .family = PF_PACKET,
3778 .owner = THIS_MODULE,
3779 .release = packet_release,
3780 .bind = packet_bind_spkt,
3781 .connect = sock_no_connect,
3782 .socketpair = sock_no_socketpair,
3783 .accept = sock_no_accept,
3784 .getname = packet_getname_spkt,
3785 .poll = datagram_poll,
3786 .ioctl = packet_ioctl,
3787 .listen = sock_no_listen,
3788 .shutdown = sock_no_shutdown,
3789 .setsockopt = sock_no_setsockopt,
3790 .getsockopt = sock_no_getsockopt,
3791 .sendmsg = packet_sendmsg_spkt,
3792 .recvmsg = packet_recvmsg,
3793 .mmap = sock_no_mmap,
3794 .sendpage = sock_no_sendpage,
3795};
1da177e4 3796
90ddc4f0 3797static const struct proto_ops packet_ops = {
1da177e4
LT
3798 .family = PF_PACKET,
3799 .owner = THIS_MODULE,
3800 .release = packet_release,
3801 .bind = packet_bind,
3802 .connect = sock_no_connect,
3803 .socketpair = sock_no_socketpair,
3804 .accept = sock_no_accept,
1ce4f28b 3805 .getname = packet_getname,
1da177e4
LT
3806 .poll = packet_poll,
3807 .ioctl = packet_ioctl,
3808 .listen = sock_no_listen,
3809 .shutdown = sock_no_shutdown,
3810 .setsockopt = packet_setsockopt,
3811 .getsockopt = packet_getsockopt,
3812 .sendmsg = packet_sendmsg,
3813 .recvmsg = packet_recvmsg,
3814 .mmap = packet_mmap,
3815 .sendpage = sock_no_sendpage,
3816};
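/*
 * Illustrative sketch (not part of this file): creating the socket that
 * ends up wired to packet_ops above.  SOCK_RAW and SOCK_DGRAM packet
 * sockets use packet_ops; the legacy SOCK_PACKET type uses
 * packet_ops_spkt.  Opening one requires CAP_NET_RAW, so this would
 * normally run as root.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	if (fd < 0)
 *		perror("socket");
 */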
3817
ec1b4cf7 3818static const struct net_proto_family packet_family_ops = {
1da177e4 3819 	.family =	PF_PACKET,
3820 .create = packet_create,
3821 .owner = THIS_MODULE,
3822};
3823
3824static struct notifier_block packet_netdev_notifier = {
40d4e3df 3825 .notifier_call = packet_notifier,
1da177e4 3826 };
3827
3828#ifdef CONFIG_PROC_FS
1da177e4 3829
3830static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3831 __acquires(RCU)
1da177e4 3832{
e372c414 3833 struct net *net = seq_file_net(seq);
808f5114 3834
3835 rcu_read_lock();
3836 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4 3837 }
3838
3839static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3840{
1bf40954 3841 struct net *net = seq_file_net(seq);
808f5114 3842 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4 3843 }
3844
3845static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3846 __releases(RCU)
1da177e4 3847{
808f5114 3848 rcu_read_unlock();
1da177e4 3849 }
3850
1ce4f28b 3851static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4 3852 {
3853 if (v == SEQ_START_TOKEN)
3854 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3855 else {
b7ceabd9 3856 struct sock *s = sk_entry(v);
1da177e4 3857 		const struct packet_sock *po = pkt_sk(s);
3858
3859 seq_printf(seq,
71338aa7 3860 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 3861 			   s,
3862 atomic_read(&s->sk_refcnt),
3863 s->sk_type,
3864 ntohs(po->num),
3865 po->ifindex,
3866 po->running,
3867 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3868 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3869 sock_i_ino(s));
1da177e4 3870 	}
3871
3872 return 0;
3873}
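/*
 * The show routine above backs /proc/net/packet.  With purely made-up
 * values, one line of output would look roughly like:
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	ffff8800b05c1800 3      3    0003   2     1 0      0      17251
 *
 * i.e. socket pointer, refcount, socket type (3 is SOCK_RAW), protocol
 * (0x0003 is ETH_P_ALL), bound ifindex, whether the protocol hook is
 * registered, receive-queue memory, owning uid and inode number.
 */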
3874
56b3d975 3875static const struct seq_operations packet_seq_ops = {
1da177e4 3876 	.start	= packet_seq_start,
3877 .next = packet_seq_next,
3878 .stop = packet_seq_stop,
3879 .show = packet_seq_show,
3880};
3881
3882static int packet_seq_open(struct inode *inode, struct file *file)
3883{
e372c414 3884 	return seq_open_net(inode, file, &packet_seq_ops,
3885 sizeof(struct seq_net_private));
1da177e4 3886 }
3887
da7071d7 3888static const struct file_operations packet_seq_fops = {
1da177e4 3889 	.owner		= THIS_MODULE,
3890 .open = packet_seq_open,
3891 .read = seq_read,
3892 .llseek = seq_lseek,
e372c414 3893 .release = seq_release_net,
1da177e4 3894 };
3895
3896#endif
3897
2c8c1e72 3898static int __net_init packet_net_init(struct net *net)
d12d01d6 3899{
0fa7fa98 3900 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3901 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3902
d4beaa66 3903 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6 3904 		return -ENOMEM;
3905
3906 return 0;
3907}
3908
2c8c1e72 3909static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3910{
ece31ffd 3911 remove_proc_entry("packet", net->proc_net);
d12d01d6 3912 }
3913
3914static struct pernet_operations packet_net_ops = {
3915 .init = packet_net_init,
3916 .exit = packet_net_exit,
3917};
3918
3919
1da177e4 3920 static void __exit packet_exit(void)
3921{
1da177e4 3922 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3923 unregister_pernet_subsys(&packet_net_ops);
1da177e4 3924 	sock_unregister(PF_PACKET);
3925 proto_unregister(&packet_proto);
3926}
3927
3928static int __init packet_init(void)
3929{
3930 int rc = proto_register(&packet_proto, 0);
3931
3932 if (rc != 0)
3933 goto out;
3934
3935 sock_register(&packet_family_ops);
d12d01d6 3936 register_pernet_subsys(&packet_net_ops);
1da177e4 3937 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4 3938 out:
3939 return rc;
3940}
3941
3942module_init(packet_init);
3943module_exit(packet_exit);
3944MODULE_LICENSE("GPL");
3945MODULE_ALIAS_NETPROTO(PF_PACKET);