/* net/packet/af_packet.c */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel); others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

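/* TPACKET_V3 bookkeeping: each pg_vec entry holds one block, which begins
 * with a struct tpacket_block_desc.  The BLOCK_*() macros above access its
 * bh1 header fields; the GET_*() macros below map from a ring buffer (or a
 * block number) to the kernel block descriptor queue and its block headers.
 */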
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

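/* Direct transmit path: hand the skb straight to the driver via
 * netdev_start_xmit() under HARD_TX_LOCK, so no queueing discipline or
 * traffic shaping is applied (see packet_use_direct_xmit() below).
 */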
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

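/* The tp_status word of each mmap()ed frame is the handshake with user
 * space: the kernel hands a frame over by setting TP_STATUS_USER and takes
 * it back once user space resets it to TP_STATUS_KERNEL.  The smp_wmb()/
 * smp_rmb() pairs below order the status update against the frame payload,
 * and flush_dcache_page() keeps the shared pages coherent on architectures
 * with aliasing data caches.
 */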
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

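/* Frames are stored back to back inside each pg_vec block, so a ring
 * position maps to (block, offset within block) by dividing and taking the
 * remainder by frames_per_block.
 */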
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
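/* Worked example for the calculation above: a 4MB block on a 1Gbit/s link
 * gives mbits = 32 and div = 1, so the retire timeout is 32 + 1 = 33 msec --
 * roughly the time needed to fill the block at line rate, plus some slack.
 */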

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. User-space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

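/* Receive-queue occupancy is reported at three levels: ROOM_NORMAL (roughly
 * a quarter of the ring or socket buffer still free), ROOM_LOW (some space
 * left, but less than that) and ROOM_NONE.  The rollover fanout mode uses
 * these levels to decide when to move traffic to another member socket.
 */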
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

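/* Per-packet demultiplexers for the fanout modes: flow hash, round-robin
 * load balancing, receiving CPU, pseudo-random, recorded queue mapping and
 * an attached (e)BPF program.  Each returns the index of the member socket
 * in f->arr[] that should receive the skb.
 */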
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = BPF_PROG_RUN(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

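/* Swap in a new classifier program under f->lock and destroy the old one
 * only after synchronize_net(), so that fanout_demux_bpf() callers running
 * under rcu_read_lock() never see a freed program.
 */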
static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get(fd);
	if (IS_ERR(new))
		return PTR_ERR(new);
	if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(new);
		return -EINVAL;
	}

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

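/* Joining a fanout group from user space (illustrative sketch only; the
 * group id below is a made-up example value):
 *
 *	int val = (PACKET_FANOUT_HASH << 16) | 42;
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * packet_setsockopt() splits that value into the id and type_flags that
 * fanout_add() receives below.
 */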
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
		if (!po->rollover)
			return -ENOMEM;
		atomic_long_set(&po->rollover->num, 0);
		atomic_long_set(&po->rollover->num_huge, 0);
		atomic_long_set(&po->rollover->num_failed, 0);
	}

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	if (err) {
		kfree(po->rollover);
		po->rollover = NULL;
	}
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		fanout_release_data(f);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);

	if (po->rollover)
		kfree_rcu(po->rollover, rcu);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

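/* SOCK_PACKET support: the legacy, pre-sockaddr_ll interface.  Its receive
 * hook below simply records the originating device in a sockaddr_pkt and
 * queues the skb to the socket; no ring buffers or fanout are involved.
 */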
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


1813/*
1814 * Output a raw packet to a device layer. This bypasses all the other
1815 * protocol layers and you must therefore supply it with a complete frame
1816 */
1ce4f28b 1817
1b784140
YX
1818static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1819 size_t len)
1da177e4
LT
1820{
1821 struct sock *sk = sock->sk;
342dfc30 1822 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1823 struct sk_buff *skb = NULL;
1da177e4 1824 struct net_device *dev;
40d4e3df 1825 __be16 proto = 0;
1da177e4 1826 int err;
3bdc0eba 1827 int extra_len = 0;
1ce4f28b 1828
1da177e4 1829 /*
1ce4f28b 1830 * Get and verify the address.
1da177e4
LT
1831 */
1832
40d4e3df 1833 if (saddr) {
1da177e4 1834 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1835 return -EINVAL;
1836 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1837 proto = saddr->spkt_protocol;
1838 } else
1839 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1840
1841 /*
1ce4f28b 1842 * Find the device first to size check it
1da177e4
LT
1843 */
1844
de74e92a 1845 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1846retry:
654d1f8a
ED
1847 rcu_read_lock();
1848 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1849 err = -ENODEV;
1850 if (dev == NULL)
1851 goto out_unlock;
1ce4f28b 1852
d5e76b0a
DM
1853 err = -ENETDOWN;
1854 if (!(dev->flags & IFF_UP))
1855 goto out_unlock;
1856
1da177e4 1857 /*
40d4e3df
ED
1858 * You may not queue a frame bigger than the mtu. This is the lowest level
1859 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1860 */
1ce4f28b 1861
3bdc0eba
BG
1862 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1863 if (!netif_supports_nofcs(dev)) {
1864 err = -EPROTONOSUPPORT;
1865 goto out_unlock;
1866 }
1867 extra_len = 4; /* We're doing our own CRC */
1868 }
1869
1da177e4 1870 err = -EMSGSIZE;
3bdc0eba 1871 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1872 goto out_unlock;
1873
1a35ca80
ED
1874 if (!skb) {
1875 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1876 int tlen = dev->needed_tailroom;
1a35ca80
ED
1877 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1878
1879 rcu_read_unlock();
4ce40912 1880 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1881 if (skb == NULL)
1882 return -ENOBUFS;
1883 /* FIXME: Save some space for broken drivers that write a hard
1884 * header at transmission time by themselves. PPP is the notable
1885 * one here. This should really be fixed at the driver level.
1886 */
1887 skb_reserve(skb, reserved);
1888 skb_reset_network_header(skb);
1889
1890 /* Try to align data part correctly */
1891 if (hhlen) {
1892 skb->data -= hhlen;
1893 skb->tail -= hhlen;
1894 if (len < hhlen)
1895 skb_reset_network_header(skb);
1896 }
6ce8e9ce 1897 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1898 if (err)
1899 goto out_free;
1900 goto retry;
1da177e4
LT
1901 }
1902
3bdc0eba 1903 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1904 /* Earlier code assumed this would be a VLAN pkt,
1905 * double-check this now that we have the actual
1906 * packet in hand.
1907 */
1908 struct ethhdr *ehdr;
1909 skb_reset_mac_header(skb);
1910 ehdr = eth_hdr(skb);
1911 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1912 err = -EMSGSIZE;
1913 goto out_unlock;
1914 }
1915 }
1a35ca80 1916
1da177e4
LT
1917 skb->protocol = proto;
1918 skb->dev = dev;
1919 skb->priority = sk->sk_priority;
2d37a186 1920 skb->mark = sk->sk_mark;
bf84a010
DB
1921
1922 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1923
3bdc0eba
BG
1924 if (unlikely(extra_len == 4))
1925 skb->no_fcs = 1;
1926
40893fd0 1927 skb_probe_transport_header(skb, 0);
c1aad275 1928
1da177e4 1929 dev_queue_xmit(skb);
654d1f8a 1930 rcu_read_unlock();
40d4e3df 1931 return len;
1da177e4 1932
1da177e4 1933out_unlock:
654d1f8a 1934 rcu_read_unlock();
1a35ca80
ED
1935out_free:
1936 kfree_skb(skb);
1da177e4
LT
1937 return err;
1938}
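/*
 * Illustrative userspace sketch (not part of this file): how the obsolete
 * SOCK_PACKET transmit path above is typically reached. The interface name
 * "eth0" and the prebuilt frame buffer are placeholders for the example,
 * and CAP_NET_RAW is required to open the socket.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int spkt_send_example(const void *frame, size_t frame_len)
{
    struct sockaddr_pkt spkt;
    int fd;

    fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
    if (fd < 0)
        return -1;

    memset(&spkt, 0, sizeof(spkt));
    spkt.spkt_family = AF_PACKET;
    strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);
    spkt.spkt_protocol = htons(ETH_P_IP);

    /* A complete link-layer frame must be supplied; the kernel only
     * checks it against the device MTU and hard header length. */
    return sendto(fd, frame, frame_len, 0,
                  (struct sockaddr *)&spkt, sizeof(spkt));
}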
1da177e4 1939
eea49cc9 1940static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1941 const struct sock *sk,
dbcb5855 1942 unsigned int res)
1da177e4
LT
1943{
1944 struct sk_filter *filter;
fda9ef5d 1945
80f8f102
ED
1946 rcu_read_lock();
1947 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1948 if (filter != NULL)
0a14842f 1949 res = SK_RUN_FILTER(filter, skb);
80f8f102 1950 rcu_read_unlock();
1da177e4 1951
dbcb5855 1952 return res;
1da177e4
LT
1953}
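/*
 * Illustrative userspace sketch (not part of this file): the filter
 * consulted by run_filter() above is a classic BPF program attached with
 * SO_ATTACH_FILTER. A return value of 0 drops the packet; a non-zero value
 * caps the number of bytes kept (the "snaplen"). The single-instruction
 * program below simply accepts every packet.
 */
#include <linux/filter.h>
#include <sys/socket.h>

static int attach_accept_all_filter(int fd)
{
    /* BPF_RET | BPF_K: return the constant 0x40000 (keep up to 256 KiB) */
    struct sock_filter insns[] = {
        { 0x06, 0, 0, 0x00040000 },
    };
    struct sock_fprog prog = {
        .len    = sizeof(insns) / sizeof(insns[0]),
        .filter = insns,
    };

    return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                      &prog, sizeof(prog));
}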
1954
1955/*
1956 * This function performs lazy skb cloning in the hope that most packets
1957 * are discarded by BPF.
1958 *
1959 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
1960 * and skb->cb are mangled. It works because (and until) packets
1961 * falling here are owned by the current CPU. Output packets are cloned
1962 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1963 * sequentially, so if we return the skb to its original state on exit,
1964 * we will not harm anyone.
1965 */
1966
40d4e3df
ED
1967static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1968 struct packet_type *pt, struct net_device *orig_dev)
1969{
1970 struct sock *sk;
1971 struct sockaddr_ll *sll;
1972 struct packet_sock *po;
40d4e3df 1973 u8 *skb_head = skb->data;
1da177e4 1974 int skb_len = skb->len;
dbcb5855 1975 unsigned int snaplen, res;
1da177e4
LT
1976
1977 if (skb->pkt_type == PACKET_LOOPBACK)
1978 goto drop;
1979
1980 sk = pt->af_packet_priv;
1981 po = pkt_sk(sk);
1982
09ad9bc7 1983 if (!net_eq(dev_net(dev), sock_net(sk)))
1984 goto drop;
1985
1da177e4
LT
1986 skb->dev = dev;
1987
3b04ddde 1988 if (dev->header_ops) {
1da177e4 1989 /* The device has an explicit notion of ll header,
62ab0812
ED
1990 * exported to higher levels.
1991 *
1992 * Otherwise, the device hides details of its frame
1993 * structure, so that corresponding packet head is
1994 * never delivered to user.
1da177e4
LT
1995 */
1996 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1997 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1998 else if (skb->pkt_type == PACKET_OUTGOING) {
1999 /* Special case: outgoing packets have ll header at head */
bbe735e4 2000 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2001 }
2002 }
2003
2004 snaplen = skb->len;
2005
dbcb5855
DM
2006 res = run_filter(skb, sk, snaplen);
2007 if (!res)
fda9ef5d 2008 goto drop_n_restore;
dbcb5855
DM
2009 if (snaplen > res)
2010 snaplen = res;
1da177e4 2011
0fd7bac6 2012 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2013 goto drop_n_acct;
2014
2015 if (skb_shared(skb)) {
2016 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2017 if (nskb == NULL)
2018 goto drop_n_acct;
2019
2020 if (skb_head != skb->data) {
2021 skb->data = skb_head;
2022 skb->len = skb_len;
2023 }
abc4e4fa 2024 consume_skb(skb);
2025 skb = nskb;
2026 }
2027
b4772ef8 2028 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2029
2030 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2031 sll->sll_hatype = dev->type;
1da177e4 2032 sll->sll_pkttype = skb->pkt_type;
8032b464 2033 if (unlikely(po->origdev))
80feaacb
PWJ
2034 sll->sll_ifindex = orig_dev->ifindex;
2035 else
2036 sll->sll_ifindex = dev->ifindex;
1da177e4 2037
b95cce35 2038 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2039
2472d761
EB
2040 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2041 * Use their space for storing the original skb length.
2042 */
2043 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2044
1da177e4
LT
2045 if (pskb_trim(skb, snaplen))
2046 goto drop_n_acct;
2047
2048 skb_set_owner_r(skb, sk);
2049 skb->dev = NULL;
adf30907 2050 skb_dst_drop(skb);
1da177e4 2051
84531c24
PO
2052 /* drop conntrack reference */
2053 nf_reset(skb);
2054
1da177e4 2055 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2056 po->stats.stats1.tp_packets++;
3bc3b96f 2057 sock_skb_set_dropcount(sk, skb);
2058 __skb_queue_tail(&sk->sk_receive_queue, skb);
2059 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2060 sk->sk_data_ready(sk);
1da177e4
LT
2061 return 0;
2062
2063drop_n_acct:
7091fbd8 2064 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2065 po->stats.stats1.tp_drops++;
7091fbd8
WB
2066 atomic_inc(&sk->sk_drops);
2067 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2068
2069drop_n_restore:
2070 if (skb_head != skb->data && skb_shared(skb)) {
2071 skb->data = skb_head;
2072 skb->len = skb_len;
2073 }
2074drop:
ead2ceb0 2075 consume_skb(skb);
1da177e4
LT
2076 return 0;
2077}
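/*
 * Illustrative userspace sketch (not part of this file): packets queued by
 * packet_rcv() above are read back with recvfrom(); the sockaddr_ll the
 * caller receives carries the ifindex, hatype and pkttype captured here.
 * Assumes an already-created AF_PACKET, SOCK_RAW socket in "fd".
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static void dump_one_packet(int fd)
{
    unsigned char buf[65536];
    struct sockaddr_ll from;
    socklen_t fromlen = sizeof(from);
    ssize_t n;

    n = recvfrom(fd, buf, sizeof(buf), 0,
                 (struct sockaddr *)&from, &fromlen);
    if (n < 0)
        return;

    /* from.sll_ifindex: receiving device (or orig_dev with PACKET_ORIGDEV)
     * from.sll_pkttype: PACKET_HOST, PACKET_BROADCAST, PACKET_OUTGOING, ...
     * from.sll_halen / sll_addr: link-layer source address */
}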
2078
40d4e3df
ED
2079static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2080 struct packet_type *pt, struct net_device *orig_dev)
2081{
2082 struct sock *sk;
2083 struct packet_sock *po;
2084 struct sockaddr_ll *sll;
184f489e 2085 union tpacket_uhdr h;
40d4e3df 2086 u8 *skb_head = skb->data;
1da177e4 2087 int skb_len = skb->len;
dbcb5855 2088 unsigned int snaplen, res;
f6fb8f10 2089 unsigned long status = TP_STATUS_USER;
bbd6ef87 2090 unsigned short macoff, netoff, hdrlen;
1da177e4 2091 struct sk_buff *copy_skb = NULL;
bbd6ef87 2092 struct timespec ts;
b9c32fb2 2093 __u32 ts_status;
1da177e4 2094
51846355
AW
2095 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2096 * We may add members to them up to the current aligned size without forcing
2097 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2098 */
2099 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2100 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2101
1da177e4
LT
2102 if (skb->pkt_type == PACKET_LOOPBACK)
2103 goto drop;
2104
2105 sk = pt->af_packet_priv;
2106 po = pkt_sk(sk);
2107
09ad9bc7 2108 if (!net_eq(dev_net(dev), sock_net(sk)))
2109 goto drop;
2110
3b04ddde 2111 if (dev->header_ops) {
1da177e4 2112 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2113 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2114 else if (skb->pkt_type == PACKET_OUTGOING) {
2115 /* Special case: outgoing packets have ll header at head */
bbe735e4 2116 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2117 }
2118 }
2119
2120 snaplen = skb->len;
2121
dbcb5855
DM
2122 res = run_filter(skb, sk, snaplen);
2123 if (!res)
fda9ef5d 2124 goto drop_n_restore;
68c2e5de
AD
2125
2126 if (skb->ip_summed == CHECKSUM_PARTIAL)
2127 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2128 else if (skb->pkt_type != PACKET_OUTGOING &&
2129 (skb->ip_summed == CHECKSUM_COMPLETE ||
2130 skb_csum_unnecessary(skb)))
2131 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2132
dbcb5855
DM
2133 if (snaplen > res)
2134 snaplen = res;
1da177e4
LT
2135
2136 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2137 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2138 po->tp_reserve;
1da177e4 2139 } else {
95c96174 2140 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2141 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
2142 (maclen < 16 ? 16 : maclen)) +
2143 po->tp_reserve;
1da177e4
LT
2144 macoff = netoff - maclen;
2145 }
f6fb8f10 2146 if (po->tp_version <= TPACKET_V2) {
2147 if (macoff + snaplen > po->rx_ring.frame_size) {
2148 if (po->copy_thresh &&
0fd7bac6 2149 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2150 if (skb_shared(skb)) {
2151 copy_skb = skb_clone(skb, GFP_ATOMIC);
2152 } else {
2153 copy_skb = skb_get(skb);
2154 skb_head = skb->data;
2155 }
2156 if (copy_skb)
2157 skb_set_owner_r(copy_skb, sk);
1da177e4 2158 }
f6fb8f10 2159 snaplen = po->rx_ring.frame_size - macoff;
2160 if ((int)snaplen < 0)
2161 snaplen = 0;
1da177e4 2162 }
2163 } else if (unlikely(macoff + snaplen >
2164 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2165 u32 nval;
2166
2167 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2168 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2169 snaplen, nval, macoff);
2170 snaplen = nval;
2171 if (unlikely((int)snaplen < 0)) {
2172 snaplen = 0;
2173 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2174 }
1da177e4 2175 }
1da177e4 2176 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2177 h.raw = packet_current_rx_frame(po, skb,
2178 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2179 if (!h.raw)
1da177e4 2180 goto ring_is_full;
f6fb8f10 2181 if (po->tp_version <= TPACKET_V2) {
2182 packet_increment_rx_head(po, &po->rx_ring);
2183 /*
2184		 * LOSING will be reported until you read the stats,
2185		 * because the counter is COR - Clear On Read.
2186		 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2187		 * at the packet level.
2188 */
ee80fbf3 2189 if (po->stats.stats1.tp_drops)
f6fb8f10 2190 status |= TP_STATUS_LOSING;
2191 }
ee80fbf3 2192 po->stats.stats1.tp_packets++;
2193 if (copy_skb) {
2194 status |= TP_STATUS_COPY;
2195 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2196 }
1da177e4
LT
2197 spin_unlock(&sk->sk_receive_queue.lock);
2198
bbd6ef87 2199 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2200
2201 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2202 getnstimeofday(&ts);
1da177e4 2203
b9c32fb2
DB
2204 status |= ts_status;
2205
bbd6ef87
PM
2206 switch (po->tp_version) {
2207 case TPACKET_V1:
2208 h.h1->tp_len = skb->len;
2209 h.h1->tp_snaplen = snaplen;
2210 h.h1->tp_mac = macoff;
2211 h.h1->tp_net = netoff;
4b457bdf
DB
2212 h.h1->tp_sec = ts.tv_sec;
2213 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2214 hdrlen = sizeof(*h.h1);
2215 break;
2216 case TPACKET_V2:
2217 h.h2->tp_len = skb->len;
2218 h.h2->tp_snaplen = snaplen;
2219 h.h2->tp_mac = macoff;
2220 h.h2->tp_net = netoff;
bbd6ef87
PM
2221 h.h2->tp_sec = ts.tv_sec;
2222 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2223 if (skb_vlan_tag_present(skb)) {
2224 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2225 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2226 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2227 } else {
2228 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2229 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2230 }
e4d26f4b 2231 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2232 hdrlen = sizeof(*h.h2);
2233 break;
f6fb8f10 2234 case TPACKET_V3:
2235		/* tp_nxt_offset and the VLAN fields are already populated above,
2236		 * so do NOT clear those fields here.
2237 */
2238 h.h3->tp_status |= status;
2239 h.h3->tp_len = skb->len;
2240 h.h3->tp_snaplen = snaplen;
2241 h.h3->tp_mac = macoff;
2242 h.h3->tp_net = netoff;
f6fb8f10 2243 h.h3->tp_sec = ts.tv_sec;
2244 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2245 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2246 hdrlen = sizeof(*h.h3);
2247 break;
bbd6ef87
PM
2248 default:
2249 BUG();
2250 }
1da177e4 2251
bbd6ef87 2252 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2253 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2254 sll->sll_family = AF_PACKET;
2255 sll->sll_hatype = dev->type;
2256 sll->sll_protocol = skb->protocol;
2257 sll->sll_pkttype = skb->pkt_type;
8032b464 2258 if (unlikely(po->origdev))
80feaacb
PWJ
2259 sll->sll_ifindex = orig_dev->ifindex;
2260 else
2261 sll->sll_ifindex = dev->ifindex;
1da177e4 2262
e16aa207 2263 smp_mb();
f0d4eb29 2264
f6dafa95 2265#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2266 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2267 u8 *start, *end;
2268
f0d4eb29
DB
2269 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2270 macoff + snaplen);
2271
2272 for (start = h.raw; start < end; start += PAGE_SIZE)
2273 flush_dcache_page(pgv_to_page(start));
1da177e4 2274 }
f0d4eb29 2275 smp_wmb();
f6dafa95 2276#endif
f0d4eb29 2277
da413eec 2278 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2279 __packet_set_status(po, h.raw, status);
2280 sk->sk_data_ready(sk);
2281 } else {
f6fb8f10 2282 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2283 }
1da177e4
LT
2284
2285drop_n_restore:
2286 if (skb_head != skb->data && skb_shared(skb)) {
2287 skb->data = skb_head;
2288 skb->len = skb_len;
2289 }
2290drop:
1ce4f28b 2291 kfree_skb(skb);
1da177e4
LT
2292 return 0;
2293
2294ring_is_full:
ee80fbf3 2295 po->stats.stats1.tp_drops++;
1da177e4
LT
2296 spin_unlock(&sk->sk_receive_queue.lock);
2297
676d2369 2298 sk->sk_data_ready(sk);
acb5d75b 2299 kfree_skb(copy_skb);
1da177e4
LT
2300 goto drop_n_restore;
2301}
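/*
 * Illustrative userspace sketch (not part of this file): the memory-mapped
 * RX ring that tpacket_rcv() above fills in. TPACKET_V2 is used here; the
 * ring geometry (block/frame sizes) is an arbitrary example choice and the
 * error handling is omitted.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>

static void rx_ring_example(void)
{
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    int ver = TPACKET_V2;
    struct tpacket_req req = {
        .tp_block_size = 4096,              /* multiple of the page size */
        .tp_block_nr   = 64,
        .tp_frame_size = 2048,              /* multiple of TPACKET_ALIGNMENT */
        .tp_frame_nr   = 64 * (4096 / 2048),
    };
    unsigned int i = 0;
    char *ring;

    setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
    ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    for (;;) {
        struct tpacket2_hdr *hdr =
            (struct tpacket2_hdr *)(ring + (size_t)i * req.tp_frame_size);

        if (!(hdr->tp_status & TP_STATUS_USER)) {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            poll(&pfd, 1, -1);
            continue;
        }
        /* frame data starts at hdr->tp_mac; hdr->tp_len is the original
         * length, hdr->tp_snaplen is what was captured */
        hdr->tp_status = TP_STATUS_KERNEL;  /* hand the slot back */
        i = (i + 1) % req.tp_frame_nr;
    }
}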
2302
2303static void tpacket_destruct_skb(struct sk_buff *skb)
2304{
2305 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2306
69e3c75f 2307 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2308 void *ph;
b9c32fb2
DB
2309 __u32 ts;
2310
69e3c75f 2311 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2312 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2313
2314 ts = __packet_set_timestamp(po, ph, skb);
2315 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2316 }
2317
2318 sock_wfree(skb);
2319}
2320
2321static bool ll_header_truncated(const struct net_device *dev, int len)
2322{
2323 /* net device doesn't like empty head */
2324 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2325 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2326 current->comm, len, dev->hard_header_len);
2327 return true;
2328 }
2329
2330 return false;
2331}
2332
2333static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2334 void *frame, struct net_device *dev, int size_max,
ae641949 2335 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2336{
184f489e 2337 union tpacket_uhdr ph;
09effa67 2338 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2339 struct socket *sock = po->sk.sk_socket;
2340 struct page *page;
2341 void *data;
2342 int err;
2343
2344 ph.raw = frame;
2345
2346 skb->protocol = proto;
2347 skb->dev = dev;
2348 skb->priority = po->sk.sk_priority;
2d37a186 2349 skb->mark = po->sk.sk_mark;
2e31396f 2350 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2351 skb_shinfo(skb)->destructor_arg = ph.raw;
2352
2353 switch (po->tp_version) {
2354 case TPACKET_V2:
2355 tp_len = ph.h2->tp_len;
2356 break;
2357 default:
2358 tp_len = ph.h1->tp_len;
2359 break;
2360 }
09effa67
DM
2361 if (unlikely(tp_len > size_max)) {
2362 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2363 return -EMSGSIZE;
2364 }
69e3c75f 2365
ae641949 2366 skb_reserve(skb, hlen);
69e3c75f 2367 skb_reset_network_header(skb);
c1aad275 2368
d346a3fa
DB
2369 if (!packet_use_direct_xmit(po))
2370 skb_probe_transport_header(skb, 0);
2371 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2372 int off_min, off_max, off;
2373 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2374 off_max = po->tx_ring.frame_size - tp_len;
2375 if (sock->type == SOCK_DGRAM) {
2376 switch (po->tp_version) {
2377 case TPACKET_V2:
2378 off = ph.h2->tp_net;
2379 break;
2380 default:
2381 off = ph.h1->tp_net;
2382 break;
2383 }
2384 } else {
2385 switch (po->tp_version) {
2386 case TPACKET_V2:
2387 off = ph.h2->tp_mac;
2388 break;
2389 default:
2390 off = ph.h1->tp_mac;
2391 break;
2392 }
2393 }
2394 if (unlikely((off < off_min) || (off_max < off)))
2395 return -EINVAL;
2396 data = ph.raw + off;
2397 } else {
2398 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2399 }
69e3c75f
JB
2400 to_write = tp_len;
2401
2402 if (sock->type == SOCK_DGRAM) {
2403 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2404 NULL, tp_len);
2405 if (unlikely(err < 0))
2406 return -EINVAL;
40d4e3df 2407 } else if (dev->hard_header_len) {
9c707762 2408 if (ll_header_truncated(dev, tp_len))
69e3c75f 2409 return -EINVAL;
69e3c75f
JB
2410
2411 skb_push(skb, dev->hard_header_len);
2412 err = skb_store_bits(skb, 0, data,
2413 dev->hard_header_len);
2414 if (unlikely(err))
2415 return err;
2416
2417 data += dev->hard_header_len;
2418 to_write -= dev->hard_header_len;
2419 }
2420
69e3c75f
JB
2421 offset = offset_in_page(data);
2422 len_max = PAGE_SIZE - offset;
2423 len = ((to_write > len_max) ? len_max : to_write);
2424
2425 skb->data_len = to_write;
2426 skb->len += to_write;
2427 skb->truesize += to_write;
2428 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2429
2430 while (likely(to_write)) {
2431 nr_frags = skb_shinfo(skb)->nr_frags;
2432
2433 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2434 pr_err("Packet exceed the number of skb frags(%lu)\n",
2435 MAX_SKB_FRAGS);
69e3c75f
JB
2436 return -EFAULT;
2437 }
2438
0af55bb5
CG
2439 page = pgv_to_page(data);
2440 data += len;
69e3c75f
JB
2441 flush_dcache_page(page);
2442 get_page(page);
0af55bb5 2443 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2444 to_write -= len;
2445 offset = 0;
2446 len_max = PAGE_SIZE;
2447 len = ((to_write > len_max) ? len_max : to_write);
2448 }
2449
2450 return tp_len;
2451}
2452
2453static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2454{
69e3c75f
JB
2455 struct sk_buff *skb;
2456 struct net_device *dev;
2457 __be16 proto;
09effa67 2458 int err, reserve = 0;
40d4e3df 2459 void *ph;
342dfc30 2460 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2461 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2462 int tp_len, size_max;
2463 unsigned char *addr;
2464 int len_sum = 0;
9e67030a 2465 int status = TP_STATUS_AVAILABLE;
ae641949 2466 int hlen, tlen;
69e3c75f 2467
69e3c75f
JB
2468 mutex_lock(&po->pg_vec_lock);
2469
66e56cd4 2470 if (likely(saddr == NULL)) {
e40526cb 2471 dev = packet_cached_dev_get(po);
69e3c75f
JB
2472 proto = po->num;
2473 addr = NULL;
2474 } else {
2475 err = -EINVAL;
2476 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2477 goto out;
2478 if (msg->msg_namelen < (saddr->sll_halen
2479 + offsetof(struct sockaddr_ll,
2480 sll_addr)))
2481 goto out;
69e3c75f
JB
2482 proto = saddr->sll_protocol;
2483 addr = saddr->sll_addr;
827d9780 2484 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2485 }
2486
69e3c75f
JB
2487 err = -ENXIO;
2488 if (unlikely(dev == NULL))
2489 goto out;
69e3c75f
JB
2490 err = -ENETDOWN;
2491 if (unlikely(!(dev->flags & IFF_UP)))
2492 goto out_put;
2493
52f1454f 2494 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2495 size_max = po->tx_ring.frame_size
b5dd884e 2496 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2497
09effa67
DM
2498 if (size_max > dev->mtu + reserve)
2499 size_max = dev->mtu + reserve;
2500
69e3c75f
JB
2501 do {
2502 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2503 TP_STATUS_SEND_REQUEST);
69e3c75f 2504 if (unlikely(ph == NULL)) {
87a2fd28
DB
2505 if (need_wait && need_resched())
2506 schedule();
69e3c75f
JB
2507 continue;
2508 }
2509
2510 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2511 hlen = LL_RESERVED_SPACE(dev);
2512 tlen = dev->needed_tailroom;
69e3c75f 2513 skb = sock_alloc_send_skb(&po->sk,
ae641949 2514 hlen + tlen + sizeof(struct sockaddr_ll),
fbf33a28 2515 !need_wait, &err);
69e3c75f 2516
2517 if (unlikely(skb == NULL)) {
2518 /* we assume the socket was initially writeable ... */
2519 if (likely(len_sum > 0))
2520 err = len_sum;
69e3c75f 2521 goto out_status;
fbf33a28 2522 }
69e3c75f 2523 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f 2524 addr, hlen);
dbd46ab4
AD
2525 if (likely(tp_len >= 0) &&
2526 tp_len > dev->mtu + dev->hard_header_len) {
52f1454f
DB
2527 struct ethhdr *ehdr;
2528 /* Earlier code assumed this would be a VLAN pkt,
2529 * double-check this now that we have the actual
2530 * packet in hand.
2531 */
69e3c75f 2532
52f1454f
DB
2533 skb_reset_mac_header(skb);
2534 ehdr = eth_hdr(skb);
2535 if (ehdr->h_proto != htons(ETH_P_8021Q))
2536 tp_len = -EMSGSIZE;
2537 }
69e3c75f
JB
2538 if (unlikely(tp_len < 0)) {
2539 if (po->tp_loss) {
2540 __packet_set_status(po, ph,
2541 TP_STATUS_AVAILABLE);
2542 packet_increment_head(&po->tx_ring);
2543 kfree_skb(skb);
2544 continue;
2545 } else {
2546 status = TP_STATUS_WRONG_FORMAT;
2547 err = tp_len;
2548 goto out_status;
2549 }
2550 }
2551
0fd5d57b
DB
2552 packet_pick_tx_queue(dev, skb);
2553
69e3c75f
JB
2554 skb->destructor = tpacket_destruct_skb;
2555 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2556 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2557
2558 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2559 err = po->xmit(skb);
2560 if (unlikely(err > 0)) {
2561 err = net_xmit_errno(err);
2562 if (err && __packet_get_status(po, ph) ==
2563 TP_STATUS_AVAILABLE) {
2564 /* skb was destructed already */
2565 skb = NULL;
2566 goto out_status;
2567 }
2568 /*
2569 * skb was dropped but not destructed yet;
2570 * let's treat it like congestion or err < 0
2571 */
2572 err = 0;
2573 }
69e3c75f
JB
2574 packet_increment_head(&po->tx_ring);
2575 len_sum += tp_len;
b0138408
DB
2576 } while (likely((ph != NULL) ||
2577		/* Note: packet_read_pending() might be slow if we have
2578		 * to call it, as it's a per-cpu variable, but in the fast path
2579		 * we already short-circuit the loop with the first
2580		 * condition and luckily don't have to go down that path
2581		 * anyway.
2582 */
2583 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2584
2585 err = len_sum;
2586 goto out_put;
2587
69e3c75f
JB
2588out_status:
2589 __packet_set_status(po, ph, status);
2590 kfree_skb(skb);
2591out_put:
e40526cb 2592 dev_put(dev);
69e3c75f
JB
2593out:
2594 mutex_unlock(&po->pg_vec_lock);
2595 return err;
2596}
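/*
 * Illustrative userspace sketch (not part of this file): the TX ring that
 * tpacket_snd() above drains. TPACKET_V2 layout is assumed; the data offset
 * below matches the !tp_tx_has_off case (the aligned header, i.e. tp_hdrlen
 * minus the sockaddr_ll that trails it). The ring setup (PACKET_VERSION,
 * PACKET_TX_RING, mmap, bind) is assumed to have been done already.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int tx_one_frame(int fd, char *ring, const struct tpacket_req *req,
                        unsigned int slot, const void *frame, unsigned int len)
{
    struct tpacket2_hdr *hdr =
        (struct tpacket2_hdr *)(ring + (size_t)slot * req->tp_frame_size);
    void *data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

    if (hdr->tp_status != TP_STATUS_AVAILABLE)
        return -1;                      /* slot still owned by the kernel */

    memcpy(data, frame, len);
    hdr->tp_len = len;
    hdr->tp_status = TP_STATUS_SEND_REQUEST;

    /* A zero-length send() kicks the loop in tpacket_snd(); the socket
     * must already be bound to the egress interface. */
    return send(fd, NULL, 0, 0);
}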
69e3c75f 2597
eea49cc9
OJ
2598static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2599 size_t reserve, size_t len,
2600 size_t linear, int noblock,
2601 int *err)
bfd5f4a3
SS
2602{
2603 struct sk_buff *skb;
2604
2605 /* Under a page? Don't bother with paged skb. */
2606 if (prepad + len < PAGE_SIZE || !linear)
2607 linear = len;
2608
2609 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2610 err, 0);
bfd5f4a3
SS
2611 if (!skb)
2612 return NULL;
2613
2614 skb_reserve(skb, reserve);
2615 skb_put(skb, linear);
2616 skb->data_len = len - linear;
2617 skb->len += len - linear;
2618
2619 return skb;
2620}
2621
d346a3fa 2622static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2623{
2624 struct sock *sk = sock->sk;
342dfc30 2625 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2626 struct sk_buff *skb;
2627 struct net_device *dev;
0e11c91e 2628 __be16 proto;
1da177e4 2629 unsigned char *addr;
827d9780 2630 int err, reserve = 0;
bfd5f4a3
SS
2631 struct virtio_net_hdr vnet_hdr = { 0 };
2632 int offset = 0;
2633 int vnet_hdr_len;
2634 struct packet_sock *po = pkt_sk(sk);
2635 unsigned short gso_type = 0;
ae641949 2636 int hlen, tlen;
3bdc0eba 2637 int extra_len = 0;
8feb2fb2 2638 ssize_t n;
1da177e4
LT
2639
2640 /*
1ce4f28b 2641 * Get and verify the address.
1da177e4 2642 */
1ce4f28b 2643
66e56cd4 2644 if (likely(saddr == NULL)) {
e40526cb 2645 dev = packet_cached_dev_get(po);
1da177e4
LT
2646 proto = po->num;
2647 addr = NULL;
2648 } else {
2649 err = -EINVAL;
2650 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2651 goto out;
2652 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2653 goto out;
1da177e4
LT
2654 proto = saddr->sll_protocol;
2655 addr = saddr->sll_addr;
827d9780 2656 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2657 }
2658
1da177e4 2659 err = -ENXIO;
e40526cb 2660 if (unlikely(dev == NULL))
1da177e4 2661 goto out_unlock;
d5e76b0a 2662 err = -ENETDOWN;
e40526cb 2663 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2664 goto out_unlock;
2665
e40526cb
DB
2666 if (sock->type == SOCK_RAW)
2667 reserve = dev->hard_header_len;
bfd5f4a3
SS
2668 if (po->has_vnet_hdr) {
2669 vnet_hdr_len = sizeof(vnet_hdr);
2670
2671 err = -EINVAL;
2672 if (len < vnet_hdr_len)
2673 goto out_unlock;
2674
2675 len -= vnet_hdr_len;
2676
8feb2fb2 2677 err = -EFAULT;
c0371da6 2678 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2679 if (n != vnet_hdr_len)
bfd5f4a3
SS
2680 goto out_unlock;
2681
2682 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2683 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2684 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2685 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2686 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2687 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2688 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2689
2690 err = -EINVAL;
dc9e5153 2691 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2692 goto out_unlock;
2693
2694 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2695 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2696 case VIRTIO_NET_HDR_GSO_TCPV4:
2697 gso_type = SKB_GSO_TCPV4;
2698 break;
2699 case VIRTIO_NET_HDR_GSO_TCPV6:
2700 gso_type = SKB_GSO_TCPV6;
2701 break;
2702 case VIRTIO_NET_HDR_GSO_UDP:
2703 gso_type = SKB_GSO_UDP;
2704 break;
2705 default:
2706 goto out_unlock;
2707 }
2708
2709 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2710 gso_type |= SKB_GSO_TCP_ECN;
2711
2712 if (vnet_hdr.gso_size == 0)
2713 goto out_unlock;
2714
2715 }
2716 }
2717
3bdc0eba
BG
2718 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2719 if (!netif_supports_nofcs(dev)) {
2720 err = -EPROTONOSUPPORT;
2721 goto out_unlock;
2722 }
2723 extra_len = 4; /* We're doing our own CRC */
2724 }
2725
1da177e4 2726 err = -EMSGSIZE;
3bdc0eba 2727 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2728 goto out_unlock;
2729
bfd5f4a3 2730 err = -ENOBUFS;
ae641949
HX
2731 hlen = LL_RESERVED_SPACE(dev);
2732 tlen = dev->needed_tailroom;
dc9e5153
MT
2733 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2734 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2735 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2736 if (skb == NULL)
1da177e4
LT
2737 goto out_unlock;
2738
bfd5f4a3 2739 skb_set_network_header(skb, reserve);
1da177e4 2740
0c4e8581 2741 err = -EINVAL;
2742 if (sock->type == SOCK_DGRAM) {
2743 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2744 if (unlikely(offset < 0))
9c707762
WB
2745 goto out_free;
2746 } else {
2747 if (ll_header_truncated(dev, len))
2748 goto out_free;
2749 }
1da177e4
LT
2750
2751 /* Returns -EFAULT on error */
c0371da6 2752 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2753 if (err)
2754 goto out_free;
bf84a010
DB
2755
2756 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2757
3bdc0eba 2758 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2759 /* Earlier code assumed this would be a VLAN pkt,
2760 * double-check this now that we have the actual
2761 * packet in hand.
2762 */
2763 struct ethhdr *ehdr;
2764 skb_reset_mac_header(skb);
2765 ehdr = eth_hdr(skb);
2766 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2767 err = -EMSGSIZE;
2768 goto out_free;
2769 }
57f89bfa
BG
2770 }
2771
09effa67
DM
2772 skb->protocol = proto;
2773 skb->dev = dev;
1da177e4 2774 skb->priority = sk->sk_priority;
2d37a186 2775 skb->mark = sk->sk_mark;
0fd5d57b
DB
2776
2777 packet_pick_tx_queue(dev, skb);
1da177e4 2778
bfd5f4a3
SS
2779 if (po->has_vnet_hdr) {
2780 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2781 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2782 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2783 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2784 err = -EINVAL;
2785 goto out_free;
2786 }
2787 }
2788
dc9e5153
MT
2789 skb_shinfo(skb)->gso_size =
2790 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2791 skb_shinfo(skb)->gso_type = gso_type;
2792
2793 /* Header must be checked, and gso_segs computed. */
2794 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2795 skb_shinfo(skb)->gso_segs = 0;
2796
2797 len += vnet_hdr_len;
2798 }
2799
d346a3fa
DB
2800 if (!packet_use_direct_xmit(po))
2801 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2802 if (unlikely(extra_len == 4))
2803 skb->no_fcs = 1;
2804
d346a3fa 2805 err = po->xmit(skb);
1da177e4
LT
2806 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2807 goto out_unlock;
2808
e40526cb 2809 dev_put(dev);
1da177e4 2810
40d4e3df 2811 return len;
1da177e4
LT
2812
2813out_free:
2814 kfree_skb(skb);
2815out_unlock:
e40526cb 2816 if (dev)
1da177e4
LT
2817 dev_put(dev);
2818out:
2819 return err;
2820}
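/*
 * Illustrative userspace sketch (not part of this file): the non-mmap
 * transmit path handled by packet_snd() above. The destination MAC, the
 * interface name "eth0" and the payload are placeholders for the example.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int send_raw_frame(int fd, const unsigned char dst_mac[ETH_ALEN],
                          const void *frame, size_t frame_len)
{
    struct sockaddr_ll sll;

    memset(&sll, 0, sizeof(sll));
    sll.sll_family   = AF_PACKET;
    sll.sll_protocol = htons(ETH_P_IP);
    sll.sll_ifindex  = if_nametoindex("eth0");  /* example name */
    sll.sll_halen    = ETH_ALEN;
    memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

    /* On SOCK_RAW the frame must already contain the Ethernet header;
     * on SOCK_DGRAM the kernel builds it from sll_addr/sll_protocol. */
    return sendto(fd, frame, frame_len, 0,
                  (struct sockaddr *)&sll, sizeof(sll));
}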
2821
1b784140 2822static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2823{
69e3c75f
JB
2824 struct sock *sk = sock->sk;
2825 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2826
69e3c75f
JB
2827 if (po->tx_ring.pg_vec)
2828 return tpacket_snd(po, msg);
2829 else
69e3c75f
JB
2830 return packet_snd(sock, msg, len);
2831}
2832
1da177e4
LT
2833/*
2834 * Close a PACKET socket. This is fairly simple. We immediately go
2835 * to 'closed' state and remove our protocol entry in the device list.
2836 */
2837
2838static int packet_release(struct socket *sock)
2839{
2840 struct sock *sk = sock->sk;
2841 struct packet_sock *po;
d12d01d6 2842 struct net *net;
f6fb8f10 2843 union tpacket_req_u req_u;
1da177e4
LT
2844
2845 if (!sk)
2846 return 0;
2847
3b1e0a65 2848 net = sock_net(sk);
1da177e4
LT
2849 po = pkt_sk(sk);
2850
0fa7fa98 2851 mutex_lock(&net->packet.sklist_lock);
808f5114 2852 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2853 mutex_unlock(&net->packet.sklist_lock);
2854
2855 preempt_disable();
920de804 2856 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2857 preempt_enable();
1da177e4 2858
808f5114 2859 spin_lock(&po->bind_lock);
ce06b03e 2860 unregister_prot_hook(sk, false);
2861 packet_cached_dev_reset(po);
2862
160ff18a
BG
2863 if (po->prot_hook.dev) {
2864 dev_put(po->prot_hook.dev);
2865 po->prot_hook.dev = NULL;
2866 }
808f5114 2867 spin_unlock(&po->bind_lock);
1da177e4 2868
1da177e4 2869 packet_flush_mclist(sk);
1da177e4 2870
9665d5d6
PS
2871 if (po->rx_ring.pg_vec) {
2872 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2873 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2874 }
69e3c75f 2875
9665d5d6
PS
2876 if (po->tx_ring.pg_vec) {
2877 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2878 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2879 }
1da177e4 2880
dc99f600
DM
2881 fanout_release(sk);
2882
808f5114 2883 synchronize_net();
1da177e4
LT
2884 /*
2885 * Now the socket is dead. No more input will appear.
2886 */
1da177e4
LT
2887 sock_orphan(sk);
2888 sock->sk = NULL;
2889
2890 /* Purge queues */
2891
2892 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2893 packet_free_pending(po);
17ab56a2 2894 sk_refcnt_debug_release(sk);
1da177e4
LT
2895
2896 sock_put(sk);
2897 return 0;
2898}
2899
2900/*
2901 * Attach a packet hook.
2902 */
2903
902fefb8 2904static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2905{
2906 struct packet_sock *po = pkt_sk(sk);
158cd4af 2907 struct net_device *dev_curr;
902fefb8
DB
2908 __be16 proto_curr;
2909 bool need_rehook;
dc99f600 2910
2911 if (po->fanout) {
2912 if (dev)
2913 dev_put(dev);
2914
dc99f600 2915 return -EINVAL;
aef950b4 2916 }
1da177e4
LT
2917
2918 lock_sock(sk);
1da177e4 2919 spin_lock(&po->bind_lock);
66e56cd4 2920
902fefb8
DB
2921 proto_curr = po->prot_hook.type;
2922 dev_curr = po->prot_hook.dev;
2923
2924 need_rehook = proto_curr != proto || dev_curr != dev;
2925
2926 if (need_rehook) {
2927 unregister_prot_hook(sk, true);
1da177e4 2928
902fefb8
DB
2929 po->num = proto;
2930 po->prot_hook.type = proto;
902fefb8
DB
2931 po->prot_hook.dev = dev;
2932
2933 po->ifindex = dev ? dev->ifindex : 0;
2934 packet_cached_dev_assign(po, dev);
2935 }
158cd4af
LW
2936 if (dev_curr)
2937 dev_put(dev_curr);
66e56cd4 2938
902fefb8 2939 if (proto == 0 || !need_rehook)
1da177e4
LT
2940 goto out_unlock;
2941
be85d4ad 2942 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2943 register_prot_hook(sk);
be85d4ad
UT
2944 } else {
2945 sk->sk_err = ENETDOWN;
2946 if (!sock_flag(sk, SOCK_DEAD))
2947 sk->sk_error_report(sk);
1da177e4
LT
2948 }
2949
2950out_unlock:
2951 spin_unlock(&po->bind_lock);
2952 release_sock(sk);
2953 return 0;
2954}
2955
2956/*
2957 * Bind a packet socket to a device
2958 */
2959
40d4e3df
ED
2960static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2961 int addr_len)
1da177e4 2962{
40d4e3df 2963 struct sock *sk = sock->sk;
1da177e4
LT
2964 char name[15];
2965 struct net_device *dev;
2966 int err = -ENODEV;
1ce4f28b 2967
1da177e4
LT
2968 /*
2969 * Check legality
2970 */
1ce4f28b 2971
8ae55f04 2972 if (addr_len != sizeof(struct sockaddr))
1da177e4 2973 return -EINVAL;
40d4e3df 2974 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2975
3b1e0a65 2976 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2977 if (dev)
1da177e4 2978 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2979 return err;
2980}
1da177e4
LT
2981
2982static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2983{
40d4e3df
ED
2984 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2985 struct sock *sk = sock->sk;
1da177e4
LT
2986 struct net_device *dev = NULL;
2987 int err;
2988
2989
2990 /*
2991 * Check legality
2992 */
1ce4f28b 2993
1da177e4
LT
2994 if (addr_len < sizeof(struct sockaddr_ll))
2995 return -EINVAL;
2996 if (sll->sll_family != AF_PACKET)
2997 return -EINVAL;
2998
2999 if (sll->sll_ifindex) {
3000 err = -ENODEV;
3b1e0a65 3001 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
3002 if (dev == NULL)
3003 goto out;
3004 }
3005 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3006
3007out:
3008 return err;
3009}
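/*
 * Illustrative userspace sketch (not part of this file): binding a packet
 * socket to a single interface, which ends up in packet_do_bind() above.
 * The interface name is a caller-supplied placeholder; CAP_NET_RAW is
 * required to create the socket.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int open_bound_packet_socket(const char *ifname)
{
    struct sockaddr_ll sll;
    int fd;

    fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (fd < 0)
        return -1;

    memset(&sll, 0, sizeof(sll));
    sll.sll_family   = AF_PACKET;
    sll.sll_protocol = htons(ETH_P_ALL);
    sll.sll_ifindex  = if_nametoindex(ifname);

    if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}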
3010
3011static struct proto packet_proto = {
3012 .name = "PACKET",
3013 .owner = THIS_MODULE,
3014 .obj_size = sizeof(struct packet_sock),
3015};
3016
3017/*
1ce4f28b 3018 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3019 */
3020
3021static int packet_create(struct net *net, struct socket *sock, int protocol,
3022 int kern)
1da177e4
LT
3023{
3024 struct sock *sk;
3025 struct packet_sock *po;
0e11c91e 3026 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3027 int err;
3028
df008c91 3029 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3030 return -EPERM;
be02097c
DM
3031 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3032 sock->type != SOCK_PACKET)
1da177e4
LT
3033 return -ESOCKTNOSUPPORT;
3034
3035 sock->state = SS_UNCONNECTED;
3036
3037 err = -ENOBUFS;
11aa9c28 3038 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3039 if (sk == NULL)
3040 goto out;
3041
3042 sock->ops = &packet_ops;
1da177e4
LT
3043 if (sock->type == SOCK_PACKET)
3044 sock->ops = &packet_ops_spkt;
be02097c 3045
1da177e4
LT
3046 sock_init_data(sock, sk);
3047
3048 po = pkt_sk(sk);
3049 sk->sk_family = PF_PACKET;
0e11c91e 3050 po->num = proto;
d346a3fa 3051 po->xmit = dev_queue_xmit;
66e56cd4 3052
b0138408
DB
3053 err = packet_alloc_pending(po);
3054 if (err)
3055 goto out2;
3056
66e56cd4 3057 packet_cached_dev_reset(po);
1da177e4
LT
3058
3059 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3060 sk_refcnt_debug_inc(sk);
1da177e4
LT
3061
3062 /*
3063 * Attach a protocol block
3064 */
3065
3066 spin_lock_init(&po->bind_lock);
905db440 3067 mutex_init(&po->pg_vec_lock);
0648ab70 3068 po->rollover = NULL;
1da177e4 3069 po->prot_hook.func = packet_rcv;
be02097c 3070
1da177e4
LT
3071 if (sock->type == SOCK_PACKET)
3072 po->prot_hook.func = packet_rcv_spkt;
be02097c 3073
1da177e4
LT
3074 po->prot_hook.af_packet_priv = sk;
3075
0e11c91e
AV
3076 if (proto) {
3077 po->prot_hook.type = proto;
ce06b03e 3078 register_prot_hook(sk);
1da177e4
LT
3079 }
3080
0fa7fa98 3081 mutex_lock(&net->packet.sklist_lock);
808f5114 3082 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3083 mutex_unlock(&net->packet.sklist_lock);
3084
3085 preempt_disable();
3680453c 3086 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3087 preempt_enable();
808f5114 3088
40d4e3df 3089 return 0;
b0138408
DB
3090out2:
3091 sk_free(sk);
1da177e4
LT
3092out:
3093 return err;
3094}
3095
3096/*
3097 * Pull a packet from our receive queue and hand it to the user.
3098 * If necessary we block.
3099 */
3100
1b784140
YX
3101static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3102 int flags)
1da177e4
LT
3103{
3104 struct sock *sk = sock->sk;
3105 struct sk_buff *skb;
3106 int copied, err;
bfd5f4a3 3107 int vnet_hdr_len = 0;
2472d761 3108 unsigned int origlen = 0;
1da177e4
LT
3109
3110 err = -EINVAL;
ed85b565 3111 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3112 goto out;
3113
3114#if 0
3115 /* What error should we return now? EUNATTACH? */
3116 if (pkt_sk(sk)->ifindex < 0)
3117 return -ENODEV;
3118#endif
3119
ed85b565 3120 if (flags & MSG_ERRQUEUE) {
3121 err = sock_recv_errqueue(sk, msg, len,
3122 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3123 goto out;
3124 }
3125
1da177e4
LT
3126 /*
3127 * Call the generic datagram receiver. This handles all sorts
3128 * of horrible races and re-entrancy so we can forget about it
3129 * in the protocol layers.
3130 *
3131	 * Now it will return ENETDOWN if the device has just gone down,
3132	 * but then it will block.
3133 */
3134
40d4e3df 3135 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3136
3137 /*
3138	 * An error occurred, so return it. Because skb_recv_datagram()
3139	 * handles the blocking, we don't have to see or worry about blocking
3140	 * retries.
3141 */
3142
8ae55f04 3143 if (skb == NULL)
1da177e4
LT
3144 goto out;
3145
2ccdbaa6
WB
3146 if (pkt_sk(sk)->pressure)
3147 packet_rcv_has_room(pkt_sk(sk), NULL);
3148
bfd5f4a3
SS
3149 if (pkt_sk(sk)->has_vnet_hdr) {
3150 struct virtio_net_hdr vnet_hdr = { 0 };
3151
3152 err = -EINVAL;
3153 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 3154 if (len < vnet_hdr_len)
bfd5f4a3
SS
3155 goto out_free;
3156
1f18b717
MK
3157 len -= vnet_hdr_len;
3158
bfd5f4a3
SS
3159 if (skb_is_gso(skb)) {
3160 struct skb_shared_info *sinfo = skb_shinfo(skb);
3161
3162 /* This is a hint as to how much should be linear. */
dc9e5153
MT
3163 vnet_hdr.hdr_len =
3164 __cpu_to_virtio16(false, skb_headlen(skb));
3165 vnet_hdr.gso_size =
3166 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
3167 if (sinfo->gso_type & SKB_GSO_TCPV4)
3168 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
3169 else if (sinfo->gso_type & SKB_GSO_TCPV6)
3170 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
3171 else if (sinfo->gso_type & SKB_GSO_UDP)
3172 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
3173 else if (sinfo->gso_type & SKB_GSO_FCOE)
3174 goto out_free;
3175 else
3176 BUG();
3177 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
3178 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
3179 } else
3180 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
3181
3182 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3183 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
3184 vnet_hdr.csum_start = __cpu_to_virtio16(false,
3185 skb_checksum_start_offset(skb));
3186 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
3187 skb->csum_offset);
10a8d94a
JW
3188 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
3189 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
3190 } /* else everything is zero */
3191
7eab8d9e 3192 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
3193 if (err < 0)
3194 goto out_free;
3195 }
3196
f3d33426
HFS
3197	/* You lose any data beyond the buffer you gave. If that worries
3198	 * a user program, it can ask the device for its MTU
3199	 * anyway.
1da177e4 3200 */
1da177e4 3201 copied = skb->len;
40d4e3df
ED
3202 if (copied > len) {
3203 copied = len;
3204 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3205 }
3206
51f3d02b 3207 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3208 if (err)
3209 goto out_free;
3210
3211 if (sock->type != SOCK_PACKET) {
3212 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3213
3214 /* Original length was stored in sockaddr_ll fields */
3215 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3216 sll->sll_family = AF_PACKET;
3217 sll->sll_protocol = skb->protocol;
3218 }
3219
3b885787 3220 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3221
f3d33426
HFS
3222 if (msg->msg_name) {
3223 /* If the address length field is there to be filled
3224 * in, we fill it in now.
3225 */
3226 if (sock->type == SOCK_PACKET) {
342dfc30 3227 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3228 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3229 } else {
3230 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3231
f3d33426
HFS
3232 msg->msg_namelen = sll->sll_halen +
3233 offsetof(struct sockaddr_ll, sll_addr);
3234 }
ffbc6111
HX
3235 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3236 msg->msg_namelen);
f3d33426 3237 }
1da177e4 3238
8dc41944 3239 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3240 struct tpacket_auxdata aux;
3241
3242 aux.tp_status = TP_STATUS_USER;
3243 if (skb->ip_summed == CHECKSUM_PARTIAL)
3244 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3245 else if (skb->pkt_type != PACKET_OUTGOING &&
3246 (skb->ip_summed == CHECKSUM_COMPLETE ||
3247 skb_csum_unnecessary(skb)))
3248 aux.tp_status |= TP_STATUS_CSUM_VALID;
3249
2472d761 3250 aux.tp_len = origlen;
ffbc6111
HX
3251 aux.tp_snaplen = skb->len;
3252 aux.tp_mac = 0;
bbe735e4 3253 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3254 if (skb_vlan_tag_present(skb)) {
3255 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3256 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3257 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3258 } else {
3259 aux.tp_vlan_tci = 0;
a0cdfcf3 3260 aux.tp_vlan_tpid = 0;
a3bcc23e 3261 }
ffbc6111 3262 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3263 }
3264
1da177e4
LT
3265 /*
3266 * Free or return the buffer as appropriate. Again this
3267 * hides all the races and re-entrancy issues from us.
3268 */
bfd5f4a3 3269 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3270
3271out_free:
3272 skb_free_datagram(sk, skb);
3273out:
3274 return err;
3275}
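/*
 * Illustrative userspace sketch (not part of this file): reading the
 * PACKET_AUXDATA control message that packet_recvmsg() above emits, e.g. to
 * recover the original length of a truncated packet and the VLAN tag.
 * Assumes PACKET_AUXDATA was already enabled with setsockopt().
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void recv_with_auxdata(int fd)
{
    unsigned char buf[2048];
    char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
    };
    struct cmsghdr *cmsg;

    if (recvmsg(fd, &msg, 0) < 0)
        return;

    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        struct tpacket_auxdata aux;

        if (cmsg->cmsg_level != SOL_PACKET ||
            cmsg->cmsg_type != PACKET_AUXDATA)
            continue;
        memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
        /* aux.tp_len: original length (even if the read was truncated),
         * aux.tp_status: TP_STATUS_CSUMNOTREADY, VLAN validity flags, ...
         * aux.tp_vlan_tci / tp_vlan_tpid: VLAN tag if present */
    }
}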
3276
1da177e4
LT
3277static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3278 int *uaddr_len, int peer)
3279{
3280 struct net_device *dev;
3281 struct sock *sk = sock->sk;
3282
3283 if (peer)
3284 return -EOPNOTSUPP;
3285
3286 uaddr->sa_family = AF_PACKET;
2dc85bf3 3287 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3288 rcu_read_lock();
3289 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3290 if (dev)
2dc85bf3 3291 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3292 rcu_read_unlock();
1da177e4
LT
3293 *uaddr_len = sizeof(*uaddr);
3294
3295 return 0;
3296}
1da177e4
LT
3297
3298static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3299 int *uaddr_len, int peer)
3300{
3301 struct net_device *dev;
3302 struct sock *sk = sock->sk;
3303 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3304 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3305
3306 if (peer)
3307 return -EOPNOTSUPP;
3308
3309 sll->sll_family = AF_PACKET;
3310 sll->sll_ifindex = po->ifindex;
3311 sll->sll_protocol = po->num;
67286640 3312 sll->sll_pkttype = 0;
654d1f8a
ED
3313 rcu_read_lock();
3314 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3315 if (dev) {
3316 sll->sll_hatype = dev->type;
3317 sll->sll_halen = dev->addr_len;
3318 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3319 } else {
3320 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3321 sll->sll_halen = 0;
3322 }
654d1f8a 3323 rcu_read_unlock();
0fb375fb 3324 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3325
3326 return 0;
3327}
3328
2aeb0b88
WC
3329static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3330 int what)
1da177e4
LT
3331{
3332 switch (i->type) {
3333 case PACKET_MR_MULTICAST:
1162563f
JP
3334 if (i->alen != dev->addr_len)
3335 return -EINVAL;
1da177e4 3336 if (what > 0)
22bedad3 3337 return dev_mc_add(dev, i->addr);
1da177e4 3338 else
22bedad3 3339 return dev_mc_del(dev, i->addr);
1da177e4
LT
3340 break;
3341 case PACKET_MR_PROMISC:
2aeb0b88 3342 return dev_set_promiscuity(dev, what);
1da177e4 3343 case PACKET_MR_ALLMULTI:
2aeb0b88 3344 return dev_set_allmulti(dev, what);
d95ed927 3345 case PACKET_MR_UNICAST:
1162563f
JP
3346 if (i->alen != dev->addr_len)
3347 return -EINVAL;
d95ed927 3348 if (what > 0)
a748ee24 3349 return dev_uc_add(dev, i->addr);
d95ed927 3350 else
a748ee24 3351 return dev_uc_del(dev, i->addr);
d95ed927 3352 break;
40d4e3df
ED
3353 default:
3354 break;
1da177e4 3355 }
2aeb0b88 3356 return 0;
1da177e4
LT
3357}
3358
3359static void packet_dev_mclist_delete(struct net_device *dev,
3360 struct packet_mclist **mlp)
1da177e4 3361{
82f17091
FR
3362 struct packet_mclist *ml;
3363
3364 while ((ml = *mlp) != NULL) {
3365 if (ml->ifindex == dev->ifindex) {
3366 packet_dev_mc(dev, ml, -1);
3367 *mlp = ml->next;
3368 kfree(ml);
3369 } else
3370 mlp = &ml->next;
1da177e4
LT
3371 }
3372}
3373
0fb375fb 3374static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3375{
3376 struct packet_sock *po = pkt_sk(sk);
3377 struct packet_mclist *ml, *i;
3378 struct net_device *dev;
3379 int err;
3380
3381 rtnl_lock();
3382
3383 err = -ENODEV;
3b1e0a65 3384 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3385 if (!dev)
3386 goto done;
3387
3388 err = -EINVAL;
1162563f 3389 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3390 goto done;
3391
3392 err = -ENOBUFS;
8b3a7005 3393 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3394 if (i == NULL)
3395 goto done;
3396
3397 err = 0;
3398 for (ml = po->mclist; ml; ml = ml->next) {
3399 if (ml->ifindex == mreq->mr_ifindex &&
3400 ml->type == mreq->mr_type &&
3401 ml->alen == mreq->mr_alen &&
3402 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3403 ml->count++;
3404 /* Free the new element ... */
3405 kfree(i);
3406 goto done;
3407 }
3408 }
3409
3410 i->type = mreq->mr_type;
3411 i->ifindex = mreq->mr_ifindex;
3412 i->alen = mreq->mr_alen;
3413 memcpy(i->addr, mreq->mr_address, i->alen);
3414 i->count = 1;
3415 i->next = po->mclist;
3416 po->mclist = i;
2aeb0b88
WC
3417 err = packet_dev_mc(dev, i, 1);
3418 if (err) {
3419 po->mclist = i->next;
3420 kfree(i);
3421 }
1da177e4
LT
3422
3423done:
3424 rtnl_unlock();
3425 return err;
3426}
3427
0fb375fb 3428static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3429{
3430 struct packet_mclist *ml, **mlp;
3431
3432 rtnl_lock();
3433
3434 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3435 if (ml->ifindex == mreq->mr_ifindex &&
3436 ml->type == mreq->mr_type &&
3437 ml->alen == mreq->mr_alen &&
3438 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3439 if (--ml->count == 0) {
3440 struct net_device *dev;
3441 *mlp = ml->next;
ad959e76
ED
3442 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3443 if (dev)
1da177e4 3444 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3445 kfree(ml);
3446 }
82f17091 3447 break;
1da177e4
LT
3448 }
3449 }
3450 rtnl_unlock();
82f17091 3451 return 0;
1da177e4
LT
3452}
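/*
 * Illustrative userspace sketch (not part of this file): requesting
 * promiscuous mode through the membership interface handled by
 * packet_mc_add() above, instead of toggling IFF_PROMISC directly.
 * The interface name is a caller-supplied placeholder.
 */
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int enable_promisc(int fd, const char *ifname)
{
    struct packet_mreq mreq;

    memset(&mreq, 0, sizeof(mreq));
    mreq.mr_ifindex = if_nametoindex(ifname);
    mreq.mr_type    = PACKET_MR_PROMISC;    /* no address needed */

    /* Dropped automatically when the socket is closed, or explicitly
     * with PACKET_DROP_MEMBERSHIP. */
    return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                      &mreq, sizeof(mreq));
}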
3453
3454static void packet_flush_mclist(struct sock *sk)
3455{
3456 struct packet_sock *po = pkt_sk(sk);
3457 struct packet_mclist *ml;
3458
3459 if (!po->mclist)
3460 return;
3461
3462 rtnl_lock();
3463 while ((ml = po->mclist) != NULL) {
3464 struct net_device *dev;
3465
3466 po->mclist = ml->next;
ad959e76
ED
3467 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3468 if (dev != NULL)
1da177e4 3469 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3470 kfree(ml);
3471 }
3472 rtnl_unlock();
3473}
1da177e4
LT
3474
3475static int
b7058842 3476packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3477{
3478 struct sock *sk = sock->sk;
8dc41944 3479 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3480 int ret;
3481
3482 if (level != SOL_PACKET)
3483 return -ENOPROTOOPT;
3484
69e3c75f 3485 switch (optname) {
1ce4f28b 3486 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3487 case PACKET_DROP_MEMBERSHIP:
3488 {
0fb375fb
EB
3489 struct packet_mreq_max mreq;
3490 int len = optlen;
3491 memset(&mreq, 0, sizeof(mreq));
3492 if (len < sizeof(struct packet_mreq))
1da177e4 3493 return -EINVAL;
0fb375fb
EB
3494 if (len > sizeof(mreq))
3495 len = sizeof(mreq);
40d4e3df 3496 if (copy_from_user(&mreq, optval, len))
1da177e4 3497 return -EFAULT;
0fb375fb
EB
3498 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3499 return -EINVAL;
1da177e4
LT
3500 if (optname == PACKET_ADD_MEMBERSHIP)
3501 ret = packet_mc_add(sk, &mreq);
3502 else
3503 ret = packet_mc_drop(sk, &mreq);
3504 return ret;
3505 }
a2efcfa0 3506
1da177e4 3507 case PACKET_RX_RING:
69e3c75f 3508 case PACKET_TX_RING:
1da177e4 3509 {
f6fb8f10 3510 union tpacket_req_u req_u;
3511 int len;
1da177e4 3512
f6fb8f10 3513 switch (po->tp_version) {
3514 case TPACKET_V1:
3515 case TPACKET_V2:
3516 len = sizeof(req_u.req);
3517 break;
3518 case TPACKET_V3:
3519 default:
3520 len = sizeof(req_u.req3);
3521 break;
3522 }
3523 if (optlen < len)
1da177e4 3524 return -EINVAL;
bfd5f4a3
SS
3525 if (pkt_sk(sk)->has_vnet_hdr)
3526 return -EINVAL;
f6fb8f10 3527 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3528 return -EFAULT;
f6fb8f10 3529 return packet_set_ring(sk, &req_u, 0,
3530 optname == PACKET_TX_RING);
1da177e4
LT
3531 }
3532 case PACKET_COPY_THRESH:
3533 {
3534 int val;
3535
40d4e3df 3536 if (optlen != sizeof(val))
1da177e4 3537 return -EINVAL;
40d4e3df 3538 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3539 return -EFAULT;
3540
3541 pkt_sk(sk)->copy_thresh = val;
3542 return 0;
3543 }
3544 case PACKET_VERSION:
3545 {
3546 int val;
3547
3548 if (optlen != sizeof(val))
3549 return -EINVAL;
69e3c75f 3550 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3551 return -EBUSY;
3552 if (copy_from_user(&val, optval, sizeof(val)))
3553 return -EFAULT;
3554 switch (val) {
3555 case TPACKET_V1:
3556 case TPACKET_V2:
f6fb8f10 3557 case TPACKET_V3:
bbd6ef87
PM
3558 po->tp_version = val;
3559 return 0;
3560 default:
3561 return -EINVAL;
3562 }
3563 }
8913336a
PM
3564 case PACKET_RESERVE:
3565 {
3566 unsigned int val;
3567
3568 if (optlen != sizeof(val))
3569 return -EINVAL;
69e3c75f 3570 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3571 return -EBUSY;
3572 if (copy_from_user(&val, optval, sizeof(val)))
3573 return -EFAULT;
3574 po->tp_reserve = val;
3575 return 0;
3576 }
69e3c75f
JB
3577 case PACKET_LOSS:
3578 {
3579 unsigned int val;
3580
3581 if (optlen != sizeof(val))
3582 return -EINVAL;
3583 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3584 return -EBUSY;
3585 if (copy_from_user(&val, optval, sizeof(val)))
3586 return -EFAULT;
3587 po->tp_loss = !!val;
3588 return 0;
3589 }
8dc41944
HX
3590 case PACKET_AUXDATA:
3591 {
3592 int val;
3593
3594 if (optlen < sizeof(val))
3595 return -EINVAL;
3596 if (copy_from_user(&val, optval, sizeof(val)))
3597 return -EFAULT;
3598
3599 po->auxdata = !!val;
3600 return 0;
3601 }
80feaacb
PWJ
3602 case PACKET_ORIGDEV:
3603 {
3604 int val;
3605
3606 if (optlen < sizeof(val))
3607 return -EINVAL;
3608 if (copy_from_user(&val, optval, sizeof(val)))
3609 return -EFAULT;
3610
3611 po->origdev = !!val;
3612 return 0;
3613 }
bfd5f4a3
SS
3614 case PACKET_VNET_HDR:
3615 {
3616 int val;
3617
3618 if (sock->type != SOCK_RAW)
3619 return -EINVAL;
3620 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3621 return -EBUSY;
3622 if (optlen < sizeof(val))
3623 return -EINVAL;
3624 if (copy_from_user(&val, optval, sizeof(val)))
3625 return -EFAULT;
3626
3627 po->has_vnet_hdr = !!val;
3628 return 0;
3629 }
614f60fa
SM
3630 case PACKET_TIMESTAMP:
3631 {
3632 int val;
3633
3634 if (optlen != sizeof(val))
3635 return -EINVAL;
3636 if (copy_from_user(&val, optval, sizeof(val)))
3637 return -EFAULT;
3638
3639 po->tp_tstamp = val;
3640 return 0;
3641 }
dc99f600
DM
3642 case PACKET_FANOUT:
3643 {
3644 int val;
3645
3646 if (optlen != sizeof(val))
3647 return -EINVAL;
3648 if (copy_from_user(&val, optval, sizeof(val)))
3649 return -EFAULT;
3650
3651 return fanout_add(sk, val & 0xffff, val >> 16);
3652 }
47dceb8e
WB
3653 case PACKET_FANOUT_DATA:
3654 {
3655 if (!po->fanout)
3656 return -EINVAL;
3657
3658 return fanout_set_data(po, optval, optlen);
3659 }
5920cd3a
PC
3660 case PACKET_TX_HAS_OFF:
3661 {
3662 unsigned int val;
3663
3664 if (optlen != sizeof(val))
3665 return -EINVAL;
3666 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3667 return -EBUSY;
3668 if (copy_from_user(&val, optval, sizeof(val)))
3669 return -EFAULT;
3670 po->tp_tx_has_off = !!val;
3671 return 0;
3672 }
d346a3fa
DB
3673 case PACKET_QDISC_BYPASS:
3674 {
3675 int val;
3676
3677 if (optlen != sizeof(val))
3678 return -EINVAL;
3679 if (copy_from_user(&val, optval, sizeof(val)))
3680 return -EFAULT;
3681
3682 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3683 return 0;
3684 }
1da177e4
LT
3685 default:
3686 return -ENOPROTOOPT;
3687 }
3688}
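/*
 * Illustrative userspace sketch (not part of this file): the PACKET_FANOUT
 * option handled above packs a 16-bit group id into the low half of the
 * integer and the fanout type/flags into the high half, matching
 * fanout_add(sk, val & 0xffff, val >> 16).
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int join_fanout_group(int fd, unsigned short group_id)
{
    int val = group_id | (PACKET_FANOUT_HASH << 16);

    /* Every socket joining the same id on the same protocol and namespace
     * shares the load; other types include PACKET_FANOUT_LB and
     * PACKET_FANOUT_CPU. */
    return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}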
3689
3690static int packet_getsockopt(struct socket *sock, int level, int optname,
3691 char __user *optval, int __user *optlen)
3692{
3693 int len;
c06fff6e 3694 int val, lv = sizeof(val);
1da177e4
LT
3695 struct sock *sk = sock->sk;
3696 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3697 void *data = &val;
ee80fbf3 3698 union tpacket_stats_u st;
a9b63918 3699 struct tpacket_rollover_stats rstats;
1da177e4
LT
3700
3701 if (level != SOL_PACKET)
3702 return -ENOPROTOOPT;
3703
8ae55f04
KK
3704 if (get_user(len, optlen))
3705 return -EFAULT;
1da177e4
LT
3706
3707 if (len < 0)
3708 return -EINVAL;
1ce4f28b 3709
69e3c75f 3710 switch (optname) {
1da177e4 3711 case PACKET_STATISTICS:
1da177e4 3712 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3713 memcpy(&st, &po->stats, sizeof(st));
3714 memset(&po->stats, 0, sizeof(po->stats));
3715 spin_unlock_bh(&sk->sk_receive_queue.lock);
3716
f6fb8f10 3717 if (po->tp_version == TPACKET_V3) {
c06fff6e 3718 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3719 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3720 data = &st.stats3;
f6fb8f10 3721 } else {
c06fff6e 3722 lv = sizeof(struct tpacket_stats);
8bcdeaff 3723 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3724 data = &st.stats1;
f6fb8f10 3725 }
ee80fbf3 3726
8dc41944
HX
3727 break;
3728 case PACKET_AUXDATA:
8dc41944 3729 val = po->auxdata;
80feaacb
PWJ
3730 break;
3731 case PACKET_ORIGDEV:
80feaacb 3732 val = po->origdev;
bfd5f4a3
SS
3733 break;
3734 case PACKET_VNET_HDR:
bfd5f4a3 3735 val = po->has_vnet_hdr;
1da177e4 3736 break;
bbd6ef87 3737 case PACKET_VERSION:
bbd6ef87 3738 val = po->tp_version;
bbd6ef87
PM
3739 break;
3740 case PACKET_HDRLEN:
3741 if (len > sizeof(int))
3742 len = sizeof(int);
3743 if (copy_from_user(&val, optval, len))
3744 return -EFAULT;
3745 switch (val) {
3746 case TPACKET_V1:
3747 val = sizeof(struct tpacket_hdr);
3748 break;
3749 case TPACKET_V2:
3750 val = sizeof(struct tpacket2_hdr);
3751 break;
f6fb8f10 3752 case TPACKET_V3:
3753 val = sizeof(struct tpacket3_hdr);
3754 break;
bbd6ef87
PM
3755 default:
3756 return -EINVAL;
3757 }
bbd6ef87 3758 break;
8913336a 3759 case PACKET_RESERVE:
8913336a 3760 val = po->tp_reserve;
8913336a 3761 break;
69e3c75f 3762 case PACKET_LOSS:
69e3c75f 3763 val = po->tp_loss;
69e3c75f 3764 break;
614f60fa 3765 case PACKET_TIMESTAMP:
614f60fa 3766 val = po->tp_tstamp;
614f60fa 3767 break;
dc99f600 3768 case PACKET_FANOUT:
dc99f600
DM
3769 val = (po->fanout ?
3770 ((u32)po->fanout->id |
77f65ebd
WB
3771 ((u32)po->fanout->type << 16) |
3772 ((u32)po->fanout->flags << 24)) :
dc99f600 3773 0);
dc99f600 3774 break;
a9b63918
WB
3775 case PACKET_ROLLOVER_STATS:
3776 if (!po->rollover)
3777 return -EINVAL;
3778 rstats.tp_all = atomic_long_read(&po->rollover->num);
3779 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3780 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3781 data = &rstats;
3782 lv = sizeof(rstats);
3783 break;
5920cd3a
PC
3784 case PACKET_TX_HAS_OFF:
3785 val = po->tp_tx_has_off;
3786 break;
d346a3fa
DB
3787 case PACKET_QDISC_BYPASS:
3788 val = packet_use_direct_xmit(po);
3789 break;
1da177e4
LT
3790 default:
3791 return -ENOPROTOOPT;
3792 }
3793
c06fff6e
ED
3794 if (len > lv)
3795 len = lv;
8ae55f04
KK
3796 if (put_user(len, optlen))
3797 return -EFAULT;
8dc41944
HX
3798 if (copy_to_user(optval, data, len))
3799 return -EFAULT;
8ae55f04 3800 return 0;
1da177e4
LT
3801}
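
Userspace reads these values back with getsockopt(). Below is a sketch of draining PACKET_STATISTICS on an existing TPACKET_V1/V2 socket; as the code above shows, the counters are zeroed on every read, so each call returns the delta since the previous one. The socket descriptor 'fd' is assumed to exist already.

/* Sketch: reading PACKET_STATISTICS (TPACKET_V1/V2 layout) on an
 * already-created AF_PACKET socket 'fd'.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0) {
		perror("PACKET_STATISTICS");
		return;
	}
	/* tp_packets already includes tp_drops, see packet_getsockopt() */
	printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}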
3802
3803
351638e7
JP
3804static int packet_notifier(struct notifier_block *this,
3805 unsigned long msg, void *ptr)
1da177e4
LT
3806{
3807 struct sock *sk;
351638e7 3808 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3809 struct net *net = dev_net(dev);
1da177e4 3810
808f5114 3811 rcu_read_lock();
b67bfe0d 3812 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3813 struct packet_sock *po = pkt_sk(sk);
3814
3815 switch (msg) {
3816 case NETDEV_UNREGISTER:
1da177e4 3817 if (po->mclist)
82f17091 3818 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3819 /* fallthrough */
3820
1da177e4
LT
3821 case NETDEV_DOWN:
3822 if (dev->ifindex == po->ifindex) {
3823 spin_lock(&po->bind_lock);
3824 if (po->running) {
ce06b03e 3825 __unregister_prot_hook(sk, false);
1da177e4
LT
3826 sk->sk_err = ENETDOWN;
3827 if (!sock_flag(sk, SOCK_DEAD))
3828 sk->sk_error_report(sk);
3829 }
3830 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3831 packet_cached_dev_reset(po);
1da177e4 3832 po->ifindex = -1;
160ff18a
BG
3833 if (po->prot_hook.dev)
3834 dev_put(po->prot_hook.dev);
1da177e4
LT
3835 po->prot_hook.dev = NULL;
3836 }
3837 spin_unlock(&po->bind_lock);
3838 }
3839 break;
3840 case NETDEV_UP:
808f5114 3841 if (dev->ifindex == po->ifindex) {
3842 spin_lock(&po->bind_lock);
ce06b03e
DM
3843 if (po->num)
3844 register_prot_hook(sk);
808f5114 3845 spin_unlock(&po->bind_lock);
1da177e4 3846 }
1da177e4
LT
3847 break;
3848 }
3849 }
808f5114 3850 rcu_read_unlock();
1da177e4
LT
3851 return NOTIFY_DONE;
3852}
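
packet_notifier() is a standard netdevice notifier: it reacts to NETDEV_UNREGISTER, NETDEV_DOWN and NETDEV_UP on behalf of every packet socket in the namespace. A minimal out-of-tree module sketch using the same mechanism follows; all names in it are hypothetical and not part of af_packet.c.

/* Hypothetical module sketch: registering a netdevice notifier the same way
 * packet_netdev_notifier is registered near the bottom of this file.
 */
#include <linux/module.h>
#include <linux/netdevice.h>

static int sample_netdev_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_DOWN)
		pr_info("sample: %s went down\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block sample_nb = {
	.notifier_call = sample_netdev_event,
};

static int __init sample_init(void)
{
	return register_netdevice_notifier(&sample_nb);
}

static void __exit sample_exit(void)
{
	unregister_netdevice_notifier(&sample_nb);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");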
3853
3854
3855static int packet_ioctl(struct socket *sock, unsigned int cmd,
3856 unsigned long arg)
3857{
3858 struct sock *sk = sock->sk;
3859
69e3c75f 3860 switch (cmd) {
40d4e3df
ED
3861 case SIOCOUTQ:
3862 {
3863 int amount = sk_wmem_alloc_get(sk);
31e6d363 3864
40d4e3df
ED
3865 return put_user(amount, (int __user *)arg);
3866 }
3867 case SIOCINQ:
3868 {
3869 struct sk_buff *skb;
3870 int amount = 0;
3871
3872 spin_lock_bh(&sk->sk_receive_queue.lock);
3873 skb = skb_peek(&sk->sk_receive_queue);
3874 if (skb)
3875 amount = skb->len;
3876 spin_unlock_bh(&sk->sk_receive_queue.lock);
3877 return put_user(amount, (int __user *)arg);
3878 }
3879 case SIOCGSTAMP:
3880 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3881 case SIOCGSTAMPNS:
3882 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3883
1da177e4 3884#ifdef CONFIG_INET
40d4e3df
ED
3885 case SIOCADDRT:
3886 case SIOCDELRT:
3887 case SIOCDARP:
3888 case SIOCGARP:
3889 case SIOCSARP:
3890 case SIOCGIFADDR:
3891 case SIOCSIFADDR:
3892 case SIOCGIFBRDADDR:
3893 case SIOCSIFBRDADDR:
3894 case SIOCGIFNETMASK:
3895 case SIOCSIFNETMASK:
3896 case SIOCGIFDSTADDR:
3897 case SIOCSIFDSTADDR:
3898 case SIOCSIFFLAGS:
40d4e3df 3899 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3900#endif
3901
40d4e3df
ED
3902 default:
3903 return -ENOIOCTLCMD;
1da177e4
LT
3904 }
3905 return 0;
3906}
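
From userspace these ioctls are issued on the packet socket itself: SIOCINQ reports the length of the packet at the head of the receive queue and SIOCOUTQ the bytes still queued for transmit. A sketch, with 'fd' assumed to be an existing AF_PACKET socket:

/* Sketch: querying queued byte counts on an existing AF_PACKET socket 'fd'. */
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */
#include <stdio.h>
#include <sys/ioctl.h>

static void dump_queue_lengths(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) < 0)
		perror("SIOCINQ");
	if (ioctl(fd, SIOCOUTQ, &outq) < 0)
		perror("SIOCOUTQ");
	printf("next rx packet: %d bytes, pending tx: %d bytes\n", inq, outq);
}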
3907
40d4e3df 3908static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3909 poll_table *wait)
3910{
3911 struct sock *sk = sock->sk;
3912 struct packet_sock *po = pkt_sk(sk);
3913 unsigned int mask = datagram_poll(file, sock, wait);
3914
3915 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3916 if (po->rx_ring.pg_vec) {
f6fb8f10 3917 if (!packet_previous_rx_frame(po, &po->rx_ring,
3918 TP_STATUS_KERNEL))
1da177e4
LT
3919 mask |= POLLIN | POLLRDNORM;
3920 }
2ccdbaa6 3921 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 3922 po->pressure = 0;
1da177e4 3923 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3924 spin_lock_bh(&sk->sk_write_queue.lock);
3925 if (po->tx_ring.pg_vec) {
3926 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3927 mask |= POLLOUT | POLLWRNORM;
3928 }
3929 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3930 return mask;
3931}
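
packet_poll() flags POLLIN once the frame just before the kernel's rx head is no longer in TP_STATUS_KERNEL (i.e. at least one filled frame is waiting), and POLLOUT while the current tx-ring slot is available, so a ring consumer only needs to sleep in poll() when the next frame is still owned by the kernel. The sketch below shows that receive loop for a TPACKET_V2 ring; it assumes the ring was mapped as in the packet_mmap() example further down and that tp_block_size is an exact multiple of tp_frame_size, so frames can be indexed linearly.

/* Sketch: TPACKET_V2 receive loop over an already-mapped ring. 'ring',
 * 'frame_nr' and 'frame_size' are assumed to come from the PACKET_RX_RING
 * and mmap() setup shown later, with tp_block_size a multiple of
 * tp_frame_size so frame i lives at ring + i * frame_size.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <stdio.h>

static void rx_loop(int fd, char *ring, unsigned int frame_nr,
		    unsigned int frame_size)
{
	unsigned int i = 0;

	for (;;) {
		struct tpacket2_hdr *hdr = (void *)(ring + i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			/* Nothing ready: wait for packet_poll() to signal POLLIN. */
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		printf("frame %u: %u bytes\n", i, hdr->tp_len);
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
		i = (i + 1) % frame_nr;
	}
}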
3932
3933
 3934/* Dirty? Well, I still have not learned a better way to account
3935 * for user mmaps.
3936 */
3937
3938static void packet_mm_open(struct vm_area_struct *vma)
3939{
3940 struct file *file = vma->vm_file;
40d4e3df 3941 struct socket *sock = file->private_data;
1da177e4 3942 struct sock *sk = sock->sk;
1ce4f28b 3943
1da177e4
LT
3944 if (sk)
3945 atomic_inc(&pkt_sk(sk)->mapped);
3946}
3947
3948static void packet_mm_close(struct vm_area_struct *vma)
3949{
3950 struct file *file = vma->vm_file;
40d4e3df 3951 struct socket *sock = file->private_data;
1da177e4 3952 struct sock *sk = sock->sk;
1ce4f28b 3953
1da177e4
LT
3954 if (sk)
3955 atomic_dec(&pkt_sk(sk)->mapped);
3956}
3957
f0f37e2f 3958static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3959 .open = packet_mm_open,
3960 .close = packet_mm_close,
1da177e4
LT
3961};
3962
0e3125c7
NH
3963static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3964 unsigned int len)
1da177e4
LT
3965{
3966 int i;
3967
4ebf0ae2 3968 for (i = 0; i < len; i++) {
0e3125c7 3969 if (likely(pg_vec[i].buffer)) {
c56b4d90 3970 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3971 vfree(pg_vec[i].buffer);
3972 else
3973 free_pages((unsigned long)pg_vec[i].buffer,
3974 order);
3975 pg_vec[i].buffer = NULL;
3976 }
1da177e4
LT
3977 }
3978 kfree(pg_vec);
3979}
3980
eea49cc9 3981static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3982{
f0d4eb29 3983 char *buffer;
0e3125c7
NH
3984 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3985 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3986
3987 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3988 if (buffer)
3989 return buffer;
3990
f0d4eb29 3991 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3992 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3993 if (buffer)
3994 return buffer;
3995
f0d4eb29 3996 	/* vmalloc failed, let's dig into swap here */
0e3125c7 3997 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3998 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3999 if (buffer)
4000 return buffer;
4001
f0d4eb29 4002 /* complete and utter failure */
0e3125c7 4003 return NULL;
4ebf0ae2
DM
4004}
4005
0e3125c7 4006static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4007{
4008 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4009 struct pgv *pg_vec;
4ebf0ae2
DM
4010 int i;
4011
0e3125c7 4012 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4013 if (unlikely(!pg_vec))
4014 goto out;
4015
4016 for (i = 0; i < block_nr; i++) {
c56b4d90 4017 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4018 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4019 goto out_free_pgvec;
4020 }
4021
4022out:
4023 return pg_vec;
4024
4025out_free_pgvec:
4026 free_pg_vec(pg_vec, order, block_nr);
4027 pg_vec = NULL;
4028 goto out;
4029}
1da177e4 4030
f6fb8f10 4031static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4032 int closing, int tx_ring)
1da177e4 4033{
0e3125c7 4034 struct pgv *pg_vec = NULL;
1da177e4 4035 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4036 int was_running, order = 0;
69e3c75f
JB
4037 struct packet_ring_buffer *rb;
4038 struct sk_buff_head *rb_queue;
0e11c91e 4039 __be16 num;
f6fb8f10 4040 int err = -EINVAL;
 4042	/* Added to keep code churn minimal */
4042 struct tpacket_req *req = &req_u->req;
4043
4044 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4045 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4046 WARN(1, "Tx-ring is not supported.\n");
4047 goto out;
4048 }
1ce4f28b 4049
69e3c75f
JB
4050 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4051 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4052
69e3c75f
JB
4053 err = -EBUSY;
4054 if (!closing) {
4055 if (atomic_read(&po->mapped))
4056 goto out;
b0138408 4057 if (packet_read_pending(rb))
69e3c75f
JB
4058 goto out;
4059 }
1da177e4 4060
69e3c75f
JB
4061 if (req->tp_block_nr) {
4062 /* Sanity tests and some calculations */
4063 err = -EBUSY;
4064 if (unlikely(rb->pg_vec))
4065 goto out;
1da177e4 4066
bbd6ef87
PM
4067 switch (po->tp_version) {
4068 case TPACKET_V1:
4069 po->tp_hdrlen = TPACKET_HDRLEN;
4070 break;
4071 case TPACKET_V2:
4072 po->tp_hdrlen = TPACKET2_HDRLEN;
4073 break;
f6fb8f10 4074 case TPACKET_V3:
4075 po->tp_hdrlen = TPACKET3_HDRLEN;
4076 break;
bbd6ef87
PM
4077 }
4078
69e3c75f 4079 err = -EINVAL;
4ebf0ae2 4080 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4081 goto out;
4ebf0ae2 4082 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 4083 goto out;
dc808110
ED
4084 if (po->tp_version >= TPACKET_V3 &&
4085 (int)(req->tp_block_size -
4086 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4087 goto out;
8913336a 4088 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4089 po->tp_reserve))
4090 goto out;
4ebf0ae2 4091 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4092 goto out;
1da177e4 4093
69e3c75f
JB
4094 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
4095 if (unlikely(rb->frames_per_block <= 0))
4096 goto out;
4097 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4098 req->tp_frame_nr))
4099 goto out;
1da177e4
LT
4100
4101 err = -ENOMEM;
4ebf0ae2
DM
4102 order = get_order(req->tp_block_size);
4103 pg_vec = alloc_pg_vec(req, order);
4104 if (unlikely(!pg_vec))
1da177e4 4105 goto out;
f6fb8f10 4106 switch (po->tp_version) {
4107 case TPACKET_V3:
4108 /* Transmit path is not supported. We checked
4109 * it above but just being paranoid
4110 */
4111 if (!tx_ring)
e8e85cc5 4112 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4113 break;
f6fb8f10 4114 default:
4115 break;
4116 }
69e3c75f
JB
4117 }
4118 /* Done */
4119 else {
4120 err = -EINVAL;
4ebf0ae2 4121 if (unlikely(req->tp_frame_nr))
69e3c75f 4122 goto out;
1da177e4
LT
4123 }
4124
4125 lock_sock(sk);
4126
4127 /* Detach socket from network */
4128 spin_lock(&po->bind_lock);
4129 was_running = po->running;
4130 num = po->num;
4131 if (was_running) {
1da177e4 4132 po->num = 0;
ce06b03e 4133 __unregister_prot_hook(sk, false);
1da177e4
LT
4134 }
4135 spin_unlock(&po->bind_lock);
1ce4f28b 4136
1da177e4
LT
4137 synchronize_net();
4138
4139 err = -EBUSY;
905db440 4140 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4141 if (closing || atomic_read(&po->mapped) == 0) {
4142 err = 0;
69e3c75f 4143 spin_lock_bh(&rb_queue->lock);
c053fd96 4144 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4145 rb->frame_max = (req->tp_frame_nr - 1);
4146 rb->head = 0;
4147 rb->frame_size = req->tp_frame_size;
4148 spin_unlock_bh(&rb_queue->lock);
4149
c053fd96
CG
4150 swap(rb->pg_vec_order, order);
4151 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4152
4153 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4154 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4155 tpacket_rcv : packet_rcv;
4156 skb_queue_purge(rb_queue);
1da177e4 4157 if (atomic_read(&po->mapped))
40d4e3df
ED
4158 pr_err("packet_mmap: vma is busy: %d\n",
4159 atomic_read(&po->mapped));
1da177e4 4160 }
905db440 4161 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4162
4163 spin_lock(&po->bind_lock);
ce06b03e 4164 if (was_running) {
1da177e4 4165 po->num = num;
ce06b03e 4166 register_prot_hook(sk);
1da177e4
LT
4167 }
4168 spin_unlock(&po->bind_lock);
f6fb8f10 4169 if (closing && (po->tp_version > TPACKET_V2)) {
4170 /* Because we don't support block-based V3 on tx-ring */
4171 if (!tx_ring)
73d0fcf2 4172 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4173 }
1da177e4
LT
4174 release_sock(sk);
4175
1da177e4
LT
4176 if (pg_vec)
4177 free_pg_vec(pg_vec, order, req->tp_block_nr);
4178out:
4179 return err;
4180}
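
The sanity checks above pin down the ring geometry userspace may request: tp_block_size must be a positive multiple of PAGE_SIZE, tp_frame_size must hold the tpacket header plus tp_reserve and be TPACKET_ALIGNMENT-aligned, and tp_frame_nr must equal frames-per-block times tp_block_nr. A sketch of a TPACKET_V2 receive-ring request that satisfies these checks follows; all sizes are illustrative assumptions.

/* Sketch: requesting a TPACKET_V2 receive ring whose geometry passes the
 * checks in packet_set_ring(). Sizes are illustrative assumptions.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static int setup_rx_ring(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 1 << 16,	/* multiple of PAGE_SIZE */
		.tp_block_nr   = 64,
		.tp_frame_size = 1 << 11,	/* aligned, >= tp_hdrlen + tp_reserve */
		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64, /* frames_per_block * block_nr */
	};

	/* The version must be set before the ring; it fixes po->tp_hdrlen. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0) {
		perror("PACKET_VERSION");
		return -1;
	}
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return -1;
	}
	return 0;
}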
4181
69e3c75f
JB
4182static int packet_mmap(struct file *file, struct socket *sock,
4183 struct vm_area_struct *vma)
1da177e4
LT
4184{
4185 struct sock *sk = sock->sk;
4186 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4187 unsigned long size, expected_size;
4188 struct packet_ring_buffer *rb;
1da177e4
LT
4189 unsigned long start;
4190 int err = -EINVAL;
4191 int i;
4192
4193 if (vma->vm_pgoff)
4194 return -EINVAL;
4195
905db440 4196 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4197
4198 expected_size = 0;
4199 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4200 if (rb->pg_vec) {
4201 expected_size += rb->pg_vec_len
4202 * rb->pg_vec_pages
4203 * PAGE_SIZE;
4204 }
4205 }
4206
4207 if (expected_size == 0)
1da177e4 4208 goto out;
69e3c75f
JB
4209
4210 size = vma->vm_end - vma->vm_start;
4211 if (size != expected_size)
1da177e4
LT
4212 goto out;
4213
1da177e4 4214 start = vma->vm_start;
69e3c75f
JB
4215 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4216 if (rb->pg_vec == NULL)
4217 continue;
4218
4219 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4220 struct page *page;
4221 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4222 int pg_num;
4223
c56b4d90
CG
4224 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4225 page = pgv_to_page(kaddr);
69e3c75f
JB
4226 err = vm_insert_page(vma, start, page);
4227 if (unlikely(err))
4228 goto out;
4229 start += PAGE_SIZE;
0e3125c7 4230 kaddr += PAGE_SIZE;
69e3c75f 4231 }
4ebf0ae2 4232 }
1da177e4 4233 }
69e3c75f 4234
4ebf0ae2 4235 atomic_inc(&po->mapped);
1da177e4
LT
4236 vma->vm_ops = &packet_mmap_ops;
4237 err = 0;
4238
4239out:
905db440 4240 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4241 return err;
4242}
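
packet_mmap() accepts only a single mapping at offset 0 whose length equals the combined size of whatever rx and tx rings have been configured. A sketch of mapping the rx-only ring requested in the previous example:

/* Sketch: mapping the ring requested above. packet_mmap() rejects any
 * vm_pgoff != 0 and any length that does not match the total ring size.
 */
#include <linux/if_packet.h>
#include <stddef.h>
#include <sys/mman.h>

static void *map_rx_ring(int fd, const struct tpacket_req *req)
{
	size_t len = (size_t)req->tp_block_size * req->tp_block_nr;
	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	return ring == MAP_FAILED ? NULL : ring;
}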
1da177e4 4243
90ddc4f0 4244static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4245 .family = PF_PACKET,
4246 .owner = THIS_MODULE,
4247 .release = packet_release,
4248 .bind = packet_bind_spkt,
4249 .connect = sock_no_connect,
4250 .socketpair = sock_no_socketpair,
4251 .accept = sock_no_accept,
4252 .getname = packet_getname_spkt,
4253 .poll = datagram_poll,
4254 .ioctl = packet_ioctl,
4255 .listen = sock_no_listen,
4256 .shutdown = sock_no_shutdown,
4257 .setsockopt = sock_no_setsockopt,
4258 .getsockopt = sock_no_getsockopt,
4259 .sendmsg = packet_sendmsg_spkt,
4260 .recvmsg = packet_recvmsg,
4261 .mmap = sock_no_mmap,
4262 .sendpage = sock_no_sendpage,
4263};
1da177e4 4264
90ddc4f0 4265static const struct proto_ops packet_ops = {
1da177e4
LT
4266 .family = PF_PACKET,
4267 .owner = THIS_MODULE,
4268 .release = packet_release,
4269 .bind = packet_bind,
4270 .connect = sock_no_connect,
4271 .socketpair = sock_no_socketpair,
4272 .accept = sock_no_accept,
1ce4f28b 4273 .getname = packet_getname,
1da177e4
LT
4274 .poll = packet_poll,
4275 .ioctl = packet_ioctl,
4276 .listen = sock_no_listen,
4277 .shutdown = sock_no_shutdown,
4278 .setsockopt = packet_setsockopt,
4279 .getsockopt = packet_getsockopt,
4280 .sendmsg = packet_sendmsg,
4281 .recvmsg = packet_recvmsg,
4282 .mmap = packet_mmap,
4283 .sendpage = sock_no_sendpage,
4284};
4285
ec1b4cf7 4286static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4287 .family = PF_PACKET,
4288 .create = packet_create,
4289 .owner = THIS_MODULE,
4290};
4291
4292static struct notifier_block packet_netdev_notifier = {
40d4e3df 4293 .notifier_call = packet_notifier,
1da177e4
LT
4294};
4295
4296#ifdef CONFIG_PROC_FS
1da177e4
LT
4297
4298static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4299 __acquires(RCU)
1da177e4 4300{
e372c414 4301 struct net *net = seq_file_net(seq);
808f5114 4302
4303 rcu_read_lock();
4304 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4305}
4306
4307static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4308{
1bf40954 4309 struct net *net = seq_file_net(seq);
808f5114 4310 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4311}
4312
4313static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4314 __releases(RCU)
1da177e4 4315{
808f5114 4316 rcu_read_unlock();
1da177e4
LT
4317}
4318
1ce4f28b 4319static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4320{
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4323 else {
b7ceabd9 4324 struct sock *s = sk_entry(v);
1da177e4
LT
4325 const struct packet_sock *po = pkt_sk(s);
4326
4327 seq_printf(seq,
71338aa7 4328 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4329 s,
4330 atomic_read(&s->sk_refcnt),
4331 s->sk_type,
4332 ntohs(po->num),
4333 po->ifindex,
4334 po->running,
4335 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4336 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4337 sock_i_ino(s));
1da177e4
LT
4338 }
4339
4340 return 0;
4341}
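
packet_seq_show() backs /proc/net/packet, printing one line per packet socket in the namespace under the header shown above. A trivial userspace sketch that simply dumps the table:

/* Sketch: dumping the table exported through packet_seq_show(). */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");

	if (!f) {
		perror("/proc/net/packet");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}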
4342
56b3d975 4343static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4344 .start = packet_seq_start,
4345 .next = packet_seq_next,
4346 .stop = packet_seq_stop,
4347 .show = packet_seq_show,
4348};
4349
4350static int packet_seq_open(struct inode *inode, struct file *file)
4351{
e372c414
DL
4352 return seq_open_net(inode, file, &packet_seq_ops,
4353 sizeof(struct seq_net_private));
1da177e4
LT
4354}
4355
da7071d7 4356static const struct file_operations packet_seq_fops = {
1da177e4
LT
4357 .owner = THIS_MODULE,
4358 .open = packet_seq_open,
4359 .read = seq_read,
4360 .llseek = seq_lseek,
e372c414 4361 .release = seq_release_net,
1da177e4
LT
4362};
4363
4364#endif
4365
2c8c1e72 4366static int __net_init packet_net_init(struct net *net)
d12d01d6 4367{
0fa7fa98 4368 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4369 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4370
d4beaa66 4371 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4372 return -ENOMEM;
4373
4374 return 0;
4375}
4376
2c8c1e72 4377static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4378{
ece31ffd 4379 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4380}
4381
4382static struct pernet_operations packet_net_ops = {
4383 .init = packet_net_init,
4384 .exit = packet_net_exit,
4385};
4386
4387
1da177e4
LT
4388static void __exit packet_exit(void)
4389{
1da177e4 4390 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4391 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4392 sock_unregister(PF_PACKET);
4393 proto_unregister(&packet_proto);
4394}
4395
4396static int __init packet_init(void)
4397{
4398 int rc = proto_register(&packet_proto, 0);
4399
4400 if (rc != 0)
4401 goto out;
4402
4403 sock_register(&packet_family_ops);
d12d01d6 4404 register_pernet_subsys(&packet_net_ops);
1da177e4 4405 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4406out:
4407 return rc;
4408}
4409
4410module_init(packet_init);
4411module_exit(packet_exit);
4412MODULE_LICENSE("GPL");
4413MODULE_ALIAS_NETPROTO(PF_PACKET);