/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels still should reserve
     dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are silly (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the
		 ll header. PPP does this, which is wrong, because it
		 introduces asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

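/* Send the skb directly on its mapped tx queue, bypassing the qdisc layer.
 * Used when the socket has PACKET_QDISC_BYPASS enabled (see
 * packet_use_direct_xmit() below).
 */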
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

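/* Pick a tx queue for the skb, preferring the driver's ndo_select_queue()
 * callback when one is provided, and record it in the skb's queue mapping.
 */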
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

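/* The tp_status word of a ring frame is the hand-off point between the
 * kernel and user space, so reads and writes of it below are paired with
 * memory barriers and dcache flushes.
 */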
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

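/* Pick the timestamp to report for a frame: a raw hardware timestamp if
 * requested and available, otherwise the skb's software timestamp.
 */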
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

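/* Derive a default block retire timeout from the link speed: roughly the
 * time needed to fill one block at line rate, falling back to
 * DEFAULT_PRB_RETIRE_TOV for slow or unknown links.
 */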
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and
				 * restarts the timer; thawing/timer-refresh
				 * is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
		struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

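/* Receive-room estimation: ROOM_NORMAL/ROOM_LOW/ROOM_NONE describe how much
 * space is left in the socket receive buffer or in the rx ring, and feed the
 * fanout rollover logic below.
 */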
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

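/* Rollover: keep the packet on the originally chosen socket if it still has
 * room, otherwise walk the other fanout members looking for one with
 * ROOM_NORMAL.
 */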
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

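/* Receive handler shared by all members of a fanout group: select one member
 * socket according to the group's demux mode and pass the skb to its
 * protocol hook.
 */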
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get(fd);
	if (IS_ERR(new))
		return PTR_ERR(new);
	if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(new);
		return -EINVAL;
	}

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

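/* Join or create the fanout group 'id' in this socket's network namespace;
 * called from setsockopt(PACKET_FANOUT).
 */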
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
		if (!po->rollover)
			return -ENOMEM;
		atomic_long_set(&po->rollover->num, 0);
		atomic_long_set(&po->rollover->num_huge, 0);
		atomic_long_set(&po->rollover->num_failed, 0);
	}

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	if (err) {
		kfree(po->rollover);
		po->rollover = NULL;
	}
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		fanout_release_data(f);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);

	if (po->rollover)
		kfree_rcu(po->rollover, rcu);
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (!dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_unlock;
	}
	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
654d1f8a 1945 rcu_read_unlock();
1a35ca80
ED
1946out_free:
1947 kfree_skb(skb);
1da177e4
LT
1948 return err;
1949}
1da177e4 1950
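/* Run the socket's attached BPF filter, if any, under RCU protection.
 * Returns the number of bytes of the packet to keep; 0 means drop it.
 */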
ff936a04
AS
1951static unsigned int run_filter(struct sk_buff *skb,
1952 const struct sock *sk,
1953 unsigned int res)
1da177e4
LT
1954{
1955 struct sk_filter *filter;
fda9ef5d 1956
80f8f102
ED
1957 rcu_read_lock();
1958 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1959 if (filter != NULL)
ff936a04 1960 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1961 rcu_read_unlock();
1da177e4 1962
dbcb5855 1963 return res;
1da177e4
LT
1964}
1965
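/* Build a virtio_net_hdr from the skb's GSO and checksum metadata so a
 * PACKET_VNET_HDR socket can prepend it to the frame handed to userspace.
 * Multi-byte fields are stored in little-endian virtio byte order.
 */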
16cc1400
WB
1966static int __packet_rcv_vnet(const struct sk_buff *skb,
1967 struct virtio_net_hdr *vnet_hdr)
1968{
1969 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1970
1971 if (skb_is_gso(skb)) {
1972 struct skb_shared_info *sinfo = skb_shinfo(skb);
1973
1974 /* This is a hint as to how much should be linear. */
1975 vnet_hdr->hdr_len =
1976 __cpu_to_virtio16(vio_le(), skb_headlen(skb));
1977 vnet_hdr->gso_size =
1978 __cpu_to_virtio16(vio_le(), sinfo->gso_size);
1979
1980 if (sinfo->gso_type & SKB_GSO_TCPV4)
1981 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1982 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1983 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1984 else if (sinfo->gso_type & SKB_GSO_UDP)
1985 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
1986 else if (sinfo->gso_type & SKB_GSO_FCOE)
1987 return -EINVAL;
1988 else
1989 BUG();
1990
1991 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1992 vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1993 } else
1994 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
1995
1996 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1997 vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1998 vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
1999 skb_checksum_start_offset(skb));
2000 vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
2001 skb->csum_offset);
2002 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2003 vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
2004 } /* else everything is zero */
2005
2006 return 0;
2007}
2008
2009static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2010 size_t *len)
2011{
2012 struct virtio_net_hdr vnet_hdr;
2013
2014 if (*len < sizeof(vnet_hdr))
2015 return -EINVAL;
2016 *len -= sizeof(vnet_hdr);
2017
2018 if (__packet_rcv_vnet(skb, &vnet_hdr))
2019 return -EINVAL;
2020
2021 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2022}
2023
1da177e4 2024/*
62ab0812
ED
2025 * This function performs lazy skb cloning in the hope that most of the
2026 * packets are discarded by BPF.
2027 *
2028 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2029 * and skb->cb are mangled. It works because (and until) packets
2030 * falling here are owned by the current CPU. Output packets are cloned
2031 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2032 * sequentially, so if we return the skb to its original state on exit,
2033 * we will not harm anyone.
1da177e4
LT
2034 */
2035
40d4e3df
ED
2036static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2037 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2038{
2039 struct sock *sk;
2040 struct sockaddr_ll *sll;
2041 struct packet_sock *po;
40d4e3df 2042 u8 *skb_head = skb->data;
1da177e4 2043 int skb_len = skb->len;
dbcb5855 2044 unsigned int snaplen, res;
1da177e4
LT
2045
2046 if (skb->pkt_type == PACKET_LOOPBACK)
2047 goto drop;
2048
2049 sk = pt->af_packet_priv;
2050 po = pkt_sk(sk);
2051
09ad9bc7 2052 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2053 goto drop;
2054
1da177e4
LT
2055 skb->dev = dev;
2056
3b04ddde 2057 if (dev->header_ops) {
1da177e4 2058 /* The device has an explicit notion of ll header,
62ab0812
ED
2059 * exported to higher levels.
2060 *
2061 * Otherwise, the device hides details of its frame
2062 * structure, so that the corresponding packet header is
2063 * never delivered to the user.
1da177e4
LT
2064 */
2065 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2066 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2067 else if (skb->pkt_type == PACKET_OUTGOING) {
2068 /* Special case: outgoing packets have ll header at head */
bbe735e4 2069 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2070 }
2071 }
2072
2073 snaplen = skb->len;
2074
dbcb5855
DM
2075 res = run_filter(skb, sk, snaplen);
2076 if (!res)
fda9ef5d 2077 goto drop_n_restore;
dbcb5855
DM
2078 if (snaplen > res)
2079 snaplen = res;
1da177e4 2080
0fd7bac6 2081 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2082 goto drop_n_acct;
2083
2084 if (skb_shared(skb)) {
2085 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2086 if (nskb == NULL)
2087 goto drop_n_acct;
2088
2089 if (skb_head != skb->data) {
2090 skb->data = skb_head;
2091 skb->len = skb_len;
2092 }
abc4e4fa 2093 consume_skb(skb);
1da177e4
LT
2094 skb = nskb;
2095 }
2096
b4772ef8 2097 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2098
2099 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2100 sll->sll_hatype = dev->type;
1da177e4 2101 sll->sll_pkttype = skb->pkt_type;
8032b464 2102 if (unlikely(po->origdev))
80feaacb
PWJ
2103 sll->sll_ifindex = orig_dev->ifindex;
2104 else
2105 sll->sll_ifindex = dev->ifindex;
1da177e4 2106
b95cce35 2107 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2108
2472d761
EB
2109 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2110 * Use their space for storing the original skb length.
2111 */
2112 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2113
1da177e4
LT
2114 if (pskb_trim(skb, snaplen))
2115 goto drop_n_acct;
2116
2117 skb_set_owner_r(skb, sk);
2118 skb->dev = NULL;
adf30907 2119 skb_dst_drop(skb);
1da177e4 2120
84531c24
PO
2121 /* drop conntrack reference */
2122 nf_reset(skb);
2123
1da177e4 2124 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2125 po->stats.stats1.tp_packets++;
3bc3b96f 2126 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2127 __skb_queue_tail(&sk->sk_receive_queue, skb);
2128 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2129 sk->sk_data_ready(sk);
1da177e4
LT
2130 return 0;
2131
2132drop_n_acct:
7091fbd8 2133 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2134 po->stats.stats1.tp_drops++;
7091fbd8
WB
2135 atomic_inc(&sk->sk_drops);
2136 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2137
2138drop_n_restore:
2139 if (skb_head != skb->data && skb_shared(skb)) {
2140 skb->data = skb_head;
2141 skb->len = skb_len;
2142 }
2143drop:
ead2ceb0 2144 consume_skb(skb);
1da177e4
LT
2145 return 0;
2146}
2147
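/* Memory-mapped (PACKET_RX_RING) receive path: instead of queueing the
 * skb, copy the frame into the next free slot of the rx ring and fill in
 * the tpacket v1/v2/v3 header so userspace can read it without another
 * system call.
 */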
40d4e3df
ED
2148static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2149 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2150{
2151 struct sock *sk;
2152 struct packet_sock *po;
2153 struct sockaddr_ll *sll;
184f489e 2154 union tpacket_uhdr h;
40d4e3df 2155 u8 *skb_head = skb->data;
1da177e4 2156 int skb_len = skb->len;
dbcb5855 2157 unsigned int snaplen, res;
f6fb8f10 2158 unsigned long status = TP_STATUS_USER;
bbd6ef87 2159 unsigned short macoff, netoff, hdrlen;
1da177e4 2160 struct sk_buff *copy_skb = NULL;
bbd6ef87 2161 struct timespec ts;
b9c32fb2 2162 __u32 ts_status;
1da177e4 2163
51846355
AW
2164 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2165 * We may add members to them up to the current aligned size without forcing
2166 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2167 */
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2169 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2170
1da177e4
LT
2171 if (skb->pkt_type == PACKET_LOOPBACK)
2172 goto drop;
2173
2174 sk = pt->af_packet_priv;
2175 po = pkt_sk(sk);
2176
09ad9bc7 2177 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2178 goto drop;
2179
3b04ddde 2180 if (dev->header_ops) {
1da177e4 2181 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2182 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2183 else if (skb->pkt_type == PACKET_OUTGOING) {
2184 /* Special case: outgoing packets have ll header at head */
bbe735e4 2185 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2186 }
2187 }
2188
2189 snaplen = skb->len;
2190
dbcb5855
DM
2191 res = run_filter(skb, sk, snaplen);
2192 if (!res)
fda9ef5d 2193 goto drop_n_restore;
68c2e5de
AD
2194
2195 if (skb->ip_summed == CHECKSUM_PARTIAL)
2196 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2197 else if (skb->pkt_type != PACKET_OUTGOING &&
2198 (skb->ip_summed == CHECKSUM_COMPLETE ||
2199 skb_csum_unnecessary(skb)))
2200 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2201
dbcb5855
DM
2202 if (snaplen > res)
2203 snaplen = res;
1da177e4
LT
2204
2205 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2206 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2207 po->tp_reserve;
1da177e4 2208 } else {
95c96174 2209 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2210 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2211 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2212 po->tp_reserve;
2213 if (po->has_vnet_hdr)
2214 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2215 macoff = netoff - maclen;
2216 }
f6fb8f10 2217 if (po->tp_version <= TPACKET_V2) {
2218 if (macoff + snaplen > po->rx_ring.frame_size) {
2219 if (po->copy_thresh &&
0fd7bac6 2220 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2221 if (skb_shared(skb)) {
2222 copy_skb = skb_clone(skb, GFP_ATOMIC);
2223 } else {
2224 copy_skb = skb_get(skb);
2225 skb_head = skb->data;
2226 }
2227 if (copy_skb)
2228 skb_set_owner_r(copy_skb, sk);
1da177e4 2229 }
f6fb8f10 2230 snaplen = po->rx_ring.frame_size - macoff;
2231 if ((int)snaplen < 0)
2232 snaplen = 0;
1da177e4 2233 }
dc808110
ED
2234 } else if (unlikely(macoff + snaplen >
2235 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2236 u32 nval;
2237
2238 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2239 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2240 snaplen, nval, macoff);
2241 snaplen = nval;
2242 if (unlikely((int)snaplen < 0)) {
2243 snaplen = 0;
2244 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2245 }
1da177e4 2246 }
1da177e4 2247 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2248 h.raw = packet_current_rx_frame(po, skb,
2249 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2250 if (!h.raw)
58d19b19 2251 goto drop_n_account;
f6fb8f10 2252 if (po->tp_version <= TPACKET_V2) {
2253 packet_increment_rx_head(po, &po->rx_ring);
2254 /*
2255 * LOSING will be reported until you read the stats,
2256 * because it's COR - Clear On Read.
2257 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2258 * at the packet level.
2259 */
ee80fbf3 2260 if (po->stats.stats1.tp_drops)
f6fb8f10 2261 status |= TP_STATUS_LOSING;
2262 }
ee80fbf3 2263 po->stats.stats1.tp_packets++;
1da177e4
LT
2264 if (copy_skb) {
2265 status |= TP_STATUS_COPY;
2266 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2267 }
1da177e4
LT
2268 spin_unlock(&sk->sk_receive_queue.lock);
2269
58d19b19
WB
2270 if (po->has_vnet_hdr) {
2271 if (__packet_rcv_vnet(skb, h.raw + macoff -
2272 sizeof(struct virtio_net_hdr))) {
2273 spin_lock(&sk->sk_receive_queue.lock);
2274 goto drop_n_account;
2275 }
2276 }
2277
bbd6ef87 2278 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2279
2280 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2281 getnstimeofday(&ts);
1da177e4 2282
b9c32fb2
DB
2283 status |= ts_status;
2284
bbd6ef87
PM
2285 switch (po->tp_version) {
2286 case TPACKET_V1:
2287 h.h1->tp_len = skb->len;
2288 h.h1->tp_snaplen = snaplen;
2289 h.h1->tp_mac = macoff;
2290 h.h1->tp_net = netoff;
4b457bdf
DB
2291 h.h1->tp_sec = ts.tv_sec;
2292 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2293 hdrlen = sizeof(*h.h1);
2294 break;
2295 case TPACKET_V2:
2296 h.h2->tp_len = skb->len;
2297 h.h2->tp_snaplen = snaplen;
2298 h.h2->tp_mac = macoff;
2299 h.h2->tp_net = netoff;
bbd6ef87
PM
2300 h.h2->tp_sec = ts.tv_sec;
2301 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2302 if (skb_vlan_tag_present(skb)) {
2303 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2304 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2305 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2306 } else {
2307 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2308 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2309 }
e4d26f4b 2310 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2311 hdrlen = sizeof(*h.h2);
2312 break;
f6fb8f10 2313 case TPACKET_V3:
2314 /* tp_next_offset and vlan are already populated above,
2315 * so DON'T clear those fields here.
2316 */
2317 h.h3->tp_status |= status;
2318 h.h3->tp_len = skb->len;
2319 h.h3->tp_snaplen = snaplen;
2320 h.h3->tp_mac = macoff;
2321 h.h3->tp_net = netoff;
f6fb8f10 2322 h.h3->tp_sec = ts.tv_sec;
2323 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2324 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2325 hdrlen = sizeof(*h.h3);
2326 break;
bbd6ef87
PM
2327 default:
2328 BUG();
2329 }
1da177e4 2330
bbd6ef87 2331 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2332 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2333 sll->sll_family = AF_PACKET;
2334 sll->sll_hatype = dev->type;
2335 sll->sll_protocol = skb->protocol;
2336 sll->sll_pkttype = skb->pkt_type;
8032b464 2337 if (unlikely(po->origdev))
80feaacb
PWJ
2338 sll->sll_ifindex = orig_dev->ifindex;
2339 else
2340 sll->sll_ifindex = dev->ifindex;
1da177e4 2341
e16aa207 2342 smp_mb();
f0d4eb29 2343
f6dafa95 2344#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2345 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2346 u8 *start, *end;
2347
f0d4eb29
DB
2348 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2349 macoff + snaplen);
2350
2351 for (start = h.raw; start < end; start += PAGE_SIZE)
2352 flush_dcache_page(pgv_to_page(start));
1da177e4 2353 }
f0d4eb29 2354 smp_wmb();
f6dafa95 2355#endif
f0d4eb29 2356
da413eec 2357 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2358 __packet_set_status(po, h.raw, status);
da413eec
DC
2359 sk->sk_data_ready(sk);
2360 } else {
f6fb8f10 2361 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2362 }
1da177e4
LT
2363
2364drop_n_restore:
2365 if (skb_head != skb->data && skb_shared(skb)) {
2366 skb->data = skb_head;
2367 skb->len = skb_len;
2368 }
2369drop:
1ce4f28b 2370 kfree_skb(skb);
1da177e4
LT
2371 return 0;
2372
58d19b19 2373drop_n_account:
ee80fbf3 2374 po->stats.stats1.tp_drops++;
1da177e4
LT
2375 spin_unlock(&sk->sk_receive_queue.lock);
2376
676d2369 2377 sk->sk_data_ready(sk);
acb5d75b 2378 kfree_skb(copy_skb);
1da177e4
LT
2379 goto drop_n_restore;
2380}
2381
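/* skb destructor for TX_RING frames: once the device releases the skb,
 * drop the pending count and hand the ring slot back to userspace by
 * setting TP_STATUS_AVAILABLE (plus any timestamp status bits).
 */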
69e3c75f
JB
2382static void tpacket_destruct_skb(struct sk_buff *skb)
2383{
2384 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2385
69e3c75f 2386 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2387 void *ph;
b9c32fb2
DB
2388 __u32 ts;
2389
69e3c75f 2390 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2391 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2392
2393 ts = __packet_set_timestamp(po, ph, skb);
2394 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2395 }
2396
2397 sock_wfree(skb);
2398}
2399
c72219b7
DB
2400static void tpacket_set_protocol(const struct net_device *dev,
2401 struct sk_buff *skb)
2402{
2403 if (dev->type == ARPHRD_ETHER) {
2404 skb_reset_mac_header(skb);
2405 skb->protocol = eth_hdr(skb)->h_proto;
2406 }
2407}
2408
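/* Validate a user-supplied virtio_net_hdr on the transmit path and map
 * its GSO type onto the matching SKB_GSO_* flag; the result is stashed
 * back in gso_type for packet_snd_vnet_gso() to apply later.
 */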
16cc1400
WB
2409static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2410{
2411 unsigned short gso_type = 0;
2412
2413 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2414 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2415 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2416 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2417 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2418 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2419 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2420
2421 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2422 return -EINVAL;
2423
2424 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2425 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2426 case VIRTIO_NET_HDR_GSO_TCPV4:
2427 gso_type = SKB_GSO_TCPV4;
2428 break;
2429 case VIRTIO_NET_HDR_GSO_TCPV6:
2430 gso_type = SKB_GSO_TCPV6;
2431 break;
2432 case VIRTIO_NET_HDR_GSO_UDP:
2433 gso_type = SKB_GSO_UDP;
2434 break;
2435 default:
2436 return -EINVAL;
2437 }
2438
2439 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2440 gso_type |= SKB_GSO_TCP_ECN;
2441
2442 if (vnet_hdr->gso_size == 0)
2443 return -EINVAL;
2444 }
2445
2446 vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
2447 return 0;
2448}
2449
2450static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2451 struct virtio_net_hdr *vnet_hdr)
2452{
2453 int n;
2454
2455 if (*len < sizeof(*vnet_hdr))
2456 return -EINVAL;
2457 *len -= sizeof(*vnet_hdr);
2458
2459 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
2460 if (n != sizeof(*vnet_hdr))
2461 return -EFAULT;
2462
2463 return __packet_snd_vnet_parse(vnet_hdr, *len);
2464}
2465
2466static int packet_snd_vnet_gso(struct sk_buff *skb,
2467 struct virtio_net_hdr *vnet_hdr)
2468{
2469 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2470 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2471 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2472
2473 if (!skb_partial_csum_set(skb, s, o))
2474 return -EINVAL;
2475 }
2476
2477 skb_shinfo(skb)->gso_size =
2478 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2479 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2480
2481 /* Header must be checked, and gso_segs computed. */
2482 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2483 skb_shinfo(skb)->gso_segs = 0;
2484 return 0;
2485}
2486
40d4e3df 2487static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2488 void *frame, struct net_device *dev, void *data, int tp_len,
1d036d25 2489 __be16 proto, unsigned char *addr, int hlen, int copylen)
69e3c75f 2490{
184f489e 2491 union tpacket_uhdr ph;
8d39b4a6 2492 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2493 struct socket *sock = po->sk.sk_socket;
2494 struct page *page;
69e3c75f
JB
2495 int err;
2496
2497 ph.raw = frame;
2498
2499 skb->protocol = proto;
2500 skb->dev = dev;
2501 skb->priority = po->sk.sk_priority;
2d37a186 2502 skb->mark = po->sk.sk_mark;
2e31396f 2503 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2504 skb_shinfo(skb)->destructor_arg = ph.raw;
2505
ae641949 2506 skb_reserve(skb, hlen);
69e3c75f 2507 skb_reset_network_header(skb);
c1aad275 2508
69e3c75f
JB
2509 to_write = tp_len;
2510
2511 if (sock->type == SOCK_DGRAM) {
2512 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2513 NULL, tp_len);
2514 if (unlikely(err < 0))
2515 return -EINVAL;
1d036d25 2516 } else if (copylen) {
9ed988cd
WB
2517 int hdrlen = min_t(int, copylen, tp_len);
2518
69e3c75f 2519 skb_push(skb, dev->hard_header_len);
1d036d25 2520 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2521 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2522 if (unlikely(err))
2523 return err;
9ed988cd
WB
2524 if (!dev_validate_header(dev, skb->data, hdrlen))
2525 return -EINVAL;
c72219b7
DB
2526 if (!skb->protocol)
2527 tpacket_set_protocol(dev, skb);
69e3c75f 2528
9ed988cd
WB
2529 data += hdrlen;
2530 to_write -= hdrlen;
69e3c75f
JB
2531 }
2532
69e3c75f
JB
2533 offset = offset_in_page(data);
2534 len_max = PAGE_SIZE - offset;
2535 len = ((to_write > len_max) ? len_max : to_write);
2536
2537 skb->data_len = to_write;
2538 skb->len += to_write;
2539 skb->truesize += to_write;
2540 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2541
2542 while (likely(to_write)) {
2543 nr_frags = skb_shinfo(skb)->nr_frags;
2544
2545 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2546 pr_err("Packet exceed the number of skb frags(%lu)\n",
2547 MAX_SKB_FRAGS);
69e3c75f
JB
2548 return -EFAULT;
2549 }
2550
0af55bb5
CG
2551 page = pgv_to_page(data);
2552 data += len;
69e3c75f
JB
2553 flush_dcache_page(page);
2554 get_page(page);
0af55bb5 2555 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2556 to_write -= len;
2557 offset = 0;
2558 len_max = PAGE_SIZE;
2559 len = ((to_write > len_max) ? len_max : to_write);
2560 }
2561
8fd6c80d 2562 skb_probe_transport_header(skb, 0);
efdfa2f7 2563
69e3c75f
JB
2564 return tp_len;
2565}
2566
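/* Read the next TX_RING frame header: extract tp_len, check it against
 * the maximum frame size, honour PACKET_TX_HAS_OFF if enabled, and
 * return a pointer to the packet data within the frame via *data.
 */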
8d39b4a6
WB
2567static int tpacket_parse_header(struct packet_sock *po, void *frame,
2568 int size_max, void **data)
2569{
2570 union tpacket_uhdr ph;
2571 int tp_len, off;
2572
2573 ph.raw = frame;
2574
2575 switch (po->tp_version) {
2576 case TPACKET_V2:
2577 tp_len = ph.h2->tp_len;
2578 break;
2579 default:
2580 tp_len = ph.h1->tp_len;
2581 break;
2582 }
2583 if (unlikely(tp_len > size_max)) {
2584 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2585 return -EMSGSIZE;
2586 }
2587
2588 if (unlikely(po->tp_tx_has_off)) {
2589 int off_min, off_max;
2590
2591 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2592 off_max = po->tx_ring.frame_size - tp_len;
2593 if (po->sk.sk_type == SOCK_DGRAM) {
2594 switch (po->tp_version) {
2595 case TPACKET_V2:
2596 off = ph.h2->tp_net;
2597 break;
2598 default:
2599 off = ph.h1->tp_net;
2600 break;
2601 }
2602 } else {
2603 switch (po->tp_version) {
2604 case TPACKET_V2:
2605 off = ph.h2->tp_mac;
2606 break;
2607 default:
2608 off = ph.h1->tp_mac;
2609 break;
2610 }
2611 }
2612 if (unlikely((off < off_min) || (off_max < off)))
2613 return -EINVAL;
2614 } else {
2615 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2616 }
2617
2618 *data = frame + off;
2619 return tp_len;
2620}
2621
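/* Transmit path for PACKET_TX_RING: walk the tx ring, turn every frame
 * marked TP_STATUS_SEND_REQUEST into an skb and hand it to the device;
 * unless MSG_DONTWAIT was set, keep going until all pending frames have
 * been transmitted.
 */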
69e3c75f
JB
2622static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2623{
69e3c75f
JB
2624 struct sk_buff *skb;
2625 struct net_device *dev;
1d036d25 2626 struct virtio_net_hdr *vnet_hdr = NULL;
69e3c75f 2627 __be16 proto;
09effa67 2628 int err, reserve = 0;
40d4e3df 2629 void *ph;
342dfc30 2630 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2631 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2632 int tp_len, size_max;
2633 unsigned char *addr;
8d39b4a6 2634 void *data;
69e3c75f 2635 int len_sum = 0;
9e67030a 2636 int status = TP_STATUS_AVAILABLE;
1d036d25 2637 int hlen, tlen, copylen = 0;
69e3c75f 2638
69e3c75f
JB
2639 mutex_lock(&po->pg_vec_lock);
2640
66e56cd4 2641 if (likely(saddr == NULL)) {
e40526cb 2642 dev = packet_cached_dev_get(po);
69e3c75f
JB
2643 proto = po->num;
2644 addr = NULL;
2645 } else {
2646 err = -EINVAL;
2647 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2648 goto out;
2649 if (msg->msg_namelen < (saddr->sll_halen
2650 + offsetof(struct sockaddr_ll,
2651 sll_addr)))
2652 goto out;
69e3c75f
JB
2653 proto = saddr->sll_protocol;
2654 addr = saddr->sll_addr;
827d9780 2655 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2656 }
2657
69e3c75f
JB
2658 err = -ENXIO;
2659 if (unlikely(dev == NULL))
2660 goto out;
69e3c75f
JB
2661 err = -ENETDOWN;
2662 if (unlikely(!(dev->flags & IFF_UP)))
2663 goto out_put;
2664
5cfb4c8d
DB
2665 if (po->sk.sk_socket->type == SOCK_RAW)
2666 reserve = dev->hard_header_len;
69e3c75f 2667 size_max = po->tx_ring.frame_size
b5dd884e 2668 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2669
1d036d25 2670 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2671 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2672
69e3c75f
JB
2673 do {
2674 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2675 TP_STATUS_SEND_REQUEST);
69e3c75f 2676 if (unlikely(ph == NULL)) {
87a2fd28
DB
2677 if (need_wait && need_resched())
2678 schedule();
69e3c75f
JB
2679 continue;
2680 }
2681
8d39b4a6
WB
2682 skb = NULL;
2683 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2684 if (tp_len < 0)
2685 goto tpacket_error;
2686
69e3c75f 2687 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2688 hlen = LL_RESERVED_SPACE(dev);
2689 tlen = dev->needed_tailroom;
1d036d25
WB
2690 if (po->has_vnet_hdr) {
2691 vnet_hdr = data;
2692 data += sizeof(*vnet_hdr);
2693 tp_len -= sizeof(*vnet_hdr);
2694 if (tp_len < 0 ||
2695 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2696 tp_len = -EINVAL;
2697 goto tpacket_error;
2698 }
2699 copylen = __virtio16_to_cpu(vio_le(),
2700 vnet_hdr->hdr_len);
2701 }
9ed988cd 2702 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2703 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2704 hlen + tlen + sizeof(struct sockaddr_ll) +
2705 (copylen - dev->hard_header_len),
fbf33a28 2706 !need_wait, &err);
69e3c75f 2707
fbf33a28
KM
2708 if (unlikely(skb == NULL)) {
2709 /* we assume the socket was initially writeable ... */
2710 if (likely(len_sum > 0))
2711 err = len_sum;
69e3c75f 2712 goto out_status;
fbf33a28 2713 }
8d39b4a6 2714 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
1d036d25 2715 addr, hlen, copylen);
dbd46ab4 2716 if (likely(tp_len >= 0) &&
5cfb4c8d 2717 tp_len > dev->mtu + reserve &&
1d036d25 2718 !po->has_vnet_hdr &&
3c70c132
DB
2719 !packet_extra_vlan_len_allowed(dev, skb))
2720 tp_len = -EMSGSIZE;
69e3c75f
JB
2721
2722 if (unlikely(tp_len < 0)) {
8d39b4a6 2723tpacket_error:
69e3c75f
JB
2724 if (po->tp_loss) {
2725 __packet_set_status(po, ph,
2726 TP_STATUS_AVAILABLE);
2727 packet_increment_head(&po->tx_ring);
2728 kfree_skb(skb);
2729 continue;
2730 } else {
2731 status = TP_STATUS_WRONG_FORMAT;
2732 err = tp_len;
2733 goto out_status;
2734 }
2735 }
2736
1d036d25
WB
2737 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
2738 tp_len = -EINVAL;
2739 goto tpacket_error;
2740 }
2741
0fd5d57b
DB
2742 packet_pick_tx_queue(dev, skb);
2743
69e3c75f
JB
2744 skb->destructor = tpacket_destruct_skb;
2745 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2746 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2747
2748 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2749 err = po->xmit(skb);
eb70df13
JP
2750 if (unlikely(err > 0)) {
2751 err = net_xmit_errno(err);
2752 if (err && __packet_get_status(po, ph) ==
2753 TP_STATUS_AVAILABLE) {
2754 /* skb was destructed already */
2755 skb = NULL;
2756 goto out_status;
2757 }
2758 /*
2759 * skb was dropped but not destructed yet;
2760 * let's treat it like congestion or err < 0
2761 */
2762 err = 0;
2763 }
69e3c75f
JB
2764 packet_increment_head(&po->tx_ring);
2765 len_sum += tp_len;
b0138408
DB
2766 } while (likely((ph != NULL) ||
2767 /* Note: packet_read_pending() might be slow if we have
2768 * to call it, as it's a per-cpu variable, but in the fast path
2769 * we already short-circuit the loop with the first
2770 * condition, and luckily don't have to go that path
2771 * anyway.
2772 */
2773 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2774
2775 err = len_sum;
2776 goto out_put;
2777
69e3c75f
JB
2778out_status:
2779 __packet_set_status(po, ph, status);
2780 kfree_skb(skb);
2781out_put:
e40526cb 2782 dev_put(dev);
69e3c75f
JB
2783out:
2784 mutex_unlock(&po->pg_vec_lock);
2785 return err;
2786}
69e3c75f 2787
eea49cc9
OJ
2788static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2789 size_t reserve, size_t len,
2790 size_t linear, int noblock,
2791 int *err)
bfd5f4a3
SS
2792{
2793 struct sk_buff *skb;
2794
2795 /* Under a page? Don't bother with paged skb. */
2796 if (prepad + len < PAGE_SIZE || !linear)
2797 linear = len;
2798
2799 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2800 err, 0);
bfd5f4a3
SS
2801 if (!skb)
2802 return NULL;
2803
2804 skb_reserve(skb, reserve);
2805 skb_put(skb, linear);
2806 skb->data_len = len - linear;
2807 skb->len += len - linear;
2808
2809 return skb;
2810}
2811
d346a3fa 2812static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2813{
2814 struct sock *sk = sock->sk;
342dfc30 2815 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2816 struct sk_buff *skb;
2817 struct net_device *dev;
0e11c91e 2818 __be16 proto;
1da177e4 2819 unsigned char *addr;
827d9780 2820 int err, reserve = 0;
c7d39e32 2821 struct sockcm_cookie sockc;
bfd5f4a3
SS
2822 struct virtio_net_hdr vnet_hdr = { 0 };
2823 int offset = 0;
bfd5f4a3 2824 struct packet_sock *po = pkt_sk(sk);
ae641949 2825 int hlen, tlen;
3bdc0eba 2826 int extra_len = 0;
1da177e4
LT
2827
2828 /*
1ce4f28b 2829 * Get and verify the address.
1da177e4 2830 */
1ce4f28b 2831
66e56cd4 2832 if (likely(saddr == NULL)) {
e40526cb 2833 dev = packet_cached_dev_get(po);
1da177e4
LT
2834 proto = po->num;
2835 addr = NULL;
2836 } else {
2837 err = -EINVAL;
2838 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2839 goto out;
0fb375fb
EB
2840 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2841 goto out;
1da177e4
LT
2842 proto = saddr->sll_protocol;
2843 addr = saddr->sll_addr;
827d9780 2844 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2845 }
2846
1da177e4 2847 err = -ENXIO;
e40526cb 2848 if (unlikely(dev == NULL))
1da177e4 2849 goto out_unlock;
d5e76b0a 2850 err = -ENETDOWN;
e40526cb 2851 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2852 goto out_unlock;
2853
c7d39e32
EJ
2854 sockc.mark = sk->sk_mark;
2855 if (msg->msg_controllen) {
2856 err = sock_cmsg_send(sk, msg, &sockc);
2857 if (unlikely(err))
2858 goto out_unlock;
2859 }
2860
e40526cb
DB
2861 if (sock->type == SOCK_RAW)
2862 reserve = dev->hard_header_len;
bfd5f4a3 2863 if (po->has_vnet_hdr) {
16cc1400
WB
2864 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2865 if (err)
bfd5f4a3 2866 goto out_unlock;
bfd5f4a3
SS
2867 }
2868
3bdc0eba
BG
2869 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2870 if (!netif_supports_nofcs(dev)) {
2871 err = -EPROTONOSUPPORT;
2872 goto out_unlock;
2873 }
2874 extra_len = 4; /* We're doing our own CRC */
2875 }
2876
1da177e4 2877 err = -EMSGSIZE;
16cc1400
WB
2878 if (!vnet_hdr.gso_type &&
2879 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2880 goto out_unlock;
2881
bfd5f4a3 2882 err = -ENOBUFS;
ae641949
HX
2883 hlen = LL_RESERVED_SPACE(dev);
2884 tlen = dev->needed_tailroom;
dc9e5153 2885 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
d3869efe 2886 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
bfd5f4a3 2887 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2888 if (skb == NULL)
1da177e4
LT
2889 goto out_unlock;
2890
bfd5f4a3 2891 skb_set_network_header(skb, reserve);
1da177e4 2892
0c4e8581 2893 err = -EINVAL;
9c707762
WB
2894 if (sock->type == SOCK_DGRAM) {
2895 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2896 if (unlikely(offset < 0))
9c707762 2897 goto out_free;
9c707762 2898 }
1da177e4
LT
2899
2900 /* Returns -EFAULT on error */
c0371da6 2901 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2902 if (err)
2903 goto out_free;
bf84a010 2904
9ed988cd
WB
2905 if (sock->type == SOCK_RAW &&
2906 !dev_validate_header(dev, skb->data, len)) {
2907 err = -EINVAL;
2908 goto out_free;
2909 }
2910
bf84a010 2911 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2912
16cc1400 2913 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2914 !packet_extra_vlan_len_allowed(dev, skb)) {
2915 err = -EMSGSIZE;
2916 goto out_free;
57f89bfa
BG
2917 }
2918
09effa67
DM
2919 skb->protocol = proto;
2920 skb->dev = dev;
1da177e4 2921 skb->priority = sk->sk_priority;
c7d39e32 2922 skb->mark = sockc.mark;
0fd5d57b
DB
2923
2924 packet_pick_tx_queue(dev, skb);
1da177e4 2925
bfd5f4a3 2926 if (po->has_vnet_hdr) {
16cc1400
WB
2927 err = packet_snd_vnet_gso(skb, &vnet_hdr);
2928 if (err)
2929 goto out_free;
2930 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2931 }
2932
8fd6c80d
DB
2933 skb_probe_transport_header(skb, reserve);
2934
3bdc0eba
BG
2935 if (unlikely(extra_len == 4))
2936 skb->no_fcs = 1;
2937
d346a3fa 2938 err = po->xmit(skb);
1da177e4
LT
2939 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2940 goto out_unlock;
2941
e40526cb 2942 dev_put(dev);
1da177e4 2943
40d4e3df 2944 return len;
1da177e4
LT
2945
2946out_free:
2947 kfree_skb(skb);
2948out_unlock:
e40526cb 2949 if (dev)
1da177e4
LT
2950 dev_put(dev);
2951out:
2952 return err;
2953}
2954
1b784140 2955static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2956{
69e3c75f
JB
2957 struct sock *sk = sock->sk;
2958 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2959
69e3c75f
JB
2960 if (po->tx_ring.pg_vec)
2961 return tpacket_snd(po, msg);
2962 else
69e3c75f
JB
2963 return packet_snd(sock, msg, len);
2964}
2965
1da177e4
LT
2966/*
2967 * Close a PACKET socket. This is fairly simple. We immediately go
2968 * to 'closed' state and remove our protocol entry in the device list.
2969 */
2970
2971static int packet_release(struct socket *sock)
2972{
2973 struct sock *sk = sock->sk;
2974 struct packet_sock *po;
d12d01d6 2975 struct net *net;
f6fb8f10 2976 union tpacket_req_u req_u;
1da177e4
LT
2977
2978 if (!sk)
2979 return 0;
2980
3b1e0a65 2981 net = sock_net(sk);
1da177e4
LT
2982 po = pkt_sk(sk);
2983
0fa7fa98 2984 mutex_lock(&net->packet.sklist_lock);
808f5114 2985 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2986 mutex_unlock(&net->packet.sklist_lock);
2987
2988 preempt_disable();
920de804 2989 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2990 preempt_enable();
1da177e4 2991
808f5114 2992 spin_lock(&po->bind_lock);
ce06b03e 2993 unregister_prot_hook(sk, false);
66e56cd4
DB
2994 packet_cached_dev_reset(po);
2995
160ff18a
BG
2996 if (po->prot_hook.dev) {
2997 dev_put(po->prot_hook.dev);
2998 po->prot_hook.dev = NULL;
2999 }
808f5114 3000 spin_unlock(&po->bind_lock);
1da177e4 3001
1da177e4 3002 packet_flush_mclist(sk);
1da177e4 3003
9665d5d6
PS
3004 if (po->rx_ring.pg_vec) {
3005 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3006 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3007 }
69e3c75f 3008
9665d5d6
PS
3009 if (po->tx_ring.pg_vec) {
3010 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3011 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3012 }
1da177e4 3013
dc99f600
DM
3014 fanout_release(sk);
3015
808f5114 3016 synchronize_net();
1da177e4
LT
3017 /*
3018 * Now the socket is dead. No more input will appear.
3019 */
1da177e4
LT
3020 sock_orphan(sk);
3021 sock->sk = NULL;
3022
3023 /* Purge queues */
3024
3025 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3026 packet_free_pending(po);
17ab56a2 3027 sk_refcnt_debug_release(sk);
1da177e4
LT
3028
3029 sock_put(sk);
3030 return 0;
3031}
3032
3033/*
3034 * Attach a packet hook.
3035 */
3036
30f7ea1c
FR
3037static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3038 __be16 proto)
1da177e4
LT
3039{
3040 struct packet_sock *po = pkt_sk(sk);
158cd4af 3041 struct net_device *dev_curr;
902fefb8
DB
3042 __be16 proto_curr;
3043 bool need_rehook;
30f7ea1c
FR
3044 struct net_device *dev = NULL;
3045 int ret = 0;
3046 bool unlisted = false;
dc99f600 3047
30f7ea1c 3048 if (po->fanout)
dc99f600 3049 return -EINVAL;
1da177e4
LT
3050
3051 lock_sock(sk);
1da177e4 3052 spin_lock(&po->bind_lock);
30f7ea1c
FR
3053 rcu_read_lock();
3054
3055 if (name) {
3056 dev = dev_get_by_name_rcu(sock_net(sk), name);
3057 if (!dev) {
3058 ret = -ENODEV;
3059 goto out_unlock;
3060 }
3061 } else if (ifindex) {
3062 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3063 if (!dev) {
3064 ret = -ENODEV;
3065 goto out_unlock;
3066 }
3067 }
3068
3069 if (dev)
3070 dev_hold(dev);
66e56cd4 3071
902fefb8
DB
3072 proto_curr = po->prot_hook.type;
3073 dev_curr = po->prot_hook.dev;
3074
3075 need_rehook = proto_curr != proto || dev_curr != dev;
3076
3077 if (need_rehook) {
30f7ea1c
FR
3078 if (po->running) {
3079 rcu_read_unlock();
3080 __unregister_prot_hook(sk, true);
3081 rcu_read_lock();
3082 dev_curr = po->prot_hook.dev;
3083 if (dev)
3084 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3085 dev->ifindex);
3086 }
1da177e4 3087
902fefb8
DB
3088 po->num = proto;
3089 po->prot_hook.type = proto;
902fefb8 3090
30f7ea1c
FR
3091 if (unlikely(unlisted)) {
3092 dev_put(dev);
3093 po->prot_hook.dev = NULL;
3094 po->ifindex = -1;
3095 packet_cached_dev_reset(po);
3096 } else {
3097 po->prot_hook.dev = dev;
3098 po->ifindex = dev ? dev->ifindex : 0;
3099 packet_cached_dev_assign(po, dev);
3100 }
902fefb8 3101 }
158cd4af
LW
3102 if (dev_curr)
3103 dev_put(dev_curr);
66e56cd4 3104
902fefb8 3105 if (proto == 0 || !need_rehook)
1da177e4
LT
3106 goto out_unlock;
3107
30f7ea1c 3108 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3109 register_prot_hook(sk);
be85d4ad
UT
3110 } else {
3111 sk->sk_err = ENETDOWN;
3112 if (!sock_flag(sk, SOCK_DEAD))
3113 sk->sk_error_report(sk);
1da177e4
LT
3114 }
3115
3116out_unlock:
30f7ea1c 3117 rcu_read_unlock();
1da177e4
LT
3118 spin_unlock(&po->bind_lock);
3119 release_sock(sk);
30f7ea1c 3120 return ret;
1da177e4
LT
3121}
3122
3123/*
3124 * Bind a packet socket to a device
3125 */
3126
40d4e3df
ED
3127static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3128 int addr_len)
1da177e4 3129{
40d4e3df 3130 struct sock *sk = sock->sk;
1da177e4 3131 char name[15];
1ce4f28b 3132
1da177e4
LT
3133 /*
3134 * Check legality
3135 */
1ce4f28b 3136
8ae55f04 3137 if (addr_len != sizeof(struct sockaddr))
1da177e4 3138 return -EINVAL;
40d4e3df 3139 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 3140
30f7ea1c 3141 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3142}
1da177e4
LT
3143
3144static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3145{
40d4e3df
ED
3146 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3147 struct sock *sk = sock->sk;
1da177e4
LT
3148
3149 /*
3150 * Check legality
3151 */
1ce4f28b 3152
1da177e4
LT
3153 if (addr_len < sizeof(struct sockaddr_ll))
3154 return -EINVAL;
3155 if (sll->sll_family != AF_PACKET)
3156 return -EINVAL;
3157
30f7ea1c
FR
3158 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3159 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3160}
3161
3162static struct proto packet_proto = {
3163 .name = "PACKET",
3164 .owner = THIS_MODULE,
3165 .obj_size = sizeof(struct packet_sock),
3166};
3167
3168/*
1ce4f28b 3169 * Create a packet socket (SOCK_DGRAM, SOCK_RAW or SOCK_PACKET).
1da177e4
LT
3170 */
3171
3f378b68
EP
3172static int packet_create(struct net *net, struct socket *sock, int protocol,
3173 int kern)
1da177e4
LT
3174{
3175 struct sock *sk;
3176 struct packet_sock *po;
0e11c91e 3177 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3178 int err;
3179
df008c91 3180 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3181 return -EPERM;
be02097c
DM
3182 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3183 sock->type != SOCK_PACKET)
1da177e4
LT
3184 return -ESOCKTNOSUPPORT;
3185
3186 sock->state = SS_UNCONNECTED;
3187
3188 err = -ENOBUFS;
11aa9c28 3189 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3190 if (sk == NULL)
3191 goto out;
3192
3193 sock->ops = &packet_ops;
1da177e4
LT
3194 if (sock->type == SOCK_PACKET)
3195 sock->ops = &packet_ops_spkt;
be02097c 3196
1da177e4
LT
3197 sock_init_data(sock, sk);
3198
3199 po = pkt_sk(sk);
3200 sk->sk_family = PF_PACKET;
0e11c91e 3201 po->num = proto;
d346a3fa 3202 po->xmit = dev_queue_xmit;
66e56cd4 3203
b0138408
DB
3204 err = packet_alloc_pending(po);
3205 if (err)
3206 goto out2;
3207
66e56cd4 3208 packet_cached_dev_reset(po);
1da177e4
LT
3209
3210 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3211 sk_refcnt_debug_inc(sk);
1da177e4
LT
3212
3213 /*
3214 * Attach a protocol block
3215 */
3216
3217 spin_lock_init(&po->bind_lock);
905db440 3218 mutex_init(&po->pg_vec_lock);
0648ab70 3219 po->rollover = NULL;
1da177e4 3220 po->prot_hook.func = packet_rcv;
be02097c 3221
1da177e4
LT
3222 if (sock->type == SOCK_PACKET)
3223 po->prot_hook.func = packet_rcv_spkt;
be02097c 3224
1da177e4
LT
3225 po->prot_hook.af_packet_priv = sk;
3226
0e11c91e
AV
3227 if (proto) {
3228 po->prot_hook.type = proto;
ce06b03e 3229 register_prot_hook(sk);
1da177e4
LT
3230 }
3231
0fa7fa98 3232 mutex_lock(&net->packet.sklist_lock);
808f5114 3233 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3234 mutex_unlock(&net->packet.sklist_lock);
3235
3236 preempt_disable();
3680453c 3237 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3238 preempt_enable();
808f5114 3239
40d4e3df 3240 return 0;
b0138408
DB
3241out2:
3242 sk_free(sk);
1da177e4
LT
3243out:
3244 return err;
3245}
3246
3247/*
3248 * Pull a packet from our receive queue and hand it to the user.
3249 * If necessary we block.
3250 */
3251
1b784140
YX
3252static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3253 int flags)
1da177e4
LT
3254{
3255 struct sock *sk = sock->sk;
3256 struct sk_buff *skb;
3257 int copied, err;
bfd5f4a3 3258 int vnet_hdr_len = 0;
2472d761 3259 unsigned int origlen = 0;
1da177e4
LT
3260
3261 err = -EINVAL;
ed85b565 3262 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3263 goto out;
3264
3265#if 0
3266 /* What error should we return now? EUNATTACH? */
3267 if (pkt_sk(sk)->ifindex < 0)
3268 return -ENODEV;
3269#endif
3270
ed85b565 3271 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3272 err = sock_recv_errqueue(sk, msg, len,
3273 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3274 goto out;
3275 }
3276
1da177e4
LT
3277 /*
3278 * Call the generic datagram receiver. This handles all sorts
3279 * of horrible races and re-entrancy so we can forget about it
3280 * in the protocol layers.
3281 *
3282 * Now it will return ENETDOWN if the device has just gone down,
3283 * but then it will block.
3284 */
3285
40d4e3df 3286 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3287
3288 /*
1ce4f28b 3289 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
3290 * handles the blocking, we don't need to see or worry about
3291 * blocking retries.
3292 */
3293
8ae55f04 3294 if (skb == NULL)
1da177e4
LT
3295 goto out;
3296
2ccdbaa6
WB
3297 if (pkt_sk(sk)->pressure)
3298 packet_rcv_has_room(pkt_sk(sk), NULL);
3299
bfd5f4a3 3300 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3301 err = packet_rcv_vnet(msg, skb, &len);
3302 if (err)
bfd5f4a3 3303 goto out_free;
16cc1400 3304 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3305 }
3306
f3d33426
HFS
3307 /* You lose any data beyond the buffer you gave. If this worries
3308 * a user program, it can ask the device for its MTU
3309 * anyway.
1da177e4 3310 */
1da177e4 3311 copied = skb->len;
40d4e3df
ED
3312 if (copied > len) {
3313 copied = len;
3314 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3315 }
3316
51f3d02b 3317 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3318 if (err)
3319 goto out_free;
3320
2472d761
EB
3321 if (sock->type != SOCK_PACKET) {
3322 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3323
3324 /* Original length was stored in sockaddr_ll fields */
3325 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3326 sll->sll_family = AF_PACKET;
3327 sll->sll_protocol = skb->protocol;
3328 }
3329
3b885787 3330 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3331
f3d33426
HFS
3332 if (msg->msg_name) {
3333 /* If the address length field is there to be filled
3334 * in, we fill it in now.
3335 */
3336 if (sock->type == SOCK_PACKET) {
342dfc30 3337 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3338 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3339 } else {
3340 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3341
f3d33426
HFS
3342 msg->msg_namelen = sll->sll_halen +
3343 offsetof(struct sockaddr_ll, sll_addr);
3344 }
ffbc6111
HX
3345 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3346 msg->msg_namelen);
f3d33426 3347 }
1da177e4 3348
8dc41944 3349 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3350 struct tpacket_auxdata aux;
3351
3352 aux.tp_status = TP_STATUS_USER;
3353 if (skb->ip_summed == CHECKSUM_PARTIAL)
3354 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3355 else if (skb->pkt_type != PACKET_OUTGOING &&
3356 (skb->ip_summed == CHECKSUM_COMPLETE ||
3357 skb_csum_unnecessary(skb)))
3358 aux.tp_status |= TP_STATUS_CSUM_VALID;
3359
2472d761 3360 aux.tp_len = origlen;
ffbc6111
HX
3361 aux.tp_snaplen = skb->len;
3362 aux.tp_mac = 0;
bbe735e4 3363 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3364 if (skb_vlan_tag_present(skb)) {
3365 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3366 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3367 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3368 } else {
3369 aux.tp_vlan_tci = 0;
a0cdfcf3 3370 aux.tp_vlan_tpid = 0;
a3bcc23e 3371 }
ffbc6111 3372 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3373 }
3374
1da177e4
LT
3375 /*
3376 * Free or return the buffer as appropriate. Again this
3377 * hides all the races and re-entrancy issues from us.
3378 */
bfd5f4a3 3379 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3380
3381out_free:
3382 skb_free_datagram(sk, skb);
3383out:
3384 return err;
3385}
3386
1da177e4
LT
3387static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3388 int *uaddr_len, int peer)
3389{
3390 struct net_device *dev;
3391 struct sock *sk = sock->sk;
3392
3393 if (peer)
3394 return -EOPNOTSUPP;
3395
3396 uaddr->sa_family = AF_PACKET;
2dc85bf3 3397 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3398 rcu_read_lock();
3399 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3400 if (dev)
2dc85bf3 3401 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3402 rcu_read_unlock();
1da177e4
LT
3403 *uaddr_len = sizeof(*uaddr);
3404
3405 return 0;
3406}
1da177e4
LT
3407
3408static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3409 int *uaddr_len, int peer)
3410{
3411 struct net_device *dev;
3412 struct sock *sk = sock->sk;
3413 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3414 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3415
3416 if (peer)
3417 return -EOPNOTSUPP;
3418
3419 sll->sll_family = AF_PACKET;
3420 sll->sll_ifindex = po->ifindex;
3421 sll->sll_protocol = po->num;
67286640 3422 sll->sll_pkttype = 0;
654d1f8a
ED
3423 rcu_read_lock();
3424 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3425 if (dev) {
3426 sll->sll_hatype = dev->type;
3427 sll->sll_halen = dev->addr_len;
3428 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3429 } else {
3430 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3431 sll->sll_halen = 0;
3432 }
654d1f8a 3433 rcu_read_unlock();
0fb375fb 3434 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3435
3436 return 0;
3437}
3438
2aeb0b88
WC
3439static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3440 int what)
1da177e4
LT
3441{
3442 switch (i->type) {
3443 case PACKET_MR_MULTICAST:
1162563f
JP
3444 if (i->alen != dev->addr_len)
3445 return -EINVAL;
1da177e4 3446 if (what > 0)
22bedad3 3447 return dev_mc_add(dev, i->addr);
1da177e4 3448 else
22bedad3 3449 return dev_mc_del(dev, i->addr);
1da177e4
LT
3450 break;
3451 case PACKET_MR_PROMISC:
2aeb0b88 3452 return dev_set_promiscuity(dev, what);
1da177e4 3453 case PACKET_MR_ALLMULTI:
2aeb0b88 3454 return dev_set_allmulti(dev, what);
d95ed927 3455 case PACKET_MR_UNICAST:
1162563f
JP
3456 if (i->alen != dev->addr_len)
3457 return -EINVAL;
d95ed927 3458 if (what > 0)
a748ee24 3459 return dev_uc_add(dev, i->addr);
d95ed927 3460 else
a748ee24 3461 return dev_uc_del(dev, i->addr);
d95ed927 3462 break;
40d4e3df
ED
3463 default:
3464 break;
1da177e4 3465 }
2aeb0b88 3466 return 0;
1da177e4
LT
3467}
3468
82f17091
FR
3469static void packet_dev_mclist_delete(struct net_device *dev,
3470 struct packet_mclist **mlp)
1da177e4 3471{
82f17091
FR
3472 struct packet_mclist *ml;
3473
3474 while ((ml = *mlp) != NULL) {
3475 if (ml->ifindex == dev->ifindex) {
3476 packet_dev_mc(dev, ml, -1);
3477 *mlp = ml->next;
3478 kfree(ml);
3479 } else
3480 mlp = &ml->next;
1da177e4
LT
3481 }
3482}
3483
0fb375fb 3484static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3485{
3486 struct packet_sock *po = pkt_sk(sk);
3487 struct packet_mclist *ml, *i;
3488 struct net_device *dev;
3489 int err;
3490
3491 rtnl_lock();
3492
3493 err = -ENODEV;
3b1e0a65 3494 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3495 if (!dev)
3496 goto done;
3497
3498 err = -EINVAL;
1162563f 3499 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3500 goto done;
3501
3502 err = -ENOBUFS;
8b3a7005 3503 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3504 if (i == NULL)
3505 goto done;
3506
3507 err = 0;
3508 for (ml = po->mclist; ml; ml = ml->next) {
3509 if (ml->ifindex == mreq->mr_ifindex &&
3510 ml->type == mreq->mr_type &&
3511 ml->alen == mreq->mr_alen &&
3512 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3513 ml->count++;
3514 /* Free the new element ... */
3515 kfree(i);
3516 goto done;
3517 }
3518 }
3519
3520 i->type = mreq->mr_type;
3521 i->ifindex = mreq->mr_ifindex;
3522 i->alen = mreq->mr_alen;
3523 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3524 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3525 i->count = 1;
3526 i->next = po->mclist;
3527 po->mclist = i;
2aeb0b88
WC
3528 err = packet_dev_mc(dev, i, 1);
3529 if (err) {
3530 po->mclist = i->next;
3531 kfree(i);
3532 }
1da177e4
LT
3533
3534done:
3535 rtnl_unlock();
3536 return err;
3537}
3538
0fb375fb 3539static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3540{
3541 struct packet_mclist *ml, **mlp;
3542
3543 rtnl_lock();
3544
3545 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3546 if (ml->ifindex == mreq->mr_ifindex &&
3547 ml->type == mreq->mr_type &&
3548 ml->alen == mreq->mr_alen &&
3549 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3550 if (--ml->count == 0) {
3551 struct net_device *dev;
3552 *mlp = ml->next;
ad959e76
ED
3553 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3554 if (dev)
1da177e4 3555 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3556 kfree(ml);
3557 }
82f17091 3558 break;
1da177e4
LT
3559 }
3560 }
3561 rtnl_unlock();
82f17091 3562 return 0;
1da177e4
LT
3563}
3564
3565static void packet_flush_mclist(struct sock *sk)
3566{
3567 struct packet_sock *po = pkt_sk(sk);
3568 struct packet_mclist *ml;
3569
3570 if (!po->mclist)
3571 return;
3572
3573 rtnl_lock();
3574 while ((ml = po->mclist) != NULL) {
3575 struct net_device *dev;
3576
3577 po->mclist = ml->next;
ad959e76
ED
3578 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3579 if (dev != NULL)
1da177e4 3580 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3581 kfree(ml);
3582 }
3583 rtnl_unlock();
3584}
1da177e4
LT
3585
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_FANOUT_DATA:
	{
		if (!po->fanout)
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
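
/*
 * Illustrative sketch (an assumption, not part of this file): a userspace
 * caller drives the options handled above with plain setsockopt(2) on an
 * AF_PACKET socket, e.g. selecting TPACKET_V3 before requesting a ring:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 * Note that PACKET_VERSION (like PACKET_RESERVE and PACKET_LOSS) returns
 * -EBUSY once a ring exists, so it must be set before PACKET_RX_RING or
 * PACKET_TX_RING.
 */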

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
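
/*
 * Illustrative sketch (an assumption, not part of this file): reading the
 * counters exported by PACKET_STATISTICS above. For TPACKET_V1/V2 the
 * result is a struct tpacket_stats; the handler folds tp_drops into
 * tp_packets and clears both counters on every read:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	// st.tp_packets already includes st.tp_drops; counters now reset
 */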


static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		po->pressure = 0;
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
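
/*
 * Illustrative sketch (an assumption, not part of this file): with an RX
 * ring mapped, userspace typically blocks in poll(2); the function above
 * reports POLLIN once the previously filled frame is no longer owned by
 * the kernel (TP_STATUS_KERNEL), i.e. it holds data for userspace:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *	poll(&pfd, 1, -1);	// returns when a ring frame is ready
 */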


/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		net_warn_ratelimited("Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		if (po->tp_version >= TPACKET_V3 &&
		    (int)(req->tp_block_size -
			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
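
/*
 * Illustrative sketch (an assumption, not part of this file, sizes assume
 * 4 KiB pages): a ring request that satisfies the sanity checks above -
 * page-aligned blocks, frame size a multiple of TPACKET_ALIGNMENT and at
 * least tp_hdrlen + tp_reserve, and tp_frame_nr equal to
 * frames_per_block * tp_block_nr:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,		// PAGE_SIZE aligned
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,		// 2 frames per block
 *		.tp_frame_nr	= 128,		// 64 blocks * 2 frames
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */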

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
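
/*
 * Illustrative sketch (an assumption, not part of this file; req_rx/req_tx
 * are hypothetical copies of the ring requests passed to setsockopt): the
 * mapping created above must cover the RX and TX rings in one contiguous
 * area, so the requested length has to equal the combined size of both
 * rings and the file offset must be zero:
 *
 *	size_t len = req_rx.tp_block_size * req_rx.tp_block_nr +
 *		     req_tx.tp_block_size * req_tx.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */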

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);