net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel); others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

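/* Illustrative user-space sketch (not part of the original file): the two
 * receive models above map onto the socket type passed to socket(2).  A
 * minimal example, assuming the usual <sys/socket.h>, <linux/if_packet.h>
 * and <net/ethernet.h> definitions:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// SOCK_RAW:   frames are delivered/expected with the ll header
 *	// SOCK_DGRAM: the ll header is stripped/built by the kernel
 *
 * Error handling omitted; this only clarifies the SOCK_RAW vs SOCK_DGRAM
 * distinction discussed in the comment above.
 */
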
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

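/* Illustrative note (not from this file): po->xmit normally points to
 * dev_queue_xmit(); user space can request the packet_direct_xmit() path
 * above with the PACKET_QDISC_BYPASS socket option, e.g.
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * which skips the qdisc layer and hands frames straight to the driver.
 */
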
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

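/* Illustrative note (not part of the original file): tp_status is the
 * kernel/user hand-off word for TPACKET_V1/V2 rings.  A user-space reader
 * sketch, assuming fp points at one frame slot of the mmap()ed ring:
 *
 *	struct tpacket2_hdr *hdr = fp;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to fill it
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *
 * The smp_wmb()/smp_rmb() pairing in the helpers above orders the frame
 * contents against this status word.
 */
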
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

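/* Worked example (illustrative, not from the original source): with a
 * 1 MiB block (tp_block_size = 1 << 20) on a 1 Gb/s link,
 * prb_calc_retire_blk_tmo() above computes
 *
 *	div   = 1000 / 1000            = 1
 *	mbits = (1 MiB * 8) / (1 MiB)  = 8
 *	tmo   = 8 * 1 msec             = 8   ->  returned as 9 ms
 *
 * i.e. roughly the time needed to fill the block at line rate; a faster
 * link (larger div) proportionally shrinks the retire timeout.
 */
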
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close, so we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
			struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. User-space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}
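
/* Editorial note (illustrative, not part of the original file): the helper
 * above is the classic RCU pointer-swap pattern -- publish the new program
 * with rcu_assign_pointer() under f->lock, then wait for in-flight readers
 * (fanout_demux_bpf() runs under rcu_read_lock()) with synchronize_net()
 * before destroying the old one.  In schematic form:
 *
 *	rcu_assign_pointer(f->bpf_prog, new);	// readers now see 'new'
 *	synchronize_net();			// wait out existing readers
 *	bpf_prog_destroy(old);			// now safe to free 'old'
 */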

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get(fd);
	if (IS_ERR(new))
		return PTR_ERR(new);
	if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(new);
		return -EINVAL;
	}

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
		if (!po->rollover)
			return -ENOMEM;
		atomic_long_set(&po->rollover->num, 0);
		atomic_long_set(&po->rollover->num_huge, 0);
		atomic_long_set(&po->rollover->num_failed, 0);
	}

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	if (err) {
		kfree(po->rollover);
		po->rollover = NULL;
	}
	return err;
}
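
/* Illustrative user-space sketch (not part of the original file): joining a
 * fanout group is a single setsockopt() on an already-bound packet socket.
 * The group id goes in the low 16 bits of the argument and the type/flags in
 * the high 16 bits; the kernel splits them apart before calling fanout_add()
 * above:
 *
 *	int id  = 42;	// hypothetical group id
 *	int arg = id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Every socket that sets the same id within one network namespace joins the
 * same struct packet_fanout, and packet_rcv_fanout() spreads traffic across
 * the members.
 */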

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		fanout_release_data(f);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);

	if (po->rollover)
		kfree_rcu(po->rollover, rcu);
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 * When we registered the protocol we saved the socket in the data
	 * field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 * The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 * Charge the memory to the socket. This is done specifically
	 * to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct sockcm_cookie sockc;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (!dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_unlock;
	}
	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	sockc.tsflags = 0;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);

1da177e4 1945
3bdc0eba
BG
1946 if (unlikely(extra_len == 4))
1947 skb->no_fcs = 1;
1948
40893fd0 1949 skb_probe_transport_header(skb, 0);
c1aad275 1950
1da177e4 1951 dev_queue_xmit(skb);
654d1f8a 1952 rcu_read_unlock();
40d4e3df 1953 return len;
1da177e4 1954
1da177e4 1955out_unlock:
654d1f8a 1956 rcu_read_unlock();
1a35ca80
ED
1957out_free:
1958 kfree_skb(skb);
1da177e4
LT
1959 return err;
1960}
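/*
 * Illustrative userspace sketch (not part of this file): the legacy
 * SOCK_PACKET transmit path above is driven by a plain sendto() whose
 * address names the device. "frame"/"frame_len" are assumed to be a
 * complete link-layer frame built by the caller; "eth0" is an example
 * device name.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <string.h>
#include <arpa/inet.h>

static void spkt_send(const void *frame, size_t frame_len)
{
	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
	struct sockaddr_pkt spkt;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
	spkt.spkt_protocol = htons(ETH_P_IP);

	/* packet_sendmsg_spkt() looks the device up by name and only picks
	 * up spkt_protocol when msg_namelen == sizeof(struct sockaddr_pkt) */
	sendto(fd, frame, frame_len, 0,
	       (struct sockaddr *)&spkt, sizeof(spkt));
}
#endif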
1da177e4 1961
ff936a04
AS
1962static unsigned int run_filter(struct sk_buff *skb,
1963 const struct sock *sk,
1964 unsigned int res)
1da177e4
LT
1965{
1966 struct sk_filter *filter;
fda9ef5d 1967
80f8f102
ED
1968 rcu_read_lock();
1969 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1970 if (filter != NULL)
ff936a04 1971 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1972 rcu_read_unlock();
1da177e4 1973
dbcb5855 1974 return res;
1da177e4
LT
1975}
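/*
 * Illustrative userspace sketch (not part of this file): run_filter() above
 * executes whatever classic BPF program the socket owner attached with
 * SO_ATTACH_FILTER, and the returned value becomes the snap length. This
 * example keeps only ARP frames and drops (truncates to zero) everything else.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

static void attach_arp_filter(int fd)
{
	struct sock_filter code[] = {
		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),	 /* EtherType */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),		 /* keep all */
		BPF_STMT(BPF_RET | BPF_K, 0),			 /* drop */
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
#endif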
1976
16cc1400
WB
1977static int __packet_rcv_vnet(const struct sk_buff *skb,
1978 struct virtio_net_hdr *vnet_hdr)
1979{
1980 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1981
1276f24e
MR
1982 if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
1983 BUG();
16cc1400
WB
1984
1985 return 0;
1986}
1987
1988static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1989 size_t *len)
1990{
1991 struct virtio_net_hdr vnet_hdr;
1992
1993 if (*len < sizeof(vnet_hdr))
1994 return -EINVAL;
1995 *len -= sizeof(vnet_hdr);
1996
1997 if (__packet_rcv_vnet(skb, &vnet_hdr))
1998 return -EINVAL;
1999
2000 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2001}
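/*
 * Illustrative userspace sketch (not part of this file): once
 * PACKET_VNET_HDR is enabled on a SOCK_RAW packet socket, packet_rcv_vnet()
 * above prepends a struct virtio_net_hdr to every frame returned by recv(),
 * so the reader peels it off before parsing the packet itself.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/virtio_net.h>
#include <unistd.h>

static void read_with_vnet_hdr(int fd, char *buf, size_t buflen)
{
	int on = 1;
	struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
	ssize_t n;

	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));

	/* buf now starts with the vnet header, the frame follows it */
	n = recv(fd, buf, buflen, 0);
	if (n >= (ssize_t)sizeof(*vh) &&
	    (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
		/* checksum still has to be completed by the consumer */
	}
}
#endif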
2002
1da177e4 2003/*
62ab0812
ED
 2004 * This function does lazy skb cloning in the hope that most packets
 2005 * are discarded by BPF.
2006 *
2007 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2008 * and skb->cb are mangled. It works because (and until) packets
2009 * falling here are owned by current CPU. Output packets are cloned
2010 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2011 * sequentially, so that if we return skb to original state on exit,
2012 * we will not harm anyone.
1da177e4
LT
2013 */
2014
40d4e3df
ED
2015static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2016 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2017{
2018 struct sock *sk;
2019 struct sockaddr_ll *sll;
2020 struct packet_sock *po;
40d4e3df 2021 u8 *skb_head = skb->data;
1da177e4 2022 int skb_len = skb->len;
dbcb5855 2023 unsigned int snaplen, res;
da37845f 2024 bool is_drop_n_account = false;
1da177e4
LT
2025
2026 if (skb->pkt_type == PACKET_LOOPBACK)
2027 goto drop;
2028
2029 sk = pt->af_packet_priv;
2030 po = pkt_sk(sk);
2031
09ad9bc7 2032 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2033 goto drop;
2034
1da177e4
LT
2035 skb->dev = dev;
2036
3b04ddde 2037 if (dev->header_ops) {
1da177e4 2038 /* The device has an explicit notion of ll header,
62ab0812
ED
2039 * exported to higher levels.
2040 *
2041 * Otherwise, the device hides details of its frame
2042 * structure, so that corresponding packet head is
2043 * never delivered to user.
1da177e4
LT
2044 */
2045 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2046 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2047 else if (skb->pkt_type == PACKET_OUTGOING) {
2048 /* Special case: outgoing packets have ll header at head */
bbe735e4 2049 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2050 }
2051 }
2052
2053 snaplen = skb->len;
2054
dbcb5855
DM
2055 res = run_filter(skb, sk, snaplen);
2056 if (!res)
fda9ef5d 2057 goto drop_n_restore;
dbcb5855
DM
2058 if (snaplen > res)
2059 snaplen = res;
1da177e4 2060
0fd7bac6 2061 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2062 goto drop_n_acct;
2063
2064 if (skb_shared(skb)) {
2065 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2066 if (nskb == NULL)
2067 goto drop_n_acct;
2068
2069 if (skb_head != skb->data) {
2070 skb->data = skb_head;
2071 skb->len = skb_len;
2072 }
abc4e4fa 2073 consume_skb(skb);
1da177e4
LT
2074 skb = nskb;
2075 }
2076
b4772ef8 2077 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2078
2079 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2080 sll->sll_hatype = dev->type;
1da177e4 2081 sll->sll_pkttype = skb->pkt_type;
8032b464 2082 if (unlikely(po->origdev))
80feaacb
PWJ
2083 sll->sll_ifindex = orig_dev->ifindex;
2084 else
2085 sll->sll_ifindex = dev->ifindex;
1da177e4 2086
b95cce35 2087 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2088
2472d761
EB
2089 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2090 * Use their space for storing the original skb length.
2091 */
2092 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2093
1da177e4
LT
2094 if (pskb_trim(skb, snaplen))
2095 goto drop_n_acct;
2096
2097 skb_set_owner_r(skb, sk);
2098 skb->dev = NULL;
adf30907 2099 skb_dst_drop(skb);
1da177e4 2100
84531c24
PO
2101 /* drop conntrack reference */
2102 nf_reset(skb);
2103
1da177e4 2104 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2105 po->stats.stats1.tp_packets++;
3bc3b96f 2106 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2107 __skb_queue_tail(&sk->sk_receive_queue, skb);
2108 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2109 sk->sk_data_ready(sk);
1da177e4
LT
2110 return 0;
2111
2112drop_n_acct:
da37845f 2113 is_drop_n_account = true;
7091fbd8 2114 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2115 po->stats.stats1.tp_drops++;
7091fbd8
WB
2116 atomic_inc(&sk->sk_drops);
2117 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2118
2119drop_n_restore:
2120 if (skb_head != skb->data && skb_shared(skb)) {
2121 skb->data = skb_head;
2122 skb->len = skb_len;
2123 }
2124drop:
da37845f
WJ
2125 if (!is_drop_n_account)
2126 consume_skb(skb);
2127 else
2128 kfree_skb(skb);
1da177e4
LT
2129 return 0;
2130}
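/*
 * Illustrative userspace sketch (not part of this file): the plain
 * (non-mmap) receive path above queues frames on sk_receive_queue, so a
 * simple recvfrom() loop is enough to read them, with the link-layer
 * metadata filled into struct sockaddr_ll.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <stdio.h>

static void dump_frames(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	char buf[2048];

	for (;;) {
		struct sockaddr_ll from;
		socklen_t fromlen = sizeof(from);
		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
				     (struct sockaddr *)&from, &fromlen);

		if (n < 0)
			break;
		printf("ifindex %d proto 0x%04x pkttype %d len %zd\n",
		       from.sll_ifindex, ntohs(from.sll_protocol),
		       from.sll_pkttype, n);
	}
}
#endif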
2131
40d4e3df
ED
2132static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2133 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2134{
2135 struct sock *sk;
2136 struct packet_sock *po;
2137 struct sockaddr_ll *sll;
184f489e 2138 union tpacket_uhdr h;
40d4e3df 2139 u8 *skb_head = skb->data;
1da177e4 2140 int skb_len = skb->len;
dbcb5855 2141 unsigned int snaplen, res;
f6fb8f10 2142 unsigned long status = TP_STATUS_USER;
bbd6ef87 2143 unsigned short macoff, netoff, hdrlen;
1da177e4 2144 struct sk_buff *copy_skb = NULL;
bbd6ef87 2145 struct timespec ts;
b9c32fb2 2146 __u32 ts_status;
da37845f 2147 bool is_drop_n_account = false;
1da177e4 2148
51846355
AW
2149 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2150 * We may add members to them until current aligned size without forcing
2151 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2152 */
2153 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2154 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2155
1da177e4
LT
2156 if (skb->pkt_type == PACKET_LOOPBACK)
2157 goto drop;
2158
2159 sk = pt->af_packet_priv;
2160 po = pkt_sk(sk);
2161
09ad9bc7 2162 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2163 goto drop;
2164
3b04ddde 2165 if (dev->header_ops) {
1da177e4 2166 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2167 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2168 else if (skb->pkt_type == PACKET_OUTGOING) {
2169 /* Special case: outgoing packets have ll header at head */
bbe735e4 2170 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2171 }
2172 }
2173
2174 snaplen = skb->len;
2175
dbcb5855
DM
2176 res = run_filter(skb, sk, snaplen);
2177 if (!res)
fda9ef5d 2178 goto drop_n_restore;
68c2e5de
AD
2179
2180 if (skb->ip_summed == CHECKSUM_PARTIAL)
2181 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2182 else if (skb->pkt_type != PACKET_OUTGOING &&
2183 (skb->ip_summed == CHECKSUM_COMPLETE ||
2184 skb_csum_unnecessary(skb)))
2185 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2186
dbcb5855
DM
2187 if (snaplen > res)
2188 snaplen = res;
1da177e4
LT
2189
2190 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2191 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2192 po->tp_reserve;
1da177e4 2193 } else {
95c96174 2194 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2195 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2196 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2197 po->tp_reserve;
2198 if (po->has_vnet_hdr)
2199 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2200 macoff = netoff - maclen;
2201 }
f6fb8f10 2202 if (po->tp_version <= TPACKET_V2) {
2203 if (macoff + snaplen > po->rx_ring.frame_size) {
2204 if (po->copy_thresh &&
0fd7bac6 2205 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2206 if (skb_shared(skb)) {
2207 copy_skb = skb_clone(skb, GFP_ATOMIC);
2208 } else {
2209 copy_skb = skb_get(skb);
2210 skb_head = skb->data;
2211 }
2212 if (copy_skb)
2213 skb_set_owner_r(copy_skb, sk);
1da177e4 2214 }
f6fb8f10 2215 snaplen = po->rx_ring.frame_size - macoff;
2216 if ((int)snaplen < 0)
2217 snaplen = 0;
1da177e4 2218 }
dc808110
ED
2219 } else if (unlikely(macoff + snaplen >
2220 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2221 u32 nval;
2222
2223 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2224 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2225 snaplen, nval, macoff);
2226 snaplen = nval;
2227 if (unlikely((int)snaplen < 0)) {
2228 snaplen = 0;
2229 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2230 }
1da177e4 2231 }
1da177e4 2232 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2233 h.raw = packet_current_rx_frame(po, skb,
2234 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2235 if (!h.raw)
58d19b19 2236 goto drop_n_account;
f6fb8f10 2237 if (po->tp_version <= TPACKET_V2) {
2238 packet_increment_rx_head(po, &po->rx_ring);
2239 /*
2240 * LOSING will be reported till you read the stats,
2241 * because it's COR - Clear On Read.
2242 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2243 * at packet level.
2244 */
ee80fbf3 2245 if (po->stats.stats1.tp_drops)
f6fb8f10 2246 status |= TP_STATUS_LOSING;
2247 }
ee80fbf3 2248 po->stats.stats1.tp_packets++;
1da177e4
LT
2249 if (copy_skb) {
2250 status |= TP_STATUS_COPY;
2251 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2252 }
1da177e4
LT
2253 spin_unlock(&sk->sk_receive_queue.lock);
2254
58d19b19
WB
2255 if (po->has_vnet_hdr) {
2256 if (__packet_rcv_vnet(skb, h.raw + macoff -
2257 sizeof(struct virtio_net_hdr))) {
2258 spin_lock(&sk->sk_receive_queue.lock);
2259 goto drop_n_account;
2260 }
2261 }
2262
bbd6ef87 2263 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2264
2265 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2266 getnstimeofday(&ts);
1da177e4 2267
b9c32fb2
DB
2268 status |= ts_status;
2269
bbd6ef87
PM
2270 switch (po->tp_version) {
2271 case TPACKET_V1:
2272 h.h1->tp_len = skb->len;
2273 h.h1->tp_snaplen = snaplen;
2274 h.h1->tp_mac = macoff;
2275 h.h1->tp_net = netoff;
4b457bdf
DB
2276 h.h1->tp_sec = ts.tv_sec;
2277 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2278 hdrlen = sizeof(*h.h1);
2279 break;
2280 case TPACKET_V2:
2281 h.h2->tp_len = skb->len;
2282 h.h2->tp_snaplen = snaplen;
2283 h.h2->tp_mac = macoff;
2284 h.h2->tp_net = netoff;
bbd6ef87
PM
2285 h.h2->tp_sec = ts.tv_sec;
2286 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2287 if (skb_vlan_tag_present(skb)) {
2288 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2289 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2290 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2291 } else {
2292 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2293 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2294 }
e4d26f4b 2295 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2296 hdrlen = sizeof(*h.h2);
2297 break;
f6fb8f10 2298 case TPACKET_V3:
 2299 /* tp_next_offset and vlan are already populated above,
 2300 * so don't clear those fields here.
2301 */
2302 h.h3->tp_status |= status;
2303 h.h3->tp_len = skb->len;
2304 h.h3->tp_snaplen = snaplen;
2305 h.h3->tp_mac = macoff;
2306 h.h3->tp_net = netoff;
f6fb8f10 2307 h.h3->tp_sec = ts.tv_sec;
2308 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2309 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2310 hdrlen = sizeof(*h.h3);
2311 break;
bbd6ef87
PM
2312 default:
2313 BUG();
2314 }
1da177e4 2315
bbd6ef87 2316 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2317 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2318 sll->sll_family = AF_PACKET;
2319 sll->sll_hatype = dev->type;
2320 sll->sll_protocol = skb->protocol;
2321 sll->sll_pkttype = skb->pkt_type;
8032b464 2322 if (unlikely(po->origdev))
80feaacb
PWJ
2323 sll->sll_ifindex = orig_dev->ifindex;
2324 else
2325 sll->sll_ifindex = dev->ifindex;
1da177e4 2326
e16aa207 2327 smp_mb();
f0d4eb29 2328
f6dafa95 2329#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2330 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2331 u8 *start, *end;
2332
f0d4eb29
DB
2333 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2334 macoff + snaplen);
2335
2336 for (start = h.raw; start < end; start += PAGE_SIZE)
2337 flush_dcache_page(pgv_to_page(start));
1da177e4 2338 }
f0d4eb29 2339 smp_wmb();
f6dafa95 2340#endif
f0d4eb29 2341
da413eec 2342 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2343 __packet_set_status(po, h.raw, status);
da413eec
DC
2344 sk->sk_data_ready(sk);
2345 } else {
f6fb8f10 2346 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2347 }
1da177e4
LT
2348
2349drop_n_restore:
2350 if (skb_head != skb->data && skb_shared(skb)) {
2351 skb->data = skb_head;
2352 skb->len = skb_len;
2353 }
2354drop:
da37845f
WJ
2355 if (!is_drop_n_account)
2356 consume_skb(skb);
2357 else
2358 kfree_skb(skb);
1da177e4
LT
2359 return 0;
2360
58d19b19 2361drop_n_account:
da37845f 2362 is_drop_n_account = true;
ee80fbf3 2363 po->stats.stats1.tp_drops++;
1da177e4
LT
2364 spin_unlock(&sk->sk_receive_queue.lock);
2365
676d2369 2366 sk->sk_data_ready(sk);
acb5d75b 2367 kfree_skb(copy_skb);
1da177e4
LT
2368 goto drop_n_restore;
2369}
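/*
 * Illustrative userspace sketch (not part of this file): tpacket_rcv()
 * above writes frames directly into a memory-mapped ring, so the consumer
 * sets PACKET_VERSION and PACKET_RX_RING, mmap()s the ring and polls the
 * tp_status word of each slot. The ring geometry below is an arbitrary
 * example choice (block_size must be a multiple of frame_size).
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <sys/mman.h>
#include <poll.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static void rx_ring_loop(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size	= 4096,
		.tp_frame_size	= 2048,
		.tp_block_nr	= 64,
		.tp_frame_nr	= 128,	/* block_size / frame_size * block_nr */
	};
	unsigned int frame = 0;
	void *ring;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket2_hdr *hdr = (void *)((char *)ring +
				(size_t)frame * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* frame data starts at hdr + tp_mac and is tp_snaplen bytes */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand slot back */
		frame = (frame + 1) % req.tp_frame_nr;
	}
}
#endif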
2370
69e3c75f
JB
2371static void tpacket_destruct_skb(struct sk_buff *skb)
2372{
2373 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2374
69e3c75f 2375 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2376 void *ph;
b9c32fb2
DB
2377 __u32 ts;
2378
69e3c75f 2379 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2380 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2381
2382 ts = __packet_set_timestamp(po, ph, skb);
2383 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2384 }
2385
2386 sock_wfree(skb);
2387}
2388
c72219b7
DB
2389static void tpacket_set_protocol(const struct net_device *dev,
2390 struct sk_buff *skb)
2391{
2392 if (dev->type == ARPHRD_ETHER) {
2393 skb_reset_mac_header(skb);
2394 skb->protocol = eth_hdr(skb)->h_proto;
2395 }
2396}
2397
16cc1400
WB
2398static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2399{
2400 unsigned short gso_type = 0;
2401
2402 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2403 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2404 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2405 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2406 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2407 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2408 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2409
2410 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2411 return -EINVAL;
2412
2413 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2414 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2415 case VIRTIO_NET_HDR_GSO_TCPV4:
2416 gso_type = SKB_GSO_TCPV4;
2417 break;
2418 case VIRTIO_NET_HDR_GSO_TCPV6:
2419 gso_type = SKB_GSO_TCPV6;
2420 break;
2421 case VIRTIO_NET_HDR_GSO_UDP:
2422 gso_type = SKB_GSO_UDP;
2423 break;
2424 default:
2425 return -EINVAL;
2426 }
2427
2428 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2429 gso_type |= SKB_GSO_TCP_ECN;
2430
2431 if (vnet_hdr->gso_size == 0)
2432 return -EINVAL;
2433 }
2434
2435 vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
2436 return 0;
2437}
2438
2439static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2440 struct virtio_net_hdr *vnet_hdr)
2441{
2442 int n;
2443
2444 if (*len < sizeof(*vnet_hdr))
2445 return -EINVAL;
2446 *len -= sizeof(*vnet_hdr);
2447
2448 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
2449 if (n != sizeof(*vnet_hdr))
2450 return -EFAULT;
2451
2452 return __packet_snd_vnet_parse(vnet_hdr, *len);
2453}
2454
2455static int packet_snd_vnet_gso(struct sk_buff *skb,
2456 struct virtio_net_hdr *vnet_hdr)
2457{
2458 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2459 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2460 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2461
2462 if (!skb_partial_csum_set(skb, s, o))
2463 return -EINVAL;
2464 }
2465
2466 skb_shinfo(skb)->gso_size =
2467 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2468 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2469
2470 /* Header must be checked, and gso_segs computed. */
2471 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2472 skb_shinfo(skb)->gso_segs = 0;
2473 return 0;
2474}
2475
40d4e3df 2476static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2477 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2478 __be16 proto, unsigned char *addr, int hlen, int copylen,
2479 const struct sockcm_cookie *sockc)
69e3c75f 2480{
184f489e 2481 union tpacket_uhdr ph;
8d39b4a6 2482 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2483 struct socket *sock = po->sk.sk_socket;
2484 struct page *page;
69e3c75f
JB
2485 int err;
2486
2487 ph.raw = frame;
2488
2489 skb->protocol = proto;
2490 skb->dev = dev;
2491 skb->priority = po->sk.sk_priority;
2d37a186 2492 skb->mark = po->sk.sk_mark;
c14ac945 2493 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2494 skb_shinfo(skb)->destructor_arg = ph.raw;
2495
ae641949 2496 skb_reserve(skb, hlen);
69e3c75f 2497 skb_reset_network_header(skb);
c1aad275 2498
69e3c75f
JB
2499 to_write = tp_len;
2500
2501 if (sock->type == SOCK_DGRAM) {
2502 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2503 NULL, tp_len);
2504 if (unlikely(err < 0))
2505 return -EINVAL;
1d036d25 2506 } else if (copylen) {
9ed988cd
WB
2507 int hdrlen = min_t(int, copylen, tp_len);
2508
69e3c75f 2509 skb_push(skb, dev->hard_header_len);
1d036d25 2510 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2511 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2512 if (unlikely(err))
2513 return err;
9ed988cd
WB
2514 if (!dev_validate_header(dev, skb->data, hdrlen))
2515 return -EINVAL;
c72219b7
DB
2516 if (!skb->protocol)
2517 tpacket_set_protocol(dev, skb);
69e3c75f 2518
9ed988cd
WB
2519 data += hdrlen;
2520 to_write -= hdrlen;
69e3c75f
JB
2521 }
2522
69e3c75f
JB
2523 offset = offset_in_page(data);
2524 len_max = PAGE_SIZE - offset;
2525 len = ((to_write > len_max) ? len_max : to_write);
2526
2527 skb->data_len = to_write;
2528 skb->len += to_write;
2529 skb->truesize += to_write;
2530 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2531
2532 while (likely(to_write)) {
2533 nr_frags = skb_shinfo(skb)->nr_frags;
2534
2535 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2536 pr_err("Packet exceed the number of skb frags(%lu)\n",
2537 MAX_SKB_FRAGS);
69e3c75f
JB
2538 return -EFAULT;
2539 }
2540
0af55bb5
CG
2541 page = pgv_to_page(data);
2542 data += len;
69e3c75f
JB
2543 flush_dcache_page(page);
2544 get_page(page);
0af55bb5 2545 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2546 to_write -= len;
2547 offset = 0;
2548 len_max = PAGE_SIZE;
2549 len = ((to_write > len_max) ? len_max : to_write);
2550 }
2551
8fd6c80d 2552 skb_probe_transport_header(skb, 0);
efdfa2f7 2553
69e3c75f
JB
2554 return tp_len;
2555}
2556
8d39b4a6
WB
2557static int tpacket_parse_header(struct packet_sock *po, void *frame,
2558 int size_max, void **data)
2559{
2560 union tpacket_uhdr ph;
2561 int tp_len, off;
2562
2563 ph.raw = frame;
2564
2565 switch (po->tp_version) {
2566 case TPACKET_V2:
2567 tp_len = ph.h2->tp_len;
2568 break;
2569 default:
2570 tp_len = ph.h1->tp_len;
2571 break;
2572 }
2573 if (unlikely(tp_len > size_max)) {
2574 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2575 return -EMSGSIZE;
2576 }
2577
2578 if (unlikely(po->tp_tx_has_off)) {
2579 int off_min, off_max;
2580
2581 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2582 off_max = po->tx_ring.frame_size - tp_len;
2583 if (po->sk.sk_type == SOCK_DGRAM) {
2584 switch (po->tp_version) {
2585 case TPACKET_V2:
2586 off = ph.h2->tp_net;
2587 break;
2588 default:
2589 off = ph.h1->tp_net;
2590 break;
2591 }
2592 } else {
2593 switch (po->tp_version) {
2594 case TPACKET_V2:
2595 off = ph.h2->tp_mac;
2596 break;
2597 default:
2598 off = ph.h1->tp_mac;
2599 break;
2600 }
2601 }
2602 if (unlikely((off < off_min) || (off_max < off)))
2603 return -EINVAL;
2604 } else {
2605 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2606 }
2607
2608 *data = frame + off;
2609 return tp_len;
2610}
2611
69e3c75f
JB
2612static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2613{
69e3c75f
JB
2614 struct sk_buff *skb;
2615 struct net_device *dev;
1d036d25 2616 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2617 struct sockcm_cookie sockc;
69e3c75f 2618 __be16 proto;
09effa67 2619 int err, reserve = 0;
40d4e3df 2620 void *ph;
342dfc30 2621 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2622 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2623 int tp_len, size_max;
2624 unsigned char *addr;
8d39b4a6 2625 void *data;
69e3c75f 2626 int len_sum = 0;
9e67030a 2627 int status = TP_STATUS_AVAILABLE;
1d036d25 2628 int hlen, tlen, copylen = 0;
69e3c75f 2629
69e3c75f
JB
2630 mutex_lock(&po->pg_vec_lock);
2631
66e56cd4 2632 if (likely(saddr == NULL)) {
e40526cb 2633 dev = packet_cached_dev_get(po);
69e3c75f
JB
2634 proto = po->num;
2635 addr = NULL;
2636 } else {
2637 err = -EINVAL;
2638 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2639 goto out;
2640 if (msg->msg_namelen < (saddr->sll_halen
2641 + offsetof(struct sockaddr_ll,
2642 sll_addr)))
2643 goto out;
69e3c75f
JB
2644 proto = saddr->sll_protocol;
2645 addr = saddr->sll_addr;
827d9780 2646 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2647 }
2648
c14ac945
SHY
2649 sockc.tsflags = 0;
2650 if (msg->msg_controllen) {
2651 err = sock_cmsg_send(&po->sk, msg, &sockc);
2652 if (unlikely(err))
2653 goto out;
2654 }
2655
69e3c75f
JB
2656 err = -ENXIO;
2657 if (unlikely(dev == NULL))
2658 goto out;
69e3c75f
JB
2659 err = -ENETDOWN;
2660 if (unlikely(!(dev->flags & IFF_UP)))
2661 goto out_put;
2662
5cfb4c8d
DB
2663 if (po->sk.sk_socket->type == SOCK_RAW)
2664 reserve = dev->hard_header_len;
69e3c75f 2665 size_max = po->tx_ring.frame_size
b5dd884e 2666 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2667
1d036d25 2668 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2669 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2670
69e3c75f
JB
2671 do {
2672 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2673 TP_STATUS_SEND_REQUEST);
69e3c75f 2674 if (unlikely(ph == NULL)) {
87a2fd28
DB
2675 if (need_wait && need_resched())
2676 schedule();
69e3c75f
JB
2677 continue;
2678 }
2679
8d39b4a6
WB
2680 skb = NULL;
2681 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2682 if (tp_len < 0)
2683 goto tpacket_error;
2684
69e3c75f 2685 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2686 hlen = LL_RESERVED_SPACE(dev);
2687 tlen = dev->needed_tailroom;
1d036d25
WB
2688 if (po->has_vnet_hdr) {
2689 vnet_hdr = data;
2690 data += sizeof(*vnet_hdr);
2691 tp_len -= sizeof(*vnet_hdr);
2692 if (tp_len < 0 ||
2693 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2694 tp_len = -EINVAL;
2695 goto tpacket_error;
2696 }
2697 copylen = __virtio16_to_cpu(vio_le(),
2698 vnet_hdr->hdr_len);
2699 }
9ed988cd 2700 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2701 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2702 hlen + tlen + sizeof(struct sockaddr_ll) +
2703 (copylen - dev->hard_header_len),
fbf33a28 2704 !need_wait, &err);
69e3c75f 2705
fbf33a28
KM
2706 if (unlikely(skb == NULL)) {
2707 /* we assume the socket was initially writeable ... */
2708 if (likely(len_sum > 0))
2709 err = len_sum;
69e3c75f 2710 goto out_status;
fbf33a28 2711 }
8d39b4a6 2712 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2713 addr, hlen, copylen, &sockc);
dbd46ab4 2714 if (likely(tp_len >= 0) &&
5cfb4c8d 2715 tp_len > dev->mtu + reserve &&
1d036d25 2716 !po->has_vnet_hdr &&
3c70c132
DB
2717 !packet_extra_vlan_len_allowed(dev, skb))
2718 tp_len = -EMSGSIZE;
69e3c75f
JB
2719
2720 if (unlikely(tp_len < 0)) {
8d39b4a6 2721tpacket_error:
69e3c75f
JB
2722 if (po->tp_loss) {
2723 __packet_set_status(po, ph,
2724 TP_STATUS_AVAILABLE);
2725 packet_increment_head(&po->tx_ring);
2726 kfree_skb(skb);
2727 continue;
2728 } else {
2729 status = TP_STATUS_WRONG_FORMAT;
2730 err = tp_len;
2731 goto out_status;
2732 }
2733 }
2734
1d036d25
WB
2735 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
2736 tp_len = -EINVAL;
2737 goto tpacket_error;
2738 }
2739
0fd5d57b
DB
2740 packet_pick_tx_queue(dev, skb);
2741
69e3c75f
JB
2742 skb->destructor = tpacket_destruct_skb;
2743 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2744 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2745
2746 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2747 err = po->xmit(skb);
eb70df13
JP
2748 if (unlikely(err > 0)) {
2749 err = net_xmit_errno(err);
2750 if (err && __packet_get_status(po, ph) ==
2751 TP_STATUS_AVAILABLE) {
2752 /* skb was destructed already */
2753 skb = NULL;
2754 goto out_status;
2755 }
2756 /*
2757 * skb was dropped but not destructed yet;
2758 * let's treat it like congestion or err < 0
2759 */
2760 err = 0;
2761 }
69e3c75f
JB
2762 packet_increment_head(&po->tx_ring);
2763 len_sum += tp_len;
b0138408
DB
2764 } while (likely((ph != NULL) ||
2765 /* Note: packet_read_pending() might be slow if we have
2766 * to call it as it's per_cpu variable, but in fast-path
2767 * we already short-circuit the loop with the first
2768 * condition, and luckily don't have to go that path
2769 * anyway.
2770 */
2771 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2772
2773 err = len_sum;
2774 goto out_put;
2775
69e3c75f
JB
2776out_status:
2777 __packet_set_status(po, ph, status);
2778 kfree_skb(skb);
2779out_put:
e40526cb 2780 dev_put(dev);
69e3c75f
JB
2781out:
2782 mutex_unlock(&po->pg_vec_lock);
2783 return err;
2784}
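/*
 * Illustrative userspace sketch (not part of this file): the TX ring
 * driven by tpacket_snd() above is filled by writing a frame after the
 * slot header, flipping tp_status to TP_STATUS_SEND_REQUEST and kicking
 * the kernel with an empty send(). The ring is assumed to have been set
 * up with PACKET_VERSION/PACKET_TX_RING and mmap()ed, analogous to the RX
 * example earlier; the data offset follows tpacket_parse_header() above
 * for the default (no PACKET_TX_HAS_OFF) case.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <string.h>
#include <linux/if_packet.h>

static void tx_ring_send_one(int fd, void *ring, unsigned int slot,
			     unsigned int frame_size,
			     const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr = (void *)((char *)ring +
			(size_t)slot * frame_size);

	/* for SOCK_RAW the payload starts tp_hdrlen - sizeof(sockaddr_ll)
	 * bytes into the slot, i.e. TPACKET2_HDRLEN - sizeof(sockaddr_ll)
	 * for TPACKET_V2 */
	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       frame, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	send(fd, NULL, 0, 0);	/* flush pending TX ring slots */
}
#endif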
69e3c75f 2785
eea49cc9
OJ
2786static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2787 size_t reserve, size_t len,
2788 size_t linear, int noblock,
2789 int *err)
bfd5f4a3
SS
2790{
2791 struct sk_buff *skb;
2792
2793 /* Under a page? Don't bother with paged skb. */
2794 if (prepad + len < PAGE_SIZE || !linear)
2795 linear = len;
2796
2797 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2798 err, 0);
bfd5f4a3
SS
2799 if (!skb)
2800 return NULL;
2801
2802 skb_reserve(skb, reserve);
2803 skb_put(skb, linear);
2804 skb->data_len = len - linear;
2805 skb->len += len - linear;
2806
2807 return skb;
2808}
2809
d346a3fa 2810static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2811{
2812 struct sock *sk = sock->sk;
342dfc30 2813 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2814 struct sk_buff *skb;
2815 struct net_device *dev;
0e11c91e 2816 __be16 proto;
1da177e4 2817 unsigned char *addr;
827d9780 2818 int err, reserve = 0;
c7d39e32 2819 struct sockcm_cookie sockc;
bfd5f4a3
SS
2820 struct virtio_net_hdr vnet_hdr = { 0 };
2821 int offset = 0;
bfd5f4a3 2822 struct packet_sock *po = pkt_sk(sk);
ae641949 2823 int hlen, tlen;
3bdc0eba 2824 int extra_len = 0;
1da177e4
LT
2825
2826 /*
1ce4f28b 2827 * Get and verify the address.
1da177e4 2828 */
1ce4f28b 2829
66e56cd4 2830 if (likely(saddr == NULL)) {
e40526cb 2831 dev = packet_cached_dev_get(po);
1da177e4
LT
2832 proto = po->num;
2833 addr = NULL;
2834 } else {
2835 err = -EINVAL;
2836 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2837 goto out;
0fb375fb
EB
2838 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2839 goto out;
1da177e4
LT
2840 proto = saddr->sll_protocol;
2841 addr = saddr->sll_addr;
827d9780 2842 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2843 }
2844
1da177e4 2845 err = -ENXIO;
e40526cb 2846 if (unlikely(dev == NULL))
1da177e4 2847 goto out_unlock;
d5e76b0a 2848 err = -ENETDOWN;
e40526cb 2849 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2850 goto out_unlock;
2851
c14ac945 2852 sockc.tsflags = 0;
c7d39e32
EJ
2853 sockc.mark = sk->sk_mark;
2854 if (msg->msg_controllen) {
2855 err = sock_cmsg_send(sk, msg, &sockc);
2856 if (unlikely(err))
2857 goto out_unlock;
2858 }
2859
e40526cb
DB
2860 if (sock->type == SOCK_RAW)
2861 reserve = dev->hard_header_len;
bfd5f4a3 2862 if (po->has_vnet_hdr) {
16cc1400
WB
2863 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2864 if (err)
bfd5f4a3 2865 goto out_unlock;
bfd5f4a3
SS
2866 }
2867
3bdc0eba
BG
2868 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2869 if (!netif_supports_nofcs(dev)) {
2870 err = -EPROTONOSUPPORT;
2871 goto out_unlock;
2872 }
2873 extra_len = 4; /* We're doing our own CRC */
2874 }
2875
1da177e4 2876 err = -EMSGSIZE;
16cc1400
WB
2877 if (!vnet_hdr.gso_type &&
2878 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2879 goto out_unlock;
2880
bfd5f4a3 2881 err = -ENOBUFS;
ae641949
HX
2882 hlen = LL_RESERVED_SPACE(dev);
2883 tlen = dev->needed_tailroom;
dc9e5153 2884 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
d3869efe 2885 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
bfd5f4a3 2886 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2887 if (skb == NULL)
1da177e4
LT
2888 goto out_unlock;
2889
bfd5f4a3 2890 skb_set_network_header(skb, reserve);
1da177e4 2891
0c4e8581 2892 err = -EINVAL;
9c707762
WB
2893 if (sock->type == SOCK_DGRAM) {
2894 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2895 if (unlikely(offset < 0))
9c707762 2896 goto out_free;
9c707762 2897 }
1da177e4
LT
2898
2899 /* Returns -EFAULT on error */
c0371da6 2900 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2901 if (err)
2902 goto out_free;
bf84a010 2903
9ed988cd
WB
2904 if (sock->type == SOCK_RAW &&
2905 !dev_validate_header(dev, skb->data, len)) {
2906 err = -EINVAL;
2907 goto out_free;
2908 }
2909
c14ac945 2910 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2911
16cc1400 2912 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2913 !packet_extra_vlan_len_allowed(dev, skb)) {
2914 err = -EMSGSIZE;
2915 goto out_free;
57f89bfa
BG
2916 }
2917
09effa67
DM
2918 skb->protocol = proto;
2919 skb->dev = dev;
1da177e4 2920 skb->priority = sk->sk_priority;
c7d39e32 2921 skb->mark = sockc.mark;
0fd5d57b
DB
2922
2923 packet_pick_tx_queue(dev, skb);
1da177e4 2924
bfd5f4a3 2925 if (po->has_vnet_hdr) {
16cc1400
WB
2926 err = packet_snd_vnet_gso(skb, &vnet_hdr);
2927 if (err)
2928 goto out_free;
2929 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2930 }
2931
8fd6c80d
DB
2932 skb_probe_transport_header(skb, reserve);
2933
3bdc0eba
BG
2934 if (unlikely(extra_len == 4))
2935 skb->no_fcs = 1;
2936
d346a3fa 2937 err = po->xmit(skb);
1da177e4
LT
2938 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2939 goto out_unlock;
2940
e40526cb 2941 dev_put(dev);
1da177e4 2942
40d4e3df 2943 return len;
1da177e4
LT
2944
2945out_free:
2946 kfree_skb(skb);
2947out_unlock:
e40526cb 2948 if (dev)
1da177e4
LT
2949 dev_put(dev);
2950out:
2951 return err;
2952}
2953
1b784140 2954static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2955{
69e3c75f
JB
2956 struct sock *sk = sock->sk;
2957 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2958
69e3c75f
JB
2959 if (po->tx_ring.pg_vec)
2960 return tpacket_snd(po, msg);
2961 else
69e3c75f
JB
2962 return packet_snd(sock, msg, len);
2963}
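/*
 * Illustrative userspace sketch (not part of this file): without a TX
 * ring, packet_sendmsg() falls through to packet_snd(), which is what a
 * plain sendto() with a struct sockaddr_ll exercises. "frame" is assumed
 * to be a complete Ethernet frame and "ifidx" a valid interface index.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <string.h>
#include <arpa/inet.h>

static void raw_send(int fd, int ifidx, const void *frame, size_t len,
		     const unsigned char dest[ETH_ALEN])
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_ifindex	 = ifidx;
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_halen	 = ETH_ALEN;
	memcpy(sll.sll_addr, dest, ETH_ALEN);

	sendto(fd, frame, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}
#endif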
2964
1da177e4
LT
2965/*
2966 * Close a PACKET socket. This is fairly simple. We immediately go
2967 * to 'closed' state and remove our protocol entry in the device list.
2968 */
2969
2970static int packet_release(struct socket *sock)
2971{
2972 struct sock *sk = sock->sk;
2973 struct packet_sock *po;
d12d01d6 2974 struct net *net;
f6fb8f10 2975 union tpacket_req_u req_u;
1da177e4
LT
2976
2977 if (!sk)
2978 return 0;
2979
3b1e0a65 2980 net = sock_net(sk);
1da177e4
LT
2981 po = pkt_sk(sk);
2982
0fa7fa98 2983 mutex_lock(&net->packet.sklist_lock);
808f5114 2984 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2985 mutex_unlock(&net->packet.sklist_lock);
2986
2987 preempt_disable();
920de804 2988 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2989 preempt_enable();
1da177e4 2990
808f5114 2991 spin_lock(&po->bind_lock);
ce06b03e 2992 unregister_prot_hook(sk, false);
66e56cd4
DB
2993 packet_cached_dev_reset(po);
2994
160ff18a
BG
2995 if (po->prot_hook.dev) {
2996 dev_put(po->prot_hook.dev);
2997 po->prot_hook.dev = NULL;
2998 }
808f5114 2999 spin_unlock(&po->bind_lock);
1da177e4 3000
1da177e4 3001 packet_flush_mclist(sk);
1da177e4 3002
9665d5d6
PS
3003 if (po->rx_ring.pg_vec) {
3004 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3005 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3006 }
69e3c75f 3007
9665d5d6
PS
3008 if (po->tx_ring.pg_vec) {
3009 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3010 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3011 }
1da177e4 3012
dc99f600
DM
3013 fanout_release(sk);
3014
808f5114 3015 synchronize_net();
1da177e4
LT
3016 /*
3017 * Now the socket is dead. No more input will appear.
3018 */
1da177e4
LT
3019 sock_orphan(sk);
3020 sock->sk = NULL;
3021
3022 /* Purge queues */
3023
3024 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3025 packet_free_pending(po);
17ab56a2 3026 sk_refcnt_debug_release(sk);
1da177e4
LT
3027
3028 sock_put(sk);
3029 return 0;
3030}
3031
3032/*
3033 * Attach a packet hook.
3034 */
3035
30f7ea1c
FR
3036static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3037 __be16 proto)
1da177e4
LT
3038{
3039 struct packet_sock *po = pkt_sk(sk);
158cd4af 3040 struct net_device *dev_curr;
902fefb8
DB
3041 __be16 proto_curr;
3042 bool need_rehook;
30f7ea1c
FR
3043 struct net_device *dev = NULL;
3044 int ret = 0;
3045 bool unlisted = false;
dc99f600 3046
30f7ea1c 3047 if (po->fanout)
dc99f600 3048 return -EINVAL;
1da177e4
LT
3049
3050 lock_sock(sk);
1da177e4 3051 spin_lock(&po->bind_lock);
30f7ea1c
FR
3052 rcu_read_lock();
3053
3054 if (name) {
3055 dev = dev_get_by_name_rcu(sock_net(sk), name);
3056 if (!dev) {
3057 ret = -ENODEV;
3058 goto out_unlock;
3059 }
3060 } else if (ifindex) {
3061 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3062 if (!dev) {
3063 ret = -ENODEV;
3064 goto out_unlock;
3065 }
3066 }
3067
3068 if (dev)
3069 dev_hold(dev);
66e56cd4 3070
902fefb8
DB
3071 proto_curr = po->prot_hook.type;
3072 dev_curr = po->prot_hook.dev;
3073
3074 need_rehook = proto_curr != proto || dev_curr != dev;
3075
3076 if (need_rehook) {
30f7ea1c
FR
3077 if (po->running) {
3078 rcu_read_unlock();
3079 __unregister_prot_hook(sk, true);
3080 rcu_read_lock();
3081 dev_curr = po->prot_hook.dev;
3082 if (dev)
3083 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3084 dev->ifindex);
3085 }
1da177e4 3086
902fefb8
DB
3087 po->num = proto;
3088 po->prot_hook.type = proto;
902fefb8 3089
30f7ea1c
FR
3090 if (unlikely(unlisted)) {
3091 dev_put(dev);
3092 po->prot_hook.dev = NULL;
3093 po->ifindex = -1;
3094 packet_cached_dev_reset(po);
3095 } else {
3096 po->prot_hook.dev = dev;
3097 po->ifindex = dev ? dev->ifindex : 0;
3098 packet_cached_dev_assign(po, dev);
3099 }
902fefb8 3100 }
158cd4af
LW
3101 if (dev_curr)
3102 dev_put(dev_curr);
66e56cd4 3103
902fefb8 3104 if (proto == 0 || !need_rehook)
1da177e4
LT
3105 goto out_unlock;
3106
30f7ea1c 3107 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3108 register_prot_hook(sk);
be85d4ad
UT
3109 } else {
3110 sk->sk_err = ENETDOWN;
3111 if (!sock_flag(sk, SOCK_DEAD))
3112 sk->sk_error_report(sk);
1da177e4
LT
3113 }
3114
3115out_unlock:
30f7ea1c 3116 rcu_read_unlock();
1da177e4
LT
3117 spin_unlock(&po->bind_lock);
3118 release_sock(sk);
30f7ea1c 3119 return ret;
1da177e4
LT
3120}
3121
3122/*
3123 * Bind a packet socket to a device
3124 */
3125
40d4e3df
ED
3126static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3127 int addr_len)
1da177e4 3128{
40d4e3df 3129 struct sock *sk = sock->sk;
1da177e4 3130 char name[15];
1ce4f28b 3131
1da177e4
LT
3132 /*
3133 * Check legality
3134 */
1ce4f28b 3135
8ae55f04 3136 if (addr_len != sizeof(struct sockaddr))
1da177e4 3137 return -EINVAL;
40d4e3df 3138 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 3139
30f7ea1c 3140 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3141}
1da177e4
LT
3142
3143static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3144{
40d4e3df
ED
3145 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3146 struct sock *sk = sock->sk;
1da177e4
LT
3147
3148 /*
3149 * Check legality
3150 */
1ce4f28b 3151
1da177e4
LT
3152 if (addr_len < sizeof(struct sockaddr_ll))
3153 return -EINVAL;
3154 if (sll->sll_family != AF_PACKET)
3155 return -EINVAL;
3156
30f7ea1c
FR
3157 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3158 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3159}
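/*
 * Illustrative userspace sketch (not part of this file): packet_bind()
 * above is reached by a normal bind() with a struct sockaddr_ll; only
 * sll_protocol and sll_ifindex matter here, the rest may stay zero.
 * Resolving "eth0" with if_nametoindex() is an assumption of this example,
 * not something this file requires.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <string.h>
#include <arpa/inet.h>

static int bind_to_eth0(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex	 = if_nametoindex("eth0");

	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
	return fd;
}
#endif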
3160
3161static struct proto packet_proto = {
3162 .name = "PACKET",
3163 .owner = THIS_MODULE,
3164 .obj_size = sizeof(struct packet_sock),
3165};
3166
3167/*
1ce4f28b 3168 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3169 */
3170
3f378b68
EP
3171static int packet_create(struct net *net, struct socket *sock, int protocol,
3172 int kern)
1da177e4
LT
3173{
3174 struct sock *sk;
3175 struct packet_sock *po;
0e11c91e 3176 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3177 int err;
3178
df008c91 3179 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3180 return -EPERM;
be02097c
DM
3181 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3182 sock->type != SOCK_PACKET)
1da177e4
LT
3183 return -ESOCKTNOSUPPORT;
3184
3185 sock->state = SS_UNCONNECTED;
3186
3187 err = -ENOBUFS;
11aa9c28 3188 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3189 if (sk == NULL)
3190 goto out;
3191
3192 sock->ops = &packet_ops;
1da177e4
LT
3193 if (sock->type == SOCK_PACKET)
3194 sock->ops = &packet_ops_spkt;
be02097c 3195
1da177e4
LT
3196 sock_init_data(sock, sk);
3197
3198 po = pkt_sk(sk);
3199 sk->sk_family = PF_PACKET;
0e11c91e 3200 po->num = proto;
d346a3fa 3201 po->xmit = dev_queue_xmit;
66e56cd4 3202
b0138408
DB
3203 err = packet_alloc_pending(po);
3204 if (err)
3205 goto out2;
3206
66e56cd4 3207 packet_cached_dev_reset(po);
1da177e4
LT
3208
3209 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3210 sk_refcnt_debug_inc(sk);
1da177e4
LT
3211
3212 /*
3213 * Attach a protocol block
3214 */
3215
3216 spin_lock_init(&po->bind_lock);
905db440 3217 mutex_init(&po->pg_vec_lock);
0648ab70 3218 po->rollover = NULL;
1da177e4 3219 po->prot_hook.func = packet_rcv;
be02097c 3220
1da177e4
LT
3221 if (sock->type == SOCK_PACKET)
3222 po->prot_hook.func = packet_rcv_spkt;
be02097c 3223
1da177e4
LT
3224 po->prot_hook.af_packet_priv = sk;
3225
0e11c91e
AV
3226 if (proto) {
3227 po->prot_hook.type = proto;
ce06b03e 3228 register_prot_hook(sk);
1da177e4
LT
3229 }
3230
0fa7fa98 3231 mutex_lock(&net->packet.sklist_lock);
808f5114 3232 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3233 mutex_unlock(&net->packet.sklist_lock);
3234
3235 preempt_disable();
3680453c 3236 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3237 preempt_enable();
808f5114 3238
40d4e3df 3239 return 0;
b0138408
DB
3240out2:
3241 sk_free(sk);
1da177e4
LT
3242out:
3243 return err;
3244}
3245
3246/*
3247 * Pull a packet from our receive queue and hand it to the user.
3248 * If necessary we block.
3249 */
3250
1b784140
YX
3251static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3252 int flags)
1da177e4
LT
3253{
3254 struct sock *sk = sock->sk;
3255 struct sk_buff *skb;
3256 int copied, err;
bfd5f4a3 3257 int vnet_hdr_len = 0;
2472d761 3258 unsigned int origlen = 0;
1da177e4
LT
3259
3260 err = -EINVAL;
ed85b565 3261 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3262 goto out;
3263
3264#if 0
3265 /* What error should we return now? EUNATTACH? */
3266 if (pkt_sk(sk)->ifindex < 0)
3267 return -ENODEV;
3268#endif
3269
ed85b565 3270 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3271 err = sock_recv_errqueue(sk, msg, len,
3272 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3273 goto out;
3274 }
3275
1da177e4
LT
3276 /*
3277 * Call the generic datagram receiver. This handles all sorts
3278 * of horrible races and re-entrancy so we can forget about it
3279 * in the protocol layers.
3280 *
 3281 * Now it will return ENETDOWN, if the device has just gone down,
3282 * but then it will block.
3283 */
3284
40d4e3df 3285 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3286
3287 /*
1ce4f28b 3288 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3289 * handles the blocking so we don't have to see or worry about
 3290 * blocking retries.
3291 */
3292
8ae55f04 3293 if (skb == NULL)
1da177e4
LT
3294 goto out;
3295
2ccdbaa6
WB
3296 if (pkt_sk(sk)->pressure)
3297 packet_rcv_has_room(pkt_sk(sk), NULL);
3298
bfd5f4a3 3299 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3300 err = packet_rcv_vnet(msg, skb, &len);
3301 if (err)
bfd5f4a3 3302 goto out_free;
16cc1400 3303 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3304 }
3305
f3d33426
HFS
3306 /* You lose any data beyond the buffer you gave. If it worries
3307 * a user program they can ask the device for its MTU
3308 * anyway.
1da177e4 3309 */
1da177e4 3310 copied = skb->len;
40d4e3df
ED
3311 if (copied > len) {
3312 copied = len;
3313 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3314 }
3315
51f3d02b 3316 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3317 if (err)
3318 goto out_free;
3319
2472d761
EB
3320 if (sock->type != SOCK_PACKET) {
3321 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3322
3323 /* Original length was stored in sockaddr_ll fields */
3324 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3325 sll->sll_family = AF_PACKET;
3326 sll->sll_protocol = skb->protocol;
3327 }
3328
3b885787 3329 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3330
f3d33426
HFS
3331 if (msg->msg_name) {
3332 /* If the address length field is there to be filled
3333 * in, we fill it in now.
3334 */
3335 if (sock->type == SOCK_PACKET) {
342dfc30 3336 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3337 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3338 } else {
3339 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3340
f3d33426
HFS
3341 msg->msg_namelen = sll->sll_halen +
3342 offsetof(struct sockaddr_ll, sll_addr);
3343 }
ffbc6111
HX
3344 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3345 msg->msg_namelen);
f3d33426 3346 }
1da177e4 3347
8dc41944 3348 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3349 struct tpacket_auxdata aux;
3350
3351 aux.tp_status = TP_STATUS_USER;
3352 if (skb->ip_summed == CHECKSUM_PARTIAL)
3353 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3354 else if (skb->pkt_type != PACKET_OUTGOING &&
3355 (skb->ip_summed == CHECKSUM_COMPLETE ||
3356 skb_csum_unnecessary(skb)))
3357 aux.tp_status |= TP_STATUS_CSUM_VALID;
3358
2472d761 3359 aux.tp_len = origlen;
ffbc6111
HX
3360 aux.tp_snaplen = skb->len;
3361 aux.tp_mac = 0;
bbe735e4 3362 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3363 if (skb_vlan_tag_present(skb)) {
3364 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3365 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3366 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3367 } else {
3368 aux.tp_vlan_tci = 0;
a0cdfcf3 3369 aux.tp_vlan_tpid = 0;
a3bcc23e 3370 }
ffbc6111 3371 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3372 }
3373
1da177e4
LT
3374 /*
3375 * Free or return the buffer as appropriate. Again this
3376 * hides all the races and re-entrancy issues from us.
3377 */
bfd5f4a3 3378 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3379
3380out_free:
3381 skb_free_datagram(sk, skb);
3382out:
3383 return err;
3384}
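/*
 * Illustrative userspace sketch (not part of this file): with
 * PACKET_AUXDATA enabled, packet_recvmsg() above attaches a
 * struct tpacket_auxdata control message carrying the original length,
 * VLAN tag and checksum status of each received frame.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>
#include <string.h>

static void recv_with_auxdata(int fd, char *buf, size_t buflen)
{
	int on = 1;
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &on, sizeof(on));
	recvmsg(fd, &msg, 0);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_len is the untruncated frame length;
			 * aux.tp_vlan_tci is valid if TP_STATUS_VLAN_VALID */
		}
	}
}
#endif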
3385
1da177e4
LT
3386static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3387 int *uaddr_len, int peer)
3388{
3389 struct net_device *dev;
3390 struct sock *sk = sock->sk;
3391
3392 if (peer)
3393 return -EOPNOTSUPP;
3394
3395 uaddr->sa_family = AF_PACKET;
2dc85bf3 3396 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3397 rcu_read_lock();
3398 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3399 if (dev)
2dc85bf3 3400 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3401 rcu_read_unlock();
1da177e4
LT
3402 *uaddr_len = sizeof(*uaddr);
3403
3404 return 0;
3405}
1da177e4
LT
3406
3407static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3408 int *uaddr_len, int peer)
3409{
3410 struct net_device *dev;
3411 struct sock *sk = sock->sk;
3412 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3413 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3414
3415 if (peer)
3416 return -EOPNOTSUPP;
3417
3418 sll->sll_family = AF_PACKET;
3419 sll->sll_ifindex = po->ifindex;
3420 sll->sll_protocol = po->num;
67286640 3421 sll->sll_pkttype = 0;
654d1f8a
ED
3422 rcu_read_lock();
3423 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3424 if (dev) {
3425 sll->sll_hatype = dev->type;
3426 sll->sll_halen = dev->addr_len;
3427 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3428 } else {
3429 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3430 sll->sll_halen = 0;
3431 }
654d1f8a 3432 rcu_read_unlock();
0fb375fb 3433 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3434
3435 return 0;
3436}
3437
2aeb0b88
WC
3438static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3439 int what)
1da177e4
LT
3440{
3441 switch (i->type) {
3442 case PACKET_MR_MULTICAST:
1162563f
JP
3443 if (i->alen != dev->addr_len)
3444 return -EINVAL;
1da177e4 3445 if (what > 0)
22bedad3 3446 return dev_mc_add(dev, i->addr);
1da177e4 3447 else
22bedad3 3448 return dev_mc_del(dev, i->addr);
1da177e4
LT
3449 break;
3450 case PACKET_MR_PROMISC:
2aeb0b88 3451 return dev_set_promiscuity(dev, what);
1da177e4 3452 case PACKET_MR_ALLMULTI:
2aeb0b88 3453 return dev_set_allmulti(dev, what);
d95ed927 3454 case PACKET_MR_UNICAST:
1162563f
JP
3455 if (i->alen != dev->addr_len)
3456 return -EINVAL;
d95ed927 3457 if (what > 0)
a748ee24 3458 return dev_uc_add(dev, i->addr);
d95ed927 3459 else
a748ee24 3460 return dev_uc_del(dev, i->addr);
d95ed927 3461 break;
40d4e3df
ED
3462 default:
3463 break;
1da177e4 3464 }
2aeb0b88 3465 return 0;
1da177e4
LT
3466}
3467
82f17091
FR
3468static void packet_dev_mclist_delete(struct net_device *dev,
3469 struct packet_mclist **mlp)
1da177e4 3470{
82f17091
FR
3471 struct packet_mclist *ml;
3472
3473 while ((ml = *mlp) != NULL) {
3474 if (ml->ifindex == dev->ifindex) {
3475 packet_dev_mc(dev, ml, -1);
3476 *mlp = ml->next;
3477 kfree(ml);
3478 } else
3479 mlp = &ml->next;
1da177e4
LT
3480 }
3481}
3482
0fb375fb 3483static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3484{
3485 struct packet_sock *po = pkt_sk(sk);
3486 struct packet_mclist *ml, *i;
3487 struct net_device *dev;
3488 int err;
3489
3490 rtnl_lock();
3491
3492 err = -ENODEV;
3b1e0a65 3493 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3494 if (!dev)
3495 goto done;
3496
3497 err = -EINVAL;
1162563f 3498 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3499 goto done;
3500
3501 err = -ENOBUFS;
8b3a7005 3502 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3503 if (i == NULL)
3504 goto done;
3505
3506 err = 0;
3507 for (ml = po->mclist; ml; ml = ml->next) {
3508 if (ml->ifindex == mreq->mr_ifindex &&
3509 ml->type == mreq->mr_type &&
3510 ml->alen == mreq->mr_alen &&
3511 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3512 ml->count++;
3513 /* Free the new element ... */
3514 kfree(i);
3515 goto done;
3516 }
3517 }
3518
3519 i->type = mreq->mr_type;
3520 i->ifindex = mreq->mr_ifindex;
3521 i->alen = mreq->mr_alen;
3522 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3523 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3524 i->count = 1;
3525 i->next = po->mclist;
3526 po->mclist = i;
2aeb0b88
WC
3527 err = packet_dev_mc(dev, i, 1);
3528 if (err) {
3529 po->mclist = i->next;
3530 kfree(i);
3531 }
1da177e4
LT
3532
3533done:
3534 rtnl_unlock();
3535 return err;
3536}
3537
0fb375fb 3538static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3539{
3540 struct packet_mclist *ml, **mlp;
3541
3542 rtnl_lock();
3543
3544 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3545 if (ml->ifindex == mreq->mr_ifindex &&
3546 ml->type == mreq->mr_type &&
3547 ml->alen == mreq->mr_alen &&
3548 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3549 if (--ml->count == 0) {
3550 struct net_device *dev;
3551 *mlp = ml->next;
ad959e76
ED
3552 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3553 if (dev)
1da177e4 3554 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3555 kfree(ml);
3556 }
82f17091 3557 break;
1da177e4
LT
3558 }
3559 }
3560 rtnl_unlock();
82f17091 3561 return 0;
1da177e4
LT
3562}
3563
3564static void packet_flush_mclist(struct sock *sk)
3565{
3566 struct packet_sock *po = pkt_sk(sk);
3567 struct packet_mclist *ml;
3568
3569 if (!po->mclist)
3570 return;
3571
3572 rtnl_lock();
3573 while ((ml = po->mclist) != NULL) {
3574 struct net_device *dev;
3575
3576 po->mclist = ml->next;
ad959e76
ED
3577 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3578 if (dev != NULL)
1da177e4 3579 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3580 kfree(ml);
3581 }
3582 rtnl_unlock();
3583}
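/*
 * Illustrative userspace sketch (not part of this file): the membership
 * handling above is driven by PACKET_ADD_MEMBERSHIP with a
 * struct packet_mreq; the PACKET_MR_PROMISC case shown here bumps the
 * interface's promiscuity count instead of programming an address.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>

static void enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type	= PACKET_MR_PROMISC;

	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
		   &mreq, sizeof(mreq));
}
#endif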
1da177e4
LT
3584
3585static int
b7058842 3586packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3587{
3588 struct sock *sk = sock->sk;
8dc41944 3589 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3590 int ret;
3591
3592 if (level != SOL_PACKET)
3593 return -ENOPROTOOPT;
3594
69e3c75f 3595 switch (optname) {
1ce4f28b 3596 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3597 case PACKET_DROP_MEMBERSHIP:
3598 {
0fb375fb
EB
3599 struct packet_mreq_max mreq;
3600 int len = optlen;
3601 memset(&mreq, 0, sizeof(mreq));
3602 if (len < sizeof(struct packet_mreq))
1da177e4 3603 return -EINVAL;
0fb375fb
EB
3604 if (len > sizeof(mreq))
3605 len = sizeof(mreq);
40d4e3df 3606 if (copy_from_user(&mreq, optval, len))
1da177e4 3607 return -EFAULT;
0fb375fb
EB
3608 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3609 return -EINVAL;
1da177e4
LT
3610 if (optname == PACKET_ADD_MEMBERSHIP)
3611 ret = packet_mc_add(sk, &mreq);
3612 else
3613 ret = packet_mc_drop(sk, &mreq);
3614 return ret;
3615 }
a2efcfa0 3616
1da177e4 3617 case PACKET_RX_RING:
69e3c75f 3618 case PACKET_TX_RING:
1da177e4 3619 {
f6fb8f10 3620 union tpacket_req_u req_u;
3621 int len;
1da177e4 3622
f6fb8f10 3623 switch (po->tp_version) {
3624 case TPACKET_V1:
3625 case TPACKET_V2:
3626 len = sizeof(req_u.req);
3627 break;
3628 case TPACKET_V3:
3629 default:
3630 len = sizeof(req_u.req3);
3631 break;
3632 }
3633 if (optlen < len)
1da177e4 3634 return -EINVAL;
f6fb8f10 3635 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3636 return -EFAULT;
f6fb8f10 3637 return packet_set_ring(sk, &req_u, 0,
3638 optname == PACKET_TX_RING);
1da177e4
LT
3639 }
3640 case PACKET_COPY_THRESH:
3641 {
3642 int val;
3643
40d4e3df 3644 if (optlen != sizeof(val))
1da177e4 3645 return -EINVAL;
40d4e3df 3646 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3647 return -EFAULT;
3648
3649 pkt_sk(sk)->copy_thresh = val;
3650 return 0;
3651 }
bbd6ef87
PM
3652 case PACKET_VERSION:
3653 {
3654 int val;
3655
3656 if (optlen != sizeof(val))
3657 return -EINVAL;
69e3c75f 3658 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3659 return -EBUSY;
3660 if (copy_from_user(&val, optval, sizeof(val)))
3661 return -EFAULT;
3662 switch (val) {
3663 case TPACKET_V1:
3664 case TPACKET_V2:
f6fb8f10 3665 case TPACKET_V3:
bbd6ef87
PM
3666 po->tp_version = val;
3667 return 0;
3668 default:
3669 return -EINVAL;
3670 }
3671 }
8913336a
PM
3672 case PACKET_RESERVE:
3673 {
3674 unsigned int val;
3675
3676 if (optlen != sizeof(val))
3677 return -EINVAL;
69e3c75f 3678 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3679 return -EBUSY;
3680 if (copy_from_user(&val, optval, sizeof(val)))
3681 return -EFAULT;
3682 po->tp_reserve = val;
3683 return 0;
3684 }
69e3c75f
JB
3685 case PACKET_LOSS:
3686 {
3687 unsigned int val;
3688
3689 if (optlen != sizeof(val))
3690 return -EINVAL;
3691 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3692 return -EBUSY;
3693 if (copy_from_user(&val, optval, sizeof(val)))
3694 return -EFAULT;
3695 po->tp_loss = !!val;
3696 return 0;
3697 }
8dc41944
HX
3698 case PACKET_AUXDATA:
3699 {
3700 int val;
3701
3702 if (optlen < sizeof(val))
3703 return -EINVAL;
3704 if (copy_from_user(&val, optval, sizeof(val)))
3705 return -EFAULT;
3706
3707 po->auxdata = !!val;
3708 return 0;
3709 }
80feaacb
PWJ
3710 case PACKET_ORIGDEV:
3711 {
3712 int val;
3713
3714 if (optlen < sizeof(val))
3715 return -EINVAL;
3716 if (copy_from_user(&val, optval, sizeof(val)))
3717 return -EFAULT;
3718
3719 po->origdev = !!val;
3720 return 0;
3721 }
bfd5f4a3
SS
3722 case PACKET_VNET_HDR:
3723 {
3724 int val;
3725
3726 if (sock->type != SOCK_RAW)
3727 return -EINVAL;
3728 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3729 return -EBUSY;
3730 if (optlen < sizeof(val))
3731 return -EINVAL;
3732 if (copy_from_user(&val, optval, sizeof(val)))
3733 return -EFAULT;
3734
3735 po->has_vnet_hdr = !!val;
3736 return 0;
3737 }
614f60fa
SM
3738 case PACKET_TIMESTAMP:
3739 {
3740 int val;
3741
3742 if (optlen != sizeof(val))
3743 return -EINVAL;
3744 if (copy_from_user(&val, optval, sizeof(val)))
3745 return -EFAULT;
3746
3747 po->tp_tstamp = val;
3748 return 0;
3749 }
dc99f600
DM
3750 case PACKET_FANOUT:
3751 {
3752 int val;
3753
3754 if (optlen != sizeof(val))
3755 return -EINVAL;
3756 if (copy_from_user(&val, optval, sizeof(val)))
3757 return -EFAULT;
3758
3759 return fanout_add(sk, val & 0xffff, val >> 16);
3760 }
47dceb8e
WB
3761 case PACKET_FANOUT_DATA:
3762 {
3763 if (!po->fanout)
3764 return -EINVAL;
3765
3766 return fanout_set_data(po, optval, optlen);
3767 }
5920cd3a
PC
3768 case PACKET_TX_HAS_OFF:
3769 {
3770 unsigned int val;
3771
3772 if (optlen != sizeof(val))
3773 return -EINVAL;
3774 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3775 return -EBUSY;
3776 if (copy_from_user(&val, optval, sizeof(val)))
3777 return -EFAULT;
3778 po->tp_tx_has_off = !!val;
3779 return 0;
3780 }
d346a3fa
DB
3781 case PACKET_QDISC_BYPASS:
3782 {
3783 int val;
3784
3785 if (optlen != sizeof(val))
3786 return -EINVAL;
3787 if (copy_from_user(&val, optval, sizeof(val)))
3788 return -EFAULT;
3789
3790 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3791 return 0;
3792 }
1da177e4
LT
3793 default:
3794 return -ENOPROTOOPT;
3795 }
3796}
3797
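/*
 * A minimal user-space sketch of two of the options handled above:
 * joining promiscuous mode through PACKET_ADD_MEMBERSHIP (serviced by
 * packet_mc_add()/packet_mc_drop() above) and enabling PACKET_AUXDATA
 * so struct tpacket_auxdata arrives as a cmsg.  Illustrative only; the
 * ifindex is assumed to be valid and error handling is trimmed.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <string.h>

static int enable_promisc_and_auxdata(int fd, int ifindex)
{
	struct packet_mreq mreq;
	int one = 1;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type    = PACKET_MR_PROMISC;	/* mr_alen/mr_address unused */

	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
		       &mreq, sizeof(mreq)) < 0)
		return -1;

	return setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
}
#endif
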
3798static int packet_getsockopt(struct socket *sock, int level, int optname,
3799 char __user *optval, int __user *optlen)
3800{
3801 int len;
c06fff6e 3802 int val, lv = sizeof(val);
1da177e4
LT
3803 struct sock *sk = sock->sk;
3804 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3805 void *data = &val;
ee80fbf3 3806 union tpacket_stats_u st;
a9b63918 3807 struct tpacket_rollover_stats rstats;
1da177e4
LT
3808
3809 if (level != SOL_PACKET)
3810 return -ENOPROTOOPT;
3811
8ae55f04
KK
3812 if (get_user(len, optlen))
3813 return -EFAULT;
1da177e4
LT
3814
3815 if (len < 0)
3816 return -EINVAL;
1ce4f28b 3817
69e3c75f 3818 switch (optname) {
1da177e4 3819 case PACKET_STATISTICS:
1da177e4 3820 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3821 memcpy(&st, &po->stats, sizeof(st));
3822 memset(&po->stats, 0, sizeof(po->stats));
3823 spin_unlock_bh(&sk->sk_receive_queue.lock);
3824
f6fb8f10 3825 if (po->tp_version == TPACKET_V3) {
c06fff6e 3826 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3827 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3828 data = &st.stats3;
f6fb8f10 3829 } else {
c06fff6e 3830 lv = sizeof(struct tpacket_stats);
8bcdeaff 3831 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3832 data = &st.stats1;
f6fb8f10 3833 }
ee80fbf3 3834
8dc41944
HX
3835 break;
3836 case PACKET_AUXDATA:
8dc41944 3837 val = po->auxdata;
80feaacb
PWJ
3838 break;
3839 case PACKET_ORIGDEV:
80feaacb 3840 val = po->origdev;
bfd5f4a3
SS
3841 break;
3842 case PACKET_VNET_HDR:
bfd5f4a3 3843 val = po->has_vnet_hdr;
1da177e4 3844 break;
bbd6ef87 3845 case PACKET_VERSION:
bbd6ef87 3846 val = po->tp_version;
bbd6ef87
PM
3847 break;
3848 case PACKET_HDRLEN:
3849 if (len > sizeof(int))
3850 len = sizeof(int);
3851 if (copy_from_user(&val, optval, len))
3852 return -EFAULT;
3853 switch (val) {
3854 case TPACKET_V1:
3855 val = sizeof(struct tpacket_hdr);
3856 break;
3857 case TPACKET_V2:
3858 val = sizeof(struct tpacket2_hdr);
3859 break;
f6fb8f10 3860 case TPACKET_V3:
3861 val = sizeof(struct tpacket3_hdr);
3862 break;
bbd6ef87
PM
3863 default:
3864 return -EINVAL;
3865 }
bbd6ef87 3866 break;
8913336a 3867 case PACKET_RESERVE:
8913336a 3868 val = po->tp_reserve;
8913336a 3869 break;
69e3c75f 3870 case PACKET_LOSS:
69e3c75f 3871 val = po->tp_loss;
69e3c75f 3872 break;
614f60fa 3873 case PACKET_TIMESTAMP:
614f60fa 3874 val = po->tp_tstamp;
614f60fa 3875 break;
dc99f600 3876 case PACKET_FANOUT:
dc99f600
DM
3877 val = (po->fanout ?
3878 ((u32)po->fanout->id |
77f65ebd
WB
3879 ((u32)po->fanout->type << 16) |
3880 ((u32)po->fanout->flags << 24)) :
dc99f600 3881 0);
dc99f600 3882 break;
a9b63918
WB
3883 case PACKET_ROLLOVER_STATS:
3884 if (!po->rollover)
3885 return -EINVAL;
3886 rstats.tp_all = atomic_long_read(&po->rollover->num);
3887 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3888 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3889 data = &rstats;
3890 lv = sizeof(rstats);
3891 break;
5920cd3a
PC
3892 case PACKET_TX_HAS_OFF:
3893 val = po->tp_tx_has_off;
3894 break;
d346a3fa
DB
3895 case PACKET_QDISC_BYPASS:
3896 val = packet_use_direct_xmit(po);
3897 break;
1da177e4
LT
3898 default:
3899 return -ENOPROTOOPT;
3900 }
3901
c06fff6e
ED
3902 if (len > lv)
3903 len = lv;
8ae55f04
KK
3904 if (put_user(len, optlen))
3905 return -EFAULT;
8dc41944
HX
3906 if (copy_to_user(optval, data, len))
3907 return -EFAULT;
8ae55f04 3908 return 0;
1da177e4
LT
3909}
3910
3911
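/*
 * Sketch of the matching getsockopt() side: PACKET_STATISTICS copies the
 * counters out and resets them, and tp_packets already includes tp_drops
 * (see the += above).  Assumes a TPACKET_V1/V2 socket, which returns
 * struct tpacket_stats; TPACKET_V3 returns struct tpacket_stats_v3.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <stdio.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("seen %u packets (%u dropped) since last read\n",
		       st.tp_packets, st.tp_drops);
}
#endif
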
719c44d3
WB
3912#ifdef CONFIG_COMPAT
3913static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3914 char __user *optval, unsigned int optlen)
3915{
3916 struct packet_sock *po = pkt_sk(sock->sk);
3917
3918 if (level != SOL_PACKET)
3919 return -ENOPROTOOPT;
3920
3921 if (optname == PACKET_FANOUT_DATA &&
3922 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3923 optval = (char __user *)get_compat_bpf_fprog(optval);
3924 if (!optval)
3925 return -EFAULT;
3926 optlen = sizeof(struct sock_fprog);
3927 }
3928
3929 return packet_setsockopt(sock, level, optname, optval, optlen);
3930}
3931#endif
3932
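/*
 * Sketch of the fanout options that the compat shim above exists for:
 * join a fanout group with PACKET_FANOUT, then (for the CBPF mode) load
 * a classic BPF program through PACKET_FANOUT_DATA.  The group id 42 and
 * the trivial "always pick socket 0" program are placeholders.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/filter.h>

static int join_cbpf_fanout(int fd)
{
	int arg = 42 | (PACKET_FANOUT_CBPF << 16);
	struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 0),	/* always select queue 0 */
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg)) < 0)
		return -1;

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA,
			  &prog, sizeof(prog));
}
#endif
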
351638e7
JP
3933static int packet_notifier(struct notifier_block *this,
3934 unsigned long msg, void *ptr)
1da177e4
LT
3935{
3936 struct sock *sk;
351638e7 3937 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3938 struct net *net = dev_net(dev);
1da177e4 3939
808f5114 3940 rcu_read_lock();
b67bfe0d 3941 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3942 struct packet_sock *po = pkt_sk(sk);
3943
3944 switch (msg) {
3945 case NETDEV_UNREGISTER:
1da177e4 3946 if (po->mclist)
82f17091 3947 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3948 /* fallthrough */
3949
1da177e4
LT
3950 case NETDEV_DOWN:
3951 if (dev->ifindex == po->ifindex) {
3952 spin_lock(&po->bind_lock);
3953 if (po->running) {
ce06b03e 3954 __unregister_prot_hook(sk, false);
1da177e4
LT
3955 sk->sk_err = ENETDOWN;
3956 if (!sock_flag(sk, SOCK_DEAD))
3957 sk->sk_error_report(sk);
3958 }
3959 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3960 packet_cached_dev_reset(po);
1da177e4 3961 po->ifindex = -1;
160ff18a
BG
3962 if (po->prot_hook.dev)
3963 dev_put(po->prot_hook.dev);
1da177e4
LT
3964 po->prot_hook.dev = NULL;
3965 }
3966 spin_unlock(&po->bind_lock);
3967 }
3968 break;
3969 case NETDEV_UP:
808f5114 3970 if (dev->ifindex == po->ifindex) {
3971 spin_lock(&po->bind_lock);
ce06b03e
DM
3972 if (po->num)
3973 register_prot_hook(sk);
808f5114 3974 spin_unlock(&po->bind_lock);
1da177e4 3975 }
1da177e4
LT
3976 break;
3977 }
3978 }
808f5114 3979 rcu_read_unlock();
1da177e4
LT
3980 return NOTIFY_DONE;
3981}
3982
3983
3984static int packet_ioctl(struct socket *sock, unsigned int cmd,
3985 unsigned long arg)
3986{
3987 struct sock *sk = sock->sk;
3988
69e3c75f 3989 switch (cmd) {
40d4e3df
ED
3990 case SIOCOUTQ:
3991 {
3992 int amount = sk_wmem_alloc_get(sk);
31e6d363 3993
40d4e3df
ED
3994 return put_user(amount, (int __user *)arg);
3995 }
3996 case SIOCINQ:
3997 {
3998 struct sk_buff *skb;
3999 int amount = 0;
4000
4001 spin_lock_bh(&sk->sk_receive_queue.lock);
4002 skb = skb_peek(&sk->sk_receive_queue);
4003 if (skb)
4004 amount = skb->len;
4005 spin_unlock_bh(&sk->sk_receive_queue.lock);
4006 return put_user(amount, (int __user *)arg);
4007 }
4008 case SIOCGSTAMP:
4009 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4010 case SIOCGSTAMPNS:
4011 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4012
1da177e4 4013#ifdef CONFIG_INET
40d4e3df
ED
4014 case SIOCADDRT:
4015 case SIOCDELRT:
4016 case SIOCDARP:
4017 case SIOCGARP:
4018 case SIOCSARP:
4019 case SIOCGIFADDR:
4020 case SIOCSIFADDR:
4021 case SIOCGIFBRDADDR:
4022 case SIOCSIFBRDADDR:
4023 case SIOCGIFNETMASK:
4024 case SIOCSIFNETMASK:
4025 case SIOCGIFDSTADDR:
4026 case SIOCSIFDSTADDR:
4027 case SIOCSIFFLAGS:
40d4e3df 4028 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4029#endif
4030
40d4e3df
ED
4031 default:
4032 return -ENOIOCTLCMD;
1da177e4
LT
4033 }
4034 return 0;
4035}
4036
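/*
 * Sketch of the ioctl()s handled above as seen from user space: SIOCINQ
 * reports the length of the next queued packet (not the whole backlog),
 * SIOCOUTQ the bytes still charged to the write queue.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <stdio.h>

static void dump_queue_lengths(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next rx packet: %d bytes, pending tx: %d bytes\n",
		       inq, outq);
}
#endif
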
40d4e3df 4037static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4038 poll_table *wait)
4039{
4040 struct sock *sk = sock->sk;
4041 struct packet_sock *po = pkt_sk(sk);
4042 unsigned int mask = datagram_poll(file, sock, wait);
4043
4044 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4045 if (po->rx_ring.pg_vec) {
f6fb8f10 4046 if (!packet_previous_rx_frame(po, &po->rx_ring,
4047 TP_STATUS_KERNEL))
1da177e4
LT
4048 mask |= POLLIN | POLLRDNORM;
4049 }
2ccdbaa6 4050 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4051 po->pressure = 0;
1da177e4 4052 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4053 spin_lock_bh(&sk->sk_write_queue.lock);
4054 if (po->tx_ring.pg_vec) {
4055 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4056 mask |= POLLOUT | POLLWRNORM;
4057 }
4058 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4059 return mask;
4060}
4061
4062
 4063/* Dirty? Well, I still have not found a better way to account
4064 * for user mmaps.
4065 */
4066
4067static void packet_mm_open(struct vm_area_struct *vma)
4068{
4069 struct file *file = vma->vm_file;
40d4e3df 4070 struct socket *sock = file->private_data;
1da177e4 4071 struct sock *sk = sock->sk;
1ce4f28b 4072
1da177e4
LT
4073 if (sk)
4074 atomic_inc(&pkt_sk(sk)->mapped);
4075}
4076
4077static void packet_mm_close(struct vm_area_struct *vma)
4078{
4079 struct file *file = vma->vm_file;
40d4e3df 4080 struct socket *sock = file->private_data;
1da177e4 4081 struct sock *sk = sock->sk;
1ce4f28b 4082
1da177e4
LT
4083 if (sk)
4084 atomic_dec(&pkt_sk(sk)->mapped);
4085}
4086
f0f37e2f 4087static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4088 .open = packet_mm_open,
4089 .close = packet_mm_close,
1da177e4
LT
4090};
4091
0e3125c7
NH
4092static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4093 unsigned int len)
1da177e4
LT
4094{
4095 int i;
4096
4ebf0ae2 4097 for (i = 0; i < len; i++) {
0e3125c7 4098 if (likely(pg_vec[i].buffer)) {
c56b4d90 4099 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4100 vfree(pg_vec[i].buffer);
4101 else
4102 free_pages((unsigned long)pg_vec[i].buffer,
4103 order);
4104 pg_vec[i].buffer = NULL;
4105 }
1da177e4
LT
4106 }
4107 kfree(pg_vec);
4108}
4109
eea49cc9 4110static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4111{
f0d4eb29 4112 char *buffer;
0e3125c7
NH
4113 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4114 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4115
4116 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4117 if (buffer)
4118 return buffer;
4119
f0d4eb29 4120 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4121 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4122 if (buffer)
4123 return buffer;
4124
f0d4eb29 4125 /* vmalloc failed, let's dig into swap here */
0e3125c7 4126 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4127 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4128 if (buffer)
4129 return buffer;
4130
f0d4eb29 4131 /* complete and utter failure */
0e3125c7 4132 return NULL;
4ebf0ae2
DM
4133}
4134
0e3125c7 4135static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4136{
4137 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4138 struct pgv *pg_vec;
4ebf0ae2
DM
4139 int i;
4140
0e3125c7 4141 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4142 if (unlikely(!pg_vec))
4143 goto out;
4144
4145 for (i = 0; i < block_nr; i++) {
c56b4d90 4146 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4147 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4148 goto out_free_pgvec;
4149 }
4150
4151out:
4152 return pg_vec;
4153
4154out_free_pgvec:
4155 free_pg_vec(pg_vec, order, block_nr);
4156 pg_vec = NULL;
4157 goto out;
4158}
1da177e4 4159
f6fb8f10 4160static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4161 int closing, int tx_ring)
1da177e4 4162{
0e3125c7 4163 struct pgv *pg_vec = NULL;
1da177e4 4164 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4165 int was_running, order = 0;
69e3c75f
JB
4166 struct packet_ring_buffer *rb;
4167 struct sk_buff_head *rb_queue;
0e11c91e 4168 __be16 num;
f6fb8f10 4169 int err = -EINVAL;
 4170 /* Local alias added to minimize code churn */
4171 struct tpacket_req *req = &req_u->req;
4172
4173 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4174 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
6ae81ced 4175 net_warn_ratelimited("Tx-ring is not supported.\n");
f6fb8f10 4176 goto out;
4177 }
1ce4f28b 4178
69e3c75f
JB
4179 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4180 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4181
69e3c75f
JB
4182 err = -EBUSY;
4183 if (!closing) {
4184 if (atomic_read(&po->mapped))
4185 goto out;
b0138408 4186 if (packet_read_pending(rb))
69e3c75f
JB
4187 goto out;
4188 }
1da177e4 4189
69e3c75f
JB
4190 if (req->tp_block_nr) {
4191 /* Sanity tests and some calculations */
4192 err = -EBUSY;
4193 if (unlikely(rb->pg_vec))
4194 goto out;
1da177e4 4195
bbd6ef87
PM
4196 switch (po->tp_version) {
4197 case TPACKET_V1:
4198 po->tp_hdrlen = TPACKET_HDRLEN;
4199 break;
4200 case TPACKET_V2:
4201 po->tp_hdrlen = TPACKET2_HDRLEN;
4202 break;
f6fb8f10 4203 case TPACKET_V3:
4204 po->tp_hdrlen = TPACKET3_HDRLEN;
4205 break;
bbd6ef87
PM
4206 }
4207
69e3c75f 4208 err = -EINVAL;
4ebf0ae2 4209 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4210 goto out;
90836b67 4211 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4212 goto out;
dc808110
ED
4213 if (po->tp_version >= TPACKET_V3 &&
4214 (int)(req->tp_block_size -
4215 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4216 goto out;
8913336a 4217 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4218 po->tp_reserve))
4219 goto out;
4ebf0ae2 4220 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4221 goto out;
1da177e4 4222
4194b491
TK
4223 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4224 if (unlikely(rb->frames_per_block == 0))
69e3c75f
JB
4225 goto out;
4226 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4227 req->tp_frame_nr))
4228 goto out;
1da177e4
LT
4229
4230 err = -ENOMEM;
4ebf0ae2
DM
4231 order = get_order(req->tp_block_size);
4232 pg_vec = alloc_pg_vec(req, order);
4233 if (unlikely(!pg_vec))
1da177e4 4234 goto out;
f6fb8f10 4235 switch (po->tp_version) {
4236 case TPACKET_V3:
 4237 /* The transmit path is not supported. This was checked
 4238 * above, but stay paranoid
4239 */
4240 if (!tx_ring)
e8e85cc5 4241 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4242 break;
f6fb8f10 4243 default:
4244 break;
4245 }
69e3c75f
JB
4246 }
4247 /* Done */
4248 else {
4249 err = -EINVAL;
4ebf0ae2 4250 if (unlikely(req->tp_frame_nr))
69e3c75f 4251 goto out;
1da177e4
LT
4252 }
4253
4254 lock_sock(sk);
4255
4256 /* Detach socket from network */
4257 spin_lock(&po->bind_lock);
4258 was_running = po->running;
4259 num = po->num;
4260 if (was_running) {
1da177e4 4261 po->num = 0;
ce06b03e 4262 __unregister_prot_hook(sk, false);
1da177e4
LT
4263 }
4264 spin_unlock(&po->bind_lock);
1ce4f28b 4265
1da177e4
LT
4266 synchronize_net();
4267
4268 err = -EBUSY;
905db440 4269 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4270 if (closing || atomic_read(&po->mapped) == 0) {
4271 err = 0;
69e3c75f 4272 spin_lock_bh(&rb_queue->lock);
c053fd96 4273 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4274 rb->frame_max = (req->tp_frame_nr - 1);
4275 rb->head = 0;
4276 rb->frame_size = req->tp_frame_size;
4277 spin_unlock_bh(&rb_queue->lock);
4278
c053fd96
CG
4279 swap(rb->pg_vec_order, order);
4280 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4281
4282 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4283 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4284 tpacket_rcv : packet_rcv;
4285 skb_queue_purge(rb_queue);
1da177e4 4286 if (atomic_read(&po->mapped))
40d4e3df
ED
4287 pr_err("packet_mmap: vma is busy: %d\n",
4288 atomic_read(&po->mapped));
1da177e4 4289 }
905db440 4290 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4291
4292 spin_lock(&po->bind_lock);
ce06b03e 4293 if (was_running) {
1da177e4 4294 po->num = num;
ce06b03e 4295 register_prot_hook(sk);
1da177e4
LT
4296 }
4297 spin_unlock(&po->bind_lock);
f6fb8f10 4298 if (closing && (po->tp_version > TPACKET_V2)) {
4299 /* Because we don't support block-based V3 on tx-ring */
4300 if (!tx_ring)
73d0fcf2 4301 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4302 }
1da177e4
LT
4303 release_sock(sk);
4304
1da177e4
LT
4305 if (pg_vec)
4306 free_pg_vec(pg_vec, order, req->tp_block_nr);
4307out:
4308 return err;
4309}
4310
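/*
 * The checks in packet_set_ring() translate into these user-space
 * constraints.  A hedged TPACKET_V3 example: block size page-aligned,
 * frame size a TPACKET_ALIGNMENT multiple at least tp_hdrlen plus
 * tp_reserve, and tp_frame_nr exactly frames-per-block times the block
 * count.  The geometry and timeout values are arbitrary placeholders.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <string.h>

static int setup_v3_rx_ring(int fd)
{
	int version = TPACKET_V3;
	struct tpacket_req3 req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size     = 1 << 22;	/* must be PAGE_ALIGNED */
	req.tp_block_nr       = 64;
	req.tp_frame_size     = 1 << 11;	/* multiple of TPACKET_ALIGNMENT */
	req.tp_frame_nr       = (req.tp_block_size / req.tp_frame_size) *
				req.tp_block_nr;
	req.tp_retire_blk_tov = 60;		/* ms before a block is retired */
	req.tp_sizeof_priv    = 0;		/* per-block private area, if any */

	/* Only the RX ring is supported for TPACKET_V3 here; TX is rejected. */
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}
#endif
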
69e3c75f
JB
4311static int packet_mmap(struct file *file, struct socket *sock,
4312 struct vm_area_struct *vma)
1da177e4
LT
4313{
4314 struct sock *sk = sock->sk;
4315 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4316 unsigned long size, expected_size;
4317 struct packet_ring_buffer *rb;
1da177e4
LT
4318 unsigned long start;
4319 int err = -EINVAL;
4320 int i;
4321
4322 if (vma->vm_pgoff)
4323 return -EINVAL;
4324
905db440 4325 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4326
4327 expected_size = 0;
4328 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4329 if (rb->pg_vec) {
4330 expected_size += rb->pg_vec_len
4331 * rb->pg_vec_pages
4332 * PAGE_SIZE;
4333 }
4334 }
4335
4336 if (expected_size == 0)
1da177e4 4337 goto out;
69e3c75f
JB
4338
4339 size = vma->vm_end - vma->vm_start;
4340 if (size != expected_size)
1da177e4
LT
4341 goto out;
4342
1da177e4 4343 start = vma->vm_start;
69e3c75f
JB
4344 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4345 if (rb->pg_vec == NULL)
4346 continue;
4347
4348 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4349 struct page *page;
4350 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4351 int pg_num;
4352
c56b4d90
CG
4353 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4354 page = pgv_to_page(kaddr);
69e3c75f
JB
4355 err = vm_insert_page(vma, start, page);
4356 if (unlikely(err))
4357 goto out;
4358 start += PAGE_SIZE;
0e3125c7 4359 kaddr += PAGE_SIZE;
69e3c75f 4360 }
4ebf0ae2 4361 }
1da177e4 4362 }
69e3c75f 4363
4ebf0ae2 4364 atomic_inc(&po->mapped);
1da177e4
LT
4365 vma->vm_ops = &packet_mmap_ops;
4366 err = 0;
4367
4368out:
905db440 4369 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4370 return err;
4371}
1da177e4 4372
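/*
 * User-space counterpart of packet_mmap()/packet_poll(): map the ring in
 * a single VMA at offset 0 and walk the frames, returning each one to the
 * kernel by writing TP_STATUS_KERNEL.  A sketch for a TPACKET_V2 RX-only
 * socket whose tp_block_size is a multiple of tp_frame_size, so the ring
 * is one contiguous run of frame_nr frames of frame_size bytes.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/mman.h>
#include <poll.h>
#include <linux/if_packet.h>
#include <stdio.h>

static void rx_loop(int fd, unsigned int frame_nr, unsigned int frame_size)
{
	size_t map_len = (size_t)frame_nr * frame_size;
	char *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	unsigned int i = 0;

	if (ring == MAP_FAILED)
		return;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(void *)(ring + (size_t)i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* wait for the next frame */
			continue;
		}

		printf("frame %u: %u bytes\n", i, hdr->tp_len);

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand it back */
		i = (i + 1) % frame_nr;
	}
}
#endif
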
90ddc4f0 4373static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4374 .family = PF_PACKET,
4375 .owner = THIS_MODULE,
4376 .release = packet_release,
4377 .bind = packet_bind_spkt,
4378 .connect = sock_no_connect,
4379 .socketpair = sock_no_socketpair,
4380 .accept = sock_no_accept,
4381 .getname = packet_getname_spkt,
4382 .poll = datagram_poll,
4383 .ioctl = packet_ioctl,
4384 .listen = sock_no_listen,
4385 .shutdown = sock_no_shutdown,
4386 .setsockopt = sock_no_setsockopt,
4387 .getsockopt = sock_no_getsockopt,
4388 .sendmsg = packet_sendmsg_spkt,
4389 .recvmsg = packet_recvmsg,
4390 .mmap = sock_no_mmap,
4391 .sendpage = sock_no_sendpage,
4392};
1da177e4 4393
90ddc4f0 4394static const struct proto_ops packet_ops = {
1da177e4
LT
4395 .family = PF_PACKET,
4396 .owner = THIS_MODULE,
4397 .release = packet_release,
4398 .bind = packet_bind,
4399 .connect = sock_no_connect,
4400 .socketpair = sock_no_socketpair,
4401 .accept = sock_no_accept,
1ce4f28b 4402 .getname = packet_getname,
1da177e4
LT
4403 .poll = packet_poll,
4404 .ioctl = packet_ioctl,
4405 .listen = sock_no_listen,
4406 .shutdown = sock_no_shutdown,
4407 .setsockopt = packet_setsockopt,
4408 .getsockopt = packet_getsockopt,
719c44d3
WB
4409#ifdef CONFIG_COMPAT
4410 .compat_setsockopt = compat_packet_setsockopt,
4411#endif
1da177e4
LT
4412 .sendmsg = packet_sendmsg,
4413 .recvmsg = packet_recvmsg,
4414 .mmap = packet_mmap,
4415 .sendpage = sock_no_sendpage,
4416};
4417
ec1b4cf7 4418static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4419 .family = PF_PACKET,
4420 .create = packet_create,
4421 .owner = THIS_MODULE,
4422};
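/*
 * The proto_ops and family definition above are what a user-space
 * socket(AF_PACKET, ...) call ends up using.  A minimal sketch of opening
 * a raw packet socket and binding it to one interface; the interface name
 * is caller-supplied and error handling is trimmed.
 */
#if 0	/* example, not built as part of af_packet.c */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <string.h>

static int open_packet_socket(const char *ifname)
{
	struct sockaddr_ll sll;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return -1;

	return fd;
}
#endif
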
4423
4424static struct notifier_block packet_netdev_notifier = {
40d4e3df 4425 .notifier_call = packet_notifier,
1da177e4
LT
4426};
4427
4428#ifdef CONFIG_PROC_FS
1da177e4
LT
4429
4430static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4431 __acquires(RCU)
1da177e4 4432{
e372c414 4433 struct net *net = seq_file_net(seq);
808f5114 4434
4435 rcu_read_lock();
4436 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4437}
4438
4439static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4440{
1bf40954 4441 struct net *net = seq_file_net(seq);
808f5114 4442 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4443}
4444
4445static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4446 __releases(RCU)
1da177e4 4447{
808f5114 4448 rcu_read_unlock();
1da177e4
LT
4449}
4450
1ce4f28b 4451static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4452{
4453 if (v == SEQ_START_TOKEN)
4454 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4455 else {
b7ceabd9 4456 struct sock *s = sk_entry(v);
1da177e4
LT
4457 const struct packet_sock *po = pkt_sk(s);
4458
4459 seq_printf(seq,
71338aa7 4460 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4461 s,
4462 atomic_read(&s->sk_refcnt),
4463 s->sk_type,
4464 ntohs(po->num),
4465 po->ifindex,
4466 po->running,
4467 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4468 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4469 sock_i_ino(s));
1da177e4
LT
4470 }
4471
4472 return 0;
4473}
4474
56b3d975 4475static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4476 .start = packet_seq_start,
4477 .next = packet_seq_next,
4478 .stop = packet_seq_stop,
4479 .show = packet_seq_show,
4480};
4481
4482static int packet_seq_open(struct inode *inode, struct file *file)
4483{
e372c414
DL
4484 return seq_open_net(inode, file, &packet_seq_ops,
4485 sizeof(struct seq_net_private));
1da177e4
LT
4486}
4487
da7071d7 4488static const struct file_operations packet_seq_fops = {
1da177e4
LT
4489 .owner = THIS_MODULE,
4490 .open = packet_seq_open,
4491 .read = seq_read,
4492 .llseek = seq_lseek,
e372c414 4493 .release = seq_release_net,
1da177e4
LT
4494};
4495
4496#endif
4497
2c8c1e72 4498static int __net_init packet_net_init(struct net *net)
d12d01d6 4499{
0fa7fa98 4500 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4501 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4502
d4beaa66 4503 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4504 return -ENOMEM;
4505
4506 return 0;
4507}
4508
2c8c1e72 4509static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4510{
ece31ffd 4511 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4512}
4513
4514static struct pernet_operations packet_net_ops = {
4515 .init = packet_net_init,
4516 .exit = packet_net_exit,
4517};
4518
4519
1da177e4
LT
4520static void __exit packet_exit(void)
4521{
1da177e4 4522 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4523 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4524 sock_unregister(PF_PACKET);
4525 proto_unregister(&packet_proto);
4526}
4527
4528static int __init packet_init(void)
4529{
4530 int rc = proto_register(&packet_proto, 0);
4531
4532 if (rc != 0)
4533 goto out;
4534
4535 sock_register(&packet_family_ops);
d12d01d6 4536 register_pernet_subsys(&packet_net_ops);
1da177e4 4537 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4538out:
4539 return rc;
4540}
4541
4542module_init(packet_init);
4543module_exit(packet_exit);
4544MODULE_LICENSE("GPL");
4545MODULE_ALIAS_NETPROTO(PF_PACKET);