[deliverable/linux.git] / net / packet / af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
1da177e4 96
2787b04b
PE
97#include "internal.h"
98
1da177e4
LT
99/*
100 Assumptions:
 101 - if the device has no dev->hard_header routine, it adds and removes the ll
 102 header itself. In this case the ll header is invisible outside of the
 103 device, but higher levels should still reserve dev->hard_header_len.
 104 Some devices are clever enough to reallocate the skb when the header
 105 does not fit into the reserved space (tunnels); others are silly
 106 (PPP).
 107 - the packet socket receives packets with the ll header already pulled,
 108 so SOCK_RAW should push it back.
109
110On receive:
111-----------
112
113Incoming, dev->hard_header!=NULL
b0e380b1
ACM
114 mac_header -> ll header
115 data -> data
1da177e4
LT
116
117Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
118 mac_header -> ll header
119 data -> ll header
1da177e4
LT
120
121Incoming, dev->hard_header==NULL
b0e380b1
ACM
 122 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 123 header. PPP does this, which is wrong because it introduces
db0c58f9 124 asymmetry between the rx and tx paths.
b0e380b1 125 data -> data
1da177e4
LT
126
127Outgoing, dev->hard_header==NULL
b0e380b1
ACM
128 mac_header -> data. ll header is still not built!
129 data -> data
1da177e4
LT
130
 131Summary
 132 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
133
134
135On transmit:
136------------
137
138dev->hard_header != NULL
b0e380b1
ACM
139 mac_header -> ll header
140 data -> ll header
1da177e4
LT
141
142dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
143 mac_header -> data
144 data -> data
1da177e4
LT
145
 146 We should set nh.raw on output to the correct position;
 147 the packet classifier depends on it.
148 */
149
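/*
 * Illustrative user-space sketch of the two receive flavours described above
 * (a hedged example, not part of af_packet.c; it assumes the usual
 * <sys/socket.h>, <linux/if_packet.h> and <linux/if_ether.h> definitions):
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *
 *	char buf[2048];
 *	recv(raw, buf, sizeof(buf), 0);   - data starts at the ll (MAC) header
 *	recv(dgr, buf, sizeof(buf), 0);   - ll header already stripped; sender
 *	                                    details arrive via sockaddr_ll instead
 */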
1da177e4
LT
150/* Private packet socket structures. */
151
0fb375fb
EB
152/* identical to struct packet_mreq except it has
153 * a longer address field.
154 */
40d4e3df 155struct packet_mreq_max {
0fb375fb
EB
156 int mr_ifindex;
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 160};
a2efcfa0 161
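/*
 * A hedged usage sketch for the struct above: user space passes a plain
 * struct packet_mreq (8-byte mr_address) or a longer variant, and setsockopt()
 * copies whatever was supplied into packet_mreq_max, which is why longer
 * hardware addresses simply extend the address array. Example, assuming
 * "ifindex" already holds a valid interface index:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */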
184f489e
DB
162union tpacket_uhdr {
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
166 void *raw;
167};
168
f6fb8f10 169static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
170 int closing, int tx_ring);
171
f6fb8f10 172#define V3_ALIGNMENT (8)
173
bc59ba39 174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
f6fb8f10 179#define PGV_FROM_VMALLOC 1
69e3c75f 180
f6fb8f10 181#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
182#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
183#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
184#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
188
69e3c75f
JB
189struct packet_sock;
190static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
191static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
192 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 193
f6fb8f10 194static void *packet_previous_frame(struct packet_sock *po,
195 struct packet_ring_buffer *rb,
196 int status);
197static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 198static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
199 struct tpacket_block_desc *);
200static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 201 struct packet_sock *);
bc59ba39 202static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 203 struct packet_sock *, unsigned int status);
bc59ba39 204static int prb_queue_frozen(struct tpacket_kbdq_core *);
205static void prb_open_block(struct tpacket_kbdq_core *,
206 struct tpacket_block_desc *);
f6fb8f10 207static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 208static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
209static void prb_init_blk_timer(struct packet_sock *,
210 struct tpacket_kbdq_core *,
211 void (*func) (unsigned long));
212static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
213static void prb_clear_rxhash(struct tpacket_kbdq_core *,
214 struct tpacket3_hdr *);
215static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
216 struct tpacket3_hdr *);
1da177e4
LT
217static void packet_flush_mclist(struct sock *sk);
218
ffbc6111 219struct packet_skb_cb {
ffbc6111
HX
220 union {
221 struct sockaddr_pkt pkt;
2472d761
EB
222 union {
223 /* Trick: alias skb original length with
224 * ll.sll_family and ll.protocol in order
225 * to save room.
226 */
227 unsigned int origlen;
228 struct sockaddr_ll ll;
229 };
ffbc6111
HX
230 } sa;
231};
232
d3869efe
DW
233#define vio_le() virtio_legacy_is_little_endian()
234
ffbc6111 235#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 236
bc59ba39 237#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 238#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 239 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 240#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 241 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 242#define GET_NEXT_PRB_BLK_NUM(x) \
243 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
244 ((x)->kactive_blk_num+1) : 0)
245
dc99f600
DM
246static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
247static void __fanout_link(struct sock *sk, struct packet_sock *po);
248
d346a3fa
DB
249static int packet_direct_xmit(struct sk_buff *skb)
250{
251 struct net_device *dev = skb->dev;
d346a3fa
DB
252 netdev_features_t features;
253 struct netdev_queue *txq;
43279500 254 int ret = NETDEV_TX_BUSY;
d346a3fa
DB
255
256 if (unlikely(!netif_running(dev) ||
43279500
DB
257 !netif_carrier_ok(dev)))
258 goto drop;
d346a3fa
DB
259
260 features = netif_skb_features(skb);
261 if (skb_needs_linearize(skb, features) &&
43279500
DB
262 __skb_linearize(skb))
263 goto drop;
d346a3fa 264
10c51b56 265 txq = skb_get_tx_queue(dev, skb);
d346a3fa 266
43279500
DB
267 local_bh_disable();
268
269 HARD_TX_LOCK(dev, txq, smp_processor_id());
10b3ad8c 270 if (!netif_xmit_frozen_or_drv_stopped(txq))
fa2dbdc2 271 ret = netdev_start_xmit(skb, dev, txq, false);
43279500 272 HARD_TX_UNLOCK(dev, txq);
d346a3fa 273
43279500
DB
274 local_bh_enable();
275
276 if (!dev_xmit_complete(ret))
d346a3fa 277 kfree_skb(skb);
43279500 278
d346a3fa 279 return ret;
43279500 280drop:
0f97ede4 281 atomic_long_inc(&dev->tx_dropped);
43279500
DB
282 kfree_skb(skb);
283 return NET_XMIT_DROP;
d346a3fa
DB
284}
285
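/*
 * packet_direct_xmit() above is only used when the socket has opted out of
 * the qdisc layer. A hedged user-space sketch of how that is requested
 * (error handling omitted):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * After that, po->xmit points at packet_direct_xmit() instead of
 * dev_queue_xmit(), so frames skip qdisc queueing entirely and drops are
 * accounted in dev->tx_dropped.
 */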
66e56cd4
DB
286static struct net_device *packet_cached_dev_get(struct packet_sock *po)
287{
288 struct net_device *dev;
289
290 rcu_read_lock();
291 dev = rcu_dereference(po->cached_dev);
292 if (likely(dev))
293 dev_hold(dev);
294 rcu_read_unlock();
295
296 return dev;
297}
298
299static void packet_cached_dev_assign(struct packet_sock *po,
300 struct net_device *dev)
301{
302 rcu_assign_pointer(po->cached_dev, dev);
303}
304
305static void packet_cached_dev_reset(struct packet_sock *po)
306{
307 RCU_INIT_POINTER(po->cached_dev, NULL);
308}
309
d346a3fa
DB
310static bool packet_use_direct_xmit(const struct packet_sock *po)
311{
312 return po->xmit == packet_direct_xmit;
313}
314
0fd5d57b 315static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 316{
1cbac010 317 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
318}
319
0fd5d57b
DB
320static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
321{
322 const struct net_device_ops *ops = dev->netdev_ops;
323 u16 queue_index;
324
325 if (ops->ndo_select_queue) {
326 queue_index = ops->ndo_select_queue(dev, skb, NULL,
327 __packet_pick_tx_queue);
328 queue_index = netdev_cap_txqueue(dev, queue_index);
329 } else {
330 queue_index = __packet_pick_tx_queue(dev, skb);
331 }
332
333 skb_set_queue_mapping(skb, queue_index);
334}
335
ce06b03e
DM
336/* register_prot_hook must be invoked with the po->bind_lock held,
337 * or from a context in which asynchronous accesses to the packet
 338 * socket are not possible (packet_create()).
339 */
340static void register_prot_hook(struct sock *sk)
341{
342 struct packet_sock *po = pkt_sk(sk);
e40526cb 343
ce06b03e 344 if (!po->running) {
66e56cd4 345 if (po->fanout)
dc99f600 346 __fanout_link(sk, po);
66e56cd4 347 else
dc99f600 348 dev_add_pack(&po->prot_hook);
e40526cb 349
ce06b03e
DM
350 sock_hold(sk);
351 po->running = 1;
352 }
353}
354
355/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
356 * held. If the sync parameter is true, we will temporarily drop
357 * the po->bind_lock and do a synchronize_net to make sure no
358 * asynchronous packet processing paths still refer to the elements
359 * of po->prot_hook. If the sync parameter is false, it is the
 360 * caller's responsibility to take care of this.
361 */
362static void __unregister_prot_hook(struct sock *sk, bool sync)
363{
364 struct packet_sock *po = pkt_sk(sk);
365
366 po->running = 0;
66e56cd4
DB
367
368 if (po->fanout)
dc99f600 369 __fanout_unlink(sk, po);
66e56cd4 370 else
dc99f600 371 __dev_remove_pack(&po->prot_hook);
e40526cb 372
ce06b03e
DM
373 __sock_put(sk);
374
375 if (sync) {
376 spin_unlock(&po->bind_lock);
377 synchronize_net();
378 spin_lock(&po->bind_lock);
379 }
380}
381
382static void unregister_prot_hook(struct sock *sk, bool sync)
383{
384 struct packet_sock *po = pkt_sk(sk);
385
386 if (po->running)
387 __unregister_prot_hook(sk, sync);
388}
389
6e58040b 390static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
391{
392 if (is_vmalloc_addr(addr))
393 return vmalloc_to_page(addr);
394 return virt_to_page(addr);
395}
396
69e3c75f 397static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 398{
184f489e 399 union tpacket_uhdr h;
1da177e4 400
69e3c75f 401 h.raw = frame;
bbd6ef87
PM
402 switch (po->tp_version) {
403 case TPACKET_V1:
69e3c75f 404 h.h1->tp_status = status;
0af55bb5 405 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
406 break;
407 case TPACKET_V2:
69e3c75f 408 h.h2->tp_status = status;
0af55bb5 409 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 410 break;
f6fb8f10 411 case TPACKET_V3:
69e3c75f 412 default:
f6fb8f10 413 WARN(1, "TPACKET version not supported.\n");
69e3c75f 414 BUG();
bbd6ef87 415 }
69e3c75f
JB
416
417 smp_wmb();
bbd6ef87
PM
418}
419
69e3c75f 420static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 421{
184f489e 422 union tpacket_uhdr h;
bbd6ef87 423
69e3c75f
JB
424 smp_rmb();
425
bbd6ef87
PM
426 h.raw = frame;
427 switch (po->tp_version) {
428 case TPACKET_V1:
0af55bb5 429 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 430 return h.h1->tp_status;
bbd6ef87 431 case TPACKET_V2:
0af55bb5 432 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 433 return h.h2->tp_status;
f6fb8f10 434 case TPACKET_V3:
69e3c75f 435 default:
f6fb8f10 436 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
437 BUG();
438 return 0;
bbd6ef87 439 }
1da177e4 440}
69e3c75f 441
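/*
 * The tp_status word handled above is the entire user/kernel handshake for
 * the V1/V2 memory-mapped rings. A hedged sketch of the user-space consumer
 * side, assuming "ring" is the mmap()ed PACKET_RX_RING area and "frame_nr" /
 * "frame_size" match the struct tpacket_req that was used to create it:
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + i * frame_size);
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		- wait for the kernel to fill it
 *	... frame data is at (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	- hand the frame back
 *	i = (i + 1) % frame_nr;
 */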
b9c32fb2
DB
442static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
443 unsigned int flags)
7a51384c
DB
444{
445 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
446
68a360e8
WB
447 if (shhwtstamps &&
448 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
449 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
450 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
451
452 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 453 return TP_STATUS_TS_SOFTWARE;
7a51384c 454
b9c32fb2 455 return 0;
7a51384c
DB
456}
457
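/*
 * A hedged sketch of how the tp_tstamp flags tested above get set from user
 * space; SOF_TIMESTAMPING_RAW_HARDWARE comes from <linux/net_tstamp.h> and
 * only yields TP_STATUS_TS_RAW_HARDWARE when the NIC actually provided a
 * hardware timestamp:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * Without this option the ring frame falls back to the software timestamp
 * (TP_STATUS_TS_SOFTWARE) taken from skb->tstamp, when one is present.
 */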
b9c32fb2
DB
458static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
459 struct sk_buff *skb)
2e31396f
WB
460{
461 union tpacket_uhdr h;
462 struct timespec ts;
b9c32fb2 463 __u32 ts_status;
2e31396f 464
b9c32fb2
DB
465 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
466 return 0;
2e31396f
WB
467
468 h.raw = frame;
469 switch (po->tp_version) {
470 case TPACKET_V1:
471 h.h1->tp_sec = ts.tv_sec;
472 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
473 break;
474 case TPACKET_V2:
475 h.h2->tp_sec = ts.tv_sec;
476 h.h2->tp_nsec = ts.tv_nsec;
477 break;
478 case TPACKET_V3:
479 default:
480 WARN(1, "TPACKET version not supported.\n");
481 BUG();
482 }
483
484 /* one flush is safe, as both fields always lie on the same cacheline */
485 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
486 smp_wmb();
b9c32fb2
DB
487
488 return ts_status;
2e31396f
WB
489}
490
69e3c75f
JB
491static void *packet_lookup_frame(struct packet_sock *po,
492 struct packet_ring_buffer *rb,
493 unsigned int position,
494 int status)
495{
496 unsigned int pg_vec_pos, frame_offset;
184f489e 497 union tpacket_uhdr h;
69e3c75f
JB
498
499 pg_vec_pos = position / rb->frames_per_block;
500 frame_offset = position % rb->frames_per_block;
501
0e3125c7
NH
502 h.raw = rb->pg_vec[pg_vec_pos].buffer +
503 (frame_offset * rb->frame_size);
69e3c75f
JB
504
505 if (status != __packet_get_status(po, h.raw))
506 return NULL;
507
508 return h.raw;
509}
510
eea49cc9 511static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
512 struct packet_ring_buffer *rb,
513 int status)
514{
515 return packet_lookup_frame(po, rb, rb->head, status);
516}
517
bc59ba39 518static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 519{
520 del_timer_sync(&pkc->retire_blk_timer);
521}
522
523static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 524 struct sk_buff_head *rb_queue)
525{
bc59ba39 526 struct tpacket_kbdq_core *pkc;
f6fb8f10 527
73d0fcf2 528 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 529
ec6f809f 530 spin_lock_bh(&rb_queue->lock);
f6fb8f10 531 pkc->delete_blk_timer = 1;
ec6f809f 532 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 533
534 prb_del_retire_blk_timer(pkc);
535}
536
537static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 538 struct tpacket_kbdq_core *pkc,
f6fb8f10 539 void (*func) (unsigned long))
540{
541 init_timer(&pkc->retire_blk_timer);
542 pkc->retire_blk_timer.data = (long)po;
543 pkc->retire_blk_timer.function = func;
544 pkc->retire_blk_timer.expires = jiffies;
545}
546
e8e85cc5 547static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 548{
bc59ba39 549 struct tpacket_kbdq_core *pkc;
f6fb8f10 550
e8e85cc5 551 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 552 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
553}
554
555static int prb_calc_retire_blk_tmo(struct packet_sock *po,
556 int blk_size_in_bytes)
557{
558 struct net_device *dev;
559 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
560 struct ethtool_cmd ecmd;
561 int err;
e440cf2c 562 u32 speed;
f6fb8f10 563
4bc71cb9
JP
564 rtnl_lock();
565 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
566 if (unlikely(!dev)) {
567 rtnl_unlock();
f6fb8f10 568 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
569 }
570 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 571 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
572 rtnl_unlock();
573 if (!err) {
4bc71cb9
JP
574 /*
 575 * If the link speed is so slow that you don't really
 576 * need to worry about perf anyway.
577 */
e440cf2c 578 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 579 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 580 } else {
581 msec = 1;
582 div = speed / 1000;
f6fb8f10 583 }
584 }
585
586 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
587
588 if (div)
589 mbits /= div;
590
591 tmo = mbits * msec;
592
593 if (div)
594 return tmo+1;
595 return tmo;
596}
597
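/*
 * Worked example of the timeout computed above (a sketch; the real values
 * depend on what __ethtool_get_settings() reports for the device):
 *
 *	1 MiB block on a 1 Gbit/s link:  speed = 1000 -> div = 1, msec = 1,
 *	                                 mbits = (1 MiB * 8) / 2^20 = 8,
 *	                                 tmo = 8 * 1 = 8, returned as 8 + 1 = 9 ms
 *
 *	1 MiB block on a 10 Gbit/s link: div = 10, mbits = 8 / 10 = 0 (integer
 *	                                 division), tmo = 0, returned as 1 ms
 *
 * i.e. the retire timer is sized to roughly the time the line rate needs to
 * fill one block, with a 1 ms floor once a usable speed is known.
 */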
bc59ba39 598static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 599 union tpacket_req_u *req_u)
600{
601 p1->feature_req_word = req_u->req3.tp_feature_req_word;
602}
603
604static void init_prb_bdqc(struct packet_sock *po,
605 struct packet_ring_buffer *rb,
606 struct pgv *pg_vec,
e8e85cc5 607 union tpacket_req_u *req_u)
f6fb8f10 608{
22781a5b 609 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 610 struct tpacket_block_desc *pbd;
f6fb8f10 611
612 memset(p1, 0x0, sizeof(*p1));
613
614 p1->knxt_seq_num = 1;
615 p1->pkbdq = pg_vec;
bc59ba39 616 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 617 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 618 p1->kblk_size = req_u->req3.tp_block_size;
619 p1->knum_blocks = req_u->req3.tp_block_nr;
620 p1->hdrlen = po->tp_hdrlen;
621 p1->version = po->tp_version;
622 p1->last_kactive_blk_num = 0;
ee80fbf3 623 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 624 if (req_u->req3.tp_retire_blk_tov)
625 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
626 else
627 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
628 req_u->req3.tp_block_size);
629 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
630 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
631
dc808110 632 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 633 prb_init_ft_ops(p1, req_u);
e8e85cc5 634 prb_setup_retire_blk_timer(po);
f6fb8f10 635 prb_open_block(p1, pbd);
636}
637
638/* Do NOT update the last_blk_num first.
639 * Assumes sk_buff_head lock is held.
640 */
bc59ba39 641static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 642{
643 mod_timer(&pkc->retire_blk_timer,
644 jiffies + pkc->tov_in_jiffies);
645 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
646}
647
648/*
649 * Timer logic:
650 * 1) We refresh the timer only when we open a block.
651 * By doing this we don't waste cycles refreshing the timer
652 * on packet-by-packet basis.
653 *
654 * With a 1MB block-size, on a 1Gbps line, it will take
655 * i) ~8 ms to fill a block + ii) memcpy etc.
656 * In this cut we are not accounting for the memcpy time.
657 *
658 * So, if the user sets the 'tmo' to 10ms then the timer
659 * will never fire while the block is still getting filled
660 * (which is what we want). However, the user could choose
661 * to close a block early and that's fine.
662 *
663 * But when the timer does fire, we check whether or not to refresh it.
664 * Since the tmo granularity is in msecs, it is not too expensive
 665 * to refresh the timer, let's say every '8' msecs.
666 * Either the user can set the 'tmo' or we can derive it based on
667 * a) line-speed and b) block-size.
668 * prb_calc_retire_blk_tmo() calculates the tmo.
669 *
670 */
671static void prb_retire_rx_blk_timer_expired(unsigned long data)
672{
673 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 674 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 675 unsigned int frozen;
bc59ba39 676 struct tpacket_block_desc *pbd;
f6fb8f10 677
678 spin_lock(&po->sk.sk_receive_queue.lock);
679
680 frozen = prb_queue_frozen(pkc);
681 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
682
683 if (unlikely(pkc->delete_blk_timer))
684 goto out;
685
686 /* We only need to plug the race when the block is partially filled.
687 * tpacket_rcv:
688 * lock(); increment BLOCK_NUM_PKTS; unlock()
689 * copy_bits() is in progress ...
690 * timer fires on other cpu:
691 * we can't retire the current block because copy_bits
692 * is in progress.
693 *
694 */
695 if (BLOCK_NUM_PKTS(pbd)) {
696 while (atomic_read(&pkc->blk_fill_in_prog)) {
697 /* Waiting for skb_copy_bits to finish... */
698 cpu_relax();
699 }
700 }
701
702 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
703 if (!frozen) {
41a50d62
AD
704 if (!BLOCK_NUM_PKTS(pbd)) {
705 /* An empty block. Just refresh the timer. */
706 goto refresh_timer;
707 }
f6fb8f10 708 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
709 if (!prb_dispatch_next_block(pkc, po))
710 goto refresh_timer;
711 else
712 goto out;
713 } else {
714 /* Case 1. Queue was frozen because user-space was
715 * lagging behind.
716 */
717 if (prb_curr_blk_in_use(pkc, pbd)) {
718 /*
719 * Ok, user-space is still behind.
720 * So just refresh the timer.
721 */
722 goto refresh_timer;
723 } else {
 724 /* Case 2. The queue was frozen, user-space caught up,
 725 * now the link went idle && the timer fired.
 726 * We don't have a block to close. So we open this
 727 * block and restart the timer.
 728 * Opening a block thaws the queue and restarts the timer;
 729 * thawing/timer-refresh is a side effect.
730 */
731 prb_open_block(pkc, pbd);
732 goto out;
733 }
734 }
735 }
736
737refresh_timer:
738 _prb_refresh_rx_retire_blk_timer(pkc);
739
740out:
741 spin_unlock(&po->sk.sk_receive_queue.lock);
742}
743
eea49cc9 744static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 745 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 746{
747 /* Flush everything minus the block header */
748
749#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
750 u8 *start, *end;
751
752 start = (u8 *)pbd1;
753
 754 /* Skip the block header (we know the header WILL fit in 4K) */
755 start += PAGE_SIZE;
756
757 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
758 for (; start < end; start += PAGE_SIZE)
759 flush_dcache_page(pgv_to_page(start));
760
761 smp_wmb();
762#endif
763
764 /* Now update the block status. */
765
766 BLOCK_STATUS(pbd1) = status;
767
768 /* Flush the block header */
769
770#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
771 start = (u8 *)pbd1;
772 flush_dcache_page(pgv_to_page(start));
773
774 smp_wmb();
775#endif
776}
777
778/*
779 * Side effect:
780 *
781 * 1) flush the block
782 * 2) Increment active_blk_num
783 *
 784 * Note: We DON'T refresh the timer on purpose,
 785 * because almost always the next block will be opened.
786 */
bc59ba39 787static void prb_close_block(struct tpacket_kbdq_core *pkc1,
788 struct tpacket_block_desc *pbd1,
f6fb8f10 789 struct packet_sock *po, unsigned int stat)
790{
791 __u32 status = TP_STATUS_USER | stat;
792
793 struct tpacket3_hdr *last_pkt;
bc59ba39 794 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 795 struct sock *sk = &po->sk;
f6fb8f10 796
ee80fbf3 797 if (po->stats.stats3.tp_drops)
f6fb8f10 798 status |= TP_STATUS_LOSING;
799
800 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
801 last_pkt->tp_next_offset = 0;
802
803 /* Get the ts of the last pkt */
804 if (BLOCK_NUM_PKTS(pbd1)) {
805 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
806 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
807 } else {
41a50d62
AD
808 /* Ok, we tmo'd - so get the current time.
809 *
810 * It shouldn't really happen as we don't close empty
811 * blocks. See prb_retire_rx_blk_timer_expired().
812 */
f6fb8f10 813 struct timespec ts;
814 getnstimeofday(&ts);
815 h1->ts_last_pkt.ts_sec = ts.tv_sec;
816 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
817 }
818
819 smp_wmb();
820
821 /* Flush the block */
822 prb_flush_block(pkc1, pbd1, status);
823
da413eec
DC
824 sk->sk_data_ready(sk);
825
f6fb8f10 826 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
827}
828
eea49cc9 829static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 830{
831 pkc->reset_pending_on_curr_blk = 0;
832}
833
834/*
835 * Side effect of opening a block:
836 *
837 * 1) prb_queue is thawed.
838 * 2) retire_blk_timer is refreshed.
839 *
840 */
bc59ba39 841static void prb_open_block(struct tpacket_kbdq_core *pkc1,
842 struct tpacket_block_desc *pbd1)
f6fb8f10 843{
844 struct timespec ts;
bc59ba39 845 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 846
847 smp_rmb();
848
8da3056c
DB
849 /* We could have just memset this but we will lose the
850 * flexibility of making the priv area sticky
851 */
f6fb8f10 852
8da3056c
DB
853 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
854 BLOCK_NUM_PKTS(pbd1) = 0;
855 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 856
8da3056c
DB
857 getnstimeofday(&ts);
858
859 h1->ts_first_pkt.ts_sec = ts.tv_sec;
860 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 861
8da3056c
DB
862 pkc1->pkblk_start = (char *)pbd1;
863 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
864
865 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
866 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
867
868 pbd1->version = pkc1->version;
869 pkc1->prev = pkc1->nxt_offset;
870 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
871
872 prb_thaw_queue(pkc1);
873 _prb_refresh_rx_retire_blk_timer(pkc1);
874
875 smp_wmb();
f6fb8f10 876}
877
878/*
879 * Queue freeze logic:
880 * 1) Assume tp_block_nr = 8 blocks.
881 * 2) At time 't0', user opens Rx ring.
882 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
883 * 4) user-space is either sleeping or processing block '0'.
 884 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 885 * it will close block-7, loop around and try to fill block '0'.
886 * call-flow:
887 * __packet_lookup_frame_in_block
888 * prb_retire_current_block()
889 * prb_dispatch_next_block()
890 * |->(BLOCK_STATUS == USER) evaluates to true
891 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
892 * 6) Now there are two cases:
893 * 6.1) Link goes idle right after the queue is frozen.
894 * But remember, the last open_block() refreshed the timer.
 895 * When this timer expires, it will refresh itself so that we can
 896 * re-open block-0 in the near future.
897 * 6.2) Link is busy and keeps on receiving packets. This is a simple
898 * case and __packet_lookup_frame_in_block will check if block-0
899 * is free and can now be re-used.
900 */
eea49cc9 901static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 902 struct packet_sock *po)
903{
904 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 905 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 906}
907
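/*
 * The freeze above is undone implicitly: once user space releases the block
 * that caused it, the next lookup re-opens it via prb_open_block(). A hedged
 * sketch of the TPACKET_V3 consumer side, assuming "pbd" points at the
 * current struct tpacket_block_desc inside the mmap()ed ring:
 *
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	... walk pbd->hdr.bh1.num_pkts packets, following tp_next_offset ...
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	- releases the block
 */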
908#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
909
910/*
911 * If the next block is free then we will dispatch it
912 * and return a good offset.
913 * Else, we will freeze the queue.
914 * So, caller must check the return value.
915 */
bc59ba39 916static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 917 struct packet_sock *po)
918{
bc59ba39 919 struct tpacket_block_desc *pbd;
f6fb8f10 920
921 smp_rmb();
922
923 /* 1. Get current block num */
924 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
925
926 /* 2. If this block is currently in_use then freeze the queue */
927 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
928 prb_freeze_queue(pkc, po);
929 return NULL;
930 }
931
932 /*
933 * 3.
934 * open this block and return the offset where the first packet
935 * needs to get stored.
936 */
937 prb_open_block(pkc, pbd);
938 return (void *)pkc->nxt_offset;
939}
940
bc59ba39 941static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 942 struct packet_sock *po, unsigned int status)
943{
bc59ba39 944 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 945
946 /* retire/close the current block */
947 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
948 /*
949 * Plug the case where copy_bits() is in progress on
950 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
951 * have space to copy the pkt in the current block and
952 * called prb_retire_current_block()
953 *
954 * We don't need to worry about the TMO case because
955 * the timer-handler already handled this case.
956 */
957 if (!(status & TP_STATUS_BLK_TMO)) {
958 while (atomic_read(&pkc->blk_fill_in_prog)) {
959 /* Waiting for skb_copy_bits to finish... */
960 cpu_relax();
961 }
962 }
963 prb_close_block(pkc, pbd, po, status);
964 return;
965 }
f6fb8f10 966}
967
eea49cc9 968static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 969 struct tpacket_block_desc *pbd)
f6fb8f10 970{
971 return TP_STATUS_USER & BLOCK_STATUS(pbd);
972}
973
eea49cc9 974static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 975{
976 return pkc->reset_pending_on_curr_blk;
977}
978
eea49cc9 979static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 980{
bc59ba39 981 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 982 atomic_dec(&pkc->blk_fill_in_prog);
983}
984
eea49cc9 985static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 986 struct tpacket3_hdr *ppd)
987{
3958afa1 988 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 989}
990
eea49cc9 991static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 992 struct tpacket3_hdr *ppd)
993{
994 ppd->hv1.tp_rxhash = 0;
995}
996
eea49cc9 997static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 998 struct tpacket3_hdr *ppd)
999{
df8a39de
JP
1000 if (skb_vlan_tag_present(pkc->skb)) {
1001 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
1002 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1003 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1004 } else {
9e67030a 1005 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1006 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1007 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1008 }
1009}
1010
bc59ba39 1011static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1012 struct tpacket3_hdr *ppd)
1013{
a0cdfcf3 1014 ppd->hv1.tp_padding = 0;
f6fb8f10 1015 prb_fill_vlan_info(pkc, ppd);
1016
1017 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1018 prb_fill_rxhash(pkc, ppd);
1019 else
1020 prb_clear_rxhash(pkc, ppd);
1021}
1022
eea49cc9 1023static void prb_fill_curr_block(char *curr,
bc59ba39 1024 struct tpacket_kbdq_core *pkc,
1025 struct tpacket_block_desc *pbd,
f6fb8f10 1026 unsigned int len)
1027{
1028 struct tpacket3_hdr *ppd;
1029
1030 ppd = (struct tpacket3_hdr *)curr;
1031 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1032 pkc->prev = curr;
1033 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1034 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1035 BLOCK_NUM_PKTS(pbd) += 1;
1036 atomic_inc(&pkc->blk_fill_in_prog);
1037 prb_run_all_ft_ops(pkc, ppd);
1038}
1039
1040/* Assumes caller has the sk->rx_queue.lock */
1041static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1042 struct sk_buff *skb,
1043 int status,
1044 unsigned int len
1045 )
1046{
bc59ba39 1047 struct tpacket_kbdq_core *pkc;
1048 struct tpacket_block_desc *pbd;
f6fb8f10 1049 char *curr, *end;
1050
e3192690 1051 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1052 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1053
1054 /* Queue is frozen when user space is lagging behind */
1055 if (prb_queue_frozen(pkc)) {
1056 /*
 1057 * Check if that last block which caused the queue to freeze
 1058 * is still in_use by user-space.
1059 */
1060 if (prb_curr_blk_in_use(pkc, pbd)) {
1061 /* Can't record this packet */
1062 return NULL;
1063 } else {
1064 /*
1065 * Ok, the block was released by user-space.
1066 * Now let's open that block.
1067 * opening a block also thaws the queue.
1068 * Thawing is a side effect.
1069 */
1070 prb_open_block(pkc, pbd);
1071 }
1072 }
1073
1074 smp_mb();
1075 curr = pkc->nxt_offset;
1076 pkc->skb = skb;
e3192690 1077 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1078
1079 /* first try the current block */
1080 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1081 prb_fill_curr_block(curr, pkc, pbd, len);
1082 return (void *)curr;
1083 }
1084
1085 /* Ok, close the current block */
1086 prb_retire_current_block(pkc, po, 0);
1087
1088 /* Now, try to dispatch the next block */
1089 curr = (char *)prb_dispatch_next_block(pkc, po);
1090 if (curr) {
1091 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1092 prb_fill_curr_block(curr, pkc, pbd, len);
1093 return (void *)curr;
1094 }
1095
1096 /*
 1097 * No free blocks are available. user_space hasn't caught up yet.
1098 * Queue was just frozen and now this packet will get dropped.
1099 */
1100 return NULL;
1101}
1102
eea49cc9 1103static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1104 struct sk_buff *skb,
1105 int status, unsigned int len)
1106{
1107 char *curr = NULL;
1108 switch (po->tp_version) {
1109 case TPACKET_V1:
1110 case TPACKET_V2:
1111 curr = packet_lookup_frame(po, &po->rx_ring,
1112 po->rx_ring.head, status);
1113 return curr;
1114 case TPACKET_V3:
1115 return __packet_lookup_frame_in_block(po, skb, status, len);
1116 default:
1117 WARN(1, "TPACKET version not supported\n");
1118 BUG();
99aa3473 1119 return NULL;
f6fb8f10 1120 }
1121}
1122
eea49cc9 1123static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1124 struct packet_ring_buffer *rb,
77f65ebd 1125 unsigned int idx,
f6fb8f10 1126 int status)
1127{
bc59ba39 1128 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1129 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1130
1131 if (status != BLOCK_STATUS(pbd))
1132 return NULL;
1133 return pbd;
1134}
1135
eea49cc9 1136static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1137{
1138 unsigned int prev;
1139 if (rb->prb_bdqc.kactive_blk_num)
1140 prev = rb->prb_bdqc.kactive_blk_num-1;
1141 else
1142 prev = rb->prb_bdqc.knum_blocks-1;
1143 return prev;
1144}
1145
1146/* Assumes caller has held the rx_queue.lock */
eea49cc9 1147static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1148 struct packet_ring_buffer *rb,
1149 int status)
1150{
1151 unsigned int previous = prb_previous_blk_num(rb);
1152 return prb_lookup_block(po, rb, previous, status);
1153}
1154
eea49cc9 1155static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1156 struct packet_ring_buffer *rb,
1157 int status)
1158{
1159 if (po->tp_version <= TPACKET_V2)
1160 return packet_previous_frame(po, rb, status);
1161
1162 return __prb_previous_block(po, rb, status);
1163}
1164
eea49cc9 1165static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1166 struct packet_ring_buffer *rb)
1167{
1168 switch (po->tp_version) {
1169 case TPACKET_V1:
1170 case TPACKET_V2:
1171 return packet_increment_head(rb);
1172 case TPACKET_V3:
1173 default:
1174 WARN(1, "TPACKET version not supported.\n");
1175 BUG();
1176 return;
1177 }
1178}
1179
eea49cc9 1180static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1181 struct packet_ring_buffer *rb,
1182 int status)
1183{
1184 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1185 return packet_lookup_frame(po, rb, previous, status);
1186}
1187
eea49cc9 1188static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1189{
1190 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1191}
1192
b0138408
DB
1193static void packet_inc_pending(struct packet_ring_buffer *rb)
1194{
1195 this_cpu_inc(*rb->pending_refcnt);
1196}
1197
1198static void packet_dec_pending(struct packet_ring_buffer *rb)
1199{
1200 this_cpu_dec(*rb->pending_refcnt);
1201}
1202
1203static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1204{
1205 unsigned int refcnt = 0;
1206 int cpu;
1207
1208 /* We don't use pending refcount in rx_ring. */
1209 if (rb->pending_refcnt == NULL)
1210 return 0;
1211
1212 for_each_possible_cpu(cpu)
1213 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1214
1215 return refcnt;
1216}
1217
1218static int packet_alloc_pending(struct packet_sock *po)
1219{
1220 po->rx_ring.pending_refcnt = NULL;
1221
1222 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1223 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1224 return -ENOBUFS;
1225
1226 return 0;
1227}
1228
1229static void packet_free_pending(struct packet_sock *po)
1230{
1231 free_percpu(po->tx_ring.pending_refcnt);
1232}
1233
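/*
 * A short note on the per-cpu scheme above (describing intended behaviour,
 * not adding any): packet_inc_pending() runs on whichever CPU queued the TX
 * ring frame, while the matching packet_dec_pending() runs from the skb
 * destructor, possibly on a different CPU. Individual per-cpu counters may
 * therefore go negative; only the sum taken by packet_read_pending() is
 * meaningful, and it reaches zero exactly when no TX ring frames are
 * outstanding.
 */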
9954729b
WB
1234#define ROOM_POW_OFF 2
1235#define ROOM_NONE 0x0
1236#define ROOM_LOW 0x1
1237#define ROOM_NORMAL 0x2
1238
1239static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1240{
9954729b
WB
1241 int idx, len;
1242
1243 len = po->rx_ring.frame_max + 1;
1244 idx = po->rx_ring.head;
1245 if (pow_off)
1246 idx += len >> pow_off;
1247 if (idx >= len)
1248 idx -= len;
1249 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1250}
1251
1252static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1253{
1254 int idx, len;
1255
1256 len = po->rx_ring.prb_bdqc.knum_blocks;
1257 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1258 if (pow_off)
1259 idx += len >> pow_off;
1260 if (idx >= len)
1261 idx -= len;
1262 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1263}
77f65ebd 1264
2ccdbaa6 1265static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1266{
1267 struct sock *sk = &po->sk;
1268 int ret = ROOM_NONE;
1269
1270 if (po->prot_hook.func != tpacket_rcv) {
1271 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1272 - (skb ? skb->truesize : 0);
9954729b
WB
1273 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1274 return ROOM_NORMAL;
1275 else if (avail > 0)
1276 return ROOM_LOW;
1277 else
1278 return ROOM_NONE;
1279 }
77f65ebd 1280
9954729b
WB
1281 if (po->tp_version == TPACKET_V3) {
1282 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1283 ret = ROOM_NORMAL;
1284 else if (__tpacket_v3_has_room(po, 0))
1285 ret = ROOM_LOW;
1286 } else {
1287 if (__tpacket_has_room(po, ROOM_POW_OFF))
1288 ret = ROOM_NORMAL;
1289 else if (__tpacket_has_room(po, 0))
1290 ret = ROOM_LOW;
1291 }
2ccdbaa6
WB
1292
1293 return ret;
1294}
1295
1296static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1297{
1298 int ret;
1299 bool has_room;
1300
54d7c01d
WB
1301 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1302 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1303 has_room = ret == ROOM_NORMAL;
1304 if (po->pressure == has_room)
54d7c01d
WB
1305 po->pressure = !has_room;
1306 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1307
9954729b 1308 return ret;
77f65ebd
WB
1309}
1310
1da177e4
LT
1311static void packet_sock_destruct(struct sock *sk)
1312{
ed85b565
RC
1313 skb_queue_purge(&sk->sk_error_queue);
1314
547b792c
IJ
1315 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1316 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1317
1318 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1319 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1320 return;
1321 }
1322
17ab56a2 1323 sk_refcnt_debug_dec(sk);
1da177e4
LT
1324}
1325
3b3a5b0a
WB
1326static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1327{
1328 u32 rxhash;
1329 int i, count = 0;
1330
1331 rxhash = skb_get_hash(skb);
1332 for (i = 0; i < ROLLOVER_HLEN; i++)
1333 if (po->rollover->history[i] == rxhash)
1334 count++;
1335
1336 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1337 return count > (ROLLOVER_HLEN >> 1);
1338}
1339
77f65ebd
WB
1340static unsigned int fanout_demux_hash(struct packet_fanout *f,
1341 struct sk_buff *skb,
1342 unsigned int num)
dc99f600 1343{
61b905da 1344 return reciprocal_scale(skb_get_hash(skb), num);
dc99f600
DM
1345}
1346
77f65ebd
WB
1347static unsigned int fanout_demux_lb(struct packet_fanout *f,
1348 struct sk_buff *skb,
1349 unsigned int num)
dc99f600 1350{
468479e6 1351 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1352
468479e6 1353 return val % num;
77f65ebd
WB
1354}
1355
1356static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1357 struct sk_buff *skb,
1358 unsigned int num)
1359{
1360 return smp_processor_id() % num;
dc99f600
DM
1361}
1362
5df0ddfb
DB
1363static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1364 struct sk_buff *skb,
1365 unsigned int num)
1366{
f337db64 1367 return prandom_u32_max(num);
5df0ddfb
DB
1368}
1369
77f65ebd
WB
1370static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1371 struct sk_buff *skb,
ad377cab 1372 unsigned int idx, bool try_self,
77f65ebd 1373 unsigned int num)
95ec3eb4 1374{
4633c9e0 1375 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1376 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1377
0648ab70 1378 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1379
1380 if (try_self) {
1381 room = packet_rcv_has_room(po, skb);
1382 if (room == ROOM_NORMAL ||
1383 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1384 return idx;
4633c9e0 1385 po_skip = po;
3b3a5b0a 1386 }
ad377cab 1387
0648ab70 1388 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1389 do {
2ccdbaa6 1390 po_next = pkt_sk(f->arr[i]);
4633c9e0 1391 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1392 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1393 if (i != j)
0648ab70 1394 po->rollover->sock = i;
a9b63918
WB
1395 atomic_long_inc(&po->rollover->num);
1396 if (room == ROOM_LOW)
1397 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1398 return i;
1399 }
ad377cab 1400
77f65ebd
WB
1401 if (++i == num)
1402 i = 0;
1403 } while (i != j);
1404
a9b63918 1405 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1406 return idx;
1407}
1408
2d36097d
NH
1409static unsigned int fanout_demux_qm(struct packet_fanout *f,
1410 struct sk_buff *skb,
1411 unsigned int num)
1412{
1413 return skb_get_queue_mapping(skb) % num;
1414}
1415
47dceb8e
WB
1416static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1417 struct sk_buff *skb,
1418 unsigned int num)
1419{
1420 struct bpf_prog *prog;
1421 unsigned int ret = 0;
1422
1423 rcu_read_lock();
1424 prog = rcu_dereference(f->bpf_prog);
1425 if (prog)
ff936a04 1426 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1427 rcu_read_unlock();
1428
1429 return ret;
1430}
1431
77f65ebd
WB
1432static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1433{
1434 return f->flags & (flag >> 8);
95ec3eb4
DM
1435}
1436
95ec3eb4
DM
1437static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1438 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1439{
1440 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1441 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1442 struct net *net = read_pnet(&f->net);
dc99f600 1443 struct packet_sock *po;
77f65ebd 1444 unsigned int idx;
dc99f600 1445
19bcf9f2 1446 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1447 kfree_skb(skb);
1448 return 0;
1449 }
1450
3f34b24a 1451 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1452 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1453 if (!skb)
1454 return 0;
1455 }
95ec3eb4
DM
1456 switch (f->type) {
1457 case PACKET_FANOUT_HASH:
1458 default:
77f65ebd 1459 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1460 break;
1461 case PACKET_FANOUT_LB:
77f65ebd 1462 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1463 break;
1464 case PACKET_FANOUT_CPU:
77f65ebd
WB
1465 idx = fanout_demux_cpu(f, skb, num);
1466 break;
5df0ddfb
DB
1467 case PACKET_FANOUT_RND:
1468 idx = fanout_demux_rnd(f, skb, num);
1469 break;
2d36097d
NH
1470 case PACKET_FANOUT_QM:
1471 idx = fanout_demux_qm(f, skb, num);
1472 break;
77f65ebd 1473 case PACKET_FANOUT_ROLLOVER:
ad377cab 1474 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1475 break;
47dceb8e 1476 case PACKET_FANOUT_CBPF:
f2e52095 1477 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1478 idx = fanout_demux_bpf(f, skb, num);
1479 break;
dc99f600
DM
1480 }
1481
ad377cab
WB
1482 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1483 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1484
ad377cab 1485 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1486 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1487}
1488
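/*
 * A hedged sketch of how a fanout group is joined from user space; the
 * 32-bit option value packs the group id into the low 16 bits and the mode
 * (plus flags) into the high 16 bits, matching the split performed in
 * packet_setsockopt()/fanout_add():
 *
 *	unsigned int arg = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Every socket that joins with the same id (same netns, same protocol and
 * device) then receives a share of the traffic according to the chosen
 * fanout_demux_*() policy above.
 */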
fff3321d
PE
1489DEFINE_MUTEX(fanout_mutex);
1490EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1491static LIST_HEAD(fanout_list);
1492
1493static void __fanout_link(struct sock *sk, struct packet_sock *po)
1494{
1495 struct packet_fanout *f = po->fanout;
1496
1497 spin_lock(&f->lock);
1498 f->arr[f->num_members] = sk;
1499 smp_wmb();
1500 f->num_members++;
1501 spin_unlock(&f->lock);
1502}
1503
1504static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1505{
1506 struct packet_fanout *f = po->fanout;
1507 int i;
1508
1509 spin_lock(&f->lock);
1510 for (i = 0; i < f->num_members; i++) {
1511 if (f->arr[i] == sk)
1512 break;
1513 }
1514 BUG_ON(i >= f->num_members);
1515 f->arr[i] = f->arr[f->num_members - 1];
1516 f->num_members--;
1517 spin_unlock(&f->lock);
1518}
1519
d4dd8aee 1520static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1521{
161642e2
ED
1522 if (sk->sk_family != PF_PACKET)
1523 return false;
c0de08d0 1524
161642e2 1525 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1526}
1527
47dceb8e
WB
1528static void fanout_init_data(struct packet_fanout *f)
1529{
1530 switch (f->type) {
1531 case PACKET_FANOUT_LB:
1532 atomic_set(&f->rr_cur, 0);
1533 break;
1534 case PACKET_FANOUT_CBPF:
f2e52095 1535 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1536 RCU_INIT_POINTER(f->bpf_prog, NULL);
1537 break;
1538 }
1539}
1540
1541static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1542{
1543 struct bpf_prog *old;
1544
1545 spin_lock(&f->lock);
1546 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1547 rcu_assign_pointer(f->bpf_prog, new);
1548 spin_unlock(&f->lock);
1549
1550 if (old) {
1551 synchronize_net();
1552 bpf_prog_destroy(old);
1553 }
1554}
1555
1556static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1557 unsigned int len)
1558{
1559 struct bpf_prog *new;
1560 struct sock_fprog fprog;
1561 int ret;
1562
1563 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1564 return -EPERM;
1565 if (len != sizeof(fprog))
1566 return -EINVAL;
1567 if (copy_from_user(&fprog, data, len))
1568 return -EFAULT;
1569
bab18991 1570 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1571 if (ret)
1572 return ret;
1573
1574 __fanout_set_data_bpf(po->fanout, new);
1575 return 0;
1576}
1577
f2e52095
WB
1578static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1579 unsigned int len)
1580{
1581 struct bpf_prog *new;
1582 u32 fd;
1583
1584 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1585 return -EPERM;
1586 if (len != sizeof(fd))
1587 return -EINVAL;
1588 if (copy_from_user(&fd, data, len))
1589 return -EFAULT;
1590
1591 new = bpf_prog_get(fd);
1592 if (IS_ERR(new))
1593 return PTR_ERR(new);
1594 if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
1595 bpf_prog_put(new);
1596 return -EINVAL;
1597 }
1598
1599 __fanout_set_data_bpf(po->fanout, new);
1600 return 0;
1601}
1602
47dceb8e
WB
1603static int fanout_set_data(struct packet_sock *po, char __user *data,
1604 unsigned int len)
1605{
1606 switch (po->fanout->type) {
1607 case PACKET_FANOUT_CBPF:
1608 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1609 case PACKET_FANOUT_EBPF:
1610 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1611 default:
1612 return -EINVAL;
1613 };
1614}
1615
1616static void fanout_release_data(struct packet_fanout *f)
1617{
1618 switch (f->type) {
1619 case PACKET_FANOUT_CBPF:
f2e52095 1620 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1621 __fanout_set_data_bpf(f, NULL);
1622 };
1623}
1624
7736d33f 1625static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1626{
1627 struct packet_sock *po = pkt_sk(sk);
1628 struct packet_fanout *f, *match;
7736d33f 1629 u8 type = type_flags & 0xff;
77f65ebd 1630 u8 flags = type_flags >> 8;
dc99f600
DM
1631 int err;
1632
1633 switch (type) {
77f65ebd
WB
1634 case PACKET_FANOUT_ROLLOVER:
1635 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1636 return -EINVAL;
dc99f600
DM
1637 case PACKET_FANOUT_HASH:
1638 case PACKET_FANOUT_LB:
95ec3eb4 1639 case PACKET_FANOUT_CPU:
5df0ddfb 1640 case PACKET_FANOUT_RND:
2d36097d 1641 case PACKET_FANOUT_QM:
47dceb8e 1642 case PACKET_FANOUT_CBPF:
f2e52095 1643 case PACKET_FANOUT_EBPF:
dc99f600
DM
1644 break;
1645 default:
1646 return -EINVAL;
1647 }
1648
1649 if (!po->running)
1650 return -EINVAL;
1651
1652 if (po->fanout)
1653 return -EALREADY;
1654
4633c9e0
WB
1655 if (type == PACKET_FANOUT_ROLLOVER ||
1656 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
0648ab70
WB
1657 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1658 if (!po->rollover)
1659 return -ENOMEM;
a9b63918
WB
1660 atomic_long_set(&po->rollover->num, 0);
1661 atomic_long_set(&po->rollover->num_huge, 0);
1662 atomic_long_set(&po->rollover->num_failed, 0);
0648ab70
WB
1663 }
1664
dc99f600
DM
1665 mutex_lock(&fanout_mutex);
1666 match = NULL;
1667 list_for_each_entry(f, &fanout_list, list) {
1668 if (f->id == id &&
1669 read_pnet(&f->net) == sock_net(sk)) {
1670 match = f;
1671 break;
1672 }
1673 }
afe62c68 1674 err = -EINVAL;
77f65ebd 1675 if (match && match->flags != flags)
afe62c68 1676 goto out;
dc99f600 1677 if (!match) {
afe62c68 1678 err = -ENOMEM;
dc99f600 1679 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1680 if (!match)
1681 goto out;
1682 write_pnet(&match->net, sock_net(sk));
1683 match->id = id;
1684 match->type = type;
77f65ebd 1685 match->flags = flags;
afe62c68
ED
1686 INIT_LIST_HEAD(&match->list);
1687 spin_lock_init(&match->lock);
1688 atomic_set(&match->sk_ref, 0);
47dceb8e 1689 fanout_init_data(match);
afe62c68
ED
1690 match->prot_hook.type = po->prot_hook.type;
1691 match->prot_hook.dev = po->prot_hook.dev;
1692 match->prot_hook.func = packet_rcv_fanout;
1693 match->prot_hook.af_packet_priv = match;
c0de08d0 1694 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1695 dev_add_pack(&match->prot_hook);
1696 list_add(&match->list, &fanout_list);
dc99f600 1697 }
afe62c68
ED
1698 err = -EINVAL;
1699 if (match->type == type &&
1700 match->prot_hook.type == po->prot_hook.type &&
1701 match->prot_hook.dev == po->prot_hook.dev) {
1702 err = -ENOSPC;
1703 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1704 __dev_remove_pack(&po->prot_hook);
1705 po->fanout = match;
1706 atomic_inc(&match->sk_ref);
1707 __fanout_link(sk, po);
1708 err = 0;
dc99f600
DM
1709 }
1710 }
afe62c68 1711out:
dc99f600 1712 mutex_unlock(&fanout_mutex);
0648ab70
WB
1713 if (err) {
1714 kfree(po->rollover);
1715 po->rollover = NULL;
1716 }
dc99f600
DM
1717 return err;
1718}
1719
1720static void fanout_release(struct sock *sk)
1721{
1722 struct packet_sock *po = pkt_sk(sk);
1723 struct packet_fanout *f;
1724
1725 f = po->fanout;
1726 if (!f)
1727 return;
1728
fff3321d 1729 mutex_lock(&fanout_mutex);
dc99f600
DM
1730 po->fanout = NULL;
1731
dc99f600
DM
1732 if (atomic_dec_and_test(&f->sk_ref)) {
1733 list_del(&f->list);
1734 dev_remove_pack(&f->prot_hook);
47dceb8e 1735 fanout_release_data(f);
dc99f600
DM
1736 kfree(f);
1737 }
1738 mutex_unlock(&fanout_mutex);
0648ab70 1739
59f21118
WB
1740 if (po->rollover)
1741 kfree_rcu(po->rollover, rcu);
dc99f600 1742}
1da177e4 1743
3c70c132
DB
1744static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1745 struct sk_buff *skb)
1746{
1747 /* Earlier code assumed this would be a VLAN pkt, double-check
1748 * this now that we have the actual packet in hand. We can only
1749 * do this check on Ethernet devices.
1750 */
1751 if (unlikely(dev->type != ARPHRD_ETHER))
1752 return false;
1753
1754 skb_reset_mac_header(skb);
1755 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1756}
1757
90ddc4f0 1758static const struct proto_ops packet_ops;
1da177e4 1759
90ddc4f0 1760static const struct proto_ops packet_ops_spkt;
1da177e4 1761
40d4e3df
ED
1762static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1763 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1764{
1765 struct sock *sk;
1766 struct sockaddr_pkt *spkt;
1767
1768 /*
1769 * When we registered the protocol we saved the socket in the data
1770 * field for just this event.
1771 */
1772
1773 sk = pt->af_packet_priv;
1ce4f28b 1774
1da177e4
LT
1775 /*
1776 * Yank back the headers [hope the device set this
1777 * right or kerboom...]
1778 *
 1779 * Incoming packets have the ll header pulled,
1780 * push it back.
1781 *
98e399f8 1782 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1783 * so that this procedure is a no-op.
1784 */
1785
1786 if (skb->pkt_type == PACKET_LOOPBACK)
1787 goto out;
1788
09ad9bc7 1789 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1790 goto out;
1791
40d4e3df
ED
1792 skb = skb_share_check(skb, GFP_ATOMIC);
1793 if (skb == NULL)
1da177e4
LT
1794 goto oom;
1795
1796 /* drop any routing info */
adf30907 1797 skb_dst_drop(skb);
1da177e4 1798
84531c24
PO
1799 /* drop conntrack reference */
1800 nf_reset(skb);
1801
ffbc6111 1802 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1803
98e399f8 1804 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1805
1806 /*
1807 * The SOCK_PACKET socket receives _all_ frames.
1808 */
1809
1810 spkt->spkt_family = dev->type;
1811 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1812 spkt->spkt_protocol = skb->protocol;
1813
1814 /*
1815 * Charge the memory to the socket. This is done specifically
 1816 * to prevent sockets from using up all the memory.
1817 */
1818
40d4e3df 1819 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1820 return 0;
1821
1822out:
1823 kfree_skb(skb);
1824oom:
1825 return 0;
1826}
1827
1828
1829/*
1830 * Output a raw packet to a device layer. This bypasses all the other
1831 * protocol layers and you must therefore supply it with a complete frame
1832 */
1ce4f28b 1833
1b784140
YX
1834static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1835 size_t len)
1da177e4
LT
1836{
1837 struct sock *sk = sock->sk;
342dfc30 1838 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1839 struct sk_buff *skb = NULL;
1da177e4 1840 struct net_device *dev;
40d4e3df 1841 __be16 proto = 0;
1da177e4 1842 int err;
3bdc0eba 1843 int extra_len = 0;
1ce4f28b 1844
1da177e4 1845 /*
1ce4f28b 1846 * Get and verify the address.
1da177e4
LT
1847 */
1848
40d4e3df 1849 if (saddr) {
1da177e4 1850 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1851 return -EINVAL;
1852 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1853 proto = saddr->spkt_protocol;
1854 } else
1855 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1856
1857 /*
1ce4f28b 1858 * Find the device first to size check it
1da177e4
LT
1859 */
1860
de74e92a 1861 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1862retry:
654d1f8a
ED
1863 rcu_read_lock();
1864 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1865 err = -ENODEV;
1866 if (dev == NULL)
1867 goto out_unlock;
1ce4f28b 1868
d5e76b0a
DM
1869 err = -ENETDOWN;
1870 if (!(dev->flags & IFF_UP))
1871 goto out_unlock;
1872
1da177e4 1873 /*
40d4e3df
ED
1874 * You may not queue a frame bigger than the mtu. This is the lowest level
1875 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1876 */
1ce4f28b 1877
3bdc0eba
BG
1878 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1879 if (!netif_supports_nofcs(dev)) {
1880 err = -EPROTONOSUPPORT;
1881 goto out_unlock;
1882 }
1883 extra_len = 4; /* We're doing our own CRC */
1884 }
1885
1da177e4 1886 err = -EMSGSIZE;
3bdc0eba 1887 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1888 goto out_unlock;
1889
1a35ca80
ED
1890 if (!skb) {
1891 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1892 int tlen = dev->needed_tailroom;
1a35ca80
ED
1893 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1894
1895 rcu_read_unlock();
4ce40912 1896 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1897 if (skb == NULL)
1898 return -ENOBUFS;
1899 /* FIXME: Save some space for broken drivers that write a hard
1900 * header at transmission time by themselves. PPP is the notable
1901 * one here. This should really be fixed at the driver level.
1902 */
1903 skb_reserve(skb, reserved);
1904 skb_reset_network_header(skb);
1905
1906 /* Try to align data part correctly */
1907 if (hhlen) {
1908 skb->data -= hhlen;
1909 skb->tail -= hhlen;
1910 if (len < hhlen)
1911 skb_reset_network_header(skb);
1912 }
6ce8e9ce 1913 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1914 if (err)
1915 goto out_free;
1916 goto retry;
1da177e4
LT
1917 }
1918
3c70c132
DB
1919 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1920 !packet_extra_vlan_len_allowed(dev, skb)) {
1921 err = -EMSGSIZE;
1922 goto out_unlock;
57f89bfa 1923 }
1a35ca80 1924
1da177e4
LT
1925 skb->protocol = proto;
1926 skb->dev = dev;
1927 skb->priority = sk->sk_priority;
2d37a186 1928 skb->mark = sk->sk_mark;
bf84a010
DB
1929
1930 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1931
3bdc0eba
BG
1932 if (unlikely(extra_len == 4))
1933 skb->no_fcs = 1;
1934
40893fd0 1935 skb_probe_transport_header(skb, 0);
c1aad275 1936
1da177e4 1937 dev_queue_xmit(skb);
654d1f8a 1938 rcu_read_unlock();
40d4e3df 1939 return len;
1da177e4 1940
1da177e4 1941out_unlock:
654d1f8a 1942 rcu_read_unlock();
1a35ca80
ED
1943out_free:
1944 kfree_skb(skb);
1da177e4
LT
1945 return err;
1946}
1da177e4 1947
ff936a04
AS
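/* Run the socket's attached BPF filter over the skb.  The return value
 * is the number of bytes to keep (the snap length); zero means the
 * packet is dropped.  With no filter attached, 'res' is returned
 * unchanged.
 */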
1948static unsigned int run_filter(struct sk_buff *skb,
1949 const struct sock *sk,
1950 unsigned int res)
1da177e4
LT
1951{
1952 struct sk_filter *filter;
fda9ef5d 1953
80f8f102
ED
1954 rcu_read_lock();
1955 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1956 if (filter != NULL)
ff936a04 1957 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1958 rcu_read_unlock();
1da177e4 1959
dbcb5855 1960 return res;
1da177e4
LT
1961}
1962
16cc1400
WB
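/* Translate the skb's GSO and checksum state into a virtio_net_hdr so
 * that PACKET_VNET_HDR sockets receive offload metadata along with the
 * packet data.
 */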
1963static int __packet_rcv_vnet(const struct sk_buff *skb,
1964 struct virtio_net_hdr *vnet_hdr)
1965{
1966 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1967
1968 if (skb_is_gso(skb)) {
1969 struct skb_shared_info *sinfo = skb_shinfo(skb);
1970
1971 /* This is a hint as to how much should be linear. */
1972 vnet_hdr->hdr_len =
1973 __cpu_to_virtio16(vio_le(), skb_headlen(skb));
1974 vnet_hdr->gso_size =
1975 __cpu_to_virtio16(vio_le(), sinfo->gso_size);
1976
1977 if (sinfo->gso_type & SKB_GSO_TCPV4)
1978 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1979 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1980 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1981 else if (sinfo->gso_type & SKB_GSO_UDP)
1982 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
1983 else if (sinfo->gso_type & SKB_GSO_FCOE)
1984 return -EINVAL;
1985 else
1986 BUG();
1987
1988 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1989 vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1990 } else
1991 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
1992
1993 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1994 vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1995 vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
1996 skb_checksum_start_offset(skb));
1997 vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
1998 skb->csum_offset);
1999 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2000 vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
2001 } /* else everything is zero */
2002
2003 return 0;
2004}
2005
2006static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2007 size_t *len)
2008{
2009 struct virtio_net_hdr vnet_hdr;
2010
2011 if (*len < sizeof(vnet_hdr))
2012 return -EINVAL;
2013 *len -= sizeof(vnet_hdr);
2014
2015 if (__packet_rcv_vnet(skb, &vnet_hdr))
2016 return -EINVAL;
2017
2018 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2019}
2020
1da177e4 2021/*
62ab0812
ED
 2022 * This function does lazy skb cloning in the hope that most packets
 2023 * are discarded by BPF.
 2024 *
 2025 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 2026 * and skb->cb are mangled. It works because (and until) packets
 2027 * falling here are owned by the current CPU. Output packets are cloned
 2028 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2029 * sequentially, so if we return the skb to its original state on exit,
 2030 * we will not harm anyone.
1da177e4
LT
2031 */
2032
40d4e3df
ED
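/* Receive handler for ordinary (non-mmap) AF_PACKET sockets: run the
 * filter, clone shared skbs, record the origin in a sockaddr_ll kept
 * in skb->cb, trim to the snap length and queue the skb on the
 * socket's receive queue.
 */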
2033static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2034 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2035{
2036 struct sock *sk;
2037 struct sockaddr_ll *sll;
2038 struct packet_sock *po;
40d4e3df 2039 u8 *skb_head = skb->data;
1da177e4 2040 int skb_len = skb->len;
dbcb5855 2041 unsigned int snaplen, res;
1da177e4
LT
2042
2043 if (skb->pkt_type == PACKET_LOOPBACK)
2044 goto drop;
2045
2046 sk = pt->af_packet_priv;
2047 po = pkt_sk(sk);
2048
09ad9bc7 2049 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2050 goto drop;
2051
1da177e4
LT
2052 skb->dev = dev;
2053
3b04ddde 2054 if (dev->header_ops) {
1da177e4 2055 /* The device has an explicit notion of ll header,
62ab0812
ED
2056 * exported to higher levels.
2057 *
2058 * Otherwise, the device hides details of its frame
 2059 * structure, so that the corresponding packet head is
 2060 * never delivered to the user.
1da177e4
LT
2061 */
2062 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2063 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2064 else if (skb->pkt_type == PACKET_OUTGOING) {
2065 /* Special case: outgoing packets have ll header at head */
bbe735e4 2066 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2067 }
2068 }
2069
2070 snaplen = skb->len;
2071
dbcb5855
DM
2072 res = run_filter(skb, sk, snaplen);
2073 if (!res)
fda9ef5d 2074 goto drop_n_restore;
dbcb5855
DM
2075 if (snaplen > res)
2076 snaplen = res;
1da177e4 2077
0fd7bac6 2078 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2079 goto drop_n_acct;
2080
2081 if (skb_shared(skb)) {
2082 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2083 if (nskb == NULL)
2084 goto drop_n_acct;
2085
2086 if (skb_head != skb->data) {
2087 skb->data = skb_head;
2088 skb->len = skb_len;
2089 }
abc4e4fa 2090 consume_skb(skb);
1da177e4
LT
2091 skb = nskb;
2092 }
2093
b4772ef8 2094 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2095
2096 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2097 sll->sll_hatype = dev->type;
1da177e4 2098 sll->sll_pkttype = skb->pkt_type;
8032b464 2099 if (unlikely(po->origdev))
80feaacb
PWJ
2100 sll->sll_ifindex = orig_dev->ifindex;
2101 else
2102 sll->sll_ifindex = dev->ifindex;
1da177e4 2103
b95cce35 2104 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2105
2472d761
EB
2106 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2107 * Use their space for storing the original skb length.
2108 */
2109 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2110
1da177e4
LT
2111 if (pskb_trim(skb, snaplen))
2112 goto drop_n_acct;
2113
2114 skb_set_owner_r(skb, sk);
2115 skb->dev = NULL;
adf30907 2116 skb_dst_drop(skb);
1da177e4 2117
84531c24
PO
2118 /* drop conntrack reference */
2119 nf_reset(skb);
2120
1da177e4 2121 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2122 po->stats.stats1.tp_packets++;
3bc3b96f 2123 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2124 __skb_queue_tail(&sk->sk_receive_queue, skb);
2125 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2126 sk->sk_data_ready(sk);
1da177e4
LT
2127 return 0;
2128
2129drop_n_acct:
7091fbd8 2130 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2131 po->stats.stats1.tp_drops++;
7091fbd8
WB
2132 atomic_inc(&sk->sk_drops);
2133 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2134
2135drop_n_restore:
2136 if (skb_head != skb->data && skb_shared(skb)) {
2137 skb->data = skb_head;
2138 skb->len = skb_len;
2139 }
2140drop:
ead2ceb0 2141 consume_skb(skb);
1da177e4
LT
2142 return 0;
2143}
2144
40d4e3df
ED
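/* Receive handler for mmap'ed (PACKET_RX_RING) sockets: reserve the
 * next free frame in the ring, copy the packet into it, fill in the
 * tpacket_hdr/tpacket2_hdr/tpacket3_hdr and trailing sockaddr_ll, and
 * finally flip the frame status so userspace can see it.
 */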
2145static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2146 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2147{
2148 struct sock *sk;
2149 struct packet_sock *po;
2150 struct sockaddr_ll *sll;
184f489e 2151 union tpacket_uhdr h;
40d4e3df 2152 u8 *skb_head = skb->data;
1da177e4 2153 int skb_len = skb->len;
dbcb5855 2154 unsigned int snaplen, res;
f6fb8f10 2155 unsigned long status = TP_STATUS_USER;
bbd6ef87 2156 unsigned short macoff, netoff, hdrlen;
1da177e4 2157 struct sk_buff *copy_skb = NULL;
bbd6ef87 2158 struct timespec ts;
b9c32fb2 2159 __u32 ts_status;
1da177e4 2160
51846355
AW
2161 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 2162 * We may add members to them up to the current aligned size without forcing
2163 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2164 */
2165 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2166 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2167
1da177e4
LT
2168 if (skb->pkt_type == PACKET_LOOPBACK)
2169 goto drop;
2170
2171 sk = pt->af_packet_priv;
2172 po = pkt_sk(sk);
2173
09ad9bc7 2174 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2175 goto drop;
2176
3b04ddde 2177 if (dev->header_ops) {
1da177e4 2178 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2179 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2180 else if (skb->pkt_type == PACKET_OUTGOING) {
2181 /* Special case: outgoing packets have ll header at head */
bbe735e4 2182 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2183 }
2184 }
2185
2186 snaplen = skb->len;
2187
dbcb5855
DM
2188 res = run_filter(skb, sk, snaplen);
2189 if (!res)
fda9ef5d 2190 goto drop_n_restore;
68c2e5de
AD
2191
2192 if (skb->ip_summed == CHECKSUM_PARTIAL)
2193 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2194 else if (skb->pkt_type != PACKET_OUTGOING &&
2195 (skb->ip_summed == CHECKSUM_COMPLETE ||
2196 skb_csum_unnecessary(skb)))
2197 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2198
dbcb5855
DM
2199 if (snaplen > res)
2200 snaplen = res;
1da177e4
LT
2201
2202 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2203 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2204 po->tp_reserve;
1da177e4 2205 } else {
95c96174 2206 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2207 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2208 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2209 po->tp_reserve;
2210 if (po->has_vnet_hdr)
2211 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2212 macoff = netoff - maclen;
2213 }
f6fb8f10 2214 if (po->tp_version <= TPACKET_V2) {
2215 if (macoff + snaplen > po->rx_ring.frame_size) {
2216 if (po->copy_thresh &&
0fd7bac6 2217 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2218 if (skb_shared(skb)) {
2219 copy_skb = skb_clone(skb, GFP_ATOMIC);
2220 } else {
2221 copy_skb = skb_get(skb);
2222 skb_head = skb->data;
2223 }
2224 if (copy_skb)
2225 skb_set_owner_r(copy_skb, sk);
1da177e4 2226 }
f6fb8f10 2227 snaplen = po->rx_ring.frame_size - macoff;
2228 if ((int)snaplen < 0)
2229 snaplen = 0;
1da177e4 2230 }
dc808110
ED
2231 } else if (unlikely(macoff + snaplen >
2232 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2233 u32 nval;
2234
2235 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2236 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2237 snaplen, nval, macoff);
2238 snaplen = nval;
2239 if (unlikely((int)snaplen < 0)) {
2240 snaplen = 0;
2241 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2242 }
1da177e4 2243 }
1da177e4 2244 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2245 h.raw = packet_current_rx_frame(po, skb,
2246 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2247 if (!h.raw)
58d19b19 2248 goto drop_n_account;
f6fb8f10 2249 if (po->tp_version <= TPACKET_V2) {
2250 packet_increment_rx_head(po, &po->rx_ring);
2251 /*
 2252 * LOSING will be reported until you read the stats,
 2253 * because it's COR - Clear On Read.
 2254 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2255 * at packet level.
2256 */
ee80fbf3 2257 if (po->stats.stats1.tp_drops)
f6fb8f10 2258 status |= TP_STATUS_LOSING;
2259 }
ee80fbf3 2260 po->stats.stats1.tp_packets++;
1da177e4
LT
2261 if (copy_skb) {
2262 status |= TP_STATUS_COPY;
2263 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2264 }
1da177e4
LT
2265 spin_unlock(&sk->sk_receive_queue.lock);
2266
58d19b19
WB
2267 if (po->has_vnet_hdr) {
2268 if (__packet_rcv_vnet(skb, h.raw + macoff -
2269 sizeof(struct virtio_net_hdr))) {
2270 spin_lock(&sk->sk_receive_queue.lock);
2271 goto drop_n_account;
2272 }
2273 }
2274
bbd6ef87 2275 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2276
2277 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2278 getnstimeofday(&ts);
1da177e4 2279
b9c32fb2
DB
2280 status |= ts_status;
2281
bbd6ef87
PM
2282 switch (po->tp_version) {
2283 case TPACKET_V1:
2284 h.h1->tp_len = skb->len;
2285 h.h1->tp_snaplen = snaplen;
2286 h.h1->tp_mac = macoff;
2287 h.h1->tp_net = netoff;
4b457bdf
DB
2288 h.h1->tp_sec = ts.tv_sec;
2289 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2290 hdrlen = sizeof(*h.h1);
2291 break;
2292 case TPACKET_V2:
2293 h.h2->tp_len = skb->len;
2294 h.h2->tp_snaplen = snaplen;
2295 h.h2->tp_mac = macoff;
2296 h.h2->tp_net = netoff;
bbd6ef87
PM
2297 h.h2->tp_sec = ts.tv_sec;
2298 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2299 if (skb_vlan_tag_present(skb)) {
2300 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2301 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2302 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2303 } else {
2304 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2305 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2306 }
e4d26f4b 2307 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2308 hdrlen = sizeof(*h.h2);
2309 break;
f6fb8f10 2310 case TPACKET_V3:
 2311 /* tp_nxt_offset and vlan are already populated above.
 2312 * So DON'T clear those fields here.
2313 */
2314 h.h3->tp_status |= status;
2315 h.h3->tp_len = skb->len;
2316 h.h3->tp_snaplen = snaplen;
2317 h.h3->tp_mac = macoff;
2318 h.h3->tp_net = netoff;
f6fb8f10 2319 h.h3->tp_sec = ts.tv_sec;
2320 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2321 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2322 hdrlen = sizeof(*h.h3);
2323 break;
bbd6ef87
PM
2324 default:
2325 BUG();
2326 }
1da177e4 2327
bbd6ef87 2328 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2329 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2330 sll->sll_family = AF_PACKET;
2331 sll->sll_hatype = dev->type;
2332 sll->sll_protocol = skb->protocol;
2333 sll->sll_pkttype = skb->pkt_type;
8032b464 2334 if (unlikely(po->origdev))
80feaacb
PWJ
2335 sll->sll_ifindex = orig_dev->ifindex;
2336 else
2337 sll->sll_ifindex = dev->ifindex;
1da177e4 2338
e16aa207 2339 smp_mb();
f0d4eb29 2340
f6dafa95 2341#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2342 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2343 u8 *start, *end;
2344
f0d4eb29
DB
2345 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2346 macoff + snaplen);
2347
2348 for (start = h.raw; start < end; start += PAGE_SIZE)
2349 flush_dcache_page(pgv_to_page(start));
1da177e4 2350 }
f0d4eb29 2351 smp_wmb();
f6dafa95 2352#endif
f0d4eb29 2353
da413eec 2354 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2355 __packet_set_status(po, h.raw, status);
da413eec
DC
2356 sk->sk_data_ready(sk);
2357 } else {
f6fb8f10 2358 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2359 }
1da177e4
LT
2360
2361drop_n_restore:
2362 if (skb_head != skb->data && skb_shared(skb)) {
2363 skb->data = skb_head;
2364 skb->len = skb_len;
2365 }
2366drop:
1ce4f28b 2367 kfree_skb(skb);
1da177e4
LT
2368 return 0;
2369
58d19b19 2370drop_n_account:
ee80fbf3 2371 po->stats.stats1.tp_drops++;
1da177e4
LT
2372 spin_unlock(&sk->sk_receive_queue.lock);
2373
676d2369 2374 sk->sk_data_ready(sk);
acb5d75b 2375 kfree_skb(copy_skb);
1da177e4
LT
2376 goto drop_n_restore;
2377}
2378
69e3c75f
JB
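/* Destructor for skbs built from the TX ring: drop the pending count,
 * store the transmit timestamp and hand the frame back to userspace by
 * setting it to TP_STATUS_AVAILABLE.
 */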
2379static void tpacket_destruct_skb(struct sk_buff *skb)
2380{
2381 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2382
69e3c75f 2383 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2384 void *ph;
b9c32fb2
DB
2385 __u32 ts;
2386
69e3c75f 2387 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2388 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2389
2390 ts = __packet_set_timestamp(po, ph, skb);
2391 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2392 }
2393
2394 sock_wfree(skb);
2395}
2396
9c707762
WB
2397static bool ll_header_truncated(const struct net_device *dev, int len)
2398{
2399 /* net device doesn't like empty head */
880621c2
MB
2400 if (unlikely(len < dev->hard_header_len)) {
2401 net_warn_ratelimited("%s: packet size is too short (%d < %d)\n",
9c707762
WB
2402 current->comm, len, dev->hard_header_len);
2403 return true;
2404 }
2405
2406 return false;
2407}
2408
c72219b7
DB
2409static void tpacket_set_protocol(const struct net_device *dev,
2410 struct sk_buff *skb)
2411{
2412 if (dev->type == ARPHRD_ETHER) {
2413 skb_reset_mac_header(skb);
2414 skb->protocol = eth_hdr(skb)->h_proto;
2415 }
2416}
2417
16cc1400
WB
2418static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2419{
2420 unsigned short gso_type = 0;
2421
2422 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2423 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2425 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2426 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2427 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2428 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2429
2430 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2431 return -EINVAL;
2432
2433 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2434 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2435 case VIRTIO_NET_HDR_GSO_TCPV4:
2436 gso_type = SKB_GSO_TCPV4;
2437 break;
2438 case VIRTIO_NET_HDR_GSO_TCPV6:
2439 gso_type = SKB_GSO_TCPV6;
2440 break;
2441 case VIRTIO_NET_HDR_GSO_UDP:
2442 gso_type = SKB_GSO_UDP;
2443 break;
2444 default:
2445 return -EINVAL;
2446 }
2447
2448 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2449 gso_type |= SKB_GSO_TCP_ECN;
2450
2451 if (vnet_hdr->gso_size == 0)
2452 return -EINVAL;
2453 }
2454
2455 vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
2456 return 0;
2457}
2458
2459static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2460 struct virtio_net_hdr *vnet_hdr)
2461{
2462 int n;
2463
2464 if (*len < sizeof(*vnet_hdr))
2465 return -EINVAL;
2466 *len -= sizeof(*vnet_hdr);
2467
2468 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
2469 if (n != sizeof(*vnet_hdr))
2470 return -EFAULT;
2471
2472 return __packet_snd_vnet_parse(vnet_hdr, *len);
2473}
2474
2475static int packet_snd_vnet_gso(struct sk_buff *skb,
2476 struct virtio_net_hdr *vnet_hdr)
2477{
2478 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2479 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2480 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2481
2482 if (!skb_partial_csum_set(skb, s, o))
2483 return -EINVAL;
2484 }
2485
2486 skb_shinfo(skb)->gso_size =
2487 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2488 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2489
2490 /* Header must be checked, and gso_segs computed. */
2491 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2492 skb_shinfo(skb)->gso_segs = 0;
2493 return 0;
2494}
2495
40d4e3df 2496static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2497 void *frame, struct net_device *dev, void *data, int tp_len,
1d036d25 2498 __be16 proto, unsigned char *addr, int hlen, int copylen)
69e3c75f 2499{
184f489e 2500 union tpacket_uhdr ph;
8d39b4a6 2501 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2502 struct socket *sock = po->sk.sk_socket;
2503 struct page *page;
69e3c75f
JB
2504 int err;
2505
2506 ph.raw = frame;
2507
2508 skb->protocol = proto;
2509 skb->dev = dev;
2510 skb->priority = po->sk.sk_priority;
2d37a186 2511 skb->mark = po->sk.sk_mark;
2e31396f 2512 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2513 skb_shinfo(skb)->destructor_arg = ph.raw;
2514
ae641949 2515 skb_reserve(skb, hlen);
69e3c75f 2516 skb_reset_network_header(skb);
c1aad275 2517
69e3c75f
JB
2518 to_write = tp_len;
2519
2520 if (sock->type == SOCK_DGRAM) {
2521 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2522 NULL, tp_len);
2523 if (unlikely(err < 0))
2524 return -EINVAL;
1d036d25 2525 } else if (copylen) {
69e3c75f 2526 skb_push(skb, dev->hard_header_len);
1d036d25
WB
2527 skb_put(skb, copylen - dev->hard_header_len);
2528 err = skb_store_bits(skb, 0, data, copylen);
69e3c75f
JB
2529 if (unlikely(err))
2530 return err;
c72219b7
DB
2531 if (!skb->protocol)
2532 tpacket_set_protocol(dev, skb);
69e3c75f 2533
1d036d25
WB
2534 data += copylen;
2535 to_write -= copylen;
69e3c75f
JB
2536 }
2537
69e3c75f
JB
2538 offset = offset_in_page(data);
2539 len_max = PAGE_SIZE - offset;
2540 len = ((to_write > len_max) ? len_max : to_write);
2541
2542 skb->data_len = to_write;
2543 skb->len += to_write;
2544 skb->truesize += to_write;
2545 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2546
2547 while (likely(to_write)) {
2548 nr_frags = skb_shinfo(skb)->nr_frags;
2549
2550 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2551 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2552 MAX_SKB_FRAGS);
69e3c75f
JB
2553 return -EFAULT;
2554 }
2555
0af55bb5
CG
2556 page = pgv_to_page(data);
2557 data += len;
69e3c75f
JB
2558 flush_dcache_page(page);
2559 get_page(page);
0af55bb5 2560 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2561 to_write -= len;
2562 offset = 0;
2563 len_max = PAGE_SIZE;
2564 len = ((to_write > len_max) ? len_max : to_write);
2565 }
2566
8fd6c80d 2567 skb_probe_transport_header(skb, 0);
efdfa2f7 2568
69e3c75f
JB
2569 return tp_len;
2570}
2571
8d39b4a6
WB
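/* Read tp_len and the start-of-data offset out of a TX ring frame
 * header, validating them against size_max and, if PACKET_TX_HAS_OFF
 * is set, against the legal tp_mac/tp_net offset range.
 */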
2572static int tpacket_parse_header(struct packet_sock *po, void *frame,
2573 int size_max, void **data)
2574{
2575 union tpacket_uhdr ph;
2576 int tp_len, off;
2577
2578 ph.raw = frame;
2579
2580 switch (po->tp_version) {
2581 case TPACKET_V2:
2582 tp_len = ph.h2->tp_len;
2583 break;
2584 default:
2585 tp_len = ph.h1->tp_len;
2586 break;
2587 }
2588 if (unlikely(tp_len > size_max)) {
2589 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2590 return -EMSGSIZE;
2591 }
2592
2593 if (unlikely(po->tp_tx_has_off)) {
2594 int off_min, off_max;
2595
2596 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2597 off_max = po->tx_ring.frame_size - tp_len;
2598 if (po->sk.sk_type == SOCK_DGRAM) {
2599 switch (po->tp_version) {
2600 case TPACKET_V2:
2601 off = ph.h2->tp_net;
2602 break;
2603 default:
2604 off = ph.h1->tp_net;
2605 break;
2606 }
2607 } else {
2608 switch (po->tp_version) {
2609 case TPACKET_V2:
2610 off = ph.h2->tp_mac;
2611 break;
2612 default:
2613 off = ph.h1->tp_mac;
2614 break;
2615 }
2616 }
2617 if (unlikely((off < off_min) || (off_max < off)))
2618 return -EINVAL;
2619 } else {
2620 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2621 }
2622
2623 *data = frame + off;
2624 return tp_len;
2625}
2626
69e3c75f
JB
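/* Send loop for PACKET_TX_RING: walk frames in TP_STATUS_SEND_REQUEST
 * state, build an skb for each and transmit it via po->xmit().  Unless
 * MSG_DONTWAIT was given, keep going until all pending frames have
 * been sent.
 */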
2627static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2628{
69e3c75f
JB
2629 struct sk_buff *skb;
2630 struct net_device *dev;
1d036d25 2631 struct virtio_net_hdr *vnet_hdr = NULL;
69e3c75f 2632 __be16 proto;
09effa67 2633 int err, reserve = 0;
40d4e3df 2634 void *ph;
342dfc30 2635 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2636 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2637 int tp_len, size_max;
2638 unsigned char *addr;
8d39b4a6 2639 void *data;
69e3c75f 2640 int len_sum = 0;
9e67030a 2641 int status = TP_STATUS_AVAILABLE;
1d036d25 2642 int hlen, tlen, copylen = 0;
69e3c75f 2643
69e3c75f
JB
2644 mutex_lock(&po->pg_vec_lock);
2645
66e56cd4 2646 if (likely(saddr == NULL)) {
e40526cb 2647 dev = packet_cached_dev_get(po);
69e3c75f
JB
2648 proto = po->num;
2649 addr = NULL;
2650 } else {
2651 err = -EINVAL;
2652 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2653 goto out;
2654 if (msg->msg_namelen < (saddr->sll_halen
2655 + offsetof(struct sockaddr_ll,
2656 sll_addr)))
2657 goto out;
69e3c75f
JB
2658 proto = saddr->sll_protocol;
2659 addr = saddr->sll_addr;
827d9780 2660 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2661 }
2662
69e3c75f
JB
2663 err = -ENXIO;
2664 if (unlikely(dev == NULL))
2665 goto out;
69e3c75f
JB
2666 err = -ENETDOWN;
2667 if (unlikely(!(dev->flags & IFF_UP)))
2668 goto out_put;
2669
5cfb4c8d
DB
2670 if (po->sk.sk_socket->type == SOCK_RAW)
2671 reserve = dev->hard_header_len;
69e3c75f 2672 size_max = po->tx_ring.frame_size
b5dd884e 2673 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2674
1d036d25 2675 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2676 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2677
69e3c75f
JB
2678 do {
2679 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2680 TP_STATUS_SEND_REQUEST);
69e3c75f 2681 if (unlikely(ph == NULL)) {
87a2fd28
DB
2682 if (need_wait && need_resched())
2683 schedule();
69e3c75f
JB
2684 continue;
2685 }
2686
8d39b4a6
WB
2687 skb = NULL;
2688 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2689 if (tp_len < 0)
2690 goto tpacket_error;
2691
69e3c75f 2692 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2693 hlen = LL_RESERVED_SPACE(dev);
2694 tlen = dev->needed_tailroom;
1d036d25
WB
2695 if (po->has_vnet_hdr) {
2696 vnet_hdr = data;
2697 data += sizeof(*vnet_hdr);
2698 tp_len -= sizeof(*vnet_hdr);
2699 if (tp_len < 0 ||
2700 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2701 tp_len = -EINVAL;
2702 goto tpacket_error;
2703 }
2704 copylen = __virtio16_to_cpu(vio_le(),
2705 vnet_hdr->hdr_len);
2706 }
2707 if (dev->hard_header_len) {
2708 if (ll_header_truncated(dev, tp_len)) {
2709 tp_len = -EINVAL;
2710 goto tpacket_error;
2711 }
2712 copylen = max_t(int, copylen, dev->hard_header_len);
2713 }
69e3c75f 2714 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2715 hlen + tlen + sizeof(struct sockaddr_ll) +
2716 (copylen - dev->hard_header_len),
fbf33a28 2717 !need_wait, &err);
69e3c75f 2718
fbf33a28
KM
2719 if (unlikely(skb == NULL)) {
2720 /* we assume the socket was initially writeable ... */
2721 if (likely(len_sum > 0))
2722 err = len_sum;
69e3c75f 2723 goto out_status;
fbf33a28 2724 }
8d39b4a6 2725 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
1d036d25 2726 addr, hlen, copylen);
dbd46ab4 2727 if (likely(tp_len >= 0) &&
5cfb4c8d 2728 tp_len > dev->mtu + reserve &&
1d036d25 2729 !po->has_vnet_hdr &&
3c70c132
DB
2730 !packet_extra_vlan_len_allowed(dev, skb))
2731 tp_len = -EMSGSIZE;
69e3c75f
JB
2732
2733 if (unlikely(tp_len < 0)) {
8d39b4a6 2734tpacket_error:
69e3c75f
JB
2735 if (po->tp_loss) {
2736 __packet_set_status(po, ph,
2737 TP_STATUS_AVAILABLE);
2738 packet_increment_head(&po->tx_ring);
2739 kfree_skb(skb);
2740 continue;
2741 } else {
2742 status = TP_STATUS_WRONG_FORMAT;
2743 err = tp_len;
2744 goto out_status;
2745 }
2746 }
2747
1d036d25
WB
2748 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
2749 tp_len = -EINVAL;
2750 goto tpacket_error;
2751 }
2752
0fd5d57b
DB
2753 packet_pick_tx_queue(dev, skb);
2754
69e3c75f
JB
2755 skb->destructor = tpacket_destruct_skb;
2756 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2757 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2758
2759 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2760 err = po->xmit(skb);
eb70df13
JP
2761 if (unlikely(err > 0)) {
2762 err = net_xmit_errno(err);
2763 if (err && __packet_get_status(po, ph) ==
2764 TP_STATUS_AVAILABLE) {
2765 /* skb was destructed already */
2766 skb = NULL;
2767 goto out_status;
2768 }
2769 /*
2770 * skb was dropped but not destructed yet;
2771 * let's treat it like congestion or err < 0
2772 */
2773 err = 0;
2774 }
69e3c75f
JB
2775 packet_increment_head(&po->tx_ring);
2776 len_sum += tp_len;
b0138408
DB
2777 } while (likely((ph != NULL) ||
 2778 /* Note: packet_read_pending() might be slow if we have
 2779 * to call it, as it's a per-cpu variable, but in the fast path
 2780 * we already short-circuit the loop with the first
 2781 * condition, and luckily don't have to go that path
 2782 * anyway.
2783 */
2784 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2785
2786 err = len_sum;
2787 goto out_put;
2788
69e3c75f
JB
2789out_status:
2790 __packet_set_status(po, ph, status);
2791 kfree_skb(skb);
2792out_put:
e40526cb 2793 dev_put(dev);
69e3c75f
JB
2794out:
2795 mutex_unlock(&po->pg_vec_lock);
2796 return err;
2797}
69e3c75f 2798
eea49cc9
OJ
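/* Allocate an skb whose first 'linear' bytes live in the linear area
 * and whose remainder is paged data; small packets are kept entirely
 * linear.
 */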
2799static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2800 size_t reserve, size_t len,
2801 size_t linear, int noblock,
2802 int *err)
bfd5f4a3
SS
2803{
2804 struct sk_buff *skb;
2805
2806 /* Under a page? Don't bother with paged skb. */
2807 if (prepad + len < PAGE_SIZE || !linear)
2808 linear = len;
2809
2810 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2811 err, 0);
bfd5f4a3
SS
2812 if (!skb)
2813 return NULL;
2814
2815 skb_reserve(skb, reserve);
2816 skb_put(skb, linear);
2817 skb->data_len = len - linear;
2818 skb->len += len - linear;
2819
2820 return skb;
2821}
2822
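/* Ordinary (non-ring) sendmsg path: resolve the output device, parse an
 * optional virtio_net_hdr, allocate and fill the skb from the message
 * iterator and transmit it via po->xmit().
 */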
d346a3fa 2823static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2824{
2825 struct sock *sk = sock->sk;
342dfc30 2826 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2827 struct sk_buff *skb;
2828 struct net_device *dev;
0e11c91e 2829 __be16 proto;
1da177e4 2830 unsigned char *addr;
827d9780 2831 int err, reserve = 0;
c7d39e32 2832 struct sockcm_cookie sockc;
bfd5f4a3
SS
2833 struct virtio_net_hdr vnet_hdr = { 0 };
2834 int offset = 0;
bfd5f4a3 2835 struct packet_sock *po = pkt_sk(sk);
ae641949 2836 int hlen, tlen;
3bdc0eba 2837 int extra_len = 0;
1da177e4
LT
2838
2839 /*
1ce4f28b 2840 * Get and verify the address.
1da177e4 2841 */
1ce4f28b 2842
66e56cd4 2843 if (likely(saddr == NULL)) {
e40526cb 2844 dev = packet_cached_dev_get(po);
1da177e4
LT
2845 proto = po->num;
2846 addr = NULL;
2847 } else {
2848 err = -EINVAL;
2849 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2850 goto out;
0fb375fb
EB
2851 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2852 goto out;
1da177e4
LT
2853 proto = saddr->sll_protocol;
2854 addr = saddr->sll_addr;
827d9780 2855 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2856 }
2857
1da177e4 2858 err = -ENXIO;
e40526cb 2859 if (unlikely(dev == NULL))
1da177e4 2860 goto out_unlock;
d5e76b0a 2861 err = -ENETDOWN;
e40526cb 2862 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2863 goto out_unlock;
2864
c7d39e32
EJ
2865 sockc.mark = sk->sk_mark;
2866 if (msg->msg_controllen) {
2867 err = sock_cmsg_send(sk, msg, &sockc);
2868 if (unlikely(err))
2869 goto out_unlock;
2870 }
2871
e40526cb
DB
2872 if (sock->type == SOCK_RAW)
2873 reserve = dev->hard_header_len;
bfd5f4a3 2874 if (po->has_vnet_hdr) {
16cc1400
WB
2875 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2876 if (err)
bfd5f4a3 2877 goto out_unlock;
bfd5f4a3
SS
2878 }
2879
3bdc0eba
BG
2880 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2881 if (!netif_supports_nofcs(dev)) {
2882 err = -EPROTONOSUPPORT;
2883 goto out_unlock;
2884 }
2885 extra_len = 4; /* We're doing our own CRC */
2886 }
2887
1da177e4 2888 err = -EMSGSIZE;
16cc1400
WB
2889 if (!vnet_hdr.gso_type &&
2890 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2891 goto out_unlock;
2892
bfd5f4a3 2893 err = -ENOBUFS;
ae641949
HX
2894 hlen = LL_RESERVED_SPACE(dev);
2895 tlen = dev->needed_tailroom;
dc9e5153 2896 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
d3869efe 2897 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
bfd5f4a3 2898 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2899 if (skb == NULL)
1da177e4
LT
2900 goto out_unlock;
2901
bfd5f4a3 2902 skb_set_network_header(skb, reserve);
1da177e4 2903
0c4e8581 2904 err = -EINVAL;
9c707762
WB
2905 if (sock->type == SOCK_DGRAM) {
2906 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2907 if (unlikely(offset < 0))
9c707762
WB
2908 goto out_free;
2909 } else {
2910 if (ll_header_truncated(dev, len))
2911 goto out_free;
2912 }
1da177e4
LT
2913
2914 /* Returns -EFAULT on error */
c0371da6 2915 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2916 if (err)
2917 goto out_free;
bf84a010
DB
2918
2919 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2920
16cc1400 2921 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2922 !packet_extra_vlan_len_allowed(dev, skb)) {
2923 err = -EMSGSIZE;
2924 goto out_free;
57f89bfa
BG
2925 }
2926
09effa67
DM
2927 skb->protocol = proto;
2928 skb->dev = dev;
1da177e4 2929 skb->priority = sk->sk_priority;
c7d39e32 2930 skb->mark = sockc.mark;
0fd5d57b
DB
2931
2932 packet_pick_tx_queue(dev, skb);
1da177e4 2933
bfd5f4a3 2934 if (po->has_vnet_hdr) {
16cc1400
WB
2935 err = packet_snd_vnet_gso(skb, &vnet_hdr);
2936 if (err)
2937 goto out_free;
2938 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2939 }
2940
8fd6c80d
DB
2941 skb_probe_transport_header(skb, reserve);
2942
3bdc0eba
BG
2943 if (unlikely(extra_len == 4))
2944 skb->no_fcs = 1;
2945
d346a3fa 2946 err = po->xmit(skb);
1da177e4
LT
2947 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2948 goto out_unlock;
2949
e40526cb 2950 dev_put(dev);
1da177e4 2951
40d4e3df 2952 return len;
1da177e4
LT
2953
2954out_free:
2955 kfree_skb(skb);
2956out_unlock:
e40526cb 2957 if (dev)
1da177e4
LT
2958 dev_put(dev);
2959out:
2960 return err;
2961}
2962
1b784140 2963static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2964{
69e3c75f
JB
2965 struct sock *sk = sock->sk;
2966 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2967
69e3c75f
JB
2968 if (po->tx_ring.pg_vec)
2969 return tpacket_snd(po, msg);
2970 else
69e3c75f
JB
2971 return packet_snd(sock, msg, len);
2972}
2973
1da177e4
LT
2974/*
2975 * Close a PACKET socket. This is fairly simple. We immediately go
2976 * to 'closed' state and remove our protocol entry in the device list.
2977 */
2978
2979static int packet_release(struct socket *sock)
2980{
2981 struct sock *sk = sock->sk;
2982 struct packet_sock *po;
d12d01d6 2983 struct net *net;
f6fb8f10 2984 union tpacket_req_u req_u;
1da177e4
LT
2985
2986 if (!sk)
2987 return 0;
2988
3b1e0a65 2989 net = sock_net(sk);
1da177e4
LT
2990 po = pkt_sk(sk);
2991
0fa7fa98 2992 mutex_lock(&net->packet.sklist_lock);
808f5114 2993 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2994 mutex_unlock(&net->packet.sklist_lock);
2995
2996 preempt_disable();
920de804 2997 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2998 preempt_enable();
1da177e4 2999
808f5114 3000 spin_lock(&po->bind_lock);
ce06b03e 3001 unregister_prot_hook(sk, false);
66e56cd4
DB
3002 packet_cached_dev_reset(po);
3003
160ff18a
BG
3004 if (po->prot_hook.dev) {
3005 dev_put(po->prot_hook.dev);
3006 po->prot_hook.dev = NULL;
3007 }
808f5114 3008 spin_unlock(&po->bind_lock);
1da177e4 3009
1da177e4 3010 packet_flush_mclist(sk);
1da177e4 3011
9665d5d6
PS
3012 if (po->rx_ring.pg_vec) {
3013 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3014 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3015 }
69e3c75f 3016
9665d5d6
PS
3017 if (po->tx_ring.pg_vec) {
3018 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3019 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3020 }
1da177e4 3021
dc99f600
DM
3022 fanout_release(sk);
3023
808f5114 3024 synchronize_net();
1da177e4
LT
3025 /*
3026 * Now the socket is dead. No more input will appear.
3027 */
1da177e4
LT
3028 sock_orphan(sk);
3029 sock->sk = NULL;
3030
3031 /* Purge queues */
3032
3033 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3034 packet_free_pending(po);
17ab56a2 3035 sk_refcnt_debug_release(sk);
1da177e4
LT
3036
3037 sock_put(sk);
3038 return 0;
3039}
3040
3041/*
3042 * Attach a packet hook.
3043 */
3044
30f7ea1c
FR
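/* Core of bind(): under bind_lock, resolve the target device by name or
 * ifindex, unhook the old prot_hook if the protocol or device changes,
 * update po->num and the cached device, and re-register the hook when
 * the new protocol is non-zero and the device is up.
 */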
3045static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3046 __be16 proto)
1da177e4
LT
3047{
3048 struct packet_sock *po = pkt_sk(sk);
158cd4af 3049 struct net_device *dev_curr;
902fefb8
DB
3050 __be16 proto_curr;
3051 bool need_rehook;
30f7ea1c
FR
3052 struct net_device *dev = NULL;
3053 int ret = 0;
3054 bool unlisted = false;
dc99f600 3055
30f7ea1c 3056 if (po->fanout)
dc99f600 3057 return -EINVAL;
1da177e4
LT
3058
3059 lock_sock(sk);
1da177e4 3060 spin_lock(&po->bind_lock);
30f7ea1c
FR
3061 rcu_read_lock();
3062
3063 if (name) {
3064 dev = dev_get_by_name_rcu(sock_net(sk), name);
3065 if (!dev) {
3066 ret = -ENODEV;
3067 goto out_unlock;
3068 }
3069 } else if (ifindex) {
3070 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3071 if (!dev) {
3072 ret = -ENODEV;
3073 goto out_unlock;
3074 }
3075 }
3076
3077 if (dev)
3078 dev_hold(dev);
66e56cd4 3079
902fefb8
DB
3080 proto_curr = po->prot_hook.type;
3081 dev_curr = po->prot_hook.dev;
3082
3083 need_rehook = proto_curr != proto || dev_curr != dev;
3084
3085 if (need_rehook) {
30f7ea1c
FR
3086 if (po->running) {
3087 rcu_read_unlock();
3088 __unregister_prot_hook(sk, true);
3089 rcu_read_lock();
3090 dev_curr = po->prot_hook.dev;
3091 if (dev)
3092 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3093 dev->ifindex);
3094 }
1da177e4 3095
902fefb8
DB
3096 po->num = proto;
3097 po->prot_hook.type = proto;
902fefb8 3098
30f7ea1c
FR
3099 if (unlikely(unlisted)) {
3100 dev_put(dev);
3101 po->prot_hook.dev = NULL;
3102 po->ifindex = -1;
3103 packet_cached_dev_reset(po);
3104 } else {
3105 po->prot_hook.dev = dev;
3106 po->ifindex = dev ? dev->ifindex : 0;
3107 packet_cached_dev_assign(po, dev);
3108 }
902fefb8 3109 }
158cd4af
LW
3110 if (dev_curr)
3111 dev_put(dev_curr);
66e56cd4 3112
902fefb8 3113 if (proto == 0 || !need_rehook)
1da177e4
LT
3114 goto out_unlock;
3115
30f7ea1c 3116 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3117 register_prot_hook(sk);
be85d4ad
UT
3118 } else {
3119 sk->sk_err = ENETDOWN;
3120 if (!sock_flag(sk, SOCK_DEAD))
3121 sk->sk_error_report(sk);
1da177e4
LT
3122 }
3123
3124out_unlock:
30f7ea1c 3125 rcu_read_unlock();
1da177e4
LT
3126 spin_unlock(&po->bind_lock);
3127 release_sock(sk);
30f7ea1c 3128 return ret;
1da177e4
LT
3129}
3130
3131/*
3132 * Bind a packet socket to a device
3133 */
3134
40d4e3df
ED
3135static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3136 int addr_len)
1da177e4 3137{
40d4e3df 3138 struct sock *sk = sock->sk;
1da177e4 3139 char name[15];
1ce4f28b 3140
1da177e4
LT
3141 /*
3142 * Check legality
3143 */
1ce4f28b 3144
8ae55f04 3145 if (addr_len != sizeof(struct sockaddr))
1da177e4 3146 return -EINVAL;
40d4e3df 3147 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 3148
30f7ea1c 3149 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3150}
1da177e4
LT
3151
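/* Bind an AF_PACKET socket to an interface/protocol via sockaddr_ll.
 * From userspace this is roughly (sketch, error handling omitted):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */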
3152static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3153{
40d4e3df
ED
3154 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3155 struct sock *sk = sock->sk;
1da177e4
LT
3156
3157 /*
3158 * Check legality
3159 */
1ce4f28b 3160
1da177e4
LT
3161 if (addr_len < sizeof(struct sockaddr_ll))
3162 return -EINVAL;
3163 if (sll->sll_family != AF_PACKET)
3164 return -EINVAL;
3165
30f7ea1c
FR
3166 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3167 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3168}
3169
3170static struct proto packet_proto = {
3171 .name = "PACKET",
3172 .owner = THIS_MODULE,
3173 .obj_size = sizeof(struct packet_sock),
3174};
3175
3176/*
1ce4f28b 3177 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3178 */
3179
3f378b68
EP
3180static int packet_create(struct net *net, struct socket *sock, int protocol,
3181 int kern)
1da177e4
LT
3182{
3183 struct sock *sk;
3184 struct packet_sock *po;
0e11c91e 3185 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3186 int err;
3187
df008c91 3188 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3189 return -EPERM;
be02097c
DM
3190 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3191 sock->type != SOCK_PACKET)
1da177e4
LT
3192 return -ESOCKTNOSUPPORT;
3193
3194 sock->state = SS_UNCONNECTED;
3195
3196 err = -ENOBUFS;
11aa9c28 3197 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3198 if (sk == NULL)
3199 goto out;
3200
3201 sock->ops = &packet_ops;
1da177e4
LT
3202 if (sock->type == SOCK_PACKET)
3203 sock->ops = &packet_ops_spkt;
be02097c 3204
1da177e4
LT
3205 sock_init_data(sock, sk);
3206
3207 po = pkt_sk(sk);
3208 sk->sk_family = PF_PACKET;
0e11c91e 3209 po->num = proto;
d346a3fa 3210 po->xmit = dev_queue_xmit;
66e56cd4 3211
b0138408
DB
3212 err = packet_alloc_pending(po);
3213 if (err)
3214 goto out2;
3215
66e56cd4 3216 packet_cached_dev_reset(po);
1da177e4
LT
3217
3218 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3219 sk_refcnt_debug_inc(sk);
1da177e4
LT
3220
3221 /*
3222 * Attach a protocol block
3223 */
3224
3225 spin_lock_init(&po->bind_lock);
905db440 3226 mutex_init(&po->pg_vec_lock);
0648ab70 3227 po->rollover = NULL;
1da177e4 3228 po->prot_hook.func = packet_rcv;
be02097c 3229
1da177e4
LT
3230 if (sock->type == SOCK_PACKET)
3231 po->prot_hook.func = packet_rcv_spkt;
be02097c 3232
1da177e4
LT
3233 po->prot_hook.af_packet_priv = sk;
3234
0e11c91e
AV
3235 if (proto) {
3236 po->prot_hook.type = proto;
ce06b03e 3237 register_prot_hook(sk);
1da177e4
LT
3238 }
3239
0fa7fa98 3240 mutex_lock(&net->packet.sklist_lock);
808f5114 3241 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3242 mutex_unlock(&net->packet.sklist_lock);
3243
3244 preempt_disable();
3680453c 3245 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3246 preempt_enable();
808f5114 3247
40d4e3df 3248 return 0;
b0138408
DB
3249out2:
3250 sk_free(sk);
1da177e4
LT
3251out:
3252 return err;
3253}
3254
3255/*
3256 * Pull a packet from our receive queue and hand it to the user.
3257 * If necessary we block.
3258 */
3259
1b784140
YX
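/* recvmsg(): pull one skb off the receive queue, optionally prepend a
 * virtio_net_hdr, copy the data into the user's buffer, fill in
 * msg_name with the stored sockaddr_pkt/sockaddr_ll and, if requested,
 * attach a PACKET_AUXDATA control message.
 */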
3260static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3261 int flags)
1da177e4
LT
3262{
3263 struct sock *sk = sock->sk;
3264 struct sk_buff *skb;
3265 int copied, err;
bfd5f4a3 3266 int vnet_hdr_len = 0;
2472d761 3267 unsigned int origlen = 0;
1da177e4
LT
3268
3269 err = -EINVAL;
ed85b565 3270 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3271 goto out;
3272
3273#if 0
3274 /* What error should we return now? EUNATTACH? */
3275 if (pkt_sk(sk)->ifindex < 0)
3276 return -ENODEV;
3277#endif
3278
ed85b565 3279 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3280 err = sock_recv_errqueue(sk, msg, len,
3281 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3282 goto out;
3283 }
3284
1da177e4
LT
3285 /*
3286 * Call the generic datagram receiver. This handles all sorts
3287 * of horrible races and re-entrancy so we can forget about it
3288 * in the protocol layers.
3289 *
 3290 * Now it will return ENETDOWN if the device has just gone down,
3291 * but then it will block.
3292 */
3293
40d4e3df 3294 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3295
3296 /*
1ce4f28b 3297 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3298 * handles the blocking, we don't need to see or worry about
 3299 * blocking retries.
3300 */
3301
8ae55f04 3302 if (skb == NULL)
1da177e4
LT
3303 goto out;
3304
2ccdbaa6
WB
3305 if (pkt_sk(sk)->pressure)
3306 packet_rcv_has_room(pkt_sk(sk), NULL);
3307
bfd5f4a3 3308 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3309 err = packet_rcv_vnet(msg, skb, &len);
3310 if (err)
bfd5f4a3 3311 goto out_free;
16cc1400 3312 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3313 }
3314
f3d33426
HFS
3315 /* You lose any data beyond the buffer you gave. If it worries
 3316 * a user program, it can ask the device for its MTU
3317 * anyway.
1da177e4 3318 */
1da177e4 3319 copied = skb->len;
40d4e3df
ED
3320 if (copied > len) {
3321 copied = len;
3322 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3323 }
3324
51f3d02b 3325 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3326 if (err)
3327 goto out_free;
3328
2472d761
EB
3329 if (sock->type != SOCK_PACKET) {
3330 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3331
3332 /* Original length was stored in sockaddr_ll fields */
3333 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3334 sll->sll_family = AF_PACKET;
3335 sll->sll_protocol = skb->protocol;
3336 }
3337
3b885787 3338 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3339
f3d33426
HFS
3340 if (msg->msg_name) {
3341 /* If the address length field is there to be filled
3342 * in, we fill it in now.
3343 */
3344 if (sock->type == SOCK_PACKET) {
342dfc30 3345 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3346 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3347 } else {
3348 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3349
f3d33426
HFS
3350 msg->msg_namelen = sll->sll_halen +
3351 offsetof(struct sockaddr_ll, sll_addr);
3352 }
ffbc6111
HX
3353 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3354 msg->msg_namelen);
f3d33426 3355 }
1da177e4 3356
8dc41944 3357 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3358 struct tpacket_auxdata aux;
3359
3360 aux.tp_status = TP_STATUS_USER;
3361 if (skb->ip_summed == CHECKSUM_PARTIAL)
3362 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3363 else if (skb->pkt_type != PACKET_OUTGOING &&
3364 (skb->ip_summed == CHECKSUM_COMPLETE ||
3365 skb_csum_unnecessary(skb)))
3366 aux.tp_status |= TP_STATUS_CSUM_VALID;
3367
2472d761 3368 aux.tp_len = origlen;
ffbc6111
HX
3369 aux.tp_snaplen = skb->len;
3370 aux.tp_mac = 0;
bbe735e4 3371 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3372 if (skb_vlan_tag_present(skb)) {
3373 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3374 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3375 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3376 } else {
3377 aux.tp_vlan_tci = 0;
a0cdfcf3 3378 aux.tp_vlan_tpid = 0;
a3bcc23e 3379 }
ffbc6111 3380 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3381 }
3382
1da177e4
LT
3383 /*
3384 * Free or return the buffer as appropriate. Again this
3385 * hides all the races and re-entrancy issues from us.
3386 */
bfd5f4a3 3387 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3388
3389out_free:
3390 skb_free_datagram(sk, skb);
3391out:
3392 return err;
3393}
3394
1da177e4
LT
3395static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3396 int *uaddr_len, int peer)
3397{
3398 struct net_device *dev;
3399 struct sock *sk = sock->sk;
3400
3401 if (peer)
3402 return -EOPNOTSUPP;
3403
3404 uaddr->sa_family = AF_PACKET;
2dc85bf3 3405 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3406 rcu_read_lock();
3407 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3408 if (dev)
2dc85bf3 3409 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3410 rcu_read_unlock();
1da177e4
LT
3411 *uaddr_len = sizeof(*uaddr);
3412
3413 return 0;
3414}
1da177e4
LT
3415
3416static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3417 int *uaddr_len, int peer)
3418{
3419 struct net_device *dev;
3420 struct sock *sk = sock->sk;
3421 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3422 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3423
3424 if (peer)
3425 return -EOPNOTSUPP;
3426
3427 sll->sll_family = AF_PACKET;
3428 sll->sll_ifindex = po->ifindex;
3429 sll->sll_protocol = po->num;
67286640 3430 sll->sll_pkttype = 0;
654d1f8a
ED
3431 rcu_read_lock();
3432 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3433 if (dev) {
3434 sll->sll_hatype = dev->type;
3435 sll->sll_halen = dev->addr_len;
3436 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3437 } else {
3438 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3439 sll->sll_halen = 0;
3440 }
654d1f8a 3441 rcu_read_unlock();
0fb375fb 3442 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3443
3444 return 0;
3445}
3446
2aeb0b88
WC
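/* Apply a single membership entry to the device: add or delete a
 * multicast/unicast address, or adjust the promiscuity/allmulti
 * counters, depending on the entry type.
 */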
3447static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3448 int what)
1da177e4
LT
3449{
3450 switch (i->type) {
3451 case PACKET_MR_MULTICAST:
1162563f
JP
3452 if (i->alen != dev->addr_len)
3453 return -EINVAL;
1da177e4 3454 if (what > 0)
22bedad3 3455 return dev_mc_add(dev, i->addr);
1da177e4 3456 else
22bedad3 3457 return dev_mc_del(dev, i->addr);
1da177e4
LT
3458 break;
3459 case PACKET_MR_PROMISC:
2aeb0b88 3460 return dev_set_promiscuity(dev, what);
1da177e4 3461 case PACKET_MR_ALLMULTI:
2aeb0b88 3462 return dev_set_allmulti(dev, what);
d95ed927 3463 case PACKET_MR_UNICAST:
1162563f
JP
3464 if (i->alen != dev->addr_len)
3465 return -EINVAL;
d95ed927 3466 if (what > 0)
a748ee24 3467 return dev_uc_add(dev, i->addr);
d95ed927 3468 else
a748ee24 3469 return dev_uc_del(dev, i->addr);
d95ed927 3470 break;
40d4e3df
ED
3471 default:
3472 break;
1da177e4 3473 }
2aeb0b88 3474 return 0;
1da177e4
LT
3475}
3476
82f17091
FR
3477static void packet_dev_mclist_delete(struct net_device *dev,
3478 struct packet_mclist **mlp)
1da177e4 3479{
82f17091
FR
3480 struct packet_mclist *ml;
3481
3482 while ((ml = *mlp) != NULL) {
3483 if (ml->ifindex == dev->ifindex) {
3484 packet_dev_mc(dev, ml, -1);
3485 *mlp = ml->next;
3486 kfree(ml);
3487 } else
3488 mlp = &ml->next;
1da177e4
LT
3489 }
3490}
3491
0fb375fb 3492static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3493{
3494 struct packet_sock *po = pkt_sk(sk);
3495 struct packet_mclist *ml, *i;
3496 struct net_device *dev;
3497 int err;
3498
3499 rtnl_lock();
3500
3501 err = -ENODEV;
3b1e0a65 3502 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3503 if (!dev)
3504 goto done;
3505
3506 err = -EINVAL;
1162563f 3507 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3508 goto done;
3509
3510 err = -ENOBUFS;
8b3a7005 3511 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3512 if (i == NULL)
3513 goto done;
3514
3515 err = 0;
3516 for (ml = po->mclist; ml; ml = ml->next) {
3517 if (ml->ifindex == mreq->mr_ifindex &&
3518 ml->type == mreq->mr_type &&
3519 ml->alen == mreq->mr_alen &&
3520 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3521 ml->count++;
3522 /* Free the new element ... */
3523 kfree(i);
3524 goto done;
3525 }
3526 }
3527
3528 i->type = mreq->mr_type;
3529 i->ifindex = mreq->mr_ifindex;
3530 i->alen = mreq->mr_alen;
3531 memcpy(i->addr, mreq->mr_address, i->alen);
3532 i->count = 1;
3533 i->next = po->mclist;
3534 po->mclist = i;
2aeb0b88
WC
3535 err = packet_dev_mc(dev, i, 1);
3536 if (err) {
3537 po->mclist = i->next;
3538 kfree(i);
3539 }
1da177e4
LT
3540
3541done:
3542 rtnl_unlock();
3543 return err;
3544}
3545
0fb375fb 3546static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3547{
3548 struct packet_mclist *ml, **mlp;
3549
3550 rtnl_lock();
3551
3552 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3553 if (ml->ifindex == mreq->mr_ifindex &&
3554 ml->type == mreq->mr_type &&
3555 ml->alen == mreq->mr_alen &&
3556 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3557 if (--ml->count == 0) {
3558 struct net_device *dev;
3559 *mlp = ml->next;
ad959e76
ED
3560 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3561 if (dev)
1da177e4 3562 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3563 kfree(ml);
3564 }
82f17091 3565 break;
1da177e4
LT
3566 }
3567 }
3568 rtnl_unlock();
82f17091 3569 return 0;
1da177e4
LT
3570}
3571
3572static void packet_flush_mclist(struct sock *sk)
3573{
3574 struct packet_sock *po = pkt_sk(sk);
3575 struct packet_mclist *ml;
3576
3577 if (!po->mclist)
3578 return;
3579
3580 rtnl_lock();
3581 while ((ml = po->mclist) != NULL) {
3582 struct net_device *dev;
3583
3584 po->mclist = ml->next;
ad959e76
ED
3585 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3586 if (dev != NULL)
1da177e4 3587 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3588 kfree(ml);
3589 }
3590 rtnl_unlock();
3591}
1da177e4
LT
3592
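/* Handle SOL_PACKET socket options.  A typical mmap'ed RX setup from
 * userspace is roughly (sketch, error handling omitted):
 *
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	struct tpacket_req3 req = { ... };
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */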
3593static int
b7058842 3594packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3595{
3596 struct sock *sk = sock->sk;
8dc41944 3597 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3598 int ret;
3599
3600 if (level != SOL_PACKET)
3601 return -ENOPROTOOPT;
3602
69e3c75f 3603 switch (optname) {
1ce4f28b 3604 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3605 case PACKET_DROP_MEMBERSHIP:
3606 {
0fb375fb
EB
3607 struct packet_mreq_max mreq;
3608 int len = optlen;
3609 memset(&mreq, 0, sizeof(mreq));
3610 if (len < sizeof(struct packet_mreq))
1da177e4 3611 return -EINVAL;
0fb375fb
EB
3612 if (len > sizeof(mreq))
3613 len = sizeof(mreq);
40d4e3df 3614 if (copy_from_user(&mreq, optval, len))
1da177e4 3615 return -EFAULT;
0fb375fb
EB
3616 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3617 return -EINVAL;
1da177e4
LT
3618 if (optname == PACKET_ADD_MEMBERSHIP)
3619 ret = packet_mc_add(sk, &mreq);
3620 else
3621 ret = packet_mc_drop(sk, &mreq);
3622 return ret;
3623 }
a2efcfa0 3624
1da177e4 3625 case PACKET_RX_RING:
69e3c75f 3626 case PACKET_TX_RING:
1da177e4 3627 {
f6fb8f10 3628 union tpacket_req_u req_u;
3629 int len;
1da177e4 3630
f6fb8f10 3631 switch (po->tp_version) {
3632 case TPACKET_V1:
3633 case TPACKET_V2:
3634 len = sizeof(req_u.req);
3635 break;
3636 case TPACKET_V3:
3637 default:
3638 len = sizeof(req_u.req3);
3639 break;
3640 }
3641 if (optlen < len)
1da177e4 3642 return -EINVAL;
f6fb8f10 3643 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3644 return -EFAULT;
f6fb8f10 3645 return packet_set_ring(sk, &req_u, 0,
3646 optname == PACKET_TX_RING);
1da177e4
LT
3647 }
3648 case PACKET_COPY_THRESH:
3649 {
3650 int val;
3651
40d4e3df 3652 if (optlen != sizeof(val))
1da177e4 3653 return -EINVAL;
40d4e3df 3654 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3655 return -EFAULT;
3656
3657 pkt_sk(sk)->copy_thresh = val;
3658 return 0;
3659 }
bbd6ef87
PM
3660 case PACKET_VERSION:
3661 {
3662 int val;
3663
3664 if (optlen != sizeof(val))
3665 return -EINVAL;
69e3c75f 3666 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3667 return -EBUSY;
3668 if (copy_from_user(&val, optval, sizeof(val)))
3669 return -EFAULT;
3670 switch (val) {
3671 case TPACKET_V1:
3672 case TPACKET_V2:
f6fb8f10 3673 case TPACKET_V3:
bbd6ef87
PM
3674 po->tp_version = val;
3675 return 0;
3676 default:
3677 return -EINVAL;
3678 }
3679 }
8913336a
PM
3680 case PACKET_RESERVE:
3681 {
3682 unsigned int val;
3683
3684 if (optlen != sizeof(val))
3685 return -EINVAL;
69e3c75f 3686 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3687 return -EBUSY;
3688 if (copy_from_user(&val, optval, sizeof(val)))
3689 return -EFAULT;
3690 po->tp_reserve = val;
3691 return 0;
3692 }
3693 case PACKET_LOSS:
3694 {
3695 unsigned int val;
3696
3697 if (optlen != sizeof(val))
3698 return -EINVAL;
3699 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3700 return -EBUSY;
3701 if (copy_from_user(&val, optval, sizeof(val)))
3702 return -EFAULT;
3703 po->tp_loss = !!val;
3704 return 0;
3705 }
3706 case PACKET_AUXDATA:
3707 {
3708 int val;
3709
3710 if (optlen < sizeof(val))
3711 return -EINVAL;
3712 if (copy_from_user(&val, optval, sizeof(val)))
3713 return -EFAULT;
3714
3715 po->auxdata = !!val;
3716 return 0;
3717 }
3718 case PACKET_ORIGDEV:
3719 {
3720 int val;
3721
3722 if (optlen < sizeof(val))
3723 return -EINVAL;
3724 if (copy_from_user(&val, optval, sizeof(val)))
3725 return -EFAULT;
3726
3727 po->origdev = !!val;
3728 return 0;
3729 }
3730 case PACKET_VNET_HDR:
3731 {
3732 int val;
3733
3734 if (sock->type != SOCK_RAW)
3735 return -EINVAL;
3736 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3737 return -EBUSY;
3738 if (optlen < sizeof(val))
3739 return -EINVAL;
3740 if (copy_from_user(&val, optval, sizeof(val)))
3741 return -EFAULT;
3742
3743 po->has_vnet_hdr = !!val;
3744 return 0;
3745 }
3746 case PACKET_TIMESTAMP:
3747 {
3748 int val;
3749
3750 if (optlen != sizeof(val))
3751 return -EINVAL;
3752 if (copy_from_user(&val, optval, sizeof(val)))
3753 return -EFAULT;
3754
3755 po->tp_tstamp = val;
3756 return 0;
3757 }
3758 case PACKET_FANOUT:
3759 {
3760 int val;
3761
3762 if (optlen != sizeof(val))
3763 return -EINVAL;
3764 if (copy_from_user(&val, optval, sizeof(val)))
3765 return -EFAULT;
3766
3767 return fanout_add(sk, val & 0xffff, val >> 16);
3768 }
3769 case PACKET_FANOUT_DATA:
3770 {
3771 if (!po->fanout)
3772 return -EINVAL;
3773
3774 return fanout_set_data(po, optval, optlen);
3775 }
3776 case PACKET_TX_HAS_OFF:
3777 {
3778 unsigned int val;
3779
3780 if (optlen != sizeof(val))
3781 return -EINVAL;
3782 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3783 return -EBUSY;
3784 if (copy_from_user(&val, optval, sizeof(val)))
3785 return -EFAULT;
3786 po->tp_tx_has_off = !!val;
3787 return 0;
3788 }
3789 case PACKET_QDISC_BYPASS:
3790 {
3791 int val;
3792
3793 if (optlen != sizeof(val))
3794 return -EINVAL;
3795 if (copy_from_user(&val, optval, sizeof(val)))
3796 return -EFAULT;
3797
3798 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3799 return 0;
3800 }
3801 default:
3802 return -ENOPROTOOPT;
3803 }
3804}
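The ring options above have an ordering constraint that matters to callers: PACKET_VERSION (and PACKET_RESERVE) return -EBUSY once a ring exists, so userspace must pick the TPACKET version before issuing PACKET_RX_RING or PACKET_TX_RING. A hedged userspace sketch for a TPACKET_V3 receive ring follows; the geometry is an arbitrary example, not a recommendation, and error handling is trimmed:

	/* Illustrative userspace sketch, not part of af_packet.c:
	 * negotiate TPACKET_V3 and request an RX ring. */
	#include <sys/socket.h>
	#include <linux/if_packet.h>
	#include <string.h>

	static int setup_rx_ring_v3(int fd)
	{
		int version = TPACKET_V3;
		struct tpacket_req3 req;

		/* Must happen while no ring is configured, else -EBUSY. */
		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
			       &version, sizeof(version)) < 0)
			return -1;

		memset(&req, 0, sizeof(req));
		req.tp_block_size = 1 << 22;	/* must be a multiple of PAGE_SIZE */
		req.tp_block_nr   = 64;
		req.tp_frame_size = 2048;	/* must be a multiple of TPACKET_ALIGNMENT */
		req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size)
				    * req.tp_block_nr;
		req.tp_retire_blk_tov = 60;	/* block retire timeout in ms */

		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
				  &req, sizeof(req));
	}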
3805
3806static int packet_getsockopt(struct socket *sock, int level, int optname,
3807 char __user *optval, int __user *optlen)
3808{
3809 int len;
c06fff6e 3810 int val, lv = sizeof(val);
3811 struct sock *sk = sock->sk;
3812 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3813 void *data = &val;
ee80fbf3 3814 union tpacket_stats_u st;
a9b63918 3815 struct tpacket_rollover_stats rstats;
3816
3817 if (level != SOL_PACKET)
3818 return -ENOPROTOOPT;
3819
3820 if (get_user(len, optlen))
3821 return -EFAULT;
3822
3823 if (len < 0)
3824 return -EINVAL;
1ce4f28b 3825
69e3c75f 3826 switch (optname) {
1da177e4 3827 case PACKET_STATISTICS:
1da177e4 3828 spin_lock_bh(&sk->sk_receive_queue.lock);
3829 memcpy(&st, &po->stats, sizeof(st));
3830 memset(&po->stats, 0, sizeof(po->stats));
3831 spin_unlock_bh(&sk->sk_receive_queue.lock);
3832
f6fb8f10 3833 if (po->tp_version == TPACKET_V3) {
c06fff6e 3834 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3835 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3836 data = &st.stats3;
f6fb8f10 3837 } else {
c06fff6e 3838 lv = sizeof(struct tpacket_stats);
8bcdeaff 3839 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3840 data = &st.stats1;
f6fb8f10 3841 }
ee80fbf3 3842
3843 break;
3844 case PACKET_AUXDATA:
8dc41944 3845 val = po->auxdata;
3846 break;
3847 case PACKET_ORIGDEV:
80feaacb 3848 val = po->origdev;
3849 break;
3850 case PACKET_VNET_HDR:
bfd5f4a3 3851 val = po->has_vnet_hdr;
1da177e4 3852 break;
bbd6ef87 3853 case PACKET_VERSION:
bbd6ef87 3854 val = po->tp_version;
3855 break;
3856 case PACKET_HDRLEN:
3857 if (len > sizeof(int))
3858 len = sizeof(int);
3859 if (copy_from_user(&val, optval, len))
3860 return -EFAULT;
3861 switch (val) {
3862 case TPACKET_V1:
3863 val = sizeof(struct tpacket_hdr);
3864 break;
3865 case TPACKET_V2:
3866 val = sizeof(struct tpacket2_hdr);
3867 break;
f6fb8f10 3868 case TPACKET_V3:
3869 val = sizeof(struct tpacket3_hdr);
3870 break;
3871 default:
3872 return -EINVAL;
3873 }
bbd6ef87 3874 break;
8913336a 3875 case PACKET_RESERVE:
8913336a 3876 val = po->tp_reserve;
8913336a 3877 break;
69e3c75f 3878 case PACKET_LOSS:
69e3c75f 3879 val = po->tp_loss;
69e3c75f 3880 break;
614f60fa 3881 case PACKET_TIMESTAMP:
614f60fa 3882 val = po->tp_tstamp;
614f60fa 3883 break;
dc99f600 3884 case PACKET_FANOUT:
3885 val = (po->fanout ?
3886 ((u32)po->fanout->id |
3887 ((u32)po->fanout->type << 16) |
3888 ((u32)po->fanout->flags << 24)) :
dc99f600 3889 0);
dc99f600 3890 break;
3891 case PACKET_ROLLOVER_STATS:
3892 if (!po->rollover)
3893 return -EINVAL;
3894 rstats.tp_all = atomic_long_read(&po->rollover->num);
3895 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3896 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3897 data = &rstats;
3898 lv = sizeof(rstats);
3899 break;
3900 case PACKET_TX_HAS_OFF:
3901 val = po->tp_tx_has_off;
3902 break;
3903 case PACKET_QDISC_BYPASS:
3904 val = packet_use_direct_xmit(po);
3905 break;
3906 default:
3907 return -ENOPROTOOPT;
3908 }
3909
3910 if (len > lv)
3911 len = lv;
3912 if (put_user(len, optlen))
3913 return -EFAULT;
3914 if (copy_to_user(optval, data, len))
3915 return -EFAULT;
8ae55f04 3916 return 0;
3917}
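Note that PACKET_STATISTICS is read-and-clear: the counters are snapshotted and zeroed under the receive-queue lock, and for TPACKET_V3 the kernel folds tp_drops into tp_packets before copying the tpacket_stats_v3 layout out. A small userspace sketch for the V3 case (assumes the socket already negotiated TPACKET_V3; illustrative only):

	/* Illustrative userspace sketch, not part of af_packet.c. */
	#include <sys/socket.h>
	#include <linux/if_packet.h>
	#include <stdio.h>

	static void dump_stats_v3(int fd)
	{
		struct tpacket_stats_v3 st;
		socklen_t len = sizeof(st);

		/* Reading resets the counters for the next interval. */
		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
			printf("packets=%u drops=%u freeze_q_cnt=%u\n",
			       st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);
	}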
3918
3919
3920static int packet_notifier(struct notifier_block *this,
3921 unsigned long msg, void *ptr)
3922{
3923 struct sock *sk;
351638e7 3924 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3925 struct net *net = dev_net(dev);
1da177e4 3926
808f5114 3927 rcu_read_lock();
b67bfe0d 3928 sk_for_each_rcu(sk, &net->packet.sklist) {
3929 struct packet_sock *po = pkt_sk(sk);
3930
3931 switch (msg) {
3932 case NETDEV_UNREGISTER:
1da177e4 3933 if (po->mclist)
82f17091 3934 packet_dev_mclist_delete(dev, &po->mclist);
3935 /* fallthrough */
3936
3937 case NETDEV_DOWN:
3938 if (dev->ifindex == po->ifindex) {
3939 spin_lock(&po->bind_lock);
3940 if (po->running) {
ce06b03e 3941 __unregister_prot_hook(sk, false);
3942 sk->sk_err = ENETDOWN;
3943 if (!sock_flag(sk, SOCK_DEAD))
3944 sk->sk_error_report(sk);
3945 }
3946 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3947 packet_cached_dev_reset(po);
1da177e4 3948 po->ifindex = -1;
3949 if (po->prot_hook.dev)
3950 dev_put(po->prot_hook.dev);
3951 po->prot_hook.dev = NULL;
3952 }
3953 spin_unlock(&po->bind_lock);
3954 }
3955 break;
3956 case NETDEV_UP:
808f5114 3957 if (dev->ifindex == po->ifindex) {
3958 spin_lock(&po->bind_lock);
3959 if (po->num)
3960 register_prot_hook(sk);
808f5114 3961 spin_unlock(&po->bind_lock);
1da177e4 3962 }
3963 break;
3964 }
3965 }
808f5114 3966 rcu_read_unlock();
3967 return NOTIFY_DONE;
3968}
3969
3970
3971static int packet_ioctl(struct socket *sock, unsigned int cmd,
3972 unsigned long arg)
3973{
3974 struct sock *sk = sock->sk;
3975
69e3c75f 3976 switch (cmd) {
3977 case SIOCOUTQ:
3978 {
3979 int amount = sk_wmem_alloc_get(sk);
31e6d363 3980
3981 return put_user(amount, (int __user *)arg);
3982 }
3983 case SIOCINQ:
3984 {
3985 struct sk_buff *skb;
3986 int amount = 0;
3987
3988 spin_lock_bh(&sk->sk_receive_queue.lock);
3989 skb = skb_peek(&sk->sk_receive_queue);
3990 if (skb)
3991 amount = skb->len;
3992 spin_unlock_bh(&sk->sk_receive_queue.lock);
3993 return put_user(amount, (int __user *)arg);
3994 }
3995 case SIOCGSTAMP:
3996 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3997 case SIOCGSTAMPNS:
3998 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3999
1da177e4 4000#ifdef CONFIG_INET
4001 case SIOCADDRT:
4002 case SIOCDELRT:
4003 case SIOCDARP:
4004 case SIOCGARP:
4005 case SIOCSARP:
4006 case SIOCGIFADDR:
4007 case SIOCSIFADDR:
4008 case SIOCGIFBRDADDR:
4009 case SIOCSIFBRDADDR:
4010 case SIOCGIFNETMASK:
4011 case SIOCSIFNETMASK:
4012 case SIOCGIFDSTADDR:
4013 case SIOCSIFDSTADDR:
4014 case SIOCSIFFLAGS:
40d4e3df 4015 return inet_dgram_ops.ioctl(sock, cmd, arg);
4016#endif
4017
4018 default:
4019 return -ENOIOCTLCMD;
4020 }
4021 return 0;
4022}
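As the skb_peek() above shows, SIOCINQ on a packet socket reports the length of the next queued frame, not the total number of queued bytes. An illustrative one-liner from userspace (the helper name is hypothetical, not defined by this file):

	/* Illustrative userspace sketch, not part of af_packet.c. */
	#include <sys/ioctl.h>
	#include <linux/sockios.h>

	static int next_frame_len(int fd)
	{
		int len = 0;

		/* Returns the length of the frame at the head of the receive queue. */
		return ioctl(fd, SIOCINQ, &len) == 0 ? len : -1;
	}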
4023
40d4e3df 4024static unsigned int packet_poll(struct file *file, struct socket *sock,
4025 poll_table *wait)
4026{
4027 struct sock *sk = sock->sk;
4028 struct packet_sock *po = pkt_sk(sk);
4029 unsigned int mask = datagram_poll(file, sock, wait);
4030
4031 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4032 if (po->rx_ring.pg_vec) {
f6fb8f10 4033 if (!packet_previous_rx_frame(po, &po->rx_ring,
4034 TP_STATUS_KERNEL))
4035 mask |= POLLIN | POLLRDNORM;
4036 }
2ccdbaa6 4037 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4038 po->pressure = 0;
1da177e4 4039 spin_unlock_bh(&sk->sk_receive_queue.lock);
4040 spin_lock_bh(&sk->sk_write_queue.lock);
4041 if (po->tx_ring.pg_vec) {
4042 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4043 mask |= POLLOUT | POLLWRNORM;
4044 }
4045 spin_unlock_bh(&sk->sk_write_queue.lock);
4046 return mask;
4047}
4048
4049
4050 /* Dirty? Well, I still have not learned a better way to account
4051  * for user mmaps.
4052  */
4053
4054static void packet_mm_open(struct vm_area_struct *vma)
4055{
4056 struct file *file = vma->vm_file;
40d4e3df 4057 struct socket *sock = file->private_data;
1da177e4 4058 struct sock *sk = sock->sk;
1ce4f28b 4059
4060 if (sk)
4061 atomic_inc(&pkt_sk(sk)->mapped);
4062}
4063
4064static void packet_mm_close(struct vm_area_struct *vma)
4065{
4066 struct file *file = vma->vm_file;
40d4e3df 4067 struct socket *sock = file->private_data;
1da177e4 4068 struct sock *sk = sock->sk;
1ce4f28b 4069
4070 if (sk)
4071 atomic_dec(&pkt_sk(sk)->mapped);
4072}
4073
f0f37e2f 4074static const struct vm_operations_struct packet_mmap_ops = {
4075 .open = packet_mm_open,
4076 .close = packet_mm_close,
4077};
4078
4079static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4080 unsigned int len)
4081{
4082 int i;
4083
4ebf0ae2 4084 for (i = 0; i < len; i++) {
0e3125c7 4085 if (likely(pg_vec[i].buffer)) {
c56b4d90 4086 if (is_vmalloc_addr(pg_vec[i].buffer))
4087 vfree(pg_vec[i].buffer);
4088 else
4089 free_pages((unsigned long)pg_vec[i].buffer,
4090 order);
4091 pg_vec[i].buffer = NULL;
4092 }
4093 }
4094 kfree(pg_vec);
4095}
4096
eea49cc9 4097static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4098{
f0d4eb29 4099 char *buffer;
4100 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4101 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4102
4103 buffer = (char *) __get_free_pages(gfp_flags, order);
4104 if (buffer)
4105 return buffer;
4106
f0d4eb29 4107 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4108 buffer = vzalloc((1 << order) * PAGE_SIZE);
4109 if (buffer)
4110 return buffer;
4111
f0d4eb29 4112 /* vmalloc failed, lets dig into swap here */
0e3125c7 4113 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4114 buffer = (char *) __get_free_pages(gfp_flags, order);
4115 if (buffer)
4116 return buffer;
4117
f0d4eb29 4118 /* complete and utter failure */
0e3125c7 4119 return NULL;
4120}
4121
0e3125c7 4122static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4123{
4124 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4125 struct pgv *pg_vec;
4126 int i;
4127
0e3125c7 4128 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4129 if (unlikely(!pg_vec))
4130 goto out;
4131
4132 for (i = 0; i < block_nr; i++) {
c56b4d90 4133 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4134 if (unlikely(!pg_vec[i].buffer))
4135 goto out_free_pgvec;
4136 }
4137
4138out:
4139 return pg_vec;
4140
4141out_free_pgvec:
4142 free_pg_vec(pg_vec, order, block_nr);
4143 pg_vec = NULL;
4144 goto out;
4145}
1da177e4 4146
f6fb8f10 4147static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4148 int closing, int tx_ring)
1da177e4 4149{
0e3125c7 4150 struct pgv *pg_vec = NULL;
1da177e4 4151 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4152 int was_running, order = 0;
4153 struct packet_ring_buffer *rb;
4154 struct sk_buff_head *rb_queue;
0e11c91e 4155 __be16 num;
f6fb8f10 4156 int err = -EINVAL;
4157 /* Added to avoid minimal code churn */
4158 struct tpacket_req *req = &req_u->req;
4159
4160 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4161 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4162 WARN(1, "Tx-ring is not supported.\n");
4163 goto out;
4164 }
1ce4f28b 4165
4166 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4167 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4168
4169 err = -EBUSY;
4170 if (!closing) {
4171 if (atomic_read(&po->mapped))
4172 goto out;
b0138408 4173 if (packet_read_pending(rb))
4174 goto out;
4175 }
1da177e4 4176
4177 if (req->tp_block_nr) {
4178 /* Sanity tests and some calculations */
4179 err = -EBUSY;
4180 if (unlikely(rb->pg_vec))
4181 goto out;
1da177e4 4182
4183 switch (po->tp_version) {
4184 case TPACKET_V1:
4185 po->tp_hdrlen = TPACKET_HDRLEN;
4186 break;
4187 case TPACKET_V2:
4188 po->tp_hdrlen = TPACKET2_HDRLEN;
4189 break;
f6fb8f10 4190 case TPACKET_V3:
4191 po->tp_hdrlen = TPACKET3_HDRLEN;
4192 break;
4193 }
4194
69e3c75f 4195 err = -EINVAL;
4ebf0ae2 4196 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4197 goto out;
90836b67 4198 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4199 goto out;
4200 if (po->tp_version >= TPACKET_V3 &&
4201 (int)(req->tp_block_size -
4202 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4203 goto out;
8913336a 4204 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4205 po->tp_reserve))
4206 goto out;
4ebf0ae2 4207 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4208 goto out;
1da177e4 4209
4210 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4211 if (unlikely(rb->frames_per_block == 0))
4212 goto out;
4213 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4214 req->tp_frame_nr))
4215 goto out;
4216
4217 err = -ENOMEM;
4218 order = get_order(req->tp_block_size);
4219 pg_vec = alloc_pg_vec(req, order);
4220 if (unlikely(!pg_vec))
1da177e4 4221 goto out;
f6fb8f10 4222 switch (po->tp_version) {
4223 case TPACKET_V3:
4224 /* Transmit path is not supported. We checked
4225 * it above but just being paranoid
4226 */
4227 if (!tx_ring)
e8e85cc5 4228 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4229 break;
f6fb8f10 4230 default:
4231 break;
4232 }
4233 }
4234 /* Done */
4235 else {
4236 err = -EINVAL;
4ebf0ae2 4237 if (unlikely(req->tp_frame_nr))
69e3c75f 4238 goto out;
4239 }
4240
4241 lock_sock(sk);
4242
4243 /* Detach socket from network */
4244 spin_lock(&po->bind_lock);
4245 was_running = po->running;
4246 num = po->num;
4247 if (was_running) {
1da177e4 4248 po->num = 0;
ce06b03e 4249 __unregister_prot_hook(sk, false);
4250 }
4251 spin_unlock(&po->bind_lock);
1ce4f28b 4252
4253 synchronize_net();
4254
4255 err = -EBUSY;
905db440 4256 mutex_lock(&po->pg_vec_lock);
4257 if (closing || atomic_read(&po->mapped) == 0) {
4258 err = 0;
69e3c75f 4259 spin_lock_bh(&rb_queue->lock);
c053fd96 4260 swap(rb->pg_vec, pg_vec);
4261 rb->frame_max = (req->tp_frame_nr - 1);
4262 rb->head = 0;
4263 rb->frame_size = req->tp_frame_size;
4264 spin_unlock_bh(&rb_queue->lock);
4265
4266 swap(rb->pg_vec_order, order);
4267 swap(rb->pg_vec_len, req->tp_block_nr);
4268
4269 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4270 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4271 tpacket_rcv : packet_rcv;
4272 skb_queue_purge(rb_queue);
1da177e4 4273 if (atomic_read(&po->mapped))
4274 pr_err("packet_mmap: vma is busy: %d\n",
4275 atomic_read(&po->mapped));
1da177e4 4276 }
905db440 4277 mutex_unlock(&po->pg_vec_lock);
4278
4279 spin_lock(&po->bind_lock);
ce06b03e 4280 if (was_running) {
1da177e4 4281 po->num = num;
ce06b03e 4282 register_prot_hook(sk);
4283 }
4284 spin_unlock(&po->bind_lock);
f6fb8f10 4285 if (closing && (po->tp_version > TPACKET_V2)) {
4286 /* Because we don't support block-based V3 on tx-ring */
4287 if (!tx_ring)
73d0fcf2 4288 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4289 }
4290 release_sock(sk);
4291
4292 if (pg_vec)
4293 free_pg_vec(pg_vec, order, req->tp_block_nr);
4294out:
4295 return err;
4296}
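packet_set_ring() ties the requested sizes together: tp_block_size must be page-aligned and large enough for at least one frame (plus the per-block private area for V3), tp_frame_size must be TPACKET_ALIGNMENT-aligned and no smaller than tp_hdrlen + tp_reserve, and tp_frame_nr must equal frames_per_block * tp_block_nr exactly. A quick self-check of the example geometry used earlier (the 4 KiB page size and 16-byte alignment are assumptions matching common defaults):

	/* Illustrative check mirroring the packet_set_ring() sanity tests;
	 * the values are the example geometry from the RX-ring sketch above. */
	#include <assert.h>

	static void check_example_ring_geometry(void)
	{
		unsigned int block_size = 1u << 22;
		unsigned int block_nr = 64;
		unsigned int frame_size = 2048;
		unsigned int frame_nr = (block_size / frame_size) * block_nr;
		unsigned int frames_per_block = block_size / frame_size;

		assert(block_size % 4096 == 0);		/* PAGE_ALIGNED(), assuming 4 KiB pages */
		assert(frame_size % 16 == 0);		/* TPACKET_ALIGNMENT */
		assert(frames_per_block != 0);
		assert(frames_per_block * block_nr == frame_nr);
	}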
4297
4298static int packet_mmap(struct file *file, struct socket *sock,
4299 struct vm_area_struct *vma)
4300{
4301 struct sock *sk = sock->sk;
4302 struct packet_sock *po = pkt_sk(sk);
4303 unsigned long size, expected_size;
4304 struct packet_ring_buffer *rb;
4305 unsigned long start;
4306 int err = -EINVAL;
4307 int i;
4308
4309 if (vma->vm_pgoff)
4310 return -EINVAL;
4311
905db440 4312 mutex_lock(&po->pg_vec_lock);
4313
4314 expected_size = 0;
4315 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4316 if (rb->pg_vec) {
4317 expected_size += rb->pg_vec_len
4318 * rb->pg_vec_pages
4319 * PAGE_SIZE;
4320 }
4321 }
4322
4323 if (expected_size == 0)
1da177e4 4324 goto out;
4325
4326 size = vma->vm_end - vma->vm_start;
4327 if (size != expected_size)
4328 goto out;
4329
1da177e4 4330 start = vma->vm_start;
4331 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4332 if (rb->pg_vec == NULL)
4333 continue;
4334
4335 for (i = 0; i < rb->pg_vec_len; i++) {
4336 struct page *page;
4337 void *kaddr = rb->pg_vec[i].buffer;
4338 int pg_num;
4339
4340 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4341 page = pgv_to_page(kaddr);
4342 err = vm_insert_page(vma, start, page);
4343 if (unlikely(err))
4344 goto out;
4345 start += PAGE_SIZE;
0e3125c7 4346 kaddr += PAGE_SIZE;
69e3c75f 4347 }
4ebf0ae2 4348 }
1da177e4 4349 }
69e3c75f 4350
4ebf0ae2 4351 atomic_inc(&po->mapped);
4352 vma->vm_ops = &packet_mmap_ops;
4353 err = 0;
4354
4355out:
905db440 4356 mutex_unlock(&po->pg_vec_lock);
4357 return err;
4358}
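packet_mmap() requires vm_pgoff to be 0 and the mapping length to equal the combined size of whatever rx and tx rings exist, mapped back to back in that order. A userspace sketch that maps a previously configured RX ring and waits for the first ready frame (the ring size is assumed to match the earlier PACKET_RX_RING request; error handling is minimal):

	/* Illustrative userspace sketch, not part of af_packet.c. */
	#include <sys/mman.h>
	#include <poll.h>
	#include <stddef.h>

	static void *map_ring_and_wait(int fd, size_t ring_bytes)
	{
		/* Offset must be 0 and the length must cover the whole ring. */
		void *ring = mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE,
				  MAP_SHARED, fd, 0);
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (ring == MAP_FAILED)
			return NULL;

		/* packet_poll() reports POLLIN | POLLRDNORM once a frame is ready. */
		poll(&pfd, 1, -1);
		return ring;
	}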
1da177e4 4359
90ddc4f0 4360static const struct proto_ops packet_ops_spkt = {
4361 .family = PF_PACKET,
4362 .owner = THIS_MODULE,
4363 .release = packet_release,
4364 .bind = packet_bind_spkt,
4365 .connect = sock_no_connect,
4366 .socketpair = sock_no_socketpair,
4367 .accept = sock_no_accept,
4368 .getname = packet_getname_spkt,
4369 .poll = datagram_poll,
4370 .ioctl = packet_ioctl,
4371 .listen = sock_no_listen,
4372 .shutdown = sock_no_shutdown,
4373 .setsockopt = sock_no_setsockopt,
4374 .getsockopt = sock_no_getsockopt,
4375 .sendmsg = packet_sendmsg_spkt,
4376 .recvmsg = packet_recvmsg,
4377 .mmap = sock_no_mmap,
4378 .sendpage = sock_no_sendpage,
4379};
1da177e4 4380
90ddc4f0 4381static const struct proto_ops packet_ops = {
4382 .family = PF_PACKET,
4383 .owner = THIS_MODULE,
4384 .release = packet_release,
4385 .bind = packet_bind,
4386 .connect = sock_no_connect,
4387 .socketpair = sock_no_socketpair,
4388 .accept = sock_no_accept,
1ce4f28b 4389 .getname = packet_getname,
4390 .poll = packet_poll,
4391 .ioctl = packet_ioctl,
4392 .listen = sock_no_listen,
4393 .shutdown = sock_no_shutdown,
4394 .setsockopt = packet_setsockopt,
4395 .getsockopt = packet_getsockopt,
4396 .sendmsg = packet_sendmsg,
4397 .recvmsg = packet_recvmsg,
4398 .mmap = packet_mmap,
4399 .sendpage = sock_no_sendpage,
4400};
4401
ec1b4cf7 4402static const struct net_proto_family packet_family_ops = {
4403 .family = PF_PACKET,
4404 .create = packet_create,
4405 .owner = THIS_MODULE,
4406};
4407
4408static struct notifier_block packet_netdev_notifier = {
40d4e3df 4409 .notifier_call = packet_notifier,
4410};
4411
4412#ifdef CONFIG_PROC_FS
4413
4414static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4415 __acquires(RCU)
1da177e4 4416{
e372c414 4417 struct net *net = seq_file_net(seq);
808f5114 4418
4419 rcu_read_lock();
4420 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4421}
4422
4423static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4424{
1bf40954 4425 struct net *net = seq_file_net(seq);
808f5114 4426 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4427}
4428
4429static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4430 __releases(RCU)
1da177e4 4431{
808f5114 4432 rcu_read_unlock();
4433}
4434
1ce4f28b 4435static int packet_seq_show(struct seq_file *seq, void *v)
4436{
4437 if (v == SEQ_START_TOKEN)
4438 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4439 else {
b7ceabd9 4440 struct sock *s = sk_entry(v);
4441 const struct packet_sock *po = pkt_sk(s);
4442
4443 seq_printf(seq,
71338aa7 4444 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4445 s,
4446 atomic_read(&s->sk_refcnt),
4447 s->sk_type,
4448 ntohs(po->num),
4449 po->ifindex,
4450 po->running,
4451 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4452 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4453 sock_i_ino(s));
4454 }
4455
4456 return 0;
4457}
4458
56b3d975 4459static const struct seq_operations packet_seq_ops = {
4460 .start = packet_seq_start,
4461 .next = packet_seq_next,
4462 .stop = packet_seq_stop,
4463 .show = packet_seq_show,
4464};
4465
4466static int packet_seq_open(struct inode *inode, struct file *file)
4467{
4468 return seq_open_net(inode, file, &packet_seq_ops,
4469 sizeof(struct seq_net_private));
4470}
4471
da7071d7 4472static const struct file_operations packet_seq_fops = {
4473 .owner = THIS_MODULE,
4474 .open = packet_seq_open,
4475 .read = seq_read,
4476 .llseek = seq_lseek,
e372c414 4477 .release = seq_release_net,
4478};
4479
4480#endif
4481
2c8c1e72 4482static int __net_init packet_net_init(struct net *net)
d12d01d6 4483{
0fa7fa98 4484 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4485 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4486
d4beaa66 4487 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4488 return -ENOMEM;
4489
4490 return 0;
4491}
4492
2c8c1e72 4493static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4494{
ece31ffd 4495 remove_proc_entry("packet", net->proc_net);
4496}
4497
4498static struct pernet_operations packet_net_ops = {
4499 .init = packet_net_init,
4500 .exit = packet_net_exit,
4501};
4502
4503
4504static void __exit packet_exit(void)
4505{
1da177e4 4506 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4507 unregister_pernet_subsys(&packet_net_ops);
4508 sock_unregister(PF_PACKET);
4509 proto_unregister(&packet_proto);
4510}
4511
4512static int __init packet_init(void)
4513{
4514 int rc = proto_register(&packet_proto, 0);
4515
4516 if (rc != 0)
4517 goto out;
4518
4519 sock_register(&packet_family_ops);
d12d01d6 4520 register_pernet_subsys(&packet_net_ops);
1da177e4 4521 register_netdevice_notifier(&packet_netdev_notifier);
4522out:
4523 return rc;
4524}
4525
4526module_init(packet_init);
4527module_exit(packet_exit);
4528MODULE_LICENSE("GPL");
4529MODULE_ALIAS_NETPROTO(PF_PACKET);