/* net/packet/af_packet.c */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox : verify_area() now used correctly
 *		Alan Cox : new skbuff lists, look ma no backlogs!
 *		Alan Cox : tidied skbuff lists.
 *		Alan Cox : Now uses generic datagram routines I
 *			   added. Also fixed the peek/read crash
 *			   from all old Linux datagram code.
 *		Alan Cox : Uses the improved datagram code.
 *		Alan Cox : Added NULL's for socket options.
 *		Alan Cox : Re-commented the code.
 *		Alan Cox : Use new kernel side addressing
 *		Rob Janssen : Correct MTU usage.
 *		Dave Platt : Counter leaks caused by incorrect
 *			     interrupt locking and some slightly
 *			     dubious gcc output. Can you read
 *			     compiler: it said _VOLATILE_
 *		Richard Kooijman : Timestamp fixes.
 *		Alan Cox : New buffers. Use sk->mac.raw.
 *		Alan Cox : sendmsg/recvmsg support.
 *		Alan Cox : Protocol setting support
 *		Alexey Kuznetsov : Untied from IPv4 stack.
 *		Cyrus Durgin : Fixed kerneld for kmod.
 *		Michal Ostrowski : Module initialization cleanup.
 *		Ulises Alonso : Frame number limit removal and
 *				packet_set_ring memory leak.
 *		Eric Biederman : Allow for > 8 byte hardware addresses.
 *				 The convention is that longer addresses
 *				 will simply extend the hardware address
 *				 byte arrays at the end of sockaddr_ll
 *				 and packet_mreq.
 *		Johann Baudy : Added TX RING.
 *		Chetan Loke : Implemented TPACKET_V3 block abstraction
 *			      layer.
 *			      Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnel); other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

In summary:
   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

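/* Illustrative user-space sketch (not part of this file): the SOCK_RAW
 * behaviour described above means recv() returns the frame starting at the
 * link-layer header. Error handling is omitted and the interface name
 * "eth0" is only an example.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <net/if.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 *	char buf[2048];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	// buf[0..n) begins with the Ethernet header, matching the
 *	// "Incoming, dev->hard_header!=NULL" case documented above.
 */
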
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
        struct tpacket_hdr  *h1;
        struct tpacket2_hdr *h2;
        struct tpacket3_hdr *h3;
        void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring);

#define V3_ALIGNMENT    (8)

#define BLK_HDR_LEN     (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)         ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)       ((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)           ((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)            ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)           ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)         ((x)->offset_to_priv)
#define BLOCK_PRIV(x)           ((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
                        struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
                        struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
                struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
                struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
                struct tpacket_kbdq_core *,
                void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
        union {
                struct sockaddr_pkt pkt;
                union {
                        /* Trick: alias skb original length with
                         * ll.sll_family and ll.protocol in order
                         * to save room.
                         */
                        unsigned int origlen;
                        struct sockaddr_ll ll;
                };
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)    ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid) \
        ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)       \
        ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
        (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
        ((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        netdev_features_t features;
        struct netdev_queue *txq;
        int ret = NETDEV_TX_BUSY;

        if (unlikely(!netif_running(dev) ||
                     !netif_carrier_ok(dev)))
                goto drop;

        features = netif_skb_features(skb);
        if (skb_needs_linearize(skb, features) &&
            __skb_linearize(skb))
                goto drop;

        txq = skb_get_tx_queue(dev, skb);

        local_bh_disable();

        HARD_TX_LOCK(dev, txq, smp_processor_id());
        if (!netif_xmit_frozen_or_drv_stopped(txq))
                ret = netdev_start_xmit(skb, dev, txq, false);
        HARD_TX_UNLOCK(dev, txq);

        local_bh_enable();

        if (!dev_xmit_complete(ret))
                kfree_skb(skb);

        return ret;
drop:
        atomic_long_inc(&dev->tx_dropped);
        kfree_skb(skb);
        return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = rcu_dereference(po->cached_dev);
        if (likely(dev))
                dev_hold(dev);
        rcu_read_unlock();

        return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
                                     struct net_device *dev)
{
        rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
        RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
        return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
        return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        u16 queue_index;

        if (ops->ndo_select_queue) {
                queue_index = ops->ndo_select_queue(dev, skb, NULL,
                                                    __packet_pick_tx_queue);
                queue_index = netdev_cap_txqueue(dev, queue_index);
        } else {
                queue_index = __packet_pick_tx_queue(dev, skb);
        }

        skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);

        if (!po->running) {
                if (po->fanout)
                        __fanout_link(sk, po);
                else
                        dev_add_pack(&po->prot_hook);

                sock_hold(sk);
                po->running = 1;
        }
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
        struct packet_sock *po = pkt_sk(sk);

        po->running = 0;

        if (po->fanout)
                __fanout_unlink(sk, po);
        else
                __dev_remove_pack(&po->prot_hook);

        __sock_put(sk);

        if (sync) {
                spin_unlock(&po->bind_lock);
                synchronize_net();
                spin_lock(&po->bind_lock);
        }
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
        struct packet_sock *po = pkt_sk(sk);

        if (po->running)
                __unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
        if (is_vmalloc_addr(addr))
                return vmalloc_to_page(addr);
        return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union tpacket_uhdr h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        case TPACKET_V3:
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union tpacket_uhdr h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        case TPACKET_V3:
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
                return 0;
        }
}

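/* Illustrative user-space sketch (not part of this file): the other half of
 * the tp_status handshake implemented by __packet_set_status() and
 * __packet_get_status(), here for a mmap()ed TPACKET_V2 RX ring. "ring", "i"
 * and "frame_size" are assumed to come from an earlier PACKET_RX_RING setup.
 *
 *	struct tpacket2_hdr *hdr =
 *		(struct tpacket2_hdr *)(ring + i * frame_size);
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		// the kernel handed frame i to user space; consume it ...
 *		hdr->tp_status = TP_STATUS_KERNEL;	// give it back to the kernel
 *		__sync_synchronize();	// full barrier on the user side, pairing
 *					// with the smp_wmb()/smp_rmb() above
 *	}
 */
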
b9c32fb2
DB
439static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
440 unsigned int flags)
7a51384c
DB
441{
442 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
443
68a360e8
WB
444 if (shhwtstamps &&
445 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
446 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
447 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
448
449 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 450 return TP_STATUS_TS_SOFTWARE;
7a51384c 451
b9c32fb2 452 return 0;
7a51384c
DB
453}
454
b9c32fb2
DB
455static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
456 struct sk_buff *skb)
2e31396f
WB
457{
458 union tpacket_uhdr h;
459 struct timespec ts;
b9c32fb2 460 __u32 ts_status;
2e31396f 461
b9c32fb2
DB
462 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
463 return 0;
2e31396f
WB
464
465 h.raw = frame;
466 switch (po->tp_version) {
467 case TPACKET_V1:
468 h.h1->tp_sec = ts.tv_sec;
469 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
470 break;
471 case TPACKET_V2:
472 h.h2->tp_sec = ts.tv_sec;
473 h.h2->tp_nsec = ts.tv_nsec;
474 break;
475 case TPACKET_V3:
476 default:
477 WARN(1, "TPACKET version not supported.\n");
478 BUG();
479 }
480
481 /* one flush is safe, as both fields always lie on the same cacheline */
482 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
483 smp_wmb();
b9c32fb2
DB
484
485 return ts_status;
2e31396f
WB
486}
487
69e3c75f
JB
488static void *packet_lookup_frame(struct packet_sock *po,
489 struct packet_ring_buffer *rb,
490 unsigned int position,
491 int status)
492{
493 unsigned int pg_vec_pos, frame_offset;
184f489e 494 union tpacket_uhdr h;
69e3c75f
JB
495
496 pg_vec_pos = position / rb->frames_per_block;
497 frame_offset = position % rb->frames_per_block;
498
0e3125c7
NH
499 h.raw = rb->pg_vec[pg_vec_pos].buffer +
500 (frame_offset * rb->frame_size);
69e3c75f
JB
501
502 if (status != __packet_get_status(po, h.raw))
503 return NULL;
504
505 return h.raw;
506}
507
eea49cc9 508static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
509 struct packet_ring_buffer *rb,
510 int status)
511{
512 return packet_lookup_frame(po, rb, rb->head, status);
513}
514
bc59ba39 515static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 516{
517 del_timer_sync(&pkc->retire_blk_timer);
518}
519
520static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
521 int tx_ring,
522 struct sk_buff_head *rb_queue)
523{
bc59ba39 524 struct tpacket_kbdq_core *pkc;
f6fb8f10 525
22781a5b
DJ
526 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
527 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 528
ec6f809f 529 spin_lock_bh(&rb_queue->lock);
f6fb8f10 530 pkc->delete_blk_timer = 1;
ec6f809f 531 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 532
533 prb_del_retire_blk_timer(pkc);
534}
535
536static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 537 struct tpacket_kbdq_core *pkc,
f6fb8f10 538 void (*func) (unsigned long))
539{
540 init_timer(&pkc->retire_blk_timer);
541 pkc->retire_blk_timer.data = (long)po;
542 pkc->retire_blk_timer.function = func;
543 pkc->retire_blk_timer.expires = jiffies;
544}
545
e8e85cc5 546static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 547{
bc59ba39 548 struct tpacket_kbdq_core *pkc;
f6fb8f10 549
e8e85cc5 550 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 551 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
552}
553
554static int prb_calc_retire_blk_tmo(struct packet_sock *po,
555 int blk_size_in_bytes)
556{
557 struct net_device *dev;
558 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
559 struct ethtool_cmd ecmd;
560 int err;
e440cf2c 561 u32 speed;
f6fb8f10 562
4bc71cb9
JP
563 rtnl_lock();
564 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
565 if (unlikely(!dev)) {
566 rtnl_unlock();
f6fb8f10 567 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
568 }
569 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 570 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
571 rtnl_unlock();
572 if (!err) {
4bc71cb9
JP
573 /*
574 * If the link speed is so slow you don't really
575 * need to worry about perf anyways
576 */
e440cf2c 577 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 578 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 579 } else {
580 msec = 1;
581 div = speed / 1000;
f6fb8f10 582 }
583 }
584
585 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
586
587 if (div)
588 mbits /= div;
589
590 tmo = mbits * msec;
591
592 if (div)
593 return tmo+1;
594 return tmo;
595}
596
bc59ba39 597static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 598 union tpacket_req_u *req_u)
599{
600 p1->feature_req_word = req_u->req3.tp_feature_req_word;
601}
602
603static void init_prb_bdqc(struct packet_sock *po,
604 struct packet_ring_buffer *rb,
605 struct pgv *pg_vec,
e8e85cc5 606 union tpacket_req_u *req_u)
f6fb8f10 607{
22781a5b 608 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 609 struct tpacket_block_desc *pbd;
f6fb8f10 610
611 memset(p1, 0x0, sizeof(*p1));
612
613 p1->knxt_seq_num = 1;
614 p1->pkbdq = pg_vec;
bc59ba39 615 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 616 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 617 p1->kblk_size = req_u->req3.tp_block_size;
618 p1->knum_blocks = req_u->req3.tp_block_nr;
619 p1->hdrlen = po->tp_hdrlen;
620 p1->version = po->tp_version;
621 p1->last_kactive_blk_num = 0;
ee80fbf3 622 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 623 if (req_u->req3.tp_retire_blk_tov)
624 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
625 else
626 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
627 req_u->req3.tp_block_size);
628 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
629 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
630
dc808110 631 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 632 prb_init_ft_ops(p1, req_u);
e8e85cc5 633 prb_setup_retire_blk_timer(po);
f6fb8f10 634 prb_open_block(p1, pbd);
635}
636
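/* Illustrative user-space sketch (not part of this file): the setsockopt()
 * sequence that leads to init_prb_bdqc() above for a TPACKET_V3 RX ring.
 * The block/frame sizes are example values only.
 *
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 20,	// 1 MiB per block
 *		.tp_block_nr       = 64,
 *		.tp_frame_size     = 1 << 11,
 *		.tp_frame_nr       = ((1 << 20) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,	// ms; 0 lets prb_calc_retire_blk_tmo() pick one
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */
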
637/* Do NOT update the last_blk_num first.
638 * Assumes sk_buff_head lock is held.
639 */
bc59ba39 640static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 641{
642 mod_timer(&pkc->retire_blk_timer,
643 jiffies + pkc->tov_in_jiffies);
644 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
645}
646
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
670static void prb_retire_rx_blk_timer_expired(unsigned long data)
671{
672 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 673 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 674 unsigned int frozen;
bc59ba39 675 struct tpacket_block_desc *pbd;
f6fb8f10 676
677 spin_lock(&po->sk.sk_receive_queue.lock);
678
679 frozen = prb_queue_frozen(pkc);
680 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
681
682 if (unlikely(pkc->delete_blk_timer))
683 goto out;
684
685 /* We only need to plug the race when the block is partially filled.
686 * tpacket_rcv:
687 * lock(); increment BLOCK_NUM_PKTS; unlock()
688 * copy_bits() is in progress ...
689 * timer fires on other cpu:
690 * we can't retire the current block because copy_bits
691 * is in progress.
692 *
693 */
694 if (BLOCK_NUM_PKTS(pbd)) {
695 while (atomic_read(&pkc->blk_fill_in_prog)) {
696 /* Waiting for skb_copy_bits to finish... */
697 cpu_relax();
698 }
699 }
700
701 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
702 if (!frozen) {
41a50d62
AD
703 if (!BLOCK_NUM_PKTS(pbd)) {
704 /* An empty block. Just refresh the timer. */
705 goto refresh_timer;
706 }
f6fb8f10 707 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
708 if (!prb_dispatch_next_block(pkc, po))
709 goto refresh_timer;
710 else
711 goto out;
712 } else {
713 /* Case 1. Queue was frozen because user-space was
714 * lagging behind.
715 */
716 if (prb_curr_blk_in_use(pkc, pbd)) {
717 /*
718 * Ok, user-space is still behind.
719 * So just refresh the timer.
720 */
721 goto refresh_timer;
722 } else {
723 /* Case 2. queue was frozen,user-space caught up,
724 * now the link went idle && the timer fired.
725 * We don't have a block to close.So we open this
726 * block and restart the timer.
727 * opening a block thaws the queue,restarts timer
728 * Thawing/timer-refresh is a side effect.
729 */
730 prb_open_block(pkc, pbd);
731 goto out;
732 }
733 }
734 }
735
736refresh_timer:
737 _prb_refresh_rx_retire_blk_timer(pkc);
738
739out:
740 spin_unlock(&po->sk.sk_receive_queue.lock);
741}
742
eea49cc9 743static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 744 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 745{
746 /* Flush everything minus the block header */
747
748#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
749 u8 *start, *end;
750
751 start = (u8 *)pbd1;
752
753 /* Skip the block header(we know header WILL fit in 4K) */
754 start += PAGE_SIZE;
755
756 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
757 for (; start < end; start += PAGE_SIZE)
758 flush_dcache_page(pgv_to_page(start));
759
760 smp_wmb();
761#endif
762
763 /* Now update the block status. */
764
765 BLOCK_STATUS(pbd1) = status;
766
767 /* Flush the block header */
768
769#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
770 start = (u8 *)pbd1;
771 flush_dcache_page(pgv_to_page(start));
772
773 smp_wmb();
774#endif
775}
776
777/*
778 * Side effect:
779 *
780 * 1) flush the block
781 * 2) Increment active_blk_num
782 *
783 * Note:We DONT refresh the timer on purpose.
784 * Because almost always the next block will be opened.
785 */
bc59ba39 786static void prb_close_block(struct tpacket_kbdq_core *pkc1,
787 struct tpacket_block_desc *pbd1,
f6fb8f10 788 struct packet_sock *po, unsigned int stat)
789{
790 __u32 status = TP_STATUS_USER | stat;
791
792 struct tpacket3_hdr *last_pkt;
bc59ba39 793 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 794 struct sock *sk = &po->sk;
f6fb8f10 795
ee80fbf3 796 if (po->stats.stats3.tp_drops)
f6fb8f10 797 status |= TP_STATUS_LOSING;
798
799 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
800 last_pkt->tp_next_offset = 0;
801
802 /* Get the ts of the last pkt */
803 if (BLOCK_NUM_PKTS(pbd1)) {
804 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
805 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
806 } else {
41a50d62
AD
807 /* Ok, we tmo'd - so get the current time.
808 *
809 * It shouldn't really happen as we don't close empty
810 * blocks. See prb_retire_rx_blk_timer_expired().
811 */
f6fb8f10 812 struct timespec ts;
813 getnstimeofday(&ts);
814 h1->ts_last_pkt.ts_sec = ts.tv_sec;
815 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
816 }
817
818 smp_wmb();
819
820 /* Flush the block */
821 prb_flush_block(pkc1, pbd1, status);
822
da413eec
DC
823 sk->sk_data_ready(sk);
824
f6fb8f10 825 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
826}
827
eea49cc9 828static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 829{
830 pkc->reset_pending_on_curr_blk = 0;
831}
832
833/*
834 * Side effect of opening a block:
835 *
836 * 1) prb_queue is thawed.
837 * 2) retire_blk_timer is refreshed.
838 *
839 */
bc59ba39 840static void prb_open_block(struct tpacket_kbdq_core *pkc1,
841 struct tpacket_block_desc *pbd1)
f6fb8f10 842{
843 struct timespec ts;
bc59ba39 844 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 845
846 smp_rmb();
847
8da3056c
DB
848 /* We could have just memset this but we will lose the
849 * flexibility of making the priv area sticky
850 */
f6fb8f10 851
8da3056c
DB
852 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
853 BLOCK_NUM_PKTS(pbd1) = 0;
854 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 855
8da3056c
DB
856 getnstimeofday(&ts);
857
858 h1->ts_first_pkt.ts_sec = ts.tv_sec;
859 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 860
8da3056c
DB
861 pkc1->pkblk_start = (char *)pbd1;
862 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
863
864 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
865 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
866
867 pbd1->version = pkc1->version;
868 pkc1->prev = pkc1->nxt_offset;
869 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
870
871 prb_thaw_queue(pkc1);
872 _prb_refresh_rx_retire_blk_timer(pkc1);
873
874 smp_wmb();
f6fb8f10 875}
876
877/*
878 * Queue freeze logic:
879 * 1) Assume tp_block_nr = 8 blocks.
880 * 2) At time 't0', user opens Rx ring.
881 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
882 * 4) user-space is either sleeping or processing block '0'.
883 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
884 * it will close block-7,loop around and try to fill block '0'.
885 * call-flow:
886 * __packet_lookup_frame_in_block
887 * prb_retire_current_block()
888 * prb_dispatch_next_block()
889 * |->(BLOCK_STATUS == USER) evaluates to true
890 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
891 * 6) Now there are two cases:
892 * 6.1) Link goes idle right after the queue is frozen.
893 * But remember, the last open_block() refreshed the timer.
894 * When this timer expires,it will refresh itself so that we can
895 * re-open block-0 in near future.
896 * 6.2) Link is busy and keeps on receiving packets. This is a simple
897 * case and __packet_lookup_frame_in_block will check if block-0
898 * is free and can now be re-used.
899 */
eea49cc9 900static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 901 struct packet_sock *po)
902{
903 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 904 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 905}
906
907#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
908
909/*
910 * If the next block is free then we will dispatch it
911 * and return a good offset.
912 * Else, we will freeze the queue.
913 * So, caller must check the return value.
914 */
bc59ba39 915static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 916 struct packet_sock *po)
917{
bc59ba39 918 struct tpacket_block_desc *pbd;
f6fb8f10 919
920 smp_rmb();
921
922 /* 1. Get current block num */
923 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
924
925 /* 2. If this block is currently in_use then freeze the queue */
926 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
927 prb_freeze_queue(pkc, po);
928 return NULL;
929 }
930
931 /*
932 * 3.
933 * open this block and return the offset where the first packet
934 * needs to get stored.
935 */
936 prb_open_block(pkc, pbd);
937 return (void *)pkc->nxt_offset;
938}
939
bc59ba39 940static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 941 struct packet_sock *po, unsigned int status)
942{
bc59ba39 943 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 944
945 /* retire/close the current block */
946 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
947 /*
948 * Plug the case where copy_bits() is in progress on
949 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
950 * have space to copy the pkt in the current block and
951 * called prb_retire_current_block()
952 *
953 * We don't need to worry about the TMO case because
954 * the timer-handler already handled this case.
955 */
956 if (!(status & TP_STATUS_BLK_TMO)) {
957 while (atomic_read(&pkc->blk_fill_in_prog)) {
958 /* Waiting for skb_copy_bits to finish... */
959 cpu_relax();
960 }
961 }
962 prb_close_block(pkc, pbd, po, status);
963 return;
964 }
f6fb8f10 965}
966
eea49cc9 967static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 968 struct tpacket_block_desc *pbd)
f6fb8f10 969{
970 return TP_STATUS_USER & BLOCK_STATUS(pbd);
971}
972
eea49cc9 973static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 974{
975 return pkc->reset_pending_on_curr_blk;
976}
977
eea49cc9 978static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 979{
bc59ba39 980 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 981 atomic_dec(&pkc->blk_fill_in_prog);
982}
983
eea49cc9 984static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 985 struct tpacket3_hdr *ppd)
986{
3958afa1 987 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 988}
989
eea49cc9 990static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 991 struct tpacket3_hdr *ppd)
992{
993 ppd->hv1.tp_rxhash = 0;
994}
995
eea49cc9 996static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 997 struct tpacket3_hdr *ppd)
998{
df8a39de
JP
999 if (skb_vlan_tag_present(pkc->skb)) {
1000 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
1001 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1002 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1003 } else {
9e67030a 1004 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1005 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1006 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1007 }
1008}
1009
bc59ba39 1010static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1011 struct tpacket3_hdr *ppd)
1012{
a0cdfcf3 1013 ppd->hv1.tp_padding = 0;
f6fb8f10 1014 prb_fill_vlan_info(pkc, ppd);
1015
1016 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1017 prb_fill_rxhash(pkc, ppd);
1018 else
1019 prb_clear_rxhash(pkc, ppd);
1020}
1021
eea49cc9 1022static void prb_fill_curr_block(char *curr,
bc59ba39 1023 struct tpacket_kbdq_core *pkc,
1024 struct tpacket_block_desc *pbd,
f6fb8f10 1025 unsigned int len)
1026{
1027 struct tpacket3_hdr *ppd;
1028
1029 ppd = (struct tpacket3_hdr *)curr;
1030 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1031 pkc->prev = curr;
1032 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1033 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1034 BLOCK_NUM_PKTS(pbd) += 1;
1035 atomic_inc(&pkc->blk_fill_in_prog);
1036 prb_run_all_ft_ops(pkc, ppd);
1037}
1038
1039/* Assumes caller has the sk->rx_queue.lock */
1040static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1041 struct sk_buff *skb,
1042 int status,
1043 unsigned int len
1044 )
1045{
bc59ba39 1046 struct tpacket_kbdq_core *pkc;
1047 struct tpacket_block_desc *pbd;
f6fb8f10 1048 char *curr, *end;
1049
e3192690 1050 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1051 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1052
1053 /* Queue is frozen when user space is lagging behind */
1054 if (prb_queue_frozen(pkc)) {
1055 /*
1056 * Check if that last block which caused the queue to freeze,
1057 * is still in_use by user-space.
1058 */
1059 if (prb_curr_blk_in_use(pkc, pbd)) {
1060 /* Can't record this packet */
1061 return NULL;
1062 } else {
1063 /*
1064 * Ok, the block was released by user-space.
1065 * Now let's open that block.
1066 * opening a block also thaws the queue.
1067 * Thawing is a side effect.
1068 */
1069 prb_open_block(pkc, pbd);
1070 }
1071 }
1072
1073 smp_mb();
1074 curr = pkc->nxt_offset;
1075 pkc->skb = skb;
e3192690 1076 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1077
1078 /* first try the current block */
1079 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1080 prb_fill_curr_block(curr, pkc, pbd, len);
1081 return (void *)curr;
1082 }
1083
1084 /* Ok, close the current block */
1085 prb_retire_current_block(pkc, po, 0);
1086
1087 /* Now, try to dispatch the next block */
1088 curr = (char *)prb_dispatch_next_block(pkc, po);
1089 if (curr) {
1090 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1091 prb_fill_curr_block(curr, pkc, pbd, len);
1092 return (void *)curr;
1093 }
1094
1095 /*
1096 * No free blocks are available.user_space hasn't caught up yet.
1097 * Queue was just frozen and now this packet will get dropped.
1098 */
1099 return NULL;
1100}
1101
eea49cc9 1102static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1103 struct sk_buff *skb,
1104 int status, unsigned int len)
1105{
1106 char *curr = NULL;
1107 switch (po->tp_version) {
1108 case TPACKET_V1:
1109 case TPACKET_V2:
1110 curr = packet_lookup_frame(po, &po->rx_ring,
1111 po->rx_ring.head, status);
1112 return curr;
1113 case TPACKET_V3:
1114 return __packet_lookup_frame_in_block(po, skb, status, len);
1115 default:
1116 WARN(1, "TPACKET version not supported\n");
1117 BUG();
99aa3473 1118 return NULL;
f6fb8f10 1119 }
1120}
1121
eea49cc9 1122static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1123 struct packet_ring_buffer *rb,
77f65ebd 1124 unsigned int idx,
f6fb8f10 1125 int status)
1126{
bc59ba39 1127 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1128 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1129
1130 if (status != BLOCK_STATUS(pbd))
1131 return NULL;
1132 return pbd;
1133}
1134
eea49cc9 1135static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1136{
1137 unsigned int prev;
1138 if (rb->prb_bdqc.kactive_blk_num)
1139 prev = rb->prb_bdqc.kactive_blk_num-1;
1140 else
1141 prev = rb->prb_bdqc.knum_blocks-1;
1142 return prev;
1143}
1144
1145/* Assumes caller has held the rx_queue.lock */
eea49cc9 1146static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1147 struct packet_ring_buffer *rb,
1148 int status)
1149{
1150 unsigned int previous = prb_previous_blk_num(rb);
1151 return prb_lookup_block(po, rb, previous, status);
1152}
1153
eea49cc9 1154static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1155 struct packet_ring_buffer *rb,
1156 int status)
1157{
1158 if (po->tp_version <= TPACKET_V2)
1159 return packet_previous_frame(po, rb, status);
1160
1161 return __prb_previous_block(po, rb, status);
1162}
1163
eea49cc9 1164static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1165 struct packet_ring_buffer *rb)
1166{
1167 switch (po->tp_version) {
1168 case TPACKET_V1:
1169 case TPACKET_V2:
1170 return packet_increment_head(rb);
1171 case TPACKET_V3:
1172 default:
1173 WARN(1, "TPACKET version not supported.\n");
1174 BUG();
1175 return;
1176 }
1177}
1178
eea49cc9 1179static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1180 struct packet_ring_buffer *rb,
1181 int status)
1182{
1183 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1184 return packet_lookup_frame(po, rb, previous, status);
1185}
1186
eea49cc9 1187static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1188{
1189 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1190}
1191
b0138408
DB
1192static void packet_inc_pending(struct packet_ring_buffer *rb)
1193{
1194 this_cpu_inc(*rb->pending_refcnt);
1195}
1196
1197static void packet_dec_pending(struct packet_ring_buffer *rb)
1198{
1199 this_cpu_dec(*rb->pending_refcnt);
1200}
1201
1202static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1203{
1204 unsigned int refcnt = 0;
1205 int cpu;
1206
1207 /* We don't use pending refcount in rx_ring. */
1208 if (rb->pending_refcnt == NULL)
1209 return 0;
1210
1211 for_each_possible_cpu(cpu)
1212 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1213
1214 return refcnt;
1215}
1216
1217static int packet_alloc_pending(struct packet_sock *po)
1218{
1219 po->rx_ring.pending_refcnt = NULL;
1220
1221 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1222 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1223 return -ENOBUFS;
1224
1225 return 0;
1226}
1227
1228static void packet_free_pending(struct packet_sock *po)
1229{
1230 free_percpu(po->tx_ring.pending_refcnt);
1231}
1232
9954729b
WB
1233#define ROOM_POW_OFF 2
1234#define ROOM_NONE 0x0
1235#define ROOM_LOW 0x1
1236#define ROOM_NORMAL 0x2
1237
1238static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1239{
9954729b
WB
1240 int idx, len;
1241
1242 len = po->rx_ring.frame_max + 1;
1243 idx = po->rx_ring.head;
1244 if (pow_off)
1245 idx += len >> pow_off;
1246 if (idx >= len)
1247 idx -= len;
1248 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1249}
1250
1251static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1252{
1253 int idx, len;
1254
1255 len = po->rx_ring.prb_bdqc.knum_blocks;
1256 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1257 if (pow_off)
1258 idx += len >> pow_off;
1259 if (idx >= len)
1260 idx -= len;
1261 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1262}
77f65ebd 1263
2ccdbaa6 1264static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1265{
1266 struct sock *sk = &po->sk;
1267 int ret = ROOM_NONE;
1268
1269 if (po->prot_hook.func != tpacket_rcv) {
1270 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1271 - (skb ? skb->truesize : 0);
9954729b
WB
1272 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1273 return ROOM_NORMAL;
1274 else if (avail > 0)
1275 return ROOM_LOW;
1276 else
1277 return ROOM_NONE;
1278 }
77f65ebd 1279
9954729b
WB
1280 if (po->tp_version == TPACKET_V3) {
1281 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1282 ret = ROOM_NORMAL;
1283 else if (__tpacket_v3_has_room(po, 0))
1284 ret = ROOM_LOW;
1285 } else {
1286 if (__tpacket_has_room(po, ROOM_POW_OFF))
1287 ret = ROOM_NORMAL;
1288 else if (__tpacket_has_room(po, 0))
1289 ret = ROOM_LOW;
1290 }
2ccdbaa6
WB
1291
1292 return ret;
1293}
1294
1295static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 int ret;
1298 bool has_room;
1299
54d7c01d
WB
1300 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1301 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1302 has_room = ret == ROOM_NORMAL;
1303 if (po->pressure == has_room)
54d7c01d
WB
1304 po->pressure = !has_room;
1305 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1306
9954729b 1307 return ret;
77f65ebd
WB
1308}
1309
1da177e4
LT
1310static void packet_sock_destruct(struct sock *sk)
1311{
ed85b565
RC
1312 skb_queue_purge(&sk->sk_error_queue);
1313
547b792c
IJ
1314 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1315 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1316
1317 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1318 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1319 return;
1320 }
1321
17ab56a2 1322 sk_refcnt_debug_dec(sk);
1da177e4
LT
1323}
1324
3b3a5b0a
WB
1325static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1326{
1327 u32 rxhash;
1328 int i, count = 0;
1329
1330 rxhash = skb_get_hash(skb);
1331 for (i = 0; i < ROLLOVER_HLEN; i++)
1332 if (po->rollover->history[i] == rxhash)
1333 count++;
1334
1335 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1336 return count > (ROLLOVER_HLEN >> 1);
1337}
1338
77f65ebd
WB
1339static unsigned int fanout_demux_hash(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int num)
dc99f600 1342{
61b905da 1343 return reciprocal_scale(skb_get_hash(skb), num);
dc99f600
DM
1344}
1345
77f65ebd
WB
1346static unsigned int fanout_demux_lb(struct packet_fanout *f,
1347 struct sk_buff *skb,
1348 unsigned int num)
dc99f600 1349{
468479e6 1350 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1351
468479e6 1352 return val % num;
77f65ebd
WB
1353}
1354
1355static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int num)
1358{
1359 return smp_processor_id() % num;
dc99f600
DM
1360}
1361
5df0ddfb
DB
1362static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365{
f337db64 1366 return prandom_u32_max(num);
5df0ddfb
DB
1367}
1368
77f65ebd
WB
1369static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1370 struct sk_buff *skb,
ad377cab 1371 unsigned int idx, bool try_self,
77f65ebd 1372 unsigned int num)
95ec3eb4 1373{
4633c9e0 1374 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1375 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1376
0648ab70 1377 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1378
1379 if (try_self) {
1380 room = packet_rcv_has_room(po, skb);
1381 if (room == ROOM_NORMAL ||
1382 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1383 return idx;
4633c9e0 1384 po_skip = po;
3b3a5b0a 1385 }
ad377cab 1386
0648ab70 1387 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1388 do {
2ccdbaa6 1389 po_next = pkt_sk(f->arr[i]);
4633c9e0 1390 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1391 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1392 if (i != j)
0648ab70 1393 po->rollover->sock = i;
a9b63918
WB
1394 atomic_long_inc(&po->rollover->num);
1395 if (room == ROOM_LOW)
1396 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1397 return i;
1398 }
ad377cab 1399
77f65ebd
WB
1400 if (++i == num)
1401 i = 0;
1402 } while (i != j);
1403
a9b63918 1404 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1405 return idx;
1406}
1407
2d36097d
NH
1408static unsigned int fanout_demux_qm(struct packet_fanout *f,
1409 struct sk_buff *skb,
1410 unsigned int num)
1411{
1412 return skb_get_queue_mapping(skb) % num;
1413}
1414
77f65ebd
WB
1415static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1416{
1417 return f->flags & (flag >> 8);
95ec3eb4
DM
1418}
1419
95ec3eb4
DM
1420static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1421 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1422{
1423 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1424 unsigned int num = READ_ONCE(f->num_members);
dc99f600 1425 struct packet_sock *po;
77f65ebd 1426 unsigned int idx;
dc99f600
DM
1427
1428 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1429 !num) {
1430 kfree_skb(skb);
1431 return 0;
1432 }
1433
3f34b24a
AD
1434 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1435 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1436 if (!skb)
1437 return 0;
1438 }
95ec3eb4
DM
1439 switch (f->type) {
1440 case PACKET_FANOUT_HASH:
1441 default:
77f65ebd 1442 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1443 break;
1444 case PACKET_FANOUT_LB:
77f65ebd 1445 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1446 break;
1447 case PACKET_FANOUT_CPU:
77f65ebd
WB
1448 idx = fanout_demux_cpu(f, skb, num);
1449 break;
5df0ddfb
DB
1450 case PACKET_FANOUT_RND:
1451 idx = fanout_demux_rnd(f, skb, num);
1452 break;
2d36097d
NH
1453 case PACKET_FANOUT_QM:
1454 idx = fanout_demux_qm(f, skb, num);
1455 break;
77f65ebd 1456 case PACKET_FANOUT_ROLLOVER:
ad377cab 1457 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1458 break;
dc99f600
DM
1459 }
1460
ad377cab
WB
1461 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1462 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1463
ad377cab 1464 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1465 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1466}
1467
fff3321d
PE
1468DEFINE_MUTEX(fanout_mutex);
1469EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1470static LIST_HEAD(fanout_list);
1471
1472static void __fanout_link(struct sock *sk, struct packet_sock *po)
1473{
1474 struct packet_fanout *f = po->fanout;
1475
1476 spin_lock(&f->lock);
1477 f->arr[f->num_members] = sk;
1478 smp_wmb();
1479 f->num_members++;
1480 spin_unlock(&f->lock);
1481}
1482
1483static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1484{
1485 struct packet_fanout *f = po->fanout;
1486 int i;
1487
1488 spin_lock(&f->lock);
1489 for (i = 0; i < f->num_members; i++) {
1490 if (f->arr[i] == sk)
1491 break;
1492 }
1493 BUG_ON(i >= f->num_members);
1494 f->arr[i] = f->arr[f->num_members - 1];
1495 f->num_members--;
1496 spin_unlock(&f->lock);
1497}
1498
d4dd8aee 1499static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1500{
d4dd8aee 1501 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
c0de08d0
EL
1502 return true;
1503
1504 return false;
1505}
1506
7736d33f 1507static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1508{
1509 struct packet_sock *po = pkt_sk(sk);
1510 struct packet_fanout *f, *match;
7736d33f 1511 u8 type = type_flags & 0xff;
77f65ebd 1512 u8 flags = type_flags >> 8;
dc99f600
DM
1513 int err;
1514
1515 switch (type) {
77f65ebd
WB
1516 case PACKET_FANOUT_ROLLOVER:
1517 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1518 return -EINVAL;
dc99f600
DM
1519 case PACKET_FANOUT_HASH:
1520 case PACKET_FANOUT_LB:
95ec3eb4 1521 case PACKET_FANOUT_CPU:
5df0ddfb 1522 case PACKET_FANOUT_RND:
2d36097d 1523 case PACKET_FANOUT_QM:
dc99f600
DM
1524 break;
1525 default:
1526 return -EINVAL;
1527 }
1528
1529 if (!po->running)
1530 return -EINVAL;
1531
1532 if (po->fanout)
1533 return -EALREADY;
1534
4633c9e0
WB
1535 if (type == PACKET_FANOUT_ROLLOVER ||
1536 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
0648ab70
WB
1537 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1538 if (!po->rollover)
1539 return -ENOMEM;
a9b63918
WB
1540 atomic_long_set(&po->rollover->num, 0);
1541 atomic_long_set(&po->rollover->num_huge, 0);
1542 atomic_long_set(&po->rollover->num_failed, 0);
0648ab70
WB
1543 }
1544
dc99f600
DM
1545 mutex_lock(&fanout_mutex);
1546 match = NULL;
1547 list_for_each_entry(f, &fanout_list, list) {
1548 if (f->id == id &&
1549 read_pnet(&f->net) == sock_net(sk)) {
1550 match = f;
1551 break;
1552 }
1553 }
afe62c68 1554 err = -EINVAL;
77f65ebd 1555 if (match && match->flags != flags)
afe62c68 1556 goto out;
dc99f600 1557 if (!match) {
afe62c68 1558 err = -ENOMEM;
dc99f600 1559 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1560 if (!match)
1561 goto out;
1562 write_pnet(&match->net, sock_net(sk));
1563 match->id = id;
1564 match->type = type;
77f65ebd 1565 match->flags = flags;
afe62c68
ED
1566 atomic_set(&match->rr_cur, 0);
1567 INIT_LIST_HEAD(&match->list);
1568 spin_lock_init(&match->lock);
1569 atomic_set(&match->sk_ref, 0);
1570 match->prot_hook.type = po->prot_hook.type;
1571 match->prot_hook.dev = po->prot_hook.dev;
1572 match->prot_hook.func = packet_rcv_fanout;
1573 match->prot_hook.af_packet_priv = match;
c0de08d0 1574 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1575 dev_add_pack(&match->prot_hook);
1576 list_add(&match->list, &fanout_list);
dc99f600 1577 }
afe62c68
ED
1578 err = -EINVAL;
1579 if (match->type == type &&
1580 match->prot_hook.type == po->prot_hook.type &&
1581 match->prot_hook.dev == po->prot_hook.dev) {
1582 err = -ENOSPC;
1583 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1584 __dev_remove_pack(&po->prot_hook);
1585 po->fanout = match;
1586 atomic_inc(&match->sk_ref);
1587 __fanout_link(sk, po);
1588 err = 0;
dc99f600
DM
1589 }
1590 }
afe62c68 1591out:
dc99f600 1592 mutex_unlock(&fanout_mutex);
0648ab70
WB
1593 if (err) {
1594 kfree(po->rollover);
1595 po->rollover = NULL;
1596 }
dc99f600
DM
1597 return err;
1598}
1599
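/* Illustrative user-space sketch (not part of this file): how a socket joins
 * the fanout group that fanout_add() above creates and packet_rcv_fanout()
 * demuxes. The group id 42 is an arbitrary example; all bound sockets using
 * the same id in the same network namespace share one struct packet_fanout.
 *
 *	int group_id = 42;
 *	int arg = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */
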
1600static void fanout_release(struct sock *sk)
1601{
1602 struct packet_sock *po = pkt_sk(sk);
1603 struct packet_fanout *f;
1604
1605 f = po->fanout;
1606 if (!f)
1607 return;
1608
fff3321d 1609 mutex_lock(&fanout_mutex);
dc99f600
DM
1610 po->fanout = NULL;
1611
dc99f600
DM
1612 if (atomic_dec_and_test(&f->sk_ref)) {
1613 list_del(&f->list);
1614 dev_remove_pack(&f->prot_hook);
1615 kfree(f);
1616 }
1617 mutex_unlock(&fanout_mutex);
0648ab70 1618
59f21118
WB
1619 if (po->rollover)
1620 kfree_rcu(po->rollover, rcu);
dc99f600 1621}
1da177e4 1622
90ddc4f0 1623static const struct proto_ops packet_ops;
1da177e4 1624
90ddc4f0 1625static const struct proto_ops packet_ops_spkt;
1da177e4 1626
40d4e3df
ED
1627static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1628 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1629{
1630 struct sock *sk;
1631 struct sockaddr_pkt *spkt;
1632
1633 /*
1634 * When we registered the protocol we saved the socket in the data
1635 * field for just this event.
1636 */
1637
1638 sk = pt->af_packet_priv;
1ce4f28b 1639
1da177e4
LT
1640 /*
1641 * Yank back the headers [hope the device set this
1642 * right or kerboom...]
1643 *
1644 * Incoming packets have ll header pulled,
1645 * push it back.
1646 *
98e399f8 1647 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1648 * so that this procedure is noop.
1649 */
1650
1651 if (skb->pkt_type == PACKET_LOOPBACK)
1652 goto out;
1653
09ad9bc7 1654 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1655 goto out;
1656
40d4e3df
ED
1657 skb = skb_share_check(skb, GFP_ATOMIC);
1658 if (skb == NULL)
1da177e4
LT
1659 goto oom;
1660
1661 /* drop any routing info */
adf30907 1662 skb_dst_drop(skb);
1da177e4 1663
84531c24
PO
1664 /* drop conntrack reference */
1665 nf_reset(skb);
1666
ffbc6111 1667 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1668
98e399f8 1669 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1670
1671 /*
1672 * The SOCK_PACKET socket receives _all_ frames.
1673 */
1674
1675 spkt->spkt_family = dev->type;
1676 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1677 spkt->spkt_protocol = skb->protocol;
1678
1679 /*
1680 * Charge the memory to the socket. This is done specifically
1681 * to prevent sockets using all the memory up.
1682 */
1683
40d4e3df 1684 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1685 return 0;
1686
1687out:
1688 kfree_skb(skb);
1689oom:
1690 return 0;
1691}
1692
1693
1694/*
1695 * Output a raw packet to a device layer. This bypasses all the other
1696 * protocol layers and you must therefore supply it with a complete frame
1697 */
1ce4f28b 1698
1b784140
YX
1699static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1700 size_t len)
1da177e4
LT
1701{
1702 struct sock *sk = sock->sk;
342dfc30 1703 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1704 struct sk_buff *skb = NULL;
1da177e4 1705 struct net_device *dev;
40d4e3df 1706 __be16 proto = 0;
1da177e4 1707 int err;
3bdc0eba 1708 int extra_len = 0;
1ce4f28b 1709
1da177e4 1710 /*
1ce4f28b 1711 * Get and verify the address.
1da177e4
LT
1712 */
1713
40d4e3df 1714 if (saddr) {
1da177e4 1715 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1716 return -EINVAL;
1717 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1718 proto = saddr->spkt_protocol;
1719 } else
1720 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1721
1722 /*
1ce4f28b 1723 * Find the device first to size check it
1da177e4
LT
1724 */
1725
de74e92a 1726 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1727retry:
654d1f8a
ED
1728 rcu_read_lock();
1729 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1730 err = -ENODEV;
1731 if (dev == NULL)
1732 goto out_unlock;
1ce4f28b 1733
d5e76b0a
DM
1734 err = -ENETDOWN;
1735 if (!(dev->flags & IFF_UP))
1736 goto out_unlock;
1737
1da177e4 1738 /*
40d4e3df
ED
1739 * You may not queue a frame bigger than the mtu. This is the lowest level
1740 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1741 */
1ce4f28b 1742
3bdc0eba
BG
1743 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1744 if (!netif_supports_nofcs(dev)) {
1745 err = -EPROTONOSUPPORT;
1746 goto out_unlock;
1747 }
1748 extra_len = 4; /* We're doing our own CRC */
1749 }
1750
1da177e4 1751 err = -EMSGSIZE;
3bdc0eba 1752 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1753 goto out_unlock;
1754
1a35ca80
ED
1755 if (!skb) {
1756 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1757 int tlen = dev->needed_tailroom;
1a35ca80
ED
1758 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1759
1760 rcu_read_unlock();
4ce40912 1761 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1762 if (skb == NULL)
1763 return -ENOBUFS;
1764 /* FIXME: Save some space for broken drivers that write a hard
1765 * header at transmission time by themselves. PPP is the notable
1766 * one here. This should really be fixed at the driver level.
1767 */
1768 skb_reserve(skb, reserved);
1769 skb_reset_network_header(skb);
1770
1771 /* Try to align data part correctly */
1772 if (hhlen) {
1773 skb->data -= hhlen;
1774 skb->tail -= hhlen;
1775 if (len < hhlen)
1776 skb_reset_network_header(skb);
1777 }
6ce8e9ce 1778 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1779 if (err)
1780 goto out_free;
1781 goto retry;
1da177e4
LT
1782 }
1783
3bdc0eba 1784 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1785 /* Earlier code assumed this would be a VLAN pkt,
1786 * double-check this now that we have the actual
1787 * packet in hand.
1788 */
1789 struct ethhdr *ehdr;
1790 skb_reset_mac_header(skb);
1791 ehdr = eth_hdr(skb);
1792 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1793 err = -EMSGSIZE;
1794 goto out_unlock;
1795 }
1796 }
1a35ca80 1797
1da177e4
LT
1798 skb->protocol = proto;
1799 skb->dev = dev;
1800 skb->priority = sk->sk_priority;
2d37a186 1801 skb->mark = sk->sk_mark;
bf84a010
DB
1802
1803 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1804
3bdc0eba
BG
1805 if (unlikely(extra_len == 4))
1806 skb->no_fcs = 1;
1807
40893fd0 1808 skb_probe_transport_header(skb, 0);
c1aad275 1809
1da177e4 1810 dev_queue_xmit(skb);
654d1f8a 1811 rcu_read_unlock();
40d4e3df 1812 return len;
1da177e4 1813
1da177e4 1814out_unlock:
654d1f8a 1815 rcu_read_unlock();
1a35ca80
ED
1816out_free:
1817 kfree_skb(skb);
1da177e4
LT
1818 return err;
1819}
1da177e4 1820
eea49cc9 1821static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1822 const struct sock *sk,
dbcb5855 1823 unsigned int res)
1da177e4
LT
1824{
1825 struct sk_filter *filter;
fda9ef5d 1826
80f8f102
ED
1827 rcu_read_lock();
1828 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1829 if (filter != NULL)
0a14842f 1830 res = SK_RUN_FILTER(filter, skb);
80f8f102 1831 rcu_read_unlock();
1da177e4 1832
dbcb5855 1833 return res;
1da177e4
LT
1834}
1835
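run_filter() simply executes whatever classic BPF program userspace has attached to the socket and returns the snap length the filter allows. For reference, a minimal userspace sketch (not part of af_packet.c, only a hedged illustration): it attaches a one-instruction accept-everything filter via SO_ATTACH_FILTER; the helper name is made up and fd is assumed to be an already-open AF_PACKET socket.

#include <sys/socket.h>
#include <linux/filter.h>

/* Attach a single-instruction cBPF program that returns 0xffff,
 * i.e. "accept up to 0xffff bytes of every packet". */
int attach_accept_all_filter(int fd)
{
	struct sock_filter insns[] = {
		{ BPF_RET | BPF_K, 0, 0, 0xffff },
	};
	struct sock_fprog prog = {
		.len	= 1,
		.filter	= insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}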
1836/*
62ab0812
ED
1837 * This function makes lazy skb cloning in the hope that most packets
1838 * are discarded by BPF.
1839 *
1840 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1841 * and skb->cb are mangled. It works because (and until) packets
1842 * falling here are owned by current CPU. Output packets are cloned
1843 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1844 sequentially, so that if we return skb to original state on exit,
1845 * we will not harm anyone.
1da177e4
LT
1846 */
1847
40d4e3df
ED
1848static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1849 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1850{
1851 struct sock *sk;
1852 struct sockaddr_ll *sll;
1853 struct packet_sock *po;
40d4e3df 1854 u8 *skb_head = skb->data;
1da177e4 1855 int skb_len = skb->len;
dbcb5855 1856 unsigned int snaplen, res;
1da177e4
LT
1857
1858 if (skb->pkt_type == PACKET_LOOPBACK)
1859 goto drop;
1860
1861 sk = pt->af_packet_priv;
1862 po = pkt_sk(sk);
1863
09ad9bc7 1864 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1865 goto drop;
1866
1da177e4
LT
1867 skb->dev = dev;
1868
3b04ddde 1869 if (dev->header_ops) {
1da177e4 1870 /* The device has an explicit notion of ll header,
62ab0812
ED
1871 * exported to higher levels.
1872 *
1873 * Otherwise, the device hides details of its frame
1874 * structure, so that the corresponding packet header is
1875 * never delivered to the user.
1da177e4
LT
1876 */
1877 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1878 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1879 else if (skb->pkt_type == PACKET_OUTGOING) {
1880 /* Special case: outgoing packets have ll header at head */
bbe735e4 1881 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1882 }
1883 }
1884
1885 snaplen = skb->len;
1886
dbcb5855
DM
1887 res = run_filter(skb, sk, snaplen);
1888 if (!res)
fda9ef5d 1889 goto drop_n_restore;
dbcb5855
DM
1890 if (snaplen > res)
1891 snaplen = res;
1da177e4 1892
0fd7bac6 1893 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1894 goto drop_n_acct;
1895
1896 if (skb_shared(skb)) {
1897 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1898 if (nskb == NULL)
1899 goto drop_n_acct;
1900
1901 if (skb_head != skb->data) {
1902 skb->data = skb_head;
1903 skb->len = skb_len;
1904 }
abc4e4fa 1905 consume_skb(skb);
1da177e4
LT
1906 skb = nskb;
1907 }
1908
b4772ef8 1909 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
1910
1911 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 1912 sll->sll_hatype = dev->type;
1da177e4 1913 sll->sll_pkttype = skb->pkt_type;
8032b464 1914 if (unlikely(po->origdev))
80feaacb
PWJ
1915 sll->sll_ifindex = orig_dev->ifindex;
1916 else
1917 sll->sll_ifindex = dev->ifindex;
1da177e4 1918
b95cce35 1919 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1920
2472d761
EB
1921 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
1922 * Use their space for storing the original skb length.
1923 */
1924 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 1925
1da177e4
LT
1926 if (pskb_trim(skb, snaplen))
1927 goto drop_n_acct;
1928
1929 skb_set_owner_r(skb, sk);
1930 skb->dev = NULL;
adf30907 1931 skb_dst_drop(skb);
1da177e4 1932
84531c24
PO
1933 /* drop conntrack reference */
1934 nf_reset(skb);
1935
1da177e4 1936 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1937 po->stats.stats1.tp_packets++;
3bc3b96f 1938 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
1939 __skb_queue_tail(&sk->sk_receive_queue, skb);
1940 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 1941 sk->sk_data_ready(sk);
1da177e4
LT
1942 return 0;
1943
1944drop_n_acct:
7091fbd8 1945 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1946 po->stats.stats1.tp_drops++;
7091fbd8
WB
1947 atomic_inc(&sk->sk_drops);
1948 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1949
1950drop_n_restore:
1951 if (skb_head != skb->data && skb_shared(skb)) {
1952 skb->data = skb_head;
1953 skb->len = skb_len;
1954 }
1955drop:
ead2ceb0 1956 consume_skb(skb);
1da177e4
LT
1957 return 0;
1958}
1959
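packet_rcv() is the receive hook for ordinary (non-mmap) packet sockets; the sockaddr_ll it fills in here is what later comes back to userspace as the source address. A minimal receiver sketch (not part of af_packet.c), assuming CAP_NET_RAW; the buffer size is arbitrary:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	ssize_t n;

	if (fd < 0)
		return 1;

	/* Fields such as sll_ifindex, sll_pkttype and sll_halen are the
	 * ones packet_rcv() stored in PACKET_SKB_CB(skb)->sa.ll. */
	n = recvfrom(fd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&from, &fromlen);
	if (n < 0)
		return 1;

	printf("got %zd bytes, ifindex=%d pkttype=%d halen=%d\n",
	       n, from.sll_ifindex, from.sll_pkttype, from.sll_halen);
	return 0;
}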
40d4e3df
ED
1960static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1961 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1962{
1963 struct sock *sk;
1964 struct packet_sock *po;
1965 struct sockaddr_ll *sll;
184f489e 1966 union tpacket_uhdr h;
40d4e3df 1967 u8 *skb_head = skb->data;
1da177e4 1968 int skb_len = skb->len;
dbcb5855 1969 unsigned int snaplen, res;
f6fb8f10 1970 unsigned long status = TP_STATUS_USER;
bbd6ef87 1971 unsigned short macoff, netoff, hdrlen;
1da177e4 1972 struct sk_buff *copy_skb = NULL;
bbd6ef87 1973 struct timespec ts;
b9c32fb2 1974 __u32 ts_status;
1da177e4 1975
51846355
AW
1976 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1977 * We may add members to them up to the current aligned size without forcing
1978 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1979 */
1980 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1981 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1982
1da177e4
LT
1983 if (skb->pkt_type == PACKET_LOOPBACK)
1984 goto drop;
1985
1986 sk = pt->af_packet_priv;
1987 po = pkt_sk(sk);
1988
09ad9bc7 1989 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1990 goto drop;
1991
3b04ddde 1992 if (dev->header_ops) {
1da177e4 1993 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1994 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1995 else if (skb->pkt_type == PACKET_OUTGOING) {
1996 /* Special case: outgoing packets have ll header at head */
bbe735e4 1997 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1998 }
1999 }
2000
2001 snaplen = skb->len;
2002
dbcb5855
DM
2003 res = run_filter(skb, sk, snaplen);
2004 if (!res)
fda9ef5d 2005 goto drop_n_restore;
68c2e5de
AD
2006
2007 if (skb->ip_summed == CHECKSUM_PARTIAL)
2008 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2009 else if (skb->pkt_type != PACKET_OUTGOING &&
2010 (skb->ip_summed == CHECKSUM_COMPLETE ||
2011 skb_csum_unnecessary(skb)))
2012 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2013
dbcb5855
DM
2014 if (snaplen > res)
2015 snaplen = res;
1da177e4
LT
2016
2017 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2018 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2019 po->tp_reserve;
1da177e4 2020 } else {
95c96174 2021 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2022 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
2023 (maclen < 16 ? 16 : maclen)) +
2024 po->tp_reserve;
1da177e4
LT
2025 macoff = netoff - maclen;
2026 }
f6fb8f10 2027 if (po->tp_version <= TPACKET_V2) {
2028 if (macoff + snaplen > po->rx_ring.frame_size) {
2029 if (po->copy_thresh &&
0fd7bac6 2030 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2031 if (skb_shared(skb)) {
2032 copy_skb = skb_clone(skb, GFP_ATOMIC);
2033 } else {
2034 copy_skb = skb_get(skb);
2035 skb_head = skb->data;
2036 }
2037 if (copy_skb)
2038 skb_set_owner_r(copy_skb, sk);
1da177e4 2039 }
f6fb8f10 2040 snaplen = po->rx_ring.frame_size - macoff;
2041 if ((int)snaplen < 0)
2042 snaplen = 0;
1da177e4 2043 }
dc808110
ED
2044 } else if (unlikely(macoff + snaplen >
2045 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2046 u32 nval;
2047
2048 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2049 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2050 snaplen, nval, macoff);
2051 snaplen = nval;
2052 if (unlikely((int)snaplen < 0)) {
2053 snaplen = 0;
2054 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2055 }
1da177e4 2056 }
1da177e4 2057 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2058 h.raw = packet_current_rx_frame(po, skb,
2059 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2060 if (!h.raw)
1da177e4 2061 goto ring_is_full;
f6fb8f10 2062 if (po->tp_version <= TPACKET_V2) {
2063 packet_increment_rx_head(po, &po->rx_ring);
2064 /*
2065 * LOSING will be reported till you read the stats,
2066 * because it's COR - Clear On Read.
2067 * Anyway, this is done for V1/V2 only, as V3 doesn't need this
2068 * at packet level.
2069 */
ee80fbf3 2070 if (po->stats.stats1.tp_drops)
f6fb8f10 2071 status |= TP_STATUS_LOSING;
2072 }
ee80fbf3 2073 po->stats.stats1.tp_packets++;
1da177e4
LT
2074 if (copy_skb) {
2075 status |= TP_STATUS_COPY;
2076 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2077 }
1da177e4
LT
2078 spin_unlock(&sk->sk_receive_queue.lock);
2079
bbd6ef87 2080 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2081
2082 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2083 getnstimeofday(&ts);
1da177e4 2084
b9c32fb2
DB
2085 status |= ts_status;
2086
bbd6ef87
PM
2087 switch (po->tp_version) {
2088 case TPACKET_V1:
2089 h.h1->tp_len = skb->len;
2090 h.h1->tp_snaplen = snaplen;
2091 h.h1->tp_mac = macoff;
2092 h.h1->tp_net = netoff;
4b457bdf
DB
2093 h.h1->tp_sec = ts.tv_sec;
2094 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2095 hdrlen = sizeof(*h.h1);
2096 break;
2097 case TPACKET_V2:
2098 h.h2->tp_len = skb->len;
2099 h.h2->tp_snaplen = snaplen;
2100 h.h2->tp_mac = macoff;
2101 h.h2->tp_net = netoff;
bbd6ef87
PM
2102 h.h2->tp_sec = ts.tv_sec;
2103 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2104 if (skb_vlan_tag_present(skb)) {
2105 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2106 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2107 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2108 } else {
2109 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2110 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2111 }
e4d26f4b 2112 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2113 hdrlen = sizeof(*h.h2);
2114 break;
f6fb8f10 2115 case TPACKET_V3:
2116 /* tp_nxt_offset and vlan are already populated above.
2117 * So don't clear those fields here
2118 */
2119 h.h3->tp_status |= status;
2120 h.h3->tp_len = skb->len;
2121 h.h3->tp_snaplen = snaplen;
2122 h.h3->tp_mac = macoff;
2123 h.h3->tp_net = netoff;
f6fb8f10 2124 h.h3->tp_sec = ts.tv_sec;
2125 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2126 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2127 hdrlen = sizeof(*h.h3);
2128 break;
bbd6ef87
PM
2129 default:
2130 BUG();
2131 }
1da177e4 2132
bbd6ef87 2133 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2134 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2135 sll->sll_family = AF_PACKET;
2136 sll->sll_hatype = dev->type;
2137 sll->sll_protocol = skb->protocol;
2138 sll->sll_pkttype = skb->pkt_type;
8032b464 2139 if (unlikely(po->origdev))
80feaacb
PWJ
2140 sll->sll_ifindex = orig_dev->ifindex;
2141 else
2142 sll->sll_ifindex = dev->ifindex;
1da177e4 2143
e16aa207 2144 smp_mb();
f0d4eb29 2145
f6dafa95 2146#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2147 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2148 u8 *start, *end;
2149
f0d4eb29
DB
2150 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2151 macoff + snaplen);
2152
2153 for (start = h.raw; start < end; start += PAGE_SIZE)
2154 flush_dcache_page(pgv_to_page(start));
1da177e4 2155 }
f0d4eb29 2156 smp_wmb();
f6dafa95 2157#endif
f0d4eb29 2158
da413eec 2159 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2160 __packet_set_status(po, h.raw, status);
da413eec
DC
2161 sk->sk_data_ready(sk);
2162 } else {
f6fb8f10 2163 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2164 }
1da177e4
LT
2165
2166drop_n_restore:
2167 if (skb_head != skb->data && skb_shared(skb)) {
2168 skb->data = skb_head;
2169 skb->len = skb_len;
2170 }
2171drop:
1ce4f28b 2172 kfree_skb(skb);
1da177e4
LT
2173 return 0;
2174
2175ring_is_full:
ee80fbf3 2176 po->stats.stats1.tp_drops++;
1da177e4
LT
2177 spin_unlock(&sk->sk_receive_queue.lock);
2178
676d2369 2179 sk->sk_data_ready(sk);
acb5d75b 2180 kfree_skb(copy_skb);
1da177e4
LT
2181 goto drop_n_restore;
2182}
2183
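tpacket_rcv() copies frames into a memory-mapped RX ring instead of queuing skbs. A hedged userspace sketch of creating such a ring for TPACKET_V2 (not part of af_packet.c); the block/frame geometry is an arbitrary illustrative choice:

#include <string.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *setup_rx_ring(int fd, struct tpacket_req *req)
{
	int ver = TPACKET_V2;

	/* Arbitrary illustrative geometry: 32 blocks of 4 KiB,
	 * two 2 KiB frames per block, 64 frames total. */
	memset(req, 0, sizeof(*req));
	req->tp_block_size = 4096;
	req->tp_frame_size = 2048;
	req->tp_block_nr   = 32;
	req->tp_frame_nr   = 64;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return NULL;
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
		return NULL;

	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

Each frame then starts with a struct tpacket2_hdr: userspace waits for tp_status to show TP_STATUS_USER, reads the packet at tp_mac/tp_net, and returns the slot by writing TP_STATUS_KERNEL, which is exactly the ownership hand-off visible in the status handling above.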
69e3c75f
JB
2184static void tpacket_destruct_skb(struct sk_buff *skb)
2185{
2186 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2187
69e3c75f 2188 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2189 void *ph;
b9c32fb2
DB
2190 __u32 ts;
2191
69e3c75f 2192 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2193 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2194
2195 ts = __packet_set_timestamp(po, ph, skb);
2196 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2197 }
2198
2199 sock_wfree(skb);
2200}
2201
9c707762
WB
2202static bool ll_header_truncated(const struct net_device *dev, int len)
2203{
2204 /* net device doesn't like empty head */
2205 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2206 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2207 current->comm, len, dev->hard_header_len);
2208 return true;
2209 }
2210
2211 return false;
2212}
2213
40d4e3df
ED
2214static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2215 void *frame, struct net_device *dev, int size_max,
ae641949 2216 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2217{
184f489e 2218 union tpacket_uhdr ph;
09effa67 2219 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2220 struct socket *sock = po->sk.sk_socket;
2221 struct page *page;
2222 void *data;
2223 int err;
2224
2225 ph.raw = frame;
2226
2227 skb->protocol = proto;
2228 skb->dev = dev;
2229 skb->priority = po->sk.sk_priority;
2d37a186 2230 skb->mark = po->sk.sk_mark;
2e31396f 2231 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2232 skb_shinfo(skb)->destructor_arg = ph.raw;
2233
2234 switch (po->tp_version) {
2235 case TPACKET_V2:
2236 tp_len = ph.h2->tp_len;
2237 break;
2238 default:
2239 tp_len = ph.h1->tp_len;
2240 break;
2241 }
09effa67
DM
2242 if (unlikely(tp_len > size_max)) {
2243 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2244 return -EMSGSIZE;
2245 }
69e3c75f 2246
ae641949 2247 skb_reserve(skb, hlen);
69e3c75f 2248 skb_reset_network_header(skb);
c1aad275 2249
d346a3fa
DB
2250 if (!packet_use_direct_xmit(po))
2251 skb_probe_transport_header(skb, 0);
2252 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2253 int off_min, off_max, off;
2254 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2255 off_max = po->tx_ring.frame_size - tp_len;
2256 if (sock->type == SOCK_DGRAM) {
2257 switch (po->tp_version) {
2258 case TPACKET_V2:
2259 off = ph.h2->tp_net;
2260 break;
2261 default:
2262 off = ph.h1->tp_net;
2263 break;
2264 }
2265 } else {
2266 switch (po->tp_version) {
2267 case TPACKET_V2:
2268 off = ph.h2->tp_mac;
2269 break;
2270 default:
2271 off = ph.h1->tp_mac;
2272 break;
2273 }
2274 }
2275 if (unlikely((off < off_min) || (off_max < off)))
2276 return -EINVAL;
2277 data = ph.raw + off;
2278 } else {
2279 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2280 }
69e3c75f
JB
2281 to_write = tp_len;
2282
2283 if (sock->type == SOCK_DGRAM) {
2284 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2285 NULL, tp_len);
2286 if (unlikely(err < 0))
2287 return -EINVAL;
40d4e3df 2288 } else if (dev->hard_header_len) {
9c707762 2289 if (ll_header_truncated(dev, tp_len))
69e3c75f 2290 return -EINVAL;
69e3c75f
JB
2291
2292 skb_push(skb, dev->hard_header_len);
2293 err = skb_store_bits(skb, 0, data,
2294 dev->hard_header_len);
2295 if (unlikely(err))
2296 return err;
2297
2298 data += dev->hard_header_len;
2299 to_write -= dev->hard_header_len;
2300 }
2301
69e3c75f
JB
2302 offset = offset_in_page(data);
2303 len_max = PAGE_SIZE - offset;
2304 len = ((to_write > len_max) ? len_max : to_write);
2305
2306 skb->data_len = to_write;
2307 skb->len += to_write;
2308 skb->truesize += to_write;
2309 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2310
2311 while (likely(to_write)) {
2312 nr_frags = skb_shinfo(skb)->nr_frags;
2313
2314 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2315 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2316 MAX_SKB_FRAGS);
69e3c75f
JB
2317 return -EFAULT;
2318 }
2319
0af55bb5
CG
2320 page = pgv_to_page(data);
2321 data += len;
69e3c75f
JB
2322 flush_dcache_page(page);
2323 get_page(page);
0af55bb5 2324 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2325 to_write -= len;
2326 offset = 0;
2327 len_max = PAGE_SIZE;
2328 len = ((to_write > len_max) ? len_max : to_write);
2329 }
2330
2331 return tp_len;
2332}
2333
2334static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2335{
69e3c75f
JB
2336 struct sk_buff *skb;
2337 struct net_device *dev;
2338 __be16 proto;
09effa67 2339 int err, reserve = 0;
40d4e3df 2340 void *ph;
342dfc30 2341 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2342 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2343 int tp_len, size_max;
2344 unsigned char *addr;
2345 int len_sum = 0;
9e67030a 2346 int status = TP_STATUS_AVAILABLE;
ae641949 2347 int hlen, tlen;
69e3c75f 2348
69e3c75f
JB
2349 mutex_lock(&po->pg_vec_lock);
2350
66e56cd4 2351 if (likely(saddr == NULL)) {
e40526cb 2352 dev = packet_cached_dev_get(po);
69e3c75f
JB
2353 proto = po->num;
2354 addr = NULL;
2355 } else {
2356 err = -EINVAL;
2357 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2358 goto out;
2359 if (msg->msg_namelen < (saddr->sll_halen
2360 + offsetof(struct sockaddr_ll,
2361 sll_addr)))
2362 goto out;
69e3c75f
JB
2363 proto = saddr->sll_protocol;
2364 addr = saddr->sll_addr;
827d9780 2365 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2366 }
2367
69e3c75f
JB
2368 err = -ENXIO;
2369 if (unlikely(dev == NULL))
2370 goto out;
69e3c75f
JB
2371 err = -ENETDOWN;
2372 if (unlikely(!(dev->flags & IFF_UP)))
2373 goto out_put;
2374
52f1454f 2375 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2376 size_max = po->tx_ring.frame_size
b5dd884e 2377 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2378
09effa67
DM
2379 if (size_max > dev->mtu + reserve)
2380 size_max = dev->mtu + reserve;
2381
69e3c75f
JB
2382 do {
2383 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2384 TP_STATUS_SEND_REQUEST);
69e3c75f 2385 if (unlikely(ph == NULL)) {
87a2fd28
DB
2386 if (need_wait && need_resched())
2387 schedule();
69e3c75f
JB
2388 continue;
2389 }
2390
2391 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2392 hlen = LL_RESERVED_SPACE(dev);
2393 tlen = dev->needed_tailroom;
69e3c75f 2394 skb = sock_alloc_send_skb(&po->sk,
ae641949 2395 hlen + tlen + sizeof(struct sockaddr_ll),
fbf33a28 2396 !need_wait, &err);
69e3c75f 2397
fbf33a28
KM
2398 if (unlikely(skb == NULL)) {
2399 /* we assume the socket was initially writeable ... */
2400 if (likely(len_sum > 0))
2401 err = len_sum;
69e3c75f 2402 goto out_status;
fbf33a28 2403 }
69e3c75f 2404 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f
DB
2405 addr, hlen);
2406 if (tp_len > dev->mtu + dev->hard_header_len) {
2407 struct ethhdr *ehdr;
2408 /* Earlier code assumed this would be a VLAN pkt,
2409 * double-check this now that we have the actual
2410 * packet in hand.
2411 */
69e3c75f 2412
52f1454f
DB
2413 skb_reset_mac_header(skb);
2414 ehdr = eth_hdr(skb);
2415 if (ehdr->h_proto != htons(ETH_P_8021Q))
2416 tp_len = -EMSGSIZE;
2417 }
69e3c75f
JB
2418 if (unlikely(tp_len < 0)) {
2419 if (po->tp_loss) {
2420 __packet_set_status(po, ph,
2421 TP_STATUS_AVAILABLE);
2422 packet_increment_head(&po->tx_ring);
2423 kfree_skb(skb);
2424 continue;
2425 } else {
2426 status = TP_STATUS_WRONG_FORMAT;
2427 err = tp_len;
2428 goto out_status;
2429 }
2430 }
2431
0fd5d57b
DB
2432 packet_pick_tx_queue(dev, skb);
2433
69e3c75f
JB
2434 skb->destructor = tpacket_destruct_skb;
2435 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2436 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2437
2438 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2439 err = po->xmit(skb);
eb70df13
JP
2440 if (unlikely(err > 0)) {
2441 err = net_xmit_errno(err);
2442 if (err && __packet_get_status(po, ph) ==
2443 TP_STATUS_AVAILABLE) {
2444 /* skb was destructed already */
2445 skb = NULL;
2446 goto out_status;
2447 }
2448 /*
2449 * skb was dropped but not destructed yet;
2450 * let's treat it like congestion or err < 0
2451 */
2452 err = 0;
2453 }
69e3c75f
JB
2454 packet_increment_head(&po->tx_ring);
2455 len_sum += tp_len;
b0138408
DB
2456 } while (likely((ph != NULL) ||
2457 /* Note: packet_read_pending() might be slow if we have
2458 * to call it as it's a per-cpu variable, but in the fast path
2459 * we already short-circuit the loop with the first
2460 * condition, and luckily don't have to go that path
2461 * anyway.
2462 */
2463 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2464
2465 err = len_sum;
2466 goto out_put;
2467
69e3c75f
JB
2468out_status:
2469 __packet_set_status(po, ph, status);
2470 kfree_skb(skb);
2471out_put:
e40526cb 2472 dev_put(dev);
69e3c75f
JB
2473out:
2474 mutex_unlock(&po->pg_vec_lock);
2475 return err;
2476}
69e3c75f 2477
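tpacket_snd() walks the mapped TX ring looking for frames marked TP_STATUS_SEND_REQUEST. A sketch of how userspace might queue one frame and kick the ring (not part of af_packet.c); it assumes TPACKET_V2, SOCK_RAW and the default layout without PACKET_TX_HAS_OFF, matching the data offset computed in tpacket_fill_skb() above, and it only touches the first slot for brevity:

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_one_frame(int fd, void *ring, const void *pkt, unsigned int len)
{
	struct tpacket2_hdr *hdr = ring;	/* first frame of the ring */
	/* Payload starts right after the aligned V2 header, which is
	 * where tpacket_fill_skb() expects it for SOCK_RAW. */
	unsigned char *data = (unsigned char *)ring +
			      TPACKET_ALIGN(sizeof(*hdr));

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* kernel still owns the slot */

	memcpy(data, pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* An empty send() makes tpacket_snd() scan the ring. */
	return (int)send(fd, NULL, 0, 0);
}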
eea49cc9
OJ
2478static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2479 size_t reserve, size_t len,
2480 size_t linear, int noblock,
2481 int *err)
bfd5f4a3
SS
2482{
2483 struct sk_buff *skb;
2484
2485 /* Under a page? Don't bother with paged skb. */
2486 if (prepad + len < PAGE_SIZE || !linear)
2487 linear = len;
2488
2489 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2490 err, 0);
bfd5f4a3
SS
2491 if (!skb)
2492 return NULL;
2493
2494 skb_reserve(skb, reserve);
2495 skb_put(skb, linear);
2496 skb->data_len = len - linear;
2497 skb->len += len - linear;
2498
2499 return skb;
2500}
2501
d346a3fa 2502static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2503{
2504 struct sock *sk = sock->sk;
342dfc30 2505 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2506 struct sk_buff *skb;
2507 struct net_device *dev;
0e11c91e 2508 __be16 proto;
1da177e4 2509 unsigned char *addr;
827d9780 2510 int err, reserve = 0;
bfd5f4a3
SS
2511 struct virtio_net_hdr vnet_hdr = { 0 };
2512 int offset = 0;
2513 int vnet_hdr_len;
2514 struct packet_sock *po = pkt_sk(sk);
2515 unsigned short gso_type = 0;
ae641949 2516 int hlen, tlen;
3bdc0eba 2517 int extra_len = 0;
8feb2fb2 2518 ssize_t n;
1da177e4
LT
2519
2520 /*
1ce4f28b 2521 * Get and verify the address.
1da177e4 2522 */
1ce4f28b 2523
66e56cd4 2524 if (likely(saddr == NULL)) {
e40526cb 2525 dev = packet_cached_dev_get(po);
1da177e4
LT
2526 proto = po->num;
2527 addr = NULL;
2528 } else {
2529 err = -EINVAL;
2530 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2531 goto out;
0fb375fb
EB
2532 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2533 goto out;
1da177e4
LT
2534 proto = saddr->sll_protocol;
2535 addr = saddr->sll_addr;
827d9780 2536 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2537 }
2538
1da177e4 2539 err = -ENXIO;
e40526cb 2540 if (unlikely(dev == NULL))
1da177e4 2541 goto out_unlock;
d5e76b0a 2542 err = -ENETDOWN;
e40526cb 2543 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2544 goto out_unlock;
2545
e40526cb
DB
2546 if (sock->type == SOCK_RAW)
2547 reserve = dev->hard_header_len;
bfd5f4a3
SS
2548 if (po->has_vnet_hdr) {
2549 vnet_hdr_len = sizeof(vnet_hdr);
2550
2551 err = -EINVAL;
2552 if (len < vnet_hdr_len)
2553 goto out_unlock;
2554
2555 len -= vnet_hdr_len;
2556
8feb2fb2 2557 err = -EFAULT;
c0371da6 2558 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2559 if (n != vnet_hdr_len)
bfd5f4a3
SS
2560 goto out_unlock;
2561
2562 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2563 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2564 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2565 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2566 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2567 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2568 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2569
2570 err = -EINVAL;
dc9e5153 2571 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2572 goto out_unlock;
2573
2574 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2575 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2576 case VIRTIO_NET_HDR_GSO_TCPV4:
2577 gso_type = SKB_GSO_TCPV4;
2578 break;
2579 case VIRTIO_NET_HDR_GSO_TCPV6:
2580 gso_type = SKB_GSO_TCPV6;
2581 break;
2582 case VIRTIO_NET_HDR_GSO_UDP:
2583 gso_type = SKB_GSO_UDP;
2584 break;
2585 default:
2586 goto out_unlock;
2587 }
2588
2589 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2590 gso_type |= SKB_GSO_TCP_ECN;
2591
2592 if (vnet_hdr.gso_size == 0)
2593 goto out_unlock;
2594
2595 }
2596 }
2597
3bdc0eba
BG
2598 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2599 if (!netif_supports_nofcs(dev)) {
2600 err = -EPROTONOSUPPORT;
2601 goto out_unlock;
2602 }
2603 extra_len = 4; /* We're doing our own CRC */
2604 }
2605
1da177e4 2606 err = -EMSGSIZE;
3bdc0eba 2607 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2608 goto out_unlock;
2609
bfd5f4a3 2610 err = -ENOBUFS;
ae641949
HX
2611 hlen = LL_RESERVED_SPACE(dev);
2612 tlen = dev->needed_tailroom;
dc9e5153
MT
2613 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2614 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2615 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2616 if (skb == NULL)
1da177e4
LT
2617 goto out_unlock;
2618
bfd5f4a3 2619 skb_set_network_header(skb, reserve);
1da177e4 2620
0c4e8581 2621 err = -EINVAL;
9c707762
WB
2622 if (sock->type == SOCK_DGRAM) {
2623 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2624 if (unlikely(offset < 0))
9c707762
WB
2625 goto out_free;
2626 } else {
2627 if (ll_header_truncated(dev, len))
2628 goto out_free;
2629 }
1da177e4
LT
2630
2631 /* Returns -EFAULT on error */
c0371da6 2632 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2633 if (err)
2634 goto out_free;
bf84a010
DB
2635
2636 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2637
3bdc0eba 2638 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2639 /* Earlier code assumed this would be a VLAN pkt,
2640 * double-check this now that we have the actual
2641 * packet in hand.
2642 */
2643 struct ethhdr *ehdr;
2644 skb_reset_mac_header(skb);
2645 ehdr = eth_hdr(skb);
2646 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2647 err = -EMSGSIZE;
2648 goto out_free;
2649 }
57f89bfa
BG
2650 }
2651
09effa67
DM
2652 skb->protocol = proto;
2653 skb->dev = dev;
1da177e4 2654 skb->priority = sk->sk_priority;
2d37a186 2655 skb->mark = sk->sk_mark;
0fd5d57b
DB
2656
2657 packet_pick_tx_queue(dev, skb);
1da177e4 2658
bfd5f4a3
SS
2659 if (po->has_vnet_hdr) {
2660 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2661 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2662 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2663 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2664 err = -EINVAL;
2665 goto out_free;
2666 }
2667 }
2668
dc9e5153
MT
2669 skb_shinfo(skb)->gso_size =
2670 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2671 skb_shinfo(skb)->gso_type = gso_type;
2672
2673 /* Header must be checked, and gso_segs computed. */
2674 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2675 skb_shinfo(skb)->gso_segs = 0;
2676
2677 len += vnet_hdr_len;
2678 }
2679
d346a3fa
DB
2680 if (!packet_use_direct_xmit(po))
2681 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2682 if (unlikely(extra_len == 4))
2683 skb->no_fcs = 1;
2684
d346a3fa 2685 err = po->xmit(skb);
1da177e4
LT
2686 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2687 goto out_unlock;
2688
e40526cb 2689 dev_put(dev);
1da177e4 2690
40d4e3df 2691 return len;
1da177e4
LT
2692
2693out_free:
2694 kfree_skb(skb);
2695out_unlock:
e40526cb 2696 if (dev)
1da177e4
LT
2697 dev_put(dev);
2698out:
2699 return err;
2700}
2701
1b784140 2702static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2703{
69e3c75f
JB
2704 struct sock *sk = sock->sk;
2705 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2706
69e3c75f
JB
2707 if (po->tx_ring.pg_vec)
2708 return tpacket_snd(po, msg);
2709 else
69e3c75f
JB
2710 return packet_snd(sock, msg, len);
2711}
2712
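packet_snd() is the non-ring transmit path reached through packet_sendmsg() above when no TX ring is configured. A sketch of a sendto() caller (not part of af_packet.c); the ETH_P_IP ethertype is a placeholder, and for SOCK_RAW the buffer is assumed to already carry a complete Ethernet header:

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int send_raw_frame(int fd, int ifindex, const void *frame, size_t len)
{
	struct sockaddr_ll dst;

	memset(&dst, 0, sizeof(dst));
	dst.sll_family   = AF_PACKET;
	dst.sll_ifindex  = ifindex;
	dst.sll_protocol = htons(ETH_P_IP);	/* placeholder ethertype */

	/* SOCK_RAW: the caller's buffer already holds the link-layer header. */
	return (int)sendto(fd, frame, len, 0,
			   (struct sockaddr *)&dst, sizeof(dst));
}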
1da177e4
LT
2713/*
2714 * Close a PACKET socket. This is fairly simple. We immediately go
2715 * to 'closed' state and remove our protocol entry in the device list.
2716 */
2717
2718static int packet_release(struct socket *sock)
2719{
2720 struct sock *sk = sock->sk;
2721 struct packet_sock *po;
d12d01d6 2722 struct net *net;
f6fb8f10 2723 union tpacket_req_u req_u;
1da177e4
LT
2724
2725 if (!sk)
2726 return 0;
2727
3b1e0a65 2728 net = sock_net(sk);
1da177e4
LT
2729 po = pkt_sk(sk);
2730
0fa7fa98 2731 mutex_lock(&net->packet.sklist_lock);
808f5114 2732 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2733 mutex_unlock(&net->packet.sklist_lock);
2734
2735 preempt_disable();
920de804 2736 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2737 preempt_enable();
1da177e4 2738
808f5114 2739 spin_lock(&po->bind_lock);
ce06b03e 2740 unregister_prot_hook(sk, false);
66e56cd4
DB
2741 packet_cached_dev_reset(po);
2742
160ff18a
BG
2743 if (po->prot_hook.dev) {
2744 dev_put(po->prot_hook.dev);
2745 po->prot_hook.dev = NULL;
2746 }
808f5114 2747 spin_unlock(&po->bind_lock);
1da177e4 2748
1da177e4 2749 packet_flush_mclist(sk);
1da177e4 2750
9665d5d6
PS
2751 if (po->rx_ring.pg_vec) {
2752 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2753 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2754 }
69e3c75f 2755
9665d5d6
PS
2756 if (po->tx_ring.pg_vec) {
2757 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2758 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2759 }
1da177e4 2760
dc99f600
DM
2761 fanout_release(sk);
2762
808f5114 2763 synchronize_net();
1da177e4
LT
2764 /*
2765 * Now the socket is dead. No more input will appear.
2766 */
1da177e4
LT
2767 sock_orphan(sk);
2768 sock->sk = NULL;
2769
2770 /* Purge queues */
2771
2772 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2773 packet_free_pending(po);
17ab56a2 2774 sk_refcnt_debug_release(sk);
1da177e4
LT
2775
2776 sock_put(sk);
2777 return 0;
2778}
2779
2780/*
2781 * Attach a packet hook.
2782 */
2783
902fefb8 2784static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2785{
2786 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2787 const struct net_device *dev_curr;
2788 __be16 proto_curr;
2789 bool need_rehook;
dc99f600 2790
aef950b4
WY
2791 if (po->fanout) {
2792 if (dev)
2793 dev_put(dev);
2794
dc99f600 2795 return -EINVAL;
aef950b4 2796 }
1da177e4
LT
2797
2798 lock_sock(sk);
1da177e4 2799 spin_lock(&po->bind_lock);
66e56cd4 2800
902fefb8
DB
2801 proto_curr = po->prot_hook.type;
2802 dev_curr = po->prot_hook.dev;
2803
2804 need_rehook = proto_curr != proto || dev_curr != dev;
2805
2806 if (need_rehook) {
2807 unregister_prot_hook(sk, true);
1da177e4 2808
902fefb8
DB
2809 po->num = proto;
2810 po->prot_hook.type = proto;
1da177e4 2811
902fefb8
DB
2812 if (po->prot_hook.dev)
2813 dev_put(po->prot_hook.dev);
2814
2815 po->prot_hook.dev = dev;
2816
2817 po->ifindex = dev ? dev->ifindex : 0;
2818 packet_cached_dev_assign(po, dev);
2819 }
66e56cd4 2820
902fefb8 2821 if (proto == 0 || !need_rehook)
1da177e4
LT
2822 goto out_unlock;
2823
be85d4ad 2824 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2825 register_prot_hook(sk);
be85d4ad
UT
2826 } else {
2827 sk->sk_err = ENETDOWN;
2828 if (!sock_flag(sk, SOCK_DEAD))
2829 sk->sk_error_report(sk);
1da177e4
LT
2830 }
2831
2832out_unlock:
2833 spin_unlock(&po->bind_lock);
2834 release_sock(sk);
2835 return 0;
2836}
2837
2838/*
2839 * Bind a packet socket to a device
2840 */
2841
40d4e3df
ED
2842static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2843 int addr_len)
1da177e4 2844{
40d4e3df 2845 struct sock *sk = sock->sk;
1da177e4
LT
2846 char name[15];
2847 struct net_device *dev;
2848 int err = -ENODEV;
1ce4f28b 2849
1da177e4
LT
2850 /*
2851 * Check legality
2852 */
1ce4f28b 2853
8ae55f04 2854 if (addr_len != sizeof(struct sockaddr))
1da177e4 2855 return -EINVAL;
40d4e3df 2856 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2857
3b1e0a65 2858 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2859 if (dev)
1da177e4 2860 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2861 return err;
2862}
1da177e4
LT
2863
2864static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2865{
40d4e3df
ED
2866 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2867 struct sock *sk = sock->sk;
1da177e4
LT
2868 struct net_device *dev = NULL;
2869 int err;
2870
2871
2872 /*
2873 * Check legality
2874 */
1ce4f28b 2875
1da177e4
LT
2876 if (addr_len < sizeof(struct sockaddr_ll))
2877 return -EINVAL;
2878 if (sll->sll_family != AF_PACKET)
2879 return -EINVAL;
2880
2881 if (sll->sll_ifindex) {
2882 err = -ENODEV;
3b1e0a65 2883 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2884 if (dev == NULL)
2885 goto out;
2886 }
2887 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2888
2889out:
2890 return err;
2891}
2892
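packet_bind() extracts the ifindex and protocol from the sockaddr_ll and hands them to packet_do_bind(). A minimal userspace sketch (not part of af_packet.c); the helper name is illustrative and ifname is whatever interface the caller chooses:

#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}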
2893static struct proto packet_proto = {
2894 .name = "PACKET",
2895 .owner = THIS_MODULE,
2896 .obj_size = sizeof(struct packet_sock),
2897};
2898
2899/*
1ce4f28b 2900 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2901 */
2902
3f378b68
EP
2903static int packet_create(struct net *net, struct socket *sock, int protocol,
2904 int kern)
1da177e4
LT
2905{
2906 struct sock *sk;
2907 struct packet_sock *po;
0e11c91e 2908 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2909 int err;
2910
df008c91 2911 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2912 return -EPERM;
be02097c
DM
2913 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2914 sock->type != SOCK_PACKET)
1da177e4
LT
2915 return -ESOCKTNOSUPPORT;
2916
2917 sock->state = SS_UNCONNECTED;
2918
2919 err = -ENOBUFS;
11aa9c28 2920 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
2921 if (sk == NULL)
2922 goto out;
2923
2924 sock->ops = &packet_ops;
1da177e4
LT
2925 if (sock->type == SOCK_PACKET)
2926 sock->ops = &packet_ops_spkt;
be02097c 2927
1da177e4
LT
2928 sock_init_data(sock, sk);
2929
2930 po = pkt_sk(sk);
2931 sk->sk_family = PF_PACKET;
0e11c91e 2932 po->num = proto;
d346a3fa 2933 po->xmit = dev_queue_xmit;
66e56cd4 2934
b0138408
DB
2935 err = packet_alloc_pending(po);
2936 if (err)
2937 goto out2;
2938
66e56cd4 2939 packet_cached_dev_reset(po);
1da177e4
LT
2940
2941 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2942 sk_refcnt_debug_inc(sk);
1da177e4
LT
2943
2944 /*
2945 * Attach a protocol block
2946 */
2947
2948 spin_lock_init(&po->bind_lock);
905db440 2949 mutex_init(&po->pg_vec_lock);
0648ab70 2950 po->rollover = NULL;
1da177e4 2951 po->prot_hook.func = packet_rcv;
be02097c 2952
1da177e4
LT
2953 if (sock->type == SOCK_PACKET)
2954 po->prot_hook.func = packet_rcv_spkt;
be02097c 2955
1da177e4
LT
2956 po->prot_hook.af_packet_priv = sk;
2957
0e11c91e
AV
2958 if (proto) {
2959 po->prot_hook.type = proto;
ce06b03e 2960 register_prot_hook(sk);
1da177e4
LT
2961 }
2962
0fa7fa98 2963 mutex_lock(&net->packet.sklist_lock);
808f5114 2964 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2965 mutex_unlock(&net->packet.sklist_lock);
2966
2967 preempt_disable();
3680453c 2968 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2969 preempt_enable();
808f5114 2970
40d4e3df 2971 return 0;
b0138408
DB
2972out2:
2973 sk_free(sk);
1da177e4
LT
2974out:
2975 return err;
2976}
2977
2978/*
2979 * Pull a packet from our receive queue and hand it to the user.
2980 * If necessary we block.
2981 */
2982
1b784140
YX
2983static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2984 int flags)
1da177e4
LT
2985{
2986 struct sock *sk = sock->sk;
2987 struct sk_buff *skb;
2988 int copied, err;
bfd5f4a3 2989 int vnet_hdr_len = 0;
2472d761 2990 unsigned int origlen = 0;
1da177e4
LT
2991
2992 err = -EINVAL;
ed85b565 2993 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2994 goto out;
2995
2996#if 0
2997 /* What error should we return now? EUNATTACH? */
2998 if (pkt_sk(sk)->ifindex < 0)
2999 return -ENODEV;
3000#endif
3001
ed85b565 3002 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3003 err = sock_recv_errqueue(sk, msg, len,
3004 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3005 goto out;
3006 }
3007
1da177e4
LT
3008 /*
3009 * Call the generic datagram receiver. This handles all sorts
3010 * of horrible races and re-entrancy so we can forget about it
3011 * in the protocol layers.
3012 *
3013 * Now it will return ENETDOWN if the device has just gone down,
3014 * but then it will block.
3015 */
3016
40d4e3df 3017 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3018
3019 /*
1ce4f28b 3020 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3021 * handles the blocking, we don't have to see or worry about blocking
3022 * retries.
3023 */
3024
8ae55f04 3025 if (skb == NULL)
1da177e4
LT
3026 goto out;
3027
2ccdbaa6
WB
3028 if (pkt_sk(sk)->pressure)
3029 packet_rcv_has_room(pkt_sk(sk), NULL);
3030
bfd5f4a3
SS
3031 if (pkt_sk(sk)->has_vnet_hdr) {
3032 struct virtio_net_hdr vnet_hdr = { 0 };
3033
3034 err = -EINVAL;
3035 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 3036 if (len < vnet_hdr_len)
bfd5f4a3
SS
3037 goto out_free;
3038
1f18b717
MK
3039 len -= vnet_hdr_len;
3040
bfd5f4a3
SS
3041 if (skb_is_gso(skb)) {
3042 struct skb_shared_info *sinfo = skb_shinfo(skb);
3043
3044 /* This is a hint as to how much should be linear. */
dc9e5153
MT
3045 vnet_hdr.hdr_len =
3046 __cpu_to_virtio16(false, skb_headlen(skb));
3047 vnet_hdr.gso_size =
3048 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
3049 if (sinfo->gso_type & SKB_GSO_TCPV4)
3050 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
3051 else if (sinfo->gso_type & SKB_GSO_TCPV6)
3052 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
3053 else if (sinfo->gso_type & SKB_GSO_UDP)
3054 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
3055 else if (sinfo->gso_type & SKB_GSO_FCOE)
3056 goto out_free;
3057 else
3058 BUG();
3059 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
3060 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
3061 } else
3062 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
3063
3064 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3065 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
3066 vnet_hdr.csum_start = __cpu_to_virtio16(false,
3067 skb_checksum_start_offset(skb));
3068 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
3069 skb->csum_offset);
10a8d94a
JW
3070 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
3071 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
3072 } /* else everything is zero */
3073
7eab8d9e 3074 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
3075 if (err < 0)
3076 goto out_free;
3077 }
3078
f3d33426
HFS
3079 /* You lose any data beyond the buffer you gave. If it worries
3080 * a user program they can ask the device for its MTU
3081 * anyway.
1da177e4 3082 */
1da177e4 3083 copied = skb->len;
40d4e3df
ED
3084 if (copied > len) {
3085 copied = len;
3086 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3087 }
3088
51f3d02b 3089 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3090 if (err)
3091 goto out_free;
3092
2472d761
EB
3093 if (sock->type != SOCK_PACKET) {
3094 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3095
3096 /* Original length was stored in sockaddr_ll fields */
3097 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3098 sll->sll_family = AF_PACKET;
3099 sll->sll_protocol = skb->protocol;
3100 }
3101
3b885787 3102 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3103
f3d33426
HFS
3104 if (msg->msg_name) {
3105 /* If the address length field is there to be filled
3106 * in, we fill it in now.
3107 */
3108 if (sock->type == SOCK_PACKET) {
342dfc30 3109 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3110 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3111 } else {
3112 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3113
f3d33426
HFS
3114 msg->msg_namelen = sll->sll_halen +
3115 offsetof(struct sockaddr_ll, sll_addr);
3116 }
ffbc6111
HX
3117 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3118 msg->msg_namelen);
f3d33426 3119 }
1da177e4 3120
8dc41944 3121 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3122 struct tpacket_auxdata aux;
3123
3124 aux.tp_status = TP_STATUS_USER;
3125 if (skb->ip_summed == CHECKSUM_PARTIAL)
3126 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3127 else if (skb->pkt_type != PACKET_OUTGOING &&
3128 (skb->ip_summed == CHECKSUM_COMPLETE ||
3129 skb_csum_unnecessary(skb)))
3130 aux.tp_status |= TP_STATUS_CSUM_VALID;
3131
2472d761 3132 aux.tp_len = origlen;
ffbc6111
HX
3133 aux.tp_snaplen = skb->len;
3134 aux.tp_mac = 0;
bbe735e4 3135 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3136 if (skb_vlan_tag_present(skb)) {
3137 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3138 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3139 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3140 } else {
3141 aux.tp_vlan_tci = 0;
a0cdfcf3 3142 aux.tp_vlan_tpid = 0;
a3bcc23e 3143 }
ffbc6111 3144 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3145 }
3146
1da177e4
LT
3147 /*
3148 * Free or return the buffer as appropriate. Again this
3149 * hides all the races and re-entrancy issues from us.
3150 */
bfd5f4a3 3151 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3152
3153out_free:
3154 skb_free_datagram(sk, skb);
3155out:
3156 return err;
3157}
3158
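Once PACKET_AUXDATA has been enabled with setsockopt(), packet_recvmsg() attaches the tpacket_auxdata block built above as a SOL_PACKET control message. A sketch of reading it back (not part of af_packet.c); the buffer sizes are arbitrary:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static void recv_with_auxdata(int fd)
{
	unsigned char pkt[2048];
	union {
		struct cmsghdr cm;
		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} ctrl;
	struct iovec iov = { .iov_base = pkt, .iov_len = sizeof(pkt) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = &ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			printf("snaplen=%u vlan_tci=%u\n",
			       aux.tp_snaplen, aux.tp_vlan_tci);
		}
	}
}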
1da177e4
LT
3159static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3160 int *uaddr_len, int peer)
3161{
3162 struct net_device *dev;
3163 struct sock *sk = sock->sk;
3164
3165 if (peer)
3166 return -EOPNOTSUPP;
3167
3168 uaddr->sa_family = AF_PACKET;
2dc85bf3 3169 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3170 rcu_read_lock();
3171 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3172 if (dev)
2dc85bf3 3173 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3174 rcu_read_unlock();
1da177e4
LT
3175 *uaddr_len = sizeof(*uaddr);
3176
3177 return 0;
3178}
1da177e4
LT
3179
3180static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3181 int *uaddr_len, int peer)
3182{
3183 struct net_device *dev;
3184 struct sock *sk = sock->sk;
3185 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3186 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3187
3188 if (peer)
3189 return -EOPNOTSUPP;
3190
3191 sll->sll_family = AF_PACKET;
3192 sll->sll_ifindex = po->ifindex;
3193 sll->sll_protocol = po->num;
67286640 3194 sll->sll_pkttype = 0;
654d1f8a
ED
3195 rcu_read_lock();
3196 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3197 if (dev) {
3198 sll->sll_hatype = dev->type;
3199 sll->sll_halen = dev->addr_len;
3200 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3201 } else {
3202 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3203 sll->sll_halen = 0;
3204 }
654d1f8a 3205 rcu_read_unlock();
0fb375fb 3206 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3207
3208 return 0;
3209}
3210
2aeb0b88
WC
3211static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3212 int what)
1da177e4
LT
3213{
3214 switch (i->type) {
3215 case PACKET_MR_MULTICAST:
1162563f
JP
3216 if (i->alen != dev->addr_len)
3217 return -EINVAL;
1da177e4 3218 if (what > 0)
22bedad3 3219 return dev_mc_add(dev, i->addr);
1da177e4 3220 else
22bedad3 3221 return dev_mc_del(dev, i->addr);
1da177e4
LT
3222 break;
3223 case PACKET_MR_PROMISC:
2aeb0b88 3224 return dev_set_promiscuity(dev, what);
1da177e4 3225 case PACKET_MR_ALLMULTI:
2aeb0b88 3226 return dev_set_allmulti(dev, what);
d95ed927 3227 case PACKET_MR_UNICAST:
1162563f
JP
3228 if (i->alen != dev->addr_len)
3229 return -EINVAL;
d95ed927 3230 if (what > 0)
a748ee24 3231 return dev_uc_add(dev, i->addr);
d95ed927 3232 else
a748ee24 3233 return dev_uc_del(dev, i->addr);
d95ed927 3234 break;
40d4e3df
ED
3235 default:
3236 break;
1da177e4 3237 }
2aeb0b88 3238 return 0;
1da177e4
LT
3239}
3240
82f17091
FR
3241static void packet_dev_mclist_delete(struct net_device *dev,
3242 struct packet_mclist **mlp)
1da177e4 3243{
82f17091
FR
3244 struct packet_mclist *ml;
3245
3246 while ((ml = *mlp) != NULL) {
3247 if (ml->ifindex == dev->ifindex) {
3248 packet_dev_mc(dev, ml, -1);
3249 *mlp = ml->next;
3250 kfree(ml);
3251 } else
3252 mlp = &ml->next;
1da177e4
LT
3253 }
3254}
3255
0fb375fb 3256static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3257{
3258 struct packet_sock *po = pkt_sk(sk);
3259 struct packet_mclist *ml, *i;
3260 struct net_device *dev;
3261 int err;
3262
3263 rtnl_lock();
3264
3265 err = -ENODEV;
3b1e0a65 3266 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3267 if (!dev)
3268 goto done;
3269
3270 err = -EINVAL;
1162563f 3271 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3272 goto done;
3273
3274 err = -ENOBUFS;
8b3a7005 3275 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3276 if (i == NULL)
3277 goto done;
3278
3279 err = 0;
3280 for (ml = po->mclist; ml; ml = ml->next) {
3281 if (ml->ifindex == mreq->mr_ifindex &&
3282 ml->type == mreq->mr_type &&
3283 ml->alen == mreq->mr_alen &&
3284 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3285 ml->count++;
3286 /* Free the new element ... */
3287 kfree(i);
3288 goto done;
3289 }
3290 }
3291
3292 i->type = mreq->mr_type;
3293 i->ifindex = mreq->mr_ifindex;
3294 i->alen = mreq->mr_alen;
3295 memcpy(i->addr, mreq->mr_address, i->alen);
3296 i->count = 1;
3297 i->next = po->mclist;
3298 po->mclist = i;
2aeb0b88
WC
3299 err = packet_dev_mc(dev, i, 1);
3300 if (err) {
3301 po->mclist = i->next;
3302 kfree(i);
3303 }
1da177e4
LT
3304
3305done:
3306 rtnl_unlock();
3307 return err;
3308}
3309
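packet_mc_add() is reached through the PACKET_ADD_MEMBERSHIP socket option handled further down in packet_setsockopt(). A sketch of requesting promiscuous mode that way (not part of af_packet.c); the helper name is illustrative:

#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}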
0fb375fb 3310static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3311{
3312 struct packet_mclist *ml, **mlp;
3313
3314 rtnl_lock();
3315
3316 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3317 if (ml->ifindex == mreq->mr_ifindex &&
3318 ml->type == mreq->mr_type &&
3319 ml->alen == mreq->mr_alen &&
3320 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3321 if (--ml->count == 0) {
3322 struct net_device *dev;
3323 *mlp = ml->next;
ad959e76
ED
3324 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3325 if (dev)
1da177e4 3326 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3327 kfree(ml);
3328 }
82f17091 3329 break;
1da177e4
LT
3330 }
3331 }
3332 rtnl_unlock();
82f17091 3333 return 0;
1da177e4
LT
3334}
3335
3336static void packet_flush_mclist(struct sock *sk)
3337{
3338 struct packet_sock *po = pkt_sk(sk);
3339 struct packet_mclist *ml;
3340
3341 if (!po->mclist)
3342 return;
3343
3344 rtnl_lock();
3345 while ((ml = po->mclist) != NULL) {
3346 struct net_device *dev;
3347
3348 po->mclist = ml->next;
ad959e76
ED
3349 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3350 if (dev != NULL)
1da177e4 3351 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3352 kfree(ml);
3353 }
3354 rtnl_unlock();
3355}
1da177e4
LT
3356
3357static int
b7058842 3358packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3359{
3360 struct sock *sk = sock->sk;
8dc41944 3361 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3362 int ret;
3363
3364 if (level != SOL_PACKET)
3365 return -ENOPROTOOPT;
3366
69e3c75f 3367 switch (optname) {
1ce4f28b 3368 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3369 case PACKET_DROP_MEMBERSHIP:
3370 {
0fb375fb
EB
3371 struct packet_mreq_max mreq;
3372 int len = optlen;
3373 memset(&mreq, 0, sizeof(mreq));
3374 if (len < sizeof(struct packet_mreq))
1da177e4 3375 return -EINVAL;
0fb375fb
EB
3376 if (len > sizeof(mreq))
3377 len = sizeof(mreq);
40d4e3df 3378 if (copy_from_user(&mreq, optval, len))
1da177e4 3379 return -EFAULT;
0fb375fb
EB
3380 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3381 return -EINVAL;
1da177e4
LT
3382 if (optname == PACKET_ADD_MEMBERSHIP)
3383 ret = packet_mc_add(sk, &mreq);
3384 else
3385 ret = packet_mc_drop(sk, &mreq);
3386 return ret;
3387 }
a2efcfa0 3388
1da177e4 3389 case PACKET_RX_RING:
69e3c75f 3390 case PACKET_TX_RING:
1da177e4 3391 {
f6fb8f10 3392 union tpacket_req_u req_u;
3393 int len;
1da177e4 3394
f6fb8f10 3395 switch (po->tp_version) {
3396 case TPACKET_V1:
3397 case TPACKET_V2:
3398 len = sizeof(req_u.req);
3399 break;
3400 case TPACKET_V3:
3401 default:
3402 len = sizeof(req_u.req3);
3403 break;
3404 }
3405 if (optlen < len)
1da177e4 3406 return -EINVAL;
bfd5f4a3
SS
3407 if (pkt_sk(sk)->has_vnet_hdr)
3408 return -EINVAL;
f6fb8f10 3409 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3410 return -EFAULT;
f6fb8f10 3411 return packet_set_ring(sk, &req_u, 0,
3412 optname == PACKET_TX_RING);
1da177e4
LT
3413 }
3414 case PACKET_COPY_THRESH:
3415 {
3416 int val;
3417
40d4e3df 3418 if (optlen != sizeof(val))
1da177e4 3419 return -EINVAL;
40d4e3df 3420 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3421 return -EFAULT;
3422
3423 pkt_sk(sk)->copy_thresh = val;
3424 return 0;
3425 }
bbd6ef87
PM
3426 case PACKET_VERSION:
3427 {
3428 int val;
3429
3430 if (optlen != sizeof(val))
3431 return -EINVAL;
69e3c75f 3432 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3433 return -EBUSY;
3434 if (copy_from_user(&val, optval, sizeof(val)))
3435 return -EFAULT;
3436 switch (val) {
3437 case TPACKET_V1:
3438 case TPACKET_V2:
f6fb8f10 3439 case TPACKET_V3:
bbd6ef87
PM
3440 po->tp_version = val;
3441 return 0;
3442 default:
3443 return -EINVAL;
3444 }
3445 }
8913336a
PM
3446 case PACKET_RESERVE:
3447 {
3448 unsigned int val;
3449
3450 if (optlen != sizeof(val))
3451 return -EINVAL;
69e3c75f 3452 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3453 return -EBUSY;
3454 if (copy_from_user(&val, optval, sizeof(val)))
3455 return -EFAULT;
3456 po->tp_reserve = val;
3457 return 0;
3458 }
69e3c75f
JB
3459 case PACKET_LOSS:
3460 {
3461 unsigned int val;
3462
3463 if (optlen != sizeof(val))
3464 return -EINVAL;
3465 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3466 return -EBUSY;
3467 if (copy_from_user(&val, optval, sizeof(val)))
3468 return -EFAULT;
3469 po->tp_loss = !!val;
3470 return 0;
3471 }
8dc41944
HX
3472 case PACKET_AUXDATA:
3473 {
3474 int val;
3475
3476 if (optlen < sizeof(val))
3477 return -EINVAL;
3478 if (copy_from_user(&val, optval, sizeof(val)))
3479 return -EFAULT;
3480
3481 po->auxdata = !!val;
3482 return 0;
3483 }
80feaacb
PWJ
3484 case PACKET_ORIGDEV:
3485 {
3486 int val;
3487
3488 if (optlen < sizeof(val))
3489 return -EINVAL;
3490 if (copy_from_user(&val, optval, sizeof(val)))
3491 return -EFAULT;
3492
3493 po->origdev = !!val;
3494 return 0;
3495 }
bfd5f4a3
SS
3496 case PACKET_VNET_HDR:
3497 {
3498 int val;
3499
3500 if (sock->type != SOCK_RAW)
3501 return -EINVAL;
3502 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3503 return -EBUSY;
3504 if (optlen < sizeof(val))
3505 return -EINVAL;
3506 if (copy_from_user(&val, optval, sizeof(val)))
3507 return -EFAULT;
3508
3509 po->has_vnet_hdr = !!val;
3510 return 0;
3511 }
614f60fa
SM
3512 case PACKET_TIMESTAMP:
3513 {
3514 int val;
3515
3516 if (optlen != sizeof(val))
3517 return -EINVAL;
3518 if (copy_from_user(&val, optval, sizeof(val)))
3519 return -EFAULT;
3520
3521 po->tp_tstamp = val;
3522 return 0;
3523 }
dc99f600
DM
3524 case PACKET_FANOUT:
3525 {
3526 int val;
3527
3528 if (optlen != sizeof(val))
3529 return -EINVAL;
3530 if (copy_from_user(&val, optval, sizeof(val)))
3531 return -EFAULT;
3532
3533 return fanout_add(sk, val & 0xffff, val >> 16);
3534 }
5920cd3a
PC
3535 case PACKET_TX_HAS_OFF:
3536 {
3537 unsigned int val;
3538
3539 if (optlen != sizeof(val))
3540 return -EINVAL;
3541 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3542 return -EBUSY;
3543 if (copy_from_user(&val, optval, sizeof(val)))
3544 return -EFAULT;
3545 po->tp_tx_has_off = !!val;
3546 return 0;
3547 }
d346a3fa
DB
3548 case PACKET_QDISC_BYPASS:
3549 {
3550 int val;
3551
3552 if (optlen != sizeof(val))
3553 return -EINVAL;
3554 if (copy_from_user(&val, optval, sizeof(val)))
3555 return -EFAULT;
3556
3557 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3558 return 0;
3559 }
1da177e4
LT
3560 default:
3561 return -ENOPROTOOPT;
3562 }
3563}
3564
3565static int packet_getsockopt(struct socket *sock, int level, int optname,
3566 char __user *optval, int __user *optlen)
3567{
3568 int len;
c06fff6e 3569 int val, lv = sizeof(val);
1da177e4
LT
3570 struct sock *sk = sock->sk;
3571 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3572 void *data = &val;
ee80fbf3 3573 union tpacket_stats_u st;
a9b63918 3574 struct tpacket_rollover_stats rstats;
1da177e4
LT
3575
3576 if (level != SOL_PACKET)
3577 return -ENOPROTOOPT;
3578
8ae55f04
KK
3579 if (get_user(len, optlen))
3580 return -EFAULT;
1da177e4
LT
3581
3582 if (len < 0)
3583 return -EINVAL;
1ce4f28b 3584
69e3c75f 3585 switch (optname) {
1da177e4 3586 case PACKET_STATISTICS:
1da177e4 3587 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3588 memcpy(&st, &po->stats, sizeof(st));
3589 memset(&po->stats, 0, sizeof(po->stats));
3590 spin_unlock_bh(&sk->sk_receive_queue.lock);
3591
f6fb8f10 3592 if (po->tp_version == TPACKET_V3) {
c06fff6e 3593 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3594 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3595 data = &st.stats3;
f6fb8f10 3596 } else {
c06fff6e 3597 lv = sizeof(struct tpacket_stats);
8bcdeaff 3598 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3599 data = &st.stats1;
f6fb8f10 3600 }
ee80fbf3 3601
8dc41944
HX
3602 break;
3603 case PACKET_AUXDATA:
8dc41944 3604 val = po->auxdata;
80feaacb
PWJ
3605 break;
3606 case PACKET_ORIGDEV:
80feaacb 3607 val = po->origdev;
bfd5f4a3
SS
3608 break;
3609 case PACKET_VNET_HDR:
bfd5f4a3 3610 val = po->has_vnet_hdr;
1da177e4 3611 break;
bbd6ef87 3612 case PACKET_VERSION:
bbd6ef87 3613 val = po->tp_version;
bbd6ef87
PM
3614 break;
3615 case PACKET_HDRLEN:
3616 if (len > sizeof(int))
3617 len = sizeof(int);
3618 if (copy_from_user(&val, optval, len))
3619 return -EFAULT;
3620 switch (val) {
3621 case TPACKET_V1:
3622 val = sizeof(struct tpacket_hdr);
3623 break;
3624 case TPACKET_V2:
3625 val = sizeof(struct tpacket2_hdr);
3626 break;
f6fb8f10 3627 case TPACKET_V3:
3628 val = sizeof(struct tpacket3_hdr);
3629 break;
bbd6ef87
PM
3630 default:
3631 return -EINVAL;
3632 }
bbd6ef87 3633 break;
8913336a 3634 case PACKET_RESERVE:
8913336a 3635 val = po->tp_reserve;
8913336a 3636 break;
69e3c75f 3637 case PACKET_LOSS:
69e3c75f 3638 val = po->tp_loss;
69e3c75f 3639 break;
614f60fa 3640 case PACKET_TIMESTAMP:
614f60fa 3641 val = po->tp_tstamp;
614f60fa 3642 break;
dc99f600 3643 case PACKET_FANOUT:
dc99f600
DM
3644 val = (po->fanout ?
3645 ((u32)po->fanout->id |
77f65ebd
WB
3646 ((u32)po->fanout->type << 16) |
3647 ((u32)po->fanout->flags << 24)) :
dc99f600 3648 0);
dc99f600 3649 break;
a9b63918
WB
3650 case PACKET_ROLLOVER_STATS:
3651 if (!po->rollover)
3652 return -EINVAL;
3653 rstats.tp_all = atomic_long_read(&po->rollover->num);
3654 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3655 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3656 data = &rstats;
3657 lv = sizeof(rstats);
3658 break;
5920cd3a
PC
3659 case PACKET_TX_HAS_OFF:
3660 val = po->tp_tx_has_off;
3661 break;
d346a3fa
DB
3662 case PACKET_QDISC_BYPASS:
3663 val = packet_use_direct_xmit(po);
3664 break;
1da177e4
LT
3665 default:
3666 return -ENOPROTOOPT;
3667 }
3668
c06fff6e
ED
3669 if (len > lv)
3670 len = lv;
8ae55f04
KK
3671 if (put_user(len, optlen))
3672 return -EFAULT;
8dc41944
HX
3673 if (copy_to_user(optval, data, len))
3674 return -EFAULT;
8ae55f04 3675 return 0;
1da177e4
LT
3676}
3677
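The PACKET_STATISTICS case above copies out and then clears the packet/drop counters. A sketch of draining them from userspace (not part of af_packet.c); it assumes a TPACKET_V1/V2 socket, for which struct tpacket_stats is returned (a TPACKET_V3 socket gets struct tpacket_stats_v3 instead):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void print_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	/* Counters are clear-on-read, as noted in tpacket_rcv(). */
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}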
3678
351638e7
JP
3679static int packet_notifier(struct notifier_block *this,
3680 unsigned long msg, void *ptr)
1da177e4
LT
3681{
3682 struct sock *sk;
351638e7 3683 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3684 struct net *net = dev_net(dev);
1da177e4 3685
808f5114 3686 rcu_read_lock();
b67bfe0d 3687 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3688 struct packet_sock *po = pkt_sk(sk);
3689
3690 switch (msg) {
3691 case NETDEV_UNREGISTER:
1da177e4 3692 if (po->mclist)
82f17091 3693 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3694 /* fallthrough */
3695
1da177e4
LT
3696 case NETDEV_DOWN:
3697 if (dev->ifindex == po->ifindex) {
3698 spin_lock(&po->bind_lock);
3699 if (po->running) {
ce06b03e 3700 __unregister_prot_hook(sk, false);
1da177e4
LT
3701 sk->sk_err = ENETDOWN;
3702 if (!sock_flag(sk, SOCK_DEAD))
3703 sk->sk_error_report(sk);
3704 }
3705 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3706 packet_cached_dev_reset(po);
1da177e4 3707 po->ifindex = -1;
160ff18a
BG
3708 if (po->prot_hook.dev)
3709 dev_put(po->prot_hook.dev);
1da177e4
LT
3710 po->prot_hook.dev = NULL;
3711 }
3712 spin_unlock(&po->bind_lock);
3713 }
3714 break;
3715 case NETDEV_UP:
808f5114 3716 if (dev->ifindex == po->ifindex) {
3717 spin_lock(&po->bind_lock);
ce06b03e
DM
3718 if (po->num)
3719 register_prot_hook(sk);
808f5114 3720 spin_unlock(&po->bind_lock);
1da177e4 3721 }
1da177e4
LT
3722 break;
3723 }
3724 }
808f5114 3725 rcu_read_unlock();
1da177e4
LT
3726 return NOTIFY_DONE;
3727}
3728
3729
3730static int packet_ioctl(struct socket *sock, unsigned int cmd,
3731 unsigned long arg)
3732{
3733 struct sock *sk = sock->sk;
3734
69e3c75f 3735 switch (cmd) {
40d4e3df
ED
3736 case SIOCOUTQ:
3737 {
3738 int amount = sk_wmem_alloc_get(sk);
31e6d363 3739
40d4e3df
ED
3740 return put_user(amount, (int __user *)arg);
3741 }
3742 case SIOCINQ:
3743 {
3744 struct sk_buff *skb;
3745 int amount = 0;
3746
3747 spin_lock_bh(&sk->sk_receive_queue.lock);
3748 skb = skb_peek(&sk->sk_receive_queue);
3749 if (skb)
3750 amount = skb->len;
3751 spin_unlock_bh(&sk->sk_receive_queue.lock);
3752 return put_user(amount, (int __user *)arg);
3753 }
3754 case SIOCGSTAMP:
3755 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3756 case SIOCGSTAMPNS:
3757 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3758
1da177e4 3759#ifdef CONFIG_INET
40d4e3df
ED
3760 case SIOCADDRT:
3761 case SIOCDELRT:
3762 case SIOCDARP:
3763 case SIOCGARP:
3764 case SIOCSARP:
3765 case SIOCGIFADDR:
3766 case SIOCSIFADDR:
3767 case SIOCGIFBRDADDR:
3768 case SIOCSIFBRDADDR:
3769 case SIOCGIFNETMASK:
3770 case SIOCSIFNETMASK:
3771 case SIOCGIFDSTADDR:
3772 case SIOCSIFDSTADDR:
3773 case SIOCSIFFLAGS:
40d4e3df 3774 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3775#endif
3776
40d4e3df
ED
3777 default:
3778 return -ENOIOCTLCMD;
1da177e4
LT
3779 }
3780 return 0;
3781}
3782
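/*
 * Illustrative userspace sketch (not part of this file): the SIOCINQ branch
 * of packet_ioctl() above reports the length of the packet at the head of
 * the receive queue.  Assumes "fd" is a PF_PACKET socket and that
 * <sys/ioctl.h> and <linux/sockios.h> are included.
 */
static int next_packet_len(int fd)
{
	int len = 0;

	if (ioctl(fd, SIOCINQ, &len) < 0)	/* peeks sk_receive_queue */
		return -1;
	return len;
}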
40d4e3df 3783static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3784 poll_table *wait)
3785{
3786 struct sock *sk = sock->sk;
3787 struct packet_sock *po = pkt_sk(sk);
3788 unsigned int mask = datagram_poll(file, sock, wait);
3789
3790 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3791 if (po->rx_ring.pg_vec) {
f6fb8f10 3792 if (!packet_previous_rx_frame(po, &po->rx_ring,
3793 TP_STATUS_KERNEL))
1da177e4
LT
3794 mask |= POLLIN | POLLRDNORM;
3795 }
2ccdbaa6 3796 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 3797 po->pressure = 0;
1da177e4 3798 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3799 spin_lock_bh(&sk->sk_write_queue.lock);
3800 if (po->tx_ring.pg_vec) {
3801 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3802 mask |= POLLOUT | POLLWRNORM;
3803 }
3804 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3805 return mask;
3806}
3807
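/*
 * Illustrative userspace sketch (not part of this file): packet_poll()
 * reports POLLIN while the frame at the RX ring head is no longer owned by
 * the kernel, so a reader typically poll()s and then walks frames until the
 * status word is back to TP_STATUS_KERNEL.  Assumptions: a mapped TPACKET_V2
 * RX ring, tp_block_size a multiple of tp_frame_size (so frames are
 * contiguous in the mapping), and <poll.h> plus <linux/if_packet.h>
 * included; error handling is trimmed.
 */
static void wait_and_drain(int fd, char *ring, unsigned int frame_nr,
			   unsigned int frame_size, unsigned int *head)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	poll(&pfd, 1, -1);
	for (;;) {
		struct tpacket2_hdr *hdr = (void *)(ring + *head * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			break;
		/* consume hdr->tp_len bytes starting at hdr->tp_mac here */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand frame back */
		*head = (*head + 1) % frame_nr;
	}
}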
3808
 3809 /* Dirty? Well, I still have not found a better way to account
3810 * for user mmaps.
3811 */
3812
3813static void packet_mm_open(struct vm_area_struct *vma)
3814{
3815 struct file *file = vma->vm_file;
40d4e3df 3816 struct socket *sock = file->private_data;
1da177e4 3817 struct sock *sk = sock->sk;
1ce4f28b 3818
1da177e4
LT
3819 if (sk)
3820 atomic_inc(&pkt_sk(sk)->mapped);
3821}
3822
3823static void packet_mm_close(struct vm_area_struct *vma)
3824{
3825 struct file *file = vma->vm_file;
40d4e3df 3826 struct socket *sock = file->private_data;
1da177e4 3827 struct sock *sk = sock->sk;
1ce4f28b 3828
1da177e4
LT
3829 if (sk)
3830 atomic_dec(&pkt_sk(sk)->mapped);
3831}
3832
f0f37e2f 3833static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3834 .open = packet_mm_open,
3835 .close = packet_mm_close,
1da177e4
LT
3836};
3837
0e3125c7
NH
3838static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3839 unsigned int len)
1da177e4
LT
3840{
3841 int i;
3842
4ebf0ae2 3843 for (i = 0; i < len; i++) {
0e3125c7 3844 if (likely(pg_vec[i].buffer)) {
c56b4d90 3845 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3846 vfree(pg_vec[i].buffer);
3847 else
3848 free_pages((unsigned long)pg_vec[i].buffer,
3849 order);
3850 pg_vec[i].buffer = NULL;
3851 }
1da177e4
LT
3852 }
3853 kfree(pg_vec);
3854}
3855
eea49cc9 3856static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3857{
f0d4eb29 3858 char *buffer;
0e3125c7
NH
3859 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3860 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3861
3862 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3863 if (buffer)
3864 return buffer;
3865
f0d4eb29 3866 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3867 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3868 if (buffer)
3869 return buffer;
3870
f0d4eb29 3871 /* vmalloc failed, let's dig into swap here */
0e3125c7 3872 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3873 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3874 if (buffer)
3875 return buffer;
3876
f0d4eb29 3877 /* complete and utter failure */
0e3125c7 3878 return NULL;
4ebf0ae2
DM
3879}
3880
0e3125c7 3881static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3882{
3883 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3884 struct pgv *pg_vec;
4ebf0ae2
DM
3885 int i;
3886
0e3125c7 3887 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3888 if (unlikely(!pg_vec))
3889 goto out;
3890
3891 for (i = 0; i < block_nr; i++) {
c56b4d90 3892 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3893 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3894 goto out_free_pgvec;
3895 }
3896
3897out:
3898 return pg_vec;
3899
3900out_free_pgvec:
3901 free_pg_vec(pg_vec, order, block_nr);
3902 pg_vec = NULL;
3903 goto out;
3904}
1da177e4 3905
f6fb8f10 3906static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3907 int closing, int tx_ring)
1da177e4 3908{
0e3125c7 3909 struct pgv *pg_vec = NULL;
1da177e4 3910 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3911 int was_running, order = 0;
69e3c75f
JB
3912 struct packet_ring_buffer *rb;
3913 struct sk_buff_head *rb_queue;
0e11c91e 3914 __be16 num;
f6fb8f10 3915 int err = -EINVAL;
 3916 /* Local alias to keep code churn minimal */
3917 struct tpacket_req *req = &req_u->req;
3918
3919 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3920 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3921 WARN(1, "Tx-ring is not supported.\n");
3922 goto out;
3923 }
1ce4f28b 3924
69e3c75f
JB
3925 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3926 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3927
69e3c75f
JB
3928 err = -EBUSY;
3929 if (!closing) {
3930 if (atomic_read(&po->mapped))
3931 goto out;
b0138408 3932 if (packet_read_pending(rb))
69e3c75f
JB
3933 goto out;
3934 }
1da177e4 3935
69e3c75f
JB
3936 if (req->tp_block_nr) {
3937 /* Sanity tests and some calculations */
3938 err = -EBUSY;
3939 if (unlikely(rb->pg_vec))
3940 goto out;
1da177e4 3941
bbd6ef87
PM
3942 switch (po->tp_version) {
3943 case TPACKET_V1:
3944 po->tp_hdrlen = TPACKET_HDRLEN;
3945 break;
3946 case TPACKET_V2:
3947 po->tp_hdrlen = TPACKET2_HDRLEN;
3948 break;
f6fb8f10 3949 case TPACKET_V3:
3950 po->tp_hdrlen = TPACKET3_HDRLEN;
3951 break;
bbd6ef87
PM
3952 }
3953
69e3c75f 3954 err = -EINVAL;
4ebf0ae2 3955 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3956 goto out;
4ebf0ae2 3957 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3958 goto out;
dc808110
ED
3959 if (po->tp_version >= TPACKET_V3 &&
3960 (int)(req->tp_block_size -
3961 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3962 goto out;
8913336a 3963 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3964 po->tp_reserve))
3965 goto out;
4ebf0ae2 3966 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3967 goto out;
1da177e4 3968
69e3c75f
JB
3969 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3970 if (unlikely(rb->frames_per_block <= 0))
3971 goto out;
3972 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3973 req->tp_frame_nr))
3974 goto out;
1da177e4
LT
3975
3976 err = -ENOMEM;
4ebf0ae2
DM
3977 order = get_order(req->tp_block_size);
3978 pg_vec = alloc_pg_vec(req, order);
3979 if (unlikely(!pg_vec))
1da177e4 3980 goto out;
f6fb8f10 3981 switch (po->tp_version) {
3982 case TPACKET_V3:
3983 /* Transmit path is not supported. We checked
 3984 * it above, but check again just to be paranoid
3985 */
3986 if (!tx_ring)
e8e85cc5 3987 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 3988 break;
f6fb8f10 3989 default:
3990 break;
3991 }
69e3c75f
JB
3992 }
3993 /* Done */
3994 else {
3995 err = -EINVAL;
4ebf0ae2 3996 if (unlikely(req->tp_frame_nr))
69e3c75f 3997 goto out;
1da177e4
LT
3998 }
3999
4000 lock_sock(sk);
4001
4002 /* Detach socket from network */
4003 spin_lock(&po->bind_lock);
4004 was_running = po->running;
4005 num = po->num;
4006 if (was_running) {
1da177e4 4007 po->num = 0;
ce06b03e 4008 __unregister_prot_hook(sk, false);
1da177e4
LT
4009 }
4010 spin_unlock(&po->bind_lock);
1ce4f28b 4011
1da177e4
LT
4012 synchronize_net();
4013
4014 err = -EBUSY;
905db440 4015 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4016 if (closing || atomic_read(&po->mapped) == 0) {
4017 err = 0;
69e3c75f 4018 spin_lock_bh(&rb_queue->lock);
c053fd96 4019 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4020 rb->frame_max = (req->tp_frame_nr - 1);
4021 rb->head = 0;
4022 rb->frame_size = req->tp_frame_size;
4023 spin_unlock_bh(&rb_queue->lock);
4024
c053fd96
CG
4025 swap(rb->pg_vec_order, order);
4026 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4027
4028 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4029 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4030 tpacket_rcv : packet_rcv;
4031 skb_queue_purge(rb_queue);
1da177e4 4032 if (atomic_read(&po->mapped))
40d4e3df
ED
4033 pr_err("packet_mmap: vma is busy: %d\n",
4034 atomic_read(&po->mapped));
1da177e4 4035 }
905db440 4036 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4037
4038 spin_lock(&po->bind_lock);
ce06b03e 4039 if (was_running) {
1da177e4 4040 po->num = num;
ce06b03e 4041 register_prot_hook(sk);
1da177e4
LT
4042 }
4043 spin_unlock(&po->bind_lock);
f6fb8f10 4044 if (closing && (po->tp_version > TPACKET_V2)) {
4045 /* Because we don't support block-based V3 on tx-ring */
4046 if (!tx_ring)
4047 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
4048 }
1da177e4
LT
4049 release_sock(sk);
4050
1da177e4
LT
4051 if (pg_vec)
4052 free_pg_vec(pg_vec, order, req->tp_block_nr);
4053out:
4054 return err;
4055}
4056
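/*
 * Illustrative userspace sketch (not part of this file): a tpacket_req that
 * satisfies the sanity checks in packet_set_ring() above -- page-aligned
 * block size, TPACKET_ALIGNMENT-aligned frame size, and tp_frame_nr equal
 * to frames_per_block * tp_block_nr.  The sizes are example values and
 * assume a 4 KiB PAGE_SIZE; needs <sys/socket.h> and <linux/if_packet.h>.
 */
static int setup_rx_ring(int fd)
{
	struct tpacket_req req = {
		.tp_block_size	= 4096,			/* multiple of PAGE_SIZE */
		.tp_frame_size	= 2048,			/* multiple of TPACKET_ALIGNMENT */
		.tp_block_nr	= 64,
		.tp_frame_nr	= 64 * (4096 / 2048),
	};

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}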
69e3c75f
JB
4057static int packet_mmap(struct file *file, struct socket *sock,
4058 struct vm_area_struct *vma)
1da177e4
LT
4059{
4060 struct sock *sk = sock->sk;
4061 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4062 unsigned long size, expected_size;
4063 struct packet_ring_buffer *rb;
1da177e4
LT
4064 unsigned long start;
4065 int err = -EINVAL;
4066 int i;
4067
4068 if (vma->vm_pgoff)
4069 return -EINVAL;
4070
905db440 4071 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4072
4073 expected_size = 0;
4074 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4075 if (rb->pg_vec) {
4076 expected_size += rb->pg_vec_len
4077 * rb->pg_vec_pages
4078 * PAGE_SIZE;
4079 }
4080 }
4081
4082 if (expected_size == 0)
1da177e4 4083 goto out;
69e3c75f
JB
4084
4085 size = vma->vm_end - vma->vm_start;
4086 if (size != expected_size)
1da177e4
LT
4087 goto out;
4088
1da177e4 4089 start = vma->vm_start;
69e3c75f
JB
4090 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4091 if (rb->pg_vec == NULL)
4092 continue;
4093
4094 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4095 struct page *page;
4096 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4097 int pg_num;
4098
c56b4d90
CG
4099 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4100 page = pgv_to_page(kaddr);
69e3c75f
JB
4101 err = vm_insert_page(vma, start, page);
4102 if (unlikely(err))
4103 goto out;
4104 start += PAGE_SIZE;
0e3125c7 4105 kaddr += PAGE_SIZE;
69e3c75f 4106 }
4ebf0ae2 4107 }
1da177e4 4108 }
69e3c75f 4109
4ebf0ae2 4110 atomic_inc(&po->mapped);
1da177e4
LT
4111 vma->vm_ops = &packet_mmap_ops;
4112 err = 0;
4113
4114out:
905db440 4115 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4116 return err;
4117}
1da177e4 4118
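/*
 * Illustrative userspace sketch (not part of this file): packet_mmap()
 * above accepts a single mapping at offset 0 whose size equals the sum of
 * the configured RX and TX rings.  Assumes an RX-only ring set up as in the
 * sketch after packet_set_ring(); needs <sys/mman.h> and
 * <linux/if_packet.h>, and error handling is trimmed.
 */
static char *map_rx_ring(int fd, const struct tpacket_req *req)
{
	size_t size = (size_t)req->tp_block_size * req->tp_block_nr;

	return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}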
90ddc4f0 4119static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4120 .family = PF_PACKET,
4121 .owner = THIS_MODULE,
4122 .release = packet_release,
4123 .bind = packet_bind_spkt,
4124 .connect = sock_no_connect,
4125 .socketpair = sock_no_socketpair,
4126 .accept = sock_no_accept,
4127 .getname = packet_getname_spkt,
4128 .poll = datagram_poll,
4129 .ioctl = packet_ioctl,
4130 .listen = sock_no_listen,
4131 .shutdown = sock_no_shutdown,
4132 .setsockopt = sock_no_setsockopt,
4133 .getsockopt = sock_no_getsockopt,
4134 .sendmsg = packet_sendmsg_spkt,
4135 .recvmsg = packet_recvmsg,
4136 .mmap = sock_no_mmap,
4137 .sendpage = sock_no_sendpage,
4138};
1da177e4 4139
90ddc4f0 4140static const struct proto_ops packet_ops = {
1da177e4
LT
4141 .family = PF_PACKET,
4142 .owner = THIS_MODULE,
4143 .release = packet_release,
4144 .bind = packet_bind,
4145 .connect = sock_no_connect,
4146 .socketpair = sock_no_socketpair,
4147 .accept = sock_no_accept,
1ce4f28b 4148 .getname = packet_getname,
1da177e4
LT
4149 .poll = packet_poll,
4150 .ioctl = packet_ioctl,
4151 .listen = sock_no_listen,
4152 .shutdown = sock_no_shutdown,
4153 .setsockopt = packet_setsockopt,
4154 .getsockopt = packet_getsockopt,
4155 .sendmsg = packet_sendmsg,
4156 .recvmsg = packet_recvmsg,
4157 .mmap = packet_mmap,
4158 .sendpage = sock_no_sendpage,
4159};
4160
ec1b4cf7 4161static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4162 .family = PF_PACKET,
4163 .create = packet_create,
4164 .owner = THIS_MODULE,
4165};
4166
4167static struct notifier_block packet_netdev_notifier = {
40d4e3df 4168 .notifier_call = packet_notifier,
1da177e4
LT
4169};
4170
4171#ifdef CONFIG_PROC_FS
1da177e4
LT
4172
4173static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4174 __acquires(RCU)
1da177e4 4175{
e372c414 4176 struct net *net = seq_file_net(seq);
808f5114 4177
4178 rcu_read_lock();
4179 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4180}
4181
4182static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4183{
1bf40954 4184 struct net *net = seq_file_net(seq);
808f5114 4185 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4186}
4187
4188static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4189 __releases(RCU)
1da177e4 4190{
808f5114 4191 rcu_read_unlock();
1da177e4
LT
4192}
4193
1ce4f28b 4194static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4195{
4196 if (v == SEQ_START_TOKEN)
4197 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4198 else {
b7ceabd9 4199 struct sock *s = sk_entry(v);
1da177e4
LT
4200 const struct packet_sock *po = pkt_sk(s);
4201
4202 seq_printf(seq,
71338aa7 4203 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4204 s,
4205 atomic_read(&s->sk_refcnt),
4206 s->sk_type,
4207 ntohs(po->num),
4208 po->ifindex,
4209 po->running,
4210 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4211 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4212 sock_i_ino(s));
1da177e4
LT
4213 }
4214
4215 return 0;
4216}
4217
56b3d975 4218static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4219 .start = packet_seq_start,
4220 .next = packet_seq_next,
4221 .stop = packet_seq_stop,
4222 .show = packet_seq_show,
4223};
4224
4225static int packet_seq_open(struct inode *inode, struct file *file)
4226{
e372c414
DL
4227 return seq_open_net(inode, file, &packet_seq_ops,
4228 sizeof(struct seq_net_private));
1da177e4
LT
4229}
4230
da7071d7 4231static const struct file_operations packet_seq_fops = {
1da177e4
LT
4232 .owner = THIS_MODULE,
4233 .open = packet_seq_open,
4234 .read = seq_read,
4235 .llseek = seq_lseek,
e372c414 4236 .release = seq_release_net,
1da177e4
LT
4237};
4238
4239#endif
4240
2c8c1e72 4241static int __net_init packet_net_init(struct net *net)
d12d01d6 4242{
0fa7fa98 4243 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4244 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4245
d4beaa66 4246 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4247 return -ENOMEM;
4248
4249 return 0;
4250}
4251
2c8c1e72 4252static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4253{
ece31ffd 4254 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4255}
4256
4257static struct pernet_operations packet_net_ops = {
4258 .init = packet_net_init,
4259 .exit = packet_net_exit,
4260};
4261
4262
1da177e4
LT
4263static void __exit packet_exit(void)
4264{
1da177e4 4265 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4266 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4267 sock_unregister(PF_PACKET);
4268 proto_unregister(&packet_proto);
4269}
4270
4271static int __init packet_init(void)
4272{
4273 int rc = proto_register(&packet_proto, 0);
4274
4275 if (rc != 0)
4276 goto out;
4277
4278 sock_register(&packet_family_ops);
d12d01d6 4279 register_pernet_subsys(&packet_net_ops);
1da177e4 4280 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4281out:
4282 return rc;
4283}
4284
4285module_init(packet_init);
4286module_exit(packet_exit);
4287MODULE_LICENSE("GPL");
4288MODULE_ALIAS_NETPROTO(PF_PACKET);