net: introduce SO_BPF_EXTENSIONS
net/packet/af_packet.c

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/reciprocal_div.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit the reserved space (tunnel), other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header. PPP does this, which is wrong because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

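/* Illustrative userspace sketch (not part of this file): a SOCK_RAW packet
 * socket sees frames including the link-layer header, while SOCK_DGRAM gets
 * them with the ll header already pulled, as described above. Opening one
 * typically looks like this (requires CAP_NET_RAW):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	if (fd < 0)
 *		perror("socket");
 */
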
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	netdev_features_t features;
	struct netdev_queue *txq;
	u16 queue_map;
	int ret;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev))) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb)) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	queue_map = skb_get_queue_mapping(skb);
	txq = netdev_get_tx_queue(dev, queue_map);

	__netif_tx_lock_bh(txq);
	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
		ret = NETDEV_TX_BUSY;
		kfree_skb(skb);
		goto out;
	}

	ret = ops->ndo_start_xmit(skb, dev);
	if (likely(dev_xmit_complete(ret)))
		txq_trans_update(txq);
	else
		kfree_skb(skb);
out:
	__netif_tx_unlock_bh(txq);
	return ret;
}

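/* Hedged usage note (not part of this file): packet_direct_xmit() is only
 * used when a socket opts out of the qdisc layer; in this kernel version
 * that is what the PACKET_QDISC_BYPASS socket option selects, e.g.:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * After that, packet_use_direct_xmit() below reports true for the socket.
 */
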
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 packet_pick_tx_queue(struct net_device *dev)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

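/* Illustrative sketch (not part of this file): tp_status is the handshake
 * word that a mapped V1/V2 ring shares with userspace. A consumer typically
 * polls it and hands the slot back when done:
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame in the mmap'ed ring
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to fill it
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// give the slot back
 */
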
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps) {
		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
			return TP_STATUS_TS_SYS_HARDWARE;
		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
			return TP_STATUS_TS_RAW_HARDWARE;
	}

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

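/* Hedged usage note (not part of this file): po->tp_tstamp above is set by
 * the PACKET_TIMESTAMP socket option, which takes SOF_TIMESTAMPING_* flags,
 * e.g. to request raw hardware timestamps when the NIC provides them:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * The returned ts_status bit (TP_STATUS_TS_*) is OR'ed into tp_status so
 * userspace can tell which clock produced tp_sec/tp_nsec.
 */
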
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. The queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close, so we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

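/* Illustrative userspace sketch (not part of this file): the block ring that
 * this freeze/retire machinery manages is configured by setting
 * PACKET_VERSION to TPACKET_V3 and passing a struct tpacket_req3 to
 * PACKET_RX_RING. Field values below are arbitrary example numbers:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 20,	// 1 MiB per block
 *		.tp_block_nr       = 8,
 *		.tp_frame_size     = 2048,
 *		.tp_frame_nr       = (1 << 20) / 2048 * 8,
 *		.tp_retire_blk_tov = 60,	// ms; 0 = let the kernel derive it
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */
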
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_divide(skb->rxhash, num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return reciprocal_divide(prandom_u32(), num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		skb_get_hash(skb);
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

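/* Illustrative userspace sketch (not part of this file): fanout_add() is
 * reached via the PACKET_FANOUT socket option, whose int argument packs the
 * 16-bit group id in the low half and the type/flags in the high half:
 *
 *	int group_id = 42;				// example id
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * Every bound packet socket that joins the same id in the same netns is
 * added to the group, and packets are spread by the chosen demux mode.
 */
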
static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

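/* Illustrative userspace sketch (not part of this file): the sk_filter run
 * by run_filter() above is a classic BPF program attached with
 * SO_ATTACH_FILTER. A trivial accept-everything program looks like:
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 0x00040000 },	// BPF_RET|BPF_K: accept up to 256 KiB
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * The SO_BPF_EXTENSIONS getsockopt named in this page's commit title lets
 * userspace query which ancillary BPF extensions the kernel filter supports
 * before building such programs.
 */
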
40d4e3df
ED
1732static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1733 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1734{
1735 struct sock *sk;
1736 struct sockaddr_ll *sll;
1737 struct packet_sock *po;
40d4e3df 1738 u8 *skb_head = skb->data;
1da177e4 1739 int skb_len = skb->len;
dbcb5855 1740 unsigned int snaplen, res;
1da177e4
LT
1741
1742 if (skb->pkt_type == PACKET_LOOPBACK)
1743 goto drop;
1744
1745 sk = pt->af_packet_priv;
1746 po = pkt_sk(sk);
1747
09ad9bc7 1748 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1749 goto drop;
1750
1da177e4
LT
1751 skb->dev = dev;
1752
3b04ddde 1753 if (dev->header_ops) {
1da177e4 1754 /* The device has an explicit notion of ll header,
62ab0812
ED
1755 * exported to higher levels.
1756 *
1757 * Otherwise, the device hides details of its frame
1758 * structure, so that corresponding packet head is
1759 * never delivered to user.
1da177e4
LT
1760 */
1761 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1762 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1763 else if (skb->pkt_type == PACKET_OUTGOING) {
1764 /* Special case: outgoing packets have ll header at head */
bbe735e4 1765 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1766 }
1767 }
1768
1769 snaplen = skb->len;
1770
dbcb5855
DM
1771 res = run_filter(skb, sk, snaplen);
1772 if (!res)
fda9ef5d 1773 goto drop_n_restore;
dbcb5855
DM
1774 if (snaplen > res)
1775 snaplen = res;
1da177e4 1776
0fd7bac6 1777 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1778 goto drop_n_acct;
1779
1780 if (skb_shared(skb)) {
1781 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1782 if (nskb == NULL)
1783 goto drop_n_acct;
1784
1785 if (skb_head != skb->data) {
1786 skb->data = skb_head;
1787 skb->len = skb_len;
1788 }
abc4e4fa 1789 consume_skb(skb);
1da177e4
LT
1790 skb = nskb;
1791 }
1792
ffbc6111
HX
1793 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1794 sizeof(skb->cb));
1795
1796 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1797 sll->sll_family = AF_PACKET;
1798 sll->sll_hatype = dev->type;
1799 sll->sll_protocol = skb->protocol;
1800 sll->sll_pkttype = skb->pkt_type;
8032b464 1801 if (unlikely(po->origdev))
80feaacb
PWJ
1802 sll->sll_ifindex = orig_dev->ifindex;
1803 else
1804 sll->sll_ifindex = dev->ifindex;
1da177e4 1805
b95cce35 1806 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1807
ffbc6111 1808 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1809
1da177e4
LT
1810 if (pskb_trim(skb, snaplen))
1811 goto drop_n_acct;
1812
1813 skb_set_owner_r(skb, sk);
1814 skb->dev = NULL;
adf30907 1815 skb_dst_drop(skb);
1da177e4 1816
84531c24
PO
1817 /* drop conntrack reference */
1818 nf_reset(skb);
1819
1da177e4 1820 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1821 po->stats.stats1.tp_packets++;
3b885787 1822 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1823 __skb_queue_tail(&sk->sk_receive_queue, skb);
1824 spin_unlock(&sk->sk_receive_queue.lock);
1825 sk->sk_data_ready(sk, skb->len);
1826 return 0;
1827
1828drop_n_acct:
7091fbd8 1829 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1830 po->stats.stats1.tp_drops++;
7091fbd8
WB
1831 atomic_inc(&sk->sk_drops);
1832 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1833
1834drop_n_restore:
1835 if (skb_head != skb->data && skb_shared(skb)) {
1836 skb->data = skb_head;
1837 skb->len = skb_len;
1838 }
1839drop:
ead2ceb0 1840 consume_skb(skb);
1841 return 0;
1842}
1843
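/* Illustrative userspace sketch (not part of af_packet.c): packet_rcv()
 * above queues matching frames onto the socket's receive queue, where a
 * plain SOCK_RAW PF_PACKET socket can read them with recvfrom(). This is
 * only a sketch under assumptions: CAP_NET_RAW is held, the frame buffer
 * size is arbitrary, and error handling is trimmed.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int packet_rx_sketch(void)
{
        unsigned char frame[2048];
        struct sockaddr_ll sll;
        socklen_t slen = sizeof(sll);
        ssize_t n;
        /* ETH_P_ALL registers the prot_hook for every protocol */
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

        memset(&sll, 0, sizeof(sll));
        /* each recvfrom() returns one frame; sll is filled from the
         * sockaddr_ll that packet_rcv() stashed in skb->cb */
        n = recvfrom(fd, frame, sizeof(frame), 0,
                     (struct sockaddr *)&sll, &slen);
        printf("got %zd bytes on ifindex %d\n", n, sll.sll_ifindex);
        close(fd);
        return 0;
}
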
1844static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1845 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1846{
1847 struct sock *sk;
1848 struct packet_sock *po;
1849 struct sockaddr_ll *sll;
184f489e 1850 union tpacket_uhdr h;
40d4e3df 1851 u8 *skb_head = skb->data;
1da177e4 1852 int skb_len = skb->len;
dbcb5855 1853 unsigned int snaplen, res;
f6fb8f10 1854 unsigned long status = TP_STATUS_USER;
bbd6ef87 1855 unsigned short macoff, netoff, hdrlen;
1da177e4 1856 struct sk_buff *copy_skb = NULL;
bbd6ef87 1857 struct timespec ts;
b9c32fb2 1858 __u32 ts_status;
1da177e4 1859
51846355
AW
1860 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1861 * We may add members to them until current aligned size without forcing
1862 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1863 */
1864 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1865 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1866
1da177e4
LT
1867 if (skb->pkt_type == PACKET_LOOPBACK)
1868 goto drop;
1869
1870 sk = pt->af_packet_priv;
1871 po = pkt_sk(sk);
1872
09ad9bc7 1873 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1874 goto drop;
1875
3b04ddde 1876 if (dev->header_ops) {
1da177e4 1877 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1878 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1879 else if (skb->pkt_type == PACKET_OUTGOING) {
1880 /* Special case: outgoing packets have ll header at head */
bbe735e4 1881 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1882 }
1883 }
1884
8dc41944
HX
1885 if (skb->ip_summed == CHECKSUM_PARTIAL)
1886 status |= TP_STATUS_CSUMNOTREADY;
1887
1da177e4
LT
1888 snaplen = skb->len;
1889
dbcb5855
DM
1890 res = run_filter(skb, sk, snaplen);
1891 if (!res)
fda9ef5d 1892 goto drop_n_restore;
dbcb5855
DM
1893 if (snaplen > res)
1894 snaplen = res;
1da177e4
LT
1895
1896 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1897 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1898 po->tp_reserve;
1da177e4 1899 } else {
95c96174 1900 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1901 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1902 (maclen < 16 ? 16 : maclen)) +
1903 po->tp_reserve;
1da177e4
LT
1904 macoff = netoff - maclen;
1905 }
f6fb8f10 1906 if (po->tp_version <= TPACKET_V2) {
1907 if (macoff + snaplen > po->rx_ring.frame_size) {
1908 if (po->copy_thresh &&
0fd7bac6 1909 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1910 if (skb_shared(skb)) {
1911 copy_skb = skb_clone(skb, GFP_ATOMIC);
1912 } else {
1913 copy_skb = skb_get(skb);
1914 skb_head = skb->data;
1915 }
1916 if (copy_skb)
1917 skb_set_owner_r(copy_skb, sk);
1da177e4 1918 }
f6fb8f10 1919 snaplen = po->rx_ring.frame_size - macoff;
1920 if ((int)snaplen < 0)
1921 snaplen = 0;
1da177e4 1922 }
1da177e4 1923 }
1da177e4 1924 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1925 h.raw = packet_current_rx_frame(po, skb,
1926 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1927 if (!h.raw)
1da177e4 1928 goto ring_is_full;
f6fb8f10 1929 if (po->tp_version <= TPACKET_V2) {
1930 packet_increment_rx_head(po, &po->rx_ring);
1931 /*
 1932 * LOSING will be reported until you read the stats,
 1933 * because it's COR - Clear On Read.
 1934 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
 1935 * at the packet level.
1936 */
ee80fbf3 1937 if (po->stats.stats1.tp_drops)
f6fb8f10 1938 status |= TP_STATUS_LOSING;
1939 }
ee80fbf3 1940 po->stats.stats1.tp_packets++;
1da177e4
LT
1941 if (copy_skb) {
1942 status |= TP_STATUS_COPY;
1943 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1944 }
1da177e4
LT
1945 spin_unlock(&sk->sk_receive_queue.lock);
1946
bbd6ef87 1947 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1948
1949 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1950 getnstimeofday(&ts);
1da177e4 1951
b9c32fb2
DB
1952 status |= ts_status;
1953
bbd6ef87
PM
1954 switch (po->tp_version) {
1955 case TPACKET_V1:
1956 h.h1->tp_len = skb->len;
1957 h.h1->tp_snaplen = snaplen;
1958 h.h1->tp_mac = macoff;
1959 h.h1->tp_net = netoff;
4b457bdf
DB
1960 h.h1->tp_sec = ts.tv_sec;
1961 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1962 hdrlen = sizeof(*h.h1);
1963 break;
1964 case TPACKET_V2:
1965 h.h2->tp_len = skb->len;
1966 h.h2->tp_snaplen = snaplen;
1967 h.h2->tp_mac = macoff;
1968 h.h2->tp_net = netoff;
bbd6ef87
PM
1969 h.h2->tp_sec = ts.tv_sec;
1970 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1971 if (vlan_tx_tag_present(skb)) {
1972 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
1973 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
1974 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
1975 } else {
1976 h.h2->tp_vlan_tci = 0;
a0cdfcf3 1977 h.h2->tp_vlan_tpid = 0;
a3bcc23e 1978 }
e4d26f4b 1979 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
1980 hdrlen = sizeof(*h.h2);
1981 break;
f6fb8f10 1982 case TPACKET_V3:
 1983 /* tp_nxt_offset and vlan are already populated above,
 1984 * so don't clear those fields here.
 1985 */
1986 h.h3->tp_status |= status;
1987 h.h3->tp_len = skb->len;
1988 h.h3->tp_snaplen = snaplen;
1989 h.h3->tp_mac = macoff;
1990 h.h3->tp_net = netoff;
f6fb8f10 1991 h.h3->tp_sec = ts.tv_sec;
1992 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 1993 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 1994 hdrlen = sizeof(*h.h3);
1995 break;
bbd6ef87
PM
1996 default:
1997 BUG();
1998 }
1da177e4 1999
bbd6ef87 2000 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2001 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2002 sll->sll_family = AF_PACKET;
2003 sll->sll_hatype = dev->type;
2004 sll->sll_protocol = skb->protocol;
2005 sll->sll_pkttype = skb->pkt_type;
8032b464 2006 if (unlikely(po->origdev))
80feaacb
PWJ
2007 sll->sll_ifindex = orig_dev->ifindex;
2008 else
2009 sll->sll_ifindex = dev->ifindex;
1da177e4 2010
e16aa207 2011 smp_mb();
f6dafa95 2012#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 2013 {
0af55bb5
CG
2014 u8 *start, *end;
2015
f6fb8f10 2016 if (po->tp_version <= TPACKET_V2) {
2017 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
2018 + macoff + snaplen);
2019 for (start = h.raw; start < end; start += PAGE_SIZE)
2020 flush_dcache_page(pgv_to_page(start));
2021 }
cc9f01b2 2022 smp_wmb();
1da177e4 2023 }
f6dafa95 2024#endif
f6fb8f10 2025 if (po->tp_version <= TPACKET_V2)
2026 __packet_set_status(po, h.raw, status);
2027 else
2028 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
2029
2030 sk->sk_data_ready(sk, 0);
2031
2032drop_n_restore:
2033 if (skb_head != skb->data && skb_shared(skb)) {
2034 skb->data = skb_head;
2035 skb->len = skb_len;
2036 }
2037drop:
1ce4f28b 2038 kfree_skb(skb);
1da177e4
LT
2039 return 0;
2040
2041ring_is_full:
ee80fbf3 2042 po->stats.stats1.tp_drops++;
1da177e4
LT
2043 spin_unlock(&sk->sk_receive_queue.lock);
2044
2045 sk->sk_data_ready(sk, 0);
acb5d75b 2046 kfree_skb(copy_skb);
1da177e4
LT
2047 goto drop_n_restore;
2048}
2049
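/* Illustrative userspace counterpart (not part of af_packet.c) to
 * tpacket_rcv() above: a TPACKET_V2 receive ring. The kernel copies each
 * frame into the mmap()ed ring and flips tp_status to TP_STATUS_USER;
 * userspace hands the slot back by writing TP_STATUS_KERNEL. A sketch
 * under assumptions: the ring geometry is arbitrary (block size is an
 * exact multiple of frame size, so frames can be indexed linearly) and
 * error handling is trimmed.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>

void rx_ring_sketch(int fd)
{
        int ver = TPACKET_V2;
        struct tpacket_req req = {
                .tp_block_size = 4096,  /* multiple of PAGE_SIZE */
                .tp_block_nr   = 64,
                .tp_frame_size = 2048,  /* TPACKET_ALIGNMENT aligned */
                .tp_frame_nr   = 64 * (4096 / 2048),
        };
        unsigned int i;
        void *ring;

        setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
        setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
        ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        for (i = 0; ; i = (i + 1) % req.tp_frame_nr) {
                struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
                        ((char *)ring + (size_t)i * req.tp_frame_size);

                while (!(hdr->tp_status & TP_STATUS_USER)) {
                        struct pollfd pfd = { .fd = fd, .events = POLLIN };

                        poll(&pfd, 1, -1);      /* wait for tpacket_rcv() */
                }
                /* the frame starts tp_mac bytes into the slot:
                 * (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen */
                hdr->tp_status = TP_STATUS_KERNEL;      /* hand the slot back */
        }
}
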
69e3c75f
JB
2050static void tpacket_destruct_skb(struct sk_buff *skb)
2051{
2052 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 2053 void *ph;
1da177e4 2054
69e3c75f 2055 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
2056 __u32 ts;
2057
69e3c75f 2058 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2059 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2060
2061 ts = __packet_set_timestamp(po, ph, skb);
2062 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2063 }
2064
2065 sock_wfree(skb);
2066}
2067
40d4e3df
ED
2068static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2069 void *frame, struct net_device *dev, int size_max,
ae641949 2070 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2071{
184f489e 2072 union tpacket_uhdr ph;
09effa67 2073 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2074 struct socket *sock = po->sk.sk_socket;
2075 struct page *page;
2076 void *data;
2077 int err;
2078
2079 ph.raw = frame;
2080
2081 skb->protocol = proto;
2082 skb->dev = dev;
2083 skb->priority = po->sk.sk_priority;
2d37a186 2084 skb->mark = po->sk.sk_mark;
2e31396f 2085 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2086 skb_shinfo(skb)->destructor_arg = ph.raw;
2087
2088 switch (po->tp_version) {
2089 case TPACKET_V2:
2090 tp_len = ph.h2->tp_len;
2091 break;
2092 default:
2093 tp_len = ph.h1->tp_len;
2094 break;
2095 }
09effa67
DM
2096 if (unlikely(tp_len > size_max)) {
2097 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2098 return -EMSGSIZE;
2099 }
69e3c75f 2100
ae641949 2101 skb_reserve(skb, hlen);
69e3c75f 2102 skb_reset_network_header(skb);
c1aad275 2103
d346a3fa
DB
2104 if (!packet_use_direct_xmit(po))
2105 skb_probe_transport_header(skb, 0);
2106 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2107 int off_min, off_max, off;
2108 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2109 off_max = po->tx_ring.frame_size - tp_len;
2110 if (sock->type == SOCK_DGRAM) {
2111 switch (po->tp_version) {
2112 case TPACKET_V2:
2113 off = ph.h2->tp_net;
2114 break;
2115 default:
2116 off = ph.h1->tp_net;
2117 break;
2118 }
2119 } else {
2120 switch (po->tp_version) {
2121 case TPACKET_V2:
2122 off = ph.h2->tp_mac;
2123 break;
2124 default:
2125 off = ph.h1->tp_mac;
2126 break;
2127 }
2128 }
2129 if (unlikely((off < off_min) || (off_max < off)))
2130 return -EINVAL;
2131 data = ph.raw + off;
2132 } else {
2133 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2134 }
69e3c75f
JB
2135 to_write = tp_len;
2136
2137 if (sock->type == SOCK_DGRAM) {
2138 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2139 NULL, tp_len);
2140 if (unlikely(err < 0))
2141 return -EINVAL;
40d4e3df 2142 } else if (dev->hard_header_len) {
69e3c75f
JB
2143 /* net device doesn't like empty head */
2144 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2145 pr_err("packet size is too short (%d < %d)\n",
2146 tp_len, dev->hard_header_len);
69e3c75f
JB
2147 return -EINVAL;
2148 }
2149
2150 skb_push(skb, dev->hard_header_len);
2151 err = skb_store_bits(skb, 0, data,
2152 dev->hard_header_len);
2153 if (unlikely(err))
2154 return err;
2155
2156 data += dev->hard_header_len;
2157 to_write -= dev->hard_header_len;
2158 }
2159
69e3c75f
JB
2160 offset = offset_in_page(data);
2161 len_max = PAGE_SIZE - offset;
2162 len = ((to_write > len_max) ? len_max : to_write);
2163
2164 skb->data_len = to_write;
2165 skb->len += to_write;
2166 skb->truesize += to_write;
2167 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2168
2169 while (likely(to_write)) {
2170 nr_frags = skb_shinfo(skb)->nr_frags;
2171
2172 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2173 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2174 MAX_SKB_FRAGS);
69e3c75f
JB
2175 return -EFAULT;
2176 }
2177
0af55bb5
CG
2178 page = pgv_to_page(data);
2179 data += len;
69e3c75f
JB
2180 flush_dcache_page(page);
2181 get_page(page);
0af55bb5 2182 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2183 to_write -= len;
2184 offset = 0;
2185 len_max = PAGE_SIZE;
2186 len = ((to_write > len_max) ? len_max : to_write);
2187 }
2188
2189 return tp_len;
2190}
2191
2192static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2193{
69e3c75f
JB
2194 struct sk_buff *skb;
2195 struct net_device *dev;
2196 __be16 proto;
09effa67 2197 int err, reserve = 0;
40d4e3df
ED
2198 void *ph;
2199 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
87a2fd28 2200 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2201 int tp_len, size_max;
2202 unsigned char *addr;
2203 int len_sum = 0;
9e67030a 2204 int status = TP_STATUS_AVAILABLE;
ae641949 2205 int hlen, tlen;
69e3c75f 2206
69e3c75f
JB
2207 mutex_lock(&po->pg_vec_lock);
2208
66e56cd4 2209 if (likely(saddr == NULL)) {
e40526cb 2210 dev = packet_cached_dev_get(po);
69e3c75f
JB
2211 proto = po->num;
2212 addr = NULL;
2213 } else {
2214 err = -EINVAL;
2215 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2216 goto out;
2217 if (msg->msg_namelen < (saddr->sll_halen
2218 + offsetof(struct sockaddr_ll,
2219 sll_addr)))
2220 goto out;
69e3c75f
JB
2221 proto = saddr->sll_protocol;
2222 addr = saddr->sll_addr;
827d9780 2223 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2224 }
2225
69e3c75f
JB
2226 err = -ENXIO;
2227 if (unlikely(dev == NULL))
2228 goto out;
69e3c75f
JB
2229 err = -ENETDOWN;
2230 if (unlikely(!(dev->flags & IFF_UP)))
2231 goto out_put;
2232
e40526cb
DB
2233 reserve = dev->hard_header_len;
2234
69e3c75f 2235 size_max = po->tx_ring.frame_size
b5dd884e 2236 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2237
09effa67
DM
2238 if (size_max > dev->mtu + reserve)
2239 size_max = dev->mtu + reserve;
2240
69e3c75f
JB
2241 do {
2242 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2243 TP_STATUS_SEND_REQUEST);
69e3c75f 2244 if (unlikely(ph == NULL)) {
87a2fd28
DB
2245 if (need_wait && need_resched())
2246 schedule();
69e3c75f
JB
2247 continue;
2248 }
2249
2250 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2251 hlen = LL_RESERVED_SPACE(dev);
2252 tlen = dev->needed_tailroom;
69e3c75f 2253 skb = sock_alloc_send_skb(&po->sk,
ae641949 2254 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2255 0, &err);
2256
2257 if (unlikely(skb == NULL))
2258 goto out_status;
2259
2260 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2261 addr, hlen);
69e3c75f
JB
2262
2263 if (unlikely(tp_len < 0)) {
2264 if (po->tp_loss) {
2265 __packet_set_status(po, ph,
2266 TP_STATUS_AVAILABLE);
2267 packet_increment_head(&po->tx_ring);
2268 kfree_skb(skb);
2269 continue;
2270 } else {
2271 status = TP_STATUS_WRONG_FORMAT;
2272 err = tp_len;
2273 goto out_status;
2274 }
2275 }
2276
d346a3fa 2277 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
69e3c75f
JB
2278 skb->destructor = tpacket_destruct_skb;
2279 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2280 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2281
2282 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2283 err = po->xmit(skb);
eb70df13
JP
2284 if (unlikely(err > 0)) {
2285 err = net_xmit_errno(err);
2286 if (err && __packet_get_status(po, ph) ==
2287 TP_STATUS_AVAILABLE) {
2288 /* skb was destructed already */
2289 skb = NULL;
2290 goto out_status;
2291 }
2292 /*
2293 * skb was dropped but not destructed yet;
2294 * let's treat it like congestion or err < 0
2295 */
2296 err = 0;
2297 }
69e3c75f
JB
2298 packet_increment_head(&po->tx_ring);
2299 len_sum += tp_len;
b0138408
DB
2300 } while (likely((ph != NULL) ||
2301 /* Note: packet_read_pending() might be slow if we have
 2302 * to call it, as it's a per-cpu variable, but in the fast path
2303 * we already short-circuit the loop with the first
2304 * condition, and luckily don't have to go that path
2305 * anyway.
2306 */
2307 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2308
2309 err = len_sum;
2310 goto out_put;
2311
69e3c75f
JB
2312out_status:
2313 __packet_set_status(po, ph, status);
2314 kfree_skb(skb);
2315out_put:
e40526cb 2316 dev_put(dev);
69e3c75f
JB
2317out:
2318 mutex_unlock(&po->pg_vec_lock);
2319 return err;
2320}
69e3c75f 2321
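/* Illustrative userspace counterpart (not part of af_packet.c) to
 * tpacket_snd() above: one slot of a TPACKET_V2 transmit ring. Userspace
 * fills a frame, marks the slot TP_STATUS_SEND_REQUEST, and a send() with
 * a NULL buffer kicks tpacket_snd(); tpacket_destruct_skb() later flips
 * the slot back to TP_STATUS_AVAILABLE. A sketch under assumptions: the
 * ring was set up without PACKET_TX_HAS_OFF, "ring" points at the mapped
 * TX ring, and error handling is trimmed.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

void tx_ring_sketch(int fd, void *ring, const void *pkt, unsigned int pkt_len)
{
        /* slot 0 only, for brevity; real code advances through the ring */
        struct tpacket2_hdr *hdr = ring;
        /* without PACKET_TX_HAS_OFF the payload sits right after the
         * header area, matching the offset used by tpacket_fill_skb() */
        char *data = (char *)ring + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

        if (hdr->tp_status != TP_STATUS_AVAILABLE)
                return;                 /* slot still owned by the kernel */

        memcpy(data, pkt, pkt_len);
        hdr->tp_len = pkt_len;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;

        /* a NULL/zero-length send() just triggers the ring walk */
        send(fd, NULL, 0, 0);
}
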
eea49cc9
OJ
2322static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2323 size_t reserve, size_t len,
2324 size_t linear, int noblock,
2325 int *err)
bfd5f4a3
SS
2326{
2327 struct sk_buff *skb;
2328
2329 /* Under a page? Don't bother with paged skb. */
2330 if (prepad + len < PAGE_SIZE || !linear)
2331 linear = len;
2332
2333 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2334 err, 0);
bfd5f4a3
SS
2335 if (!skb)
2336 return NULL;
2337
2338 skb_reserve(skb, reserve);
2339 skb_put(skb, linear);
2340 skb->data_len = len - linear;
2341 skb->len += len - linear;
2342
2343 return skb;
2344}
2345
d346a3fa 2346static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2347{
2348 struct sock *sk = sock->sk;
40d4e3df 2349 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2350 struct sk_buff *skb;
2351 struct net_device *dev;
0e11c91e 2352 __be16 proto;
1da177e4 2353 unsigned char *addr;
827d9780 2354 int err, reserve = 0;
bfd5f4a3
SS
2355 struct virtio_net_hdr vnet_hdr = { 0 };
2356 int offset = 0;
2357 int vnet_hdr_len;
2358 struct packet_sock *po = pkt_sk(sk);
2359 unsigned short gso_type = 0;
ae641949 2360 int hlen, tlen;
3bdc0eba 2361 int extra_len = 0;
1da177e4
LT
2362
2363 /*
1ce4f28b 2364 * Get and verify the address.
1da177e4 2365 */
1ce4f28b 2366
66e56cd4 2367 if (likely(saddr == NULL)) {
e40526cb 2368 dev = packet_cached_dev_get(po);
1da177e4
LT
2369 proto = po->num;
2370 addr = NULL;
2371 } else {
2372 err = -EINVAL;
2373 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2374 goto out;
0fb375fb
EB
2375 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2376 goto out;
1da177e4
LT
2377 proto = saddr->sll_protocol;
2378 addr = saddr->sll_addr;
827d9780 2379 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2380 }
2381
1da177e4 2382 err = -ENXIO;
e40526cb 2383 if (unlikely(dev == NULL))
1da177e4 2384 goto out_unlock;
d5e76b0a 2385 err = -ENETDOWN;
e40526cb 2386 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2387 goto out_unlock;
2388
e40526cb
DB
2389 if (sock->type == SOCK_RAW)
2390 reserve = dev->hard_header_len;
bfd5f4a3
SS
2391 if (po->has_vnet_hdr) {
2392 vnet_hdr_len = sizeof(vnet_hdr);
2393
2394 err = -EINVAL;
2395 if (len < vnet_hdr_len)
2396 goto out_unlock;
2397
2398 len -= vnet_hdr_len;
2399
2400 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2401 vnet_hdr_len);
2402 if (err < 0)
2403 goto out_unlock;
2404
2405 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2406 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2407 vnet_hdr.hdr_len))
2408 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2409 vnet_hdr.csum_offset + 2;
2410
2411 err = -EINVAL;
2412 if (vnet_hdr.hdr_len > len)
2413 goto out_unlock;
2414
2415 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2416 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2417 case VIRTIO_NET_HDR_GSO_TCPV4:
2418 gso_type = SKB_GSO_TCPV4;
2419 break;
2420 case VIRTIO_NET_HDR_GSO_TCPV6:
2421 gso_type = SKB_GSO_TCPV6;
2422 break;
2423 case VIRTIO_NET_HDR_GSO_UDP:
2424 gso_type = SKB_GSO_UDP;
2425 break;
2426 default:
2427 goto out_unlock;
2428 }
2429
2430 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2431 gso_type |= SKB_GSO_TCP_ECN;
2432
2433 if (vnet_hdr.gso_size == 0)
2434 goto out_unlock;
2435
2436 }
2437 }
2438
3bdc0eba
BG
2439 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2440 if (!netif_supports_nofcs(dev)) {
2441 err = -EPROTONOSUPPORT;
2442 goto out_unlock;
2443 }
2444 extra_len = 4; /* We're doing our own CRC */
2445 }
2446
1da177e4 2447 err = -EMSGSIZE;
3bdc0eba 2448 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2449 goto out_unlock;
2450
bfd5f4a3 2451 err = -ENOBUFS;
ae641949
HX
2452 hlen = LL_RESERVED_SPACE(dev);
2453 tlen = dev->needed_tailroom;
2454 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2455 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2456 if (skb == NULL)
1da177e4
LT
2457 goto out_unlock;
2458
bfd5f4a3 2459 skb_set_network_header(skb, reserve);
1da177e4 2460
0c4e8581
SH
2461 err = -EINVAL;
2462 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2463 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2464 goto out_free;
1da177e4
LT
2465
2466 /* Returns -EFAULT on error */
bfd5f4a3 2467 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2468 if (err)
2469 goto out_free;
bf84a010
DB
2470
2471 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2472
3bdc0eba 2473 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2474 /* Earlier code assumed this would be a VLAN pkt,
2475 * double-check this now that we have the actual
2476 * packet in hand.
2477 */
2478 struct ethhdr *ehdr;
2479 skb_reset_mac_header(skb);
2480 ehdr = eth_hdr(skb);
2481 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2482 err = -EMSGSIZE;
2483 goto out_free;
2484 }
57f89bfa
BG
2485 }
2486
09effa67
DM
2487 skb->protocol = proto;
2488 skb->dev = dev;
1da177e4 2489 skb->priority = sk->sk_priority;
2d37a186 2490 skb->mark = sk->sk_mark;
d346a3fa 2491 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
1da177e4 2492
bfd5f4a3
SS
2493 if (po->has_vnet_hdr) {
2494 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2495 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2496 vnet_hdr.csum_offset)) {
2497 err = -EINVAL;
2498 goto out_free;
2499 }
2500 }
2501
2502 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2503 skb_shinfo(skb)->gso_type = gso_type;
2504
2505 /* Header must be checked, and gso_segs computed. */
2506 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2507 skb_shinfo(skb)->gso_segs = 0;
2508
2509 len += vnet_hdr_len;
2510 }
2511
d346a3fa
DB
2512 if (!packet_use_direct_xmit(po))
2513 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2514 if (unlikely(extra_len == 4))
2515 skb->no_fcs = 1;
2516
d346a3fa 2517 err = po->xmit(skb);
1da177e4
LT
2518 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2519 goto out_unlock;
2520
e40526cb 2521 dev_put(dev);
1da177e4 2522
40d4e3df 2523 return len;
1da177e4
LT
2524
2525out_free:
2526 kfree_skb(skb);
2527out_unlock:
e40526cb 2528 if (dev)
1da177e4
LT
2529 dev_put(dev);
2530out:
2531 return err;
2532}
2533
69e3c75f
JB
2534static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2535 struct msghdr *msg, size_t len)
2536{
69e3c75f
JB
2537 struct sock *sk = sock->sk;
2538 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2539
69e3c75f
JB
2540 if (po->tx_ring.pg_vec)
2541 return tpacket_snd(po, msg);
2542 else
69e3c75f
JB
2543 return packet_snd(sock, msg, len);
2544}
2545
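/* Illustrative userspace sketch (not part of af_packet.c): a plain
 * sendto() on a SOCK_DGRAM packet socket ends up in packet_snd() above,
 * which builds the link-layer header from the sockaddr_ll via
 * dev_hard_header(). The interface name "eth0" and the destination MAC
 * are placeholders; error handling is trimmed.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

void dgram_send_sketch(const void *payload, size_t len)
{
        int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
        unsigned char dst[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_IP);
        sll.sll_ifindex = if_nametoindex("eth0");
        sll.sll_halen = ETH_ALEN;
        memcpy(sll.sll_addr, dst, ETH_ALEN);

        /* the kernel prepends the Ethernet header before transmission */
        sendto(fd, payload, len, 0, (struct sockaddr *)&sll, sizeof(sll));
        close(fd);
}
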
1da177e4
LT
2546/*
2547 * Close a PACKET socket. This is fairly simple. We immediately go
2548 * to 'closed' state and remove our protocol entry in the device list.
2549 */
2550
2551static int packet_release(struct socket *sock)
2552{
2553 struct sock *sk = sock->sk;
2554 struct packet_sock *po;
d12d01d6 2555 struct net *net;
f6fb8f10 2556 union tpacket_req_u req_u;
1da177e4
LT
2557
2558 if (!sk)
2559 return 0;
2560
3b1e0a65 2561 net = sock_net(sk);
1da177e4
LT
2562 po = pkt_sk(sk);
2563
0fa7fa98 2564 mutex_lock(&net->packet.sklist_lock);
808f5114 2565 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2566 mutex_unlock(&net->packet.sklist_lock);
2567
2568 preempt_disable();
920de804 2569 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2570 preempt_enable();
1da177e4 2571
808f5114 2572 spin_lock(&po->bind_lock);
ce06b03e 2573 unregister_prot_hook(sk, false);
66e56cd4
DB
2574 packet_cached_dev_reset(po);
2575
160ff18a
BG
2576 if (po->prot_hook.dev) {
2577 dev_put(po->prot_hook.dev);
2578 po->prot_hook.dev = NULL;
2579 }
808f5114 2580 spin_unlock(&po->bind_lock);
1da177e4 2581
1da177e4 2582 packet_flush_mclist(sk);
1da177e4 2583
9665d5d6
PS
2584 if (po->rx_ring.pg_vec) {
2585 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2586 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2587 }
69e3c75f 2588
9665d5d6
PS
2589 if (po->tx_ring.pg_vec) {
2590 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2591 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2592 }
1da177e4 2593
dc99f600
DM
2594 fanout_release(sk);
2595
808f5114 2596 synchronize_net();
1da177e4
LT
2597 /*
2598 * Now the socket is dead. No more input will appear.
2599 */
1da177e4
LT
2600 sock_orphan(sk);
2601 sock->sk = NULL;
2602
2603 /* Purge queues */
2604
2605 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2606 packet_free_pending(po);
17ab56a2 2607 sk_refcnt_debug_release(sk);
1da177e4
LT
2608
2609 sock_put(sk);
2610 return 0;
2611}
2612
2613/*
2614 * Attach a packet hook.
2615 */
2616
902fefb8 2617static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2618{
2619 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2620 const struct net_device *dev_curr;
2621 __be16 proto_curr;
2622 bool need_rehook;
dc99f600 2623
aef950b4
WY
2624 if (po->fanout) {
2625 if (dev)
2626 dev_put(dev);
2627
dc99f600 2628 return -EINVAL;
aef950b4 2629 }
1da177e4
LT
2630
2631 lock_sock(sk);
1da177e4 2632 spin_lock(&po->bind_lock);
66e56cd4 2633
902fefb8
DB
2634 proto_curr = po->prot_hook.type;
2635 dev_curr = po->prot_hook.dev;
2636
2637 need_rehook = proto_curr != proto || dev_curr != dev;
2638
2639 if (need_rehook) {
2640 unregister_prot_hook(sk, true);
1da177e4 2641
902fefb8
DB
2642 po->num = proto;
2643 po->prot_hook.type = proto;
1da177e4 2644
902fefb8
DB
2645 if (po->prot_hook.dev)
2646 dev_put(po->prot_hook.dev);
2647
2648 po->prot_hook.dev = dev;
2649
2650 po->ifindex = dev ? dev->ifindex : 0;
2651 packet_cached_dev_assign(po, dev);
2652 }
66e56cd4 2653
902fefb8 2654 if (proto == 0 || !need_rehook)
1da177e4
LT
2655 goto out_unlock;
2656
be85d4ad 2657 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2658 register_prot_hook(sk);
be85d4ad
UT
2659 } else {
2660 sk->sk_err = ENETDOWN;
2661 if (!sock_flag(sk, SOCK_DEAD))
2662 sk->sk_error_report(sk);
1da177e4
LT
2663 }
2664
2665out_unlock:
2666 spin_unlock(&po->bind_lock);
2667 release_sock(sk);
2668 return 0;
2669}
2670
2671/*
2672 * Bind a packet socket to a device
2673 */
2674
40d4e3df
ED
2675static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2676 int addr_len)
1da177e4 2677{
40d4e3df 2678 struct sock *sk = sock->sk;
1da177e4
LT
2679 char name[15];
2680 struct net_device *dev;
2681 int err = -ENODEV;
1ce4f28b 2682
1da177e4
LT
2683 /*
2684 * Check legality
2685 */
1ce4f28b 2686
8ae55f04 2687 if (addr_len != sizeof(struct sockaddr))
1da177e4 2688 return -EINVAL;
40d4e3df 2689 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2690
3b1e0a65 2691 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2692 if (dev)
1da177e4 2693 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2694 return err;
2695}
1da177e4
LT
2696
2697static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2698{
40d4e3df
ED
2699 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2700 struct sock *sk = sock->sk;
1da177e4
LT
2701 struct net_device *dev = NULL;
2702 int err;
2703
2704
2705 /*
2706 * Check legality
2707 */
1ce4f28b 2708
1da177e4
LT
2709 if (addr_len < sizeof(struct sockaddr_ll))
2710 return -EINVAL;
2711 if (sll->sll_family != AF_PACKET)
2712 return -EINVAL;
2713
2714 if (sll->sll_ifindex) {
2715 err = -ENODEV;
3b1e0a65 2716 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2717 if (dev == NULL)
2718 goto out;
2719 }
2720 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2721
2722out:
2723 return err;
2724}
2725
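/* Illustrative userspace sketch (not part of af_packet.c) of the bind()
 * handled by packet_bind() above: it restricts the socket to one
 * interface and (re)hooks the protocol. "eth0" is a placeholder; error
 * handling is trimmed.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

int bind_sketch(int fd)
{
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_ALL);    /* 0 keeps the current one */
        sll.sll_ifindex = if_nametoindex("eth0");       /* 0 means any device */

        return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
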
2726static struct proto packet_proto = {
2727 .name = "PACKET",
2728 .owner = THIS_MODULE,
2729 .obj_size = sizeof(struct packet_sock),
2730};
2731
2732/*
1ce4f28b 2733 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2734 */
2735
3f378b68
EP
2736static int packet_create(struct net *net, struct socket *sock, int protocol,
2737 int kern)
1da177e4
LT
2738{
2739 struct sock *sk;
2740 struct packet_sock *po;
0e11c91e 2741 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2742 int err;
2743
df008c91 2744 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2745 return -EPERM;
be02097c
DM
2746 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2747 sock->type != SOCK_PACKET)
1da177e4
LT
2748 return -ESOCKTNOSUPPORT;
2749
2750 sock->state = SS_UNCONNECTED;
2751
2752 err = -ENOBUFS;
6257ff21 2753 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2754 if (sk == NULL)
2755 goto out;
2756
2757 sock->ops = &packet_ops;
1da177e4
LT
2758 if (sock->type == SOCK_PACKET)
2759 sock->ops = &packet_ops_spkt;
be02097c 2760
1da177e4
LT
2761 sock_init_data(sock, sk);
2762
2763 po = pkt_sk(sk);
2764 sk->sk_family = PF_PACKET;
0e11c91e 2765 po->num = proto;
d346a3fa 2766 po->xmit = dev_queue_xmit;
66e56cd4 2767
b0138408
DB
2768 err = packet_alloc_pending(po);
2769 if (err)
2770 goto out2;
2771
66e56cd4 2772 packet_cached_dev_reset(po);
1da177e4
LT
2773
2774 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2775 sk_refcnt_debug_inc(sk);
1da177e4
LT
2776
2777 /*
2778 * Attach a protocol block
2779 */
2780
2781 spin_lock_init(&po->bind_lock);
905db440 2782 mutex_init(&po->pg_vec_lock);
1da177e4 2783 po->prot_hook.func = packet_rcv;
be02097c 2784
1da177e4
LT
2785 if (sock->type == SOCK_PACKET)
2786 po->prot_hook.func = packet_rcv_spkt;
be02097c 2787
1da177e4
LT
2788 po->prot_hook.af_packet_priv = sk;
2789
0e11c91e
AV
2790 if (proto) {
2791 po->prot_hook.type = proto;
ce06b03e 2792 register_prot_hook(sk);
1da177e4
LT
2793 }
2794
0fa7fa98 2795 mutex_lock(&net->packet.sklist_lock);
808f5114 2796 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2797 mutex_unlock(&net->packet.sklist_lock);
2798
2799 preempt_disable();
3680453c 2800 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2801 preempt_enable();
808f5114 2802
40d4e3df 2803 return 0;
b0138408
DB
2804out2:
2805 sk_free(sk);
1da177e4
LT
2806out:
2807 return err;
2808}
2809
2810/*
2811 * Pull a packet from our receive queue and hand it to the user.
2812 * If necessary we block.
2813 */
2814
2815static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2816 struct msghdr *msg, size_t len, int flags)
2817{
2818 struct sock *sk = sock->sk;
2819 struct sk_buff *skb;
2820 int copied, err;
bfd5f4a3 2821 int vnet_hdr_len = 0;
1da177e4
LT
2822
2823 err = -EINVAL;
ed85b565 2824 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2825 goto out;
2826
2827#if 0
2828 /* What error should we return now? EUNATTACH? */
2829 if (pkt_sk(sk)->ifindex < 0)
2830 return -ENODEV;
2831#endif
2832
ed85b565 2833 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2834 err = sock_recv_errqueue(sk, msg, len,
2835 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2836 goto out;
2837 }
2838
1da177e4
LT
2839 /*
2840 * Call the generic datagram receiver. This handles all sorts
2841 * of horrible races and re-entrancy so we can forget about it
2842 * in the protocol layers.
2843 *
 2844 * Now it will return ENETDOWN if the device has just gone down,
2845 * but then it will block.
2846 */
2847
40d4e3df 2848 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2849
2850 /*
1ce4f28b 2851 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2852 * handles the blocking we don't see and worry about blocking
2853 * retries.
2854 */
2855
8ae55f04 2856 if (skb == NULL)
1da177e4
LT
2857 goto out;
2858
bfd5f4a3
SS
2859 if (pkt_sk(sk)->has_vnet_hdr) {
2860 struct virtio_net_hdr vnet_hdr = { 0 };
2861
2862 err = -EINVAL;
2863 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2864 if (len < vnet_hdr_len)
bfd5f4a3
SS
2865 goto out_free;
2866
1f18b717
MK
2867 len -= vnet_hdr_len;
2868
bfd5f4a3
SS
2869 if (skb_is_gso(skb)) {
2870 struct skb_shared_info *sinfo = skb_shinfo(skb);
2871
2872 /* This is a hint as to how much should be linear. */
2873 vnet_hdr.hdr_len = skb_headlen(skb);
2874 vnet_hdr.gso_size = sinfo->gso_size;
2875 if (sinfo->gso_type & SKB_GSO_TCPV4)
2876 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2877 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2878 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2879 else if (sinfo->gso_type & SKB_GSO_UDP)
2880 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2881 else if (sinfo->gso_type & SKB_GSO_FCOE)
2882 goto out_free;
2883 else
2884 BUG();
2885 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2886 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2887 } else
2888 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2889
2890 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2891 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2892 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2893 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2894 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2895 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2896 } /* else everything is zero */
2897
2898 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2899 vnet_hdr_len);
2900 if (err < 0)
2901 goto out_free;
2902 }
2903
f3d33426
HFS
 2904 /* You lose any data beyond the buffer you gave. If it worries
 2905 * a user program, it can ask the device for its MTU
2906 * anyway.
1da177e4 2907 */
1da177e4 2908 copied = skb->len;
40d4e3df
ED
2909 if (copied > len) {
2910 copied = len;
2911 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2912 }
2913
2914 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2915 if (err)
2916 goto out_free;
2917
3b885787 2918 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2919
f3d33426
HFS
2920 if (msg->msg_name) {
2921 /* If the address length field is there to be filled
2922 * in, we fill it in now.
2923 */
2924 if (sock->type == SOCK_PACKET) {
2925 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2926 } else {
2927 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2928 msg->msg_namelen = sll->sll_halen +
2929 offsetof(struct sockaddr_ll, sll_addr);
2930 }
ffbc6111
HX
2931 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2932 msg->msg_namelen);
f3d33426 2933 }
1da177e4 2934
8dc41944 2935 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2936 struct tpacket_auxdata aux;
2937
2938 aux.tp_status = TP_STATUS_USER;
2939 if (skb->ip_summed == CHECKSUM_PARTIAL)
2940 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2941 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2942 aux.tp_snaplen = skb->len;
2943 aux.tp_mac = 0;
bbe735e4 2944 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2945 if (vlan_tx_tag_present(skb)) {
2946 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
2947 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
2948 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2949 } else {
2950 aux.tp_vlan_tci = 0;
a0cdfcf3 2951 aux.tp_vlan_tpid = 0;
a3bcc23e 2952 }
ffbc6111 2953 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2954 }
2955
1da177e4
LT
2956 /*
2957 * Free or return the buffer as appropriate. Again this
2958 * hides all the races and re-entrancy issues from us.
2959 */
bfd5f4a3 2960 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2961
2962out_free:
2963 skb_free_datagram(sk, skb);
2964out:
2965 return err;
2966}
2967
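/* Illustrative userspace sketch (not part of af_packet.c): reading the
 * PACKET_AUXDATA control message that packet_recvmsg() above attaches
 * when the option is enabled. Buffer sizes are arbitrary; error handling
 * is trimmed.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

void auxdata_sketch(int fd)
{
        int one = 1;
        unsigned char frame[2048];
        union {
                struct cmsghdr cm;
                char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
        } control;
        struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = &control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
        recvmsg(fd, &msg, 0);

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
                struct tpacket_auxdata aux;

                if (cmsg->cmsg_level != SOL_PACKET ||
                    cmsg->cmsg_type != PACKET_AUXDATA)
                        continue;
                memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
                printf("orig len %u, snaplen %u, vlan tci %u\n",
                       aux.tp_len, aux.tp_snaplen, (unsigned int)aux.tp_vlan_tci);
        }
}
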
1da177e4
LT
2968static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2969 int *uaddr_len, int peer)
2970{
2971 struct net_device *dev;
2972 struct sock *sk = sock->sk;
2973
2974 if (peer)
2975 return -EOPNOTSUPP;
2976
2977 uaddr->sa_family = AF_PACKET;
2dc85bf3 2978 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2979 rcu_read_lock();
2980 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2981 if (dev)
2dc85bf3 2982 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2983 rcu_read_unlock();
1da177e4
LT
2984 *uaddr_len = sizeof(*uaddr);
2985
2986 return 0;
2987}
1da177e4
LT
2988
2989static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2990 int *uaddr_len, int peer)
2991{
2992 struct net_device *dev;
2993 struct sock *sk = sock->sk;
2994 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2995 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2996
2997 if (peer)
2998 return -EOPNOTSUPP;
2999
3000 sll->sll_family = AF_PACKET;
3001 sll->sll_ifindex = po->ifindex;
3002 sll->sll_protocol = po->num;
67286640 3003 sll->sll_pkttype = 0;
654d1f8a
ED
3004 rcu_read_lock();
3005 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3006 if (dev) {
3007 sll->sll_hatype = dev->type;
3008 sll->sll_halen = dev->addr_len;
3009 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3010 } else {
3011 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3012 sll->sll_halen = 0;
3013 }
654d1f8a 3014 rcu_read_unlock();
0fb375fb 3015 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3016
3017 return 0;
3018}
3019
2aeb0b88
WC
3020static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3021 int what)
1da177e4
LT
3022{
3023 switch (i->type) {
3024 case PACKET_MR_MULTICAST:
1162563f
JP
3025 if (i->alen != dev->addr_len)
3026 return -EINVAL;
1da177e4 3027 if (what > 0)
22bedad3 3028 return dev_mc_add(dev, i->addr);
1da177e4 3029 else
22bedad3 3030 return dev_mc_del(dev, i->addr);
1da177e4
LT
3031 break;
3032 case PACKET_MR_PROMISC:
2aeb0b88 3033 return dev_set_promiscuity(dev, what);
1da177e4
LT
3034 break;
3035 case PACKET_MR_ALLMULTI:
2aeb0b88 3036 return dev_set_allmulti(dev, what);
1da177e4 3037 break;
d95ed927 3038 case PACKET_MR_UNICAST:
1162563f
JP
3039 if (i->alen != dev->addr_len)
3040 return -EINVAL;
d95ed927 3041 if (what > 0)
a748ee24 3042 return dev_uc_add(dev, i->addr);
d95ed927 3043 else
a748ee24 3044 return dev_uc_del(dev, i->addr);
d95ed927 3045 break;
40d4e3df
ED
3046 default:
3047 break;
1da177e4 3048 }
2aeb0b88 3049 return 0;
1da177e4
LT
3050}
3051
3052static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
3053{
40d4e3df 3054 for ( ; i; i = i->next) {
1da177e4
LT
3055 if (i->ifindex == dev->ifindex)
3056 packet_dev_mc(dev, i, what);
3057 }
3058}
3059
0fb375fb 3060static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3061{
3062 struct packet_sock *po = pkt_sk(sk);
3063 struct packet_mclist *ml, *i;
3064 struct net_device *dev;
3065 int err;
3066
3067 rtnl_lock();
3068
3069 err = -ENODEV;
3b1e0a65 3070 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3071 if (!dev)
3072 goto done;
3073
3074 err = -EINVAL;
1162563f 3075 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3076 goto done;
3077
3078 err = -ENOBUFS;
8b3a7005 3079 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3080 if (i == NULL)
3081 goto done;
3082
3083 err = 0;
3084 for (ml = po->mclist; ml; ml = ml->next) {
3085 if (ml->ifindex == mreq->mr_ifindex &&
3086 ml->type == mreq->mr_type &&
3087 ml->alen == mreq->mr_alen &&
3088 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3089 ml->count++;
3090 /* Free the new element ... */
3091 kfree(i);
3092 goto done;
3093 }
3094 }
3095
3096 i->type = mreq->mr_type;
3097 i->ifindex = mreq->mr_ifindex;
3098 i->alen = mreq->mr_alen;
3099 memcpy(i->addr, mreq->mr_address, i->alen);
3100 i->count = 1;
3101 i->next = po->mclist;
3102 po->mclist = i;
2aeb0b88
WC
3103 err = packet_dev_mc(dev, i, 1);
3104 if (err) {
3105 po->mclist = i->next;
3106 kfree(i);
3107 }
1da177e4
LT
3108
3109done:
3110 rtnl_unlock();
3111 return err;
3112}
3113
0fb375fb 3114static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3115{
3116 struct packet_mclist *ml, **mlp;
3117
3118 rtnl_lock();
3119
3120 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3121 if (ml->ifindex == mreq->mr_ifindex &&
3122 ml->type == mreq->mr_type &&
3123 ml->alen == mreq->mr_alen &&
3124 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3125 if (--ml->count == 0) {
3126 struct net_device *dev;
3127 *mlp = ml->next;
ad959e76
ED
3128 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3129 if (dev)
1da177e4 3130 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3131 kfree(ml);
3132 }
3133 rtnl_unlock();
3134 return 0;
3135 }
3136 }
3137 rtnl_unlock();
3138 return -EADDRNOTAVAIL;
3139}
3140
3141static void packet_flush_mclist(struct sock *sk)
3142{
3143 struct packet_sock *po = pkt_sk(sk);
3144 struct packet_mclist *ml;
3145
3146 if (!po->mclist)
3147 return;
3148
3149 rtnl_lock();
3150 while ((ml = po->mclist) != NULL) {
3151 struct net_device *dev;
3152
3153 po->mclist = ml->next;
ad959e76
ED
3154 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3155 if (dev != NULL)
1da177e4 3156 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3157 kfree(ml);
3158 }
3159 rtnl_unlock();
3160}
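/* Illustrative userspace sketch (not part of af_packet.c) of the
 * PACKET_ADD_MEMBERSHIP option handled by packet_mc_add() above, here
 * used to put an interface into promiscuous mode for the lifetime of the
 * socket. The ifindex comes from the caller; error handling is trimmed.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

int promisc_sketch(int fd, int ifindex)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = ifindex;
        mreq.mr_type = PACKET_MR_PROMISC;   /* or _MULTICAST/_ALLMULTI/_UNICAST */

        return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}
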
1da177e4
LT
3161
3162static int
b7058842 3163packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3164{
3165 struct sock *sk = sock->sk;
8dc41944 3166 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3167 int ret;
3168
3169 if (level != SOL_PACKET)
3170 return -ENOPROTOOPT;
3171
69e3c75f 3172 switch (optname) {
1ce4f28b 3173 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3174 case PACKET_DROP_MEMBERSHIP:
3175 {
0fb375fb
EB
3176 struct packet_mreq_max mreq;
3177 int len = optlen;
3178 memset(&mreq, 0, sizeof(mreq));
3179 if (len < sizeof(struct packet_mreq))
1da177e4 3180 return -EINVAL;
0fb375fb
EB
3181 if (len > sizeof(mreq))
3182 len = sizeof(mreq);
40d4e3df 3183 if (copy_from_user(&mreq, optval, len))
1da177e4 3184 return -EFAULT;
0fb375fb
EB
3185 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3186 return -EINVAL;
1da177e4
LT
3187 if (optname == PACKET_ADD_MEMBERSHIP)
3188 ret = packet_mc_add(sk, &mreq);
3189 else
3190 ret = packet_mc_drop(sk, &mreq);
3191 return ret;
3192 }
a2efcfa0 3193
1da177e4 3194 case PACKET_RX_RING:
69e3c75f 3195 case PACKET_TX_RING:
1da177e4 3196 {
f6fb8f10 3197 union tpacket_req_u req_u;
3198 int len;
1da177e4 3199
f6fb8f10 3200 switch (po->tp_version) {
3201 case TPACKET_V1:
3202 case TPACKET_V2:
3203 len = sizeof(req_u.req);
3204 break;
3205 case TPACKET_V3:
3206 default:
3207 len = sizeof(req_u.req3);
3208 break;
3209 }
3210 if (optlen < len)
1da177e4 3211 return -EINVAL;
bfd5f4a3
SS
3212 if (pkt_sk(sk)->has_vnet_hdr)
3213 return -EINVAL;
f6fb8f10 3214 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3215 return -EFAULT;
f6fb8f10 3216 return packet_set_ring(sk, &req_u, 0,
3217 optname == PACKET_TX_RING);
1da177e4
LT
3218 }
3219 case PACKET_COPY_THRESH:
3220 {
3221 int val;
3222
40d4e3df 3223 if (optlen != sizeof(val))
1da177e4 3224 return -EINVAL;
40d4e3df 3225 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3226 return -EFAULT;
3227
3228 pkt_sk(sk)->copy_thresh = val;
3229 return 0;
3230 }
bbd6ef87
PM
3231 case PACKET_VERSION:
3232 {
3233 int val;
3234
3235 if (optlen != sizeof(val))
3236 return -EINVAL;
69e3c75f 3237 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3238 return -EBUSY;
3239 if (copy_from_user(&val, optval, sizeof(val)))
3240 return -EFAULT;
3241 switch (val) {
3242 case TPACKET_V1:
3243 case TPACKET_V2:
f6fb8f10 3244 case TPACKET_V3:
bbd6ef87
PM
3245 po->tp_version = val;
3246 return 0;
3247 default:
3248 return -EINVAL;
3249 }
3250 }
8913336a
PM
3251 case PACKET_RESERVE:
3252 {
3253 unsigned int val;
3254
3255 if (optlen != sizeof(val))
3256 return -EINVAL;
69e3c75f 3257 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3258 return -EBUSY;
3259 if (copy_from_user(&val, optval, sizeof(val)))
3260 return -EFAULT;
3261 po->tp_reserve = val;
3262 return 0;
3263 }
69e3c75f
JB
3264 case PACKET_LOSS:
3265 {
3266 unsigned int val;
3267
3268 if (optlen != sizeof(val))
3269 return -EINVAL;
3270 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3271 return -EBUSY;
3272 if (copy_from_user(&val, optval, sizeof(val)))
3273 return -EFAULT;
3274 po->tp_loss = !!val;
3275 return 0;
3276 }
8dc41944
HX
3277 case PACKET_AUXDATA:
3278 {
3279 int val;
3280
3281 if (optlen < sizeof(val))
3282 return -EINVAL;
3283 if (copy_from_user(&val, optval, sizeof(val)))
3284 return -EFAULT;
3285
3286 po->auxdata = !!val;
3287 return 0;
3288 }
80feaacb
PWJ
3289 case PACKET_ORIGDEV:
3290 {
3291 int val;
3292
3293 if (optlen < sizeof(val))
3294 return -EINVAL;
3295 if (copy_from_user(&val, optval, sizeof(val)))
3296 return -EFAULT;
3297
3298 po->origdev = !!val;
3299 return 0;
3300 }
bfd5f4a3
SS
3301 case PACKET_VNET_HDR:
3302 {
3303 int val;
3304
3305 if (sock->type != SOCK_RAW)
3306 return -EINVAL;
3307 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3308 return -EBUSY;
3309 if (optlen < sizeof(val))
3310 return -EINVAL;
3311 if (copy_from_user(&val, optval, sizeof(val)))
3312 return -EFAULT;
3313
3314 po->has_vnet_hdr = !!val;
3315 return 0;
3316 }
614f60fa
SM
3317 case PACKET_TIMESTAMP:
3318 {
3319 int val;
3320
3321 if (optlen != sizeof(val))
3322 return -EINVAL;
3323 if (copy_from_user(&val, optval, sizeof(val)))
3324 return -EFAULT;
3325
3326 po->tp_tstamp = val;
3327 return 0;
3328 }
dc99f600
DM
3329 case PACKET_FANOUT:
3330 {
3331 int val;
3332
3333 if (optlen != sizeof(val))
3334 return -EINVAL;
3335 if (copy_from_user(&val, optval, sizeof(val)))
3336 return -EFAULT;
3337
3338 return fanout_add(sk, val & 0xffff, val >> 16);
3339 }
5920cd3a
PC
3340 case PACKET_TX_HAS_OFF:
3341 {
3342 unsigned int val;
3343
3344 if (optlen != sizeof(val))
3345 return -EINVAL;
3346 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3347 return -EBUSY;
3348 if (copy_from_user(&val, optval, sizeof(val)))
3349 return -EFAULT;
3350 po->tp_tx_has_off = !!val;
3351 return 0;
3352 }
d346a3fa
DB
3353 case PACKET_QDISC_BYPASS:
3354 {
3355 int val;
3356
3357 if (optlen != sizeof(val))
3358 return -EINVAL;
3359 if (copy_from_user(&val, optval, sizeof(val)))
3360 return -EFAULT;
3361
3362 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3363 return 0;
3364 }
1da177e4
LT
3365 default:
3366 return -ENOPROTOOPT;
3367 }
3368}
3369
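/* Illustrative userspace sketch (not part of af_packet.c) exercising a
 * few of the options handled by packet_setsockopt() above. The fanout
 * group id (42) is arbitrary; error handling is trimmed.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

void sockopt_sketch(int fd)
{
        int ver = TPACKET_V3;           /* must be set before PACKET_RX_RING */
        int bypass = 1;                 /* PACKET_QDISC_BYPASS: po->xmit = packet_direct_xmit */
        int fanout = 42 | (PACKET_FANOUT_HASH << 16);   /* id | (type << 16) */

        setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
        setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &bypass, sizeof(bypass));
        setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout, sizeof(fanout));
}
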
3370static int packet_getsockopt(struct socket *sock, int level, int optname,
3371 char __user *optval, int __user *optlen)
3372{
3373 int len;
c06fff6e 3374 int val, lv = sizeof(val);
1da177e4
LT
3375 struct sock *sk = sock->sk;
3376 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3377 void *data = &val;
ee80fbf3 3378 union tpacket_stats_u st;
1da177e4
LT
3379
3380 if (level != SOL_PACKET)
3381 return -ENOPROTOOPT;
3382
8ae55f04
KK
3383 if (get_user(len, optlen))
3384 return -EFAULT;
1da177e4
LT
3385
3386 if (len < 0)
3387 return -EINVAL;
1ce4f28b 3388
69e3c75f 3389 switch (optname) {
1da177e4 3390 case PACKET_STATISTICS:
1da177e4 3391 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3392 memcpy(&st, &po->stats, sizeof(st));
3393 memset(&po->stats, 0, sizeof(po->stats));
3394 spin_unlock_bh(&sk->sk_receive_queue.lock);
3395
f6fb8f10 3396 if (po->tp_version == TPACKET_V3) {
c06fff6e 3397 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3398 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3399 data = &st.stats3;
f6fb8f10 3400 } else {
c06fff6e 3401 lv = sizeof(struct tpacket_stats);
8bcdeaff 3402 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3403 data = &st.stats1;
f6fb8f10 3404 }
ee80fbf3 3405
8dc41944
HX
3406 break;
3407 case PACKET_AUXDATA:
8dc41944 3408 val = po->auxdata;
80feaacb
PWJ
3409 break;
3410 case PACKET_ORIGDEV:
80feaacb 3411 val = po->origdev;
bfd5f4a3
SS
3412 break;
3413 case PACKET_VNET_HDR:
bfd5f4a3 3414 val = po->has_vnet_hdr;
1da177e4 3415 break;
bbd6ef87 3416 case PACKET_VERSION:
bbd6ef87 3417 val = po->tp_version;
bbd6ef87
PM
3418 break;
3419 case PACKET_HDRLEN:
3420 if (len > sizeof(int))
3421 len = sizeof(int);
3422 if (copy_from_user(&val, optval, len))
3423 return -EFAULT;
3424 switch (val) {
3425 case TPACKET_V1:
3426 val = sizeof(struct tpacket_hdr);
3427 break;
3428 case TPACKET_V2:
3429 val = sizeof(struct tpacket2_hdr);
3430 break;
f6fb8f10 3431 case TPACKET_V3:
3432 val = sizeof(struct tpacket3_hdr);
3433 break;
bbd6ef87
PM
3434 default:
3435 return -EINVAL;
3436 }
bbd6ef87 3437 break;
8913336a 3438 case PACKET_RESERVE:
8913336a 3439 val = po->tp_reserve;
8913336a 3440 break;
69e3c75f 3441 case PACKET_LOSS:
69e3c75f 3442 val = po->tp_loss;
69e3c75f 3443 break;
614f60fa 3444 case PACKET_TIMESTAMP:
614f60fa 3445 val = po->tp_tstamp;
614f60fa 3446 break;
dc99f600 3447 case PACKET_FANOUT:
dc99f600
DM
3448 val = (po->fanout ?
3449 ((u32)po->fanout->id |
77f65ebd
WB
3450 ((u32)po->fanout->type << 16) |
3451 ((u32)po->fanout->flags << 24)) :
dc99f600 3452 0);
dc99f600 3453 break;
5920cd3a
PC
3454 case PACKET_TX_HAS_OFF:
3455 val = po->tp_tx_has_off;
3456 break;
d346a3fa
DB
3457 case PACKET_QDISC_BYPASS:
3458 val = packet_use_direct_xmit(po);
3459 break;
1da177e4
LT
3460 default:
3461 return -ENOPROTOOPT;
3462 }
3463
c06fff6e
ED
3464 if (len > lv)
3465 len = lv;
8ae55f04
KK
3466 if (put_user(len, optlen))
3467 return -EFAULT;
8dc41944
HX
3468 if (copy_to_user(optval, data, len))
3469 return -EFAULT;
8ae55f04 3470 return 0;
1da177e4
LT
3471}
3472
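/* Illustrative userspace sketch (not part of af_packet.c): reading the
 * counters returned by packet_getsockopt(PACKET_STATISTICS) above. The
 * counters are clear-on-read, and tp_packets includes tp_drops, as the
 * code above shows. Error handling is trimmed.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

void stats_sketch(int fd)
{
        struct tpacket_stats st;
        socklen_t len = sizeof(st);

        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}
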
3473
351638e7
JP
3474static int packet_notifier(struct notifier_block *this,
3475 unsigned long msg, void *ptr)
1da177e4
LT
3476{
3477 struct sock *sk;
351638e7 3478 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3479 struct net *net = dev_net(dev);
1da177e4 3480
808f5114 3481 rcu_read_lock();
b67bfe0d 3482 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3483 struct packet_sock *po = pkt_sk(sk);
3484
3485 switch (msg) {
3486 case NETDEV_UNREGISTER:
1da177e4
LT
3487 if (po->mclist)
3488 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3489 /* fallthrough */
3490
1da177e4
LT
3491 case NETDEV_DOWN:
3492 if (dev->ifindex == po->ifindex) {
3493 spin_lock(&po->bind_lock);
3494 if (po->running) {
ce06b03e 3495 __unregister_prot_hook(sk, false);
1da177e4
LT
3496 sk->sk_err = ENETDOWN;
3497 if (!sock_flag(sk, SOCK_DEAD))
3498 sk->sk_error_report(sk);
3499 }
3500 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3501 packet_cached_dev_reset(po);
1da177e4 3502 po->ifindex = -1;
160ff18a
BG
3503 if (po->prot_hook.dev)
3504 dev_put(po->prot_hook.dev);
1da177e4
LT
3505 po->prot_hook.dev = NULL;
3506 }
3507 spin_unlock(&po->bind_lock);
3508 }
3509 break;
3510 case NETDEV_UP:
808f5114 3511 if (dev->ifindex == po->ifindex) {
3512 spin_lock(&po->bind_lock);
ce06b03e
DM
3513 if (po->num)
3514 register_prot_hook(sk);
808f5114 3515 spin_unlock(&po->bind_lock);
1da177e4 3516 }
1da177e4
LT
3517 break;
3518 }
3519 }
808f5114 3520 rcu_read_unlock();
1da177e4
LT
3521 return NOTIFY_DONE;
3522}
3523
3524
3525static int packet_ioctl(struct socket *sock, unsigned int cmd,
3526 unsigned long arg)
3527{
3528 struct sock *sk = sock->sk;
3529
69e3c75f 3530 switch (cmd) {
40d4e3df
ED
3531 case SIOCOUTQ:
3532 {
3533 int amount = sk_wmem_alloc_get(sk);
31e6d363 3534
40d4e3df
ED
3535 return put_user(amount, (int __user *)arg);
3536 }
3537 case SIOCINQ:
3538 {
3539 struct sk_buff *skb;
3540 int amount = 0;
3541
3542 spin_lock_bh(&sk->sk_receive_queue.lock);
3543 skb = skb_peek(&sk->sk_receive_queue);
3544 if (skb)
3545 amount = skb->len;
3546 spin_unlock_bh(&sk->sk_receive_queue.lock);
3547 return put_user(amount, (int __user *)arg);
3548 }
3549 case SIOCGSTAMP:
3550 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3551 case SIOCGSTAMPNS:
3552 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3553
1da177e4 3554#ifdef CONFIG_INET
40d4e3df
ED
3555 case SIOCADDRT:
3556 case SIOCDELRT:
3557 case SIOCDARP:
3558 case SIOCGARP:
3559 case SIOCSARP:
3560 case SIOCGIFADDR:
3561 case SIOCSIFADDR:
3562 case SIOCGIFBRDADDR:
3563 case SIOCSIFBRDADDR:
3564 case SIOCGIFNETMASK:
3565 case SIOCSIFNETMASK:
3566 case SIOCGIFDSTADDR:
3567 case SIOCSIFDSTADDR:
3568 case SIOCSIFFLAGS:
40d4e3df 3569 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3570#endif
3571
40d4e3df
ED
3572 default:
3573 return -ENOIOCTLCMD;
1da177e4
LT
3574 }
3575 return 0;
3576}
3577
40d4e3df 3578static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3579 poll_table *wait)
3580{
3581 struct sock *sk = sock->sk;
3582 struct packet_sock *po = pkt_sk(sk);
3583 unsigned int mask = datagram_poll(file, sock, wait);
3584
3585 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3586 if (po->rx_ring.pg_vec) {
f6fb8f10 3587 if (!packet_previous_rx_frame(po, &po->rx_ring,
3588 TP_STATUS_KERNEL))
1da177e4
LT
3589 mask |= POLLIN | POLLRDNORM;
3590 }
3591 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3592 spin_lock_bh(&sk->sk_write_queue.lock);
3593 if (po->tx_ring.pg_vec) {
3594 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3595 mask |= POLLOUT | POLLWRNORM;
3596 }
3597 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3598 return mask;
3599}
3600
3601
 3602/* Dirty? Well, I still have not found a better way to account
3603 * for user mmaps.
3604 */
3605
3606static void packet_mm_open(struct vm_area_struct *vma)
3607{
3608 struct file *file = vma->vm_file;
40d4e3df 3609 struct socket *sock = file->private_data;
1da177e4 3610 struct sock *sk = sock->sk;
1ce4f28b 3611
1da177e4
LT
3612 if (sk)
3613 atomic_inc(&pkt_sk(sk)->mapped);
3614}
3615
3616static void packet_mm_close(struct vm_area_struct *vma)
3617{
3618 struct file *file = vma->vm_file;
40d4e3df 3619 struct socket *sock = file->private_data;
1da177e4 3620 struct sock *sk = sock->sk;
1ce4f28b 3621
1da177e4
LT
3622 if (sk)
3623 atomic_dec(&pkt_sk(sk)->mapped);
3624}
3625
f0f37e2f 3626static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3627 .open = packet_mm_open,
3628 .close = packet_mm_close,
1da177e4
LT
3629};
3630
0e3125c7
NH
3631static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3632 unsigned int len)
1da177e4
LT
3633{
3634 int i;
3635
4ebf0ae2 3636 for (i = 0; i < len; i++) {
0e3125c7 3637 if (likely(pg_vec[i].buffer)) {
c56b4d90 3638 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3639 vfree(pg_vec[i].buffer);
3640 else
3641 free_pages((unsigned long)pg_vec[i].buffer,
3642 order);
3643 pg_vec[i].buffer = NULL;
3644 }
1da177e4
LT
3645 }
3646 kfree(pg_vec);
3647}
3648
eea49cc9 3649static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3650{
0e3125c7
NH
3651 char *buffer = NULL;
3652 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3653 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3654
3655 buffer = (char *) __get_free_pages(gfp_flags, order);
3656
3657 if (buffer)
3658 return buffer;
3659
3660 /*
3661 * __get_free_pages failed, fall back to vmalloc
3662 */
bbce5a59 3663 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3664
0e3125c7
NH
3665 if (buffer)
3666 return buffer;
3667
3668 /*
 3669 * vmalloc failed, let's dig into swap here
3670 */
0e3125c7
NH
3671 gfp_flags &= ~__GFP_NORETRY;
3672 buffer = (char *)__get_free_pages(gfp_flags, order);
3673 if (buffer)
3674 return buffer;
3675
3676 /*
3677 * complete and utter failure
3678 */
3679 return NULL;
4ebf0ae2
DM
3680}
3681
0e3125c7 3682static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3683{
3684 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3685 struct pgv *pg_vec;
4ebf0ae2
DM
3686 int i;
3687
0e3125c7 3688 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3689 if (unlikely(!pg_vec))
3690 goto out;
3691
3692 for (i = 0; i < block_nr; i++) {
c56b4d90 3693 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3694 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3695 goto out_free_pgvec;
3696 }
3697
3698out:
3699 return pg_vec;
3700
3701out_free_pgvec:
3702 free_pg_vec(pg_vec, order, block_nr);
3703 pg_vec = NULL;
3704 goto out;
3705}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Alias of the V1/V2 request, kept to minimise code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on the tx ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
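
/*
 * Usage sketch (user-space, illustrative only): packet_set_ring() is what
 * ultimately services the PACKET_RX_RING/PACKET_TX_RING setsockopt()s, so
 * the checks above become constraints on the tpacket_req a program passes
 * in: tp_block_size must be a multiple of PAGE_SIZE, tp_frame_size must be
 * TPACKET_ALIGNMENT-aligned and at least tp_hdrlen + tp_reserve, and
 * tp_frame_nr must equal frames-per-block times tp_block_nr.  A minimal
 * TPACKET_V2 receive-ring setup under those assumptions (the function name
 * setup_rx_ring and the ring geometry are made up for the example, and
 * error handling is mostly elided):
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	int setup_rx_ring(void)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		int ver = TPACKET_V2;
 *		struct tpacket_req req = {
 *			.tp_block_size = 1 << 16,   // 64 KiB, multiple of PAGE_SIZE
 *			.tp_block_nr   = 64,
 *			.tp_frame_size = 1 << 11,   // 2 KiB, TPACKET_ALIGNMENT ok
 *			.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *		};
 *
 *		if (fd < 0 ||
 *		    setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) ||
 *		    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)))
 *			return -1;
 *		return fd;
 *	}
 *
 * PACKET_VERSION has to be set before the ring is requested, since the
 * header length checked above depends on po->tp_version.
 */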

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
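
/*
 * Usage sketch, continued (user-space, illustrative only): one mmap() of
 * the whole region (rx ring followed by tx ring, exactly the expected_size
 * computed above) exposes the frames; each frame begins with a
 * tpacket2_hdr whose tp_status passes ownership between kernel and user
 * space.  Because tp_frame_size divides tp_block_size exactly in the
 * setup_rx_ring() example, frame i simply sits at i * tp_frame_size.  The
 * name rx_loop is made up for the example and error handling is elided:
 *
 *	#include <poll.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *
 *	void rx_loop(int fd, const struct tpacket_req *req)
 *	{
 *		size_t sz = (size_t)req->tp_block_size * req->tp_block_nr;
 *		char *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				  MAP_SHARED, fd, 0);
 *		unsigned int i = 0;
 *
 *		for (;;) {
 *			struct tpacket2_hdr *hdr = (void *)(ring +
 *					(size_t)i * req->tp_frame_size);
 *
 *			if (!(hdr->tp_status & TP_STATUS_USER)) {
 *				struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *				poll(&pfd, 1, -1);	// wait for the kernel
 *				continue;
 *			}
 *			// packet data: tp_snaplen bytes at (char *)hdr + hdr->tp_mac
 *			hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *			i = (i + 1) % req->tp_frame_nr;
 *		}
 *	}
 */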

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
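
/*
 * Illustrative only: the seq_file hooks above back /proc/net/packet, which
 * prints one line per packet socket in the namespace using the header and
 * format string shown in packet_seq_show().  A trivial user-space reader,
 * assuming CONFIG_PROC_FS is enabled (the function name is made up for the
 * example):
 *
 *	#include <stdio.h>
 *
 *	void dump_packet_sockets(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/packet", "r");
 *
 *		if (!f)
 *			return;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);	// sk, refcnt, type, proto, iface, ...
 *		fclose(f);
 *	}
 */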

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);