net/packet/af_packet.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
54
55#include <linux/types.h>
56#include <linux/mm.h>
57#include <linux/capability.h>
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
65#include <linux/kernel.h>
66#include <linux/kmod.h>
67#include <linux/slab.h>
68#include <linux/vmalloc.h>
69#include <net/net_namespace.h>
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
79#include <asm/cacheflush.h>
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
86#include <linux/mutex.h>
87#include <linux/if_vlan.h>
88#include <linux/virtio_net.h>
89#include <linux/errqueue.h>
90#include <linux/net_tstamp.h>
91#include <linux/reciprocal_div.h>
92#include <linux/percpu.h>
93#ifdef CONFIG_INET
94#include <net/inet_common.h>
95#endif
96
97#include "internal.h"
98
99/*
100 Assumptions:
101 - if device has no dev->hard_header routine, it adds and removes ll header
102 inside itself. In this case ll header is invisible outside of device,
103 but higher levels still should reserve dev->hard_header_len.
104 Some devices are clever enough to reallocate the skb when the header
105 will not fit in the reserved space (tunnel); others are silly
106 (PPP).
107 - packet socket receives packets with pulled ll header,
108 so that SOCK_RAW should push it back.
109
110On receive:
111-----------
112
113Incoming, dev->hard_header!=NULL
114 mac_header -> ll header
115 data -> data
116
117Outgoing, dev->hard_header!=NULL
118 mac_header -> ll header
119 data -> ll header
120
121Incoming, dev->hard_header==NULL
122 mac_header -> UNKNOWN position. It is very likely that it points to the ll
123 header. PPP does this, which is wrong, because it introduces
124 asymmetry between rx and tx paths.
125 data -> data
126
127Outgoing, dev->hard_header==NULL
128 mac_header -> data. ll header is still not built!
129 data -> data
130
131Resume
132 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
133
134
135On transmit:
136------------
137
138dev->hard_header != NULL
139 mac_header -> ll header
140 data -> ll header
141
142dev->hard_header == NULL (ll header is added by device, we cannot control it)
143 mac_header -> data
144 data -> data
145
146 We should set nh.raw on output to the correct position,
147 packet classifier depends on it.
148 */
149
150/* Private packet socket structures. */
151
152/* identical to struct packet_mreq except it has
153 * a longer address field.
154 */
155struct packet_mreq_max {
156 int mr_ifindex;
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
160};
161
162union tpacket_uhdr {
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
166 void *raw;
167};
168
169static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
170 int closing, int tx_ring);
171
172#define V3_ALIGNMENT (8)
173
174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
179#define PGV_FROM_VMALLOC 1
180
181#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
182#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
183#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
184#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
188
189struct packet_sock;
190static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
191static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
192 struct packet_type *pt, struct net_device *orig_dev);
193
194static void *packet_previous_frame(struct packet_sock *po,
195 struct packet_ring_buffer *rb,
196 int status);
197static void packet_increment_head(struct packet_ring_buffer *buff);
198static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
199 struct tpacket_block_desc *);
200static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
201 struct packet_sock *);
202static void prb_retire_current_block(struct tpacket_kbdq_core *,
203 struct packet_sock *, unsigned int status);
204static int prb_queue_frozen(struct tpacket_kbdq_core *);
205static void prb_open_block(struct tpacket_kbdq_core *,
206 struct tpacket_block_desc *);
207static void prb_retire_rx_blk_timer_expired(unsigned long);
208static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
209static void prb_init_blk_timer(struct packet_sock *,
210 struct tpacket_kbdq_core *,
211 void (*func) (unsigned long));
212static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
213static void prb_clear_rxhash(struct tpacket_kbdq_core *,
214 struct tpacket3_hdr *);
215static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
216 struct tpacket3_hdr *);
217static void packet_flush_mclist(struct sock *sk);
218
219struct packet_skb_cb {
220 unsigned int origlen;
221 union {
222 struct sockaddr_pkt pkt;
223 struct sockaddr_ll ll;
224 } sa;
225};
226
227#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
228
229#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
230#define GET_PBLOCK_DESC(x, bid) \
231 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
232#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
233 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
234#define GET_NEXT_PRB_BLK_NUM(x) \
235 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
236 ((x)->kactive_blk_num+1) : 0)
237
238static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
239static void __fanout_link(struct sock *sk, struct packet_sock *po);
240
241static int packet_direct_xmit(struct sk_buff *skb)
242{
243 struct net_device *dev = skb->dev;
244 const struct net_device_ops *ops = dev->netdev_ops;
245 netdev_features_t features;
246 struct netdev_queue *txq;
247 u16 queue_map;
248 int ret;
249
250 if (unlikely(!netif_running(dev) ||
251 !netif_carrier_ok(dev))) {
252 kfree_skb(skb);
253 return NET_XMIT_DROP;
254 }
255
256 features = netif_skb_features(skb);
257 if (skb_needs_linearize(skb, features) &&
258 __skb_linearize(skb)) {
259 kfree_skb(skb);
260 return NET_XMIT_DROP;
261 }
262
263 queue_map = skb_get_queue_mapping(skb);
264 txq = netdev_get_tx_queue(dev, queue_map);
265
266 __netif_tx_lock_bh(txq);
267 if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
268 ret = NETDEV_TX_BUSY;
269 kfree_skb(skb);
270 goto out;
271 }
272
273 ret = ops->ndo_start_xmit(skb, dev);
274 if (likely(dev_xmit_complete(ret)))
275 txq_trans_update(txq);
276 else
277 kfree_skb(skb);
278out:
279 __netif_tx_unlock_bh(txq);
280 return ret;
281}
282
66e56cd4
DB
283static struct net_device *packet_cached_dev_get(struct packet_sock *po)
284{
285 struct net_device *dev;
286
287 rcu_read_lock();
288 dev = rcu_dereference(po->cached_dev);
289 if (likely(dev))
290 dev_hold(dev);
291 rcu_read_unlock();
292
293 return dev;
294}
295
296static void packet_cached_dev_assign(struct packet_sock *po,
297 struct net_device *dev)
298{
299 rcu_assign_pointer(po->cached_dev, dev);
300}
301
302static void packet_cached_dev_reset(struct packet_sock *po)
303{
304 RCU_INIT_POINTER(po->cached_dev, NULL);
305}
306
d346a3fa
DB
307static bool packet_use_direct_xmit(const struct packet_sock *po)
308{
309 return po->xmit == packet_direct_xmit;
310}
311
312static u16 packet_pick_tx_queue(struct net_device *dev)
313{
1cbac010 314 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
315}
316
ce06b03e
DM
317/* register_prot_hook must be invoked with the po->bind_lock held,
318 * or from a context in which asynchronous accesses to the packet
319 * socket are not possible (packet_create()).
320 */
321static void register_prot_hook(struct sock *sk)
322{
323 struct packet_sock *po = pkt_sk(sk);
e40526cb 324
ce06b03e 325 if (!po->running) {
66e56cd4 326 if (po->fanout)
dc99f600 327 __fanout_link(sk, po);
66e56cd4 328 else
dc99f600 329 dev_add_pack(&po->prot_hook);
e40526cb 330
ce06b03e
DM
331 sock_hold(sk);
332 po->running = 1;
333 }
334}
335
336/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
337 * held. If the sync parameter is true, we will temporarily drop
338 * the po->bind_lock and do a synchronize_net to make sure no
339 * asynchronous packet processing paths still refer to the elements
340 * of po->prot_hook. If the sync parameter is false, it is the
341 * caller's responsibility to take care of this.
342 */
343static void __unregister_prot_hook(struct sock *sk, bool sync)
344{
345 struct packet_sock *po = pkt_sk(sk);
346
347 po->running = 0;
66e56cd4
DB
348
349 if (po->fanout)
dc99f600 350 __fanout_unlink(sk, po);
66e56cd4 351 else
dc99f600 352 __dev_remove_pack(&po->prot_hook);
e40526cb 353
ce06b03e
DM
354 __sock_put(sk);
355
356 if (sync) {
357 spin_unlock(&po->bind_lock);
358 synchronize_net();
359 spin_lock(&po->bind_lock);
360 }
361}
362
363static void unregister_prot_hook(struct sock *sk, bool sync)
364{
365 struct packet_sock *po = pkt_sk(sk);
366
367 if (po->running)
368 __unregister_prot_hook(sk, sync);
369}
370
f6dafa95 371static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
372{
373 if (is_vmalloc_addr(addr))
374 return vmalloc_to_page(addr);
375 return virt_to_page(addr);
376}
377
69e3c75f 378static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 379{
184f489e 380 union tpacket_uhdr h;
1da177e4 381
69e3c75f 382 h.raw = frame;
bbd6ef87
PM
383 switch (po->tp_version) {
384 case TPACKET_V1:
69e3c75f 385 h.h1->tp_status = status;
0af55bb5 386 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
387 break;
388 case TPACKET_V2:
69e3c75f 389 h.h2->tp_status = status;
0af55bb5 390 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 391 break;
f6fb8f10 392 case TPACKET_V3:
69e3c75f 393 default:
f6fb8f10 394 WARN(1, "TPACKET version not supported.\n");
69e3c75f 395 BUG();
bbd6ef87 396 }
69e3c75f
JB
397
398 smp_wmb();
bbd6ef87
PM
399}
400
69e3c75f 401static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 402{
184f489e 403 union tpacket_uhdr h;
bbd6ef87 404
69e3c75f
JB
405 smp_rmb();
406
bbd6ef87
PM
407 h.raw = frame;
408 switch (po->tp_version) {
409 case TPACKET_V1:
0af55bb5 410 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 411 return h.h1->tp_status;
bbd6ef87 412 case TPACKET_V2:
0af55bb5 413 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 414 return h.h2->tp_status;
f6fb8f10 415 case TPACKET_V3:
69e3c75f 416 default:
f6fb8f10 417 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
418 BUG();
419 return 0;
bbd6ef87 420 }
1da177e4 421}
69e3c75f 422
b9c32fb2
DB
423static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
424 unsigned int flags)
7a51384c
DB
425{
426 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
427
428 if (shhwtstamps) {
429 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
430 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 431 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
432 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
433 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 434 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
435 }
436
437 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 438 return TP_STATUS_TS_SOFTWARE;
7a51384c 439
b9c32fb2 440 return 0;
7a51384c
DB
441}
442
b9c32fb2
DB
443static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
444 struct sk_buff *skb)
2e31396f
WB
445{
446 union tpacket_uhdr h;
447 struct timespec ts;
b9c32fb2 448 __u32 ts_status;
2e31396f 449
b9c32fb2
DB
450 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
451 return 0;
2e31396f
WB
452
453 h.raw = frame;
454 switch (po->tp_version) {
455 case TPACKET_V1:
456 h.h1->tp_sec = ts.tv_sec;
457 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
458 break;
459 case TPACKET_V2:
460 h.h2->tp_sec = ts.tv_sec;
461 h.h2->tp_nsec = ts.tv_nsec;
462 break;
463 case TPACKET_V3:
464 default:
465 WARN(1, "TPACKET version not supported.\n");
466 BUG();
467 }
468
469 /* one flush is safe, as both fields always lie on the same cacheline */
470 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
471 smp_wmb();
b9c32fb2
DB
472
473 return ts_status;
2e31396f
WB
474}
475
69e3c75f
JB
476static void *packet_lookup_frame(struct packet_sock *po,
477 struct packet_ring_buffer *rb,
478 unsigned int position,
479 int status)
480{
481 unsigned int pg_vec_pos, frame_offset;
184f489e 482 union tpacket_uhdr h;
69e3c75f
JB
483
484 pg_vec_pos = position / rb->frames_per_block;
485 frame_offset = position % rb->frames_per_block;
486
0e3125c7
NH
487 h.raw = rb->pg_vec[pg_vec_pos].buffer +
488 (frame_offset * rb->frame_size);
69e3c75f
JB
489
490 if (status != __packet_get_status(po, h.raw))
491 return NULL;
492
493 return h.raw;
494}
495
eea49cc9 496static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
497 struct packet_ring_buffer *rb,
498 int status)
499{
500 return packet_lookup_frame(po, rb, rb->head, status);
501}
502
bc59ba39 503static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 504{
505 del_timer_sync(&pkc->retire_blk_timer);
506}
507
508static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
509 int tx_ring,
510 struct sk_buff_head *rb_queue)
511{
bc59ba39 512 struct tpacket_kbdq_core *pkc;
f6fb8f10 513
22781a5b
DJ
514 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
515 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 516
ec6f809f 517 spin_lock_bh(&rb_queue->lock);
f6fb8f10 518 pkc->delete_blk_timer = 1;
ec6f809f 519 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 520
521 prb_del_retire_blk_timer(pkc);
522}
523
524static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 525 struct tpacket_kbdq_core *pkc,
f6fb8f10 526 void (*func) (unsigned long))
527{
528 init_timer(&pkc->retire_blk_timer);
529 pkc->retire_blk_timer.data = (long)po;
530 pkc->retire_blk_timer.function = func;
531 pkc->retire_blk_timer.expires = jiffies;
532}
533
534static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
535{
bc59ba39 536 struct tpacket_kbdq_core *pkc;
f6fb8f10 537
538 if (tx_ring)
539 BUG();
540
22781a5b
DJ
541 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
542 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 543 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
544}
545
546static int prb_calc_retire_blk_tmo(struct packet_sock *po,
547 int blk_size_in_bytes)
548{
549 struct net_device *dev;
550 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
551 struct ethtool_cmd ecmd;
552 int err;
e440cf2c 553 u32 speed;
f6fb8f10 554
4bc71cb9
JP
555 rtnl_lock();
556 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
557 if (unlikely(!dev)) {
558 rtnl_unlock();
f6fb8f10 559 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
560 }
561 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 562 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
563 rtnl_unlock();
564 if (!err) {
4bc71cb9
JP
565 /*
566 * If the link speed is so slow you don't really
567 * need to worry about perf anyway.
568 */
e440cf2c 569 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 570 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 571 } else {
572 msec = 1;
573 div = speed / 1000;
f6fb8f10 574 }
575 }
576
577 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
578
579 if (div)
580 mbits /= div;
581
582 tmo = mbits * msec;
583
584 if (div)
585 return tmo+1;
586 return tmo;
587}
588
bc59ba39 589static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 590 union tpacket_req_u *req_u)
591{
592 p1->feature_req_word = req_u->req3.tp_feature_req_word;
593}
594
595static void init_prb_bdqc(struct packet_sock *po,
596 struct packet_ring_buffer *rb,
597 struct pgv *pg_vec,
598 union tpacket_req_u *req_u, int tx_ring)
599{
22781a5b 600 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 601 struct tpacket_block_desc *pbd;
f6fb8f10 602
603 memset(p1, 0x0, sizeof(*p1));
604
605 p1->knxt_seq_num = 1;
606 p1->pkbdq = pg_vec;
bc59ba39 607 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 608 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 609 p1->kblk_size = req_u->req3.tp_block_size;
610 p1->knum_blocks = req_u->req3.tp_block_nr;
611 p1->hdrlen = po->tp_hdrlen;
612 p1->version = po->tp_version;
613 p1->last_kactive_blk_num = 0;
ee80fbf3 614 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 615 if (req_u->req3.tp_retire_blk_tov)
616 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
617 else
618 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
619 req_u->req3.tp_block_size);
620 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
621 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
622
623 prb_init_ft_ops(p1, req_u);
624 prb_setup_retire_blk_timer(po, tx_ring);
625 prb_open_block(p1, pbd);
626}
627
628/* Do NOT update the last_blk_num first.
629 * Assumes sk_buff_head lock is held.
630 */
bc59ba39 631static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 632{
633 mod_timer(&pkc->retire_blk_timer,
634 jiffies + pkc->tov_in_jiffies);
635 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
636}
637
638/*
639 * Timer logic:
640 * 1) We refresh the timer only when we open a block.
641 * By doing this we don't waste cycles refreshing the timer
642 * on a packet-by-packet basis.
643 *
644 * With a 1MB block-size, on a 1Gbps line, it will take
645 * i) ~8 ms to fill a block + ii) memcpy etc.
646 * In this cut we are not accounting for the memcpy time.
647 *
648 * So, if the user sets the 'tmo' to 10ms then the timer
649 * will never fire while the block is still getting filled
650 * (which is what we want). However, the user could choose
651 * to close a block early and that's fine.
652 *
653 * But when the timer does fire, we check whether or not to refresh it.
654 * Since the tmo granularity is in msecs, it is not too expensive
655 * to refresh the timer, let's say every '8' msecs.
656 * Either the user can set the 'tmo' or we can derive it based on
657 * a) line-speed and b) block-size.
658 * prb_calc_retire_blk_tmo() calculates the tmo.
659 *
660 */
661static void prb_retire_rx_blk_timer_expired(unsigned long data)
662{
663 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 664 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 665 unsigned int frozen;
bc59ba39 666 struct tpacket_block_desc *pbd;
f6fb8f10 667
668 spin_lock(&po->sk.sk_receive_queue.lock);
669
670 frozen = prb_queue_frozen(pkc);
671 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
672
673 if (unlikely(pkc->delete_blk_timer))
674 goto out;
675
676 /* We only need to plug the race when the block is partially filled.
677 * tpacket_rcv:
678 * lock(); increment BLOCK_NUM_PKTS; unlock()
679 * copy_bits() is in progress ...
680 * timer fires on other cpu:
681 * we can't retire the current block because copy_bits
682 * is in progress.
683 *
684 */
685 if (BLOCK_NUM_PKTS(pbd)) {
686 while (atomic_read(&pkc->blk_fill_in_prog)) {
687 /* Waiting for skb_copy_bits to finish... */
688 cpu_relax();
689 }
690 }
691
692 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
693 if (!frozen) {
694 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
695 if (!prb_dispatch_next_block(pkc, po))
696 goto refresh_timer;
697 else
698 goto out;
699 } else {
700 /* Case 1. Queue was frozen because user-space was
701 * lagging behind.
702 */
703 if (prb_curr_blk_in_use(pkc, pbd)) {
704 /*
705 * Ok, user-space is still behind.
706 * So just refresh the timer.
707 */
708 goto refresh_timer;
709 } else {
710 /* Case 2. Queue was frozen, user-space caught up,
711 * now the link went idle && the timer fired.
712 * We don't have a block to close. So we open this
713 * block and restart the timer.
714 * Opening a block thaws the queue and restarts the timer.
715 * Thawing/timer-refresh is a side effect.
716 */
717 prb_open_block(pkc, pbd);
718 goto out;
719 }
720 }
721 }
722
723refresh_timer:
724 _prb_refresh_rx_retire_blk_timer(pkc);
725
726out:
727 spin_unlock(&po->sk.sk_receive_queue.lock);
728}
729
eea49cc9 730static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 731 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 732{
733 /* Flush everything minus the block header */
734
735#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
736 u8 *start, *end;
737
738 start = (u8 *)pbd1;
739
740 /* Skip the block header(we know header WILL fit in 4K) */
741 start += PAGE_SIZE;
742
743 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
744 for (; start < end; start += PAGE_SIZE)
745 flush_dcache_page(pgv_to_page(start));
746
747 smp_wmb();
748#endif
749
750 /* Now update the block status. */
751
752 BLOCK_STATUS(pbd1) = status;
753
754 /* Flush the block header */
755
756#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
757 start = (u8 *)pbd1;
758 flush_dcache_page(pgv_to_page(start));
759
760 smp_wmb();
761#endif
762}
763
764/*
765 * Side effect:
766 *
767 * 1) flush the block
768 * 2) Increment active_blk_num
769 *
770 * Note: We DON'T refresh the timer on purpose,
771 * because almost always the next block will be opened.
772 */
bc59ba39 773static void prb_close_block(struct tpacket_kbdq_core *pkc1,
774 struct tpacket_block_desc *pbd1,
f6fb8f10 775 struct packet_sock *po, unsigned int stat)
776{
777 __u32 status = TP_STATUS_USER | stat;
778
779 struct tpacket3_hdr *last_pkt;
bc59ba39 780 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 781
ee80fbf3 782 if (po->stats.stats3.tp_drops)
f6fb8f10 783 status |= TP_STATUS_LOSING;
784
785 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
786 last_pkt->tp_next_offset = 0;
787
788 /* Get the ts of the last pkt */
789 if (BLOCK_NUM_PKTS(pbd1)) {
790 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
791 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
792 } else {
793 /* Ok, we tmo'd - so get the current time */
794 struct timespec ts;
795 getnstimeofday(&ts);
796 h1->ts_last_pkt.ts_sec = ts.tv_sec;
797 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
798 }
799
800 smp_wmb();
801
802 /* Flush the block */
803 prb_flush_block(pkc1, pbd1, status);
804
805 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
806}
807
eea49cc9 808static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 809{
810 pkc->reset_pending_on_curr_blk = 0;
811}
812
813/*
814 * Side effect of opening a block:
815 *
816 * 1) prb_queue is thawed.
817 * 2) retire_blk_timer is refreshed.
818 *
819 */
bc59ba39 820static void prb_open_block(struct tpacket_kbdq_core *pkc1,
821 struct tpacket_block_desc *pbd1)
f6fb8f10 822{
823 struct timespec ts;
bc59ba39 824 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 825
826 smp_rmb();
827
8da3056c
DB
828 /* We could have just memset this but we will lose the
829 * flexibility of making the priv area sticky
830 */
f6fb8f10 831
8da3056c
DB
832 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
833 BLOCK_NUM_PKTS(pbd1) = 0;
834 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 835
8da3056c
DB
836 getnstimeofday(&ts);
837
838 h1->ts_first_pkt.ts_sec = ts.tv_sec;
839 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 840
8da3056c
DB
841 pkc1->pkblk_start = (char *)pbd1;
842 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
843
844 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
845 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
846
847 pbd1->version = pkc1->version;
848 pkc1->prev = pkc1->nxt_offset;
849 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
850
851 prb_thaw_queue(pkc1);
852 _prb_refresh_rx_retire_blk_timer(pkc1);
853
854 smp_wmb();
f6fb8f10 855}
856
857/*
858 * Queue freeze logic:
859 * 1) Assume tp_block_nr = 8 blocks.
860 * 2) At time 't0', user opens Rx ring.
861 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
862 * 4) user-space is either sleeping or processing block '0'.
863 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
864 * it will close block-7, loop around and try to fill block '0'.
865 * call-flow:
866 * __packet_lookup_frame_in_block
867 * prb_retire_current_block()
868 * prb_dispatch_next_block()
869 * |->(BLOCK_STATUS == USER) evaluates to true
870 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
871 * 6) Now there are two cases:
872 * 6.1) Link goes idle right after the queue is frozen.
873 * But remember, the last open_block() refreshed the timer.
874 * When this timer expires, it will refresh itself so that we can
875 * re-open block-0 in the near future.
876 * 6.2) Link is busy and keeps on receiving packets. This is a simple
877 * case and __packet_lookup_frame_in_block will check if block-0
878 * is free and can now be re-used.
879 */
eea49cc9 880static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 881 struct packet_sock *po)
882{
883 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 884 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 885}
886
887#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
888
889/*
890 * If the next block is free then we will dispatch it
891 * and return a good offset.
892 * Else, we will freeze the queue.
893 * So, caller must check the return value.
894 */
bc59ba39 895static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 896 struct packet_sock *po)
897{
bc59ba39 898 struct tpacket_block_desc *pbd;
f6fb8f10 899
900 smp_rmb();
901
902 /* 1. Get current block num */
903 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
904
905 /* 2. If this block is currently in_use then freeze the queue */
906 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
907 prb_freeze_queue(pkc, po);
908 return NULL;
909 }
910
911 /*
912 * 3.
913 * open this block and return the offset where the first packet
914 * needs to get stored.
915 */
916 prb_open_block(pkc, pbd);
917 return (void *)pkc->nxt_offset;
918}
919
bc59ba39 920static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 921 struct packet_sock *po, unsigned int status)
922{
bc59ba39 923 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 924
925 /* retire/close the current block */
926 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
927 /*
928 * Plug the case where copy_bits() is in progress on
929 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
930 * have space to copy the pkt in the current block and
931 * called prb_retire_current_block()
932 *
933 * We don't need to worry about the TMO case because
934 * the timer-handler already handled this case.
935 */
936 if (!(status & TP_STATUS_BLK_TMO)) {
937 while (atomic_read(&pkc->blk_fill_in_prog)) {
938 /* Waiting for skb_copy_bits to finish... */
939 cpu_relax();
940 }
941 }
942 prb_close_block(pkc, pbd, po, status);
943 return;
944 }
f6fb8f10 945}
946
eea49cc9 947static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 948 struct tpacket_block_desc *pbd)
f6fb8f10 949{
950 return TP_STATUS_USER & BLOCK_STATUS(pbd);
951}
952
eea49cc9 953static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 954{
955 return pkc->reset_pending_on_curr_blk;
956}
957
eea49cc9 958static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 959{
bc59ba39 960 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 961 atomic_dec(&pkc->blk_fill_in_prog);
962}
963
eea49cc9 964static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 965 struct tpacket3_hdr *ppd)
966{
3958afa1 967 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 968}
969
eea49cc9 970static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 971 struct tpacket3_hdr *ppd)
972{
973 ppd->hv1.tp_rxhash = 0;
974}
975
eea49cc9 976static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 977 struct tpacket3_hdr *ppd)
978{
979 if (vlan_tx_tag_present(pkc->skb)) {
980 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
a0cdfcf3
AW
981 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
982 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 983 } else {
9e67030a 984 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 985 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 986 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 987 }
988}
989
bc59ba39 990static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 991 struct tpacket3_hdr *ppd)
992{
a0cdfcf3 993 ppd->hv1.tp_padding = 0;
f6fb8f10 994 prb_fill_vlan_info(pkc, ppd);
995
996 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
997 prb_fill_rxhash(pkc, ppd);
998 else
999 prb_clear_rxhash(pkc, ppd);
1000}
1001
eea49cc9 1002static void prb_fill_curr_block(char *curr,
bc59ba39 1003 struct tpacket_kbdq_core *pkc,
1004 struct tpacket_block_desc *pbd,
f6fb8f10 1005 unsigned int len)
1006{
1007 struct tpacket3_hdr *ppd;
1008
1009 ppd = (struct tpacket3_hdr *)curr;
1010 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1011 pkc->prev = curr;
1012 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1013 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1014 BLOCK_NUM_PKTS(pbd) += 1;
1015 atomic_inc(&pkc->blk_fill_in_prog);
1016 prb_run_all_ft_ops(pkc, ppd);
1017}
1018
1019/* Assumes caller has the sk->rx_queue.lock */
1020static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1021 struct sk_buff *skb,
1022 int status,
1023 unsigned int len
1024 )
1025{
bc59ba39 1026 struct tpacket_kbdq_core *pkc;
1027 struct tpacket_block_desc *pbd;
f6fb8f10 1028 char *curr, *end;
1029
e3192690 1030 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1031 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1032
1033 /* Queue is frozen when user space is lagging behind */
1034 if (prb_queue_frozen(pkc)) {
1035 /*
1036 * Check if the last block which caused the queue to freeze
1037 * is still in use by user-space.
1038 */
1039 if (prb_curr_blk_in_use(pkc, pbd)) {
1040 /* Can't record this packet */
1041 return NULL;
1042 } else {
1043 /*
1044 * Ok, the block was released by user-space.
1045 * Now let's open that block.
1046 * opening a block also thaws the queue.
1047 * Thawing is a side effect.
1048 */
1049 prb_open_block(pkc, pbd);
1050 }
1051 }
1052
1053 smp_mb();
1054 curr = pkc->nxt_offset;
1055 pkc->skb = skb;
e3192690 1056 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1057
1058 /* first try the current block */
1059 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1060 prb_fill_curr_block(curr, pkc, pbd, len);
1061 return (void *)curr;
1062 }
1063
1064 /* Ok, close the current block */
1065 prb_retire_current_block(pkc, po, 0);
1066
1067 /* Now, try to dispatch the next block */
1068 curr = (char *)prb_dispatch_next_block(pkc, po);
1069 if (curr) {
1070 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1071 prb_fill_curr_block(curr, pkc, pbd, len);
1072 return (void *)curr;
1073 }
1074
1075 /*
1076 * No free blocks are available. User-space hasn't caught up yet.
1077 * Queue was just frozen and now this packet will get dropped.
1078 */
1079 return NULL;
1080}
1081
eea49cc9 1082static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1083 struct sk_buff *skb,
1084 int status, unsigned int len)
1085{
1086 char *curr = NULL;
1087 switch (po->tp_version) {
1088 case TPACKET_V1:
1089 case TPACKET_V2:
1090 curr = packet_lookup_frame(po, &po->rx_ring,
1091 po->rx_ring.head, status);
1092 return curr;
1093 case TPACKET_V3:
1094 return __packet_lookup_frame_in_block(po, skb, status, len);
1095 default:
1096 WARN(1, "TPACKET version not supported\n");
1097 BUG();
99aa3473 1098 return NULL;
f6fb8f10 1099 }
1100}
1101
eea49cc9 1102static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1103 struct packet_ring_buffer *rb,
77f65ebd 1104 unsigned int idx,
f6fb8f10 1105 int status)
1106{
bc59ba39 1107 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1108 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1109
1110 if (status != BLOCK_STATUS(pbd))
1111 return NULL;
1112 return pbd;
1113}
1114
eea49cc9 1115static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1116{
1117 unsigned int prev;
1118 if (rb->prb_bdqc.kactive_blk_num)
1119 prev = rb->prb_bdqc.kactive_blk_num-1;
1120 else
1121 prev = rb->prb_bdqc.knum_blocks-1;
1122 return prev;
1123}
1124
1125/* Assumes caller has held the rx_queue.lock */
eea49cc9 1126static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 unsigned int previous = prb_previous_blk_num(rb);
1131 return prb_lookup_block(po, rb, previous, status);
1132}
1133
eea49cc9 1134static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1135 struct packet_ring_buffer *rb,
1136 int status)
1137{
1138 if (po->tp_version <= TPACKET_V2)
1139 return packet_previous_frame(po, rb, status);
1140
1141 return __prb_previous_block(po, rb, status);
1142}
1143
eea49cc9 1144static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1145 struct packet_ring_buffer *rb)
1146{
1147 switch (po->tp_version) {
1148 case TPACKET_V1:
1149 case TPACKET_V2:
1150 return packet_increment_head(rb);
1151 case TPACKET_V3:
1152 default:
1153 WARN(1, "TPACKET version not supported.\n");
1154 BUG();
1155 return;
1156 }
1157}
1158
eea49cc9 1159static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1160 struct packet_ring_buffer *rb,
1161 int status)
1162{
1163 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1164 return packet_lookup_frame(po, rb, previous, status);
1165}
1166
eea49cc9 1167static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1168{
1169 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1170}
1171
b0138408
DB
1172static void packet_inc_pending(struct packet_ring_buffer *rb)
1173{
1174 this_cpu_inc(*rb->pending_refcnt);
1175}
1176
1177static void packet_dec_pending(struct packet_ring_buffer *rb)
1178{
1179 this_cpu_dec(*rb->pending_refcnt);
1180}
1181
1182static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1183{
1184 unsigned int refcnt = 0;
1185 int cpu;
1186
1187 /* We don't use pending refcount in rx_ring. */
1188 if (rb->pending_refcnt == NULL)
1189 return 0;
1190
1191 for_each_possible_cpu(cpu)
1192 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1193
1194 return refcnt;
1195}
1196
1197static int packet_alloc_pending(struct packet_sock *po)
1198{
1199 po->rx_ring.pending_refcnt = NULL;
1200
1201 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1202 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1203 return -ENOBUFS;
1204
1205 return 0;
1206}
1207
1208static void packet_free_pending(struct packet_sock *po)
1209{
1210 free_percpu(po->tx_ring.pending_refcnt);
1211}
1212
77f65ebd
WB
1213static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1214{
1215 struct sock *sk = &po->sk;
1216 bool has_room;
1217
1218 if (po->prot_hook.func != tpacket_rcv)
1219 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1220 <= sk->sk_rcvbuf;
1221
1222 spin_lock(&sk->sk_receive_queue.lock);
1223 if (po->tp_version == TPACKET_V3)
1224 has_room = prb_lookup_block(po, &po->rx_ring,
1225 po->rx_ring.prb_bdqc.kactive_blk_num,
1226 TP_STATUS_KERNEL);
1227 else
1228 has_room = packet_lookup_frame(po, &po->rx_ring,
1229 po->rx_ring.head,
1230 TP_STATUS_KERNEL);
1231 spin_unlock(&sk->sk_receive_queue.lock);
1232
1233 return has_room;
1234}
1235
1da177e4
LT
1236static void packet_sock_destruct(struct sock *sk)
1237{
ed85b565
RC
1238 skb_queue_purge(&sk->sk_error_queue);
1239
547b792c
IJ
1240 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1241 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1242
1243 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1244 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1245 return;
1246 }
1247
17ab56a2 1248 sk_refcnt_debug_dec(sk);
1da177e4
LT
1249}
1250
dc99f600
DM
1251static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1252{
1253 int x = atomic_read(&f->rr_cur) + 1;
1254
1255 if (x >= num)
1256 x = 0;
1257
1258 return x;
1259}
1260
77f65ebd
WB
1261static unsigned int fanout_demux_hash(struct packet_fanout *f,
1262 struct sk_buff *skb,
1263 unsigned int num)
dc99f600 1264{
f55d112e 1265 return reciprocal_divide(skb->rxhash, num);
dc99f600
DM
1266}
1267
77f65ebd
WB
1268static unsigned int fanout_demux_lb(struct packet_fanout *f,
1269 struct sk_buff *skb,
1270 unsigned int num)
dc99f600
DM
1271{
1272 int cur, old;
1273
1274 cur = atomic_read(&f->rr_cur);
1275 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1276 fanout_rr_next(f, num))) != cur)
1277 cur = old;
77f65ebd
WB
1278 return cur;
1279}
1280
1281static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1282 struct sk_buff *skb,
1283 unsigned int num)
1284{
1285 return smp_processor_id() % num;
dc99f600
DM
1286}
1287
5df0ddfb
DB
1288static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1289 struct sk_buff *skb,
1290 unsigned int num)
1291{
1292 return reciprocal_divide(prandom_u32(), num);
1293}
1294
77f65ebd
WB
1295static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1296 struct sk_buff *skb,
1297 unsigned int idx, unsigned int skip,
1298 unsigned int num)
95ec3eb4 1299{
77f65ebd 1300 unsigned int i, j;
95ec3eb4 1301
77f65ebd
WB
1302 i = j = min_t(int, f->next[idx], num - 1);
1303 do {
1304 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1305 if (i != j)
1306 f->next[idx] = i;
1307 return i;
1308 }
1309 if (++i == num)
1310 i = 0;
1311 } while (i != j);
1312
1313 return idx;
1314}
1315
1316static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1317{
1318 return f->flags & (flag >> 8);
95ec3eb4
DM
1319}
1320
95ec3eb4
DM
1321static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1322 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1323{
1324 struct packet_fanout *f = pt->af_packet_priv;
1325 unsigned int num = f->num_members;
1326 struct packet_sock *po;
77f65ebd 1327 unsigned int idx;
dc99f600
DM
1328
1329 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1330 !num) {
1331 kfree_skb(skb);
1332 return 0;
1333 }
1334
95ec3eb4
DM
1335 switch (f->type) {
1336 case PACKET_FANOUT_HASH:
1337 default:
77f65ebd 1338 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1339 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1340 if (!skb)
1341 return 0;
1342 }
3958afa1 1343 skb_get_hash(skb);
77f65ebd 1344 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1345 break;
1346 case PACKET_FANOUT_LB:
77f65ebd 1347 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1348 break;
1349 case PACKET_FANOUT_CPU:
77f65ebd
WB
1350 idx = fanout_demux_cpu(f, skb, num);
1351 break;
5df0ddfb
DB
1352 case PACKET_FANOUT_RND:
1353 idx = fanout_demux_rnd(f, skb, num);
1354 break;
77f65ebd
WB
1355 case PACKET_FANOUT_ROLLOVER:
1356 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1357 break;
dc99f600
DM
1358 }
1359
77f65ebd
WB
1360 po = pkt_sk(f->arr[idx]);
1361 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1362 unlikely(!packet_rcv_has_room(po, skb))) {
1363 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1364 po = pkt_sk(f->arr[idx]);
1365 }
dc99f600
DM
1366
1367 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1368}
1369
fff3321d
PE
1370DEFINE_MUTEX(fanout_mutex);
1371EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1372static LIST_HEAD(fanout_list);
1373
1374static void __fanout_link(struct sock *sk, struct packet_sock *po)
1375{
1376 struct packet_fanout *f = po->fanout;
1377
1378 spin_lock(&f->lock);
1379 f->arr[f->num_members] = sk;
1380 smp_wmb();
1381 f->num_members++;
1382 spin_unlock(&f->lock);
1383}
1384
1385static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1386{
1387 struct packet_fanout *f = po->fanout;
1388 int i;
1389
1390 spin_lock(&f->lock);
1391 for (i = 0; i < f->num_members; i++) {
1392 if (f->arr[i] == sk)
1393 break;
1394 }
1395 BUG_ON(i >= f->num_members);
1396 f->arr[i] = f->arr[f->num_members - 1];
1397 f->num_members--;
1398 spin_unlock(&f->lock);
1399}
1400
d4dd8aee 1401static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1402{
d4dd8aee 1403 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
c0de08d0
EL
1404 return true;
1405
1406 return false;
1407}
1408
7736d33f 1409static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1410{
1411 struct packet_sock *po = pkt_sk(sk);
1412 struct packet_fanout *f, *match;
7736d33f 1413 u8 type = type_flags & 0xff;
77f65ebd 1414 u8 flags = type_flags >> 8;
dc99f600
DM
1415 int err;
1416
1417 switch (type) {
77f65ebd
WB
1418 case PACKET_FANOUT_ROLLOVER:
1419 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1420 return -EINVAL;
dc99f600
DM
1421 case PACKET_FANOUT_HASH:
1422 case PACKET_FANOUT_LB:
95ec3eb4 1423 case PACKET_FANOUT_CPU:
5df0ddfb 1424 case PACKET_FANOUT_RND:
dc99f600
DM
1425 break;
1426 default:
1427 return -EINVAL;
1428 }
1429
1430 if (!po->running)
1431 return -EINVAL;
1432
1433 if (po->fanout)
1434 return -EALREADY;
1435
1436 mutex_lock(&fanout_mutex);
1437 match = NULL;
1438 list_for_each_entry(f, &fanout_list, list) {
1439 if (f->id == id &&
1440 read_pnet(&f->net) == sock_net(sk)) {
1441 match = f;
1442 break;
1443 }
1444 }
afe62c68 1445 err = -EINVAL;
77f65ebd 1446 if (match && match->flags != flags)
afe62c68 1447 goto out;
dc99f600 1448 if (!match) {
afe62c68 1449 err = -ENOMEM;
dc99f600 1450 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1451 if (!match)
1452 goto out;
1453 write_pnet(&match->net, sock_net(sk));
1454 match->id = id;
1455 match->type = type;
77f65ebd 1456 match->flags = flags;
afe62c68
ED
1457 atomic_set(&match->rr_cur, 0);
1458 INIT_LIST_HEAD(&match->list);
1459 spin_lock_init(&match->lock);
1460 atomic_set(&match->sk_ref, 0);
1461 match->prot_hook.type = po->prot_hook.type;
1462 match->prot_hook.dev = po->prot_hook.dev;
1463 match->prot_hook.func = packet_rcv_fanout;
1464 match->prot_hook.af_packet_priv = match;
c0de08d0 1465 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1466 dev_add_pack(&match->prot_hook);
1467 list_add(&match->list, &fanout_list);
dc99f600 1468 }
afe62c68
ED
1469 err = -EINVAL;
1470 if (match->type == type &&
1471 match->prot_hook.type == po->prot_hook.type &&
1472 match->prot_hook.dev == po->prot_hook.dev) {
1473 err = -ENOSPC;
1474 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1475 __dev_remove_pack(&po->prot_hook);
1476 po->fanout = match;
1477 atomic_inc(&match->sk_ref);
1478 __fanout_link(sk, po);
1479 err = 0;
dc99f600
DM
1480 }
1481 }
afe62c68 1482out:
dc99f600
DM
1483 mutex_unlock(&fanout_mutex);
1484 return err;
1485}
1486
1487static void fanout_release(struct sock *sk)
1488{
1489 struct packet_sock *po = pkt_sk(sk);
1490 struct packet_fanout *f;
1491
1492 f = po->fanout;
1493 if (!f)
1494 return;
1495
fff3321d 1496 mutex_lock(&fanout_mutex);
dc99f600
DM
1497 po->fanout = NULL;
1498
dc99f600
DM
1499 if (atomic_dec_and_test(&f->sk_ref)) {
1500 list_del(&f->list);
1501 dev_remove_pack(&f->prot_hook);
1502 kfree(f);
1503 }
1504 mutex_unlock(&fanout_mutex);
1505}
1da177e4 1506
90ddc4f0 1507static const struct proto_ops packet_ops;
1da177e4 1508
90ddc4f0 1509static const struct proto_ops packet_ops_spkt;
1da177e4 1510
40d4e3df
ED
1511static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1512 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1513{
1514 struct sock *sk;
1515 struct sockaddr_pkt *spkt;
1516
1517 /*
1518 * When we registered the protocol we saved the socket in the data
1519 * field for just this event.
1520 */
1521
1522 sk = pt->af_packet_priv;
1ce4f28b 1523
1da177e4
LT
1524 /*
1525 * Yank back the headers [hope the device set this
1526 * right or kerboom...]
1527 *
1528 * Incoming packets have ll header pulled,
1529 * push it back.
1530 *
1531 * For outgoing ones skb->data == skb_mac_header(skb)
1532 * so that this procedure is a no-op.
1533 */
1534
1535 if (skb->pkt_type == PACKET_LOOPBACK)
1536 goto out;
1537
09ad9bc7 1538 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1539 goto out;
1540
40d4e3df
ED
1541 skb = skb_share_check(skb, GFP_ATOMIC);
1542 if (skb == NULL)
1da177e4
LT
1543 goto oom;
1544
1545 /* drop any routing info */
adf30907 1546 skb_dst_drop(skb);
1da177e4 1547
84531c24
PO
1548 /* drop conntrack reference */
1549 nf_reset(skb);
1550
ffbc6111 1551 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1552
98e399f8 1553 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1554
1555 /*
1556 * The SOCK_PACKET socket receives _all_ frames.
1557 */
1558
1559 spkt->spkt_family = dev->type;
1560 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1561 spkt->spkt_protocol = skb->protocol;
1562
1563 /*
1564 * Charge the memory to the socket. This is done specifically
1565 * to prevent sockets using all the memory up.
1566 */
1567
40d4e3df 1568 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1569 return 0;
1570
1571out:
1572 kfree_skb(skb);
1573oom:
1574 return 0;
1575}
1576
1577
1578/*
1579 * Output a raw packet to a device layer. This bypasses all the other
1580 * protocol layers and you must therefore supply it with a complete frame
1581 */
1ce4f28b 1582
1da177e4
LT
1583static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1584 struct msghdr *msg, size_t len)
1585{
1586 struct sock *sk = sock->sk;
342dfc30 1587 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1588 struct sk_buff *skb = NULL;
1da177e4 1589 struct net_device *dev;
40d4e3df 1590 __be16 proto = 0;
1da177e4 1591 int err;
3bdc0eba 1592 int extra_len = 0;
1ce4f28b 1593
1da177e4 1594 /*
1ce4f28b 1595 * Get and verify the address.
1da177e4
LT
1596 */
1597
40d4e3df 1598 if (saddr) {
1da177e4 1599 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1600 return -EINVAL;
1601 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1602 proto = saddr->spkt_protocol;
1603 } else
1604 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1605
1606 /*
1ce4f28b 1607 * Find the device first to size check it
1da177e4
LT
1608 */
1609
de74e92a 1610 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1611retry:
654d1f8a
ED
1612 rcu_read_lock();
1613 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1614 err = -ENODEV;
1615 if (dev == NULL)
1616 goto out_unlock;
1ce4f28b 1617
d5e76b0a
DM
1618 err = -ENETDOWN;
1619 if (!(dev->flags & IFF_UP))
1620 goto out_unlock;
1621
1da177e4 1622 /*
40d4e3df
ED
1623 * You may not queue a frame bigger than the mtu. This is the lowest level
1624 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1625 */
1ce4f28b 1626
3bdc0eba
BG
1627 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1628 if (!netif_supports_nofcs(dev)) {
1629 err = -EPROTONOSUPPORT;
1630 goto out_unlock;
1631 }
1632 extra_len = 4; /* We're doing our own CRC */
1633 }
1634
1da177e4 1635 err = -EMSGSIZE;
3bdc0eba 1636 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1637 goto out_unlock;
1638
1a35ca80
ED
1639 if (!skb) {
1640 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1641 int tlen = dev->needed_tailroom;
1a35ca80
ED
1642 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1643
1644 rcu_read_unlock();
4ce40912 1645 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1646 if (skb == NULL)
1647 return -ENOBUFS;
1648 /* FIXME: Save some space for broken drivers that write a hard
1649 * header at transmission time by themselves. PPP is the notable
1650 * one here. This should really be fixed at the driver level.
1651 */
1652 skb_reserve(skb, reserved);
1653 skb_reset_network_header(skb);
1654
1655 /* Try to align data part correctly */
1656 if (hhlen) {
1657 skb->data -= hhlen;
1658 skb->tail -= hhlen;
1659 if (len < hhlen)
1660 skb_reset_network_header(skb);
1661 }
1662 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1663 if (err)
1664 goto out_free;
1665 goto retry;
1da177e4
LT
1666 }
1667
3bdc0eba 1668 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1669 /* Earlier code assumed this would be a VLAN pkt,
1670 * double-check this now that we have the actual
1671 * packet in hand.
1672 */
1673 struct ethhdr *ehdr;
1674 skb_reset_mac_header(skb);
1675 ehdr = eth_hdr(skb);
1676 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1677 err = -EMSGSIZE;
1678 goto out_unlock;
1679 }
1680 }
1a35ca80 1681
1da177e4
LT
1682 skb->protocol = proto;
1683 skb->dev = dev;
1684 skb->priority = sk->sk_priority;
2d37a186 1685 skb->mark = sk->sk_mark;
bf84a010
DB
1686
1687 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1688
3bdc0eba
BG
1689 if (unlikely(extra_len == 4))
1690 skb->no_fcs = 1;
1691
40893fd0 1692 skb_probe_transport_header(skb, 0);
c1aad275 1693
1da177e4 1694 dev_queue_xmit(skb);
654d1f8a 1695 rcu_read_unlock();
40d4e3df 1696 return len;
1da177e4 1697
1da177e4 1698out_unlock:
654d1f8a 1699 rcu_read_unlock();
1a35ca80
ED
1700out_free:
1701 kfree_skb(skb);
1da177e4
LT
1702 return err;
1703}
1da177e4 1704
eea49cc9 1705static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1706 const struct sock *sk,
dbcb5855 1707 unsigned int res)
1da177e4
LT
1708{
1709 struct sk_filter *filter;
fda9ef5d 1710
80f8f102
ED
1711 rcu_read_lock();
1712 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1713 if (filter != NULL)
0a14842f 1714 res = SK_RUN_FILTER(filter, skb);
80f8f102 1715 rcu_read_unlock();
1da177e4 1716
dbcb5855 1717 return res;
1da177e4
LT
1718}
1719
1720/*
62ab0812
ED
1721 * This function makes lazy skb cloning in the hope that most packets
1722 * are discarded by BPF.
1723 *
1724 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
1725 * and skb->cb are mangled. It works because (and until) packets
1726 * falling here are owned by the current CPU. Output packets are cloned
1727 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1728 * sequentially, so that if we return the skb to its original state on exit,
1729 * we will not harm anyone.
1da177e4
LT
1730 */
1731
40d4e3df
ED
1732static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1733 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1734{
1735 struct sock *sk;
1736 struct sockaddr_ll *sll;
1737 struct packet_sock *po;
40d4e3df 1738 u8 *skb_head = skb->data;
1da177e4 1739 int skb_len = skb->len;
dbcb5855 1740 unsigned int snaplen, res;
1da177e4
LT
1741
1742 if (skb->pkt_type == PACKET_LOOPBACK)
1743 goto drop;
1744
1745 sk = pt->af_packet_priv;
1746 po = pkt_sk(sk);
1747
09ad9bc7 1748 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1749 goto drop;
1750
1da177e4
LT
1751 skb->dev = dev;
1752
3b04ddde 1753 if (dev->header_ops) {
1da177e4 1754 /* The device has an explicit notion of ll header,
62ab0812
ED
1755 * exported to higher levels.
1756 *
1757 * Otherwise, the device hides details of its frame
1758 * structure, so that corresponding packet head is
1759 * never delivered to user.
1da177e4
LT
1760 */
1761 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1762 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1763 else if (skb->pkt_type == PACKET_OUTGOING) {
1764 /* Special case: outgoing packets have ll header at head */
bbe735e4 1765 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1766 }
1767 }
1768
1769 snaplen = skb->len;
1770
dbcb5855
DM
1771 res = run_filter(skb, sk, snaplen);
1772 if (!res)
fda9ef5d 1773 goto drop_n_restore;
dbcb5855
DM
1774 if (snaplen > res)
1775 snaplen = res;
1da177e4 1776
0fd7bac6 1777 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1778 goto drop_n_acct;
1779
1780 if (skb_shared(skb)) {
1781 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1782 if (nskb == NULL)
1783 goto drop_n_acct;
1784
1785 if (skb_head != skb->data) {
1786 skb->data = skb_head;
1787 skb->len = skb_len;
1788 }
abc4e4fa 1789 consume_skb(skb);
1da177e4
LT
1790 skb = nskb;
1791 }
1792
ffbc6111
HX
1793 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1794 sizeof(skb->cb));
1795
1796 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1797 sll->sll_family = AF_PACKET;
1798 sll->sll_hatype = dev->type;
1799 sll->sll_protocol = skb->protocol;
1800 sll->sll_pkttype = skb->pkt_type;
8032b464 1801 if (unlikely(po->origdev))
80feaacb
PWJ
1802 sll->sll_ifindex = orig_dev->ifindex;
1803 else
1804 sll->sll_ifindex = dev->ifindex;
1da177e4 1805
b95cce35 1806 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1807
ffbc6111 1808 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1809
1da177e4
LT
1810 if (pskb_trim(skb, snaplen))
1811 goto drop_n_acct;
1812
1813 skb_set_owner_r(skb, sk);
1814 skb->dev = NULL;
adf30907 1815 skb_dst_drop(skb);
1da177e4 1816
84531c24
PO
1817 /* drop conntrack reference */
1818 nf_reset(skb);
1819
1da177e4 1820 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1821 po->stats.stats1.tp_packets++;
3b885787 1822 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1823 __skb_queue_tail(&sk->sk_receive_queue, skb);
1824 spin_unlock(&sk->sk_receive_queue.lock);
1825 sk->sk_data_ready(sk, skb->len);
1826 return 0;
1827
1828drop_n_acct:
7091fbd8 1829 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1830 po->stats.stats1.tp_drops++;
7091fbd8
WB
1831 atomic_inc(&sk->sk_drops);
1832 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1833
1834drop_n_restore:
1835 if (skb_head != skb->data && skb_shared(skb)) {
1836 skb->data = skb_head;
1837 skb->len = skb_len;
1838 }
1839drop:
ead2ceb0 1840 consume_skb(skb);
1da177e4
LT
1841 return 0;
1842}
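For orientation, a minimal userspace sketch of the path that ends in packet_rcv(): a plain SOCK_RAW packet socket (CAP_NET_RAW is required, as packet_create() below checks) read with recvfrom(). The buffer size and the helper name rx_one_frame() are illustrative only, not part of this file.

/* Userspace sketch (not part of af_packet.c): read one frame delivered
 * through packet_rcv(). Error handling is abbreviated.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int rx_one_frame(void)
{
        unsigned char buf[2048];
        struct sockaddr_ll sll;
        socklen_t slen = sizeof(sll);
        ssize_t n;
        int fd;

        fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        if (fd < 0)
                return -1;

        /* recvfrom() also copies out the sockaddr_ll that packet_rcv()
         * stored in the skb control block (ifindex, hatype, pkttype, ...).
         */
        n = recvfrom(fd, buf, sizeof(buf), 0,
                     (struct sockaddr *)&sll, &slen);
        if (n > 0)
                printf("got %zd bytes on ifindex %d\n", n, sll.sll_ifindex);

        close(fd);
        return n > 0 ? 0 : -1;
}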
1843
1844static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1845 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1846{
1847 struct sock *sk;
1848 struct packet_sock *po;
1849 struct sockaddr_ll *sll;
184f489e 1850 union tpacket_uhdr h;
40d4e3df 1851 u8 *skb_head = skb->data;
1da177e4 1852 int skb_len = skb->len;
dbcb5855 1853 unsigned int snaplen, res;
f6fb8f10 1854 unsigned long status = TP_STATUS_USER;
bbd6ef87 1855 unsigned short macoff, netoff, hdrlen;
1da177e4 1856 struct sk_buff *copy_skb = NULL;
bbd6ef87 1857 struct timespec ts;
b9c32fb2 1858 __u32 ts_status;
1da177e4 1859
	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them up to the current aligned size without
	 * forcing userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1866
1da177e4
LT
1867 if (skb->pkt_type == PACKET_LOOPBACK)
1868 goto drop;
1869
1870 sk = pt->af_packet_priv;
1871 po = pkt_sk(sk);
1872
09ad9bc7 1873 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1874 goto drop;
1875
3b04ddde 1876 if (dev->header_ops) {
1da177e4 1877 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1878 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1879 else if (skb->pkt_type == PACKET_OUTGOING) {
1880 /* Special case: outgoing packets have ll header at head */
bbe735e4 1881 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1882 }
1883 }
1884
8dc41944
HX
1885 if (skb->ip_summed == CHECKSUM_PARTIAL)
1886 status |= TP_STATUS_CSUMNOTREADY;
1887
1da177e4
LT
1888 snaplen = skb->len;
1889
dbcb5855
DM
1890 res = run_filter(skb, sk, snaplen);
1891 if (!res)
fda9ef5d 1892 goto drop_n_restore;
dbcb5855
DM
1893 if (snaplen > res)
1894 snaplen = res;
1da177e4
LT
1895
1896 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1897 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1898 po->tp_reserve;
1da177e4 1899 } else {
95c96174 1900 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1901 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1902 (maclen < 16 ? 16 : maclen)) +
1903 po->tp_reserve;
1da177e4
LT
1904 macoff = netoff - maclen;
1905 }
f6fb8f10 1906 if (po->tp_version <= TPACKET_V2) {
1907 if (macoff + snaplen > po->rx_ring.frame_size) {
1908 if (po->copy_thresh &&
0fd7bac6 1909 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1910 if (skb_shared(skb)) {
1911 copy_skb = skb_clone(skb, GFP_ATOMIC);
1912 } else {
1913 copy_skb = skb_get(skb);
1914 skb_head = skb->data;
1915 }
1916 if (copy_skb)
1917 skb_set_owner_r(copy_skb, sk);
1da177e4 1918 }
f6fb8f10 1919 snaplen = po->rx_ring.frame_size - macoff;
1920 if ((int)snaplen < 0)
1921 snaplen = 0;
1da177e4 1922 }
1da177e4 1923 }
1da177e4 1924 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1925 h.raw = packet_current_rx_frame(po, skb,
1926 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1927 if (!h.raw)
1da177e4 1928 goto ring_is_full;
f6fb8f10 1929 if (po->tp_version <= TPACKET_V2) {
1930 packet_increment_rx_head(po, &po->rx_ring);
		/*
		 * LOSING will be reported until you read the stats,
		 * because it's COR - Clear On Read.
		 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
		 * at the packet level.
		 */
ee80fbf3 1937 if (po->stats.stats1.tp_drops)
f6fb8f10 1938 status |= TP_STATUS_LOSING;
1939 }
ee80fbf3 1940 po->stats.stats1.tp_packets++;
1da177e4
LT
1941 if (copy_skb) {
1942 status |= TP_STATUS_COPY;
1943 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1944 }
1da177e4
LT
1945 spin_unlock(&sk->sk_receive_queue.lock);
1946
bbd6ef87 1947 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1948
1949 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1950 getnstimeofday(&ts);
1da177e4 1951
b9c32fb2
DB
1952 status |= ts_status;
1953
bbd6ef87
PM
1954 switch (po->tp_version) {
1955 case TPACKET_V1:
1956 h.h1->tp_len = skb->len;
1957 h.h1->tp_snaplen = snaplen;
1958 h.h1->tp_mac = macoff;
1959 h.h1->tp_net = netoff;
4b457bdf
DB
1960 h.h1->tp_sec = ts.tv_sec;
1961 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1962 hdrlen = sizeof(*h.h1);
1963 break;
1964 case TPACKET_V2:
1965 h.h2->tp_len = skb->len;
1966 h.h2->tp_snaplen = snaplen;
1967 h.h2->tp_mac = macoff;
1968 h.h2->tp_net = netoff;
bbd6ef87
PM
1969 h.h2->tp_sec = ts.tv_sec;
1970 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1971 if (vlan_tx_tag_present(skb)) {
1972 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
1973 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
1974 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
1975 } else {
1976 h.h2->tp_vlan_tci = 0;
a0cdfcf3 1977 h.h2->tp_vlan_tpid = 0;
a3bcc23e 1978 }
e4d26f4b 1979 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
1980 hdrlen = sizeof(*h.h2);
1981 break;
f6fb8f10 1982 case TPACKET_V3:
1983 /* tp_nxt_offset,vlan are already populated above.
1984 * So DONT clear those fields here
1985 */
1986 h.h3->tp_status |= status;
1987 h.h3->tp_len = skb->len;
1988 h.h3->tp_snaplen = snaplen;
1989 h.h3->tp_mac = macoff;
1990 h.h3->tp_net = netoff;
f6fb8f10 1991 h.h3->tp_sec = ts.tv_sec;
1992 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 1993 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 1994 hdrlen = sizeof(*h.h3);
1995 break;
bbd6ef87
PM
1996 default:
1997 BUG();
1998 }
1da177e4 1999
bbd6ef87 2000 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2001 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2002 sll->sll_family = AF_PACKET;
2003 sll->sll_hatype = dev->type;
2004 sll->sll_protocol = skb->protocol;
2005 sll->sll_pkttype = skb->pkt_type;
8032b464 2006 if (unlikely(po->origdev))
80feaacb
PWJ
2007 sll->sll_ifindex = orig_dev->ifindex;
2008 else
2009 sll->sll_ifindex = dev->ifindex;
1da177e4 2010
e16aa207 2011 smp_mb();
f0d4eb29 2012
f6dafa95 2013#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2014 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2015 u8 *start, *end;
2016
f0d4eb29
DB
2017 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2018 macoff + snaplen);
2019
2020 for (start = h.raw; start < end; start += PAGE_SIZE)
2021 flush_dcache_page(pgv_to_page(start));
1da177e4 2022 }
f0d4eb29 2023 smp_wmb();
f6dafa95 2024#endif
f0d4eb29 2025
f6fb8f10 2026 if (po->tp_version <= TPACKET_V2)
2027 __packet_set_status(po, h.raw, status);
2028 else
2029 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
2030
2031 sk->sk_data_ready(sk, 0);
2032
2033drop_n_restore:
2034 if (skb_head != skb->data && skb_shared(skb)) {
2035 skb->data = skb_head;
2036 skb->len = skb_len;
2037 }
2038drop:
1ce4f28b 2039 kfree_skb(skb);
1da177e4
LT
2040 return 0;
2041
2042ring_is_full:
ee80fbf3 2043 po->stats.stats1.tp_drops++;
1da177e4
LT
2044 spin_unlock(&sk->sk_receive_queue.lock);
2045
2046 sk->sk_data_ready(sk, 0);
acb5d75b 2047 kfree_skb(copy_skb);
1da177e4
LT
2048 goto drop_n_restore;
2049}
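tpacket_rcv() fills frames of the mmap'ed RX ring instead of queueing skbs. Below is a rough userspace sketch of the matching TPACKET_V2 setup and frame hand-off; the ring geometry, the setup_rx_ring()/consume_frame() helper names, and the omission of memory barriers are simplifications and assumptions, not guidance from this file.

/* Userspace sketch: configure and drain a TPACKET_V2 RX ring on an
 * already-created AF_PACKET socket. Error handling is omitted.
 */
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <sys/mman.h>
#include <sys/socket.h>

#define BLOCK_SIZE  (1 << 12)   /* example: one (4 KiB) page per block */
#define FRAME_SIZE  (1 << 11)
#define BLOCK_NR    64

int setup_rx_ring(int fd, void **ring_out)
{
        int ver = TPACKET_V2;
        struct tpacket_req req = {
                .tp_block_size = BLOCK_SIZE,
                .tp_block_nr   = BLOCK_NR,
                .tp_frame_size = FRAME_SIZE,
                .tp_frame_nr   = BLOCK_NR * (BLOCK_SIZE / FRAME_SIZE),
        };

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
                return -1;
        if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
                return -1;

        *ring_out = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        return *ring_out == MAP_FAILED ? -1 : 0;
}

/* One frame slot: tpacket_rcv() flips tp_status to TP_STATUS_USER,
 * userspace consumes the payload at tp_mac and returns the slot.
 * A real consumer also needs a memory barrier between the status
 * check and the data read.
 */
void consume_frame(void *frame)
{
        struct tpacket2_hdr *hdr = frame;

        if (!(hdr->tp_status & TP_STATUS_USER))
                return;                         /* still owned by the kernel */
        /* payload at (char *)frame + hdr->tp_mac, length hdr->tp_snaplen */
        hdr->tp_status = TP_STATUS_KERNEL;      /* hand the slot back */
}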
2050
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_shinfo(skb)->destructor_arg;
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}
2068
2069static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2070 void *frame, struct net_device *dev, int size_max,
ae641949 2071 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2072{
184f489e 2073 union tpacket_uhdr ph;
09effa67 2074 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2075 struct socket *sock = po->sk.sk_socket;
2076 struct page *page;
2077 void *data;
2078 int err;
2079
2080 ph.raw = frame;
2081
2082 skb->protocol = proto;
2083 skb->dev = dev;
2084 skb->priority = po->sk.sk_priority;
2d37a186 2085 skb->mark = po->sk.sk_mark;
2e31396f 2086 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2087 skb_shinfo(skb)->destructor_arg = ph.raw;
2088
2089 switch (po->tp_version) {
2090 case TPACKET_V2:
2091 tp_len = ph.h2->tp_len;
2092 break;
2093 default:
2094 tp_len = ph.h1->tp_len;
2095 break;
2096 }
09effa67
DM
2097 if (unlikely(tp_len > size_max)) {
2098 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2099 return -EMSGSIZE;
2100 }
69e3c75f 2101
ae641949 2102 skb_reserve(skb, hlen);
69e3c75f 2103 skb_reset_network_header(skb);
c1aad275 2104
d346a3fa
DB
2105 if (!packet_use_direct_xmit(po))
2106 skb_probe_transport_header(skb, 0);
2107 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2108 int off_min, off_max, off;
2109 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2110 off_max = po->tx_ring.frame_size - tp_len;
2111 if (sock->type == SOCK_DGRAM) {
2112 switch (po->tp_version) {
2113 case TPACKET_V2:
2114 off = ph.h2->tp_net;
2115 break;
2116 default:
2117 off = ph.h1->tp_net;
2118 break;
2119 }
2120 } else {
2121 switch (po->tp_version) {
2122 case TPACKET_V2:
2123 off = ph.h2->tp_mac;
2124 break;
2125 default:
2126 off = ph.h1->tp_mac;
2127 break;
2128 }
2129 }
2130 if (unlikely((off < off_min) || (off_max < off)))
2131 return -EINVAL;
2132 data = ph.raw + off;
2133 } else {
2134 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2135 }
69e3c75f
JB
2136 to_write = tp_len;
2137
2138 if (sock->type == SOCK_DGRAM) {
2139 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2140 NULL, tp_len);
2141 if (unlikely(err < 0))
2142 return -EINVAL;
40d4e3df 2143 } else if (dev->hard_header_len) {
69e3c75f
JB
2144 /* net device doesn't like empty head */
2145 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2146 pr_err("packet size is too short (%d < %d)\n",
2147 tp_len, dev->hard_header_len);
69e3c75f
JB
2148 return -EINVAL;
2149 }
2150
2151 skb_push(skb, dev->hard_header_len);
2152 err = skb_store_bits(skb, 0, data,
2153 dev->hard_header_len);
2154 if (unlikely(err))
2155 return err;
2156
2157 data += dev->hard_header_len;
2158 to_write -= dev->hard_header_len;
2159 }
2160
69e3c75f
JB
2161 offset = offset_in_page(data);
2162 len_max = PAGE_SIZE - offset;
2163 len = ((to_write > len_max) ? len_max : to_write);
2164
2165 skb->data_len = to_write;
2166 skb->len += to_write;
2167 skb->truesize += to_write;
2168 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2169
2170 while (likely(to_write)) {
2171 nr_frags = skb_shinfo(skb)->nr_frags;
2172
2173 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2174 pr_err("Packet exceed the number of skb frags(%lu)\n",
2175 MAX_SKB_FRAGS);
69e3c75f
JB
2176 return -EFAULT;
2177 }
2178
0af55bb5
CG
2179 page = pgv_to_page(data);
2180 data += len;
69e3c75f
JB
2181 flush_dcache_page(page);
2182 get_page(page);
0af55bb5 2183 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2184 to_write -= len;
2185 offset = 0;
2186 len_max = PAGE_SIZE;
2187 len = ((to_write > len_max) ? len_max : to_write);
2188 }
2189
2190 return tp_len;
2191}
2192
2193static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2194{
69e3c75f
JB
2195 struct sk_buff *skb;
2196 struct net_device *dev;
2197 __be16 proto;
09effa67 2198 int err, reserve = 0;
40d4e3df 2199 void *ph;
342dfc30 2200 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2201 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2202 int tp_len, size_max;
2203 unsigned char *addr;
2204 int len_sum = 0;
9e67030a 2205 int status = TP_STATUS_AVAILABLE;
ae641949 2206 int hlen, tlen;
69e3c75f 2207
69e3c75f
JB
2208 mutex_lock(&po->pg_vec_lock);
2209
66e56cd4 2210 if (likely(saddr == NULL)) {
e40526cb 2211 dev = packet_cached_dev_get(po);
69e3c75f
JB
2212 proto = po->num;
2213 addr = NULL;
2214 } else {
2215 err = -EINVAL;
2216 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2217 goto out;
2218 if (msg->msg_namelen < (saddr->sll_halen
2219 + offsetof(struct sockaddr_ll,
2220 sll_addr)))
2221 goto out;
69e3c75f
JB
2222 proto = saddr->sll_protocol;
2223 addr = saddr->sll_addr;
827d9780 2224 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2225 }
2226
69e3c75f
JB
2227 err = -ENXIO;
2228 if (unlikely(dev == NULL))
2229 goto out;
69e3c75f
JB
2230 err = -ENETDOWN;
2231 if (unlikely(!(dev->flags & IFF_UP)))
2232 goto out_put;
2233
e40526cb
DB
2234 reserve = dev->hard_header_len;
2235
69e3c75f 2236 size_max = po->tx_ring.frame_size
b5dd884e 2237 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2238
09effa67
DM
2239 if (size_max > dev->mtu + reserve)
2240 size_max = dev->mtu + reserve;
2241
69e3c75f
JB
2242 do {
2243 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2244 TP_STATUS_SEND_REQUEST);
69e3c75f 2245 if (unlikely(ph == NULL)) {
87a2fd28
DB
2246 if (need_wait && need_resched())
2247 schedule();
69e3c75f
JB
2248 continue;
2249 }
2250
2251 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2252 hlen = LL_RESERVED_SPACE(dev);
2253 tlen = dev->needed_tailroom;
69e3c75f 2254 skb = sock_alloc_send_skb(&po->sk,
ae641949 2255 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2256 0, &err);
2257
2258 if (unlikely(skb == NULL))
2259 goto out_status;
2260
2261 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2262 addr, hlen);
69e3c75f
JB
2263
2264 if (unlikely(tp_len < 0)) {
2265 if (po->tp_loss) {
2266 __packet_set_status(po, ph,
2267 TP_STATUS_AVAILABLE);
2268 packet_increment_head(&po->tx_ring);
2269 kfree_skb(skb);
2270 continue;
2271 } else {
2272 status = TP_STATUS_WRONG_FORMAT;
2273 err = tp_len;
2274 goto out_status;
2275 }
2276 }
2277
d346a3fa 2278 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
69e3c75f
JB
2279 skb->destructor = tpacket_destruct_skb;
2280 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2281 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2282
2283 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2284 err = po->xmit(skb);
eb70df13
JP
2285 if (unlikely(err > 0)) {
2286 err = net_xmit_errno(err);
2287 if (err && __packet_get_status(po, ph) ==
2288 TP_STATUS_AVAILABLE) {
2289 /* skb was destructed already */
2290 skb = NULL;
2291 goto out_status;
2292 }
2293 /*
2294 * skb was dropped but not destructed yet;
2295 * let's treat it like congestion or err < 0
2296 */
2297 err = 0;
2298 }
69e3c75f
JB
2299 packet_increment_head(&po->tx_ring);
2300 len_sum += tp_len;
b0138408
DB
2301 } while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it, as it's a per-cpu variable, but on the fast
		 * path we already short-circuit the loop with the first
		 * condition and luckily don't have to go down that path
		 * anyway.
		 */
2308 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2309
2310 err = len_sum;
2311 goto out_put;
2312
69e3c75f
JB
2313out_status:
2314 __packet_set_status(po, ph, status);
2315 kfree_skb(skb);
2316out_put:
e40526cb 2317 dev_put(dev);
69e3c75f
JB
2318out:
2319 mutex_unlock(&po->pg_vec_lock);
2320 return err;
2321}
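A hedged userspace sketch of the other direction, feeding tpacket_snd() through a PACKET_TX_RING slot (TPACKET_V2, PACKET_TX_HAS_OFF not set, ring already configured and mmap'ed as in the RX sketch above). The tx_one_frame() helper name is illustrative; the data offset mirrors tpacket_fill_skb()'s default.

/* Userspace sketch: queue one frame in a TX ring slot and kick the
 * ring walker in tpacket_snd(). Error handling is omitted.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

int tx_one_frame(int fd, void *slot, const void *pkt, unsigned int len)
{
        struct tpacket2_hdr *hdr = slot;
        unsigned char *data = (unsigned char *)slot +
                              TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

        if (hdr->tp_status != TP_STATUS_AVAILABLE)
                return -1;                      /* slot still in flight */

        memcpy(data, pkt, len);
        hdr->tp_len = len;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;  /* hand slot to the kernel */

        /* A zero-length send() makes tpacket_snd() walk the ring. */
        return send(fd, NULL, 0, 0) < 0 ? -1 : 0;
}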
69e3c75f 2322
eea49cc9
OJ
2323static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2324 size_t reserve, size_t len,
2325 size_t linear, int noblock,
2326 int *err)
bfd5f4a3
SS
2327{
2328 struct sk_buff *skb;
2329
2330 /* Under a page? Don't bother with paged skb. */
2331 if (prepad + len < PAGE_SIZE || !linear)
2332 linear = len;
2333
2334 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2335 err, 0);
bfd5f4a3
SS
2336 if (!skb)
2337 return NULL;
2338
2339 skb_reserve(skb, reserve);
2340 skb_put(skb, linear);
2341 skb->data_len = len - linear;
2342 skb->len += len - linear;
2343
2344 return skb;
2345}
2346
d346a3fa 2347static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2348{
2349 struct sock *sk = sock->sk;
342dfc30 2350 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2351 struct sk_buff *skb;
2352 struct net_device *dev;
0e11c91e 2353 __be16 proto;
1da177e4 2354 unsigned char *addr;
827d9780 2355 int err, reserve = 0;
bfd5f4a3
SS
2356 struct virtio_net_hdr vnet_hdr = { 0 };
2357 int offset = 0;
2358 int vnet_hdr_len;
2359 struct packet_sock *po = pkt_sk(sk);
2360 unsigned short gso_type = 0;
ae641949 2361 int hlen, tlen;
3bdc0eba 2362 int extra_len = 0;
1da177e4
LT
2363
2364 /*
1ce4f28b 2365 * Get and verify the address.
1da177e4 2366 */
1ce4f28b 2367
66e56cd4 2368 if (likely(saddr == NULL)) {
e40526cb 2369 dev = packet_cached_dev_get(po);
1da177e4
LT
2370 proto = po->num;
2371 addr = NULL;
2372 } else {
2373 err = -EINVAL;
2374 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2375 goto out;
0fb375fb
EB
2376 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2377 goto out;
1da177e4
LT
2378 proto = saddr->sll_protocol;
2379 addr = saddr->sll_addr;
827d9780 2380 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2381 }
2382
1da177e4 2383 err = -ENXIO;
e40526cb 2384 if (unlikely(dev == NULL))
1da177e4 2385 goto out_unlock;
d5e76b0a 2386 err = -ENETDOWN;
e40526cb 2387 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2388 goto out_unlock;
2389
e40526cb
DB
2390 if (sock->type == SOCK_RAW)
2391 reserve = dev->hard_header_len;
bfd5f4a3
SS
2392 if (po->has_vnet_hdr) {
2393 vnet_hdr_len = sizeof(vnet_hdr);
2394
2395 err = -EINVAL;
2396 if (len < vnet_hdr_len)
2397 goto out_unlock;
2398
2399 len -= vnet_hdr_len;
2400
2401 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2402 vnet_hdr_len);
2403 if (err < 0)
2404 goto out_unlock;
2405
2406 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2407 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2408 vnet_hdr.hdr_len))
2409 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2410 vnet_hdr.csum_offset + 2;
2411
2412 err = -EINVAL;
2413 if (vnet_hdr.hdr_len > len)
2414 goto out_unlock;
2415
2416 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2417 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2418 case VIRTIO_NET_HDR_GSO_TCPV4:
2419 gso_type = SKB_GSO_TCPV4;
2420 break;
2421 case VIRTIO_NET_HDR_GSO_TCPV6:
2422 gso_type = SKB_GSO_TCPV6;
2423 break;
2424 case VIRTIO_NET_HDR_GSO_UDP:
2425 gso_type = SKB_GSO_UDP;
2426 break;
2427 default:
2428 goto out_unlock;
2429 }
2430
2431 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2432 gso_type |= SKB_GSO_TCP_ECN;
2433
2434 if (vnet_hdr.gso_size == 0)
2435 goto out_unlock;
2436
2437 }
2438 }
2439
3bdc0eba
BG
2440 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2441 if (!netif_supports_nofcs(dev)) {
2442 err = -EPROTONOSUPPORT;
2443 goto out_unlock;
2444 }
2445 extra_len = 4; /* We're doing our own CRC */
2446 }
2447
1da177e4 2448 err = -EMSGSIZE;
3bdc0eba 2449 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2450 goto out_unlock;
2451
bfd5f4a3 2452 err = -ENOBUFS;
ae641949
HX
2453 hlen = LL_RESERVED_SPACE(dev);
2454 tlen = dev->needed_tailroom;
2455 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2456 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2457 if (skb == NULL)
1da177e4
LT
2458 goto out_unlock;
2459
bfd5f4a3 2460 skb_set_network_header(skb, reserve);
1da177e4 2461
0c4e8581
SH
2462 err = -EINVAL;
2463 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2464 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2465 goto out_free;
1da177e4
LT
2466
2467 /* Returns -EFAULT on error */
bfd5f4a3 2468 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2469 if (err)
2470 goto out_free;
bf84a010
DB
2471
2472 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2473
3bdc0eba 2474 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2475 /* Earlier code assumed this would be a VLAN pkt,
2476 * double-check this now that we have the actual
2477 * packet in hand.
2478 */
2479 struct ethhdr *ehdr;
2480 skb_reset_mac_header(skb);
2481 ehdr = eth_hdr(skb);
2482 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2483 err = -EMSGSIZE;
2484 goto out_free;
2485 }
57f89bfa
BG
2486 }
2487
09effa67
DM
2488 skb->protocol = proto;
2489 skb->dev = dev;
1da177e4 2490 skb->priority = sk->sk_priority;
2d37a186 2491 skb->mark = sk->sk_mark;
d346a3fa 2492 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
1da177e4 2493
bfd5f4a3
SS
2494 if (po->has_vnet_hdr) {
2495 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2496 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2497 vnet_hdr.csum_offset)) {
2498 err = -EINVAL;
2499 goto out_free;
2500 }
2501 }
2502
2503 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2504 skb_shinfo(skb)->gso_type = gso_type;
2505
2506 /* Header must be checked, and gso_segs computed. */
2507 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2508 skb_shinfo(skb)->gso_segs = 0;
2509
2510 len += vnet_hdr_len;
2511 }
2512
d346a3fa
DB
2513 if (!packet_use_direct_xmit(po))
2514 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2515 if (unlikely(extra_len == 4))
2516 skb->no_fcs = 1;
2517
d346a3fa 2518 err = po->xmit(skb);
1da177e4
LT
2519 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2520 goto out_unlock;
2521
e40526cb 2522 dev_put(dev);
1da177e4 2523
40d4e3df 2524 return len;
1da177e4
LT
2525
2526out_free:
2527 kfree_skb(skb);
2528out_unlock:
e40526cb 2529 if (dev)
1da177e4
LT
2530 dev_put(dev);
2531out:
2532 return err;
2533}
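packet_snd() is also reachable without any ring: with a SOCK_DGRAM packet socket the kernel builds the link-layer header from the sockaddr_ll via dev_hard_header(), as above. A small sketch, assuming the caller already knows the destination MAC and ifindex; send_dgram() is an illustrative helper.

/* Userspace sketch: one datagram through the packet_snd() path. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int send_dgram(int ifindex, const unsigned char dst_mac[ETH_ALEN],
               const void *payload, size_t len)
{
        struct sockaddr_ll sll;
        ssize_t n;
        int fd;

        fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
        if (fd < 0)
                return -1;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family   = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_IP);
        sll.sll_ifindex  = ifindex;
        sll.sll_halen    = ETH_ALEN;
        memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

        /* The kernel prepends the Ethernet header from sll before xmit. */
        n = sendto(fd, payload, len, 0,
                   (struct sockaddr *)&sll, sizeof(sll));
        close(fd);
        return n < 0 ? -1 : 0;
}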
2534
2535static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2536 struct msghdr *msg, size_t len)
2537{
69e3c75f
JB
2538 struct sock *sk = sock->sk;
2539 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2540
69e3c75f
JB
2541 if (po->tx_ring.pg_vec)
2542 return tpacket_snd(po, msg);
2543 else
69e3c75f
JB
2544 return packet_snd(sock, msg, len);
2545}
2546
/*
 * Close a PACKET socket. This is fairly simple. We immediately go
 * to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
2553{
2554 struct sock *sk = sock->sk;
2555 struct packet_sock *po;
d12d01d6 2556 struct net *net;
f6fb8f10 2557 union tpacket_req_u req_u;
1da177e4
LT
2558
2559 if (!sk)
2560 return 0;
2561
3b1e0a65 2562 net = sock_net(sk);
1da177e4
LT
2563 po = pkt_sk(sk);
2564
0fa7fa98 2565 mutex_lock(&net->packet.sklist_lock);
808f5114 2566 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2567 mutex_unlock(&net->packet.sklist_lock);
2568
2569 preempt_disable();
920de804 2570 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2571 preempt_enable();
1da177e4 2572
808f5114 2573 spin_lock(&po->bind_lock);
ce06b03e 2574 unregister_prot_hook(sk, false);
66e56cd4
DB
2575 packet_cached_dev_reset(po);
2576
160ff18a
BG
2577 if (po->prot_hook.dev) {
2578 dev_put(po->prot_hook.dev);
2579 po->prot_hook.dev = NULL;
2580 }
808f5114 2581 spin_unlock(&po->bind_lock);
1da177e4 2582
1da177e4 2583 packet_flush_mclist(sk);
1da177e4 2584
9665d5d6
PS
2585 if (po->rx_ring.pg_vec) {
2586 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2587 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2588 }
69e3c75f 2589
9665d5d6
PS
2590 if (po->tx_ring.pg_vec) {
2591 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2592 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2593 }
1da177e4 2594
dc99f600
DM
2595 fanout_release(sk);
2596
808f5114 2597 synchronize_net();
1da177e4
LT
2598 /*
2599 * Now the socket is dead. No more input will appear.
2600 */
1da177e4
LT
2601 sock_orphan(sk);
2602 sock->sk = NULL;
2603
2604 /* Purge queues */
2605
2606 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2607 packet_free_pending(po);
17ab56a2 2608 sk_refcnt_debug_release(sk);
1da177e4
LT
2609
2610 sock_put(sk);
2611 return 0;
2612}

/*
 * Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2619{
2620 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2621 const struct net_device *dev_curr;
2622 __be16 proto_curr;
2623 bool need_rehook;
dc99f600 2624
aef950b4
WY
2625 if (po->fanout) {
2626 if (dev)
2627 dev_put(dev);
2628
dc99f600 2629 return -EINVAL;
aef950b4 2630 }
1da177e4
LT
2631
2632 lock_sock(sk);
1da177e4 2633 spin_lock(&po->bind_lock);
66e56cd4 2634
902fefb8
DB
2635 proto_curr = po->prot_hook.type;
2636 dev_curr = po->prot_hook.dev;
2637
2638 need_rehook = proto_curr != proto || dev_curr != dev;
2639
2640 if (need_rehook) {
2641 unregister_prot_hook(sk, true);
1da177e4 2642
902fefb8
DB
2643 po->num = proto;
2644 po->prot_hook.type = proto;
1da177e4 2645
902fefb8
DB
2646 if (po->prot_hook.dev)
2647 dev_put(po->prot_hook.dev);
2648
2649 po->prot_hook.dev = dev;
2650
2651 po->ifindex = dev ? dev->ifindex : 0;
2652 packet_cached_dev_assign(po, dev);
2653 }
66e56cd4 2654
902fefb8 2655 if (proto == 0 || !need_rehook)
1da177e4
LT
2656 goto out_unlock;
2657
be85d4ad 2658 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2659 register_prot_hook(sk);
be85d4ad
UT
2660 } else {
2661 sk->sk_err = ENETDOWN;
2662 if (!sock_flag(sk, SOCK_DEAD))
2663 sk->sk_error_report(sk);
1da177e4
LT
2664 }
2665
2666out_unlock:
2667 spin_unlock(&po->bind_lock);
2668 release_sock(sk);
2669 return 0;
2670}

/*
 * Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
40d4e3df 2679 struct sock *sk = sock->sk;
1da177e4
LT
2680 char name[15];
2681 struct net_device *dev;
2682 int err = -ENODEV;
1ce4f28b 2683
1da177e4
LT
2684 /*
2685 * Check legality
2686 */
1ce4f28b 2687
8ae55f04 2688 if (addr_len != sizeof(struct sockaddr))
1da177e4 2689 return -EINVAL;
40d4e3df 2690 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2691
3b1e0a65 2692 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2693 if (dev)
1da177e4 2694 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2695 return err;
2696}
1da177e4
LT
2697
2698static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2699{
40d4e3df
ED
2700 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2701 struct sock *sk = sock->sk;
1da177e4
LT
2702 struct net_device *dev = NULL;
2703 int err;
2704
2705
2706 /*
2707 * Check legality
2708 */
1ce4f28b 2709
1da177e4
LT
2710 if (addr_len < sizeof(struct sockaddr_ll))
2711 return -EINVAL;
2712 if (sll->sll_family != AF_PACKET)
2713 return -EINVAL;
2714
2715 if (sll->sll_ifindex) {
2716 err = -ENODEV;
3b1e0a65 2717 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2718 if (dev == NULL)
2719 goto out;
2720 }
2721 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2722
2723out:
2724 return err;
2725}
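From userspace, packet_bind() is reached with a sockaddr_ll carrying the ifindex and, optionally, a narrowed protocol. A minimal sketch, assuming the interface is looked up by name with if_nametoindex(); bind_to_iface() is an illustrative helper.

/* Userspace sketch: bind an AF_PACKET socket to one interface. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

int bind_to_iface(int fd, const char *ifname)
{
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family   = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_ALL);    /* keep matching everything */
        sll.sll_ifindex  = if_nametoindex(ifname);

        return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}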
2726
2727static struct proto packet_proto = {
2728 .name = "PACKET",
2729 .owner = THIS_MODULE,
2730 .obj_size = sizeof(struct packet_sock),
2731};
2732
/*
 * Create a packet socket of type SOCK_PACKET.
 */

3f378b68
EP
2737static int packet_create(struct net *net, struct socket *sock, int protocol,
2738 int kern)
1da177e4
LT
2739{
2740 struct sock *sk;
2741 struct packet_sock *po;
0e11c91e 2742 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2743 int err;
2744
df008c91 2745 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2746 return -EPERM;
be02097c
DM
2747 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2748 sock->type != SOCK_PACKET)
1da177e4
LT
2749 return -ESOCKTNOSUPPORT;
2750
2751 sock->state = SS_UNCONNECTED;
2752
2753 err = -ENOBUFS;
6257ff21 2754 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2755 if (sk == NULL)
2756 goto out;
2757
2758 sock->ops = &packet_ops;
1da177e4
LT
2759 if (sock->type == SOCK_PACKET)
2760 sock->ops = &packet_ops_spkt;
be02097c 2761
1da177e4
LT
2762 sock_init_data(sock, sk);
2763
2764 po = pkt_sk(sk);
2765 sk->sk_family = PF_PACKET;
0e11c91e 2766 po->num = proto;
d346a3fa 2767 po->xmit = dev_queue_xmit;
66e56cd4 2768
b0138408
DB
2769 err = packet_alloc_pending(po);
2770 if (err)
2771 goto out2;
2772
66e56cd4 2773 packet_cached_dev_reset(po);
1da177e4
LT
2774
2775 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2776 sk_refcnt_debug_inc(sk);
1da177e4
LT
2777
2778 /*
2779 * Attach a protocol block
2780 */
2781
2782 spin_lock_init(&po->bind_lock);
905db440 2783 mutex_init(&po->pg_vec_lock);
1da177e4 2784 po->prot_hook.func = packet_rcv;
be02097c 2785
1da177e4
LT
2786 if (sock->type == SOCK_PACKET)
2787 po->prot_hook.func = packet_rcv_spkt;
be02097c 2788
1da177e4
LT
2789 po->prot_hook.af_packet_priv = sk;
2790
0e11c91e
AV
2791 if (proto) {
2792 po->prot_hook.type = proto;
ce06b03e 2793 register_prot_hook(sk);
1da177e4
LT
2794 }
2795
0fa7fa98 2796 mutex_lock(&net->packet.sklist_lock);
808f5114 2797 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2798 mutex_unlock(&net->packet.sklist_lock);
2799
2800 preempt_disable();
3680453c 2801 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2802 preempt_enable();
808f5114 2803
40d4e3df 2804 return 0;
b0138408
DB
2805out2:
2806 sk_free(sk);
1da177e4
LT
2807out:
2808 return err;
2809}
2810
/*
 * Pull a packet from our receive queue and hand it to the user.
 * If necessary we block.
 */

2816static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2817 struct msghdr *msg, size_t len, int flags)
2818{
2819 struct sock *sk = sock->sk;
2820 struct sk_buff *skb;
2821 int copied, err;
bfd5f4a3 2822 int vnet_hdr_len = 0;
1da177e4
LT
2823
2824 err = -EINVAL;
ed85b565 2825 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2826 goto out;
2827
2828#if 0
2829 /* What error should we return now? EUNATTACH? */
2830 if (pkt_sk(sk)->ifindex < 0)
2831 return -ENODEV;
2832#endif
2833
ed85b565 2834 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2835 err = sock_recv_errqueue(sk, msg, len,
2836 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2837 goto out;
2838 }
2839
	/*
	 * Call the generic datagram receiver. This handles all sorts
	 * of horrible races and re-entrancy so we can forget about it
	 * in the protocol layers.
	 *
	 * Now it will return ENETDOWN if the device has just gone down,
	 * but then it will block.
	 */
2848
40d4e3df 2849 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2850
	/*
	 * An error occurred, so return it. Because skb_recv_datagram()
	 * handles the blocking, we don't need to see or worry about
	 * blocking retries.
	 */
2856
8ae55f04 2857 if (skb == NULL)
1da177e4
LT
2858 goto out;
2859
bfd5f4a3
SS
2860 if (pkt_sk(sk)->has_vnet_hdr) {
2861 struct virtio_net_hdr vnet_hdr = { 0 };
2862
2863 err = -EINVAL;
2864 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2865 if (len < vnet_hdr_len)
bfd5f4a3
SS
2866 goto out_free;
2867
1f18b717
MK
2868 len -= vnet_hdr_len;
2869
bfd5f4a3
SS
2870 if (skb_is_gso(skb)) {
2871 struct skb_shared_info *sinfo = skb_shinfo(skb);
2872
2873 /* This is a hint as to how much should be linear. */
2874 vnet_hdr.hdr_len = skb_headlen(skb);
2875 vnet_hdr.gso_size = sinfo->gso_size;
2876 if (sinfo->gso_type & SKB_GSO_TCPV4)
2877 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2878 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2879 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2880 else if (sinfo->gso_type & SKB_GSO_UDP)
2881 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2882 else if (sinfo->gso_type & SKB_GSO_FCOE)
2883 goto out_free;
2884 else
2885 BUG();
2886 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2887 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2888 } else
2889 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2890
2891 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2892 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2893 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2894 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2895 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2896 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2897 } /* else everything is zero */
2898
2899 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2900 vnet_hdr_len);
2901 if (err < 0)
2902 goto out_free;
2903 }
2904
	/* You lose any data beyond the buffer you gave. If this worries
	 * a user program, it can ask the device for its MTU
	 * anyway.
	 */
1da177e4 2909 copied = skb->len;
40d4e3df
ED
2910 if (copied > len) {
2911 copied = len;
2912 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2913 }
2914
2915 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2916 if (err)
2917 goto out_free;
2918
3b885787 2919 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2920
f3d33426
HFS
2921 if (msg->msg_name) {
2922 /* If the address length field is there to be filled
2923 * in, we fill it in now.
2924 */
2925 if (sock->type == SOCK_PACKET) {
342dfc30 2926 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
2927 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2928 } else {
2929 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2930 msg->msg_namelen = sll->sll_halen +
2931 offsetof(struct sockaddr_ll, sll_addr);
2932 }
ffbc6111
HX
2933 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2934 msg->msg_namelen);
f3d33426 2935 }
1da177e4 2936
8dc41944 2937 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2938 struct tpacket_auxdata aux;
2939
2940 aux.tp_status = TP_STATUS_USER;
2941 if (skb->ip_summed == CHECKSUM_PARTIAL)
2942 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2943 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2944 aux.tp_snaplen = skb->len;
2945 aux.tp_mac = 0;
bbe735e4 2946 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2947 if (vlan_tx_tag_present(skb)) {
2948 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
2949 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
2950 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2951 } else {
2952 aux.tp_vlan_tci = 0;
a0cdfcf3 2953 aux.tp_vlan_tpid = 0;
a3bcc23e 2954 }
ffbc6111 2955 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2956 }
2957
1da177e4
LT
2958 /*
2959 * Free or return the buffer as appropriate. Again this
2960 * hides all the races and re-entrancy issues from us.
2961 */
bfd5f4a3 2962 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2963
2964out_free:
2965 skb_free_datagram(sk, skb);
2966out:
2967 return err;
2968}
2969
1da177e4
LT
2970static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2971 int *uaddr_len, int peer)
2972{
2973 struct net_device *dev;
2974 struct sock *sk = sock->sk;
2975
2976 if (peer)
2977 return -EOPNOTSUPP;
2978
2979 uaddr->sa_family = AF_PACKET;
2dc85bf3 2980 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2981 rcu_read_lock();
2982 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2983 if (dev)
2dc85bf3 2984 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2985 rcu_read_unlock();
1da177e4
LT
2986 *uaddr_len = sizeof(*uaddr);
2987
2988 return 0;
2989}
1da177e4
LT
2990
2991static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2992 int *uaddr_len, int peer)
2993{
2994 struct net_device *dev;
2995 struct sock *sk = sock->sk;
2996 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2997 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2998
2999 if (peer)
3000 return -EOPNOTSUPP;
3001
3002 sll->sll_family = AF_PACKET;
3003 sll->sll_ifindex = po->ifindex;
3004 sll->sll_protocol = po->num;
67286640 3005 sll->sll_pkttype = 0;
654d1f8a
ED
3006 rcu_read_lock();
3007 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3008 if (dev) {
3009 sll->sll_hatype = dev->type;
3010 sll->sll_halen = dev->addr_len;
3011 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3012 } else {
3013 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3014 sll->sll_halen = 0;
3015 }
654d1f8a 3016 rcu_read_unlock();
0fb375fb 3017 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3018
3019 return 0;
3020}
3021
2aeb0b88
WC
3022static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3023 int what)
1da177e4
LT
3024{
3025 switch (i->type) {
3026 case PACKET_MR_MULTICAST:
1162563f
JP
3027 if (i->alen != dev->addr_len)
3028 return -EINVAL;
1da177e4 3029 if (what > 0)
22bedad3 3030 return dev_mc_add(dev, i->addr);
1da177e4 3031 else
22bedad3 3032 return dev_mc_del(dev, i->addr);
1da177e4
LT
3033 break;
3034 case PACKET_MR_PROMISC:
2aeb0b88 3035 return dev_set_promiscuity(dev, what);
1da177e4
LT
3036 break;
3037 case PACKET_MR_ALLMULTI:
2aeb0b88 3038 return dev_set_allmulti(dev, what);
1da177e4 3039 break;
d95ed927 3040 case PACKET_MR_UNICAST:
1162563f
JP
3041 if (i->alen != dev->addr_len)
3042 return -EINVAL;
d95ed927 3043 if (what > 0)
a748ee24 3044 return dev_uc_add(dev, i->addr);
d95ed927 3045 else
a748ee24 3046 return dev_uc_del(dev, i->addr);
d95ed927 3047 break;
40d4e3df
ED
3048 default:
3049 break;
1da177e4 3050 }
2aeb0b88 3051 return 0;
1da177e4
LT
3052}
3053
3054static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
3055{
40d4e3df 3056 for ( ; i; i = i->next) {
1da177e4
LT
3057 if (i->ifindex == dev->ifindex)
3058 packet_dev_mc(dev, i, what);
3059 }
3060}
3061
0fb375fb 3062static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3063{
3064 struct packet_sock *po = pkt_sk(sk);
3065 struct packet_mclist *ml, *i;
3066 struct net_device *dev;
3067 int err;
3068
3069 rtnl_lock();
3070
3071 err = -ENODEV;
3b1e0a65 3072 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3073 if (!dev)
3074 goto done;
3075
3076 err = -EINVAL;
1162563f 3077 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3078 goto done;
3079
3080 err = -ENOBUFS;
8b3a7005 3081 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3082 if (i == NULL)
3083 goto done;
3084
3085 err = 0;
3086 for (ml = po->mclist; ml; ml = ml->next) {
3087 if (ml->ifindex == mreq->mr_ifindex &&
3088 ml->type == mreq->mr_type &&
3089 ml->alen == mreq->mr_alen &&
3090 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3091 ml->count++;
3092 /* Free the new element ... */
3093 kfree(i);
3094 goto done;
3095 }
3096 }
3097
3098 i->type = mreq->mr_type;
3099 i->ifindex = mreq->mr_ifindex;
3100 i->alen = mreq->mr_alen;
3101 memcpy(i->addr, mreq->mr_address, i->alen);
3102 i->count = 1;
3103 i->next = po->mclist;
3104 po->mclist = i;
2aeb0b88
WC
3105 err = packet_dev_mc(dev, i, 1);
3106 if (err) {
3107 po->mclist = i->next;
3108 kfree(i);
3109 }
1da177e4
LT
3110
3111done:
3112 rtnl_unlock();
3113 return err;
3114}
3115
0fb375fb 3116static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3117{
3118 struct packet_mclist *ml, **mlp;
3119
3120 rtnl_lock();
3121
3122 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3123 if (ml->ifindex == mreq->mr_ifindex &&
3124 ml->type == mreq->mr_type &&
3125 ml->alen == mreq->mr_alen &&
3126 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3127 if (--ml->count == 0) {
3128 struct net_device *dev;
3129 *mlp = ml->next;
ad959e76
ED
3130 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3131 if (dev)
1da177e4 3132 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3133 kfree(ml);
3134 }
3135 rtnl_unlock();
3136 return 0;
3137 }
3138 }
3139 rtnl_unlock();
3140 return -EADDRNOTAVAIL;
3141}
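The membership helpers above back the PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP socket options. A short sketch of switching the bound interface into promiscuous mode; enable_promisc() is an illustrative helper and error handling is omitted. The same struct with PACKET_DROP_MEMBERSHIP undoes it.

/* Userspace sketch: per-socket promiscuous mode via membership. */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

int enable_promisc(int fd, int ifindex)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = ifindex;
        mreq.mr_type    = PACKET_MR_PROMISC;

        return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}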
3142
3143static void packet_flush_mclist(struct sock *sk)
3144{
3145 struct packet_sock *po = pkt_sk(sk);
3146 struct packet_mclist *ml;
3147
3148 if (!po->mclist)
3149 return;
3150
3151 rtnl_lock();
3152 while ((ml = po->mclist) != NULL) {
3153 struct net_device *dev;
3154
3155 po->mclist = ml->next;
ad959e76
ED
3156 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3157 if (dev != NULL)
1da177e4 3158 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3159 kfree(ml);
3160 }
3161 rtnl_unlock();
3162}
1da177e4
LT
3163
3164static int
b7058842 3165packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3166{
3167 struct sock *sk = sock->sk;
8dc41944 3168 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3169 int ret;
3170
3171 if (level != SOL_PACKET)
3172 return -ENOPROTOOPT;
3173
69e3c75f 3174 switch (optname) {
1ce4f28b 3175 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3176 case PACKET_DROP_MEMBERSHIP:
3177 {
0fb375fb
EB
3178 struct packet_mreq_max mreq;
3179 int len = optlen;
3180 memset(&mreq, 0, sizeof(mreq));
3181 if (len < sizeof(struct packet_mreq))
1da177e4 3182 return -EINVAL;
0fb375fb
EB
3183 if (len > sizeof(mreq))
3184 len = sizeof(mreq);
40d4e3df 3185 if (copy_from_user(&mreq, optval, len))
1da177e4 3186 return -EFAULT;
0fb375fb
EB
3187 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3188 return -EINVAL;
1da177e4
LT
3189 if (optname == PACKET_ADD_MEMBERSHIP)
3190 ret = packet_mc_add(sk, &mreq);
3191 else
3192 ret = packet_mc_drop(sk, &mreq);
3193 return ret;
3194 }
a2efcfa0 3195
1da177e4 3196 case PACKET_RX_RING:
69e3c75f 3197 case PACKET_TX_RING:
1da177e4 3198 {
f6fb8f10 3199 union tpacket_req_u req_u;
3200 int len;
1da177e4 3201
f6fb8f10 3202 switch (po->tp_version) {
3203 case TPACKET_V1:
3204 case TPACKET_V2:
3205 len = sizeof(req_u.req);
3206 break;
3207 case TPACKET_V3:
3208 default:
3209 len = sizeof(req_u.req3);
3210 break;
3211 }
3212 if (optlen < len)
1da177e4 3213 return -EINVAL;
bfd5f4a3
SS
3214 if (pkt_sk(sk)->has_vnet_hdr)
3215 return -EINVAL;
f6fb8f10 3216 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3217 return -EFAULT;
f6fb8f10 3218 return packet_set_ring(sk, &req_u, 0,
3219 optname == PACKET_TX_RING);
1da177e4
LT
3220 }
3221 case PACKET_COPY_THRESH:
3222 {
3223 int val;
3224
40d4e3df 3225 if (optlen != sizeof(val))
1da177e4 3226 return -EINVAL;
40d4e3df 3227 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3228 return -EFAULT;
3229
3230 pkt_sk(sk)->copy_thresh = val;
3231 return 0;
3232 }
bbd6ef87
PM
3233 case PACKET_VERSION:
3234 {
3235 int val;
3236
3237 if (optlen != sizeof(val))
3238 return -EINVAL;
69e3c75f 3239 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3240 return -EBUSY;
3241 if (copy_from_user(&val, optval, sizeof(val)))
3242 return -EFAULT;
3243 switch (val) {
3244 case TPACKET_V1:
3245 case TPACKET_V2:
f6fb8f10 3246 case TPACKET_V3:
bbd6ef87
PM
3247 po->tp_version = val;
3248 return 0;
3249 default:
3250 return -EINVAL;
3251 }
3252 }
8913336a
PM
3253 case PACKET_RESERVE:
3254 {
3255 unsigned int val;
3256
3257 if (optlen != sizeof(val))
3258 return -EINVAL;
69e3c75f 3259 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3260 return -EBUSY;
3261 if (copy_from_user(&val, optval, sizeof(val)))
3262 return -EFAULT;
3263 po->tp_reserve = val;
3264 return 0;
3265 }
69e3c75f
JB
3266 case PACKET_LOSS:
3267 {
3268 unsigned int val;
3269
3270 if (optlen != sizeof(val))
3271 return -EINVAL;
3272 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3273 return -EBUSY;
3274 if (copy_from_user(&val, optval, sizeof(val)))
3275 return -EFAULT;
3276 po->tp_loss = !!val;
3277 return 0;
3278 }
8dc41944
HX
3279 case PACKET_AUXDATA:
3280 {
3281 int val;
3282
3283 if (optlen < sizeof(val))
3284 return -EINVAL;
3285 if (copy_from_user(&val, optval, sizeof(val)))
3286 return -EFAULT;
3287
3288 po->auxdata = !!val;
3289 return 0;
3290 }
80feaacb
PWJ
3291 case PACKET_ORIGDEV:
3292 {
3293 int val;
3294
3295 if (optlen < sizeof(val))
3296 return -EINVAL;
3297 if (copy_from_user(&val, optval, sizeof(val)))
3298 return -EFAULT;
3299
3300 po->origdev = !!val;
3301 return 0;
3302 }
bfd5f4a3
SS
3303 case PACKET_VNET_HDR:
3304 {
3305 int val;
3306
3307 if (sock->type != SOCK_RAW)
3308 return -EINVAL;
3309 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3310 return -EBUSY;
3311 if (optlen < sizeof(val))
3312 return -EINVAL;
3313 if (copy_from_user(&val, optval, sizeof(val)))
3314 return -EFAULT;
3315
3316 po->has_vnet_hdr = !!val;
3317 return 0;
3318 }
614f60fa
SM
3319 case PACKET_TIMESTAMP:
3320 {
3321 int val;
3322
3323 if (optlen != sizeof(val))
3324 return -EINVAL;
3325 if (copy_from_user(&val, optval, sizeof(val)))
3326 return -EFAULT;
3327
3328 po->tp_tstamp = val;
3329 return 0;
3330 }
dc99f600
DM
3331 case PACKET_FANOUT:
3332 {
3333 int val;
3334
3335 if (optlen != sizeof(val))
3336 return -EINVAL;
3337 if (copy_from_user(&val, optval, sizeof(val)))
3338 return -EFAULT;
3339
3340 return fanout_add(sk, val & 0xffff, val >> 16);
3341 }
5920cd3a
PC
3342 case PACKET_TX_HAS_OFF:
3343 {
3344 unsigned int val;
3345
3346 if (optlen != sizeof(val))
3347 return -EINVAL;
3348 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3349 return -EBUSY;
3350 if (copy_from_user(&val, optval, sizeof(val)))
3351 return -EFAULT;
3352 po->tp_tx_has_off = !!val;
3353 return 0;
3354 }
d346a3fa
DB
3355 case PACKET_QDISC_BYPASS:
3356 {
3357 int val;
3358
3359 if (optlen != sizeof(val))
3360 return -EINVAL;
3361 if (copy_from_user(&val, optval, sizeof(val)))
3362 return -EFAULT;
3363
3364 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3365 return 0;
3366 }
1da177e4
LT
3367 default:
3368 return -ENOPROTOOPT;
3369 }
3370}
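As the PACKET_FANOUT case above shows, the option value packs the group id into the low 16 bits and the fanout mode into the high bits. A one-call sketch using PACKET_FANOUT_HASH; join_fanout_group() is an illustrative helper.

/* Userspace sketch: join a fanout group so load is spread across
 * several packet sockets sharing the same group id.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

int join_fanout_group(int fd, unsigned short group_id)
{
        int val = group_id | (PACKET_FANOUT_HASH << 16);

        return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}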
3371
3372static int packet_getsockopt(struct socket *sock, int level, int optname,
3373 char __user *optval, int __user *optlen)
3374{
3375 int len;
c06fff6e 3376 int val, lv = sizeof(val);
1da177e4
LT
3377 struct sock *sk = sock->sk;
3378 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3379 void *data = &val;
ee80fbf3 3380 union tpacket_stats_u st;
1da177e4
LT
3381
3382 if (level != SOL_PACKET)
3383 return -ENOPROTOOPT;
3384
8ae55f04
KK
3385 if (get_user(len, optlen))
3386 return -EFAULT;
1da177e4
LT
3387
3388 if (len < 0)
3389 return -EINVAL;
1ce4f28b 3390
69e3c75f 3391 switch (optname) {
1da177e4 3392 case PACKET_STATISTICS:
1da177e4 3393 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3394 memcpy(&st, &po->stats, sizeof(st));
3395 memset(&po->stats, 0, sizeof(po->stats));
3396 spin_unlock_bh(&sk->sk_receive_queue.lock);
3397
f6fb8f10 3398 if (po->tp_version == TPACKET_V3) {
c06fff6e 3399 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3400 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3401 data = &st.stats3;
f6fb8f10 3402 } else {
c06fff6e 3403 lv = sizeof(struct tpacket_stats);
8bcdeaff 3404 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3405 data = &st.stats1;
f6fb8f10 3406 }
ee80fbf3 3407
8dc41944
HX
3408 break;
3409 case PACKET_AUXDATA:
8dc41944 3410 val = po->auxdata;
80feaacb
PWJ
3411 break;
3412 case PACKET_ORIGDEV:
80feaacb 3413 val = po->origdev;
bfd5f4a3
SS
3414 break;
3415 case PACKET_VNET_HDR:
bfd5f4a3 3416 val = po->has_vnet_hdr;
1da177e4 3417 break;
bbd6ef87 3418 case PACKET_VERSION:
bbd6ef87 3419 val = po->tp_version;
bbd6ef87
PM
3420 break;
3421 case PACKET_HDRLEN:
3422 if (len > sizeof(int))
3423 len = sizeof(int);
3424 if (copy_from_user(&val, optval, len))
3425 return -EFAULT;
3426 switch (val) {
3427 case TPACKET_V1:
3428 val = sizeof(struct tpacket_hdr);
3429 break;
3430 case TPACKET_V2:
3431 val = sizeof(struct tpacket2_hdr);
3432 break;
f6fb8f10 3433 case TPACKET_V3:
3434 val = sizeof(struct tpacket3_hdr);
3435 break;
bbd6ef87
PM
3436 default:
3437 return -EINVAL;
3438 }
bbd6ef87 3439 break;
8913336a 3440 case PACKET_RESERVE:
8913336a 3441 val = po->tp_reserve;
8913336a 3442 break;
69e3c75f 3443 case PACKET_LOSS:
69e3c75f 3444 val = po->tp_loss;
69e3c75f 3445 break;
614f60fa 3446 case PACKET_TIMESTAMP:
614f60fa 3447 val = po->tp_tstamp;
614f60fa 3448 break;
dc99f600 3449 case PACKET_FANOUT:
dc99f600
DM
3450 val = (po->fanout ?
3451 ((u32)po->fanout->id |
77f65ebd
WB
3452 ((u32)po->fanout->type << 16) |
3453 ((u32)po->fanout->flags << 24)) :
dc99f600 3454 0);
dc99f600 3455 break;
5920cd3a
PC
3456 case PACKET_TX_HAS_OFF:
3457 val = po->tp_tx_has_off;
3458 break;
d346a3fa
DB
3459 case PACKET_QDISC_BYPASS:
3460 val = packet_use_direct_xmit(po);
3461 break;
1da177e4
LT
3462 default:
3463 return -ENOPROTOOPT;
3464 }
3465
c06fff6e
ED
3466 if (len > lv)
3467 len = lv;
8ae55f04
KK
3468 if (put_user(len, optlen))
3469 return -EFAULT;
8dc41944
HX
3470 if (copy_to_user(optval, data, len))
3471 return -EFAULT;
8ae55f04 3472 return 0;
1da177e4
LT
3473}
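A sketch of reading PACKET_STATISTICS from userspace with the TPACKET_V1/V2 layout; note the counters are clear-on-read, matching the memset above. print_stats() is an illustrative helper.

/* Userspace sketch: dump and reset the socket's packet/drop counters. */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

void print_stats(int fd)
{
        struct tpacket_stats st;
        socklen_t len = sizeof(st);

        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}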
3474
3475
3476static int packet_notifier(struct notifier_block *this,
3477 unsigned long msg, void *ptr)
1da177e4
LT
3478{
3479 struct sock *sk;
351638e7 3480 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3481 struct net *net = dev_net(dev);
1da177e4 3482
808f5114 3483 rcu_read_lock();
b67bfe0d 3484 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3485 struct packet_sock *po = pkt_sk(sk);
3486
3487 switch (msg) {
3488 case NETDEV_UNREGISTER:
1da177e4
LT
3489 if (po->mclist)
3490 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3491 /* fallthrough */
3492
1da177e4
LT
3493 case NETDEV_DOWN:
3494 if (dev->ifindex == po->ifindex) {
3495 spin_lock(&po->bind_lock);
3496 if (po->running) {
ce06b03e 3497 __unregister_prot_hook(sk, false);
1da177e4
LT
3498 sk->sk_err = ENETDOWN;
3499 if (!sock_flag(sk, SOCK_DEAD))
3500 sk->sk_error_report(sk);
3501 }
3502 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3503 packet_cached_dev_reset(po);
1da177e4 3504 po->ifindex = -1;
160ff18a
BG
3505 if (po->prot_hook.dev)
3506 dev_put(po->prot_hook.dev);
1da177e4
LT
3507 po->prot_hook.dev = NULL;
3508 }
3509 spin_unlock(&po->bind_lock);
3510 }
3511 break;
3512 case NETDEV_UP:
808f5114 3513 if (dev->ifindex == po->ifindex) {
3514 spin_lock(&po->bind_lock);
ce06b03e
DM
3515 if (po->num)
3516 register_prot_hook(sk);
808f5114 3517 spin_unlock(&po->bind_lock);
1da177e4 3518 }
1da177e4
LT
3519 break;
3520 }
3521 }
808f5114 3522 rcu_read_unlock();
1da177e4
LT
3523 return NOTIFY_DONE;
3524}
3525
3526
3527static int packet_ioctl(struct socket *sock, unsigned int cmd,
3528 unsigned long arg)
3529{
3530 struct sock *sk = sock->sk;
3531
69e3c75f 3532 switch (cmd) {
40d4e3df
ED
3533 case SIOCOUTQ:
3534 {
3535 int amount = sk_wmem_alloc_get(sk);
31e6d363 3536
40d4e3df
ED
3537 return put_user(amount, (int __user *)arg);
3538 }
3539 case SIOCINQ:
3540 {
3541 struct sk_buff *skb;
3542 int amount = 0;
3543
3544 spin_lock_bh(&sk->sk_receive_queue.lock);
3545 skb = skb_peek(&sk->sk_receive_queue);
3546 if (skb)
3547 amount = skb->len;
3548 spin_unlock_bh(&sk->sk_receive_queue.lock);
3549 return put_user(amount, (int __user *)arg);
3550 }
3551 case SIOCGSTAMP:
3552 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3553 case SIOCGSTAMPNS:
3554 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3555
1da177e4 3556#ifdef CONFIG_INET
40d4e3df
ED
3557 case SIOCADDRT:
3558 case SIOCDELRT:
3559 case SIOCDARP:
3560 case SIOCGARP:
3561 case SIOCSARP:
3562 case SIOCGIFADDR:
3563 case SIOCSIFADDR:
3564 case SIOCGIFBRDADDR:
3565 case SIOCSIFBRDADDR:
3566 case SIOCGIFNETMASK:
3567 case SIOCSIFNETMASK:
3568 case SIOCGIFDSTADDR:
3569 case SIOCSIFDSTADDR:
3570 case SIOCSIFFLAGS:
40d4e3df 3571 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3572#endif
3573
40d4e3df
ED
3574 default:
3575 return -ENOIOCTLCMD;
1da177e4
LT
3576 }
3577 return 0;
3578}
3579
40d4e3df 3580static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3581 poll_table *wait)
3582{
3583 struct sock *sk = sock->sk;
3584 struct packet_sock *po = pkt_sk(sk);
3585 unsigned int mask = datagram_poll(file, sock, wait);
3586
3587 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3588 if (po->rx_ring.pg_vec) {
f6fb8f10 3589 if (!packet_previous_rx_frame(po, &po->rx_ring,
3590 TP_STATUS_KERNEL))
1da177e4
LT
3591 mask |= POLLIN | POLLRDNORM;
3592 }
3593 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3594 spin_lock_bh(&sk->sk_write_queue.lock);
3595 if (po->tx_ring.pg_vec) {
3596 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3597 mask |= POLLOUT | POLLWRNORM;
3598 }
3599 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3600 return mask;
3601}
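A small userspace counterpart to packet_poll(): poll() reports POLLIN when the RX ring holds a frame owned by userspace and POLLOUT when a TX slot is available. wait_for_ring() is an illustrative helper.

/* Userspace sketch: block until the mmap'ed ring needs attention. */
#include <poll.h>

int wait_for_ring(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };

        return poll(&pfd, 1, timeout_ms);   /* <0 error, 0 timeout, >0 ready */
}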
3602

/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */

3608static void packet_mm_open(struct vm_area_struct *vma)
3609{
3610 struct file *file = vma->vm_file;
40d4e3df 3611 struct socket *sock = file->private_data;
1da177e4 3612 struct sock *sk = sock->sk;
1ce4f28b 3613
1da177e4
LT
3614 if (sk)
3615 atomic_inc(&pkt_sk(sk)->mapped);
3616}
3617
3618static void packet_mm_close(struct vm_area_struct *vma)
3619{
3620 struct file *file = vma->vm_file;
40d4e3df 3621 struct socket *sock = file->private_data;
1da177e4 3622 struct sock *sk = sock->sk;
1ce4f28b 3623
1da177e4
LT
3624 if (sk)
3625 atomic_dec(&pkt_sk(sk)->mapped);
3626}
3627
f0f37e2f 3628static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3629 .open = packet_mm_open,
3630 .close = packet_mm_close,
1da177e4
LT
3631};
3632
0e3125c7
NH
3633static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3634 unsigned int len)
1da177e4
LT
3635{
3636 int i;
3637
4ebf0ae2 3638 for (i = 0; i < len; i++) {
0e3125c7 3639 if (likely(pg_vec[i].buffer)) {
c56b4d90 3640 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3641 vfree(pg_vec[i].buffer);
3642 else
3643 free_pages((unsigned long)pg_vec[i].buffer,
3644 order);
3645 pg_vec[i].buffer = NULL;
3646 }
1da177e4
LT
3647 }
3648 kfree(pg_vec);
3649}
3650
eea49cc9 3651static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3652{
f0d4eb29 3653 char *buffer;
0e3125c7
NH
3654 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3655 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3656
3657 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3658 if (buffer)
3659 return buffer;
3660
f0d4eb29 3661 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3662 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3663 if (buffer)
3664 return buffer;
3665
f0d4eb29 3666 /* vmalloc failed, lets dig into swap here */
0e3125c7 3667 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3668 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3669 if (buffer)
3670 return buffer;
3671
f0d4eb29 3672 /* complete and utter failure */
0e3125c7 3673 return NULL;
4ebf0ae2
DM
3674}
3675
0e3125c7 3676static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3677{
3678 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3679 struct pgv *pg_vec;
4ebf0ae2
DM
3680 int i;
3681
0e3125c7 3682 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3683 if (unlikely(!pg_vec))
3684 goto out;
3685
3686 for (i = 0; i < block_nr; i++) {
c56b4d90 3687 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3688 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3689 goto out_free_pgvec;
3690 }
3691
3692out:
3693 return pg_vec;
3694
3695out_free_pgvec:
3696 free_pg_vec(pg_vec, order, block_nr);
3697 pg_vec = NULL;
3698 goto out;
3699}
1da177e4 3700
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
		/* Transmit path is not supported. We checked
		 * it above but just being paranoid
		 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
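/*
 * Map the configured ring(s) into userspace: the RX ring (if any) is
 * inserted first, followed by the TX ring, and the vma must cover
 * exactly the combined size of both.
 */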
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
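/*
 * Two proto_ops tables: packet_ops_spkt backs the legacy SOCK_PACKET
 * interface, while packet_ops backs SOCK_RAW/SOCK_DGRAM AF_PACKET
 * sockets, including the ring setsockopts and mmap handled above.
 */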
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

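/*
 * /proc/net/packet: one line per PF_PACKET socket in the current
 * namespace, walked under rcu_read_lock() by the iterators below.
 */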
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
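/*
 * Per-network-namespace setup: initialise the namespace socket list and
 * create (and later remove) the /proc/net/packet entry.
 */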
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);