/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are silly (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It very likely points to the ll header.
                 PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

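/*
 * Editorial illustration (not part of the original file): the ll-header
 * rules above are exactly what a user-space packet socket observes.  A
 * minimal SOCK_RAW receiver, assuming only the standard AF_PACKET uAPI
 * (<linux/if_packet.h>, <linux/if_ether.h>):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// SOCK_RAW: frames arrive with the link-layer header in place
 *	// (mac_header as described above).  With SOCK_DGRAM the kernel
 *	// strips the ll header and reports it via sockaddr_ll instead.
 *	char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&sll, &slen);
 */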
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

b9c32fb2
DB
433static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
434 unsigned int flags)
7a51384c
DB
435{
436 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
437
68a360e8
WB
438 if (shhwtstamps &&
439 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
440 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
441 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
442
443 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 444 return TP_STATUS_TS_SOFTWARE;
7a51384c 445
b9c32fb2 446 return 0;
7a51384c
DB
447}
448
b9c32fb2
DB
449static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
450 struct sk_buff *skb)
2e31396f
WB
451{
452 union tpacket_uhdr h;
453 struct timespec ts;
b9c32fb2 454 __u32 ts_status;
2e31396f 455
b9c32fb2
DB
456 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
457 return 0;
2e31396f
WB
458
459 h.raw = frame;
460 switch (po->tp_version) {
461 case TPACKET_V1:
462 h.h1->tp_sec = ts.tv_sec;
463 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
464 break;
465 case TPACKET_V2:
466 h.h2->tp_sec = ts.tv_sec;
467 h.h2->tp_nsec = ts.tv_nsec;
468 break;
469 case TPACKET_V3:
470 default:
471 WARN(1, "TPACKET version not supported.\n");
472 BUG();
473 }
474
475 /* one flush is safe, as both fields always lie on the same cacheline */
476 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
477 smp_wmb();
b9c32fb2
DB
478
479 return ts_status;
2e31396f
WB
480}
481
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

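/*
 * Editorial worked example (not in the original source): with a
 * tp_block_size of 4096 and a tp_frame_size of 2048, frames_per_block is
 * 4096 / 2048 = 2, so frame number 5 lives in pg_vec block 5 / 2 = 2 at
 * byte offset (5 % 2) * 2048 = 2048 -- exactly the division/modulo done
 * by packet_lookup_frame() above.
 */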
bc59ba39 509static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 510{
511 del_timer_sync(&pkc->retire_blk_timer);
512}
513
514static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
515 int tx_ring,
516 struct sk_buff_head *rb_queue)
517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
22781a5b
DJ
520 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
521 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 522
ec6f809f 523 spin_lock_bh(&rb_queue->lock);
f6fb8f10 524 pkc->delete_blk_timer = 1;
ec6f809f 525 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 526
527 prb_del_retire_blk_timer(pkc);
528}
529
530static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 531 struct tpacket_kbdq_core *pkc,
f6fb8f10 532 void (*func) (unsigned long))
533{
534 init_timer(&pkc->retire_blk_timer);
535 pkc->retire_blk_timer.data = (long)po;
536 pkc->retire_blk_timer.function = func;
537 pkc->retire_blk_timer.expires = jiffies;
538}
539
540static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
541{
bc59ba39 542 struct tpacket_kbdq_core *pkc;
f6fb8f10 543
544 if (tx_ring)
545 BUG();
546
22781a5b
DJ
547 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
548 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 549 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
550}
551
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

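/*
 * Editorial worked example (not in the original source): on a 1 Gbit/s
 * link with a 1 MiB block, mbits = (1 MiB * 8) / 2^20 = 8 and
 * div = 1000 / 1000 = 1, so the computed timeout is 8 + 1 = 9 ms --
 * roughly the time needed to fill one block, matching the "~8 ms" figure
 * quoted in the timer-logic comment further down.  On a 10 Gbit/s link
 * the same block gives 8 / 10 = 0, i.e. a 1 ms timeout.
 */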
bc59ba39 595static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 596 union tpacket_req_u *req_u)
597{
598 p1->feature_req_word = req_u->req3.tp_feature_req_word;
599}
600
601static void init_prb_bdqc(struct packet_sock *po,
602 struct packet_ring_buffer *rb,
603 struct pgv *pg_vec,
604 union tpacket_req_u *req_u, int tx_ring)
605{
22781a5b 606 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 607 struct tpacket_block_desc *pbd;
f6fb8f10 608
609 memset(p1, 0x0, sizeof(*p1));
610
611 p1->knxt_seq_num = 1;
612 p1->pkbdq = pg_vec;
bc59ba39 613 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 614 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 615 p1->kblk_size = req_u->req3.tp_block_size;
616 p1->knum_blocks = req_u->req3.tp_block_nr;
617 p1->hdrlen = po->tp_hdrlen;
618 p1->version = po->tp_version;
619 p1->last_kactive_blk_num = 0;
ee80fbf3 620 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 621 if (req_u->req3.tp_retire_blk_tov)
622 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
623 else
624 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
625 req_u->req3.tp_block_size);
626 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
627 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
628
dc808110 629 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 630 prb_init_ft_ops(p1, req_u);
631 prb_setup_retire_blk_timer(po, tx_ring);
632 prb_open_block(p1, pbd);
633}
634
635/* Do NOT update the last_blk_num first.
636 * Assumes sk_buff_head lock is held.
637 */
bc59ba39 638static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 639{
640 mod_timer(&pkc->retire_blk_timer,
641 jiffies + pkc->tov_in_jiffies);
642 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
643}
644
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
668static void prb_retire_rx_blk_timer_expired(unsigned long data)
669{
670 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 671 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 672 unsigned int frozen;
bc59ba39 673 struct tpacket_block_desc *pbd;
f6fb8f10 674
675 spin_lock(&po->sk.sk_receive_queue.lock);
676
677 frozen = prb_queue_frozen(pkc);
678 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
679
680 if (unlikely(pkc->delete_blk_timer))
681 goto out;
682
683 /* We only need to plug the race when the block is partially filled.
684 * tpacket_rcv:
685 * lock(); increment BLOCK_NUM_PKTS; unlock()
686 * copy_bits() is in progress ...
687 * timer fires on other cpu:
688 * we can't retire the current block because copy_bits
689 * is in progress.
690 *
691 */
692 if (BLOCK_NUM_PKTS(pbd)) {
693 while (atomic_read(&pkc->blk_fill_in_prog)) {
694 /* Waiting for skb_copy_bits to finish... */
695 cpu_relax();
696 }
697 }
698
699 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
700 if (!frozen) {
41a50d62
AD
701 if (!BLOCK_NUM_PKTS(pbd)) {
702 /* An empty block. Just refresh the timer. */
703 goto refresh_timer;
704 }
f6fb8f10 705 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
706 if (!prb_dispatch_next_block(pkc, po))
707 goto refresh_timer;
708 else
709 goto out;
710 } else {
711 /* Case 1. Queue was frozen because user-space was
712 * lagging behind.
713 */
714 if (prb_curr_blk_in_use(pkc, pbd)) {
715 /*
716 * Ok, user-space is still behind.
717 * So just refresh the timer.
718 */
719 goto refresh_timer;
720 } else {
721 /* Case 2. queue was frozen,user-space caught up,
722 * now the link went idle && the timer fired.
723 * We don't have a block to close.So we open this
724 * block and restart the timer.
725 * opening a block thaws the queue,restarts timer
726 * Thawing/timer-refresh is a side effect.
727 */
728 prb_open_block(pkc, pbd);
729 goto out;
730 }
731 }
732 }
733
734refresh_timer:
735 _prb_refresh_rx_retire_blk_timer(pkc);
736
737out:
738 spin_unlock(&po->sk.sk_receive_queue.lock);
739}
740
eea49cc9 741static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 742 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 743{
744 /* Flush everything minus the block header */
745
746#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
747 u8 *start, *end;
748
749 start = (u8 *)pbd1;
750
751 /* Skip the block header(we know header WILL fit in 4K) */
752 start += PAGE_SIZE;
753
754 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
755 for (; start < end; start += PAGE_SIZE)
756 flush_dcache_page(pgv_to_page(start));
757
758 smp_wmb();
759#endif
760
761 /* Now update the block status. */
762
763 BLOCK_STATUS(pbd1) = status;
764
765 /* Flush the block header */
766
767#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
768 start = (u8 *)pbd1;
769 flush_dcache_page(pgv_to_page(start));
770
771 smp_wmb();
772#endif
773}
774
/*
 * Side effects:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: we DON'T refresh the timer on purpose,
 * because almost always the next block will be opened.
 */
bc59ba39 784static void prb_close_block(struct tpacket_kbdq_core *pkc1,
785 struct tpacket_block_desc *pbd1,
f6fb8f10 786 struct packet_sock *po, unsigned int stat)
787{
788 __u32 status = TP_STATUS_USER | stat;
789
790 struct tpacket3_hdr *last_pkt;
bc59ba39 791 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 792 struct sock *sk = &po->sk;
f6fb8f10 793
ee80fbf3 794 if (po->stats.stats3.tp_drops)
f6fb8f10 795 status |= TP_STATUS_LOSING;
796
797 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
798 last_pkt->tp_next_offset = 0;
799
800 /* Get the ts of the last pkt */
801 if (BLOCK_NUM_PKTS(pbd1)) {
802 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
803 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
804 } else {
41a50d62
AD
805 /* Ok, we tmo'd - so get the current time.
806 *
807 * It shouldn't really happen as we don't close empty
808 * blocks. See prb_retire_rx_blk_timer_expired().
809 */
f6fb8f10 810 struct timespec ts;
811 getnstimeofday(&ts);
812 h1->ts_last_pkt.ts_sec = ts.tv_sec;
813 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
814 }
815
816 smp_wmb();
817
818 /* Flush the block */
819 prb_flush_block(pkc1, pbd1, status);
820
da413eec
DC
821 sk->sk_data_ready(sk);
822
f6fb8f10 823 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
824}
825
eea49cc9 826static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 827{
828 pkc->reset_pending_on_curr_blk = 0;
829}
830
831/*
832 * Side effect of opening a block:
833 *
834 * 1) prb_queue is thawed.
835 * 2) retire_blk_timer is refreshed.
836 *
837 */
bc59ba39 838static void prb_open_block(struct tpacket_kbdq_core *pkc1,
839 struct tpacket_block_desc *pbd1)
f6fb8f10 840{
841 struct timespec ts;
bc59ba39 842 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 843
844 smp_rmb();
845
8da3056c
DB
846 /* We could have just memset this but we will lose the
847 * flexibility of making the priv area sticky
848 */
f6fb8f10 849
8da3056c
DB
850 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
851 BLOCK_NUM_PKTS(pbd1) = 0;
852 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 853
8da3056c
DB
854 getnstimeofday(&ts);
855
856 h1->ts_first_pkt.ts_sec = ts.tv_sec;
857 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 858
8da3056c
DB
859 pkc1->pkblk_start = (char *)pbd1;
860 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
861
862 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
863 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
864
865 pbd1->version = pkc1->version;
866 pkc1->prev = pkc1->nxt_offset;
867 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
868
869 prb_thaw_queue(pkc1);
870 _prb_refresh_rx_retire_blk_timer(pkc1);
871
872 smp_wmb();
f6fb8f10 873}
874
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens the Rx ring.
 * 3) Some time past 't0', the kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) The link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) The link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
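
/*
 * Editorial sketch (not part of the original file): the freeze/thaw dance
 * above exists because user space owns a block while its block_status has
 * TP_STATUS_USER set.  A typical TPACKET_V3 consumer loop, assuming the
 * ring was set up with setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3,
 * sizeof(req3)) and mmap()ed at "ring", with walk_block() an app-defined
 * helper that iterates the tpacket3_hdrs inside a block:
 *
 *	struct tpacket_block_desc *pbd;
 *
 *	pbd = (struct tpacket_block_desc *)(ring + i * req3.tp_block_size);
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to close it
 *
 *	walk_block(pbd);			// consume all packets in the block
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand it back
 *	i = (i + 1) % req3.tp_block_nr;
 *
 * Returning the block promptly is what lets prb_dispatch_next_block() find
 * a TP_STATUS_KERNEL block again instead of freezing the queue.
 */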
904
905#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
906
907/*
908 * If the next block is free then we will dispatch it
909 * and return a good offset.
910 * Else, we will freeze the queue.
911 * So, caller must check the return value.
912 */
bc59ba39 913static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 914 struct packet_sock *po)
915{
bc59ba39 916 struct tpacket_block_desc *pbd;
f6fb8f10 917
918 smp_rmb();
919
920 /* 1. Get current block num */
921 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
922
923 /* 2. If this block is currently in_use then freeze the queue */
924 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
925 prb_freeze_queue(pkc, po);
926 return NULL;
927 }
928
929 /*
930 * 3.
931 * open this block and return the offset where the first packet
932 * needs to get stored.
933 */
934 prb_open_block(pkc, pbd);
935 return (void *)pkc->nxt_offset;
936}
937
bc59ba39 938static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 939 struct packet_sock *po, unsigned int status)
940{
bc59ba39 941 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 942
943 /* retire/close the current block */
944 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
945 /*
946 * Plug the case where copy_bits() is in progress on
947 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
948 * have space to copy the pkt in the current block and
949 * called prb_retire_current_block()
950 *
951 * We don't need to worry about the TMO case because
952 * the timer-handler already handled this case.
953 */
954 if (!(status & TP_STATUS_BLK_TMO)) {
955 while (atomic_read(&pkc->blk_fill_in_prog)) {
956 /* Waiting for skb_copy_bits to finish... */
957 cpu_relax();
958 }
959 }
960 prb_close_block(pkc, pbd, po, status);
961 return;
962 }
f6fb8f10 963}
964
eea49cc9 965static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 966 struct tpacket_block_desc *pbd)
f6fb8f10 967{
968 return TP_STATUS_USER & BLOCK_STATUS(pbd);
969}
970
eea49cc9 971static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 972{
973 return pkc->reset_pending_on_curr_blk;
974}
975
eea49cc9 976static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 977{
bc59ba39 978 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 979 atomic_dec(&pkc->blk_fill_in_prog);
980}
981
eea49cc9 982static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 983 struct tpacket3_hdr *ppd)
984{
3958afa1 985 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 986}
987
eea49cc9 988static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 989 struct tpacket3_hdr *ppd)
990{
991 ppd->hv1.tp_rxhash = 0;
992}
993
eea49cc9 994static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 995 struct tpacket3_hdr *ppd)
996{
df8a39de
JP
997 if (skb_vlan_tag_present(pkc->skb)) {
998 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
999 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1000 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1001 } else {
9e67030a 1002 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1003 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1004 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1005 }
1006}
1007
bc59ba39 1008static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1009 struct tpacket3_hdr *ppd)
1010{
a0cdfcf3 1011 ppd->hv1.tp_padding = 0;
f6fb8f10 1012 prb_fill_vlan_info(pkc, ppd);
1013
1014 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1015 prb_fill_rxhash(pkc, ppd);
1016 else
1017 prb_clear_rxhash(pkc, ppd);
1018}
1019
eea49cc9 1020static void prb_fill_curr_block(char *curr,
bc59ba39 1021 struct tpacket_kbdq_core *pkc,
1022 struct tpacket_block_desc *pbd,
f6fb8f10 1023 unsigned int len)
1024{
1025 struct tpacket3_hdr *ppd;
1026
1027 ppd = (struct tpacket3_hdr *)curr;
1028 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1029 pkc->prev = curr;
1030 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1031 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1032 BLOCK_NUM_PKTS(pbd) += 1;
1033 atomic_inc(&pkc->blk_fill_in_prog);
1034 prb_run_all_ft_ops(pkc, ppd);
1035}
1036
1037/* Assumes caller has the sk->rx_queue.lock */
1038static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1039 struct sk_buff *skb,
1040 int status,
1041 unsigned int len
1042 )
1043{
bc59ba39 1044 struct tpacket_kbdq_core *pkc;
1045 struct tpacket_block_desc *pbd;
f6fb8f10 1046 char *curr, *end;
1047
e3192690 1048 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1049 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1050
1051 /* Queue is frozen when user space is lagging behind */
1052 if (prb_queue_frozen(pkc)) {
1053 /*
1054 * Check if that last block which caused the queue to freeze,
1055 * is still in_use by user-space.
1056 */
1057 if (prb_curr_blk_in_use(pkc, pbd)) {
1058 /* Can't record this packet */
1059 return NULL;
1060 } else {
1061 /*
1062 * Ok, the block was released by user-space.
1063 * Now let's open that block.
1064 * opening a block also thaws the queue.
1065 * Thawing is a side effect.
1066 */
1067 prb_open_block(pkc, pbd);
1068 }
1069 }
1070
1071 smp_mb();
1072 curr = pkc->nxt_offset;
1073 pkc->skb = skb;
e3192690 1074 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1075
1076 /* first try the current block */
1077 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1078 prb_fill_curr_block(curr, pkc, pbd, len);
1079 return (void *)curr;
1080 }
1081
1082 /* Ok, close the current block */
1083 prb_retire_current_block(pkc, po, 0);
1084
1085 /* Now, try to dispatch the next block */
1086 curr = (char *)prb_dispatch_next_block(pkc, po);
1087 if (curr) {
1088 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1089 prb_fill_curr_block(curr, pkc, pbd, len);
1090 return (void *)curr;
1091 }
1092
1093 /*
1094 * No free blocks are available.user_space hasn't caught up yet.
1095 * Queue was just frozen and now this packet will get dropped.
1096 */
1097 return NULL;
1098}
1099
eea49cc9 1100static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1101 struct sk_buff *skb,
1102 int status, unsigned int len)
1103{
1104 char *curr = NULL;
1105 switch (po->tp_version) {
1106 case TPACKET_V1:
1107 case TPACKET_V2:
1108 curr = packet_lookup_frame(po, &po->rx_ring,
1109 po->rx_ring.head, status);
1110 return curr;
1111 case TPACKET_V3:
1112 return __packet_lookup_frame_in_block(po, skb, status, len);
1113 default:
1114 WARN(1, "TPACKET version not supported\n");
1115 BUG();
99aa3473 1116 return NULL;
f6fb8f10 1117 }
1118}
1119
eea49cc9 1120static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1121 struct packet_ring_buffer *rb,
77f65ebd 1122 unsigned int idx,
f6fb8f10 1123 int status)
1124{
bc59ba39 1125 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1126 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1127
1128 if (status != BLOCK_STATUS(pbd))
1129 return NULL;
1130 return pbd;
1131}
1132
eea49cc9 1133static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1134{
1135 unsigned int prev;
1136 if (rb->prb_bdqc.kactive_blk_num)
1137 prev = rb->prb_bdqc.kactive_blk_num-1;
1138 else
1139 prev = rb->prb_bdqc.knum_blocks-1;
1140 return prev;
1141}
1142
1143/* Assumes caller has held the rx_queue.lock */
eea49cc9 1144static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1145 struct packet_ring_buffer *rb,
1146 int status)
1147{
1148 unsigned int previous = prb_previous_blk_num(rb);
1149 return prb_lookup_block(po, rb, previous, status);
1150}
1151
eea49cc9 1152static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1153 struct packet_ring_buffer *rb,
1154 int status)
1155{
1156 if (po->tp_version <= TPACKET_V2)
1157 return packet_previous_frame(po, rb, status);
1158
1159 return __prb_previous_block(po, rb, status);
1160}
1161
eea49cc9 1162static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1163 struct packet_ring_buffer *rb)
1164{
1165 switch (po->tp_version) {
1166 case TPACKET_V1:
1167 case TPACKET_V2:
1168 return packet_increment_head(rb);
1169 case TPACKET_V3:
1170 default:
1171 WARN(1, "TPACKET version not supported.\n");
1172 BUG();
1173 return;
1174 }
1175}
1176
eea49cc9 1177static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1178 struct packet_ring_buffer *rb,
1179 int status)
1180{
1181 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1182 return packet_lookup_frame(po, rb, previous, status);
1183}
1184
eea49cc9 1185static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1186{
1187 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1188}
1189
b0138408
DB
1190static void packet_inc_pending(struct packet_ring_buffer *rb)
1191{
1192 this_cpu_inc(*rb->pending_refcnt);
1193}
1194
1195static void packet_dec_pending(struct packet_ring_buffer *rb)
1196{
1197 this_cpu_dec(*rb->pending_refcnt);
1198}
1199
1200static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1201{
1202 unsigned int refcnt = 0;
1203 int cpu;
1204
1205 /* We don't use pending refcount in rx_ring. */
1206 if (rb->pending_refcnt == NULL)
1207 return 0;
1208
1209 for_each_possible_cpu(cpu)
1210 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1211
1212 return refcnt;
1213}
1214
1215static int packet_alloc_pending(struct packet_sock *po)
1216{
1217 po->rx_ring.pending_refcnt = NULL;
1218
1219 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1220 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1221 return -ENOBUFS;
1222
1223 return 0;
1224}
1225
1226static void packet_free_pending(struct packet_sock *po)
1227{
1228 free_percpu(po->tx_ring.pending_refcnt);
1229}
1230
77f65ebd
WB
1231static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1232{
1233 struct sock *sk = &po->sk;
1234 bool has_room;
1235
1236 if (po->prot_hook.func != tpacket_rcv)
1237 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1238 <= sk->sk_rcvbuf;
1239
1240 spin_lock(&sk->sk_receive_queue.lock);
1241 if (po->tp_version == TPACKET_V3)
1242 has_room = prb_lookup_block(po, &po->rx_ring,
1243 po->rx_ring.prb_bdqc.kactive_blk_num,
1244 TP_STATUS_KERNEL);
1245 else
1246 has_room = packet_lookup_frame(po, &po->rx_ring,
1247 po->rx_ring.head,
1248 TP_STATUS_KERNEL);
1249 spin_unlock(&sk->sk_receive_queue.lock);
1250
1251 return has_room;
1252}
1253
1da177e4
LT
1254static void packet_sock_destruct(struct sock *sk)
1255{
ed85b565
RC
1256 skb_queue_purge(&sk->sk_error_queue);
1257
547b792c
IJ
1258 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1259 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1260
1261 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1262 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1263 return;
1264 }
1265
17ab56a2 1266 sk_refcnt_debug_dec(sk);
1da177e4
LT
1267}
1268
dc99f600
DM
1269static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1270{
1271 int x = atomic_read(&f->rr_cur) + 1;
1272
1273 if (x >= num)
1274 x = 0;
1275
1276 return x;
1277}
1278
77f65ebd
WB
1279static unsigned int fanout_demux_hash(struct packet_fanout *f,
1280 struct sk_buff *skb,
1281 unsigned int num)
dc99f600 1282{
61b905da 1283 return reciprocal_scale(skb_get_hash(skb), num);
dc99f600
DM
1284}
1285
77f65ebd
WB
1286static unsigned int fanout_demux_lb(struct packet_fanout *f,
1287 struct sk_buff *skb,
1288 unsigned int num)
dc99f600
DM
1289{
1290 int cur, old;
1291
1292 cur = atomic_read(&f->rr_cur);
1293 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1294 fanout_rr_next(f, num))) != cur)
1295 cur = old;
77f65ebd
WB
1296 return cur;
1297}
1298
1299static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1300 struct sk_buff *skb,
1301 unsigned int num)
1302{
1303 return smp_processor_id() % num;
dc99f600
DM
1304}
1305
5df0ddfb
DB
1306static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1307 struct sk_buff *skb,
1308 unsigned int num)
1309{
f337db64 1310 return prandom_u32_max(num);
5df0ddfb
DB
1311}
1312
77f65ebd
WB
1313static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1314 struct sk_buff *skb,
1315 unsigned int idx, unsigned int skip,
1316 unsigned int num)
95ec3eb4 1317{
77f65ebd 1318 unsigned int i, j;
95ec3eb4 1319
77f65ebd
WB
1320 i = j = min_t(int, f->next[idx], num - 1);
1321 do {
1322 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1323 if (i != j)
1324 f->next[idx] = i;
1325 return i;
1326 }
1327 if (++i == num)
1328 i = 0;
1329 } while (i != j);
1330
1331 return idx;
1332}
1333
2d36097d
NH
1334static unsigned int fanout_demux_qm(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
1338 return skb_get_queue_mapping(skb) % num;
1339}
1340
77f65ebd
WB
1341static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1342{
1343 return f->flags & (flag >> 8);
95ec3eb4
DM
1344}
1345
95ec3eb4
DM
1346static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1347 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1348{
1349 struct packet_fanout *f = pt->af_packet_priv;
1350 unsigned int num = f->num_members;
1351 struct packet_sock *po;
77f65ebd 1352 unsigned int idx;
dc99f600
DM
1353
1354 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1355 !num) {
1356 kfree_skb(skb);
1357 return 0;
1358 }
1359
3f34b24a
AD
1360 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1361 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1362 if (!skb)
1363 return 0;
1364 }
95ec3eb4
DM
1365 switch (f->type) {
1366 case PACKET_FANOUT_HASH:
1367 default:
77f65ebd 1368 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1369 break;
1370 case PACKET_FANOUT_LB:
77f65ebd 1371 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1372 break;
1373 case PACKET_FANOUT_CPU:
77f65ebd
WB
1374 idx = fanout_demux_cpu(f, skb, num);
1375 break;
5df0ddfb
DB
1376 case PACKET_FANOUT_RND:
1377 idx = fanout_demux_rnd(f, skb, num);
1378 break;
2d36097d
NH
1379 case PACKET_FANOUT_QM:
1380 idx = fanout_demux_qm(f, skb, num);
1381 break;
77f65ebd
WB
1382 case PACKET_FANOUT_ROLLOVER:
1383 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1384 break;
dc99f600
DM
1385 }
1386
77f65ebd
WB
1387 po = pkt_sk(f->arr[idx]);
1388 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1389 unlikely(!packet_rcv_has_room(po, skb))) {
1390 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1391 po = pkt_sk(f->arr[idx]);
1392 }
dc99f600
DM
1393
1394 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1395}
1396
fff3321d
PE
1397DEFINE_MUTEX(fanout_mutex);
1398EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1399static LIST_HEAD(fanout_list);
1400
1401static void __fanout_link(struct sock *sk, struct packet_sock *po)
1402{
1403 struct packet_fanout *f = po->fanout;
1404
1405 spin_lock(&f->lock);
1406 f->arr[f->num_members] = sk;
1407 smp_wmb();
1408 f->num_members++;
1409 spin_unlock(&f->lock);
1410}
1411
1412static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1413{
1414 struct packet_fanout *f = po->fanout;
1415 int i;
1416
1417 spin_lock(&f->lock);
1418 for (i = 0; i < f->num_members; i++) {
1419 if (f->arr[i] == sk)
1420 break;
1421 }
1422 BUG_ON(i >= f->num_members);
1423 f->arr[i] = f->arr[f->num_members - 1];
1424 f->num_members--;
1425 spin_unlock(&f->lock);
1426}
1427
d4dd8aee 1428static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1429{
d4dd8aee 1430 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
c0de08d0
EL
1431 return true;
1432
1433 return false;
1434}
1435
7736d33f 1436static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1437{
1438 struct packet_sock *po = pkt_sk(sk);
1439 struct packet_fanout *f, *match;
7736d33f 1440 u8 type = type_flags & 0xff;
77f65ebd 1441 u8 flags = type_flags >> 8;
dc99f600
DM
1442 int err;
1443
1444 switch (type) {
77f65ebd
WB
1445 case PACKET_FANOUT_ROLLOVER:
1446 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1447 return -EINVAL;
dc99f600
DM
1448 case PACKET_FANOUT_HASH:
1449 case PACKET_FANOUT_LB:
95ec3eb4 1450 case PACKET_FANOUT_CPU:
5df0ddfb 1451 case PACKET_FANOUT_RND:
2d36097d 1452 case PACKET_FANOUT_QM:
dc99f600
DM
1453 break;
1454 default:
1455 return -EINVAL;
1456 }
1457
1458 if (!po->running)
1459 return -EINVAL;
1460
1461 if (po->fanout)
1462 return -EALREADY;
1463
1464 mutex_lock(&fanout_mutex);
1465 match = NULL;
1466 list_for_each_entry(f, &fanout_list, list) {
1467 if (f->id == id &&
1468 read_pnet(&f->net) == sock_net(sk)) {
1469 match = f;
1470 break;
1471 }
1472 }
afe62c68 1473 err = -EINVAL;
77f65ebd 1474 if (match && match->flags != flags)
afe62c68 1475 goto out;
dc99f600 1476 if (!match) {
afe62c68 1477 err = -ENOMEM;
dc99f600 1478 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1479 if (!match)
1480 goto out;
1481 write_pnet(&match->net, sock_net(sk));
1482 match->id = id;
1483 match->type = type;
77f65ebd 1484 match->flags = flags;
afe62c68
ED
1485 atomic_set(&match->rr_cur, 0);
1486 INIT_LIST_HEAD(&match->list);
1487 spin_lock_init(&match->lock);
1488 atomic_set(&match->sk_ref, 0);
1489 match->prot_hook.type = po->prot_hook.type;
1490 match->prot_hook.dev = po->prot_hook.dev;
1491 match->prot_hook.func = packet_rcv_fanout;
1492 match->prot_hook.af_packet_priv = match;
c0de08d0 1493 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1494 dev_add_pack(&match->prot_hook);
1495 list_add(&match->list, &fanout_list);
dc99f600 1496 }
afe62c68
ED
1497 err = -EINVAL;
1498 if (match->type == type &&
1499 match->prot_hook.type == po->prot_hook.type &&
1500 match->prot_hook.dev == po->prot_hook.dev) {
1501 err = -ENOSPC;
1502 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1503 __dev_remove_pack(&po->prot_hook);
1504 po->fanout = match;
1505 atomic_inc(&match->sk_ref);
1506 __fanout_link(sk, po);
1507 err = 0;
dc99f600
DM
1508 }
1509 }
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

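/*
 * Editorial sketch (not part of the original file): fanout_add() is what
 * backs the PACKET_FANOUT socket option.  A user-space process joins a
 * fanout group roughly like this (the group id and mode are chosen by the
 * application):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// bind() the socket to a device/protocol first, so po->running is set
 *	int arg = fanout_group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Every socket that passes the same group id (low 16 bits) in the same
 * network namespace lands in the same struct packet_fanout, and
 * packet_rcv_fanout() then picks one member per packet according to the
 * mode in the high 16 bits.
 */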
1515static void fanout_release(struct sock *sk)
1516{
1517 struct packet_sock *po = pkt_sk(sk);
1518 struct packet_fanout *f;
1519
1520 f = po->fanout;
1521 if (!f)
1522 return;
1523
fff3321d 1524 mutex_lock(&fanout_mutex);
dc99f600
DM
1525 po->fanout = NULL;
1526
dc99f600
DM
1527 if (atomic_dec_and_test(&f->sk_ref)) {
1528 list_del(&f->list);
1529 dev_remove_pack(&f->prot_hook);
1530 kfree(f);
1531 }
1532 mutex_unlock(&fanout_mutex);
1533}
1da177e4 1534
90ddc4f0 1535static const struct proto_ops packet_ops;
1da177e4 1536
90ddc4f0 1537static const struct proto_ops packet_ops_spkt;
1da177e4 1538
40d4e3df
ED
1539static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1540 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1541{
1542 struct sock *sk;
1543 struct sockaddr_pkt *spkt;
1544
1545 /*
1546 * When we registered the protocol we saved the socket in the data
1547 * field for just this event.
1548 */
1549
1550 sk = pt->af_packet_priv;
1ce4f28b 1551
1da177e4
LT
	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have the ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so this procedure is a noop.
	 */
1562
1563 if (skb->pkt_type == PACKET_LOOPBACK)
1564 goto out;
1565
09ad9bc7 1566 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1567 goto out;
1568
40d4e3df
ED
1569 skb = skb_share_check(skb, GFP_ATOMIC);
1570 if (skb == NULL)
1da177e4
LT
1571 goto oom;
1572
1573 /* drop any routing info */
adf30907 1574 skb_dst_drop(skb);
1da177e4 1575
84531c24
PO
1576 /* drop conntrack reference */
1577 nf_reset(skb);
1578
ffbc6111 1579 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1580
98e399f8 1581 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1582
1583 /*
1584 * The SOCK_PACKET socket receives _all_ frames.
1585 */
1586
1587 spkt->spkt_family = dev->type;
1588 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1589 spkt->spkt_protocol = skb->protocol;
1590
1591 /*
1592 * Charge the memory to the socket. This is done specifically
1593 * to prevent sockets using all the memory up.
1594 */
1595
40d4e3df 1596 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1597 return 0;
1598
1599out:
1600 kfree_skb(skb);
1601oom:
1602 return 0;
1603}
1604
1605
1606/*
1607 * Output a raw packet to a device layer. This bypasses all the other
1608 * protocol layers and you must therefore supply it with a complete frame
1609 */
1ce4f28b 1610
1da177e4
LT
1611static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1612 struct msghdr *msg, size_t len)
1613{
1614 struct sock *sk = sock->sk;
342dfc30 1615 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1616 struct sk_buff *skb = NULL;
1da177e4 1617 struct net_device *dev;
40d4e3df 1618 __be16 proto = 0;
1da177e4 1619 int err;
3bdc0eba 1620 int extra_len = 0;
1ce4f28b 1621
1da177e4 1622 /*
1ce4f28b 1623 * Get and verify the address.
1da177e4
LT
1624 */
1625
40d4e3df 1626 if (saddr) {
1da177e4 1627 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1628 return -EINVAL;
1629 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1630 proto = saddr->spkt_protocol;
1631 } else
1632 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1633
1634 /*
1ce4f28b 1635 * Find the device first to size check it
1da177e4
LT
1636 */
1637
de74e92a 1638 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1639retry:
654d1f8a
ED
1640 rcu_read_lock();
1641 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1642 err = -ENODEV;
1643 if (dev == NULL)
1644 goto out_unlock;
1ce4f28b 1645
d5e76b0a
DM
1646 err = -ENETDOWN;
1647 if (!(dev->flags & IFF_UP))
1648 goto out_unlock;
1649
1da177e4 1650 /*
40d4e3df
ED
1651 * You may not queue a frame bigger than the mtu. This is the lowest level
1652 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1653 */
1ce4f28b 1654
3bdc0eba
BG
1655 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1656 if (!netif_supports_nofcs(dev)) {
1657 err = -EPROTONOSUPPORT;
1658 goto out_unlock;
1659 }
1660 extra_len = 4; /* We're doing our own CRC */
1661 }
1662
1da177e4 1663 err = -EMSGSIZE;
3bdc0eba 1664 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1665 goto out_unlock;
1666
1a35ca80
ED
1667 if (!skb) {
1668 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1669 int tlen = dev->needed_tailroom;
1a35ca80
ED
1670 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1671
1672 rcu_read_unlock();
4ce40912 1673 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1674 if (skb == NULL)
1675 return -ENOBUFS;
1676 /* FIXME: Save some space for broken drivers that write a hard
1677 * header at transmission time by themselves. PPP is the notable
1678 * one here. This should really be fixed at the driver level.
1679 */
1680 skb_reserve(skb, reserved);
1681 skb_reset_network_header(skb);
1682
1683 /* Try to align data part correctly */
1684 if (hhlen) {
1685 skb->data -= hhlen;
1686 skb->tail -= hhlen;
1687 if (len < hhlen)
1688 skb_reset_network_header(skb);
1689 }
6ce8e9ce 1690 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1691 if (err)
1692 goto out_free;
1693 goto retry;
1da177e4
LT
1694 }
1695
3bdc0eba 1696 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1697 /* Earlier code assumed this would be a VLAN pkt,
1698 * double-check this now that we have the actual
1699 * packet in hand.
1700 */
1701 struct ethhdr *ehdr;
1702 skb_reset_mac_header(skb);
1703 ehdr = eth_hdr(skb);
1704 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1705 err = -EMSGSIZE;
1706 goto out_unlock;
1707 }
1708 }
1a35ca80 1709
1da177e4
LT
1710 skb->protocol = proto;
1711 skb->dev = dev;
1712 skb->priority = sk->sk_priority;
2d37a186 1713 skb->mark = sk->sk_mark;
bf84a010
DB
1714
1715 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1716
3bdc0eba
BG
1717 if (unlikely(extra_len == 4))
1718 skb->no_fcs = 1;
1719
40893fd0 1720 skb_probe_transport_header(skb, 0);
c1aad275 1721
1da177e4 1722 dev_queue_xmit(skb);
654d1f8a 1723 rcu_read_unlock();
40d4e3df 1724 return len;
1da177e4 1725
1da177e4 1726out_unlock:
654d1f8a 1727 rcu_read_unlock();
1a35ca80
ED
1728out_free:
1729 kfree_skb(skb);
1da177e4
LT
1730 return err;
1731}
1da177e4 1732
static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

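/*
 * Editorial sketch (not part of the original file): the filter consulted
 * above is the classic BPF program a user attaches with SO_ATTACH_FILTER;
 * its return value becomes 'res' (0 = drop, otherwise the snap length).
 * For example, accepting every packet but keeping only its first 96 bytes:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 96 },	// return 96
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */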
/*
 * This function makes lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so if we return the skb to its original state on exit,
 * we will not harm anyone.
 */

40d4e3df
ED
1760static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1761 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1762{
1763 struct sock *sk;
1764 struct sockaddr_ll *sll;
1765 struct packet_sock *po;
40d4e3df 1766 u8 *skb_head = skb->data;
1da177e4 1767 int skb_len = skb->len;
dbcb5855 1768 unsigned int snaplen, res;
1da177e4
LT
1769
1770 if (skb->pkt_type == PACKET_LOOPBACK)
1771 goto drop;
1772
1773 sk = pt->af_packet_priv;
1774 po = pkt_sk(sk);
1775
09ad9bc7 1776 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1777 goto drop;
1778
1da177e4
LT
1779 skb->dev = dev;
1780
3b04ddde 1781 if (dev->header_ops) {
1da177e4 1782 /* The device has an explicit notion of ll header,
62ab0812
ED
1783 * exported to higher levels.
1784 *
1785 * Otherwise, the device hides details of its frame
1786 * structure, so that corresponding packet head is
1787 * never delivered to user.
1da177e4
LT
1788 */
1789 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1790 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1791 else if (skb->pkt_type == PACKET_OUTGOING) {
1792 /* Special case: outgoing packets have ll header at head */
bbe735e4 1793 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1794 }
1795 }
1796
1797 snaplen = skb->len;
1798
dbcb5855
DM
1799 res = run_filter(skb, sk, snaplen);
1800 if (!res)
fda9ef5d 1801 goto drop_n_restore;
dbcb5855
DM
1802 if (snaplen > res)
1803 snaplen = res;
1da177e4 1804
0fd7bac6 1805 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1806 goto drop_n_acct;
1807
1808 if (skb_shared(skb)) {
1809 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1810 if (nskb == NULL)
1811 goto drop_n_acct;
1812
1813 if (skb_head != skb->data) {
1814 skb->data = skb_head;
1815 skb->len = skb_len;
1816 }
abc4e4fa 1817 consume_skb(skb);
1da177e4
LT
1818 skb = nskb;
1819 }
1820
ffbc6111
HX
1821 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1822 sizeof(skb->cb));
1823
1824 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1825 sll->sll_family = AF_PACKET;
1826 sll->sll_hatype = dev->type;
1827 sll->sll_protocol = skb->protocol;
1828 sll->sll_pkttype = skb->pkt_type;
8032b464 1829 if (unlikely(po->origdev))
80feaacb
PWJ
1830 sll->sll_ifindex = orig_dev->ifindex;
1831 else
1832 sll->sll_ifindex = dev->ifindex;
1da177e4 1833
b95cce35 1834 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1835
ffbc6111 1836 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1837
1da177e4
LT
1838 if (pskb_trim(skb, snaplen))
1839 goto drop_n_acct;
1840
1841 skb_set_owner_r(skb, sk);
1842 skb->dev = NULL;
adf30907 1843 skb_dst_drop(skb);
1da177e4 1844
84531c24
PO
1845 /* drop conntrack reference */
1846 nf_reset(skb);
1847
1da177e4 1848 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1849 po->stats.stats1.tp_packets++;
3b885787 1850 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1851 __skb_queue_tail(&sk->sk_receive_queue, skb);
1852 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 1853 sk->sk_data_ready(sk);
1da177e4
LT
1854 return 0;
1855
1856drop_n_acct:
7091fbd8 1857 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1858 po->stats.stats1.tp_drops++;
7091fbd8
WB
1859 atomic_inc(&sk->sk_drops);
1860 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1861
1862drop_n_restore:
1863 if (skb_head != skb->data && skb_shared(skb)) {
1864 skb->data = skb_head;
1865 skb->len = skb_len;
1866 }
1867drop:
ead2ceb0 1868 consume_skb(skb);
1da177e4
LT
1869 return 0;
1870}
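
/*
 * Illustrative userspace sketch (not part of this file): packet_rcv() above
 * queues frames for plain (non-mmap) sockets; recvfrom() then returns the
 * frame together with the sockaddr_ll that was stashed in the skb cb.
 * ETH_P_ALL and the 2048-byte buffer are example choices; CAP_NET_RAW is
 * required to open the socket.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void recv_one_frame(void)
{
	unsigned char frame[2048];
	struct sockaddr_ll from = { 0 };
	socklen_t fromlen = sizeof(from);
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	ssize_t n;

	if (fd < 0)
		return;
	n = recvfrom(fd, frame, sizeof(frame), 0,
		     (struct sockaddr *)&from, &fromlen);
	if (n > 0)
		printf("ifindex=%d proto=0x%04x len=%zd\n",
		       from.sll_ifindex, ntohs(from.sll_protocol), n);
}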
1871
40d4e3df
ED
1872static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1873 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1874{
1875 struct sock *sk;
1876 struct packet_sock *po;
1877 struct sockaddr_ll *sll;
184f489e 1878 union tpacket_uhdr h;
40d4e3df 1879 u8 *skb_head = skb->data;
1da177e4 1880 int skb_len = skb->len;
dbcb5855 1881 unsigned int snaplen, res;
f6fb8f10 1882 unsigned long status = TP_STATUS_USER;
bbd6ef87 1883 unsigned short macoff, netoff, hdrlen;
1da177e4 1884 struct sk_buff *copy_skb = NULL;
bbd6ef87 1885 struct timespec ts;
b9c32fb2 1886 __u32 ts_status;
1da177e4 1887
51846355
AW
1888 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1889 * We may add members to them up to the current aligned size without forcing
1890 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1891 */
1892 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1893 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1894
1da177e4
LT
1895 if (skb->pkt_type == PACKET_LOOPBACK)
1896 goto drop;
1897
1898 sk = pt->af_packet_priv;
1899 po = pkt_sk(sk);
1900
09ad9bc7 1901 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1902 goto drop;
1903
3b04ddde 1904 if (dev->header_ops) {
1da177e4 1905 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1906 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1907 else if (skb->pkt_type == PACKET_OUTGOING) {
1908 /* Special case: outgoing packets have ll header at head */
bbe735e4 1909 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1910 }
1911 }
1912
8dc41944
HX
1913 if (skb->ip_summed == CHECKSUM_PARTIAL)
1914 status |= TP_STATUS_CSUMNOTREADY;
1915
1da177e4
LT
1916 snaplen = skb->len;
1917
dbcb5855
DM
1918 res = run_filter(skb, sk, snaplen);
1919 if (!res)
fda9ef5d 1920 goto drop_n_restore;
dbcb5855
DM
1921 if (snaplen > res)
1922 snaplen = res;
1da177e4
LT
1923
1924 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1925 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1926 po->tp_reserve;
1da177e4 1927 } else {
95c96174 1928 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1929 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1930 (maclen < 16 ? 16 : maclen)) +
1931 po->tp_reserve;
1da177e4
LT
1932 macoff = netoff - maclen;
1933 }
f6fb8f10 1934 if (po->tp_version <= TPACKET_V2) {
1935 if (macoff + snaplen > po->rx_ring.frame_size) {
1936 if (po->copy_thresh &&
0fd7bac6 1937 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1938 if (skb_shared(skb)) {
1939 copy_skb = skb_clone(skb, GFP_ATOMIC);
1940 } else {
1941 copy_skb = skb_get(skb);
1942 skb_head = skb->data;
1943 }
1944 if (copy_skb)
1945 skb_set_owner_r(copy_skb, sk);
1da177e4 1946 }
f6fb8f10 1947 snaplen = po->rx_ring.frame_size - macoff;
1948 if ((int)snaplen < 0)
1949 snaplen = 0;
1da177e4 1950 }
dc808110
ED
1951 } else if (unlikely(macoff + snaplen >
1952 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
1953 u32 nval;
1954
1955 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
1956 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
1957 snaplen, nval, macoff);
1958 snaplen = nval;
1959 if (unlikely((int)snaplen < 0)) {
1960 snaplen = 0;
1961 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
1962 }
1da177e4 1963 }
1da177e4 1964 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1965 h.raw = packet_current_rx_frame(po, skb,
1966 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1967 if (!h.raw)
1da177e4 1968 goto ring_is_full;
f6fb8f10 1969 if (po->tp_version <= TPACKET_V2) {
1970 packet_increment_rx_head(po, &po->rx_ring);
1971 /*
1972 * LOSING will be reported until you read the stats,
1973 * because it's COR - Clear On Read.
1974 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
1975 * at the packet level.
1976 */
ee80fbf3 1977 if (po->stats.stats1.tp_drops)
f6fb8f10 1978 status |= TP_STATUS_LOSING;
1979 }
ee80fbf3 1980 po->stats.stats1.tp_packets++;
1da177e4
LT
1981 if (copy_skb) {
1982 status |= TP_STATUS_COPY;
1983 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1984 }
1da177e4
LT
1985 spin_unlock(&sk->sk_receive_queue.lock);
1986
bbd6ef87 1987 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1988
1989 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1990 getnstimeofday(&ts);
1da177e4 1991
b9c32fb2
DB
1992 status |= ts_status;
1993
bbd6ef87
PM
1994 switch (po->tp_version) {
1995 case TPACKET_V1:
1996 h.h1->tp_len = skb->len;
1997 h.h1->tp_snaplen = snaplen;
1998 h.h1->tp_mac = macoff;
1999 h.h1->tp_net = netoff;
4b457bdf
DB
2000 h.h1->tp_sec = ts.tv_sec;
2001 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2002 hdrlen = sizeof(*h.h1);
2003 break;
2004 case TPACKET_V2:
2005 h.h2->tp_len = skb->len;
2006 h.h2->tp_snaplen = snaplen;
2007 h.h2->tp_mac = macoff;
2008 h.h2->tp_net = netoff;
bbd6ef87
PM
2009 h.h2->tp_sec = ts.tv_sec;
2010 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2011 if (skb_vlan_tag_present(skb)) {
2012 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2013 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2014 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2015 } else {
2016 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2017 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2018 }
e4d26f4b 2019 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2020 hdrlen = sizeof(*h.h2);
2021 break;
f6fb8f10 2022 case TPACKET_V3:
2023 /* tp_nxt_offset and vlan are already populated above,
2024 * so DON'T clear those fields here.
2025 */
2026 h.h3->tp_status |= status;
2027 h.h3->tp_len = skb->len;
2028 h.h3->tp_snaplen = snaplen;
2029 h.h3->tp_mac = macoff;
2030 h.h3->tp_net = netoff;
f6fb8f10 2031 h.h3->tp_sec = ts.tv_sec;
2032 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2033 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2034 hdrlen = sizeof(*h.h3);
2035 break;
bbd6ef87
PM
2036 default:
2037 BUG();
2038 }
1da177e4 2039
bbd6ef87 2040 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2041 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2042 sll->sll_family = AF_PACKET;
2043 sll->sll_hatype = dev->type;
2044 sll->sll_protocol = skb->protocol;
2045 sll->sll_pkttype = skb->pkt_type;
8032b464 2046 if (unlikely(po->origdev))
80feaacb
PWJ
2047 sll->sll_ifindex = orig_dev->ifindex;
2048 else
2049 sll->sll_ifindex = dev->ifindex;
1da177e4 2050
e16aa207 2051 smp_mb();
f0d4eb29 2052
f6dafa95 2053#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2054 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2055 u8 *start, *end;
2056
f0d4eb29
DB
2057 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2058 macoff + snaplen);
2059
2060 for (start = h.raw; start < end; start += PAGE_SIZE)
2061 flush_dcache_page(pgv_to_page(start));
1da177e4 2062 }
f0d4eb29 2063 smp_wmb();
f6dafa95 2064#endif
f0d4eb29 2065
da413eec 2066 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2067 __packet_set_status(po, h.raw, status);
da413eec
DC
2068 sk->sk_data_ready(sk);
2069 } else {
f6fb8f10 2070 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2071 }
1da177e4
LT
2072
2073drop_n_restore:
2074 if (skb_head != skb->data && skb_shared(skb)) {
2075 skb->data = skb_head;
2076 skb->len = skb_len;
2077 }
2078drop:
1ce4f28b 2079 kfree_skb(skb);
1da177e4
LT
2080 return 0;
2081
2082ring_is_full:
ee80fbf3 2083 po->stats.stats1.tp_drops++;
1da177e4
LT
2084 spin_unlock(&sk->sk_receive_queue.lock);
2085
676d2369 2086 sk->sk_data_ready(sk);
acb5d75b 2087 kfree_skb(copy_skb);
1da177e4
LT
2088 goto drop_n_restore;
2089}
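
/*
 * Illustrative userspace sketch (not part of this file): tpacket_rcv() above
 * fills the mmap'ed RX ring; a TPACKET_V2 reader walks the frames, waits for
 * TP_STATUS_USER and hands each slot back with TP_STATUS_KERNEL. The ring
 * geometry below is an arbitrary example, and the simple i * frame_size
 * indexing works because the block size is a multiple of the frame size.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>

static void rx_ring_loop(int fd)
{
	struct tpacket_req req = {
		.tp_block_size	= 4096,
		.tp_block_nr	= 64,
		.tp_frame_size	= 2048,
		.tp_frame_nr	= 128,	/* block_nr * frames per block */
	};
	int version = TPACKET_V2;
	unsigned int i = 0;
	void *ring;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(void *)((char *)ring + i * req.tp_frame_size);
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			poll(&pfd, 1, -1);
			continue;
		}
		/* Packet data starts tp_mac bytes into the frame:
		 * (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen.
		 */
		hdr->tp_status = TP_STATUS_KERNEL;	/* return the slot */
		i = (i + 1) % req.tp_frame_nr;
	}
}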
2090
69e3c75f
JB
2091static void tpacket_destruct_skb(struct sk_buff *skb)
2092{
2093 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2094
69e3c75f 2095 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2096 void *ph;
b9c32fb2
DB
2097 __u32 ts;
2098
69e3c75f 2099 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2100 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2101
2102 ts = __packet_set_timestamp(po, ph, skb);
2103 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2104 }
2105
2106 sock_wfree(skb);
2107}
2108
9c707762
WB
2109static bool ll_header_truncated(const struct net_device *dev, int len)
2110{
2111 /* net device doesn't like empty head */
2112 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2113 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2114 current->comm, len, dev->hard_header_len);
2115 return true;
2116 }
2117
2118 return false;
2119}
2120
40d4e3df
ED
2121static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2122 void *frame, struct net_device *dev, int size_max,
ae641949 2123 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2124{
184f489e 2125 union tpacket_uhdr ph;
09effa67 2126 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2127 struct socket *sock = po->sk.sk_socket;
2128 struct page *page;
2129 void *data;
2130 int err;
2131
2132 ph.raw = frame;
2133
2134 skb->protocol = proto;
2135 skb->dev = dev;
2136 skb->priority = po->sk.sk_priority;
2d37a186 2137 skb->mark = po->sk.sk_mark;
2e31396f 2138 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2139 skb_shinfo(skb)->destructor_arg = ph.raw;
2140
2141 switch (po->tp_version) {
2142 case TPACKET_V2:
2143 tp_len = ph.h2->tp_len;
2144 break;
2145 default:
2146 tp_len = ph.h1->tp_len;
2147 break;
2148 }
09effa67
DM
2149 if (unlikely(tp_len > size_max)) {
2150 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2151 return -EMSGSIZE;
2152 }
69e3c75f 2153
ae641949 2154 skb_reserve(skb, hlen);
69e3c75f 2155 skb_reset_network_header(skb);
c1aad275 2156
d346a3fa
DB
2157 if (!packet_use_direct_xmit(po))
2158 skb_probe_transport_header(skb, 0);
2159 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2160 int off_min, off_max, off;
2161 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2162 off_max = po->tx_ring.frame_size - tp_len;
2163 if (sock->type == SOCK_DGRAM) {
2164 switch (po->tp_version) {
2165 case TPACKET_V2:
2166 off = ph.h2->tp_net;
2167 break;
2168 default:
2169 off = ph.h1->tp_net;
2170 break;
2171 }
2172 } else {
2173 switch (po->tp_version) {
2174 case TPACKET_V2:
2175 off = ph.h2->tp_mac;
2176 break;
2177 default:
2178 off = ph.h1->tp_mac;
2179 break;
2180 }
2181 }
2182 if (unlikely((off < off_min) || (off_max < off)))
2183 return -EINVAL;
2184 data = ph.raw + off;
2185 } else {
2186 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2187 }
69e3c75f
JB
2188 to_write = tp_len;
2189
2190 if (sock->type == SOCK_DGRAM) {
2191 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2192 NULL, tp_len);
2193 if (unlikely(err < 0))
2194 return -EINVAL;
40d4e3df 2195 } else if (dev->hard_header_len) {
9c707762 2196 if (ll_header_truncated(dev, tp_len))
69e3c75f 2197 return -EINVAL;
69e3c75f
JB
2198
2199 skb_push(skb, dev->hard_header_len);
2200 err = skb_store_bits(skb, 0, data,
2201 dev->hard_header_len);
2202 if (unlikely(err))
2203 return err;
2204
2205 data += dev->hard_header_len;
2206 to_write -= dev->hard_header_len;
2207 }
2208
69e3c75f
JB
2209 offset = offset_in_page(data);
2210 len_max = PAGE_SIZE - offset;
2211 len = ((to_write > len_max) ? len_max : to_write);
2212
2213 skb->data_len = to_write;
2214 skb->len += to_write;
2215 skb->truesize += to_write;
2216 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2217
2218 while (likely(to_write)) {
2219 nr_frags = skb_shinfo(skb)->nr_frags;
2220
2221 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2222 pr_err("Packet exceed the number of skb frags(%lu)\n",
2223 MAX_SKB_FRAGS);
69e3c75f
JB
2224 return -EFAULT;
2225 }
2226
0af55bb5
CG
2227 page = pgv_to_page(data);
2228 data += len;
69e3c75f
JB
2229 flush_dcache_page(page);
2230 get_page(page);
0af55bb5 2231 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2232 to_write -= len;
2233 offset = 0;
2234 len_max = PAGE_SIZE;
2235 len = ((to_write > len_max) ? len_max : to_write);
2236 }
2237
2238 return tp_len;
2239}
2240
2241static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2242{
69e3c75f
JB
2243 struct sk_buff *skb;
2244 struct net_device *dev;
2245 __be16 proto;
09effa67 2246 int err, reserve = 0;
40d4e3df 2247 void *ph;
342dfc30 2248 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2249 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2250 int tp_len, size_max;
2251 unsigned char *addr;
2252 int len_sum = 0;
9e67030a 2253 int status = TP_STATUS_AVAILABLE;
ae641949 2254 int hlen, tlen;
69e3c75f 2255
69e3c75f
JB
2256 mutex_lock(&po->pg_vec_lock);
2257
66e56cd4 2258 if (likely(saddr == NULL)) {
e40526cb 2259 dev = packet_cached_dev_get(po);
69e3c75f
JB
2260 proto = po->num;
2261 addr = NULL;
2262 } else {
2263 err = -EINVAL;
2264 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2265 goto out;
2266 if (msg->msg_namelen < (saddr->sll_halen
2267 + offsetof(struct sockaddr_ll,
2268 sll_addr)))
2269 goto out;
69e3c75f
JB
2270 proto = saddr->sll_protocol;
2271 addr = saddr->sll_addr;
827d9780 2272 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2273 }
2274
69e3c75f
JB
2275 err = -ENXIO;
2276 if (unlikely(dev == NULL))
2277 goto out;
69e3c75f
JB
2278 err = -ENETDOWN;
2279 if (unlikely(!(dev->flags & IFF_UP)))
2280 goto out_put;
2281
52f1454f 2282 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2283 size_max = po->tx_ring.frame_size
b5dd884e 2284 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2285
09effa67
DM
2286 if (size_max > dev->mtu + reserve)
2287 size_max = dev->mtu + reserve;
2288
69e3c75f
JB
2289 do {
2290 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2291 TP_STATUS_SEND_REQUEST);
69e3c75f 2292 if (unlikely(ph == NULL)) {
87a2fd28
DB
2293 if (need_wait && need_resched())
2294 schedule();
69e3c75f
JB
2295 continue;
2296 }
2297
2298 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2299 hlen = LL_RESERVED_SPACE(dev);
2300 tlen = dev->needed_tailroom;
69e3c75f 2301 skb = sock_alloc_send_skb(&po->sk,
ae641949 2302 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2303 0, &err);
2304
2305 if (unlikely(skb == NULL))
2306 goto out_status;
2307
2308 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f
DB
2309 addr, hlen);
2310 if (tp_len > dev->mtu + dev->hard_header_len) {
2311 struct ethhdr *ehdr;
2312 /* Earlier code assumed this would be a VLAN pkt,
2313 * double-check this now that we have the actual
2314 * packet in hand.
2315 */
69e3c75f 2316
52f1454f
DB
2317 skb_reset_mac_header(skb);
2318 ehdr = eth_hdr(skb);
2319 if (ehdr->h_proto != htons(ETH_P_8021Q))
2320 tp_len = -EMSGSIZE;
2321 }
69e3c75f
JB
2322 if (unlikely(tp_len < 0)) {
2323 if (po->tp_loss) {
2324 __packet_set_status(po, ph,
2325 TP_STATUS_AVAILABLE);
2326 packet_increment_head(&po->tx_ring);
2327 kfree_skb(skb);
2328 continue;
2329 } else {
2330 status = TP_STATUS_WRONG_FORMAT;
2331 err = tp_len;
2332 goto out_status;
2333 }
2334 }
2335
0fd5d57b
DB
2336 packet_pick_tx_queue(dev, skb);
2337
69e3c75f
JB
2338 skb->destructor = tpacket_destruct_skb;
2339 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2340 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2341
2342 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2343 err = po->xmit(skb);
eb70df13
JP
2344 if (unlikely(err > 0)) {
2345 err = net_xmit_errno(err);
2346 if (err && __packet_get_status(po, ph) ==
2347 TP_STATUS_AVAILABLE) {
2348 /* skb was destructed already */
2349 skb = NULL;
2350 goto out_status;
2351 }
2352 /*
2353 * skb was dropped but not destructed yet;
2354 * let's treat it like congestion or err < 0
2355 */
2356 err = 0;
2357 }
69e3c75f
JB
2358 packet_increment_head(&po->tx_ring);
2359 len_sum += tp_len;
b0138408
DB
2360 } while (likely((ph != NULL) ||
2361 /* Note: packet_read_pending() might be slow if we have
2362 * to call it, as it's a per-cpu variable, but in the fast path
2363 * we already short-circuit the loop with the first
2364 * condition, and luckily don't have to go that path
2365 * anyway.
2366 */
2367 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2368
2369 err = len_sum;
2370 goto out_put;
2371
69e3c75f
JB
2372out_status:
2373 __packet_set_status(po, ph, status);
2374 kfree_skb(skb);
2375out_put:
e40526cb 2376 dev_put(dev);
69e3c75f
JB
2377out:
2378 mutex_unlock(&po->pg_vec_lock);
2379 return err;
2380}
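
/*
 * Illustrative userspace sketch (not part of this file): tpacket_snd() above
 * drains frames that userspace marked TP_STATUS_SEND_REQUEST in a mmap'ed
 * PACKET_TX_RING. The ring geometry is an arbitrary example; the socket is
 * assumed to be a SOCK_RAW packet socket already bound to an interface, and
 * pkt is assumed to be a complete link-layer frame.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

static int tx_one_frame(int fd, const void *pkt, unsigned int len)
{
	struct tpacket_req req = {
		.tp_block_size = 4096, .tp_block_nr = 16,
		.tp_frame_size = 4096, .tp_frame_nr = 16,
	};
	int version = TPACKET_V2;
	struct tpacket2_hdr *hdr;
	char *ring;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
	if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req)) < 0)
		return -1;
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return -1;

	hdr = (struct tpacket2_hdr *)ring;	/* first slot */
	/* Without PACKET_TX_HAS_OFF the data starts right after the header. */
	memcpy(ring + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll), pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* Kick the kernel; tpacket_snd() walks the ring from here. */
	return (int)sendto(fd, NULL, 0, 0, NULL, 0);
}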
69e3c75f 2381
eea49cc9
OJ
2382static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2383 size_t reserve, size_t len,
2384 size_t linear, int noblock,
2385 int *err)
bfd5f4a3
SS
2386{
2387 struct sk_buff *skb;
2388
2389 /* Under a page? Don't bother with paged skb. */
2390 if (prepad + len < PAGE_SIZE || !linear)
2391 linear = len;
2392
2393 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2394 err, 0);
bfd5f4a3
SS
2395 if (!skb)
2396 return NULL;
2397
2398 skb_reserve(skb, reserve);
2399 skb_put(skb, linear);
2400 skb->data_len = len - linear;
2401 skb->len += len - linear;
2402
2403 return skb;
2404}
2405
d346a3fa 2406static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2407{
2408 struct sock *sk = sock->sk;
342dfc30 2409 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2410 struct sk_buff *skb;
2411 struct net_device *dev;
0e11c91e 2412 __be16 proto;
1da177e4 2413 unsigned char *addr;
827d9780 2414 int err, reserve = 0;
bfd5f4a3
SS
2415 struct virtio_net_hdr vnet_hdr = { 0 };
2416 int offset = 0;
2417 int vnet_hdr_len;
2418 struct packet_sock *po = pkt_sk(sk);
2419 unsigned short gso_type = 0;
ae641949 2420 int hlen, tlen;
3bdc0eba 2421 int extra_len = 0;
8feb2fb2 2422 ssize_t n;
1da177e4
LT
2423
2424 /*
1ce4f28b 2425 * Get and verify the address.
1da177e4 2426 */
1ce4f28b 2427
66e56cd4 2428 if (likely(saddr == NULL)) {
e40526cb 2429 dev = packet_cached_dev_get(po);
1da177e4
LT
2430 proto = po->num;
2431 addr = NULL;
2432 } else {
2433 err = -EINVAL;
2434 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2435 goto out;
0fb375fb
EB
2436 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2437 goto out;
1da177e4
LT
2438 proto = saddr->sll_protocol;
2439 addr = saddr->sll_addr;
827d9780 2440 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2441 }
2442
1da177e4 2443 err = -ENXIO;
e40526cb 2444 if (unlikely(dev == NULL))
1da177e4 2445 goto out_unlock;
d5e76b0a 2446 err = -ENETDOWN;
e40526cb 2447 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2448 goto out_unlock;
2449
e40526cb
DB
2450 if (sock->type == SOCK_RAW)
2451 reserve = dev->hard_header_len;
bfd5f4a3
SS
2452 if (po->has_vnet_hdr) {
2453 vnet_hdr_len = sizeof(vnet_hdr);
2454
2455 err = -EINVAL;
2456 if (len < vnet_hdr_len)
2457 goto out_unlock;
2458
2459 len -= vnet_hdr_len;
2460
8feb2fb2 2461 err = -EFAULT;
c0371da6 2462 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2463 if (n != vnet_hdr_len)
bfd5f4a3
SS
2464 goto out_unlock;
2465
2466 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2467 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2468 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2469 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2470 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2471 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2472 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2473
2474 err = -EINVAL;
dc9e5153 2475 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2476 goto out_unlock;
2477
2478 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2479 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2480 case VIRTIO_NET_HDR_GSO_TCPV4:
2481 gso_type = SKB_GSO_TCPV4;
2482 break;
2483 case VIRTIO_NET_HDR_GSO_TCPV6:
2484 gso_type = SKB_GSO_TCPV6;
2485 break;
2486 case VIRTIO_NET_HDR_GSO_UDP:
2487 gso_type = SKB_GSO_UDP;
2488 break;
2489 default:
2490 goto out_unlock;
2491 }
2492
2493 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2494 gso_type |= SKB_GSO_TCP_ECN;
2495
2496 if (vnet_hdr.gso_size == 0)
2497 goto out_unlock;
2498
2499 }
2500 }
2501
3bdc0eba
BG
2502 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2503 if (!netif_supports_nofcs(dev)) {
2504 err = -EPROTONOSUPPORT;
2505 goto out_unlock;
2506 }
2507 extra_len = 4; /* We're doing our own CRC */
2508 }
2509
1da177e4 2510 err = -EMSGSIZE;
3bdc0eba 2511 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2512 goto out_unlock;
2513
bfd5f4a3 2514 err = -ENOBUFS;
ae641949
HX
2515 hlen = LL_RESERVED_SPACE(dev);
2516 tlen = dev->needed_tailroom;
dc9e5153
MT
2517 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2518 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2519 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2520 if (skb == NULL)
1da177e4
LT
2521 goto out_unlock;
2522
bfd5f4a3 2523 skb_set_network_header(skb, reserve);
1da177e4 2524
0c4e8581 2525 err = -EINVAL;
9c707762
WB
2526 if (sock->type == SOCK_DGRAM) {
2527 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2528 if (unlikely(offset < 0))
9c707762
WB
2529 goto out_free;
2530 } else {
2531 if (ll_header_truncated(dev, len))
2532 goto out_free;
2533 }
1da177e4
LT
2534
2535 /* Returns -EFAULT on error */
c0371da6 2536 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2537 if (err)
2538 goto out_free;
bf84a010
DB
2539
2540 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2541
3bdc0eba 2542 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2543 /* Earlier code assumed this would be a VLAN pkt,
2544 * double-check this now that we have the actual
2545 * packet in hand.
2546 */
2547 struct ethhdr *ehdr;
2548 skb_reset_mac_header(skb);
2549 ehdr = eth_hdr(skb);
2550 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2551 err = -EMSGSIZE;
2552 goto out_free;
2553 }
57f89bfa
BG
2554 }
2555
09effa67
DM
2556 skb->protocol = proto;
2557 skb->dev = dev;
1da177e4 2558 skb->priority = sk->sk_priority;
2d37a186 2559 skb->mark = sk->sk_mark;
0fd5d57b
DB
2560
2561 packet_pick_tx_queue(dev, skb);
1da177e4 2562
bfd5f4a3
SS
2563 if (po->has_vnet_hdr) {
2564 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2565 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2566 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2567 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2568 err = -EINVAL;
2569 goto out_free;
2570 }
2571 }
2572
dc9e5153
MT
2573 skb_shinfo(skb)->gso_size =
2574 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2575 skb_shinfo(skb)->gso_type = gso_type;
2576
2577 /* Header must be checked, and gso_segs computed. */
2578 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2579 skb_shinfo(skb)->gso_segs = 0;
2580
2581 len += vnet_hdr_len;
2582 }
2583
d346a3fa
DB
2584 if (!packet_use_direct_xmit(po))
2585 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2586 if (unlikely(extra_len == 4))
2587 skb->no_fcs = 1;
2588
d346a3fa 2589 err = po->xmit(skb);
1da177e4
LT
2590 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2591 goto out_unlock;
2592
e40526cb 2593 dev_put(dev);
1da177e4 2594
40d4e3df 2595 return len;
1da177e4
LT
2596
2597out_free:
2598 kfree_skb(skb);
2599out_unlock:
e40526cb 2600 if (dev)
1da177e4
LT
2601 dev_put(dev);
2602out:
2603 return err;
2604}
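
/*
 * Illustrative userspace sketch (not part of this file): with a SOCK_DGRAM
 * packet socket, packet_snd() above builds the link-layer header itself via
 * dev_hard_header(); userspace supplies only the payload plus a sockaddr_ll
 * carrying the destination MAC. The interface index, protocol and MAC are
 * example placeholders.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int send_payload(int fd, int ifindex, const unsigned char dst_mac[6],
			const void *payload, size_t len)
{
	struct sockaddr_ll to;

	memset(&to, 0, sizeof(to));
	to.sll_family = AF_PACKET;
	to.sll_ifindex = ifindex;
	to.sll_protocol = htons(ETH_P_IP);
	to.sll_halen = ETH_ALEN;
	memcpy(to.sll_addr, dst_mac, ETH_ALEN);

	return (int)sendto(fd, payload, len, 0,
			   (struct sockaddr *)&to, sizeof(to));
}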
2605
69e3c75f
JB
2606static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2607 struct msghdr *msg, size_t len)
2608{
69e3c75f
JB
2609 struct sock *sk = sock->sk;
2610 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2611
69e3c75f
JB
2612 if (po->tx_ring.pg_vec)
2613 return tpacket_snd(po, msg);
2614 else
69e3c75f
JB
2615 return packet_snd(sock, msg, len);
2616}
2617
1da177e4
LT
2618/*
2619 * Close a PACKET socket. This is fairly simple. We immediately go
2620 * to 'closed' state and remove our protocol entry in the device list.
2621 */
2622
2623static int packet_release(struct socket *sock)
2624{
2625 struct sock *sk = sock->sk;
2626 struct packet_sock *po;
d12d01d6 2627 struct net *net;
f6fb8f10 2628 union tpacket_req_u req_u;
1da177e4
LT
2629
2630 if (!sk)
2631 return 0;
2632
3b1e0a65 2633 net = sock_net(sk);
1da177e4
LT
2634 po = pkt_sk(sk);
2635
0fa7fa98 2636 mutex_lock(&net->packet.sklist_lock);
808f5114 2637 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2638 mutex_unlock(&net->packet.sklist_lock);
2639
2640 preempt_disable();
920de804 2641 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2642 preempt_enable();
1da177e4 2643
808f5114 2644 spin_lock(&po->bind_lock);
ce06b03e 2645 unregister_prot_hook(sk, false);
66e56cd4
DB
2646 packet_cached_dev_reset(po);
2647
160ff18a
BG
2648 if (po->prot_hook.dev) {
2649 dev_put(po->prot_hook.dev);
2650 po->prot_hook.dev = NULL;
2651 }
808f5114 2652 spin_unlock(&po->bind_lock);
1da177e4 2653
1da177e4 2654 packet_flush_mclist(sk);
1da177e4 2655
9665d5d6
PS
2656 if (po->rx_ring.pg_vec) {
2657 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2658 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2659 }
69e3c75f 2660
9665d5d6
PS
2661 if (po->tx_ring.pg_vec) {
2662 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2663 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2664 }
1da177e4 2665
dc99f600
DM
2666 fanout_release(sk);
2667
808f5114 2668 synchronize_net();
1da177e4
LT
2669 /*
2670 * Now the socket is dead. No more input will appear.
2671 */
1da177e4
LT
2672 sock_orphan(sk);
2673 sock->sk = NULL;
2674
2675 /* Purge queues */
2676
2677 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2678 packet_free_pending(po);
17ab56a2 2679 sk_refcnt_debug_release(sk);
1da177e4
LT
2680
2681 sock_put(sk);
2682 return 0;
2683}
2684
2685/*
2686 * Attach a packet hook.
2687 */
2688
902fefb8 2689static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2690{
2691 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2692 const struct net_device *dev_curr;
2693 __be16 proto_curr;
2694 bool need_rehook;
dc99f600 2695
aef950b4
WY
2696 if (po->fanout) {
2697 if (dev)
2698 dev_put(dev);
2699
dc99f600 2700 return -EINVAL;
aef950b4 2701 }
1da177e4
LT
2702
2703 lock_sock(sk);
1da177e4 2704 spin_lock(&po->bind_lock);
66e56cd4 2705
902fefb8
DB
2706 proto_curr = po->prot_hook.type;
2707 dev_curr = po->prot_hook.dev;
2708
2709 need_rehook = proto_curr != proto || dev_curr != dev;
2710
2711 if (need_rehook) {
2712 unregister_prot_hook(sk, true);
1da177e4 2713
902fefb8
DB
2714 po->num = proto;
2715 po->prot_hook.type = proto;
1da177e4 2716
902fefb8
DB
2717 if (po->prot_hook.dev)
2718 dev_put(po->prot_hook.dev);
2719
2720 po->prot_hook.dev = dev;
2721
2722 po->ifindex = dev ? dev->ifindex : 0;
2723 packet_cached_dev_assign(po, dev);
2724 }
66e56cd4 2725
902fefb8 2726 if (proto == 0 || !need_rehook)
1da177e4
LT
2727 goto out_unlock;
2728
be85d4ad 2729 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2730 register_prot_hook(sk);
be85d4ad
UT
2731 } else {
2732 sk->sk_err = ENETDOWN;
2733 if (!sock_flag(sk, SOCK_DEAD))
2734 sk->sk_error_report(sk);
1da177e4
LT
2735 }
2736
2737out_unlock:
2738 spin_unlock(&po->bind_lock);
2739 release_sock(sk);
2740 return 0;
2741}
2742
2743/*
2744 * Bind a packet socket to a device
2745 */
2746
40d4e3df
ED
2747static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2748 int addr_len)
1da177e4 2749{
40d4e3df 2750 struct sock *sk = sock->sk;
1da177e4
LT
2751 char name[15];
2752 struct net_device *dev;
2753 int err = -ENODEV;
1ce4f28b 2754
1da177e4
LT
2755 /*
2756 * Check legality
2757 */
1ce4f28b 2758
8ae55f04 2759 if (addr_len != sizeof(struct sockaddr))
1da177e4 2760 return -EINVAL;
40d4e3df 2761 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2762
3b1e0a65 2763 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2764 if (dev)
1da177e4 2765 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2766 return err;
2767}
1da177e4
LT
2768
2769static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2770{
40d4e3df
ED
2771 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2772 struct sock *sk = sock->sk;
1da177e4
LT
2773 struct net_device *dev = NULL;
2774 int err;
2775
2776
2777 /*
2778 * Check legality
2779 */
1ce4f28b 2780
1da177e4
LT
2781 if (addr_len < sizeof(struct sockaddr_ll))
2782 return -EINVAL;
2783 if (sll->sll_family != AF_PACKET)
2784 return -EINVAL;
2785
2786 if (sll->sll_ifindex) {
2787 err = -ENODEV;
3b1e0a65 2788 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2789 if (dev == NULL)
2790 goto out;
2791 }
2792 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2793
2794out:
2795 return err;
2796}
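
/*
 * Illustrative userspace sketch (not part of this file): packet_bind() above
 * expects a sockaddr_ll; fd is assumed to be an AF_PACKET socket, and "eth0"
 * is an example interface name resolved with if_nametoindex().
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int bind_to_eth0(int fd)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex("eth0");

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}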
2797
2798static struct proto packet_proto = {
2799 .name = "PACKET",
2800 .owner = THIS_MODULE,
2801 .obj_size = sizeof(struct packet_sock),
2802};
2803
2804/*
1ce4f28b 2805 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2806 */
2807
3f378b68
EP
2808static int packet_create(struct net *net, struct socket *sock, int protocol,
2809 int kern)
1da177e4
LT
2810{
2811 struct sock *sk;
2812 struct packet_sock *po;
0e11c91e 2813 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2814 int err;
2815
df008c91 2816 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2817 return -EPERM;
be02097c
DM
2818 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2819 sock->type != SOCK_PACKET)
1da177e4
LT
2820 return -ESOCKTNOSUPPORT;
2821
2822 sock->state = SS_UNCONNECTED;
2823
2824 err = -ENOBUFS;
6257ff21 2825 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2826 if (sk == NULL)
2827 goto out;
2828
2829 sock->ops = &packet_ops;
1da177e4
LT
2830 if (sock->type == SOCK_PACKET)
2831 sock->ops = &packet_ops_spkt;
be02097c 2832
1da177e4
LT
2833 sock_init_data(sock, sk);
2834
2835 po = pkt_sk(sk);
2836 sk->sk_family = PF_PACKET;
0e11c91e 2837 po->num = proto;
d346a3fa 2838 po->xmit = dev_queue_xmit;
66e56cd4 2839
b0138408
DB
2840 err = packet_alloc_pending(po);
2841 if (err)
2842 goto out2;
2843
66e56cd4 2844 packet_cached_dev_reset(po);
1da177e4
LT
2845
2846 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2847 sk_refcnt_debug_inc(sk);
1da177e4
LT
2848
2849 /*
2850 * Attach a protocol block
2851 */
2852
2853 spin_lock_init(&po->bind_lock);
905db440 2854 mutex_init(&po->pg_vec_lock);
1da177e4 2855 po->prot_hook.func = packet_rcv;
be02097c 2856
1da177e4
LT
2857 if (sock->type == SOCK_PACKET)
2858 po->prot_hook.func = packet_rcv_spkt;
be02097c 2859
1da177e4
LT
2860 po->prot_hook.af_packet_priv = sk;
2861
0e11c91e
AV
2862 if (proto) {
2863 po->prot_hook.type = proto;
ce06b03e 2864 register_prot_hook(sk);
1da177e4
LT
2865 }
2866
0fa7fa98 2867 mutex_lock(&net->packet.sklist_lock);
808f5114 2868 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2869 mutex_unlock(&net->packet.sklist_lock);
2870
2871 preempt_disable();
3680453c 2872 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2873 preempt_enable();
808f5114 2874
40d4e3df 2875 return 0;
b0138408
DB
2876out2:
2877 sk_free(sk);
1da177e4
LT
2878out:
2879 return err;
2880}
2881
2882/*
2883 * Pull a packet from our receive queue and hand it to the user.
2884 * If necessary we block.
2885 */
2886
2887static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2888 struct msghdr *msg, size_t len, int flags)
2889{
2890 struct sock *sk = sock->sk;
2891 struct sk_buff *skb;
2892 int copied, err;
bfd5f4a3 2893 int vnet_hdr_len = 0;
1da177e4
LT
2894
2895 err = -EINVAL;
ed85b565 2896 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2897 goto out;
2898
2899#if 0
2900 /* What error should we return now? EUNATTACH? */
2901 if (pkt_sk(sk)->ifindex < 0)
2902 return -ENODEV;
2903#endif
2904
ed85b565 2905 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2906 err = sock_recv_errqueue(sk, msg, len,
2907 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2908 goto out;
2909 }
2910
1da177e4
LT
2911 /*
2912 * Call the generic datagram receiver. This handles all sorts
2913 * of horrible races and re-entrancy so we can forget about it
2914 * in the protocol layers.
2915 *
2916 * Now it will return ENETDOWN if the device has just gone down,
2917 * but then it will block.
2918 */
2919
40d4e3df 2920 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2921
2922 /*
1ce4f28b 2923 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2924 * handles the blocking, we don't need to see or worry about
2925 * blocking retries.
2926 */
2927
8ae55f04 2928 if (skb == NULL)
1da177e4
LT
2929 goto out;
2930
bfd5f4a3
SS
2931 if (pkt_sk(sk)->has_vnet_hdr) {
2932 struct virtio_net_hdr vnet_hdr = { 0 };
2933
2934 err = -EINVAL;
2935 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2936 if (len < vnet_hdr_len)
bfd5f4a3
SS
2937 goto out_free;
2938
1f18b717
MK
2939 len -= vnet_hdr_len;
2940
bfd5f4a3
SS
2941 if (skb_is_gso(skb)) {
2942 struct skb_shared_info *sinfo = skb_shinfo(skb);
2943
2944 /* This is a hint as to how much should be linear. */
dc9e5153
MT
2945 vnet_hdr.hdr_len =
2946 __cpu_to_virtio16(false, skb_headlen(skb));
2947 vnet_hdr.gso_size =
2948 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
2949 if (sinfo->gso_type & SKB_GSO_TCPV4)
2950 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2951 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2952 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2953 else if (sinfo->gso_type & SKB_GSO_UDP)
2954 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2955 else if (sinfo->gso_type & SKB_GSO_FCOE)
2956 goto out_free;
2957 else
2958 BUG();
2959 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2960 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2961 } else
2962 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2963
2964 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2965 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
2966 vnet_hdr.csum_start = __cpu_to_virtio16(false,
2967 skb_checksum_start_offset(skb));
2968 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
2969 skb->csum_offset);
10a8d94a
JW
2970 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2971 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2972 } /* else everything is zero */
2973
7eab8d9e 2974 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
2975 if (err < 0)
2976 goto out_free;
2977 }
2978
f3d33426
HFS
2979 /* You lose any data beyond the buffer you gave. If this worries
2980 * a user program, it can ask the device for its MTU
2981 * anyway.
1da177e4 2982 */
1da177e4 2983 copied = skb->len;
40d4e3df
ED
2984 if (copied > len) {
2985 copied = len;
2986 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2987 }
2988
51f3d02b 2989 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
2990 if (err)
2991 goto out_free;
2992
3b885787 2993 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2994
f3d33426
HFS
2995 if (msg->msg_name) {
2996 /* If the address length field is there to be filled
2997 * in, we fill it in now.
2998 */
2999 if (sock->type == SOCK_PACKET) {
342dfc30 3000 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3001 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3002 } else {
3003 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3004 msg->msg_namelen = sll->sll_halen +
3005 offsetof(struct sockaddr_ll, sll_addr);
3006 }
ffbc6111
HX
3007 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3008 msg->msg_namelen);
f3d33426 3009 }
1da177e4 3010
8dc41944 3011 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3012 struct tpacket_auxdata aux;
3013
3014 aux.tp_status = TP_STATUS_USER;
3015 if (skb->ip_summed == CHECKSUM_PARTIAL)
3016 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3017 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
3018 aux.tp_snaplen = skb->len;
3019 aux.tp_mac = 0;
bbe735e4 3020 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3021 if (skb_vlan_tag_present(skb)) {
3022 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3023 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3024 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3025 } else {
3026 aux.tp_vlan_tci = 0;
a0cdfcf3 3027 aux.tp_vlan_tpid = 0;
a3bcc23e 3028 }
ffbc6111 3029 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3030 }
3031
1da177e4
LT
3032 /*
3033 * Free or return the buffer as appropriate. Again this
3034 * hides all the races and re-entrancy issues from us.
3035 */
bfd5f4a3 3036 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3037
3038out_free:
3039 skb_free_datagram(sk, skb);
3040out:
3041 return err;
3042}
3043
1da177e4
LT
3044static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3045 int *uaddr_len, int peer)
3046{
3047 struct net_device *dev;
3048 struct sock *sk = sock->sk;
3049
3050 if (peer)
3051 return -EOPNOTSUPP;
3052
3053 uaddr->sa_family = AF_PACKET;
2dc85bf3 3054 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3055 rcu_read_lock();
3056 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3057 if (dev)
2dc85bf3 3058 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3059 rcu_read_unlock();
1da177e4
LT
3060 *uaddr_len = sizeof(*uaddr);
3061
3062 return 0;
3063}
1da177e4
LT
3064
3065static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3066 int *uaddr_len, int peer)
3067{
3068 struct net_device *dev;
3069 struct sock *sk = sock->sk;
3070 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3071 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3072
3073 if (peer)
3074 return -EOPNOTSUPP;
3075
3076 sll->sll_family = AF_PACKET;
3077 sll->sll_ifindex = po->ifindex;
3078 sll->sll_protocol = po->num;
67286640 3079 sll->sll_pkttype = 0;
654d1f8a
ED
3080 rcu_read_lock();
3081 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3082 if (dev) {
3083 sll->sll_hatype = dev->type;
3084 sll->sll_halen = dev->addr_len;
3085 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3086 } else {
3087 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3088 sll->sll_halen = 0;
3089 }
654d1f8a 3090 rcu_read_unlock();
0fb375fb 3091 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3092
3093 return 0;
3094}
3095
2aeb0b88
WC
3096static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3097 int what)
1da177e4
LT
3098{
3099 switch (i->type) {
3100 case PACKET_MR_MULTICAST:
1162563f
JP
3101 if (i->alen != dev->addr_len)
3102 return -EINVAL;
1da177e4 3103 if (what > 0)
22bedad3 3104 return dev_mc_add(dev, i->addr);
1da177e4 3105 else
22bedad3 3106 return dev_mc_del(dev, i->addr);
1da177e4
LT
3107 break;
3108 case PACKET_MR_PROMISC:
2aeb0b88 3109 return dev_set_promiscuity(dev, what);
1da177e4 3110 case PACKET_MR_ALLMULTI:
2aeb0b88 3111 return dev_set_allmulti(dev, what);
d95ed927 3112 case PACKET_MR_UNICAST:
1162563f
JP
3113 if (i->alen != dev->addr_len)
3114 return -EINVAL;
d95ed927 3115 if (what > 0)
a748ee24 3116 return dev_uc_add(dev, i->addr);
d95ed927 3117 else
a748ee24 3118 return dev_uc_del(dev, i->addr);
d95ed927 3119 break;
40d4e3df
ED
3120 default:
3121 break;
1da177e4 3122 }
2aeb0b88 3123 return 0;
1da177e4
LT
3124}
3125
3126static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
3127{
40d4e3df 3128 for ( ; i; i = i->next) {
1da177e4
LT
3129 if (i->ifindex == dev->ifindex)
3130 packet_dev_mc(dev, i, what);
3131 }
3132}
3133
0fb375fb 3134static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3135{
3136 struct packet_sock *po = pkt_sk(sk);
3137 struct packet_mclist *ml, *i;
3138 struct net_device *dev;
3139 int err;
3140
3141 rtnl_lock();
3142
3143 err = -ENODEV;
3b1e0a65 3144 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3145 if (!dev)
3146 goto done;
3147
3148 err = -EINVAL;
1162563f 3149 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3150 goto done;
3151
3152 err = -ENOBUFS;
8b3a7005 3153 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3154 if (i == NULL)
3155 goto done;
3156
3157 err = 0;
3158 for (ml = po->mclist; ml; ml = ml->next) {
3159 if (ml->ifindex == mreq->mr_ifindex &&
3160 ml->type == mreq->mr_type &&
3161 ml->alen == mreq->mr_alen &&
3162 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3163 ml->count++;
3164 /* Free the new element ... */
3165 kfree(i);
3166 goto done;
3167 }
3168 }
3169
3170 i->type = mreq->mr_type;
3171 i->ifindex = mreq->mr_ifindex;
3172 i->alen = mreq->mr_alen;
3173 memcpy(i->addr, mreq->mr_address, i->alen);
3174 i->count = 1;
3175 i->next = po->mclist;
3176 po->mclist = i;
2aeb0b88
WC
3177 err = packet_dev_mc(dev, i, 1);
3178 if (err) {
3179 po->mclist = i->next;
3180 kfree(i);
3181 }
1da177e4
LT
3182
3183done:
3184 rtnl_unlock();
3185 return err;
3186}
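
/*
 * Illustrative userspace sketch (not part of this file): packet_mc_add()
 * above backs PACKET_ADD_MEMBERSHIP. PACKET_MR_PROMISC needs no hardware
 * address, only the interface index; fd is assumed to be a packet socket.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}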
3187
0fb375fb 3188static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3189{
3190 struct packet_mclist *ml, **mlp;
3191
3192 rtnl_lock();
3193
3194 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3195 if (ml->ifindex == mreq->mr_ifindex &&
3196 ml->type == mreq->mr_type &&
3197 ml->alen == mreq->mr_alen &&
3198 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3199 if (--ml->count == 0) {
3200 struct net_device *dev;
3201 *mlp = ml->next;
ad959e76
ED
3202 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3203 if (dev)
1da177e4 3204 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3205 kfree(ml);
3206 }
3207 rtnl_unlock();
3208 return 0;
3209 }
3210 }
3211 rtnl_unlock();
3212 return -EADDRNOTAVAIL;
3213}
3214
3215static void packet_flush_mclist(struct sock *sk)
3216{
3217 struct packet_sock *po = pkt_sk(sk);
3218 struct packet_mclist *ml;
3219
3220 if (!po->mclist)
3221 return;
3222
3223 rtnl_lock();
3224 while ((ml = po->mclist) != NULL) {
3225 struct net_device *dev;
3226
3227 po->mclist = ml->next;
ad959e76
ED
3228 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3229 if (dev != NULL)
1da177e4 3230 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3231 kfree(ml);
3232 }
3233 rtnl_unlock();
3234}
1da177e4
LT
3235
3236static int
b7058842 3237packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3238{
3239 struct sock *sk = sock->sk;
8dc41944 3240 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3241 int ret;
3242
3243 if (level != SOL_PACKET)
3244 return -ENOPROTOOPT;
3245
69e3c75f 3246 switch (optname) {
1ce4f28b 3247 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3248 case PACKET_DROP_MEMBERSHIP:
3249 {
0fb375fb
EB
3250 struct packet_mreq_max mreq;
3251 int len = optlen;
3252 memset(&mreq, 0, sizeof(mreq));
3253 if (len < sizeof(struct packet_mreq))
1da177e4 3254 return -EINVAL;
0fb375fb
EB
3255 if (len > sizeof(mreq))
3256 len = sizeof(mreq);
40d4e3df 3257 if (copy_from_user(&mreq, optval, len))
1da177e4 3258 return -EFAULT;
0fb375fb
EB
3259 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3260 return -EINVAL;
1da177e4
LT
3261 if (optname == PACKET_ADD_MEMBERSHIP)
3262 ret = packet_mc_add(sk, &mreq);
3263 else
3264 ret = packet_mc_drop(sk, &mreq);
3265 return ret;
3266 }
a2efcfa0 3267
1da177e4 3268 case PACKET_RX_RING:
69e3c75f 3269 case PACKET_TX_RING:
1da177e4 3270 {
f6fb8f10 3271 union tpacket_req_u req_u;
3272 int len;
1da177e4 3273
f6fb8f10 3274 switch (po->tp_version) {
3275 case TPACKET_V1:
3276 case TPACKET_V2:
3277 len = sizeof(req_u.req);
3278 break;
3279 case TPACKET_V3:
3280 default:
3281 len = sizeof(req_u.req3);
3282 break;
3283 }
3284 if (optlen < len)
1da177e4 3285 return -EINVAL;
bfd5f4a3
SS
3286 if (pkt_sk(sk)->has_vnet_hdr)
3287 return -EINVAL;
f6fb8f10 3288 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3289 return -EFAULT;
f6fb8f10 3290 return packet_set_ring(sk, &req_u, 0,
3291 optname == PACKET_TX_RING);
1da177e4
LT
3292 }
3293 case PACKET_COPY_THRESH:
3294 {
3295 int val;
3296
40d4e3df 3297 if (optlen != sizeof(val))
1da177e4 3298 return -EINVAL;
40d4e3df 3299 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3300 return -EFAULT;
3301
3302 pkt_sk(sk)->copy_thresh = val;
3303 return 0;
3304 }
bbd6ef87
PM
3305 case PACKET_VERSION:
3306 {
3307 int val;
3308
3309 if (optlen != sizeof(val))
3310 return -EINVAL;
69e3c75f 3311 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3312 return -EBUSY;
3313 if (copy_from_user(&val, optval, sizeof(val)))
3314 return -EFAULT;
3315 switch (val) {
3316 case TPACKET_V1:
3317 case TPACKET_V2:
f6fb8f10 3318 case TPACKET_V3:
bbd6ef87
PM
3319 po->tp_version = val;
3320 return 0;
3321 default:
3322 return -EINVAL;
3323 }
3324 }
8913336a
PM
3325 case PACKET_RESERVE:
3326 {
3327 unsigned int val;
3328
3329 if (optlen != sizeof(val))
3330 return -EINVAL;
69e3c75f 3331 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3332 return -EBUSY;
3333 if (copy_from_user(&val, optval, sizeof(val)))
3334 return -EFAULT;
3335 po->tp_reserve = val;
3336 return 0;
3337 }
69e3c75f
JB
3338 case PACKET_LOSS:
3339 {
3340 unsigned int val;
3341
3342 if (optlen != sizeof(val))
3343 return -EINVAL;
3344 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3345 return -EBUSY;
3346 if (copy_from_user(&val, optval, sizeof(val)))
3347 return -EFAULT;
3348 po->tp_loss = !!val;
3349 return 0;
3350 }
8dc41944
HX
3351 case PACKET_AUXDATA:
3352 {
3353 int val;
3354
3355 if (optlen < sizeof(val))
3356 return -EINVAL;
3357 if (copy_from_user(&val, optval, sizeof(val)))
3358 return -EFAULT;
3359
3360 po->auxdata = !!val;
3361 return 0;
3362 }
80feaacb
PWJ
3363 case PACKET_ORIGDEV:
3364 {
3365 int val;
3366
3367 if (optlen < sizeof(val))
3368 return -EINVAL;
3369 if (copy_from_user(&val, optval, sizeof(val)))
3370 return -EFAULT;
3371
3372 po->origdev = !!val;
3373 return 0;
3374 }
bfd5f4a3
SS
3375 case PACKET_VNET_HDR:
3376 {
3377 int val;
3378
3379 if (sock->type != SOCK_RAW)
3380 return -EINVAL;
3381 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3382 return -EBUSY;
3383 if (optlen < sizeof(val))
3384 return -EINVAL;
3385 if (copy_from_user(&val, optval, sizeof(val)))
3386 return -EFAULT;
3387
3388 po->has_vnet_hdr = !!val;
3389 return 0;
3390 }
614f60fa
SM
3391 case PACKET_TIMESTAMP:
3392 {
3393 int val;
3394
3395 if (optlen != sizeof(val))
3396 return -EINVAL;
3397 if (copy_from_user(&val, optval, sizeof(val)))
3398 return -EFAULT;
3399
3400 po->tp_tstamp = val;
3401 return 0;
3402 }
dc99f600
DM
3403 case PACKET_FANOUT:
3404 {
3405 int val;
3406
3407 if (optlen != sizeof(val))
3408 return -EINVAL;
3409 if (copy_from_user(&val, optval, sizeof(val)))
3410 return -EFAULT;
3411
3412 return fanout_add(sk, val & 0xffff, val >> 16);
3413 }
5920cd3a
PC
3414 case PACKET_TX_HAS_OFF:
3415 {
3416 unsigned int val;
3417
3418 if (optlen != sizeof(val))
3419 return -EINVAL;
3420 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3421 return -EBUSY;
3422 if (copy_from_user(&val, optval, sizeof(val)))
3423 return -EFAULT;
3424 po->tp_tx_has_off = !!val;
3425 return 0;
3426 }
d346a3fa
DB
3427 case PACKET_QDISC_BYPASS:
3428 {
3429 int val;
3430
3431 if (optlen != sizeof(val))
3432 return -EINVAL;
3433 if (copy_from_user(&val, optval, sizeof(val)))
3434 return -EFAULT;
3435
3436 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3437 return 0;
3438 }
1da177e4
LT
3439 default:
3440 return -ENOPROTOOPT;
3441 }
3442}
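
/*
 * Illustrative userspace sketch (not part of this file): PACKET_FANOUT, as
 * parsed in packet_setsockopt() above, packs the group id in the low 16 bits
 * and the mode in the upper bits. The group id 42 is arbitrary, and the
 * socket is assumed to be bound already.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int join_fanout_group(int fd)
{
	int arg = 42 | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}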
3443
3444static int packet_getsockopt(struct socket *sock, int level, int optname,
3445 char __user *optval, int __user *optlen)
3446{
3447 int len;
c06fff6e 3448 int val, lv = sizeof(val);
1da177e4
LT
3449 struct sock *sk = sock->sk;
3450 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3451 void *data = &val;
ee80fbf3 3452 union tpacket_stats_u st;
1da177e4
LT
3453
3454 if (level != SOL_PACKET)
3455 return -ENOPROTOOPT;
3456
8ae55f04
KK
3457 if (get_user(len, optlen))
3458 return -EFAULT;
1da177e4
LT
3459
3460 if (len < 0)
3461 return -EINVAL;
1ce4f28b 3462
69e3c75f 3463 switch (optname) {
1da177e4 3464 case PACKET_STATISTICS:
1da177e4 3465 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3466 memcpy(&st, &po->stats, sizeof(st));
3467 memset(&po->stats, 0, sizeof(po->stats));
3468 spin_unlock_bh(&sk->sk_receive_queue.lock);
3469
f6fb8f10 3470 if (po->tp_version == TPACKET_V3) {
c06fff6e 3471 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3472 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3473 data = &st.stats3;
f6fb8f10 3474 } else {
c06fff6e 3475 lv = sizeof(struct tpacket_stats);
8bcdeaff 3476 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3477 data = &st.stats1;
f6fb8f10 3478 }
ee80fbf3 3479
8dc41944
HX
3480 break;
3481 case PACKET_AUXDATA:
8dc41944 3482 val = po->auxdata;
80feaacb
PWJ
3483 break;
3484 case PACKET_ORIGDEV:
80feaacb 3485 val = po->origdev;
bfd5f4a3
SS
3486 break;
3487 case PACKET_VNET_HDR:
bfd5f4a3 3488 val = po->has_vnet_hdr;
1da177e4 3489 break;
bbd6ef87 3490 case PACKET_VERSION:
bbd6ef87 3491 val = po->tp_version;
bbd6ef87
PM
3492 break;
3493 case PACKET_HDRLEN:
3494 if (len > sizeof(int))
3495 len = sizeof(int);
3496 if (copy_from_user(&val, optval, len))
3497 return -EFAULT;
3498 switch (val) {
3499 case TPACKET_V1:
3500 val = sizeof(struct tpacket_hdr);
3501 break;
3502 case TPACKET_V2:
3503 val = sizeof(struct tpacket2_hdr);
3504 break;
f6fb8f10 3505 case TPACKET_V3:
3506 val = sizeof(struct tpacket3_hdr);
3507 break;
bbd6ef87
PM
3508 default:
3509 return -EINVAL;
3510 }
bbd6ef87 3511 break;
8913336a 3512 case PACKET_RESERVE:
8913336a 3513 val = po->tp_reserve;
8913336a 3514 break;
69e3c75f 3515 case PACKET_LOSS:
69e3c75f 3516 val = po->tp_loss;
69e3c75f 3517 break;
614f60fa 3518 case PACKET_TIMESTAMP:
614f60fa 3519 val = po->tp_tstamp;
614f60fa 3520 break;
dc99f600 3521 case PACKET_FANOUT:
dc99f600
DM
3522 val = (po->fanout ?
3523 ((u32)po->fanout->id |
77f65ebd
WB
3524 ((u32)po->fanout->type << 16) |
3525 ((u32)po->fanout->flags << 24)) :
dc99f600 3526 0);
dc99f600 3527 break;
5920cd3a
PC
3528 case PACKET_TX_HAS_OFF:
3529 val = po->tp_tx_has_off;
3530 break;
d346a3fa
DB
3531 case PACKET_QDISC_BYPASS:
3532 val = packet_use_direct_xmit(po);
3533 break;
1da177e4
LT
3534 default:
3535 return -ENOPROTOOPT;
3536 }
3537
c06fff6e
ED
3538 if (len > lv)
3539 len = lv;
8ae55f04
KK
3540 if (put_user(len, optlen))
3541 return -EFAULT;
8dc41944
HX
3542 if (copy_to_user(optval, data, len))
3543 return -EFAULT;
8ae55f04 3544 return 0;
1da177e4
LT
3545}
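
/*
 * Illustrative userspace sketch (not part of this file): PACKET_STATISTICS,
 * handled in packet_getsockopt() above, returns and clears the counters
 * bumped in packet_rcv()/tpacket_rcv(). struct tpacket_stats matches a
 * TPACKET_V1/V2 socket; V3 sockets get struct tpacket_stats_v3 instead.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}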
3546
3547
351638e7
JP
3548static int packet_notifier(struct notifier_block *this,
3549 unsigned long msg, void *ptr)
1da177e4
LT
3550{
3551 struct sock *sk;
351638e7 3552 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3553 struct net *net = dev_net(dev);
1da177e4 3554
808f5114 3555 rcu_read_lock();
b67bfe0d 3556 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3557 struct packet_sock *po = pkt_sk(sk);
3558
3559 switch (msg) {
3560 case NETDEV_UNREGISTER:
1da177e4
LT
3561 if (po->mclist)
3562 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3563 /* fallthrough */
3564
1da177e4
LT
3565 case NETDEV_DOWN:
3566 if (dev->ifindex == po->ifindex) {
3567 spin_lock(&po->bind_lock);
3568 if (po->running) {
ce06b03e 3569 __unregister_prot_hook(sk, false);
1da177e4
LT
3570 sk->sk_err = ENETDOWN;
3571 if (!sock_flag(sk, SOCK_DEAD))
3572 sk->sk_error_report(sk);
3573 }
3574 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3575 packet_cached_dev_reset(po);
1da177e4 3576 po->ifindex = -1;
160ff18a
BG
3577 if (po->prot_hook.dev)
3578 dev_put(po->prot_hook.dev);
1da177e4
LT
3579 po->prot_hook.dev = NULL;
3580 }
3581 spin_unlock(&po->bind_lock);
3582 }
3583 break;
3584 case NETDEV_UP:
808f5114 3585 if (dev->ifindex == po->ifindex) {
3586 spin_lock(&po->bind_lock);
ce06b03e
DM
3587 if (po->num)
3588 register_prot_hook(sk);
808f5114 3589 spin_unlock(&po->bind_lock);
1da177e4 3590 }
1da177e4
LT
3591 break;
3592 }
3593 }
808f5114 3594 rcu_read_unlock();
1da177e4
LT
3595 return NOTIFY_DONE;
3596}
3597
3598
3599static int packet_ioctl(struct socket *sock, unsigned int cmd,
3600 unsigned long arg)
3601{
3602 struct sock *sk = sock->sk;
3603
69e3c75f 3604 switch (cmd) {
40d4e3df
ED
3605 case SIOCOUTQ:
3606 {
3607 int amount = sk_wmem_alloc_get(sk);
31e6d363 3608
40d4e3df
ED
3609 return put_user(amount, (int __user *)arg);
3610 }
3611 case SIOCINQ:
3612 {
3613 struct sk_buff *skb;
3614 int amount = 0;
3615
3616 spin_lock_bh(&sk->sk_receive_queue.lock);
3617 skb = skb_peek(&sk->sk_receive_queue);
3618 if (skb)
3619 amount = skb->len;
3620 spin_unlock_bh(&sk->sk_receive_queue.lock);
3621 return put_user(amount, (int __user *)arg);
3622 }
3623 case SIOCGSTAMP:
3624 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3625 case SIOCGSTAMPNS:
3626 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3627
1da177e4 3628#ifdef CONFIG_INET
40d4e3df
ED
3629 case SIOCADDRT:
3630 case SIOCDELRT:
3631 case SIOCDARP:
3632 case SIOCGARP:
3633 case SIOCSARP:
3634 case SIOCGIFADDR:
3635 case SIOCSIFADDR:
3636 case SIOCGIFBRDADDR:
3637 case SIOCSIFBRDADDR:
3638 case SIOCGIFNETMASK:
3639 case SIOCSIFNETMASK:
3640 case SIOCGIFDSTADDR:
3641 case SIOCSIFDSTADDR:
3642 case SIOCSIFFLAGS:
40d4e3df 3643 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3644#endif
3645
40d4e3df
ED
3646 default:
3647 return -ENOIOCTLCMD;
1da177e4
LT
3648 }
3649 return 0;
3650}
3651
40d4e3df 3652static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3653 poll_table *wait)
3654{
3655 struct sock *sk = sock->sk;
3656 struct packet_sock *po = pkt_sk(sk);
3657 unsigned int mask = datagram_poll(file, sock, wait);
3658
3659 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3660 if (po->rx_ring.pg_vec) {
f6fb8f10 3661 if (!packet_previous_rx_frame(po, &po->rx_ring,
3662 TP_STATUS_KERNEL))
1da177e4
LT
3663 mask |= POLLIN | POLLRDNORM;
3664 }
3665 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3666 spin_lock_bh(&sk->sk_write_queue.lock);
3667 if (po->tx_ring.pg_vec) {
3668 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3669 mask |= POLLOUT | POLLWRNORM;
3670 }
3671 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3672 return mask;
3673}
3674
3675
3676 /* Dirty? Well, I still have not learned a better way to account
3677 * for user mmaps.
3678 */
3679
3680static void packet_mm_open(struct vm_area_struct *vma)
3681{
3682 struct file *file = vma->vm_file;
40d4e3df 3683 struct socket *sock = file->private_data;
1da177e4 3684 struct sock *sk = sock->sk;
1ce4f28b 3685
1da177e4
LT
3686 if (sk)
3687 atomic_inc(&pkt_sk(sk)->mapped);
3688}
3689
3690static void packet_mm_close(struct vm_area_struct *vma)
3691{
3692 struct file *file = vma->vm_file;
40d4e3df 3693 struct socket *sock = file->private_data;
1da177e4 3694 struct sock *sk = sock->sk;
1ce4f28b 3695
1da177e4
LT
3696 if (sk)
3697 atomic_dec(&pkt_sk(sk)->mapped);
3698}
3699
f0f37e2f 3700static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3701 .open = packet_mm_open,
3702 .close = packet_mm_close,
1da177e4
LT
3703};
3704
0e3125c7
NH
3705static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3706 unsigned int len)
1da177e4
LT
3707{
3708 int i;
3709
4ebf0ae2 3710 for (i = 0; i < len; i++) {
0e3125c7 3711 if (likely(pg_vec[i].buffer)) {
c56b4d90 3712 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3713 vfree(pg_vec[i].buffer);
3714 else
3715 free_pages((unsigned long)pg_vec[i].buffer,
3716 order);
3717 pg_vec[i].buffer = NULL;
3718 }
1da177e4
LT
3719 }
3720 kfree(pg_vec);
3721}
3722
eea49cc9 3723static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3724{
f0d4eb29 3725 char *buffer;
0e3125c7
NH
3726 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3727 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3728
3729 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3730 if (buffer)
3731 return buffer;
3732
f0d4eb29 3733 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3734 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3735 if (buffer)
3736 return buffer;
3737
f0d4eb29 3738 /* vmalloc failed, let's dig into swap here */
0e3125c7 3739 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3740 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3741 if (buffer)
3742 return buffer;
3743
f0d4eb29 3744 /* complete and utter failure */
0e3125c7 3745 return NULL;
4ebf0ae2
DM
3746}
3747
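/* Allocate the buffer array for a ring: one pg_vec entry per requested
 * block.  If any per-block allocation fails, everything allocated so far
 * is released via free_pg_vec() and NULL is returned.
 */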
0e3125c7 3748static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3749{
3750 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3751 struct pgv *pg_vec;
4ebf0ae2
DM
3752 int i;
3753
0e3125c7 3754 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3755 if (unlikely(!pg_vec))
3756 goto out;
3757
3758 for (i = 0; i < block_nr; i++) {
c56b4d90 3759 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3760 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3761 goto out_free_pgvec;
3762 }
3763
3764out:
3765 return pg_vec;
3766
3767out_free_pgvec:
3768 free_pg_vec(pg_vec, order, block_nr);
3769 pg_vec = NULL;
3770 goto out;
3771}
1da177e4 3772
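/* Set up or tear down an RX/TX ring (PACKET_RX_RING / PACKET_TX_RING).
 * The request geometry is validated (block size page-aligned, frame size
 * aligned and large enough, frame count consistent with the block layout),
 * a new page vector is allocated, the protocol hook is temporarily
 * unregistered, and the old and new rings are swapped under pg_vec_lock.
 * A request with zero blocks (and zero frames) releases an existing ring.
 */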
f6fb8f10 3773static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3774 int closing, int tx_ring)
1da177e4 3775{
0e3125c7 3776 struct pgv *pg_vec = NULL;
1da177e4 3777 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3778 int was_running, order = 0;
69e3c75f
JB
3779 struct packet_ring_buffer *rb;
3780 struct sk_buff_head *rb_queue;
0e11c91e 3781 __be16 num;
f6fb8f10 3782 int err = -EINVAL;
3783 /* Alias req_u->req to keep code churn minimal */
3784 struct tpacket_req *req = &req_u->req;
3785
3786 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3787 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3788 WARN(1, "Tx-ring is not supported.\n");
3789 goto out;
3790 }
1ce4f28b 3791
69e3c75f
JB
3792 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3793 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3794
69e3c75f
JB
3795 err = -EBUSY;
3796 if (!closing) {
3797 if (atomic_read(&po->mapped))
3798 goto out;
b0138408 3799 if (packet_read_pending(rb))
69e3c75f
JB
3800 goto out;
3801 }
1da177e4 3802
69e3c75f
JB
3803 if (req->tp_block_nr) {
3804 /* Sanity tests and some calculations */
3805 err = -EBUSY;
3806 if (unlikely(rb->pg_vec))
3807 goto out;
1da177e4 3808
bbd6ef87
PM
3809 switch (po->tp_version) {
3810 case TPACKET_V1:
3811 po->tp_hdrlen = TPACKET_HDRLEN;
3812 break;
3813 case TPACKET_V2:
3814 po->tp_hdrlen = TPACKET2_HDRLEN;
3815 break;
f6fb8f10 3816 case TPACKET_V3:
3817 po->tp_hdrlen = TPACKET3_HDRLEN;
3818 break;
bbd6ef87
PM
3819 }
3820
69e3c75f 3821 err = -EINVAL;
4ebf0ae2 3822 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3823 goto out;
4ebf0ae2 3824 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3825 goto out;
dc808110
ED
3826 if (po->tp_version >= TPACKET_V3 &&
3827 (int)(req->tp_block_size -
3828 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3829 goto out;
8913336a 3830 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3831 po->tp_reserve))
3832 goto out;
4ebf0ae2 3833 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3834 goto out;
1da177e4 3835
69e3c75f
JB
3836 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3837 if (unlikely(rb->frames_per_block <= 0))
3838 goto out;
3839 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3840 req->tp_frame_nr))
3841 goto out;
1da177e4
LT
3842
3843 err = -ENOMEM;
4ebf0ae2
DM
3844 order = get_order(req->tp_block_size);
3845 pg_vec = alloc_pg_vec(req, order);
3846 if (unlikely(!pg_vec))
1da177e4 3847 goto out;
f6fb8f10 3848 switch (po->tp_version) {
3849 case TPACKET_V3:
3850 /* The transmit path is not supported for TPACKET_V3; we
3851 * checked this above, but be paranoid.
3852 */
3853 if (!tx_ring)
3854 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
d7cf0c34 3855 break;
f6fb8f10 3856 default:
3857 break;
3858 }
69e3c75f
JB
3859 }
3860 /* Done */
3861 else {
3862 err = -EINVAL;
4ebf0ae2 3863 if (unlikely(req->tp_frame_nr))
69e3c75f 3864 goto out;
1da177e4
LT
3865 }
3866
3867 lock_sock(sk);
3868
3869 /* Detach socket from network */
3870 spin_lock(&po->bind_lock);
3871 was_running = po->running;
3872 num = po->num;
3873 if (was_running) {
1da177e4 3874 po->num = 0;
ce06b03e 3875 __unregister_prot_hook(sk, false);
1da177e4
LT
3876 }
3877 spin_unlock(&po->bind_lock);
1ce4f28b 3878
1da177e4
LT
3879 synchronize_net();
3880
3881 err = -EBUSY;
905db440 3882 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3883 if (closing || atomic_read(&po->mapped) == 0) {
3884 err = 0;
69e3c75f 3885 spin_lock_bh(&rb_queue->lock);
c053fd96 3886 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3887 rb->frame_max = (req->tp_frame_nr - 1);
3888 rb->head = 0;
3889 rb->frame_size = req->tp_frame_size;
3890 spin_unlock_bh(&rb_queue->lock);
3891
c053fd96
CG
3892 swap(rb->pg_vec_order, order);
3893 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3894
3895 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3896 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3897 tpacket_rcv : packet_rcv;
3898 skb_queue_purge(rb_queue);
1da177e4 3899 if (atomic_read(&po->mapped))
40d4e3df
ED
3900 pr_err("packet_mmap: vma is busy: %d\n",
3901 atomic_read(&po->mapped));
1da177e4 3902 }
905db440 3903 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3904
3905 spin_lock(&po->bind_lock);
ce06b03e 3906 if (was_running) {
1da177e4 3907 po->num = num;
ce06b03e 3908 register_prot_hook(sk);
1da177e4
LT
3909 }
3910 spin_unlock(&po->bind_lock);
f6fb8f10 3911 if (closing && (po->tp_version > TPACKET_V2)) {
3912 /* Block-based V3 is not supported on the TX ring */
3913 if (!tx_ring)
3914 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3915 }
1da177e4
LT
3916 release_sock(sk);
3917
1da177e4
LT
3918 if (pg_vec)
3919 free_pg_vec(pg_vec, order, req->tp_block_nr);
3920out:
3921 return err;
3922}
3923
69e3c75f
JB
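/* mmap() handler: map the RX ring followed by the TX ring into a single
 * contiguous VMA, page by page via vm_insert_page().  vm_pgoff must be
 * zero and the requested length must equal the combined size of both
 * rings.
 */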
3924static int packet_mmap(struct file *file, struct socket *sock,
3925 struct vm_area_struct *vma)
1da177e4
LT
3926{
3927 struct sock *sk = sock->sk;
3928 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3929 unsigned long size, expected_size;
3930 struct packet_ring_buffer *rb;
1da177e4
LT
3931 unsigned long start;
3932 int err = -EINVAL;
3933 int i;
3934
3935 if (vma->vm_pgoff)
3936 return -EINVAL;
3937
905db440 3938 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3939
3940 expected_size = 0;
3941 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3942 if (rb->pg_vec) {
3943 expected_size += rb->pg_vec_len
3944 * rb->pg_vec_pages
3945 * PAGE_SIZE;
3946 }
3947 }
3948
3949 if (expected_size == 0)
1da177e4 3950 goto out;
69e3c75f
JB
3951
3952 size = vma->vm_end - vma->vm_start;
3953 if (size != expected_size)
1da177e4
LT
3954 goto out;
3955
1da177e4 3956 start = vma->vm_start;
69e3c75f
JB
3957 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3958 if (rb->pg_vec == NULL)
3959 continue;
3960
3961 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3962 struct page *page;
3963 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3964 int pg_num;
3965
c56b4d90
CG
3966 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3967 page = pgv_to_page(kaddr);
69e3c75f
JB
3968 err = vm_insert_page(vma, start, page);
3969 if (unlikely(err))
3970 goto out;
3971 start += PAGE_SIZE;
0e3125c7 3972 kaddr += PAGE_SIZE;
69e3c75f 3973 }
4ebf0ae2 3974 }
1da177e4 3975 }
69e3c75f 3976
4ebf0ae2 3977 atomic_inc(&po->mapped);
1da177e4
LT
3978 vma->vm_ops = &packet_mmap_ops;
3979 err = 0;
3980
3981out:
905db440 3982 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3983 return err;
3984}
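/* For reference, a minimal user-space sketch of the RX ring setup that
 * exercises packet_set_ring() and packet_mmap() above (values are
 * illustrative only; error handling omitted):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,	(multiple of PAGE_SIZE)
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,	(TPACKET_ALIGNMENT aligned)
 *		.tp_frame_nr   = 128,	(frames_per_block * tp_block_nr)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */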
1da177e4 3985
90ddc4f0 3986static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3987 .family = PF_PACKET,
3988 .owner = THIS_MODULE,
3989 .release = packet_release,
3990 .bind = packet_bind_spkt,
3991 .connect = sock_no_connect,
3992 .socketpair = sock_no_socketpair,
3993 .accept = sock_no_accept,
3994 .getname = packet_getname_spkt,
3995 .poll = datagram_poll,
3996 .ioctl = packet_ioctl,
3997 .listen = sock_no_listen,
3998 .shutdown = sock_no_shutdown,
3999 .setsockopt = sock_no_setsockopt,
4000 .getsockopt = sock_no_getsockopt,
4001 .sendmsg = packet_sendmsg_spkt,
4002 .recvmsg = packet_recvmsg,
4003 .mmap = sock_no_mmap,
4004 .sendpage = sock_no_sendpage,
4005};
1da177e4 4006
90ddc4f0 4007static const struct proto_ops packet_ops = {
1da177e4
LT
4008 .family = PF_PACKET,
4009 .owner = THIS_MODULE,
4010 .release = packet_release,
4011 .bind = packet_bind,
4012 .connect = sock_no_connect,
4013 .socketpair = sock_no_socketpair,
4014 .accept = sock_no_accept,
1ce4f28b 4015 .getname = packet_getname,
1da177e4
LT
4016 .poll = packet_poll,
4017 .ioctl = packet_ioctl,
4018 .listen = sock_no_listen,
4019 .shutdown = sock_no_shutdown,
4020 .setsockopt = packet_setsockopt,
4021 .getsockopt = packet_getsockopt,
4022 .sendmsg = packet_sendmsg,
4023 .recvmsg = packet_recvmsg,
4024 .mmap = packet_mmap,
4025 .sendpage = sock_no_sendpage,
4026};
4027
ec1b4cf7 4028static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4029 .family = PF_PACKET,
4030 .create = packet_create,
4031 .owner = THIS_MODULE,
4032};
4033
4034static struct notifier_block packet_netdev_notifier = {
40d4e3df 4035 .notifier_call = packet_notifier,
1da177e4
LT
4036};
4037
4038#ifdef CONFIG_PROC_FS
1da177e4
LT
4039
4040static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4041 __acquires(RCU)
1da177e4 4042{
e372c414 4043 struct net *net = seq_file_net(seq);
808f5114 4044
4045 rcu_read_lock();
4046 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4047}
4048
4049static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4050{
1bf40954 4051 struct net *net = seq_file_net(seq);
808f5114 4052 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4053}
4054
4055static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4056 __releases(RCU)
1da177e4 4057{
808f5114 4058 rcu_read_unlock();
1da177e4
LT
4059}
4060
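/* One line of /proc/net/packet per socket: kernel address, refcount,
 * socket type, protocol number (host order), bound ifindex, running flag,
 * receive-queue bytes, owning uid and inode number.
 */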
1ce4f28b 4061static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4062{
4063 if (v == SEQ_START_TOKEN)
4064 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4065 else {
b7ceabd9 4066 struct sock *s = sk_entry(v);
1da177e4
LT
4067 const struct packet_sock *po = pkt_sk(s);
4068
4069 seq_printf(seq,
71338aa7 4070 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4071 s,
4072 atomic_read(&s->sk_refcnt),
4073 s->sk_type,
4074 ntohs(po->num),
4075 po->ifindex,
4076 po->running,
4077 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4078 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4079 sock_i_ino(s));
1da177e4
LT
4080 }
4081
4082 return 0;
4083}
4084
56b3d975 4085static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4086 .start = packet_seq_start,
4087 .next = packet_seq_next,
4088 .stop = packet_seq_stop,
4089 .show = packet_seq_show,
4090};
4091
4092static int packet_seq_open(struct inode *inode, struct file *file)
4093{
e372c414
DL
4094 return seq_open_net(inode, file, &packet_seq_ops,
4095 sizeof(struct seq_net_private));
1da177e4
LT
4096}
4097
da7071d7 4098static const struct file_operations packet_seq_fops = {
1da177e4
LT
4099 .owner = THIS_MODULE,
4100 .open = packet_seq_open,
4101 .read = seq_read,
4102 .llseek = seq_lseek,
e372c414 4103 .release = seq_release_net,
1da177e4
LT
4104};
4105
4106#endif
4107
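/* Per-namespace setup: initialize the packet socket list and create the
 * /proc/net/packet entry; packet_net_exit() removes it again.
 */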
2c8c1e72 4108static int __net_init packet_net_init(struct net *net)
d12d01d6 4109{
0fa7fa98 4110 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4111 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4112
d4beaa66 4113 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4114 return -ENOMEM;
4115
4116 return 0;
4117}
4118
2c8c1e72 4119static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4120{
ece31ffd 4121 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4122}
4123
4124static struct pernet_operations packet_net_ops = {
4125 .init = packet_net_init,
4126 .exit = packet_net_exit,
4127};
4128
4129
1da177e4
LT
4130static void __exit packet_exit(void)
4131{
1da177e4 4132 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4133 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4134 sock_unregister(PF_PACKET);
4135 proto_unregister(&packet_proto);
4136}
4137
4138static int __init packet_init(void)
4139{
4140 int rc = proto_register(&packet_proto, 0);
4141
4142 if (rc != 0)
4143 goto out;
4144
4145 sock_register(&packet_family_ops);
d12d01d6 4146 register_pernet_subsys(&packet_net_ops);
1da177e4 4147 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4148out:
4149 return rc;
4150}
4151
4152module_init(packet_init);
4153module_exit(packet_exit);
4154MODULE_LICENSE("GPL");
4155MODULE_ALIAS_NETPROTO(PF_PACKET);