/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb, when header
     will not fit to reserved space (tunnel), another ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely, that it points to ll
		 header.  PPP makes it, that is wrong, because introduce
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to correct position,
   packet classifier depends on it.
 */

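/* Illustrative only: a minimal user-space sketch (not compiled as part of
 * this file) of how the two socket flavours described above are opened.
 * SOCK_RAW hands frames to the application with the link-layer header in
 * place, SOCK_DGRAM with it stripped (rx) or built by the device (tx).
 * Standard UAPI calls only; the helper name is made up for the example.
 */
#if 0
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_packet_socket(int ifindex)
{
	/* SOCK_RAW: frames include the ll header; SOCK_DGRAM would not. */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct sockaddr_ll sll = {
		.sll_family	= AF_PACKET,
		.sll_protocol	= htons(ETH_P_ALL),
		.sll_ifindex	= ifindex,
	};

	if (fd < 0 || bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return -1;
	return fd;
}
#endif
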
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

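/* Worked example of the TPACKET_V3 block layout implied by the macros above
 * (illustrative numbers only): with tp_sizeof_priv == 13, BLK_PLUS_PRIV()
 * rounds the private area up to the 8-byte V3_ALIGNMENT, so the first frame
 * starts BLK_HDR_LEN + ALIGN(13, 8) == BLK_HDR_LEN + 16 bytes into the block.
 * prb_open_block() further down sets BLOCK_O2PRIV() to point just past the
 * block header and BLOCK_O2FP() to that first-frame offset.
 */
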
struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
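
/* Illustrative note: packet_direct_xmit() above is the po->xmit path used
 * when user space sets the PACKET_QDISC_BYPASS socket option, e.g. (user-space
 * sketch, not compiled here):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * packet_use_direct_xmit() below keys off exactly that assignment.
 */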

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}
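
/* Worked example for the lookup above (illustrative numbers only): with
 * rb->frames_per_block == 4, position 9 maps to pg_vec_pos == 9 / 4 == 2 and
 * frame_offset == 9 % 4 == 1, i.e. the second frame of the third pg_vec
 * buffer, at byte offset 1 * rb->frame_size within that buffer.
 */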

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
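
/* Worked example (illustrative numbers only): for a 1 MiB block on a 1 Gbit
 * link, __ethtool_get_settings() reports speed == 1000, so msec == 1 and
 * div == 1; mbits == (1048576 * 8) / (1024 * 1024) == 8, giving tmo == 8 and
 * a returned timeout of tmo + 1 == 9 ms; roughly the time needed to fill one
 * block, as the timer-logic comment further down assumes.
 */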

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen,user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close.So we open this
				 * block and restart the timer.
				 * opening a block thaws the queue,restarts timer
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available.user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}
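
/* Illustrative only: fanout_add() is reached from user space through the
 * PACKET_FANOUT socket option, with the group id in the low 16 bits of the
 * argument and the mode in the high 16 bits.  A minimal sketch (not compiled
 * here; the helper name is made up for the example):
 */
#if 0
static int join_fanout_group(int fd, unsigned int id)
{
	unsigned int fanout_arg = (id & 0xffff) | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			  &fanout_arg, sizeof(fanout_arg));
}
#endif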

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}
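
/* Illustrative only: the sk_filter run above is attached from user space with
 * SO_ATTACH_FILTER.  A minimal classic-BPF sketch (not compiled here; the
 * helper name is made up) whose single instruction accepts up to 256 KiB of
 * every packet:
 */
#if 0
static int attach_accept_all_filter(int fd)
{
	struct sock_filter code[] = {
		/* BPF_RET | BPF_K: accept, snap length 0x40000 bytes */
		{ 0x06, 0, 0, 0x00040000 },
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
#endif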

/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
1871
40d4e3df
ED
1872static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1873 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1874{
1875 struct sock *sk;
1876 struct packet_sock *po;
1877 struct sockaddr_ll *sll;
184f489e 1878 union tpacket_uhdr h;
40d4e3df 1879 u8 *skb_head = skb->data;
1da177e4 1880 int skb_len = skb->len;
dbcb5855 1881 unsigned int snaplen, res;
f6fb8f10 1882 unsigned long status = TP_STATUS_USER;
bbd6ef87 1883 unsigned short macoff, netoff, hdrlen;
1da177e4 1884 struct sk_buff *copy_skb = NULL;
bbd6ef87 1885 struct timespec ts;
b9c32fb2 1886 __u32 ts_status;
1da177e4 1887
51846355
AW
1888 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 1889 * We may add members to them up to the current aligned size without forcing
1890 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1891 */
1892 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1893 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1894
1da177e4
LT
1895 if (skb->pkt_type == PACKET_LOOPBACK)
1896 goto drop;
1897
1898 sk = pt->af_packet_priv;
1899 po = pkt_sk(sk);
1900
09ad9bc7 1901 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1902 goto drop;
1903
3b04ddde 1904 if (dev->header_ops) {
1da177e4 1905 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1906 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1907 else if (skb->pkt_type == PACKET_OUTGOING) {
1908 /* Special case: outgoing packets have ll header at head */
bbe735e4 1909 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1910 }
1911 }
1912
8dc41944
HX
1913 if (skb->ip_summed == CHECKSUM_PARTIAL)
1914 status |= TP_STATUS_CSUMNOTREADY;
1915
1da177e4
LT
1916 snaplen = skb->len;
1917
dbcb5855
DM
1918 res = run_filter(skb, sk, snaplen);
1919 if (!res)
fda9ef5d 1920 goto drop_n_restore;
dbcb5855
DM
1921 if (snaplen > res)
1922 snaplen = res;
1da177e4
LT
1923
1924 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1925 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1926 po->tp_reserve;
1da177e4 1927 } else {
95c96174 1928 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1929 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1930 (maclen < 16 ? 16 : maclen)) +
1931 po->tp_reserve;
1da177e4
LT
1932 macoff = netoff - maclen;
1933 }
f6fb8f10 1934 if (po->tp_version <= TPACKET_V2) {
1935 if (macoff + snaplen > po->rx_ring.frame_size) {
1936 if (po->copy_thresh &&
0fd7bac6 1937 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1938 if (skb_shared(skb)) {
1939 copy_skb = skb_clone(skb, GFP_ATOMIC);
1940 } else {
1941 copy_skb = skb_get(skb);
1942 skb_head = skb->data;
1943 }
1944 if (copy_skb)
1945 skb_set_owner_r(copy_skb, sk);
1da177e4 1946 }
f6fb8f10 1947 snaplen = po->rx_ring.frame_size - macoff;
1948 if ((int)snaplen < 0)
1949 snaplen = 0;
1da177e4 1950 }
dc808110
ED
1951 } else if (unlikely(macoff + snaplen >
1952 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
1953 u32 nval;
1954
1955 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
1956 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
1957 snaplen, nval, macoff);
1958 snaplen = nval;
1959 if (unlikely((int)snaplen < 0)) {
1960 snaplen = 0;
1961 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
1962 }
1da177e4 1963 }
1da177e4 1964 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1965 h.raw = packet_current_rx_frame(po, skb,
1966 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1967 if (!h.raw)
1da177e4 1968 goto ring_is_full;
f6fb8f10 1969 if (po->tp_version <= TPACKET_V2) {
1970 packet_increment_rx_head(po, &po->rx_ring);
1971 /*
 1972 * LOSING will be reported until you read the statistics,
 1973 * because the drop counter is clear-on-read (COR).
 1974 * This is done for V1/V2 only, as V3 does not need it
 1975 * at the packet level.
1976 */
ee80fbf3 1977 if (po->stats.stats1.tp_drops)
f6fb8f10 1978 status |= TP_STATUS_LOSING;
1979 }
ee80fbf3 1980 po->stats.stats1.tp_packets++;
1da177e4
LT
1981 if (copy_skb) {
1982 status |= TP_STATUS_COPY;
1983 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1984 }
1da177e4
LT
1985 spin_unlock(&sk->sk_receive_queue.lock);
1986
bbd6ef87 1987 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1988
1989 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1990 getnstimeofday(&ts);
1da177e4 1991
b9c32fb2
DB
1992 status |= ts_status;
1993
bbd6ef87
PM
1994 switch (po->tp_version) {
1995 case TPACKET_V1:
1996 h.h1->tp_len = skb->len;
1997 h.h1->tp_snaplen = snaplen;
1998 h.h1->tp_mac = macoff;
1999 h.h1->tp_net = netoff;
4b457bdf
DB
2000 h.h1->tp_sec = ts.tv_sec;
2001 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2002 hdrlen = sizeof(*h.h1);
2003 break;
2004 case TPACKET_V2:
2005 h.h2->tp_len = skb->len;
2006 h.h2->tp_snaplen = snaplen;
2007 h.h2->tp_mac = macoff;
2008 h.h2->tp_net = netoff;
bbd6ef87
PM
2009 h.h2->tp_sec = ts.tv_sec;
2010 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2011 if (skb_vlan_tag_present(skb)) {
2012 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2013 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2014 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2015 } else {
2016 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2017 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2018 }
e4d26f4b 2019 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2020 hdrlen = sizeof(*h.h2);
2021 break;
f6fb8f10 2022 case TPACKET_V3:
 2023 /* tp_next_offset and the vlan fields are already populated above,
 2024 * so don't clear them here
2025 */
2026 h.h3->tp_status |= status;
2027 h.h3->tp_len = skb->len;
2028 h.h3->tp_snaplen = snaplen;
2029 h.h3->tp_mac = macoff;
2030 h.h3->tp_net = netoff;
f6fb8f10 2031 h.h3->tp_sec = ts.tv_sec;
2032 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2033 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2034 hdrlen = sizeof(*h.h3);
2035 break;
bbd6ef87
PM
2036 default:
2037 BUG();
2038 }
1da177e4 2039
bbd6ef87 2040 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2041 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2042 sll->sll_family = AF_PACKET;
2043 sll->sll_hatype = dev->type;
2044 sll->sll_protocol = skb->protocol;
2045 sll->sll_pkttype = skb->pkt_type;
8032b464 2046 if (unlikely(po->origdev))
80feaacb
PWJ
2047 sll->sll_ifindex = orig_dev->ifindex;
2048 else
2049 sll->sll_ifindex = dev->ifindex;
1da177e4 2050
e16aa207 2051 smp_mb();
f0d4eb29 2052
f6dafa95 2053#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2054 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2055 u8 *start, *end;
2056
f0d4eb29
DB
2057 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2058 macoff + snaplen);
2059
2060 for (start = h.raw; start < end; start += PAGE_SIZE)
2061 flush_dcache_page(pgv_to_page(start));
1da177e4 2062 }
f0d4eb29 2063 smp_wmb();
f6dafa95 2064#endif
f0d4eb29 2065
da413eec 2066 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2067 __packet_set_status(po, h.raw, status);
da413eec
DC
2068 sk->sk_data_ready(sk);
2069 } else {
f6fb8f10 2070 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2071 }
1da177e4
LT
2072
2073drop_n_restore:
2074 if (skb_head != skb->data && skb_shared(skb)) {
2075 skb->data = skb_head;
2076 skb->len = skb_len;
2077 }
2078drop:
1ce4f28b 2079 kfree_skb(skb);
1da177e4
LT
2080 return 0;
2081
2082ring_is_full:
ee80fbf3 2083 po->stats.stats1.tp_drops++;
1da177e4
LT
2084 spin_unlock(&sk->sk_receive_queue.lock);
2085
676d2369 2086 sk->sk_data_ready(sk);
acb5d75b 2087 kfree_skb(copy_skb);
1da177e4
LT
2088 goto drop_n_restore;
2089}
2090
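tpacket_rcv() above is the kernel half of PACKET_RX_RING: it copies each accepted packet into the next mmap()ed ring frame and, for V1/V2, flips tp_status to TP_STATUS_USER before waking the socket. A minimal TPACKET_V2 reader sketch; the block/frame sizes assume a 4096-byte PAGE_SIZE, the helper name is illustrative, and a real reader would walk every frame instead of only the first:

#include <stdio.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int rx_ring_read_one(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 4096,               /* multiple of PAGE_SIZE */
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,               /* multiple of TPACKET_ALIGNMENT */
		.tp_frame_nr   = 64 * (4096 / 2048), /* frames_per_block * block_nr */
	};
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	struct tpacket2_hdr *hdr;
	void *ring;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)))
		return -1;

	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return -1;

	hdr = ring;                                  /* first frame only, for brevity */
	while (!(hdr->tp_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);                   /* wait for tpacket_rcv() to fill it */

	printf("frame: len=%u snaplen=%u\n", hdr->tp_len, hdr->tp_snaplen);
	hdr->tp_status = TP_STATUS_KERNEL;           /* hand the frame back to the kernel */
	return 0;
}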
69e3c75f
JB
2091static void tpacket_destruct_skb(struct sk_buff *skb)
2092{
2093 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2094
69e3c75f 2095 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2096 void *ph;
b9c32fb2
DB
2097 __u32 ts;
2098
69e3c75f 2099 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2100 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2101
2102 ts = __packet_set_timestamp(po, ph, skb);
2103 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2104 }
2105
2106 sock_wfree(skb);
2107}
2108
9c707762
WB
2109static bool ll_header_truncated(const struct net_device *dev, int len)
2110{
2111 /* net device doesn't like empty head */
2112 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2113 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2114 current->comm, len, dev->hard_header_len);
2115 return true;
2116 }
2117
2118 return false;
2119}
2120
40d4e3df
ED
2121static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2122 void *frame, struct net_device *dev, int size_max,
ae641949 2123 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2124{
184f489e 2125 union tpacket_uhdr ph;
09effa67 2126 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2127 struct socket *sock = po->sk.sk_socket;
2128 struct page *page;
2129 void *data;
2130 int err;
2131
2132 ph.raw = frame;
2133
2134 skb->protocol = proto;
2135 skb->dev = dev;
2136 skb->priority = po->sk.sk_priority;
2d37a186 2137 skb->mark = po->sk.sk_mark;
2e31396f 2138 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2139 skb_shinfo(skb)->destructor_arg = ph.raw;
2140
2141 switch (po->tp_version) {
2142 case TPACKET_V2:
2143 tp_len = ph.h2->tp_len;
2144 break;
2145 default:
2146 tp_len = ph.h1->tp_len;
2147 break;
2148 }
09effa67
DM
2149 if (unlikely(tp_len > size_max)) {
2150 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2151 return -EMSGSIZE;
2152 }
69e3c75f 2153
ae641949 2154 skb_reserve(skb, hlen);
69e3c75f 2155 skb_reset_network_header(skb);
c1aad275 2156
d346a3fa
DB
2157 if (!packet_use_direct_xmit(po))
2158 skb_probe_transport_header(skb, 0);
2159 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2160 int off_min, off_max, off;
2161 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2162 off_max = po->tx_ring.frame_size - tp_len;
2163 if (sock->type == SOCK_DGRAM) {
2164 switch (po->tp_version) {
2165 case TPACKET_V2:
2166 off = ph.h2->tp_net;
2167 break;
2168 default:
2169 off = ph.h1->tp_net;
2170 break;
2171 }
2172 } else {
2173 switch (po->tp_version) {
2174 case TPACKET_V2:
2175 off = ph.h2->tp_mac;
2176 break;
2177 default:
2178 off = ph.h1->tp_mac;
2179 break;
2180 }
2181 }
2182 if (unlikely((off < off_min) || (off_max < off)))
2183 return -EINVAL;
2184 data = ph.raw + off;
2185 } else {
2186 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2187 }
69e3c75f
JB
2188 to_write = tp_len;
2189
2190 if (sock->type == SOCK_DGRAM) {
2191 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2192 NULL, tp_len);
2193 if (unlikely(err < 0))
2194 return -EINVAL;
40d4e3df 2195 } else if (dev->hard_header_len) {
9c707762 2196 if (ll_header_truncated(dev, tp_len))
69e3c75f 2197 return -EINVAL;
69e3c75f
JB
2198
2199 skb_push(skb, dev->hard_header_len);
2200 err = skb_store_bits(skb, 0, data,
2201 dev->hard_header_len);
2202 if (unlikely(err))
2203 return err;
2204
2205 data += dev->hard_header_len;
2206 to_write -= dev->hard_header_len;
2207 }
2208
69e3c75f
JB
2209 offset = offset_in_page(data);
2210 len_max = PAGE_SIZE - offset;
2211 len = ((to_write > len_max) ? len_max : to_write);
2212
2213 skb->data_len = to_write;
2214 skb->len += to_write;
2215 skb->truesize += to_write;
2216 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2217
2218 while (likely(to_write)) {
2219 nr_frags = skb_shinfo(skb)->nr_frags;
2220
2221 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2222 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2223 MAX_SKB_FRAGS);
69e3c75f
JB
2224 return -EFAULT;
2225 }
2226
0af55bb5
CG
2227 page = pgv_to_page(data);
2228 data += len;
69e3c75f
JB
2229 flush_dcache_page(page);
2230 get_page(page);
0af55bb5 2231 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2232 to_write -= len;
2233 offset = 0;
2234 len_max = PAGE_SIZE;
2235 len = ((to_write > len_max) ? len_max : to_write);
2236 }
2237
2238 return tp_len;
2239}
2240
2241static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2242{
69e3c75f
JB
2243 struct sk_buff *skb;
2244 struct net_device *dev;
2245 __be16 proto;
09effa67 2246 int err, reserve = 0;
40d4e3df 2247 void *ph;
342dfc30 2248 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2249 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2250 int tp_len, size_max;
2251 unsigned char *addr;
2252 int len_sum = 0;
9e67030a 2253 int status = TP_STATUS_AVAILABLE;
ae641949 2254 int hlen, tlen;
69e3c75f 2255
69e3c75f
JB
2256 mutex_lock(&po->pg_vec_lock);
2257
66e56cd4 2258 if (likely(saddr == NULL)) {
e40526cb 2259 dev = packet_cached_dev_get(po);
69e3c75f
JB
2260 proto = po->num;
2261 addr = NULL;
2262 } else {
2263 err = -EINVAL;
2264 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2265 goto out;
2266 if (msg->msg_namelen < (saddr->sll_halen
2267 + offsetof(struct sockaddr_ll,
2268 sll_addr)))
2269 goto out;
69e3c75f
JB
2270 proto = saddr->sll_protocol;
2271 addr = saddr->sll_addr;
827d9780 2272 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2273 }
2274
69e3c75f
JB
2275 err = -ENXIO;
2276 if (unlikely(dev == NULL))
2277 goto out;
69e3c75f
JB
2278 err = -ENETDOWN;
2279 if (unlikely(!(dev->flags & IFF_UP)))
2280 goto out_put;
2281
52f1454f 2282 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2283 size_max = po->tx_ring.frame_size
b5dd884e 2284 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2285
09effa67
DM
2286 if (size_max > dev->mtu + reserve)
2287 size_max = dev->mtu + reserve;
2288
69e3c75f
JB
2289 do {
2290 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2291 TP_STATUS_SEND_REQUEST);
69e3c75f 2292 if (unlikely(ph == NULL)) {
87a2fd28
DB
2293 if (need_wait && need_resched())
2294 schedule();
69e3c75f
JB
2295 continue;
2296 }
2297
2298 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2299 hlen = LL_RESERVED_SPACE(dev);
2300 tlen = dev->needed_tailroom;
69e3c75f 2301 skb = sock_alloc_send_skb(&po->sk,
ae641949 2302 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2303 0, &err);
2304
2305 if (unlikely(skb == NULL))
2306 goto out_status;
2307
2308 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f
DB
2309 addr, hlen);
2310 if (tp_len > dev->mtu + dev->hard_header_len) {
2311 struct ethhdr *ehdr;
2312 /* Earlier code assumed this would be a VLAN pkt,
2313 * double-check this now that we have the actual
2314 * packet in hand.
2315 */
69e3c75f 2316
52f1454f
DB
2317 skb_reset_mac_header(skb);
2318 ehdr = eth_hdr(skb);
2319 if (ehdr->h_proto != htons(ETH_P_8021Q))
2320 tp_len = -EMSGSIZE;
2321 }
69e3c75f
JB
2322 if (unlikely(tp_len < 0)) {
2323 if (po->tp_loss) {
2324 __packet_set_status(po, ph,
2325 TP_STATUS_AVAILABLE);
2326 packet_increment_head(&po->tx_ring);
2327 kfree_skb(skb);
2328 continue;
2329 } else {
2330 status = TP_STATUS_WRONG_FORMAT;
2331 err = tp_len;
2332 goto out_status;
2333 }
2334 }
2335
0fd5d57b
DB
2336 packet_pick_tx_queue(dev, skb);
2337
69e3c75f
JB
2338 skb->destructor = tpacket_destruct_skb;
2339 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2340 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2341
2342 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2343 err = po->xmit(skb);
eb70df13
JP
2344 if (unlikely(err > 0)) {
2345 err = net_xmit_errno(err);
2346 if (err && __packet_get_status(po, ph) ==
2347 TP_STATUS_AVAILABLE) {
2348 /* skb was destructed already */
2349 skb = NULL;
2350 goto out_status;
2351 }
2352 /*
2353 * skb was dropped but not destructed yet;
2354 * let's treat it like congestion or err < 0
2355 */
2356 err = 0;
2357 }
69e3c75f
JB
2358 packet_increment_head(&po->tx_ring);
2359 len_sum += tp_len;
b0138408
DB
2360 } while (likely((ph != NULL) ||
2361 /* Note: packet_read_pending() might be slow if we have
 2362 * to call it, as it is a per-cpu variable, but in the fast path
2363 * we already short-circuit the loop with the first
2364 * condition, and luckily don't have to go that path
2365 * anyway.
2366 */
2367 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2368
2369 err = len_sum;
2370 goto out_put;
2371
69e3c75f
JB
2372out_status:
2373 __packet_set_status(po, ph, status);
2374 kfree_skb(skb);
2375out_put:
e40526cb 2376 dev_put(dev);
69e3c75f
JB
2377out:
2378 mutex_unlock(&po->pg_vec_lock);
2379 return err;
2380}
69e3c75f 2381
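tpacket_snd() above walks the TX ring and transmits every frame whose tp_status userspace set to TP_STATUS_SEND_REQUEST; tpacket_destruct_skb() later marks the slot TP_STATUS_AVAILABLE again. A sketch of queueing one frame on an already configured, mmap()ed and bound TPACKET_V2 TX ring; the helper name is illustrative and only the first slot is used for brevity:

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_ring_send_one(int fd, void *ring, const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr = ring;    /* first frame slot only */
	char *data = (char *)ring + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;                  /* kernel has not released this slot yet */

	memcpy(data, frame, len);           /* 'frame' is a complete link-layer frame */
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* kick tpacket_snd(); a zero-length send flushes all queued frames */
	return sendto(fd, NULL, 0, 0, NULL, 0);
}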
eea49cc9
OJ
2382static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2383 size_t reserve, size_t len,
2384 size_t linear, int noblock,
2385 int *err)
bfd5f4a3
SS
2386{
2387 struct sk_buff *skb;
2388
2389 /* Under a page? Don't bother with paged skb. */
2390 if (prepad + len < PAGE_SIZE || !linear)
2391 linear = len;
2392
2393 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2394 err, 0);
bfd5f4a3
SS
2395 if (!skb)
2396 return NULL;
2397
2398 skb_reserve(skb, reserve);
2399 skb_put(skb, linear);
2400 skb->data_len = len - linear;
2401 skb->len += len - linear;
2402
2403 return skb;
2404}
2405
d346a3fa 2406static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2407{
2408 struct sock *sk = sock->sk;
342dfc30 2409 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2410 struct sk_buff *skb;
2411 struct net_device *dev;
0e11c91e 2412 __be16 proto;
1da177e4 2413 unsigned char *addr;
827d9780 2414 int err, reserve = 0;
bfd5f4a3
SS
2415 struct virtio_net_hdr vnet_hdr = { 0 };
2416 int offset = 0;
2417 int vnet_hdr_len;
2418 struct packet_sock *po = pkt_sk(sk);
2419 unsigned short gso_type = 0;
ae641949 2420 int hlen, tlen;
3bdc0eba 2421 int extra_len = 0;
8feb2fb2 2422 ssize_t n;
1da177e4
LT
2423
2424 /*
1ce4f28b 2425 * Get and verify the address.
1da177e4 2426 */
1ce4f28b 2427
66e56cd4 2428 if (likely(saddr == NULL)) {
e40526cb 2429 dev = packet_cached_dev_get(po);
1da177e4
LT
2430 proto = po->num;
2431 addr = NULL;
2432 } else {
2433 err = -EINVAL;
2434 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2435 goto out;
0fb375fb
EB
2436 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2437 goto out;
1da177e4
LT
2438 proto = saddr->sll_protocol;
2439 addr = saddr->sll_addr;
827d9780 2440 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2441 }
2442
1da177e4 2443 err = -ENXIO;
e40526cb 2444 if (unlikely(dev == NULL))
1da177e4 2445 goto out_unlock;
d5e76b0a 2446 err = -ENETDOWN;
e40526cb 2447 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2448 goto out_unlock;
2449
e40526cb
DB
2450 if (sock->type == SOCK_RAW)
2451 reserve = dev->hard_header_len;
bfd5f4a3
SS
2452 if (po->has_vnet_hdr) {
2453 vnet_hdr_len = sizeof(vnet_hdr);
2454
2455 err = -EINVAL;
2456 if (len < vnet_hdr_len)
2457 goto out_unlock;
2458
2459 len -= vnet_hdr_len;
2460
8feb2fb2 2461 err = -EFAULT;
c0371da6 2462 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2463 if (n != vnet_hdr_len)
bfd5f4a3
SS
2464 goto out_unlock;
2465
2466 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2467 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2468 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2469 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2470 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2471 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2472 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2473
2474 err = -EINVAL;
dc9e5153 2475 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2476 goto out_unlock;
2477
2478 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2479 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2480 case VIRTIO_NET_HDR_GSO_TCPV4:
2481 gso_type = SKB_GSO_TCPV4;
2482 break;
2483 case VIRTIO_NET_HDR_GSO_TCPV6:
2484 gso_type = SKB_GSO_TCPV6;
2485 break;
2486 case VIRTIO_NET_HDR_GSO_UDP:
2487 gso_type = SKB_GSO_UDP;
2488 break;
2489 default:
2490 goto out_unlock;
2491 }
2492
2493 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2494 gso_type |= SKB_GSO_TCP_ECN;
2495
2496 if (vnet_hdr.gso_size == 0)
2497 goto out_unlock;
2498
2499 }
2500 }
2501
3bdc0eba
BG
2502 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2503 if (!netif_supports_nofcs(dev)) {
2504 err = -EPROTONOSUPPORT;
2505 goto out_unlock;
2506 }
2507 extra_len = 4; /* We're doing our own CRC */
2508 }
2509
1da177e4 2510 err = -EMSGSIZE;
3bdc0eba 2511 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2512 goto out_unlock;
2513
bfd5f4a3 2514 err = -ENOBUFS;
ae641949
HX
2515 hlen = LL_RESERVED_SPACE(dev);
2516 tlen = dev->needed_tailroom;
dc9e5153
MT
2517 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2518 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2519 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2520 if (skb == NULL)
1da177e4
LT
2521 goto out_unlock;
2522
bfd5f4a3 2523 skb_set_network_header(skb, reserve);
1da177e4 2524
0c4e8581 2525 err = -EINVAL;
9c707762
WB
2526 if (sock->type == SOCK_DGRAM) {
2527 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2528 if (unlikely(offset < 0))
9c707762
WB
2529 goto out_free;
2530 } else {
2531 if (ll_header_truncated(dev, len))
2532 goto out_free;
2533 }
1da177e4
LT
2534
2535 /* Returns -EFAULT on error */
c0371da6 2536 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2537 if (err)
2538 goto out_free;
bf84a010
DB
2539
2540 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2541
3bdc0eba 2542 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2543 /* Earlier code assumed this would be a VLAN pkt,
2544 * double-check this now that we have the actual
2545 * packet in hand.
2546 */
2547 struct ethhdr *ehdr;
2548 skb_reset_mac_header(skb);
2549 ehdr = eth_hdr(skb);
2550 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2551 err = -EMSGSIZE;
2552 goto out_free;
2553 }
57f89bfa
BG
2554 }
2555
09effa67
DM
2556 skb->protocol = proto;
2557 skb->dev = dev;
1da177e4 2558 skb->priority = sk->sk_priority;
2d37a186 2559 skb->mark = sk->sk_mark;
0fd5d57b
DB
2560
2561 packet_pick_tx_queue(dev, skb);
1da177e4 2562
bfd5f4a3
SS
2563 if (po->has_vnet_hdr) {
2564 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2565 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2566 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2567 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2568 err = -EINVAL;
2569 goto out_free;
2570 }
2571 }
2572
dc9e5153
MT
2573 skb_shinfo(skb)->gso_size =
2574 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2575 skb_shinfo(skb)->gso_type = gso_type;
2576
2577 /* Header must be checked, and gso_segs computed. */
2578 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2579 skb_shinfo(skb)->gso_segs = 0;
2580
2581 len += vnet_hdr_len;
2582 }
2583
d346a3fa
DB
2584 if (!packet_use_direct_xmit(po))
2585 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2586 if (unlikely(extra_len == 4))
2587 skb->no_fcs = 1;
2588
d346a3fa 2589 err = po->xmit(skb);
1da177e4
LT
2590 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2591 goto out_unlock;
2592
e40526cb 2593 dev_put(dev);
1da177e4 2594
40d4e3df 2595 return len;
1da177e4
LT
2596
2597out_free:
2598 kfree_skb(skb);
2599out_unlock:
e40526cb 2600 if (dev)
1da177e4
LT
2601 dev_put(dev);
2602out:
2603 return err;
2604}
2605
69e3c75f
JB
2606static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2607 struct msghdr *msg, size_t len)
2608{
69e3c75f
JB
2609 struct sock *sk = sock->sk;
2610 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2611
69e3c75f
JB
2612 if (po->tx_ring.pg_vec)
2613 return tpacket_snd(po, msg);
2614 else
69e3c75f
JB
2615 return packet_snd(sock, msg, len);
2616}
2617
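packet_snd() is the non-ring transmit path: for SOCK_DGRAM sockets it builds the link-layer header itself via dev_hard_header(), using the address carried in the sockaddr_ll passed as msg_name. A minimal sendto() sketch for that case; the helper name and the fixed 6-byte address length are assumptions for an Ethernet device:

#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int dgram_send(int fd, const char *ifname, const unsigned char dst_mac[6],
		      unsigned short ethertype, const void *buf, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_ifindex  = if_nametoindex(ifname);
	sll.sll_protocol = htons(ethertype);
	sll.sll_halen    = 6;                    /* Ethernet hardware address length */
	memcpy(sll.sll_addr, dst_mac, 6);

	/* the kernel prepends the Ethernet header on the SOCK_DGRAM path */
	return sendto(fd, buf, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}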
1da177e4
LT
2618/*
2619 * Close a PACKET socket. This is fairly simple. We immediately go
2620 * to 'closed' state and remove our protocol entry in the device list.
2621 */
2622
2623static int packet_release(struct socket *sock)
2624{
2625 struct sock *sk = sock->sk;
2626 struct packet_sock *po;
d12d01d6 2627 struct net *net;
f6fb8f10 2628 union tpacket_req_u req_u;
1da177e4
LT
2629
2630 if (!sk)
2631 return 0;
2632
3b1e0a65 2633 net = sock_net(sk);
1da177e4
LT
2634 po = pkt_sk(sk);
2635
0fa7fa98 2636 mutex_lock(&net->packet.sklist_lock);
808f5114 2637 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2638 mutex_unlock(&net->packet.sklist_lock);
2639
2640 preempt_disable();
920de804 2641 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2642 preempt_enable();
1da177e4 2643
808f5114 2644 spin_lock(&po->bind_lock);
ce06b03e 2645 unregister_prot_hook(sk, false);
66e56cd4
DB
2646 packet_cached_dev_reset(po);
2647
160ff18a
BG
2648 if (po->prot_hook.dev) {
2649 dev_put(po->prot_hook.dev);
2650 po->prot_hook.dev = NULL;
2651 }
808f5114 2652 spin_unlock(&po->bind_lock);
1da177e4 2653
1da177e4 2654 packet_flush_mclist(sk);
1da177e4 2655
9665d5d6
PS
2656 if (po->rx_ring.pg_vec) {
2657 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2658 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2659 }
69e3c75f 2660
9665d5d6
PS
2661 if (po->tx_ring.pg_vec) {
2662 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2663 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2664 }
1da177e4 2665
dc99f600
DM
2666 fanout_release(sk);
2667
808f5114 2668 synchronize_net();
1da177e4
LT
2669 /*
2670 * Now the socket is dead. No more input will appear.
2671 */
1da177e4
LT
2672 sock_orphan(sk);
2673 sock->sk = NULL;
2674
2675 /* Purge queues */
2676
2677 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2678 packet_free_pending(po);
17ab56a2 2679 sk_refcnt_debug_release(sk);
1da177e4
LT
2680
2681 sock_put(sk);
2682 return 0;
2683}
2684
2685/*
2686 * Attach a packet hook.
2687 */
2688
902fefb8 2689static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2690{
2691 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2692 const struct net_device *dev_curr;
2693 __be16 proto_curr;
2694 bool need_rehook;
dc99f600 2695
aef950b4
WY
2696 if (po->fanout) {
2697 if (dev)
2698 dev_put(dev);
2699
dc99f600 2700 return -EINVAL;
aef950b4 2701 }
1da177e4
LT
2702
2703 lock_sock(sk);
1da177e4 2704 spin_lock(&po->bind_lock);
66e56cd4 2705
902fefb8
DB
2706 proto_curr = po->prot_hook.type;
2707 dev_curr = po->prot_hook.dev;
2708
2709 need_rehook = proto_curr != proto || dev_curr != dev;
2710
2711 if (need_rehook) {
2712 unregister_prot_hook(sk, true);
1da177e4 2713
902fefb8
DB
2714 po->num = proto;
2715 po->prot_hook.type = proto;
1da177e4 2716
902fefb8
DB
2717 if (po->prot_hook.dev)
2718 dev_put(po->prot_hook.dev);
2719
2720 po->prot_hook.dev = dev;
2721
2722 po->ifindex = dev ? dev->ifindex : 0;
2723 packet_cached_dev_assign(po, dev);
2724 }
66e56cd4 2725
902fefb8 2726 if (proto == 0 || !need_rehook)
1da177e4
LT
2727 goto out_unlock;
2728
be85d4ad 2729 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2730 register_prot_hook(sk);
be85d4ad
UT
2731 } else {
2732 sk->sk_err = ENETDOWN;
2733 if (!sock_flag(sk, SOCK_DEAD))
2734 sk->sk_error_report(sk);
1da177e4
LT
2735 }
2736
2737out_unlock:
2738 spin_unlock(&po->bind_lock);
2739 release_sock(sk);
2740 return 0;
2741}
2742
2743/*
2744 * Bind a packet socket to a device
2745 */
2746
40d4e3df
ED
2747static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2748 int addr_len)
1da177e4 2749{
40d4e3df 2750 struct sock *sk = sock->sk;
1da177e4
LT
2751 char name[15];
2752 struct net_device *dev;
2753 int err = -ENODEV;
1ce4f28b 2754
1da177e4
LT
2755 /*
2756 * Check legality
2757 */
1ce4f28b 2758
8ae55f04 2759 if (addr_len != sizeof(struct sockaddr))
1da177e4 2760 return -EINVAL;
40d4e3df 2761 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2762
3b1e0a65 2763 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2764 if (dev)
1da177e4 2765 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2766 return err;
2767}
1da177e4
LT
2768
2769static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2770{
40d4e3df
ED
2771 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2772 struct sock *sk = sock->sk;
1da177e4
LT
2773 struct net_device *dev = NULL;
2774 int err;
2775
2776
2777 /*
2778 * Check legality
2779 */
1ce4f28b 2780
1da177e4
LT
2781 if (addr_len < sizeof(struct sockaddr_ll))
2782 return -EINVAL;
2783 if (sll->sll_family != AF_PACKET)
2784 return -EINVAL;
2785
2786 if (sll->sll_ifindex) {
2787 err = -ENODEV;
3b1e0a65 2788 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2789 if (dev == NULL)
2790 goto out;
2791 }
2792 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2793
2794out:
2795 return err;
2796}
2797
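packet_bind() above resolves sll_ifindex to a device and passes it, together with sll_protocol (or the socket's current protocol), to packet_do_bind(), which re-registers the protocol hook. A typical userspace call, assuming the interface name is known; the helper name is illustrative:

#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);     /* keep receiving every protocol */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}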
2798static struct proto packet_proto = {
2799 .name = "PACKET",
2800 .owner = THIS_MODULE,
2801 .obj_size = sizeof(struct packet_sock),
2802};
2803
2804/*
1ce4f28b 2805 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2806 */
2807
3f378b68
EP
2808static int packet_create(struct net *net, struct socket *sock, int protocol,
2809 int kern)
1da177e4
LT
2810{
2811 struct sock *sk;
2812 struct packet_sock *po;
0e11c91e 2813 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2814 int err;
2815
df008c91 2816 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2817 return -EPERM;
be02097c
DM
2818 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2819 sock->type != SOCK_PACKET)
1da177e4
LT
2820 return -ESOCKTNOSUPPORT;
2821
2822 sock->state = SS_UNCONNECTED;
2823
2824 err = -ENOBUFS;
6257ff21 2825 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2826 if (sk == NULL)
2827 goto out;
2828
2829 sock->ops = &packet_ops;
1da177e4
LT
2830 if (sock->type == SOCK_PACKET)
2831 sock->ops = &packet_ops_spkt;
be02097c 2832
1da177e4
LT
2833 sock_init_data(sock, sk);
2834
2835 po = pkt_sk(sk);
2836 sk->sk_family = PF_PACKET;
0e11c91e 2837 po->num = proto;
d346a3fa 2838 po->xmit = dev_queue_xmit;
66e56cd4 2839
b0138408
DB
2840 err = packet_alloc_pending(po);
2841 if (err)
2842 goto out2;
2843
66e56cd4 2844 packet_cached_dev_reset(po);
1da177e4
LT
2845
2846 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2847 sk_refcnt_debug_inc(sk);
1da177e4
LT
2848
2849 /*
2850 * Attach a protocol block
2851 */
2852
2853 spin_lock_init(&po->bind_lock);
905db440 2854 mutex_init(&po->pg_vec_lock);
1da177e4 2855 po->prot_hook.func = packet_rcv;
be02097c 2856
1da177e4
LT
2857 if (sock->type == SOCK_PACKET)
2858 po->prot_hook.func = packet_rcv_spkt;
be02097c 2859
1da177e4
LT
2860 po->prot_hook.af_packet_priv = sk;
2861
0e11c91e
AV
2862 if (proto) {
2863 po->prot_hook.type = proto;
ce06b03e 2864 register_prot_hook(sk);
1da177e4
LT
2865 }
2866
0fa7fa98 2867 mutex_lock(&net->packet.sklist_lock);
808f5114 2868 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2869 mutex_unlock(&net->packet.sklist_lock);
2870
2871 preempt_disable();
3680453c 2872 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2873 preempt_enable();
808f5114 2874
40d4e3df 2875 return 0;
b0138408
DB
2876out2:
2877 sk_free(sk);
1da177e4
LT
2878out:
2879 return err;
2880}
2881
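packet_create() above is reached through socket(2) and rejects callers without CAP_NET_RAW. A sketch of the usual call; SOCK_RAW keeps the link-layer header, SOCK_DGRAM strips it, and SOCK_PACKET is the obsolete compatibility interface:

#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>

static int open_packet_socket(void)
{
	/* requires CAP_NET_RAW; htons(ETH_P_ALL) asks for every protocol */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}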
2882/*
2883 * Pull a packet from our receive queue and hand it to the user.
2884 * If necessary we block.
2885 */
2886
2887static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2888 struct msghdr *msg, size_t len, int flags)
2889{
2890 struct sock *sk = sock->sk;
2891 struct sk_buff *skb;
2892 int copied, err;
bfd5f4a3 2893 int vnet_hdr_len = 0;
1da177e4
LT
2894
2895 err = -EINVAL;
ed85b565 2896 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2897 goto out;
2898
2899#if 0
2900 /* What error should we return now? EUNATTACH? */
2901 if (pkt_sk(sk)->ifindex < 0)
2902 return -ENODEV;
2903#endif
2904
ed85b565 2905 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2906 err = sock_recv_errqueue(sk, msg, len,
2907 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2908 goto out;
2909 }
2910
1da177e4
LT
2911 /*
2912 * Call the generic datagram receiver. This handles all sorts
2913 * of horrible races and re-entrancy so we can forget about it
2914 * in the protocol layers.
2915 *
 2916 * Now it will return ENETDOWN, if the device has just gone down,
2917 * but then it will block.
2918 */
2919
40d4e3df 2920 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2921
2922 /*
1ce4f28b 2923 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
 2924 * handles the blocking for us, we don't have to see or worry about
 2925 * blocking retries.
2926 */
2927
8ae55f04 2928 if (skb == NULL)
1da177e4
LT
2929 goto out;
2930
bfd5f4a3
SS
2931 if (pkt_sk(sk)->has_vnet_hdr) {
2932 struct virtio_net_hdr vnet_hdr = { 0 };
2933
2934 err = -EINVAL;
2935 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2936 if (len < vnet_hdr_len)
bfd5f4a3
SS
2937 goto out_free;
2938
1f18b717
MK
2939 len -= vnet_hdr_len;
2940
bfd5f4a3
SS
2941 if (skb_is_gso(skb)) {
2942 struct skb_shared_info *sinfo = skb_shinfo(skb);
2943
2944 /* This is a hint as to how much should be linear. */
dc9e5153
MT
2945 vnet_hdr.hdr_len =
2946 __cpu_to_virtio16(false, skb_headlen(skb));
2947 vnet_hdr.gso_size =
2948 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
2949 if (sinfo->gso_type & SKB_GSO_TCPV4)
2950 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2951 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2952 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2953 else if (sinfo->gso_type & SKB_GSO_UDP)
2954 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2955 else if (sinfo->gso_type & SKB_GSO_FCOE)
2956 goto out_free;
2957 else
2958 BUG();
2959 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2960 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2961 } else
2962 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2963
2964 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2965 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
2966 vnet_hdr.csum_start = __cpu_to_virtio16(false,
2967 skb_checksum_start_offset(skb));
2968 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
2969 skb->csum_offset);
10a8d94a
JW
2970 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2971 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2972 } /* else everything is zero */
2973
7eab8d9e 2974 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
2975 if (err < 0)
2976 goto out_free;
2977 }
2978
f3d33426
HFS
2979 /* You lose any data beyond the buffer you gave. If it worries
2980 * a user program they can ask the device for its MTU
2981 * anyway.
1da177e4 2982 */
1da177e4 2983 copied = skb->len;
40d4e3df
ED
2984 if (copied > len) {
2985 copied = len;
2986 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2987 }
2988
51f3d02b 2989 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
2990 if (err)
2991 goto out_free;
2992
3b885787 2993 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2994
f3d33426
HFS
2995 if (msg->msg_name) {
2996 /* If the address length field is there to be filled
2997 * in, we fill it in now.
2998 */
2999 if (sock->type == SOCK_PACKET) {
342dfc30 3000 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3001 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3002 } else {
3003 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3004 msg->msg_namelen = sll->sll_halen +
3005 offsetof(struct sockaddr_ll, sll_addr);
3006 }
ffbc6111
HX
3007 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3008 msg->msg_namelen);
f3d33426 3009 }
1da177e4 3010
8dc41944 3011 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3012 struct tpacket_auxdata aux;
3013
3014 aux.tp_status = TP_STATUS_USER;
3015 if (skb->ip_summed == CHECKSUM_PARTIAL)
3016 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3017 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
3018 aux.tp_snaplen = skb->len;
3019 aux.tp_mac = 0;
bbe735e4 3020 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3021 if (skb_vlan_tag_present(skb)) {
3022 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3023 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3024 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3025 } else {
3026 aux.tp_vlan_tci = 0;
a0cdfcf3 3027 aux.tp_vlan_tpid = 0;
a3bcc23e 3028 }
ffbc6111 3029 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3030 }
3031
1da177e4
LT
3032 /*
3033 * Free or return the buffer as appropriate. Again this
3034 * hides all the races and re-entrancy issues from us.
3035 */
bfd5f4a3 3036 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3037
3038out_free:
3039 skb_free_datagram(sk, skb);
3040out:
3041 return err;
3042}
3043
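When PACKET_AUXDATA is enabled, packet_recvmsg() above attaches a struct tpacket_auxdata control message carrying the original length, snaplen and VLAN tag of each packet. A sketch of reading it; the helper name is illustrative and error handling is trimmed:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int recv_with_auxdata(int fd, void *buf, size_t len)
{
	int one = 1;
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	ssize_t n;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			printf("len %u snaplen %u vlan_tci %u\n",
			       aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci);
		}
	}
	return (int)n;
}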
1da177e4
LT
3044static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3045 int *uaddr_len, int peer)
3046{
3047 struct net_device *dev;
3048 struct sock *sk = sock->sk;
3049
3050 if (peer)
3051 return -EOPNOTSUPP;
3052
3053 uaddr->sa_family = AF_PACKET;
2dc85bf3 3054 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3055 rcu_read_lock();
3056 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3057 if (dev)
2dc85bf3 3058 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3059 rcu_read_unlock();
1da177e4
LT
3060 *uaddr_len = sizeof(*uaddr);
3061
3062 return 0;
3063}
1da177e4
LT
3064
3065static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3066 int *uaddr_len, int peer)
3067{
3068 struct net_device *dev;
3069 struct sock *sk = sock->sk;
3070 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3071 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3072
3073 if (peer)
3074 return -EOPNOTSUPP;
3075
3076 sll->sll_family = AF_PACKET;
3077 sll->sll_ifindex = po->ifindex;
3078 sll->sll_protocol = po->num;
67286640 3079 sll->sll_pkttype = 0;
654d1f8a
ED
3080 rcu_read_lock();
3081 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3082 if (dev) {
3083 sll->sll_hatype = dev->type;
3084 sll->sll_halen = dev->addr_len;
3085 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3086 } else {
3087 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3088 sll->sll_halen = 0;
3089 }
654d1f8a 3090 rcu_read_unlock();
0fb375fb 3091 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3092
3093 return 0;
3094}
3095
2aeb0b88
WC
3096static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3097 int what)
1da177e4
LT
3098{
3099 switch (i->type) {
3100 case PACKET_MR_MULTICAST:
1162563f
JP
3101 if (i->alen != dev->addr_len)
3102 return -EINVAL;
1da177e4 3103 if (what > 0)
22bedad3 3104 return dev_mc_add(dev, i->addr);
1da177e4 3105 else
22bedad3 3106 return dev_mc_del(dev, i->addr);
1da177e4
LT
3107 break;
3108 case PACKET_MR_PROMISC:
2aeb0b88 3109 return dev_set_promiscuity(dev, what);
1da177e4 3110 case PACKET_MR_ALLMULTI:
2aeb0b88 3111 return dev_set_allmulti(dev, what);
d95ed927 3112 case PACKET_MR_UNICAST:
1162563f
JP
3113 if (i->alen != dev->addr_len)
3114 return -EINVAL;
d95ed927 3115 if (what > 0)
a748ee24 3116 return dev_uc_add(dev, i->addr);
d95ed927 3117 else
a748ee24 3118 return dev_uc_del(dev, i->addr);
d95ed927 3119 break;
40d4e3df
ED
3120 default:
3121 break;
1da177e4 3122 }
2aeb0b88 3123 return 0;
1da177e4
LT
3124}
3125
82f17091
FR
3126static void packet_dev_mclist_delete(struct net_device *dev,
3127 struct packet_mclist **mlp)
1da177e4 3128{
82f17091
FR
3129 struct packet_mclist *ml;
3130
3131 while ((ml = *mlp) != NULL) {
3132 if (ml->ifindex == dev->ifindex) {
3133 packet_dev_mc(dev, ml, -1);
3134 *mlp = ml->next;
3135 kfree(ml);
3136 } else
3137 mlp = &ml->next;
1da177e4
LT
3138 }
3139}
3140
0fb375fb 3141static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3142{
3143 struct packet_sock *po = pkt_sk(sk);
3144 struct packet_mclist *ml, *i;
3145 struct net_device *dev;
3146 int err;
3147
3148 rtnl_lock();
3149
3150 err = -ENODEV;
3b1e0a65 3151 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3152 if (!dev)
3153 goto done;
3154
3155 err = -EINVAL;
1162563f 3156 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3157 goto done;
3158
3159 err = -ENOBUFS;
8b3a7005 3160 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3161 if (i == NULL)
3162 goto done;
3163
3164 err = 0;
3165 for (ml = po->mclist; ml; ml = ml->next) {
3166 if (ml->ifindex == mreq->mr_ifindex &&
3167 ml->type == mreq->mr_type &&
3168 ml->alen == mreq->mr_alen &&
3169 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3170 ml->count++;
3171 /* Free the new element ... */
3172 kfree(i);
3173 goto done;
3174 }
3175 }
3176
3177 i->type = mreq->mr_type;
3178 i->ifindex = mreq->mr_ifindex;
3179 i->alen = mreq->mr_alen;
3180 memcpy(i->addr, mreq->mr_address, i->alen);
3181 i->count = 1;
3182 i->next = po->mclist;
3183 po->mclist = i;
2aeb0b88
WC
3184 err = packet_dev_mc(dev, i, 1);
3185 if (err) {
3186 po->mclist = i->next;
3187 kfree(i);
3188 }
1da177e4
LT
3189
3190done:
3191 rtnl_unlock();
3192 return err;
3193}
3194
0fb375fb 3195static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3196{
3197 struct packet_mclist *ml, **mlp;
3198
3199 rtnl_lock();
3200
3201 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3202 if (ml->ifindex == mreq->mr_ifindex &&
3203 ml->type == mreq->mr_type &&
3204 ml->alen == mreq->mr_alen &&
3205 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3206 if (--ml->count == 0) {
3207 struct net_device *dev;
3208 *mlp = ml->next;
ad959e76
ED
3209 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3210 if (dev)
1da177e4 3211 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3212 kfree(ml);
3213 }
82f17091 3214 break;
1da177e4
LT
3215 }
3216 }
3217 rtnl_unlock();
82f17091 3218 return 0;
1da177e4
LT
3219}
3220
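packet_mc_add() and packet_dev_mc() above implement the PACKET_ADD_MEMBERSHIP socket option; memberships are reference-counted and torn down by packet_flush_mclist() when the socket is closed. A sketch of the common promiscuous-mode use; the helper name is illustrative:

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}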
3221static void packet_flush_mclist(struct sock *sk)
3222{
3223 struct packet_sock *po = pkt_sk(sk);
3224 struct packet_mclist *ml;
3225
3226 if (!po->mclist)
3227 return;
3228
3229 rtnl_lock();
3230 while ((ml = po->mclist) != NULL) {
3231 struct net_device *dev;
3232
3233 po->mclist = ml->next;
ad959e76
ED
3234 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3235 if (dev != NULL)
1da177e4 3236 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3237 kfree(ml);
3238 }
3239 rtnl_unlock();
3240}
1da177e4
LT
3241
3242static int
b7058842 3243packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3244{
3245 struct sock *sk = sock->sk;
8dc41944 3246 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3247 int ret;
3248
3249 if (level != SOL_PACKET)
3250 return -ENOPROTOOPT;
3251
69e3c75f 3252 switch (optname) {
1ce4f28b 3253 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3254 case PACKET_DROP_MEMBERSHIP:
3255 {
0fb375fb
EB
3256 struct packet_mreq_max mreq;
3257 int len = optlen;
3258 memset(&mreq, 0, sizeof(mreq));
3259 if (len < sizeof(struct packet_mreq))
1da177e4 3260 return -EINVAL;
0fb375fb
EB
3261 if (len > sizeof(mreq))
3262 len = sizeof(mreq);
40d4e3df 3263 if (copy_from_user(&mreq, optval, len))
1da177e4 3264 return -EFAULT;
0fb375fb
EB
3265 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3266 return -EINVAL;
1da177e4
LT
3267 if (optname == PACKET_ADD_MEMBERSHIP)
3268 ret = packet_mc_add(sk, &mreq);
3269 else
3270 ret = packet_mc_drop(sk, &mreq);
3271 return ret;
3272 }
a2efcfa0 3273
1da177e4 3274 case PACKET_RX_RING:
69e3c75f 3275 case PACKET_TX_RING:
1da177e4 3276 {
f6fb8f10 3277 union tpacket_req_u req_u;
3278 int len;
1da177e4 3279
f6fb8f10 3280 switch (po->tp_version) {
3281 case TPACKET_V1:
3282 case TPACKET_V2:
3283 len = sizeof(req_u.req);
3284 break;
3285 case TPACKET_V3:
3286 default:
3287 len = sizeof(req_u.req3);
3288 break;
3289 }
3290 if (optlen < len)
1da177e4 3291 return -EINVAL;
bfd5f4a3
SS
3292 if (pkt_sk(sk)->has_vnet_hdr)
3293 return -EINVAL;
f6fb8f10 3294 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3295 return -EFAULT;
f6fb8f10 3296 return packet_set_ring(sk, &req_u, 0,
3297 optname == PACKET_TX_RING);
1da177e4
LT
3298 }
3299 case PACKET_COPY_THRESH:
3300 {
3301 int val;
3302
40d4e3df 3303 if (optlen != sizeof(val))
1da177e4 3304 return -EINVAL;
40d4e3df 3305 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3306 return -EFAULT;
3307
3308 pkt_sk(sk)->copy_thresh = val;
3309 return 0;
3310 }
bbd6ef87
PM
3311 case PACKET_VERSION:
3312 {
3313 int val;
3314
3315 if (optlen != sizeof(val))
3316 return -EINVAL;
69e3c75f 3317 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3318 return -EBUSY;
3319 if (copy_from_user(&val, optval, sizeof(val)))
3320 return -EFAULT;
3321 switch (val) {
3322 case TPACKET_V1:
3323 case TPACKET_V2:
f6fb8f10 3324 case TPACKET_V3:
bbd6ef87
PM
3325 po->tp_version = val;
3326 return 0;
3327 default:
3328 return -EINVAL;
3329 }
3330 }
8913336a
PM
3331 case PACKET_RESERVE:
3332 {
3333 unsigned int val;
3334
3335 if (optlen != sizeof(val))
3336 return -EINVAL;
69e3c75f 3337 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3338 return -EBUSY;
3339 if (copy_from_user(&val, optval, sizeof(val)))
3340 return -EFAULT;
3341 po->tp_reserve = val;
3342 return 0;
3343 }
69e3c75f
JB
3344 case PACKET_LOSS:
3345 {
3346 unsigned int val;
3347
3348 if (optlen != sizeof(val))
3349 return -EINVAL;
3350 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3351 return -EBUSY;
3352 if (copy_from_user(&val, optval, sizeof(val)))
3353 return -EFAULT;
3354 po->tp_loss = !!val;
3355 return 0;
3356 }
8dc41944
HX
3357 case PACKET_AUXDATA:
3358 {
3359 int val;
3360
3361 if (optlen < sizeof(val))
3362 return -EINVAL;
3363 if (copy_from_user(&val, optval, sizeof(val)))
3364 return -EFAULT;
3365
3366 po->auxdata = !!val;
3367 return 0;
3368 }
80feaacb
PWJ
3369 case PACKET_ORIGDEV:
3370 {
3371 int val;
3372
3373 if (optlen < sizeof(val))
3374 return -EINVAL;
3375 if (copy_from_user(&val, optval, sizeof(val)))
3376 return -EFAULT;
3377
3378 po->origdev = !!val;
3379 return 0;
3380 }
bfd5f4a3
SS
3381 case PACKET_VNET_HDR:
3382 {
3383 int val;
3384
3385 if (sock->type != SOCK_RAW)
3386 return -EINVAL;
3387 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3388 return -EBUSY;
3389 if (optlen < sizeof(val))
3390 return -EINVAL;
3391 if (copy_from_user(&val, optval, sizeof(val)))
3392 return -EFAULT;
3393
3394 po->has_vnet_hdr = !!val;
3395 return 0;
3396 }
614f60fa
SM
3397 case PACKET_TIMESTAMP:
3398 {
3399 int val;
3400
3401 if (optlen != sizeof(val))
3402 return -EINVAL;
3403 if (copy_from_user(&val, optval, sizeof(val)))
3404 return -EFAULT;
3405
3406 po->tp_tstamp = val;
3407 return 0;
3408 }
dc99f600
DM
3409 case PACKET_FANOUT:
3410 {
3411 int val;
3412
3413 if (optlen != sizeof(val))
3414 return -EINVAL;
3415 if (copy_from_user(&val, optval, sizeof(val)))
3416 return -EFAULT;
3417
3418 return fanout_add(sk, val & 0xffff, val >> 16);
3419 }
5920cd3a
PC
3420 case PACKET_TX_HAS_OFF:
3421 {
3422 unsigned int val;
3423
3424 if (optlen != sizeof(val))
3425 return -EINVAL;
3426 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3427 return -EBUSY;
3428 if (copy_from_user(&val, optval, sizeof(val)))
3429 return -EFAULT;
3430 po->tp_tx_has_off = !!val;
3431 return 0;
3432 }
d346a3fa
DB
3433 case PACKET_QDISC_BYPASS:
3434 {
3435 int val;
3436
3437 if (optlen != sizeof(val))
3438 return -EINVAL;
3439 if (copy_from_user(&val, optval, sizeof(val)))
3440 return -EFAULT;
3441
3442 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3443 return 0;
3444 }
1da177e4
LT
3445 default:
3446 return -ENOPROTOOPT;
3447 }
3448}
3449
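For PACKET_FANOUT the 32-bit option value packs the group id in the low 16 bits and the mode plus flags in the high 16, exactly as the fanout_add(sk, val & 0xffff, val >> 16) call above unpacks it. A sketch using hash load-balancing; the helper name is illustrative:

#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout(int fd, unsigned short id)
{
	int val = id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}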
3450static int packet_getsockopt(struct socket *sock, int level, int optname,
3451 char __user *optval, int __user *optlen)
3452{
3453 int len;
c06fff6e 3454 int val, lv = sizeof(val);
1da177e4
LT
3455 struct sock *sk = sock->sk;
3456 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3457 void *data = &val;
ee80fbf3 3458 union tpacket_stats_u st;
1da177e4
LT
3459
3460 if (level != SOL_PACKET)
3461 return -ENOPROTOOPT;
3462
8ae55f04
KK
3463 if (get_user(len, optlen))
3464 return -EFAULT;
1da177e4
LT
3465
3466 if (len < 0)
3467 return -EINVAL;
1ce4f28b 3468
69e3c75f 3469 switch (optname) {
1da177e4 3470 case PACKET_STATISTICS:
1da177e4 3471 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3472 memcpy(&st, &po->stats, sizeof(st));
3473 memset(&po->stats, 0, sizeof(po->stats));
3474 spin_unlock_bh(&sk->sk_receive_queue.lock);
3475
f6fb8f10 3476 if (po->tp_version == TPACKET_V3) {
c06fff6e 3477 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3478 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3479 data = &st.stats3;
f6fb8f10 3480 } else {
c06fff6e 3481 lv = sizeof(struct tpacket_stats);
8bcdeaff 3482 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3483 data = &st.stats1;
f6fb8f10 3484 }
ee80fbf3 3485
8dc41944
HX
3486 break;
3487 case PACKET_AUXDATA:
8dc41944 3488 val = po->auxdata;
80feaacb
PWJ
3489 break;
3490 case PACKET_ORIGDEV:
80feaacb 3491 val = po->origdev;
bfd5f4a3
SS
3492 break;
3493 case PACKET_VNET_HDR:
bfd5f4a3 3494 val = po->has_vnet_hdr;
1da177e4 3495 break;
bbd6ef87 3496 case PACKET_VERSION:
bbd6ef87 3497 val = po->tp_version;
bbd6ef87
PM
3498 break;
3499 case PACKET_HDRLEN:
3500 if (len > sizeof(int))
3501 len = sizeof(int);
3502 if (copy_from_user(&val, optval, len))
3503 return -EFAULT;
3504 switch (val) {
3505 case TPACKET_V1:
3506 val = sizeof(struct tpacket_hdr);
3507 break;
3508 case TPACKET_V2:
3509 val = sizeof(struct tpacket2_hdr);
3510 break;
f6fb8f10 3511 case TPACKET_V3:
3512 val = sizeof(struct tpacket3_hdr);
3513 break;
bbd6ef87
PM
3514 default:
3515 return -EINVAL;
3516 }
bbd6ef87 3517 break;
8913336a 3518 case PACKET_RESERVE:
8913336a 3519 val = po->tp_reserve;
8913336a 3520 break;
69e3c75f 3521 case PACKET_LOSS:
69e3c75f 3522 val = po->tp_loss;
69e3c75f 3523 break;
614f60fa 3524 case PACKET_TIMESTAMP:
614f60fa 3525 val = po->tp_tstamp;
614f60fa 3526 break;
dc99f600 3527 case PACKET_FANOUT:
dc99f600
DM
3528 val = (po->fanout ?
3529 ((u32)po->fanout->id |
77f65ebd
WB
3530 ((u32)po->fanout->type << 16) |
3531 ((u32)po->fanout->flags << 24)) :
dc99f600 3532 0);
dc99f600 3533 break;
5920cd3a
PC
3534 case PACKET_TX_HAS_OFF:
3535 val = po->tp_tx_has_off;
3536 break;
d346a3fa
DB
3537 case PACKET_QDISC_BYPASS:
3538 val = packet_use_direct_xmit(po);
3539 break;
1da177e4
LT
3540 default:
3541 return -ENOPROTOOPT;
3542 }
3543
c06fff6e
ED
3544 if (len > lv)
3545 len = lv;
8ae55f04
KK
3546 if (put_user(len, optlen))
3547 return -EFAULT;
8dc41944
HX
3548 if (copy_to_user(optval, data, len))
3549 return -EFAULT;
8ae55f04 3550 return 0;
1da177e4
LT
3551}
3552
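PACKET_STATISTICS is clear-on-read: packet_getsockopt() above copies the counters out under the receive-queue lock, zeroes them, and folds tp_drops into tp_packets before returning. A sketch for the TPACKET_V1/V2 layout (a TPACKET_V3 socket returns the larger struct tpacket_stats_v3 instead); the helper name is illustrative:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void print_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	/* reading the counters also resets them */
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets %u, drops %u\n", st.tp_packets, st.tp_drops);
}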
3553
351638e7
JP
3554static int packet_notifier(struct notifier_block *this,
3555 unsigned long msg, void *ptr)
1da177e4
LT
3556{
3557 struct sock *sk;
351638e7 3558 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3559 struct net *net = dev_net(dev);
1da177e4 3560
808f5114 3561 rcu_read_lock();
b67bfe0d 3562 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3563 struct packet_sock *po = pkt_sk(sk);
3564
3565 switch (msg) {
3566 case NETDEV_UNREGISTER:
1da177e4 3567 if (po->mclist)
82f17091 3568 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3569 /* fallthrough */
3570
1da177e4
LT
3571 case NETDEV_DOWN:
3572 if (dev->ifindex == po->ifindex) {
3573 spin_lock(&po->bind_lock);
3574 if (po->running) {
ce06b03e 3575 __unregister_prot_hook(sk, false);
1da177e4
LT
3576 sk->sk_err = ENETDOWN;
3577 if (!sock_flag(sk, SOCK_DEAD))
3578 sk->sk_error_report(sk);
3579 }
3580 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3581 packet_cached_dev_reset(po);
1da177e4 3582 po->ifindex = -1;
160ff18a
BG
3583 if (po->prot_hook.dev)
3584 dev_put(po->prot_hook.dev);
1da177e4
LT
3585 po->prot_hook.dev = NULL;
3586 }
3587 spin_unlock(&po->bind_lock);
3588 }
3589 break;
3590 case NETDEV_UP:
808f5114 3591 if (dev->ifindex == po->ifindex) {
3592 spin_lock(&po->bind_lock);
ce06b03e
DM
3593 if (po->num)
3594 register_prot_hook(sk);
808f5114 3595 spin_unlock(&po->bind_lock);
1da177e4 3596 }
1da177e4
LT
3597 break;
3598 }
3599 }
808f5114 3600 rcu_read_unlock();
1da177e4
LT
3601 return NOTIFY_DONE;
3602}
3603
3604
3605static int packet_ioctl(struct socket *sock, unsigned int cmd,
3606 unsigned long arg)
3607{
3608 struct sock *sk = sock->sk;
3609
69e3c75f 3610 switch (cmd) {
40d4e3df
ED
3611 case SIOCOUTQ:
3612 {
3613 int amount = sk_wmem_alloc_get(sk);
31e6d363 3614
40d4e3df
ED
3615 return put_user(amount, (int __user *)arg);
3616 }
3617 case SIOCINQ:
3618 {
3619 struct sk_buff *skb;
3620 int amount = 0;
3621
3622 spin_lock_bh(&sk->sk_receive_queue.lock);
3623 skb = skb_peek(&sk->sk_receive_queue);
3624 if (skb)
3625 amount = skb->len;
3626 spin_unlock_bh(&sk->sk_receive_queue.lock);
3627 return put_user(amount, (int __user *)arg);
3628 }
3629 case SIOCGSTAMP:
3630 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3631 case SIOCGSTAMPNS:
3632 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3633
1da177e4 3634#ifdef CONFIG_INET
40d4e3df
ED
3635 case SIOCADDRT:
3636 case SIOCDELRT:
3637 case SIOCDARP:
3638 case SIOCGARP:
3639 case SIOCSARP:
3640 case SIOCGIFADDR:
3641 case SIOCSIFADDR:
3642 case SIOCGIFBRDADDR:
3643 case SIOCSIFBRDADDR:
3644 case SIOCGIFNETMASK:
3645 case SIOCSIFNETMASK:
3646 case SIOCGIFDSTADDR:
3647 case SIOCSIFDSTADDR:
3648 case SIOCSIFFLAGS:
40d4e3df 3649 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3650#endif
3651
40d4e3df
ED
3652 default:
3653 return -ENOIOCTLCMD;
1da177e4
LT
3654 }
3655 return 0;
3656}
3657
40d4e3df 3658static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3659 poll_table *wait)
3660{
3661 struct sock *sk = sock->sk;
3662 struct packet_sock *po = pkt_sk(sk);
3663 unsigned int mask = datagram_poll(file, sock, wait);
3664
3665 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3666 if (po->rx_ring.pg_vec) {
f6fb8f10 3667 if (!packet_previous_rx_frame(po, &po->rx_ring,
3668 TP_STATUS_KERNEL))
1da177e4
LT
3669 mask |= POLLIN | POLLRDNORM;
3670 }
3671 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3672 spin_lock_bh(&sk->sk_write_queue.lock);
3673 if (po->tx_ring.pg_vec) {
3674 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3675 mask |= POLLOUT | POLLWRNORM;
3676 }
3677 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3678 return mask;
3679}
3680
3681
3682/* Dirty? Well, I still have not found a better way to account
3683 * for user mmaps.
3684 */
3685
3686static void packet_mm_open(struct vm_area_struct *vma)
3687{
3688 struct file *file = vma->vm_file;
40d4e3df 3689 struct socket *sock = file->private_data;
1da177e4 3690 struct sock *sk = sock->sk;
1ce4f28b 3691
1da177e4
LT
3692 if (sk)
3693 atomic_inc(&pkt_sk(sk)->mapped);
3694}
3695
3696static void packet_mm_close(struct vm_area_struct *vma)
3697{
3698 struct file *file = vma->vm_file;
40d4e3df 3699 struct socket *sock = file->private_data;
1da177e4 3700 struct sock *sk = sock->sk;
1ce4f28b 3701
1da177e4
LT
3702 if (sk)
3703 atomic_dec(&pkt_sk(sk)->mapped);
3704}
3705
f0f37e2f 3706static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3707 .open = packet_mm_open,
3708 .close = packet_mm_close,
1da177e4
LT
3709};
3710
0e3125c7
NH
3711static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3712 unsigned int len)
1da177e4
LT
3713{
3714 int i;
3715
4ebf0ae2 3716 for (i = 0; i < len; i++) {
0e3125c7 3717 if (likely(pg_vec[i].buffer)) {
c56b4d90 3718 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3719 vfree(pg_vec[i].buffer);
3720 else
3721 free_pages((unsigned long)pg_vec[i].buffer,
3722 order);
3723 pg_vec[i].buffer = NULL;
3724 }
1da177e4
LT
3725 }
3726 kfree(pg_vec);
3727}
3728
eea49cc9 3729static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3730{
f0d4eb29 3731 char *buffer;
0e3125c7
NH
3732 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3733 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3734
3735 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3736 if (buffer)
3737 return buffer;
3738
f0d4eb29 3739 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3740 buffer = vzalloc((1 << order) * PAGE_SIZE);
3741 if (buffer)
3742 return buffer;
3743
f0d4eb29 3744 /* vmalloc failed, let's dig into swap here */
0e3125c7 3745 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3746 buffer = (char *) __get_free_pages(gfp_flags, order);
3747 if (buffer)
3748 return buffer;
3749
f0d4eb29 3750 /* complete and utter failure */
0e3125c7 3751 return NULL;
3752}
3753
0e3125c7 3754static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
3755{
3756 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3757 struct pgv *pg_vec;
3758 int i;
3759
0e3125c7 3760 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
3761 if (unlikely(!pg_vec))
3762 goto out;
3763
3764 for (i = 0; i < block_nr; i++) {
c56b4d90 3765 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3766 if (unlikely(!pg_vec[i].buffer))
3767 goto out_free_pgvec;
3768 }
3769
3770out:
3771 return pg_vec;
3772
3773out_free_pgvec:
3774 free_pg_vec(pg_vec, order, block_nr);
3775 pg_vec = NULL;
3776 goto out;
3777}
1da177e4 3778
f6fb8f10 3779static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3780 int closing, int tx_ring)
1da177e4 3781{
0e3125c7 3782 struct pgv *pg_vec = NULL;
1da177e4 3783 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3784 int was_running, order = 0;
3785 struct packet_ring_buffer *rb;
3786 struct sk_buff_head *rb_queue;
0e11c91e 3787 __be16 num;
f6fb8f10 3788 int err = -EINVAL;
 3790 /* Added to keep code churn minimal */
3790 struct tpacket_req *req = &req_u->req;
3791
3792 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3793 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3794 WARN(1, "Tx-ring is not supported.\n");
3795 goto out;
3796 }
1ce4f28b 3797
3798 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3799 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3800
3801 err = -EBUSY;
3802 if (!closing) {
3803 if (atomic_read(&po->mapped))
3804 goto out;
b0138408 3805 if (packet_read_pending(rb))
3806 goto out;
3807 }
1da177e4 3808
3809 if (req->tp_block_nr) {
3810 /* Sanity tests and some calculations */
3811 err = -EBUSY;
3812 if (unlikely(rb->pg_vec))
3813 goto out;
1da177e4 3814
3815 switch (po->tp_version) {
3816 case TPACKET_V1:
3817 po->tp_hdrlen = TPACKET_HDRLEN;
3818 break;
3819 case TPACKET_V2:
3820 po->tp_hdrlen = TPACKET2_HDRLEN;
3821 break;
f6fb8f10 3822 case TPACKET_V3:
3823 po->tp_hdrlen = TPACKET3_HDRLEN;
3824 break;
3825 }
3826
69e3c75f 3827 err = -EINVAL;
4ebf0ae2 3828 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3829 goto out;
4ebf0ae2 3830 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3831 goto out;
3832 if (po->tp_version >= TPACKET_V3 &&
3833 (int)(req->tp_block_size -
3834 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3835 goto out;
8913336a 3836 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
3837 po->tp_reserve))
3838 goto out;
4ebf0ae2 3839 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3840 goto out;
1da177e4 3841
3842 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3843 if (unlikely(rb->frames_per_block <= 0))
3844 goto out;
3845 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3846 req->tp_frame_nr))
3847 goto out;
3848
3849 err = -ENOMEM;
3850 order = get_order(req->tp_block_size);
3851 pg_vec = alloc_pg_vec(req, order);
3852 if (unlikely(!pg_vec))
1da177e4 3853 goto out;
f6fb8f10 3854 switch (po->tp_version) {
3855 case TPACKET_V3:
 3856 /* Transmit path is not supported. We checked
 3857 * it above, but just being paranoid.
 3858 */
3859 if (!tx_ring)
3860 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
d7cf0c34 3861 break;
f6fb8f10 3862 default:
3863 break;
3864 }
3865 }
3866 /* Done */
3867 else {
3868 err = -EINVAL;
4ebf0ae2 3869 if (unlikely(req->tp_frame_nr))
69e3c75f 3870 goto out;
3871 }
3872
3873 lock_sock(sk);
3874
3875 /* Detach socket from network */
3876 spin_lock(&po->bind_lock);
3877 was_running = po->running;
3878 num = po->num;
3879 if (was_running) {
1da177e4 3880 po->num = 0;
ce06b03e 3881 __unregister_prot_hook(sk, false);
3882 }
3883 spin_unlock(&po->bind_lock);
1ce4f28b 3884
3885 synchronize_net();
3886
3887 err = -EBUSY;
905db440 3888 mutex_lock(&po->pg_vec_lock);
3889 if (closing || atomic_read(&po->mapped) == 0) {
3890 err = 0;
69e3c75f 3891 spin_lock_bh(&rb_queue->lock);
c053fd96 3892 swap(rb->pg_vec, pg_vec);
3893 rb->frame_max = (req->tp_frame_nr - 1);
3894 rb->head = 0;
3895 rb->frame_size = req->tp_frame_size;
3896 spin_unlock_bh(&rb_queue->lock);
3897
3898 swap(rb->pg_vec_order, order);
3899 swap(rb->pg_vec_len, req->tp_block_nr);
3900
3901 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3902 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3903 tpacket_rcv : packet_rcv;
3904 skb_queue_purge(rb_queue);
1da177e4 3905 if (atomic_read(&po->mapped))
3906 pr_err("packet_mmap: vma is busy: %d\n",
3907 atomic_read(&po->mapped));
1da177e4 3908 }
905db440 3909 mutex_unlock(&po->pg_vec_lock);
3910
3911 spin_lock(&po->bind_lock);
ce06b03e 3912 if (was_running) {
1da177e4 3913 po->num = num;
ce06b03e 3914 register_prot_hook(sk);
3915 }
3916 spin_unlock(&po->bind_lock);
f6fb8f10 3917 if (closing && (po->tp_version > TPACKET_V2)) {
3918 /* Because we don't support block-based V3 on tx-ring */
3919 if (!tx_ring)
3920 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3921 }
3922 release_sock(sk);
3923
3924 if (pg_vec)
3925 free_pg_vec(pg_vec, order, req->tp_block_nr);
3926out:
3927 return err;
3928}
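
A hedged user-space sketch of the setsockopt() call that lands in packet_set_ring(). The sizes are only an example, chosen to satisfy the sanity checks above: the block size is page-aligned, the frame size is a TPACKET_ALIGNMENT multiple at least as large as hdrlen plus reserve, and tp_frame_nr equals frames-per-block times tp_block_nr.

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd)
{
	struct tpacket_req req;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* multiple of PAGE_SIZE (assumes 4 KiB pages) */
	req.tp_frame_size = 2048;	/* multiple of TPACKET_ALIGNMENT, >= hdrlen + reserve */
	req.tp_block_nr   = 64;
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}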
3929
3930static int packet_mmap(struct file *file, struct socket *sock,
3931 struct vm_area_struct *vma)
3932{
3933 struct sock *sk = sock->sk;
3934 struct packet_sock *po = pkt_sk(sk);
3935 unsigned long size, expected_size;
3936 struct packet_ring_buffer *rb;
3937 unsigned long start;
3938 int err = -EINVAL;
3939 int i;
3940
3941 if (vma->vm_pgoff)
3942 return -EINVAL;
3943
905db440 3944 mutex_lock(&po->pg_vec_lock);
3945
3946 expected_size = 0;
3947 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3948 if (rb->pg_vec) {
3949 expected_size += rb->pg_vec_len
3950 * rb->pg_vec_pages
3951 * PAGE_SIZE;
3952 }
3953 }
3954
3955 if (expected_size == 0)
1da177e4 3956 goto out;
3957
3958 size = vma->vm_end - vma->vm_start;
3959 if (size != expected_size)
3960 goto out;
3961
1da177e4 3962 start = vma->vm_start;
3963 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3964 if (rb->pg_vec == NULL)
3965 continue;
3966
3967 for (i = 0; i < rb->pg_vec_len; i++) {
3968 struct page *page;
3969 void *kaddr = rb->pg_vec[i].buffer;
3970 int pg_num;
3971
3972 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3973 page = pgv_to_page(kaddr);
3974 err = vm_insert_page(vma, start, page);
3975 if (unlikely(err))
3976 goto out;
3977 start += PAGE_SIZE;
0e3125c7 3978 kaddr += PAGE_SIZE;
69e3c75f 3979 }
4ebf0ae2 3980 }
1da177e4 3981 }
69e3c75f 3982
4ebf0ae2 3983 atomic_inc(&po->mapped);
3984 vma->vm_ops = &packet_mmap_ops;
3985 err = 0;
3986
3987out:
905db440 3988 mutex_unlock(&po->pg_vec_lock);
3989 return err;
3990}
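
Illustrative counterpart in user space: packet_mmap() rejects any non-zero offset and any length that differs from the combined size of the RX and TX rings, so both rings are mapped with a single mmap() call.

#include <stddef.h>
#include <sys/mman.h>

static void *map_rings(int fd, size_t rx_bytes, size_t tx_bytes)
{
	void *ring = mmap(NULL, rx_bytes + tx_bytes, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	return ring == MAP_FAILED ? NULL : ring;
}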
1da177e4 3991
90ddc4f0 3992static const struct proto_ops packet_ops_spkt = {
3993 .family = PF_PACKET,
3994 .owner = THIS_MODULE,
3995 .release = packet_release,
3996 .bind = packet_bind_spkt,
3997 .connect = sock_no_connect,
3998 .socketpair = sock_no_socketpair,
3999 .accept = sock_no_accept,
4000 .getname = packet_getname_spkt,
4001 .poll = datagram_poll,
4002 .ioctl = packet_ioctl,
4003 .listen = sock_no_listen,
4004 .shutdown = sock_no_shutdown,
4005 .setsockopt = sock_no_setsockopt,
4006 .getsockopt = sock_no_getsockopt,
4007 .sendmsg = packet_sendmsg_spkt,
4008 .recvmsg = packet_recvmsg,
4009 .mmap = sock_no_mmap,
4010 .sendpage = sock_no_sendpage,
4011};
1da177e4 4012
90ddc4f0 4013static const struct proto_ops packet_ops = {
4014 .family = PF_PACKET,
4015 .owner = THIS_MODULE,
4016 .release = packet_release,
4017 .bind = packet_bind,
4018 .connect = sock_no_connect,
4019 .socketpair = sock_no_socketpair,
4020 .accept = sock_no_accept,
1ce4f28b 4021 .getname = packet_getname,
4022 .poll = packet_poll,
4023 .ioctl = packet_ioctl,
4024 .listen = sock_no_listen,
4025 .shutdown = sock_no_shutdown,
4026 .setsockopt = packet_setsockopt,
4027 .getsockopt = packet_getsockopt,
4028 .sendmsg = packet_sendmsg,
4029 .recvmsg = packet_recvmsg,
4030 .mmap = packet_mmap,
4031 .sendpage = sock_no_sendpage,
4032};
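
For completeness, a sketch of the user-space side that these proto_ops serve: creating a SOCK_RAW packet socket and binding it to one interface. Illustrative only; the ifindex would normally come from if_nametoindex().

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static int open_packet_socket(int ifindex)
{
	struct sockaddr_ll sll;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = ifindex;

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}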
4033
ec1b4cf7 4034static const struct net_proto_family packet_family_ops = {
4035 .family = PF_PACKET,
4036 .create = packet_create,
4037 .owner = THIS_MODULE,
4038};
4039
4040static struct notifier_block packet_netdev_notifier = {
40d4e3df 4041 .notifier_call = packet_notifier,
4042};
4043
4044#ifdef CONFIG_PROC_FS
4045
4046static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4047 __acquires(RCU)
1da177e4 4048{
e372c414 4049 struct net *net = seq_file_net(seq);
808f5114 4050
4051 rcu_read_lock();
4052 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4053}
4054
4055static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4056{
1bf40954 4057 struct net *net = seq_file_net(seq);
808f5114 4058 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4059}
4060
4061static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4062 __releases(RCU)
1da177e4 4063{
808f5114 4064 rcu_read_unlock();
4065}
4066
1ce4f28b 4067static int packet_seq_show(struct seq_file *seq, void *v)
4068{
4069 if (v == SEQ_START_TOKEN)
4070 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4071 else {
b7ceabd9 4072 struct sock *s = sk_entry(v);
4073 const struct packet_sock *po = pkt_sk(s);
4074
4075 seq_printf(seq,
71338aa7 4076 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4077 s,
4078 atomic_read(&s->sk_refcnt),
4079 s->sk_type,
4080 ntohs(po->num),
4081 po->ifindex,
4082 po->running,
4083 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4084 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4085 sock_i_ino(s));
4086 }
4087
4088 return 0;
4089}
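
The seq_file handler above backs /proc/net/packet. A trivial reader, shown only as an illustration, copies the header line and the per-socket lines out verbatim.

#include <stdio.h>

static void dump_packet_sockets(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* header line, then one line per socket */
	fclose(f);
}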
4090
56b3d975 4091static const struct seq_operations packet_seq_ops = {
4092 .start = packet_seq_start,
4093 .next = packet_seq_next,
4094 .stop = packet_seq_stop,
4095 .show = packet_seq_show,
4096};
4097
4098static int packet_seq_open(struct inode *inode, struct file *file)
4099{
4100 return seq_open_net(inode, file, &packet_seq_ops,
4101 sizeof(struct seq_net_private));
4102}
4103
da7071d7 4104static const struct file_operations packet_seq_fops = {
4105 .owner = THIS_MODULE,
4106 .open = packet_seq_open,
4107 .read = seq_read,
4108 .llseek = seq_lseek,
e372c414 4109 .release = seq_release_net,
4110};
4111
4112#endif
4113
2c8c1e72 4114static int __net_init packet_net_init(struct net *net)
d12d01d6 4115{
0fa7fa98 4116 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4117 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4118
d4beaa66 4119 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4120 return -ENOMEM;
4121
4122 return 0;
4123}
4124
2c8c1e72 4125static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4126{
ece31ffd 4127 remove_proc_entry("packet", net->proc_net);
4128}
4129
4130static struct pernet_operations packet_net_ops = {
4131 .init = packet_net_init,
4132 .exit = packet_net_exit,
4133};
4134
4135
4136static void __exit packet_exit(void)
4137{
1da177e4 4138 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4139 unregister_pernet_subsys(&packet_net_ops);
4140 sock_unregister(PF_PACKET);
4141 proto_unregister(&packet_proto);
4142}
4143
4144static int __init packet_init(void)
4145{
4146 int rc = proto_register(&packet_proto, 0);
4147
4148 if (rc != 0)
4149 goto out;
4150
4151 sock_register(&packet_family_ops);
d12d01d6 4152 register_pernet_subsys(&packet_net_ops);
1da177e4 4153 register_netdevice_notifier(&packet_netdev_notifier);
4154out:
4155 return rc;
4156}
4157
4158module_init(packet_init);
4159module_exit(packet_exit);
4160MODULE_LICENSE("GPL");
4161MODULE_ALIAS_NETPROTO(PF_PACKET);