packet: rollover prepare: move code out of callsites
net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov	:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski	:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

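/* Transmit an skb directly on its mapped tx queue, bypassing the qdisc
 * layer and taking only the driver tx lock. Used when the socket has
 * opted into qdisc bypass (the PACKET_QDISC_BYPASS socket option); on
 * failure the skb is freed and the drop is accounted to the device.
 */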
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

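/* po->cached_dev caches the bound net_device so the transmit path can
 * take a reference without an ifindex lookup; it is assigned when the
 * socket binds to a device and reset when the binding goes away.
 */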
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

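/* Frame status handling for the mmap()ed V1/V2 rings: a frame is handed
 * to user space by setting TP_STATUS_USER in its status word and is
 * returned to the kernel once user space writes TP_STATUS_KERNEL back.
 * The explicit dcache flushes keep the status word visible on
 * architectures without coherent data caches.
 */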
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

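/* Pick the timestamp to report to user space: prefer the raw hardware
 * timestamp when SOF_TIMESTAMPING_RAW_HARDWARE was requested and one is
 * present, otherwise fall back to the software timestamp. The return
 * value is the matching TP_STATUS_TS_* flag, or 0 if no timestamp is
 * available.
 */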
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
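
/* For example, with tp_block_size = 1 MiB on a 1 Gbit/s link:
 * mbits = 8, div = 1, msec = 1, so the computed retire timeout is
 * 8 + 1 = 9 ms -- roughly the time needed to fill one block, plus a
 * little slack. Slower or unknown links use DEFAULT_PRB_RETIRE_TOV.
 */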

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *	it will close block-7, loop around and try to fill block '0'.
 *	call-flow:
 *	__packet_lookup_frame_in_block
 *		prb_retire_current_block()
 *		prb_dispatch_next_block()
 *		  |->(BLOCK_STATUS == USER) evaluates to true
 *	5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *	6.1) Link goes idle right after the queue is frozen.
 *	     But remember, the last open_block() refreshed the timer.
 *	     When this timer expires, it will refresh itself so that we can
 *	     re-open block-0 in the near future.
 *	6.2) Link is busy and keeps on receiving packets. This is a simple
 *	     case and __packet_lookup_frame_in_block will check if block-0
 *	     is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			   struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
		      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

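/* The tx_ring keeps a per-cpu count of frames handed to the driver but
 * not yet completed; packet_read_pending() sums the counters so the
 * send path can wait for all outstanding frames before tearing down
 * the ring. The rx_ring never uses it, hence the NULL check.
 */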
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

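/* Can this socket accept another packet right now? For plain sockets
 * this is a receive-buffer check; for mmap()ed rings it asks whether
 * the current frame (V1/V2) or the active block (V3) is still owned by
 * the kernel. The fanout rollover mode uses this to find a member
 * socket with spare room.
 */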
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

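/* Rollover: when try_self is set, deliver to the originally chosen
 * socket if it still has room; otherwise scan the other sockets in the
 * fanout group and pick the first one that can take the packet,
 * remembering it in f->next[] so the next rollover starts there.
 */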
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	unsigned int i, j;

	if (try_self && packet_rcv_has_room(pkt_sk(f->arr[idx]), skb))
		return idx;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

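/* Join (or create) the fanout group identified by 'id' in this netns.
 * All members must use the same mode and flags and be bound to the same
 * protocol and device; the group's prot_hook replaces the socket's own
 * hook, and packet_rcv_fanout() then demuxes incoming packets to the
 * members.
 */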
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

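/* Run the socket's attached BPF filter over the skb. The filter's
 * return value caps how many bytes of the packet are kept (the
 * "snaplen"); 0 means drop. With no filter attached, the requested
 * length is kept unchanged.
 */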
eea49cc9 1740static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1741 const struct sock *sk,
dbcb5855 1742 unsigned int res)
1da177e4
LT
1743{
1744 struct sk_filter *filter;
fda9ef5d 1745
80f8f102
ED
1746 rcu_read_lock();
1747 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1748 if (filter != NULL)
0a14842f 1749 res = SK_RUN_FILTER(filter, skb);
80f8f102 1750 rcu_read_unlock();
1da177e4 1751
dbcb5855 1752 return res;
1da177e4
LT
1753}
1754
1755/*
62ab0812
ED
1756 * This function makes lazy skb cloning in hope that most of packets
1757 * are discarded by BPF.
1758 *
1759 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1760 * and skb->cb are mangled. It works because (and until) packets
1761 * falling here are owned by current CPU. Output packets are cloned
1762 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1763 * sequencially, so that if we return skb to original state on exit,
1764 * we will not harm anyone.
1da177e4
LT
1765 */
1766
40d4e3df
ED
1767static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1768 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1769{
1770 struct sock *sk;
1771 struct sockaddr_ll *sll;
1772 struct packet_sock *po;
40d4e3df 1773 u8 *skb_head = skb->data;
1da177e4 1774 int skb_len = skb->len;
dbcb5855 1775 unsigned int snaplen, res;
1da177e4
LT
1776
1777 if (skb->pkt_type == PACKET_LOOPBACK)
1778 goto drop;
1779
1780 sk = pt->af_packet_priv;
1781 po = pkt_sk(sk);
1782
09ad9bc7 1783 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1784 goto drop;
1785
1da177e4
LT
1786 skb->dev = dev;
1787
3b04ddde 1788 if (dev->header_ops) {
1da177e4 1789 /* The device has an explicit notion of ll header,
62ab0812
ED
1790 * exported to higher levels.
1791 *
1792 * Otherwise, the device hides details of its frame
1793 * structure, so that corresponding packet head is
1794 * never delivered to user.
1da177e4
LT
1795 */
1796 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1797 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1798 else if (skb->pkt_type == PACKET_OUTGOING) {
1799 /* Special case: outgoing packets have ll header at head */
bbe735e4 1800 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1801 }
1802 }
1803
1804 snaplen = skb->len;
1805
dbcb5855
DM
1806 res = run_filter(skb, sk, snaplen);
1807 if (!res)
fda9ef5d 1808 goto drop_n_restore;
dbcb5855
DM
1809 if (snaplen > res)
1810 snaplen = res;
1da177e4 1811
0fd7bac6 1812 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1813 goto drop_n_acct;
1814
1815 if (skb_shared(skb)) {
1816 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1817 if (nskb == NULL)
1818 goto drop_n_acct;
1819
1820 if (skb_head != skb->data) {
1821 skb->data = skb_head;
1822 skb->len = skb_len;
1823 }
abc4e4fa 1824 consume_skb(skb);
1da177e4
LT
1825 skb = nskb;
1826 }
1827
b4772ef8 1828 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
1829
1830 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 1831 sll->sll_hatype = dev->type;
1da177e4 1832 sll->sll_pkttype = skb->pkt_type;
8032b464 1833 if (unlikely(po->origdev))
80feaacb
PWJ
1834 sll->sll_ifindex = orig_dev->ifindex;
1835 else
1836 sll->sll_ifindex = dev->ifindex;
1da177e4 1837
b95cce35 1838 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1839
2472d761
EB
1840 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
1841 * Use their space for storing the original skb length.
1842 */
1843 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 1844
1da177e4
LT
1845 if (pskb_trim(skb, snaplen))
1846 goto drop_n_acct;
1847
1848 skb_set_owner_r(skb, sk);
1849 skb->dev = NULL;
adf30907 1850 skb_dst_drop(skb);
1da177e4 1851
84531c24
PO
1852 /* drop conntrack reference */
1853 nf_reset(skb);
1854
1da177e4 1855 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1856 po->stats.stats1.tp_packets++;
3bc3b96f 1857 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
1858 __skb_queue_tail(&sk->sk_receive_queue, skb);
1859 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 1860 sk->sk_data_ready(sk);
1da177e4
LT
1861 return 0;
1862
1863drop_n_acct:
7091fbd8 1864 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1865 po->stats.stats1.tp_drops++;
7091fbd8
WB
1866 atomic_inc(&sk->sk_drops);
1867 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1868
1869drop_n_restore:
1870 if (skb_head != skb->data && skb_shared(skb)) {
1871 skb->data = skb_head;
1872 skb->len = skb_len;
1873 }
1874drop:
ead2ceb0 1875 consume_skb(skb);
1da177e4
LT
1876 return 0;
1877}
1878
40d4e3df
ED
1879static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1880 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1881{
1882 struct sock *sk;
1883 struct packet_sock *po;
1884 struct sockaddr_ll *sll;
184f489e 1885 union tpacket_uhdr h;
40d4e3df 1886 u8 *skb_head = skb->data;
1da177e4 1887 int skb_len = skb->len;
dbcb5855 1888 unsigned int snaplen, res;
f6fb8f10 1889 unsigned long status = TP_STATUS_USER;
bbd6ef87 1890 unsigned short macoff, netoff, hdrlen;
1da177e4 1891 struct sk_buff *copy_skb = NULL;
bbd6ef87 1892 struct timespec ts;
b9c32fb2 1893 __u32 ts_status;
1da177e4 1894
51846355
AW
1895 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1896 * We may add members to them up to the current aligned size without forcing
1897 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1898 */
1899 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1900 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1901
1da177e4
LT
1902 if (skb->pkt_type == PACKET_LOOPBACK)
1903 goto drop;
1904
1905 sk = pt->af_packet_priv;
1906 po = pkt_sk(sk);
1907
09ad9bc7 1908 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1909 goto drop;
1910
3b04ddde 1911 if (dev->header_ops) {
1da177e4 1912 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1913 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1914 else if (skb->pkt_type == PACKET_OUTGOING) {
1915 /* Special case: outgoing packets have ll header at head */
bbe735e4 1916 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1917 }
1918 }
1919
1920 snaplen = skb->len;
1921
dbcb5855
DM
1922 res = run_filter(skb, sk, snaplen);
1923 if (!res)
fda9ef5d 1924 goto drop_n_restore;
68c2e5de
AD
1925
1926 if (skb->ip_summed == CHECKSUM_PARTIAL)
1927 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
1928 else if (skb->pkt_type != PACKET_OUTGOING &&
1929 (skb->ip_summed == CHECKSUM_COMPLETE ||
1930 skb_csum_unnecessary(skb)))
1931 status |= TP_STATUS_CSUM_VALID;
68c2e5de 1932
dbcb5855
DM
1933 if (snaplen > res)
1934 snaplen = res;
1da177e4
LT
1935
1936 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1937 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1938 po->tp_reserve;
1da177e4 1939 } else {
95c96174 1940 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1941 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1942 (maclen < 16 ? 16 : maclen)) +
1943 po->tp_reserve;
1da177e4
LT
1944 macoff = netoff - maclen;
1945 }
f6fb8f10 1946 if (po->tp_version <= TPACKET_V2) {
1947 if (macoff + snaplen > po->rx_ring.frame_size) {
1948 if (po->copy_thresh &&
0fd7bac6 1949 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1950 if (skb_shared(skb)) {
1951 copy_skb = skb_clone(skb, GFP_ATOMIC);
1952 } else {
1953 copy_skb = skb_get(skb);
1954 skb_head = skb->data;
1955 }
1956 if (copy_skb)
1957 skb_set_owner_r(copy_skb, sk);
1da177e4 1958 }
f6fb8f10 1959 snaplen = po->rx_ring.frame_size - macoff;
1960 if ((int)snaplen < 0)
1961 snaplen = 0;
1da177e4 1962 }
dc808110
ED
1963 } else if (unlikely(macoff + snaplen >
1964 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
1965 u32 nval;
1966
1967 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
1968 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
1969 snaplen, nval, macoff);
1970 snaplen = nval;
1971 if (unlikely((int)snaplen < 0)) {
1972 snaplen = 0;
1973 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
1974 }
1da177e4 1975 }
1da177e4 1976 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1977 h.raw = packet_current_rx_frame(po, skb,
1978 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1979 if (!h.raw)
1da177e4 1980 goto ring_is_full;
f6fb8f10 1981 if (po->tp_version <= TPACKET_V2) {
1982 packet_increment_rx_head(po, &po->rx_ring);
1983 /*
1984 * LOSING will be reported until you read the stats,
1985 * because it's COR - Clear On Read.
1986 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
1987 * at the packet level.
1988 */
ee80fbf3 1989 if (po->stats.stats1.tp_drops)
f6fb8f10 1990 status |= TP_STATUS_LOSING;
1991 }
ee80fbf3 1992 po->stats.stats1.tp_packets++;
1da177e4
LT
1993 if (copy_skb) {
1994 status |= TP_STATUS_COPY;
1995 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1996 }
1da177e4
LT
1997 spin_unlock(&sk->sk_receive_queue.lock);
1998
bbd6ef87 1999 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2000
2001 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2002 getnstimeofday(&ts);
1da177e4 2003
b9c32fb2
DB
2004 status |= ts_status;
2005
bbd6ef87
PM
2006 switch (po->tp_version) {
2007 case TPACKET_V1:
2008 h.h1->tp_len = skb->len;
2009 h.h1->tp_snaplen = snaplen;
2010 h.h1->tp_mac = macoff;
2011 h.h1->tp_net = netoff;
4b457bdf
DB
2012 h.h1->tp_sec = ts.tv_sec;
2013 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2014 hdrlen = sizeof(*h.h1);
2015 break;
2016 case TPACKET_V2:
2017 h.h2->tp_len = skb->len;
2018 h.h2->tp_snaplen = snaplen;
2019 h.h2->tp_mac = macoff;
2020 h.h2->tp_net = netoff;
bbd6ef87
PM
2021 h.h2->tp_sec = ts.tv_sec;
2022 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2023 if (skb_vlan_tag_present(skb)) {
2024 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2025 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2026 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2027 } else {
2028 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2029 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2030 }
e4d26f4b 2031 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2032 hdrlen = sizeof(*h.h2);
2033 break;
f6fb8f10 2034 case TPACKET_V3:
2035 /* tp_nxt_offset and vlan are already populated above,
2036 * so DON'T clear those fields here.
2037 */
2038 h.h3->tp_status |= status;
2039 h.h3->tp_len = skb->len;
2040 h.h3->tp_snaplen = snaplen;
2041 h.h3->tp_mac = macoff;
2042 h.h3->tp_net = netoff;
f6fb8f10 2043 h.h3->tp_sec = ts.tv_sec;
2044 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2045 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2046 hdrlen = sizeof(*h.h3);
2047 break;
bbd6ef87
PM
2048 default:
2049 BUG();
2050 }
1da177e4 2051
bbd6ef87 2052 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2053 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2054 sll->sll_family = AF_PACKET;
2055 sll->sll_hatype = dev->type;
2056 sll->sll_protocol = skb->protocol;
2057 sll->sll_pkttype = skb->pkt_type;
8032b464 2058 if (unlikely(po->origdev))
80feaacb
PWJ
2059 sll->sll_ifindex = orig_dev->ifindex;
2060 else
2061 sll->sll_ifindex = dev->ifindex;
1da177e4 2062
e16aa207 2063 smp_mb();
f0d4eb29 2064
f6dafa95 2065#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2066 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2067 u8 *start, *end;
2068
f0d4eb29
DB
2069 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2070 macoff + snaplen);
2071
2072 for (start = h.raw; start < end; start += PAGE_SIZE)
2073 flush_dcache_page(pgv_to_page(start));
1da177e4 2074 }
f0d4eb29 2075 smp_wmb();
f6dafa95 2076#endif
f0d4eb29 2077
da413eec 2078 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2079 __packet_set_status(po, h.raw, status);
da413eec
DC
2080 sk->sk_data_ready(sk);
2081 } else {
f6fb8f10 2082 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2083 }
1da177e4
LT
2084
2085drop_n_restore:
2086 if (skb_head != skb->data && skb_shared(skb)) {
2087 skb->data = skb_head;
2088 skb->len = skb_len;
2089 }
2090drop:
1ce4f28b 2091 kfree_skb(skb);
1da177e4
LT
2092 return 0;
2093
2094ring_is_full:
ee80fbf3 2095 po->stats.stats1.tp_drops++;
1da177e4
LT
2096 spin_unlock(&sk->sk_receive_queue.lock);
2097
676d2369 2098 sk->sk_data_ready(sk);
acb5d75b 2099 kfree_skb(copy_skb);
1da177e4
LT
2100 goto drop_n_restore;
2101}
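
A minimal userspace sketch (not part of af_packet.c) of the TPACKET_V2 receive ring that tpacket_rcv() fills: set the version, request the ring, mmap() it, and walk frames whose tp_status has TP_STATUS_USER set. The helper name rx_ring_loop, the ring geometry, and the assumption that fd is a bound AF_PACKET socket are all illustrative.

#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>

static int rx_ring_loop(int fd)
{
	struct tpacket_req req = {
		.tp_block_size = 4096,
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,
		.tp_frame_nr   = 64 * 4096 / 2048,
	};
	int ver = TPACKET_V2;
	unsigned int i = 0;
	char *ring;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)))
		return -1;

	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return -1;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)i * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* wait for tpacket_rcv() to fill a slot */
			continue;
		}
		/* hdr->tp_snaplen bytes of frame data live at (char *)hdr + hdr->tp_mac */
		hdr->tp_status = TP_STATUS_KERNEL;	/* give the slot back to the kernel */
		i = (i + 1) % req.tp_frame_nr;
	}
}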
2102
69e3c75f
JB
2103static void tpacket_destruct_skb(struct sk_buff *skb)
2104{
2105 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2106
69e3c75f 2107 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2108 void *ph;
b9c32fb2
DB
2109 __u32 ts;
2110
69e3c75f 2111 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2112 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2113
2114 ts = __packet_set_timestamp(po, ph, skb);
2115 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2116 }
2117
2118 sock_wfree(skb);
2119}
2120
9c707762
WB
2121static bool ll_header_truncated(const struct net_device *dev, int len)
2122{
2123 /* net device doesn't like empty head */
2124 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2125 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2126 current->comm, len, dev->hard_header_len);
2127 return true;
2128 }
2129
2130 return false;
2131}
2132
40d4e3df
ED
2133static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2134 void *frame, struct net_device *dev, int size_max,
ae641949 2135 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2136{
184f489e 2137 union tpacket_uhdr ph;
09effa67 2138 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2139 struct socket *sock = po->sk.sk_socket;
2140 struct page *page;
2141 void *data;
2142 int err;
2143
2144 ph.raw = frame;
2145
2146 skb->protocol = proto;
2147 skb->dev = dev;
2148 skb->priority = po->sk.sk_priority;
2d37a186 2149 skb->mark = po->sk.sk_mark;
2e31396f 2150 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2151 skb_shinfo(skb)->destructor_arg = ph.raw;
2152
2153 switch (po->tp_version) {
2154 case TPACKET_V2:
2155 tp_len = ph.h2->tp_len;
2156 break;
2157 default:
2158 tp_len = ph.h1->tp_len;
2159 break;
2160 }
09effa67
DM
2161 if (unlikely(tp_len > size_max)) {
2162 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2163 return -EMSGSIZE;
2164 }
69e3c75f 2165
ae641949 2166 skb_reserve(skb, hlen);
69e3c75f 2167 skb_reset_network_header(skb);
c1aad275 2168
d346a3fa
DB
2169 if (!packet_use_direct_xmit(po))
2170 skb_probe_transport_header(skb, 0);
2171 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2172 int off_min, off_max, off;
2173 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2174 off_max = po->tx_ring.frame_size - tp_len;
2175 if (sock->type == SOCK_DGRAM) {
2176 switch (po->tp_version) {
2177 case TPACKET_V2:
2178 off = ph.h2->tp_net;
2179 break;
2180 default:
2181 off = ph.h1->tp_net;
2182 break;
2183 }
2184 } else {
2185 switch (po->tp_version) {
2186 case TPACKET_V2:
2187 off = ph.h2->tp_mac;
2188 break;
2189 default:
2190 off = ph.h1->tp_mac;
2191 break;
2192 }
2193 }
2194 if (unlikely((off < off_min) || (off_max < off)))
2195 return -EINVAL;
2196 data = ph.raw + off;
2197 } else {
2198 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2199 }
69e3c75f
JB
2200 to_write = tp_len;
2201
2202 if (sock->type == SOCK_DGRAM) {
2203 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2204 NULL, tp_len);
2205 if (unlikely(err < 0))
2206 return -EINVAL;
40d4e3df 2207 } else if (dev->hard_header_len) {
9c707762 2208 if (ll_header_truncated(dev, tp_len))
69e3c75f 2209 return -EINVAL;
69e3c75f
JB
2210
2211 skb_push(skb, dev->hard_header_len);
2212 err = skb_store_bits(skb, 0, data,
2213 dev->hard_header_len);
2214 if (unlikely(err))
2215 return err;
2216
2217 data += dev->hard_header_len;
2218 to_write -= dev->hard_header_len;
2219 }
2220
69e3c75f
JB
2221 offset = offset_in_page(data);
2222 len_max = PAGE_SIZE - offset;
2223 len = ((to_write > len_max) ? len_max : to_write);
2224
2225 skb->data_len = to_write;
2226 skb->len += to_write;
2227 skb->truesize += to_write;
2228 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2229
2230 while (likely(to_write)) {
2231 nr_frags = skb_shinfo(skb)->nr_frags;
2232
2233 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2234 pr_err("Packet exceed the number of skb frags(%lu)\n",
2235 MAX_SKB_FRAGS);
69e3c75f
JB
2236 return -EFAULT;
2237 }
2238
0af55bb5
CG
2239 page = pgv_to_page(data);
2240 data += len;
69e3c75f
JB
2241 flush_dcache_page(page);
2242 get_page(page);
0af55bb5 2243 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2244 to_write -= len;
2245 offset = 0;
2246 len_max = PAGE_SIZE;
2247 len = ((to_write > len_max) ? len_max : to_write);
2248 }
2249
2250 return tp_len;
2251}
2252
2253static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2254{
69e3c75f
JB
2255 struct sk_buff *skb;
2256 struct net_device *dev;
2257 __be16 proto;
09effa67 2258 int err, reserve = 0;
40d4e3df 2259 void *ph;
342dfc30 2260 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2261 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2262 int tp_len, size_max;
2263 unsigned char *addr;
2264 int len_sum = 0;
9e67030a 2265 int status = TP_STATUS_AVAILABLE;
ae641949 2266 int hlen, tlen;
69e3c75f 2267
69e3c75f
JB
2268 mutex_lock(&po->pg_vec_lock);
2269
66e56cd4 2270 if (likely(saddr == NULL)) {
e40526cb 2271 dev = packet_cached_dev_get(po);
69e3c75f
JB
2272 proto = po->num;
2273 addr = NULL;
2274 } else {
2275 err = -EINVAL;
2276 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2277 goto out;
2278 if (msg->msg_namelen < (saddr->sll_halen
2279 + offsetof(struct sockaddr_ll,
2280 sll_addr)))
2281 goto out;
69e3c75f
JB
2282 proto = saddr->sll_protocol;
2283 addr = saddr->sll_addr;
827d9780 2284 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2285 }
2286
69e3c75f
JB
2287 err = -ENXIO;
2288 if (unlikely(dev == NULL))
2289 goto out;
69e3c75f
JB
2290 err = -ENETDOWN;
2291 if (unlikely(!(dev->flags & IFF_UP)))
2292 goto out_put;
2293
52f1454f 2294 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2295 size_max = po->tx_ring.frame_size
b5dd884e 2296 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2297
09effa67
DM
2298 if (size_max > dev->mtu + reserve)
2299 size_max = dev->mtu + reserve;
2300
69e3c75f
JB
2301 do {
2302 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2303 TP_STATUS_SEND_REQUEST);
69e3c75f 2304 if (unlikely(ph == NULL)) {
87a2fd28
DB
2305 if (need_wait && need_resched())
2306 schedule();
69e3c75f
JB
2307 continue;
2308 }
2309
2310 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2311 hlen = LL_RESERVED_SPACE(dev);
2312 tlen = dev->needed_tailroom;
69e3c75f 2313 skb = sock_alloc_send_skb(&po->sk,
ae641949 2314 hlen + tlen + sizeof(struct sockaddr_ll),
fbf33a28 2315 !need_wait, &err);
69e3c75f 2316
fbf33a28
KM
2317 if (unlikely(skb == NULL)) {
2318 /* we assume the socket was initially writeable ... */
2319 if (likely(len_sum > 0))
2320 err = len_sum;
69e3c75f 2321 goto out_status;
fbf33a28 2322 }
69e3c75f 2323 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f
DB
2324 addr, hlen);
2325 if (tp_len > dev->mtu + dev->hard_header_len) {
2326 struct ethhdr *ehdr;
2327 /* Earlier code assumed this would be a VLAN pkt,
2328 * double-check this now that we have the actual
2329 * packet in hand.
2330 */
69e3c75f 2331
52f1454f
DB
2332 skb_reset_mac_header(skb);
2333 ehdr = eth_hdr(skb);
2334 if (ehdr->h_proto != htons(ETH_P_8021Q))
2335 tp_len = -EMSGSIZE;
2336 }
69e3c75f
JB
2337 if (unlikely(tp_len < 0)) {
2338 if (po->tp_loss) {
2339 __packet_set_status(po, ph,
2340 TP_STATUS_AVAILABLE);
2341 packet_increment_head(&po->tx_ring);
2342 kfree_skb(skb);
2343 continue;
2344 } else {
2345 status = TP_STATUS_WRONG_FORMAT;
2346 err = tp_len;
2347 goto out_status;
2348 }
2349 }
2350
0fd5d57b
DB
2351 packet_pick_tx_queue(dev, skb);
2352
69e3c75f
JB
2353 skb->destructor = tpacket_destruct_skb;
2354 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2355 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2356
2357 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2358 err = po->xmit(skb);
eb70df13
JP
2359 if (unlikely(err > 0)) {
2360 err = net_xmit_errno(err);
2361 if (err && __packet_get_status(po, ph) ==
2362 TP_STATUS_AVAILABLE) {
2363 /* skb was destructed already */
2364 skb = NULL;
2365 goto out_status;
2366 }
2367 /*
2368 * skb was dropped but not destructed yet;
2369 * let's treat it like congestion or err < 0
2370 */
2371 err = 0;
2372 }
69e3c75f
JB
2373 packet_increment_head(&po->tx_ring);
2374 len_sum += tp_len;
b0138408
DB
2375 } while (likely((ph != NULL) ||
2376 /* Note: packet_read_pending() might be slow if we have
2377 * to call it, as it's a per-CPU variable, but in the fast path
2378 * we already short-circuit the loop with the first
2379 * condition, and luckily don't have to go that path
2380 * anyway.
2381 */
2382 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2383
2384 err = len_sum;
2385 goto out_put;
2386
69e3c75f
JB
2387out_status:
2388 __packet_set_status(po, ph, status);
2389 kfree_skb(skb);
2390out_put:
e40526cb 2391 dev_put(dev);
69e3c75f
JB
2392out:
2393 mutex_unlock(&po->pg_vec_lock);
2394 return err;
2395}
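
A minimal userspace sketch (not part of af_packet.c) of the transmit side that tpacket_snd() services: the caller owns a slot while tp_status is TP_STATUS_AVAILABLE, fills it, flips it to TP_STATUS_SEND_REQUEST, and kicks the kernel with an empty send(). The helper name tx_one_frame and the use of a single slot are illustrative; the ring is assumed to have been created with PACKET_TX_RING under TPACKET_V2 and mmap()ed as in the receive sketch above.

#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int tx_one_frame(int fd, void *slot, const void *pkt, unsigned int len)
{
	struct tpacket2_hdr *hdr = slot;
	char *data;

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* the kernel still owns this slot */

	/* frame data goes right after the header area, where tpacket_fill_skb() expects it */
	data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
	memcpy(data, pkt, len);
	hdr->tp_len = len;

	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	return send(fd, NULL, 0, 0);		/* drives the do/while loop in tpacket_snd() */
}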
69e3c75f 2396
eea49cc9
OJ
2397static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2398 size_t reserve, size_t len,
2399 size_t linear, int noblock,
2400 int *err)
bfd5f4a3
SS
2401{
2402 struct sk_buff *skb;
2403
2404 /* Under a page? Don't bother with paged skb. */
2405 if (prepad + len < PAGE_SIZE || !linear)
2406 linear = len;
2407
2408 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2409 err, 0);
bfd5f4a3
SS
2410 if (!skb)
2411 return NULL;
2412
2413 skb_reserve(skb, reserve);
2414 skb_put(skb, linear);
2415 skb->data_len = len - linear;
2416 skb->len += len - linear;
2417
2418 return skb;
2419}
2420
d346a3fa 2421static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2422{
2423 struct sock *sk = sock->sk;
342dfc30 2424 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2425 struct sk_buff *skb;
2426 struct net_device *dev;
0e11c91e 2427 __be16 proto;
1da177e4 2428 unsigned char *addr;
827d9780 2429 int err, reserve = 0;
bfd5f4a3
SS
2430 struct virtio_net_hdr vnet_hdr = { 0 };
2431 int offset = 0;
2432 int vnet_hdr_len;
2433 struct packet_sock *po = pkt_sk(sk);
2434 unsigned short gso_type = 0;
ae641949 2435 int hlen, tlen;
3bdc0eba 2436 int extra_len = 0;
8feb2fb2 2437 ssize_t n;
1da177e4
LT
2438
2439 /*
1ce4f28b 2440 * Get and verify the address.
1da177e4 2441 */
1ce4f28b 2442
66e56cd4 2443 if (likely(saddr == NULL)) {
e40526cb 2444 dev = packet_cached_dev_get(po);
1da177e4
LT
2445 proto = po->num;
2446 addr = NULL;
2447 } else {
2448 err = -EINVAL;
2449 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2450 goto out;
0fb375fb
EB
2451 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2452 goto out;
1da177e4
LT
2453 proto = saddr->sll_protocol;
2454 addr = saddr->sll_addr;
827d9780 2455 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2456 }
2457
1da177e4 2458 err = -ENXIO;
e40526cb 2459 if (unlikely(dev == NULL))
1da177e4 2460 goto out_unlock;
d5e76b0a 2461 err = -ENETDOWN;
e40526cb 2462 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2463 goto out_unlock;
2464
e40526cb
DB
2465 if (sock->type == SOCK_RAW)
2466 reserve = dev->hard_header_len;
bfd5f4a3
SS
2467 if (po->has_vnet_hdr) {
2468 vnet_hdr_len = sizeof(vnet_hdr);
2469
2470 err = -EINVAL;
2471 if (len < vnet_hdr_len)
2472 goto out_unlock;
2473
2474 len -= vnet_hdr_len;
2475
8feb2fb2 2476 err = -EFAULT;
c0371da6 2477 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2478 if (n != vnet_hdr_len)
bfd5f4a3
SS
2479 goto out_unlock;
2480
2481 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2482 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2483 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2484 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2485 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2486 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2487 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2488
2489 err = -EINVAL;
dc9e5153 2490 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2491 goto out_unlock;
2492
2493 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2494 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2495 case VIRTIO_NET_HDR_GSO_TCPV4:
2496 gso_type = SKB_GSO_TCPV4;
2497 break;
2498 case VIRTIO_NET_HDR_GSO_TCPV6:
2499 gso_type = SKB_GSO_TCPV6;
2500 break;
2501 case VIRTIO_NET_HDR_GSO_UDP:
2502 gso_type = SKB_GSO_UDP;
2503 break;
2504 default:
2505 goto out_unlock;
2506 }
2507
2508 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2509 gso_type |= SKB_GSO_TCP_ECN;
2510
2511 if (vnet_hdr.gso_size == 0)
2512 goto out_unlock;
2513
2514 }
2515 }
2516
3bdc0eba
BG
2517 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2518 if (!netif_supports_nofcs(dev)) {
2519 err = -EPROTONOSUPPORT;
2520 goto out_unlock;
2521 }
2522 extra_len = 4; /* We're doing our own CRC */
2523 }
2524
1da177e4 2525 err = -EMSGSIZE;
3bdc0eba 2526 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2527 goto out_unlock;
2528
bfd5f4a3 2529 err = -ENOBUFS;
ae641949
HX
2530 hlen = LL_RESERVED_SPACE(dev);
2531 tlen = dev->needed_tailroom;
dc9e5153
MT
2532 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2533 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2534 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2535 if (skb == NULL)
1da177e4
LT
2536 goto out_unlock;
2537
bfd5f4a3 2538 skb_set_network_header(skb, reserve);
1da177e4 2539
0c4e8581 2540 err = -EINVAL;
9c707762
WB
2541 if (sock->type == SOCK_DGRAM) {
2542 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2543 if (unlikely(offset < 0))
9c707762
WB
2544 goto out_free;
2545 } else {
2546 if (ll_header_truncated(dev, len))
2547 goto out_free;
2548 }
1da177e4
LT
2549
2550 /* Returns -EFAULT on error */
c0371da6 2551 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2552 if (err)
2553 goto out_free;
bf84a010
DB
2554
2555 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2556
3bdc0eba 2557 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2558 /* Earlier code assumed this would be a VLAN pkt,
2559 * double-check this now that we have the actual
2560 * packet in hand.
2561 */
2562 struct ethhdr *ehdr;
2563 skb_reset_mac_header(skb);
2564 ehdr = eth_hdr(skb);
2565 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2566 err = -EMSGSIZE;
2567 goto out_free;
2568 }
57f89bfa
BG
2569 }
2570
09effa67
DM
2571 skb->protocol = proto;
2572 skb->dev = dev;
1da177e4 2573 skb->priority = sk->sk_priority;
2d37a186 2574 skb->mark = sk->sk_mark;
0fd5d57b
DB
2575
2576 packet_pick_tx_queue(dev, skb);
1da177e4 2577
bfd5f4a3
SS
2578 if (po->has_vnet_hdr) {
2579 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2580 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2581 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2582 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2583 err = -EINVAL;
2584 goto out_free;
2585 }
2586 }
2587
dc9e5153
MT
2588 skb_shinfo(skb)->gso_size =
2589 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2590 skb_shinfo(skb)->gso_type = gso_type;
2591
2592 /* Header must be checked, and gso_segs computed. */
2593 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2594 skb_shinfo(skb)->gso_segs = 0;
2595
2596 len += vnet_hdr_len;
2597 }
2598
d346a3fa
DB
2599 if (!packet_use_direct_xmit(po))
2600 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2601 if (unlikely(extra_len == 4))
2602 skb->no_fcs = 1;
2603
d346a3fa 2604 err = po->xmit(skb);
1da177e4
LT
2605 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2606 goto out_unlock;
2607
e40526cb 2608 dev_put(dev);
1da177e4 2609
40d4e3df 2610 return len;
1da177e4
LT
2611
2612out_free:
2613 kfree_skb(skb);
2614out_unlock:
e40526cb 2615 if (dev)
1da177e4
LT
2616 dev_put(dev);
2617out:
2618 return err;
2619}
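
A minimal userspace sketch (not part of af_packet.c) of the non-ring transmit path above: a plain sendto() with a sockaddr_ll ends up in packet_snd(). The helper name send_raw_frame is illustrative; for a SOCK_RAW socket the buffer must already start with a complete Ethernet header, and sll_protocol would normally match that header's EtherType.

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t send_raw_frame(int fd, const char *ifname,
			      const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);	/* illustrative; use the frame's real EtherType */
	sll.sll_ifindex  = if_nametoindex(ifname);
	sll.sll_halen    = ETH_ALEN;

	return sendto(fd, frame, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}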
2620
1b784140 2621static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2622{
69e3c75f
JB
2623 struct sock *sk = sock->sk;
2624 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2625
69e3c75f
JB
2626 if (po->tx_ring.pg_vec)
2627 return tpacket_snd(po, msg);
2628 else
69e3c75f
JB
2629 return packet_snd(sock, msg, len);
2630}
2631
1da177e4
LT
2632/*
2633 * Close a PACKET socket. This is fairly simple. We immediately go
2634 * to 'closed' state and remove our protocol entry in the device list.
2635 */
2636
2637static int packet_release(struct socket *sock)
2638{
2639 struct sock *sk = sock->sk;
2640 struct packet_sock *po;
d12d01d6 2641 struct net *net;
f6fb8f10 2642 union tpacket_req_u req_u;
1da177e4
LT
2643
2644 if (!sk)
2645 return 0;
2646
3b1e0a65 2647 net = sock_net(sk);
1da177e4
LT
2648 po = pkt_sk(sk);
2649
0fa7fa98 2650 mutex_lock(&net->packet.sklist_lock);
808f5114 2651 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2652 mutex_unlock(&net->packet.sklist_lock);
2653
2654 preempt_disable();
920de804 2655 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2656 preempt_enable();
1da177e4 2657
808f5114 2658 spin_lock(&po->bind_lock);
ce06b03e 2659 unregister_prot_hook(sk, false);
66e56cd4
DB
2660 packet_cached_dev_reset(po);
2661
160ff18a
BG
2662 if (po->prot_hook.dev) {
2663 dev_put(po->prot_hook.dev);
2664 po->prot_hook.dev = NULL;
2665 }
808f5114 2666 spin_unlock(&po->bind_lock);
1da177e4 2667
1da177e4 2668 packet_flush_mclist(sk);
1da177e4 2669
9665d5d6
PS
2670 if (po->rx_ring.pg_vec) {
2671 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2672 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2673 }
69e3c75f 2674
9665d5d6
PS
2675 if (po->tx_ring.pg_vec) {
2676 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2677 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2678 }
1da177e4 2679
dc99f600
DM
2680 fanout_release(sk);
2681
808f5114 2682 synchronize_net();
1da177e4
LT
2683 /*
2684 * Now the socket is dead. No more input will appear.
2685 */
1da177e4
LT
2686 sock_orphan(sk);
2687 sock->sk = NULL;
2688
2689 /* Purge queues */
2690
2691 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2692 packet_free_pending(po);
17ab56a2 2693 sk_refcnt_debug_release(sk);
1da177e4
LT
2694
2695 sock_put(sk);
2696 return 0;
2697}
2698
2699/*
2700 * Attach a packet hook.
2701 */
2702
902fefb8 2703static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2704{
2705 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2706 const struct net_device *dev_curr;
2707 __be16 proto_curr;
2708 bool need_rehook;
dc99f600 2709
aef950b4
WY
2710 if (po->fanout) {
2711 if (dev)
2712 dev_put(dev);
2713
dc99f600 2714 return -EINVAL;
aef950b4 2715 }
1da177e4
LT
2716
2717 lock_sock(sk);
1da177e4 2718 spin_lock(&po->bind_lock);
66e56cd4 2719
902fefb8
DB
2720 proto_curr = po->prot_hook.type;
2721 dev_curr = po->prot_hook.dev;
2722
2723 need_rehook = proto_curr != proto || dev_curr != dev;
2724
2725 if (need_rehook) {
2726 unregister_prot_hook(sk, true);
1da177e4 2727
902fefb8
DB
2728 po->num = proto;
2729 po->prot_hook.type = proto;
1da177e4 2730
902fefb8
DB
2731 if (po->prot_hook.dev)
2732 dev_put(po->prot_hook.dev);
2733
2734 po->prot_hook.dev = dev;
2735
2736 po->ifindex = dev ? dev->ifindex : 0;
2737 packet_cached_dev_assign(po, dev);
2738 }
66e56cd4 2739
902fefb8 2740 if (proto == 0 || !need_rehook)
1da177e4
LT
2741 goto out_unlock;
2742
be85d4ad 2743 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2744 register_prot_hook(sk);
be85d4ad
UT
2745 } else {
2746 sk->sk_err = ENETDOWN;
2747 if (!sock_flag(sk, SOCK_DEAD))
2748 sk->sk_error_report(sk);
1da177e4
LT
2749 }
2750
2751out_unlock:
2752 spin_unlock(&po->bind_lock);
2753 release_sock(sk);
2754 return 0;
2755}
2756
2757/*
2758 * Bind a packet socket to a device
2759 */
2760
40d4e3df
ED
2761static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2762 int addr_len)
1da177e4 2763{
40d4e3df 2764 struct sock *sk = sock->sk;
1da177e4
LT
2765 char name[15];
2766 struct net_device *dev;
2767 int err = -ENODEV;
1ce4f28b 2768
1da177e4
LT
2769 /*
2770 * Check legality
2771 */
1ce4f28b 2772
8ae55f04 2773 if (addr_len != sizeof(struct sockaddr))
1da177e4 2774 return -EINVAL;
40d4e3df 2775 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2776
3b1e0a65 2777 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2778 if (dev)
1da177e4 2779 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2780 return err;
2781}
1da177e4
LT
2782
2783static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2784{
40d4e3df
ED
2785 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2786 struct sock *sk = sock->sk;
1da177e4
LT
2787 struct net_device *dev = NULL;
2788 int err;
2789
2790
2791 /*
2792 * Check legality
2793 */
1ce4f28b 2794
1da177e4
LT
2795 if (addr_len < sizeof(struct sockaddr_ll))
2796 return -EINVAL;
2797 if (sll->sll_family != AF_PACKET)
2798 return -EINVAL;
2799
2800 if (sll->sll_ifindex) {
2801 err = -ENODEV;
3b1e0a65 2802 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2803 if (dev == NULL)
2804 goto out;
2805 }
2806 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2807
2808out:
2809 return err;
2810}
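
A minimal userspace sketch (not part of af_packet.c) of the bind() call that reaches packet_do_bind() through packet_bind() above. The helper name bind_to_iface is illustrative.

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* 0 would keep the socket's current protocol */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}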
2811
2812static struct proto packet_proto = {
2813 .name = "PACKET",
2814 .owner = THIS_MODULE,
2815 .obj_size = sizeof(struct packet_sock),
2816};
2817
2818/*
1ce4f28b 2819 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2820 */
2821
3f378b68
EP
2822static int packet_create(struct net *net, struct socket *sock, int protocol,
2823 int kern)
1da177e4
LT
2824{
2825 struct sock *sk;
2826 struct packet_sock *po;
0e11c91e 2827 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2828 int err;
2829
df008c91 2830 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2831 return -EPERM;
be02097c
DM
2832 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2833 sock->type != SOCK_PACKET)
1da177e4
LT
2834 return -ESOCKTNOSUPPORT;
2835
2836 sock->state = SS_UNCONNECTED;
2837
2838 err = -ENOBUFS;
11aa9c28 2839 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
2840 if (sk == NULL)
2841 goto out;
2842
2843 sock->ops = &packet_ops;
1da177e4
LT
2844 if (sock->type == SOCK_PACKET)
2845 sock->ops = &packet_ops_spkt;
be02097c 2846
1da177e4
LT
2847 sock_init_data(sock, sk);
2848
2849 po = pkt_sk(sk);
2850 sk->sk_family = PF_PACKET;
0e11c91e 2851 po->num = proto;
d346a3fa 2852 po->xmit = dev_queue_xmit;
66e56cd4 2853
b0138408
DB
2854 err = packet_alloc_pending(po);
2855 if (err)
2856 goto out2;
2857
66e56cd4 2858 packet_cached_dev_reset(po);
1da177e4
LT
2859
2860 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2861 sk_refcnt_debug_inc(sk);
1da177e4
LT
2862
2863 /*
2864 * Attach a protocol block
2865 */
2866
2867 spin_lock_init(&po->bind_lock);
905db440 2868 mutex_init(&po->pg_vec_lock);
1da177e4 2869 po->prot_hook.func = packet_rcv;
be02097c 2870
1da177e4
LT
2871 if (sock->type == SOCK_PACKET)
2872 po->prot_hook.func = packet_rcv_spkt;
be02097c 2873
1da177e4
LT
2874 po->prot_hook.af_packet_priv = sk;
2875
0e11c91e
AV
2876 if (proto) {
2877 po->prot_hook.type = proto;
ce06b03e 2878 register_prot_hook(sk);
1da177e4
LT
2879 }
2880
0fa7fa98 2881 mutex_lock(&net->packet.sklist_lock);
808f5114 2882 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2883 mutex_unlock(&net->packet.sklist_lock);
2884
2885 preempt_disable();
3680453c 2886 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2887 preempt_enable();
808f5114 2888
40d4e3df 2889 return 0;
b0138408
DB
2890out2:
2891 sk_free(sk);
1da177e4
LT
2892out:
2893 return err;
2894}
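
A minimal userspace sketch (not part of af_packet.c) of the socket() calls that land in packet_create(); both require CAP_NET_RAW. The helper name open_packet_sockets is illustrative.

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

static int open_packet_sockets(void)
{
	/* SOCK_RAW hands userspace the link-layer header; SOCK_DGRAM builds/strips it */
	int fd_raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
	int fd_dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));

	return (fd_raw >= 0 && fd_dgram >= 0) ? 0 : -1;
}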
2895
2896/*
2897 * Pull a packet from our receive queue and hand it to the user.
2898 * If necessary we block.
2899 */
2900
1b784140
YX
2901static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2902 int flags)
1da177e4
LT
2903{
2904 struct sock *sk = sock->sk;
2905 struct sk_buff *skb;
2906 int copied, err;
bfd5f4a3 2907 int vnet_hdr_len = 0;
2472d761 2908 unsigned int origlen = 0;
1da177e4
LT
2909
2910 err = -EINVAL;
ed85b565 2911 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2912 goto out;
2913
2914#if 0
2915 /* What error should we return now? EUNATTACH? */
2916 if (pkt_sk(sk)->ifindex < 0)
2917 return -ENODEV;
2918#endif
2919
ed85b565 2920 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2921 err = sock_recv_errqueue(sk, msg, len,
2922 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2923 goto out;
2924 }
2925
1da177e4
LT
2926 /*
2927 * Call the generic datagram receiver. This handles all sorts
2928 * of horrible races and re-entrancy so we can forget about it
2929 * in the protocol layers.
2930 *
2931 * Now it will return ENETDOWN if the device has just gone down,
2932 * but then it will block.
2933 */
2934
40d4e3df 2935 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2936
2937 /*
1ce4f28b 2938 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2939 * handles the blocking, we don't need to see or worry about blocking
2940 * retries.
2941 */
2942
8ae55f04 2943 if (skb == NULL)
1da177e4
LT
2944 goto out;
2945
bfd5f4a3
SS
2946 if (pkt_sk(sk)->has_vnet_hdr) {
2947 struct virtio_net_hdr vnet_hdr = { 0 };
2948
2949 err = -EINVAL;
2950 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2951 if (len < vnet_hdr_len)
bfd5f4a3
SS
2952 goto out_free;
2953
1f18b717
MK
2954 len -= vnet_hdr_len;
2955
bfd5f4a3
SS
2956 if (skb_is_gso(skb)) {
2957 struct skb_shared_info *sinfo = skb_shinfo(skb);
2958
2959 /* This is a hint as to how much should be linear. */
dc9e5153
MT
2960 vnet_hdr.hdr_len =
2961 __cpu_to_virtio16(false, skb_headlen(skb));
2962 vnet_hdr.gso_size =
2963 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
2964 if (sinfo->gso_type & SKB_GSO_TCPV4)
2965 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2966 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2967 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2968 else if (sinfo->gso_type & SKB_GSO_UDP)
2969 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2970 else if (sinfo->gso_type & SKB_GSO_FCOE)
2971 goto out_free;
2972 else
2973 BUG();
2974 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2975 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2976 } else
2977 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2978
2979 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2980 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
2981 vnet_hdr.csum_start = __cpu_to_virtio16(false,
2982 skb_checksum_start_offset(skb));
2983 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
2984 skb->csum_offset);
10a8d94a
JW
2985 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2986 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2987 } /* else everything is zero */
2988
7eab8d9e 2989 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
2990 if (err < 0)
2991 goto out_free;
2992 }
2993
f3d33426
HFS
2994 /* You lose any data beyond the buffer you gave. If this worries
2995 * a user program, it can ask the device for its MTU
2996 * anyway.
1da177e4 2997 */
1da177e4 2998 copied = skb->len;
40d4e3df
ED
2999 if (copied > len) {
3000 copied = len;
3001 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3002 }
3003
51f3d02b 3004 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3005 if (err)
3006 goto out_free;
3007
2472d761
EB
3008 if (sock->type != SOCK_PACKET) {
3009 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3010
3011 /* Original length was stored in sockaddr_ll fields */
3012 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3013 sll->sll_family = AF_PACKET;
3014 sll->sll_protocol = skb->protocol;
3015 }
3016
3b885787 3017 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3018
f3d33426
HFS
3019 if (msg->msg_name) {
3020 /* If the address length field is there to be filled
3021 * in, we fill it in now.
3022 */
3023 if (sock->type == SOCK_PACKET) {
342dfc30 3024 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3025 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3026 } else {
3027 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3028
f3d33426
HFS
3029 msg->msg_namelen = sll->sll_halen +
3030 offsetof(struct sockaddr_ll, sll_addr);
3031 }
ffbc6111
HX
3032 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3033 msg->msg_namelen);
f3d33426 3034 }
1da177e4 3035
8dc41944 3036 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3037 struct tpacket_auxdata aux;
3038
3039 aux.tp_status = TP_STATUS_USER;
3040 if (skb->ip_summed == CHECKSUM_PARTIAL)
3041 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3042 else if (skb->pkt_type != PACKET_OUTGOING &&
3043 (skb->ip_summed == CHECKSUM_COMPLETE ||
3044 skb_csum_unnecessary(skb)))
3045 aux.tp_status |= TP_STATUS_CSUM_VALID;
3046
2472d761 3047 aux.tp_len = origlen;
ffbc6111
HX
3048 aux.tp_snaplen = skb->len;
3049 aux.tp_mac = 0;
bbe735e4 3050 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3051 if (skb_vlan_tag_present(skb)) {
3052 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3053 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3054 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3055 } else {
3056 aux.tp_vlan_tci = 0;
a0cdfcf3 3057 aux.tp_vlan_tpid = 0;
a3bcc23e 3058 }
ffbc6111 3059 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3060 }
3061
1da177e4
LT
3062 /*
3063 * Free or return the buffer as appropriate. Again this
3064 * hides all the races and re-entrancy issues from us.
3065 */
bfd5f4a3 3066 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3067
3068out_free:
3069 skb_free_datagram(sk, skb);
3070out:
3071 return err;
3072}
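
A minimal userspace sketch (not part of af_packet.c) of reading one frame together with the PACKET_AUXDATA control message that packet_recvmsg() appends when the option is enabled. The helper name recv_with_auxdata is illustrative.

#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

static int recv_with_auxdata(int fd, void *buf, size_t buflen)
{
	struct tpacket_auxdata *aux = NULL;
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} ctl;
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctl.buf, .msg_controllen = sizeof(ctl.buf),
	};
	struct cmsghdr *cmsg;
	int one = 1;
	ssize_t n;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_PACKET && cmsg->cmsg_type == PACKET_AUXDATA)
			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);

	/* when present, aux->tp_status carries checksum/VLAN hints, aux->tp_len the original length */
	return aux ? (int)n : -1;
}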
3073
1da177e4
LT
3074static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3075 int *uaddr_len, int peer)
3076{
3077 struct net_device *dev;
3078 struct sock *sk = sock->sk;
3079
3080 if (peer)
3081 return -EOPNOTSUPP;
3082
3083 uaddr->sa_family = AF_PACKET;
2dc85bf3 3084 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3085 rcu_read_lock();
3086 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3087 if (dev)
2dc85bf3 3088 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3089 rcu_read_unlock();
1da177e4
LT
3090 *uaddr_len = sizeof(*uaddr);
3091
3092 return 0;
3093}
1da177e4
LT
3094
3095static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3096 int *uaddr_len, int peer)
3097{
3098 struct net_device *dev;
3099 struct sock *sk = sock->sk;
3100 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3101 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3102
3103 if (peer)
3104 return -EOPNOTSUPP;
3105
3106 sll->sll_family = AF_PACKET;
3107 sll->sll_ifindex = po->ifindex;
3108 sll->sll_protocol = po->num;
67286640 3109 sll->sll_pkttype = 0;
654d1f8a
ED
3110 rcu_read_lock();
3111 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3112 if (dev) {
3113 sll->sll_hatype = dev->type;
3114 sll->sll_halen = dev->addr_len;
3115 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3116 } else {
3117 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3118 sll->sll_halen = 0;
3119 }
654d1f8a 3120 rcu_read_unlock();
0fb375fb 3121 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3122
3123 return 0;
3124}
3125
2aeb0b88
WC
3126static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3127 int what)
1da177e4
LT
3128{
3129 switch (i->type) {
3130 case PACKET_MR_MULTICAST:
1162563f
JP
3131 if (i->alen != dev->addr_len)
3132 return -EINVAL;
1da177e4 3133 if (what > 0)
22bedad3 3134 return dev_mc_add(dev, i->addr);
1da177e4 3135 else
22bedad3 3136 return dev_mc_del(dev, i->addr);
1da177e4
LT
3137 break;
3138 case PACKET_MR_PROMISC:
2aeb0b88 3139 return dev_set_promiscuity(dev, what);
1da177e4 3140 case PACKET_MR_ALLMULTI:
2aeb0b88 3141 return dev_set_allmulti(dev, what);
d95ed927 3142 case PACKET_MR_UNICAST:
1162563f
JP
3143 if (i->alen != dev->addr_len)
3144 return -EINVAL;
d95ed927 3145 if (what > 0)
a748ee24 3146 return dev_uc_add(dev, i->addr);
d95ed927 3147 else
a748ee24 3148 return dev_uc_del(dev, i->addr);
d95ed927 3149 break;
40d4e3df
ED
3150 default:
3151 break;
1da177e4 3152 }
2aeb0b88 3153 return 0;
1da177e4
LT
3154}
3155
82f17091
FR
3156static void packet_dev_mclist_delete(struct net_device *dev,
3157 struct packet_mclist **mlp)
1da177e4 3158{
82f17091
FR
3159 struct packet_mclist *ml;
3160
3161 while ((ml = *mlp) != NULL) {
3162 if (ml->ifindex == dev->ifindex) {
3163 packet_dev_mc(dev, ml, -1);
3164 *mlp = ml->next;
3165 kfree(ml);
3166 } else
3167 mlp = &ml->next;
1da177e4
LT
3168 }
3169}
3170
0fb375fb 3171static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3172{
3173 struct packet_sock *po = pkt_sk(sk);
3174 struct packet_mclist *ml, *i;
3175 struct net_device *dev;
3176 int err;
3177
3178 rtnl_lock();
3179
3180 err = -ENODEV;
3b1e0a65 3181 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3182 if (!dev)
3183 goto done;
3184
3185 err = -EINVAL;
1162563f 3186 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3187 goto done;
3188
3189 err = -ENOBUFS;
8b3a7005 3190 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3191 if (i == NULL)
3192 goto done;
3193
3194 err = 0;
3195 for (ml = po->mclist; ml; ml = ml->next) {
3196 if (ml->ifindex == mreq->mr_ifindex &&
3197 ml->type == mreq->mr_type &&
3198 ml->alen == mreq->mr_alen &&
3199 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3200 ml->count++;
3201 /* Free the new element ... */
3202 kfree(i);
3203 goto done;
3204 }
3205 }
3206
3207 i->type = mreq->mr_type;
3208 i->ifindex = mreq->mr_ifindex;
3209 i->alen = mreq->mr_alen;
3210 memcpy(i->addr, mreq->mr_address, i->alen);
3211 i->count = 1;
3212 i->next = po->mclist;
3213 po->mclist = i;
2aeb0b88
WC
3214 err = packet_dev_mc(dev, i, 1);
3215 if (err) {
3216 po->mclist = i->next;
3217 kfree(i);
3218 }
1da177e4
LT
3219
3220done:
3221 rtnl_unlock();
3222 return err;
3223}
3224
0fb375fb 3225static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3226{
3227 struct packet_mclist *ml, **mlp;
3228
3229 rtnl_lock();
3230
3231 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3232 if (ml->ifindex == mreq->mr_ifindex &&
3233 ml->type == mreq->mr_type &&
3234 ml->alen == mreq->mr_alen &&
3235 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3236 if (--ml->count == 0) {
3237 struct net_device *dev;
3238 *mlp = ml->next;
ad959e76
ED
3239 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3240 if (dev)
1da177e4 3241 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3242 kfree(ml);
3243 }
82f17091 3244 break;
1da177e4
LT
3245 }
3246 }
3247 rtnl_unlock();
82f17091 3248 return 0;
1da177e4
LT
3249}
3250
3251static void packet_flush_mclist(struct sock *sk)
3252{
3253 struct packet_sock *po = pkt_sk(sk);
3254 struct packet_mclist *ml;
3255
3256 if (!po->mclist)
3257 return;
3258
3259 rtnl_lock();
3260 while ((ml = po->mclist) != NULL) {
3261 struct net_device *dev;
3262
3263 po->mclist = ml->next;
ad959e76
ED
3264 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3265 if (dev != NULL)
1da177e4 3266 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3267 kfree(ml);
3268 }
3269 rtnl_unlock();
3270}
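
A minimal userspace sketch (not part of af_packet.c) of PACKET_ADD_MEMBERSHIP, which the helpers above service; PACKET_MR_PROMISC needs no hardware address. The helper name enable_promisc is illustrative.

#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
}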
1da177e4
LT
3271
3272static int
b7058842 3273packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3274{
3275 struct sock *sk = sock->sk;
8dc41944 3276 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3277 int ret;
3278
3279 if (level != SOL_PACKET)
3280 return -ENOPROTOOPT;
3281
69e3c75f 3282 switch (optname) {
1ce4f28b 3283 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3284 case PACKET_DROP_MEMBERSHIP:
3285 {
0fb375fb
EB
3286 struct packet_mreq_max mreq;
3287 int len = optlen;
3288 memset(&mreq, 0, sizeof(mreq));
3289 if (len < sizeof(struct packet_mreq))
1da177e4 3290 return -EINVAL;
0fb375fb
EB
3291 if (len > sizeof(mreq))
3292 len = sizeof(mreq);
40d4e3df 3293 if (copy_from_user(&mreq, optval, len))
1da177e4 3294 return -EFAULT;
0fb375fb
EB
3295 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3296 return -EINVAL;
1da177e4
LT
3297 if (optname == PACKET_ADD_MEMBERSHIP)
3298 ret = packet_mc_add(sk, &mreq);
3299 else
3300 ret = packet_mc_drop(sk, &mreq);
3301 return ret;
3302 }
a2efcfa0 3303
1da177e4 3304 case PACKET_RX_RING:
69e3c75f 3305 case PACKET_TX_RING:
1da177e4 3306 {
f6fb8f10 3307 union tpacket_req_u req_u;
3308 int len;
1da177e4 3309
f6fb8f10 3310 switch (po->tp_version) {
3311 case TPACKET_V1:
3312 case TPACKET_V2:
3313 len = sizeof(req_u.req);
3314 break;
3315 case TPACKET_V3:
3316 default:
3317 len = sizeof(req_u.req3);
3318 break;
3319 }
3320 if (optlen < len)
1da177e4 3321 return -EINVAL;
bfd5f4a3
SS
3322 if (pkt_sk(sk)->has_vnet_hdr)
3323 return -EINVAL;
f6fb8f10 3324 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3325 return -EFAULT;
f6fb8f10 3326 return packet_set_ring(sk, &req_u, 0,
3327 optname == PACKET_TX_RING);
1da177e4
LT
3328 }
3329 case PACKET_COPY_THRESH:
3330 {
3331 int val;
3332
40d4e3df 3333 if (optlen != sizeof(val))
1da177e4 3334 return -EINVAL;
40d4e3df 3335 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3336 return -EFAULT;
3337
3338 pkt_sk(sk)->copy_thresh = val;
3339 return 0;
3340 }
bbd6ef87
PM
3341 case PACKET_VERSION:
3342 {
3343 int val;
3344
3345 if (optlen != sizeof(val))
3346 return -EINVAL;
69e3c75f 3347 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3348 return -EBUSY;
3349 if (copy_from_user(&val, optval, sizeof(val)))
3350 return -EFAULT;
3351 switch (val) {
3352 case TPACKET_V1:
3353 case TPACKET_V2:
f6fb8f10 3354 case TPACKET_V3:
bbd6ef87
PM
3355 po->tp_version = val;
3356 return 0;
3357 default:
3358 return -EINVAL;
3359 }
3360 }
8913336a
PM
3361 case PACKET_RESERVE:
3362 {
3363 unsigned int val;
3364
3365 if (optlen != sizeof(val))
3366 return -EINVAL;
69e3c75f 3367 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3368 return -EBUSY;
3369 if (copy_from_user(&val, optval, sizeof(val)))
3370 return -EFAULT;
3371 po->tp_reserve = val;
3372 return 0;
3373 }
69e3c75f
JB
3374 case PACKET_LOSS:
3375 {
3376 unsigned int val;
3377
3378 if (optlen != sizeof(val))
3379 return -EINVAL;
3380 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3381 return -EBUSY;
3382 if (copy_from_user(&val, optval, sizeof(val)))
3383 return -EFAULT;
3384 po->tp_loss = !!val;
3385 return 0;
3386 }
8dc41944
HX
3387 case PACKET_AUXDATA:
3388 {
3389 int val;
3390
3391 if (optlen < sizeof(val))
3392 return -EINVAL;
3393 if (copy_from_user(&val, optval, sizeof(val)))
3394 return -EFAULT;
3395
3396 po->auxdata = !!val;
3397 return 0;
3398 }
80feaacb
PWJ
3399 case PACKET_ORIGDEV:
3400 {
3401 int val;
3402
3403 if (optlen < sizeof(val))
3404 return -EINVAL;
3405 if (copy_from_user(&val, optval, sizeof(val)))
3406 return -EFAULT;
3407
3408 po->origdev = !!val;
3409 return 0;
3410 }
bfd5f4a3
SS
3411 case PACKET_VNET_HDR:
3412 {
3413 int val;
3414
3415 if (sock->type != SOCK_RAW)
3416 return -EINVAL;
3417 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3418 return -EBUSY;
3419 if (optlen < sizeof(val))
3420 return -EINVAL;
3421 if (copy_from_user(&val, optval, sizeof(val)))
3422 return -EFAULT;
3423
3424 po->has_vnet_hdr = !!val;
3425 return 0;
3426 }
614f60fa
SM
3427 case PACKET_TIMESTAMP:
3428 {
3429 int val;
3430
3431 if (optlen != sizeof(val))
3432 return -EINVAL;
3433 if (copy_from_user(&val, optval, sizeof(val)))
3434 return -EFAULT;
3435
3436 po->tp_tstamp = val;
3437 return 0;
3438 }
dc99f600
DM
3439 case PACKET_FANOUT:
3440 {
3441 int val;
3442
3443 if (optlen != sizeof(val))
3444 return -EINVAL;
3445 if (copy_from_user(&val, optval, sizeof(val)))
3446 return -EFAULT;
3447
3448 return fanout_add(sk, val & 0xffff, val >> 16);
3449 }
5920cd3a
PC
3450 case PACKET_TX_HAS_OFF:
3451 {
3452 unsigned int val;
3453
3454 if (optlen != sizeof(val))
3455 return -EINVAL;
3456 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3457 return -EBUSY;
3458 if (copy_from_user(&val, optval, sizeof(val)))
3459 return -EFAULT;
3460 po->tp_tx_has_off = !!val;
3461 return 0;
3462 }
d346a3fa
DB
3463 case PACKET_QDISC_BYPASS:
3464 {
3465 int val;
3466
3467 if (optlen != sizeof(val))
3468 return -EINVAL;
3469 if (copy_from_user(&val, optval, sizeof(val)))
3470 return -EFAULT;
3471
3472 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3473 return 0;
3474 }
1da177e4
LT
3475 default:
3476 return -ENOPROTOOPT;
3477 }
3478}
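
A minimal userspace sketch (not part of af_packet.c) of the PACKET_FANOUT option handled above: the low 16 bits carry the group id, the high 16 bits the mode. The helper name join_fanout_group is illustrative.

#include <linux/if_packet.h>
#include <sys/socket.h>

static int join_fanout_group(int fd, unsigned short group_id)
{
	int arg = group_id | (PACKET_FANOUT_HASH << 16);	/* hash mode spreads flows by packet hash */

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}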
3479
3480static int packet_getsockopt(struct socket *sock, int level, int optname,
3481 char __user *optval, int __user *optlen)
3482{
3483 int len;
c06fff6e 3484 int val, lv = sizeof(val);
1da177e4
LT
3485 struct sock *sk = sock->sk;
3486 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3487 void *data = &val;
ee80fbf3 3488 union tpacket_stats_u st;
1da177e4
LT
3489
3490 if (level != SOL_PACKET)
3491 return -ENOPROTOOPT;
3492
8ae55f04
KK
3493 if (get_user(len, optlen))
3494 return -EFAULT;
1da177e4
LT
3495
3496 if (len < 0)
3497 return -EINVAL;
1ce4f28b 3498
69e3c75f 3499 switch (optname) {
1da177e4 3500 case PACKET_STATISTICS:
1da177e4 3501 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3502 memcpy(&st, &po->stats, sizeof(st));
3503 memset(&po->stats, 0, sizeof(po->stats));
3504 spin_unlock_bh(&sk->sk_receive_queue.lock);
3505
f6fb8f10 3506 if (po->tp_version == TPACKET_V3) {
c06fff6e 3507 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3508 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3509 data = &st.stats3;
f6fb8f10 3510 } else {
c06fff6e 3511 lv = sizeof(struct tpacket_stats);
8bcdeaff 3512 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3513 data = &st.stats1;
f6fb8f10 3514 }
ee80fbf3 3515
8dc41944
HX
3516 break;
3517 case PACKET_AUXDATA:
8dc41944 3518 val = po->auxdata;
80feaacb
PWJ
3519 break;
3520 case PACKET_ORIGDEV:
80feaacb 3521 val = po->origdev;
bfd5f4a3
SS
3522 break;
3523 case PACKET_VNET_HDR:
bfd5f4a3 3524 val = po->has_vnet_hdr;
1da177e4 3525 break;
bbd6ef87 3526 case PACKET_VERSION:
bbd6ef87 3527 val = po->tp_version;
bbd6ef87
PM
3528 break;
3529 case PACKET_HDRLEN:
3530 if (len > sizeof(int))
3531 len = sizeof(int);
3532 if (copy_from_user(&val, optval, len))
3533 return -EFAULT;
3534 switch (val) {
3535 case TPACKET_V1:
3536 val = sizeof(struct tpacket_hdr);
3537 break;
3538 case TPACKET_V2:
3539 val = sizeof(struct tpacket2_hdr);
3540 break;
f6fb8f10 3541 case TPACKET_V3:
3542 val = sizeof(struct tpacket3_hdr);
3543 break;
bbd6ef87
PM
3544 default:
3545 return -EINVAL;
3546 }
bbd6ef87 3547 break;
8913336a 3548 case PACKET_RESERVE:
8913336a 3549 val = po->tp_reserve;
8913336a 3550 break;
69e3c75f 3551 case PACKET_LOSS:
69e3c75f 3552 val = po->tp_loss;
69e3c75f 3553 break;
614f60fa 3554 case PACKET_TIMESTAMP:
614f60fa 3555 val = po->tp_tstamp;
614f60fa 3556 break;
dc99f600 3557 case PACKET_FANOUT:
dc99f600
DM
3558 val = (po->fanout ?
3559 ((u32)po->fanout->id |
77f65ebd
WB
3560 ((u32)po->fanout->type << 16) |
3561 ((u32)po->fanout->flags << 24)) :
dc99f600 3562 0);
dc99f600 3563 break;
5920cd3a
PC
3564 case PACKET_TX_HAS_OFF:
3565 val = po->tp_tx_has_off;
3566 break;
d346a3fa
DB
3567 case PACKET_QDISC_BYPASS:
3568 val = packet_use_direct_xmit(po);
3569 break;
1da177e4
LT
3570 default:
3571 return -ENOPROTOOPT;
3572 }
3573
c06fff6e
ED
3574 if (len > lv)
3575 len = lv;
8ae55f04
KK
3576 if (put_user(len, optlen))
3577 return -EFAULT;
8dc41944
HX
3578 if (copy_to_user(optval, data, len))
3579 return -EFAULT;
8ae55f04 3580 return 0;
1da177e4
LT
3581}
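
A minimal userspace sketch (not part of af_packet.c) of PACKET_STATISTICS as served above; note the counters are clear-on-read and tp_packets includes the dropped frames. The helper name dump_stats is illustrative.

#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}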
3582
3583
351638e7
JP
3584static int packet_notifier(struct notifier_block *this,
3585 unsigned long msg, void *ptr)
1da177e4
LT
3586{
3587 struct sock *sk;
351638e7 3588 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3589 struct net *net = dev_net(dev);
1da177e4 3590
808f5114 3591 rcu_read_lock();
b67bfe0d 3592 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3593 struct packet_sock *po = pkt_sk(sk);
3594
3595 switch (msg) {
3596 case NETDEV_UNREGISTER:
1da177e4 3597 if (po->mclist)
82f17091 3598 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3599 /* fallthrough */
3600
1da177e4
LT
3601 case NETDEV_DOWN:
3602 if (dev->ifindex == po->ifindex) {
3603 spin_lock(&po->bind_lock);
3604 if (po->running) {
ce06b03e 3605 __unregister_prot_hook(sk, false);
1da177e4
LT
3606 sk->sk_err = ENETDOWN;
3607 if (!sock_flag(sk, SOCK_DEAD))
3608 sk->sk_error_report(sk);
3609 }
3610 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3611 packet_cached_dev_reset(po);
1da177e4 3612 po->ifindex = -1;
160ff18a
BG
3613 if (po->prot_hook.dev)
3614 dev_put(po->prot_hook.dev);
1da177e4
LT
3615 po->prot_hook.dev = NULL;
3616 }
3617 spin_unlock(&po->bind_lock);
3618 }
3619 break;
3620 case NETDEV_UP:
808f5114 3621 if (dev->ifindex == po->ifindex) {
3622 spin_lock(&po->bind_lock);
ce06b03e
DM
3623 if (po->num)
3624 register_prot_hook(sk);
808f5114 3625 spin_unlock(&po->bind_lock);
1da177e4 3626 }
1da177e4
LT
3627 break;
3628 }
3629 }
808f5114 3630 rcu_read_unlock();
1da177e4
LT
3631 return NOTIFY_DONE;
3632}
3633
3634
3635static int packet_ioctl(struct socket *sock, unsigned int cmd,
3636 unsigned long arg)
3637{
3638 struct sock *sk = sock->sk;
3639
69e3c75f 3640 switch (cmd) {
40d4e3df
ED
3641 case SIOCOUTQ:
3642 {
3643 int amount = sk_wmem_alloc_get(sk);
31e6d363 3644
40d4e3df
ED
3645 return put_user(amount, (int __user *)arg);
3646 }
3647 case SIOCINQ:
3648 {
3649 struct sk_buff *skb;
3650 int amount = 0;
3651
3652 spin_lock_bh(&sk->sk_receive_queue.lock);
3653 skb = skb_peek(&sk->sk_receive_queue);
3654 if (skb)
3655 amount = skb->len;
3656 spin_unlock_bh(&sk->sk_receive_queue.lock);
3657 return put_user(amount, (int __user *)arg);
3658 }
3659 case SIOCGSTAMP:
3660 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3661 case SIOCGSTAMPNS:
3662 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3663
1da177e4 3664#ifdef CONFIG_INET
40d4e3df
ED
3665 case SIOCADDRT:
3666 case SIOCDELRT:
3667 case SIOCDARP:
3668 case SIOCGARP:
3669 case SIOCSARP:
3670 case SIOCGIFADDR:
3671 case SIOCSIFADDR:
3672 case SIOCGIFBRDADDR:
3673 case SIOCSIFBRDADDR:
3674 case SIOCGIFNETMASK:
3675 case SIOCSIFNETMASK:
3676 case SIOCGIFDSTADDR:
3677 case SIOCSIFDSTADDR:
3678 case SIOCSIFFLAGS:
40d4e3df 3679 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3680#endif
3681
40d4e3df
ED
3682 default:
3683 return -ENOIOCTLCMD;
1da177e4
LT
3684 }
3685 return 0;
3686}
3687
40d4e3df 3688static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3689 poll_table *wait)
3690{
3691 struct sock *sk = sock->sk;
3692 struct packet_sock *po = pkt_sk(sk);
3693 unsigned int mask = datagram_poll(file, sock, wait);
3694
3695 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3696 if (po->rx_ring.pg_vec) {
f6fb8f10 3697 if (!packet_previous_rx_frame(po, &po->rx_ring,
3698 TP_STATUS_KERNEL))
1da177e4
LT
3699 mask |= POLLIN | POLLRDNORM;
3700 }
3701 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3702 spin_lock_bh(&sk->sk_write_queue.lock);
3703 if (po->tx_ring.pg_vec) {
3704 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3705 mask |= POLLOUT | POLLWRNORM;
3706 }
3707 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3708 return mask;
3709}
3710
3711
3712 /* Dirty? Well, I still haven't learned a better way to account
3713 * for user mmaps.
3714 */
3715
3716static void packet_mm_open(struct vm_area_struct *vma)
3717{
3718 struct file *file = vma->vm_file;
40d4e3df 3719 struct socket *sock = file->private_data;
1da177e4 3720 struct sock *sk = sock->sk;
1ce4f28b 3721
1da177e4
LT
3722 if (sk)
3723 atomic_inc(&pkt_sk(sk)->mapped);
3724}
3725
3726static void packet_mm_close(struct vm_area_struct *vma)
3727{
3728 struct file *file = vma->vm_file;
40d4e3df 3729 struct socket *sock = file->private_data;
1da177e4 3730 struct sock *sk = sock->sk;
1ce4f28b 3731
1da177e4
LT
3732 if (sk)
3733 atomic_dec(&pkt_sk(sk)->mapped);
3734}
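/*
 * packet_mm_open()/packet_mm_close() above keep po->mapped in step with the
 * number of live user mappings of the ring.  packet_set_ring() refuses to
 * swap out or free a pg_vec while this count is non-zero, so the ring buffers
 * cannot vanish underneath an active mmap().
 */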
3735
f0f37e2f 3736static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3737 .open = packet_mm_open,
3738 .close = packet_mm_close,
1da177e4
LT
3739};
3740
0e3125c7
NH
3741static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3742 unsigned int len)
1da177e4
LT
3743{
3744 int i;
3745
4ebf0ae2 3746 for (i = 0; i < len; i++) {
0e3125c7 3747 if (likely(pg_vec[i].buffer)) {
c56b4d90 3748 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3749 vfree(pg_vec[i].buffer);
3750 else
3751 free_pages((unsigned long)pg_vec[i].buffer,
3752 order);
3753 pg_vec[i].buffer = NULL;
3754 }
1da177e4
LT
3755 }
3756 kfree(pg_vec);
3757}
3758
eea49cc9 3759static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3760{
f0d4eb29 3761 char *buffer;
0e3125c7
NH
3762 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3763 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3764
3765 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3766 if (buffer)
3767 return buffer;
3768
f0d4eb29 3769 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3770 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3771 if (buffer)
3772 return buffer;
3773
f0d4eb29 3774 /* vmalloc failed, let's dig into swap here */
0e3125c7 3775 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3776 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3777 if (buffer)
3778 return buffer;
3779
f0d4eb29 3780 /* complete and utter failure */
0e3125c7 3781 return NULL;
4ebf0ae2
DM
3782}
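/*
 * Allocation strategy above: try physically contiguous pages first, giving up
 * early (__GFP_NORETRY) rather than stressing reclaim; fall back to vmalloc'ed
 * memory; and only as a last resort retry __get_free_pages with __GFP_NORETRY
 * cleared.
 */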
3783
0e3125c7 3784static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3785{
3786 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3787 struct pgv *pg_vec;
4ebf0ae2
DM
3788 int i;
3789
0e3125c7 3790 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3791 if (unlikely(!pg_vec))
3792 goto out;
3793
3794 for (i = 0; i < block_nr; i++) {
c56b4d90 3795 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3796 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3797 goto out_free_pgvec;
3798 }
3799
3800out:
3801 return pg_vec;
3802
3803out_free_pgvec:
3804 free_pg_vec(pg_vec, order, block_nr);
3805 pg_vec = NULL;
3806 goto out;
3807}
1da177e4 3808
f6fb8f10 3809static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3810 int closing, int tx_ring)
1da177e4 3811{
0e3125c7 3812 struct pgv *pg_vec = NULL;
1da177e4 3813 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3814 int was_running, order = 0;
69e3c75f
JB
3815 struct packet_ring_buffer *rb;
3816 struct sk_buff_head *rb_queue;
0e11c91e 3817 __be16 num;
f6fb8f10 3818 int err = -EINVAL;
 3819 /* Alias the V1/V2 part of the request to minimize code churn below */
3820 struct tpacket_req *req = &req_u->req;
3821
3822 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3823 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3824 WARN(1, "Tx-ring is not supported.\n");
3825 goto out;
3826 }
1ce4f28b 3827
69e3c75f
JB
3828 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3829 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3830
69e3c75f
JB
3831 err = -EBUSY;
3832 if (!closing) {
3833 if (atomic_read(&po->mapped))
3834 goto out;
b0138408 3835 if (packet_read_pending(rb))
69e3c75f
JB
3836 goto out;
3837 }
1da177e4 3838
69e3c75f
JB
3839 if (req->tp_block_nr) {
3840 /* Sanity tests and some calculations */
3841 err = -EBUSY;
3842 if (unlikely(rb->pg_vec))
3843 goto out;
1da177e4 3844
bbd6ef87
PM
3845 switch (po->tp_version) {
3846 case TPACKET_V1:
3847 po->tp_hdrlen = TPACKET_HDRLEN;
3848 break;
3849 case TPACKET_V2:
3850 po->tp_hdrlen = TPACKET2_HDRLEN;
3851 break;
f6fb8f10 3852 case TPACKET_V3:
3853 po->tp_hdrlen = TPACKET3_HDRLEN;
3854 break;
bbd6ef87
PM
3855 }
3856
69e3c75f 3857 err = -EINVAL;
4ebf0ae2 3858 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3859 goto out;
4ebf0ae2 3860 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3861 goto out;
dc808110
ED
3862 if (po->tp_version >= TPACKET_V3 &&
3863 (int)(req->tp_block_size -
3864 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3865 goto out;
8913336a 3866 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3867 po->tp_reserve))
3868 goto out;
4ebf0ae2 3869 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3870 goto out;
1da177e4 3871
69e3c75f
JB
3872 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3873 if (unlikely(rb->frames_per_block <= 0))
3874 goto out;
3875 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3876 req->tp_frame_nr))
3877 goto out;
1da177e4
LT
3878
3879 err = -ENOMEM;
4ebf0ae2
DM
3880 order = get_order(req->tp_block_size);
3881 pg_vec = alloc_pg_vec(req, order);
3882 if (unlikely(!pg_vec))
1da177e4 3883 goto out;
f6fb8f10 3884 switch (po->tp_version) {
3885 case TPACKET_V3:
 3886 /* The transmit path is not supported. We checked
 3887 * this above, but re-check here just to be paranoid.
 3888 */
3889 if (!tx_ring)
3890 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
d7cf0c34 3891 break;
f6fb8f10 3892 default:
3893 break;
3894 }
69e3c75f
JB
3895 }
3896 /* Done */
3897 else {
3898 err = -EINVAL;
4ebf0ae2 3899 if (unlikely(req->tp_frame_nr))
69e3c75f 3900 goto out;
1da177e4
LT
3901 }
3902
3903 lock_sock(sk);
3904
3905 /* Detach socket from network */
3906 spin_lock(&po->bind_lock);
3907 was_running = po->running;
3908 num = po->num;
3909 if (was_running) {
1da177e4 3910 po->num = 0;
ce06b03e 3911 __unregister_prot_hook(sk, false);
1da177e4
LT
3912 }
3913 spin_unlock(&po->bind_lock);
1ce4f28b 3914
1da177e4
LT
3915 synchronize_net();
3916
3917 err = -EBUSY;
905db440 3918 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3919 if (closing || atomic_read(&po->mapped) == 0) {
3920 err = 0;
69e3c75f 3921 spin_lock_bh(&rb_queue->lock);
c053fd96 3922 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3923 rb->frame_max = (req->tp_frame_nr - 1);
3924 rb->head = 0;
3925 rb->frame_size = req->tp_frame_size;
3926 spin_unlock_bh(&rb_queue->lock);
3927
c053fd96
CG
3928 swap(rb->pg_vec_order, order);
3929 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3930
3931 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3932 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3933 tpacket_rcv : packet_rcv;
3934 skb_queue_purge(rb_queue);
1da177e4 3935 if (atomic_read(&po->mapped))
40d4e3df
ED
3936 pr_err("packet_mmap: vma is busy: %d\n",
3937 atomic_read(&po->mapped));
1da177e4 3938 }
905db440 3939 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3940
3941 spin_lock(&po->bind_lock);
ce06b03e 3942 if (was_running) {
1da177e4 3943 po->num = num;
ce06b03e 3944 register_prot_hook(sk);
1da177e4
LT
3945 }
3946 spin_unlock(&po->bind_lock);
f6fb8f10 3947 if (closing && (po->tp_version > TPACKET_V2)) {
 3948 /* Block-based V3 is rx-only, so only the rx ring has a retire-blk timer to stop */
3949 if (!tx_ring)
3950 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3951 }
1da177e4
LT
3952 release_sock(sk);
3953
1da177e4
LT
3954 if (pg_vec)
3955 free_pg_vec(pg_vec, order, req->tp_block_nr);
3956out:
3957 return err;
3958}
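/*
 * Illustrative user-space sketch (not part of this file): the setsockopt()
 * path that ends up in packet_set_ring() above, configuring a TPACKET_V2
 * receive ring.  The helper name setup_rx_ring() and the geometry are
 * arbitrary choices for illustration; they merely respect the checks above
 * (block size a multiple of PAGE_SIZE, here assumed 4 KiB; frame size a
 * multiple of TPACKET_ALIGNMENT; tp_frame_nr == frames-per-block * blocks).
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

int setup_rx_ring(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	/* Select the V2 frame layout before creating the ring. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;   /* one page on common 4 KiB-page systems */
	req.tp_frame_size = 2048;   /* multiple of TPACKET_ALIGNMENT */
	req.tp_block_nr   = 64;
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size)
			    * req.tp_block_nr;      /* 2 frames per block */

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
			  &req, sizeof(req));
}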
3959
69e3c75f
JB
3960static int packet_mmap(struct file *file, struct socket *sock,
3961 struct vm_area_struct *vma)
1da177e4
LT
3962{
3963 struct sock *sk = sock->sk;
3964 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3965 unsigned long size, expected_size;
3966 struct packet_ring_buffer *rb;
1da177e4
LT
3967 unsigned long start;
3968 int err = -EINVAL;
3969 int i;
3970
3971 if (vma->vm_pgoff)
3972 return -EINVAL;
3973
905db440 3974 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3975
3976 expected_size = 0;
3977 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3978 if (rb->pg_vec) {
3979 expected_size += rb->pg_vec_len
3980 * rb->pg_vec_pages
3981 * PAGE_SIZE;
3982 }
3983 }
3984
3985 if (expected_size == 0)
1da177e4 3986 goto out;
69e3c75f
JB
3987
3988 size = vma->vm_end - vma->vm_start;
3989 if (size != expected_size)
1da177e4
LT
3990 goto out;
3991
1da177e4 3992 start = vma->vm_start;
69e3c75f
JB
3993 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3994 if (rb->pg_vec == NULL)
3995 continue;
3996
3997 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3998 struct page *page;
3999 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4000 int pg_num;
4001
c56b4d90
CG
4002 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4003 page = pgv_to_page(kaddr);
69e3c75f
JB
4004 err = vm_insert_page(vma, start, page);
4005 if (unlikely(err))
4006 goto out;
4007 start += PAGE_SIZE;
0e3125c7 4008 kaddr += PAGE_SIZE;
69e3c75f 4009 }
4ebf0ae2 4010 }
1da177e4 4011 }
69e3c75f 4012
4ebf0ae2 4013 atomic_inc(&po->mapped);
1da177e4
LT
4014 vma->vm_ops = &packet_mmap_ops;
4015 err = 0;
4016
4017out:
905db440 4018 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4019 return err;
4020}
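/*
 * Illustrative user-space sketch (not part of this file): mapping the ring
 * and consuming frames the way packet_mmap() and packet_poll() above expect.
 * Assumes the TPACKET_V2 layout and a geometry where the block size is an
 * exact multiple of the frame size (true for the sketch after
 * packet_set_ring()), so frames can be addressed as one contiguous array.
 * The rx_loop() helper name is made up; a production consumer would also add
 * memory barriers around the tp_status accesses.
 */
#include <poll.h>
#include <stdio.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void rx_loop(int fd, unsigned int frame_nr, unsigned int frame_sz)
{
	size_t map_len = (size_t)frame_nr * frame_sz;
	char *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	unsigned int idx = 0;

	if (ring == MAP_FAILED) {
		perror("mmap");
		return;
	}

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)idx * frame_sz);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			/* Nothing ready; packet_poll() will wake us up. */
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* Packet data starts tp_mac bytes into the slot. */
		printf("frame %u: %u bytes\n", idx, hdr->tp_len);

		/* Hand the slot back to the kernel. */
		hdr->tp_status = TP_STATUS_KERNEL;
		idx = (idx + 1) % frame_nr;
	}
}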
1da177e4 4021
90ddc4f0 4022static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4023 .family = PF_PACKET,
4024 .owner = THIS_MODULE,
4025 .release = packet_release,
4026 .bind = packet_bind_spkt,
4027 .connect = sock_no_connect,
4028 .socketpair = sock_no_socketpair,
4029 .accept = sock_no_accept,
4030 .getname = packet_getname_spkt,
4031 .poll = datagram_poll,
4032 .ioctl = packet_ioctl,
4033 .listen = sock_no_listen,
4034 .shutdown = sock_no_shutdown,
4035 .setsockopt = sock_no_setsockopt,
4036 .getsockopt = sock_no_getsockopt,
4037 .sendmsg = packet_sendmsg_spkt,
4038 .recvmsg = packet_recvmsg,
4039 .mmap = sock_no_mmap,
4040 .sendpage = sock_no_sendpage,
4041};
1da177e4 4042
90ddc4f0 4043static const struct proto_ops packet_ops = {
1da177e4
LT
4044 .family = PF_PACKET,
4045 .owner = THIS_MODULE,
4046 .release = packet_release,
4047 .bind = packet_bind,
4048 .connect = sock_no_connect,
4049 .socketpair = sock_no_socketpair,
4050 .accept = sock_no_accept,
1ce4f28b 4051 .getname = packet_getname,
1da177e4
LT
4052 .poll = packet_poll,
4053 .ioctl = packet_ioctl,
4054 .listen = sock_no_listen,
4055 .shutdown = sock_no_shutdown,
4056 .setsockopt = packet_setsockopt,
4057 .getsockopt = packet_getsockopt,
4058 .sendmsg = packet_sendmsg,
4059 .recvmsg = packet_recvmsg,
4060 .mmap = packet_mmap,
4061 .sendpage = sock_no_sendpage,
4062};
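/*
 * Two ops tables: packet_ops_spkt serves the legacy SOCK_PACKET sockets and
 * deliberately stubs out mmap, setsockopt and the ring-aware poll, while
 * packet_ops (SOCK_RAW/SOCK_DGRAM on PF_PACKET) wires in packet_poll,
 * packet_setsockopt/getsockopt and packet_mmap for the TPACKET rings above.
 */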
4063
ec1b4cf7 4064static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4065 .family = PF_PACKET,
4066 .create = packet_create,
4067 .owner = THIS_MODULE,
4068};
4069
4070static struct notifier_block packet_netdev_notifier = {
40d4e3df 4071 .notifier_call = packet_notifier,
1da177e4
LT
4072};
4073
4074#ifdef CONFIG_PROC_FS
1da177e4
LT
4075
4076static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4077 __acquires(RCU)
1da177e4 4078{
e372c414 4079 struct net *net = seq_file_net(seq);
808f5114 4080
4081 rcu_read_lock();
4082 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4083}
4084
4085static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4086{
1bf40954 4087 struct net *net = seq_file_net(seq);
808f5114 4088 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4089}
4090
4091static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4092 __releases(RCU)
1da177e4 4093{
808f5114 4094 rcu_read_unlock();
1da177e4
LT
4095}
4096
1ce4f28b 4097static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4098{
4099 if (v == SEQ_START_TOKEN)
4100 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4101 else {
b7ceabd9 4102 struct sock *s = sk_entry(v);
1da177e4
LT
4103 const struct packet_sock *po = pkt_sk(s);
4104
4105 seq_printf(seq,
71338aa7 4106 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4107 s,
4108 atomic_read(&s->sk_refcnt),
4109 s->sk_type,
4110 ntohs(po->num),
4111 po->ifindex,
4112 po->running,
4113 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4114 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4115 sock_i_ino(s));
1da177e4
LT
4116 }
4117
4118 return 0;
4119}
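/*
 * The seq_file above backs /proc/net/packet (registered per network namespace
 * in packet_net_init() below): one line per packet socket showing its type,
 * protocol, bound ifindex, running state, receive memory, owner and inode.
 */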
4120
56b3d975 4121static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4122 .start = packet_seq_start,
4123 .next = packet_seq_next,
4124 .stop = packet_seq_stop,
4125 .show = packet_seq_show,
4126};
4127
4128static int packet_seq_open(struct inode *inode, struct file *file)
4129{
e372c414
DL
4130 return seq_open_net(inode, file, &packet_seq_ops,
4131 sizeof(struct seq_net_private));
1da177e4
LT
4132}
4133
da7071d7 4134static const struct file_operations packet_seq_fops = {
1da177e4
LT
4135 .owner = THIS_MODULE,
4136 .open = packet_seq_open,
4137 .read = seq_read,
4138 .llseek = seq_lseek,
e372c414 4139 .release = seq_release_net,
1da177e4
LT
4140};
4141
4142#endif
4143
2c8c1e72 4144static int __net_init packet_net_init(struct net *net)
d12d01d6 4145{
0fa7fa98 4146 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4147 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4148
d4beaa66 4149 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4150 return -ENOMEM;
4151
4152 return 0;
4153}
4154
2c8c1e72 4155static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4156{
ece31ffd 4157 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4158}
4159
4160static struct pernet_operations packet_net_ops = {
4161 .init = packet_net_init,
4162 .exit = packet_net_exit,
4163};
4164
4165
1da177e4
LT
4166static void __exit packet_exit(void)
4167{
1da177e4 4168 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4169 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4170 sock_unregister(PF_PACKET);
4171 proto_unregister(&packet_proto);
4172}
4173
4174static int __init packet_init(void)
4175{
4176 int rc = proto_register(&packet_proto, 0);
4177
4178 if (rc != 0)
4179 goto out;
4180
4181 sock_register(&packet_family_ops);
d12d01d6 4182 register_pernet_subsys(&packet_net_ops);
1da177e4 4183 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4184out:
4185 return rc;
4186}
4187
4188module_init(packet_init);
4189module_exit(packet_exit);
4190MODULE_LICENSE("GPL");
4191MODULE_ALIAS_NETPROTO(PF_PACKET);