packet: make aligned size of struct tpacket{2,3}_hdr clear
net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/reciprocal_div.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

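/* Illustrative example (not part of the original source): the practical
 * consequence of the invariants above for userspace is that a
 * PF_PACKET/SOCK_RAW socket sees the link-layer header, while SOCK_DGRAM
 * does not, e.g.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	n = recvfrom(fd, buf, sizeof(buf), 0, ...);
 *	// on Ethernet, buf[0..13] is the 14-byte Ethernet header
 *
 * With SOCK_DGRAM the same read starts at the network header and the
 * link-layer information is only reported via the sockaddr_ll address.
 */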
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

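/* Illustrative layout (a sketch derived from the macros above, not part of
 * the original source): a TPACKET_V3 block starts with the block descriptor,
 * then the optional user-private area, then the packets:
 *
 *	pkblk_start --> +---------------------------+
 *			| tpacket_block_desc        |  BLK_HDR_LEN (8-byte aligned)
 *			+---------------------------+
 *			| tp_sizeof_priv bytes      |  ALIGN(tp_sizeof_priv, 8)
 *	nxt_offset ---> +---------------------------+  = BLK_PLUS_PRIV(tp_sizeof_priv)
 *			| tpacket3_hdr, frame, ...  |
 *			+---------------------------+
 *
 * e.g. BLK_PLUS_PRIV(13) == BLK_HDR_LEN + 16.
 */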
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

d346a3fa
DB
240static int packet_direct_xmit(struct sk_buff *skb)
241{
242 struct net_device *dev = skb->dev;
243 const struct net_device_ops *ops = dev->netdev_ops;
244 netdev_features_t features;
245 struct netdev_queue *txq;
246 u16 queue_map;
247 int ret;
248
249 if (unlikely(!netif_running(dev) ||
250 !netif_carrier_ok(dev))) {
251 kfree_skb(skb);
252 return NET_XMIT_DROP;
253 }
254
255 features = netif_skb_features(skb);
256 if (skb_needs_linearize(skb, features) &&
257 __skb_linearize(skb)) {
258 kfree_skb(skb);
259 return NET_XMIT_DROP;
260 }
261
262 queue_map = skb_get_queue_mapping(skb);
263 txq = netdev_get_tx_queue(dev, queue_map);
264
265 __netif_tx_lock_bh(txq);
266 if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
267 ret = NETDEV_TX_BUSY;
268 kfree_skb(skb);
269 goto out;
270 }
271
272 ret = ops->ndo_start_xmit(skb, dev);
273 if (likely(dev_xmit_complete(ret)))
274 txq_trans_update(txq);
275 else
276 kfree_skb(skb);
277out:
278 __netif_tx_unlock_bh(txq);
279 return ret;
280}
281
66e56cd4
DB
282static struct net_device *packet_cached_dev_get(struct packet_sock *po)
283{
284 struct net_device *dev;
285
286 rcu_read_lock();
287 dev = rcu_dereference(po->cached_dev);
288 if (likely(dev))
289 dev_hold(dev);
290 rcu_read_unlock();
291
292 return dev;
293}
294
295static void packet_cached_dev_assign(struct packet_sock *po,
296 struct net_device *dev)
297{
298 rcu_assign_pointer(po->cached_dev, dev);
299}
300
301static void packet_cached_dev_reset(struct packet_sock *po)
302{
303 RCU_INIT_POINTER(po->cached_dev, NULL);
304}
305
d346a3fa
DB
306static bool packet_use_direct_xmit(const struct packet_sock *po)
307{
308 return po->xmit == packet_direct_xmit;
309}
310
311static u16 packet_pick_tx_queue(struct net_device *dev)
312{
1cbac010 313 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
314}
315
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
320static void register_prot_hook(struct sock *sk)
321{
322 struct packet_sock *po = pkt_sk(sk);
e40526cb 323
ce06b03e 324 if (!po->running) {
66e56cd4 325 if (po->fanout)
dc99f600 326 __fanout_link(sk, po);
66e56cd4 327 else
dc99f600 328 dev_add_pack(&po->prot_hook);
e40526cb 329
ce06b03e
DM
330 sock_hold(sk);
331 po->running = 1;
332 }
333}
334
/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
342static void __unregister_prot_hook(struct sock *sk, bool sync)
343{
344 struct packet_sock *po = pkt_sk(sk);
345
346 po->running = 0;
66e56cd4
DB
347
348 if (po->fanout)
dc99f600 349 __fanout_unlink(sk, po);
66e56cd4 350 else
dc99f600 351 __dev_remove_pack(&po->prot_hook);
e40526cb 352
ce06b03e
DM
353 __sock_put(sk);
354
355 if (sync) {
356 spin_unlock(&po->bind_lock);
357 synchronize_net();
358 spin_lock(&po->bind_lock);
359 }
360}
361
362static void unregister_prot_hook(struct sock *sk, bool sync)
363{
364 struct packet_sock *po = pkt_sk(sk);
365
366 if (po->running)
367 __unregister_prot_hook(sk, sync);
368}
369
f6dafa95 370static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
371{
372 if (is_vmalloc_addr(addr))
373 return vmalloc_to_page(addr);
374 return virt_to_page(addr);
375}
376
69e3c75f 377static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 378{
184f489e 379 union tpacket_uhdr h;
1da177e4 380
69e3c75f 381 h.raw = frame;
bbd6ef87
PM
382 switch (po->tp_version) {
383 case TPACKET_V1:
69e3c75f 384 h.h1->tp_status = status;
0af55bb5 385 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
386 break;
387 case TPACKET_V2:
69e3c75f 388 h.h2->tp_status = status;
0af55bb5 389 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 390 break;
f6fb8f10 391 case TPACKET_V3:
69e3c75f 392 default:
f6fb8f10 393 WARN(1, "TPACKET version not supported.\n");
69e3c75f 394 BUG();
bbd6ef87 395 }
69e3c75f
JB
396
397 smp_wmb();
bbd6ef87
PM
398}
399
69e3c75f 400static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 401{
184f489e 402 union tpacket_uhdr h;
bbd6ef87 403
69e3c75f
JB
404 smp_rmb();
405
bbd6ef87
PM
406 h.raw = frame;
407 switch (po->tp_version) {
408 case TPACKET_V1:
0af55bb5 409 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 410 return h.h1->tp_status;
bbd6ef87 411 case TPACKET_V2:
0af55bb5 412 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 413 return h.h2->tp_status;
f6fb8f10 414 case TPACKET_V3:
69e3c75f 415 default:
f6fb8f10 416 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
417 BUG();
418 return 0;
bbd6ef87 419 }
1da177e4 420}
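/* Illustrative userspace counterpart (a sketch, not from this file): for the
 * V1/V2 rings, tp_status is the producer/consumer handshake word.  A typical
 * reader on a mapped V2 ring does roughly:
 *
 *	struct tpacket2_hdr *hdr = frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel side
 *	...consume the frame...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *	__sync_synchronize();
 *
 * which mirrors the smp_wmb()/smp_rmb() pairing above.
 */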
69e3c75f 421
b9c32fb2
DB
422static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
423 unsigned int flags)
7a51384c
DB
424{
425 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
426
427 if (shhwtstamps) {
428 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
429 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 430 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
431 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
432 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 433 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
434 }
435
436 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 437 return TP_STATUS_TS_SOFTWARE;
7a51384c 438
b9c32fb2 439 return 0;
7a51384c
DB
440}
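/* Illustrative example (not from this file): the flags tested above come from
 * the PACKET_TIMESTAMP socket option, e.g.
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * and the returned TP_STATUS_TS_* bit is OR-ed into tp_status so userspace
 * can tell which clock produced tp_sec/tp_nsec.
 */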
441
b9c32fb2
DB
442static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
443 struct sk_buff *skb)
2e31396f
WB
444{
445 union tpacket_uhdr h;
446 struct timespec ts;
b9c32fb2 447 __u32 ts_status;
2e31396f 448
b9c32fb2
DB
449 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
450 return 0;
2e31396f
WB
451
452 h.raw = frame;
453 switch (po->tp_version) {
454 case TPACKET_V1:
455 h.h1->tp_sec = ts.tv_sec;
456 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
457 break;
458 case TPACKET_V2:
459 h.h2->tp_sec = ts.tv_sec;
460 h.h2->tp_nsec = ts.tv_nsec;
461 break;
462 case TPACKET_V3:
463 default:
464 WARN(1, "TPACKET version not supported.\n");
465 BUG();
466 }
467
468 /* one flush is safe, as both fields always lie on the same cacheline */
469 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
470 smp_wmb();
b9c32fb2
DB
471
472 return ts_status;
2e31396f
WB
473}
474
69e3c75f
JB
475static void *packet_lookup_frame(struct packet_sock *po,
476 struct packet_ring_buffer *rb,
477 unsigned int position,
478 int status)
479{
480 unsigned int pg_vec_pos, frame_offset;
184f489e 481 union tpacket_uhdr h;
69e3c75f
JB
482
483 pg_vec_pos = position / rb->frames_per_block;
484 frame_offset = position % rb->frames_per_block;
485
0e3125c7
NH
486 h.raw = rb->pg_vec[pg_vec_pos].buffer +
487 (frame_offset * rb->frame_size);
69e3c75f
JB
488
489 if (status != __packet_get_status(po, h.raw))
490 return NULL;
491
492 return h.raw;
493}
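/* Worked example (illustrative numbers only): with tp_block_size = 4096 and
 * tp_frame_size = 2048, frames_per_block = 2.  Looking up frame number 5
 * gives pg_vec_pos = 5 / 2 = 2 and frame_offset = 5 % 2 = 1, i.e. the frame
 * lives at pg_vec[2].buffer + 1 * 2048.
 */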
494
eea49cc9 495static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
496 struct packet_ring_buffer *rb,
497 int status)
498{
499 return packet_lookup_frame(po, rb, rb->head, status);
500}
501
bc59ba39 502static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 503{
504 del_timer_sync(&pkc->retire_blk_timer);
505}
506
507static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
508 int tx_ring,
509 struct sk_buff_head *rb_queue)
510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
22781a5b
DJ
513 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
514 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 515
ec6f809f 516 spin_lock_bh(&rb_queue->lock);
f6fb8f10 517 pkc->delete_blk_timer = 1;
ec6f809f 518 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 519
520 prb_del_retire_blk_timer(pkc);
521}
522
523static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 524 struct tpacket_kbdq_core *pkc,
f6fb8f10 525 void (*func) (unsigned long))
526{
527 init_timer(&pkc->retire_blk_timer);
528 pkc->retire_blk_timer.data = (long)po;
529 pkc->retire_blk_timer.function = func;
530 pkc->retire_blk_timer.expires = jiffies;
531}
532
533static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
534{
bc59ba39 535 struct tpacket_kbdq_core *pkc;
f6fb8f10 536
537 if (tx_ring)
538 BUG();
539
22781a5b
DJ
540 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
541 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 542 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
543}
544
545static int prb_calc_retire_blk_tmo(struct packet_sock *po,
546 int blk_size_in_bytes)
547{
548 struct net_device *dev;
549 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
550 struct ethtool_cmd ecmd;
551 int err;
e440cf2c 552 u32 speed;
f6fb8f10 553
4bc71cb9
JP
554 rtnl_lock();
555 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
556 if (unlikely(!dev)) {
557 rtnl_unlock();
f6fb8f10 558 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
559 }
560 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 561 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
562 rtnl_unlock();
563 if (!err) {
4bc71cb9
JP
564 /*
565 * If the link speed is so slow you don't really
566 * need to worry about perf anyways
567 */
e440cf2c 568 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 569 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 570 } else {
571 msec = 1;
572 div = speed / 1000;
f6fb8f10 573 }
574 }
575
576 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
577
578 if (div)
579 mbits /= div;
580
581 tmo = mbits * msec;
582
583 if (div)
584 return tmo+1;
585 return tmo;
586}
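/* Worked example (illustrative numbers only): for a 1 MiB block on a
 * 1 Gbit/s link, mbits = (1048576 * 8) / (1024 * 1024) = 8, div = 1000/1000
 * = 1 and msec = 1, so tmo = 8 and the function returns 9, i.e. a ~9 ms
 * retire timeout -- consistent with the "~8 ms to fill a block" estimate in
 * the timer comment further down.
 */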
587
bc59ba39 588static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 589 union tpacket_req_u *req_u)
590{
591 p1->feature_req_word = req_u->req3.tp_feature_req_word;
592}
593
594static void init_prb_bdqc(struct packet_sock *po,
595 struct packet_ring_buffer *rb,
596 struct pgv *pg_vec,
597 union tpacket_req_u *req_u, int tx_ring)
598{
22781a5b 599 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 600 struct tpacket_block_desc *pbd;
f6fb8f10 601
602 memset(p1, 0x0, sizeof(*p1));
603
604 p1->knxt_seq_num = 1;
605 p1->pkbdq = pg_vec;
bc59ba39 606 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 607 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 608 p1->kblk_size = req_u->req3.tp_block_size;
609 p1->knum_blocks = req_u->req3.tp_block_nr;
610 p1->hdrlen = po->tp_hdrlen;
611 p1->version = po->tp_version;
612 p1->last_kactive_blk_num = 0;
ee80fbf3 613 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 614 if (req_u->req3.tp_retire_blk_tov)
615 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
616 else
617 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
618 req_u->req3.tp_block_size);
619 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
620 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
621
622 prb_init_ft_ops(p1, req_u);
623 prb_setup_retire_blk_timer(po, tx_ring);
624 prb_open_block(p1, pbd);
625}
626
627/* Do NOT update the last_blk_num first.
628 * Assumes sk_buff_head lock is held.
629 */
bc59ba39 630static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 631{
632 mod_timer(&pkc->retire_blk_timer,
633 jiffies + pkc->tov_in_jiffies);
634 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
635}
636
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every 8 msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
660static void prb_retire_rx_blk_timer_expired(unsigned long data)
661{
662 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 663 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 664 unsigned int frozen;
bc59ba39 665 struct tpacket_block_desc *pbd;
f6fb8f10 666
667 spin_lock(&po->sk.sk_receive_queue.lock);
668
669 frozen = prb_queue_frozen(pkc);
670 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
671
672 if (unlikely(pkc->delete_blk_timer))
673 goto out;
674
675 /* We only need to plug the race when the block is partially filled.
676 * tpacket_rcv:
677 * lock(); increment BLOCK_NUM_PKTS; unlock()
678 * copy_bits() is in progress ...
679 * timer fires on other cpu:
680 * we can't retire the current block because copy_bits
681 * is in progress.
682 *
683 */
684 if (BLOCK_NUM_PKTS(pbd)) {
685 while (atomic_read(&pkc->blk_fill_in_prog)) {
686 /* Waiting for skb_copy_bits to finish... */
687 cpu_relax();
688 }
689 }
690
691 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
692 if (!frozen) {
693 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
694 if (!prb_dispatch_next_block(pkc, po))
695 goto refresh_timer;
696 else
697 goto out;
698 } else {
699 /* Case 1. Queue was frozen because user-space was
700 * lagging behind.
701 */
702 if (prb_curr_blk_in_use(pkc, pbd)) {
703 /*
704 * Ok, user-space is still behind.
705 * So just refresh the timer.
706 */
707 goto refresh_timer;
		} else {
			/* Case 2. The queue was frozen, user-space caught up,
			 * now the link went idle && the timer fired.
			 * We don't have a block to close. So we open this
			 * block and restart the timer.
			 * Opening a block thaws the queue and restarts the
			 * timer; thawing/timer-refresh is a side effect.
			 */
716 prb_open_block(pkc, pbd);
717 goto out;
718 }
719 }
720 }
721
722refresh_timer:
723 _prb_refresh_rx_retire_blk_timer(pkc);
724
725out:
726 spin_unlock(&po->sk.sk_receive_queue.lock);
727}
728
eea49cc9 729static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 730 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 731{
732 /* Flush everything minus the block header */
733
734#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
735 u8 *start, *end;
736
737 start = (u8 *)pbd1;
738
739 /* Skip the block header(we know header WILL fit in 4K) */
740 start += PAGE_SIZE;
741
742 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
743 for (; start < end; start += PAGE_SIZE)
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748
749 /* Now update the block status. */
750
751 BLOCK_STATUS(pbd1) = status;
752
753 /* Flush the block header */
754
755#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
756 start = (u8 *)pbd1;
757 flush_dcache_page(pgv_to_page(start));
758
759 smp_wmb();
760#endif
761}
762
763/*
764 * Side effect:
765 *
766 * 1) flush the block
767 * 2) Increment active_blk_num
768 *
769 * Note:We DONT refresh the timer on purpose.
770 * Because almost always the next block will be opened.
771 */
bc59ba39 772static void prb_close_block(struct tpacket_kbdq_core *pkc1,
773 struct tpacket_block_desc *pbd1,
f6fb8f10 774 struct packet_sock *po, unsigned int stat)
775{
776 __u32 status = TP_STATUS_USER | stat;
777
778 struct tpacket3_hdr *last_pkt;
bc59ba39 779 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 780
ee80fbf3 781 if (po->stats.stats3.tp_drops)
f6fb8f10 782 status |= TP_STATUS_LOSING;
783
784 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
785 last_pkt->tp_next_offset = 0;
786
787 /* Get the ts of the last pkt */
788 if (BLOCK_NUM_PKTS(pbd1)) {
789 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
790 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
791 } else {
792 /* Ok, we tmo'd - so get the current time */
793 struct timespec ts;
794 getnstimeofday(&ts);
795 h1->ts_last_pkt.ts_sec = ts.tv_sec;
796 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
797 }
798
799 smp_wmb();
800
801 /* Flush the block */
802 prb_flush_block(pkc1, pbd1, status);
803
804 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
805}
806
eea49cc9 807static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 808{
809 pkc->reset_pending_on_curr_blk = 0;
810}
811
812/*
813 * Side effect of opening a block:
814 *
815 * 1) prb_queue is thawed.
816 * 2) retire_blk_timer is refreshed.
817 *
818 */
bc59ba39 819static void prb_open_block(struct tpacket_kbdq_core *pkc1,
820 struct tpacket_block_desc *pbd1)
f6fb8f10 821{
822 struct timespec ts;
bc59ba39 823 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 824
825 smp_rmb();
826
8da3056c
DB
827 /* We could have just memset this but we will lose the
828 * flexibility of making the priv area sticky
829 */
f6fb8f10 830
8da3056c
DB
831 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
832 BLOCK_NUM_PKTS(pbd1) = 0;
833 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 834
8da3056c
DB
835 getnstimeofday(&ts);
836
837 h1->ts_first_pkt.ts_sec = ts.tv_sec;
838 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 839
8da3056c
DB
840 pkc1->pkblk_start = (char *)pbd1;
841 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
842
843 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
844 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
845
846 pbd1->version = pkc1->version;
847 pkc1->prev = pkc1->nxt_offset;
848 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
849
850 prb_thaw_queue(pkc1);
851 _prb_refresh_rx_retire_blk_timer(pkc1);
852
853 smp_wmb();
f6fb8f10 854}
855
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
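/* Illustrative userspace counterpart (a sketch, not from this file): the
 * "in use by user-space" state above corresponds to a reader that walks a
 * block and only then flips its status back, roughly:
 *
 *	struct tpacket_block_desc *bd = ring + i * req3.tp_block_size;
 *	if (bd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		...walk bd->hdr.bh1.num_pkts frames via tp_next_offset...
 *		bd->hdr.bh1.block_status = TP_STATUS_KERNEL;
 *		__sync_synchronize();
 *	}
 *
 * `ring', `i' and `req3' are assumed to come from the application's own
 * PACKET_RX_RING setup.
 */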
eea49cc9 879static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 880 struct packet_sock *po)
881{
882 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 883 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 884}
885
886#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
887
888/*
889 * If the next block is free then we will dispatch it
890 * and return a good offset.
891 * Else, we will freeze the queue.
892 * So, caller must check the return value.
893 */
bc59ba39 894static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 895 struct packet_sock *po)
896{
bc59ba39 897 struct tpacket_block_desc *pbd;
f6fb8f10 898
899 smp_rmb();
900
901 /* 1. Get current block num */
902 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
903
904 /* 2. If this block is currently in_use then freeze the queue */
905 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
906 prb_freeze_queue(pkc, po);
907 return NULL;
908 }
909
910 /*
911 * 3.
912 * open this block and return the offset where the first packet
913 * needs to get stored.
914 */
915 prb_open_block(pkc, pbd);
916 return (void *)pkc->nxt_offset;
917}
918
bc59ba39 919static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 920 struct packet_sock *po, unsigned int status)
921{
bc59ba39 922 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 923
924 /* retire/close the current block */
925 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
926 /*
927 * Plug the case where copy_bits() is in progress on
928 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
929 * have space to copy the pkt in the current block and
930 * called prb_retire_current_block()
931 *
932 * We don't need to worry about the TMO case because
933 * the timer-handler already handled this case.
934 */
935 if (!(status & TP_STATUS_BLK_TMO)) {
936 while (atomic_read(&pkc->blk_fill_in_prog)) {
937 /* Waiting for skb_copy_bits to finish... */
938 cpu_relax();
939 }
940 }
941 prb_close_block(pkc, pbd, po, status);
942 return;
943 }
f6fb8f10 944}
945
eea49cc9 946static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 947 struct tpacket_block_desc *pbd)
f6fb8f10 948{
949 return TP_STATUS_USER & BLOCK_STATUS(pbd);
950}
951
eea49cc9 952static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 953{
954 return pkc->reset_pending_on_curr_blk;
955}
956
eea49cc9 957static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 958{
bc59ba39 959 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 960 atomic_dec(&pkc->blk_fill_in_prog);
961}
962
eea49cc9 963static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 964 struct tpacket3_hdr *ppd)
965{
3958afa1 966 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 967}
968
eea49cc9 969static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 970 struct tpacket3_hdr *ppd)
971{
972 ppd->hv1.tp_rxhash = 0;
973}
974
eea49cc9 975static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 976 struct tpacket3_hdr *ppd)
977{
978 if (vlan_tx_tag_present(pkc->skb)) {
979 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
980 ppd->tp_status = TP_STATUS_VLAN_VALID;
981 } else {
9e67030a 982 ppd->hv1.tp_vlan_tci = 0;
983 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 984 }
985}
986
bc59ba39 987static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 988 struct tpacket3_hdr *ppd)
989{
990 prb_fill_vlan_info(pkc, ppd);
991
992 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
993 prb_fill_rxhash(pkc, ppd);
994 else
995 prb_clear_rxhash(pkc, ppd);
996}
997
eea49cc9 998static void prb_fill_curr_block(char *curr,
bc59ba39 999 struct tpacket_kbdq_core *pkc,
1000 struct tpacket_block_desc *pbd,
f6fb8f10 1001 unsigned int len)
1002{
1003 struct tpacket3_hdr *ppd;
1004
1005 ppd = (struct tpacket3_hdr *)curr;
1006 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1007 pkc->prev = curr;
1008 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1009 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1010 BLOCK_NUM_PKTS(pbd) += 1;
1011 atomic_inc(&pkc->blk_fill_in_prog);
1012 prb_run_all_ft_ops(pkc, ppd);
1013}
1014
1015/* Assumes caller has the sk->rx_queue.lock */
1016static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1017 struct sk_buff *skb,
1018 int status,
1019 unsigned int len
1020 )
1021{
bc59ba39 1022 struct tpacket_kbdq_core *pkc;
1023 struct tpacket_block_desc *pbd;
f6fb8f10 1024 char *curr, *end;
1025
e3192690 1026 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1027 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1028
1029 /* Queue is frozen when user space is lagging behind */
1030 if (prb_queue_frozen(pkc)) {
1031 /*
1032 * Check if that last block which caused the queue to freeze,
1033 * is still in_use by user-space.
1034 */
1035 if (prb_curr_blk_in_use(pkc, pbd)) {
1036 /* Can't record this packet */
1037 return NULL;
1038 } else {
1039 /*
1040 * Ok, the block was released by user-space.
1041 * Now let's open that block.
1042 * opening a block also thaws the queue.
1043 * Thawing is a side effect.
1044 */
1045 prb_open_block(pkc, pbd);
1046 }
1047 }
1048
1049 smp_mb();
1050 curr = pkc->nxt_offset;
1051 pkc->skb = skb;
e3192690 1052 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1053
1054 /* first try the current block */
1055 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1056 prb_fill_curr_block(curr, pkc, pbd, len);
1057 return (void *)curr;
1058 }
1059
1060 /* Ok, close the current block */
1061 prb_retire_current_block(pkc, po, 0);
1062
1063 /* Now, try to dispatch the next block */
1064 curr = (char *)prb_dispatch_next_block(pkc, po);
1065 if (curr) {
1066 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1067 prb_fill_curr_block(curr, pkc, pbd, len);
1068 return (void *)curr;
1069 }
1070
	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
1075 return NULL;
1076}
1077
eea49cc9 1078static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1079 struct sk_buff *skb,
1080 int status, unsigned int len)
1081{
1082 char *curr = NULL;
1083 switch (po->tp_version) {
1084 case TPACKET_V1:
1085 case TPACKET_V2:
1086 curr = packet_lookup_frame(po, &po->rx_ring,
1087 po->rx_ring.head, status);
1088 return curr;
1089 case TPACKET_V3:
1090 return __packet_lookup_frame_in_block(po, skb, status, len);
1091 default:
1092 WARN(1, "TPACKET version not supported\n");
1093 BUG();
99aa3473 1094 return NULL;
f6fb8f10 1095 }
1096}
1097
eea49cc9 1098static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1099 struct packet_ring_buffer *rb,
77f65ebd 1100 unsigned int idx,
f6fb8f10 1101 int status)
1102{
bc59ba39 1103 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1104 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1105
1106 if (status != BLOCK_STATUS(pbd))
1107 return NULL;
1108 return pbd;
1109}
1110
eea49cc9 1111static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1112{
1113 unsigned int prev;
1114 if (rb->prb_bdqc.kactive_blk_num)
1115 prev = rb->prb_bdqc.kactive_blk_num-1;
1116 else
1117 prev = rb->prb_bdqc.knum_blocks-1;
1118 return prev;
1119}
1120
1121/* Assumes caller has held the rx_queue.lock */
eea49cc9 1122static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1123 struct packet_ring_buffer *rb,
1124 int status)
1125{
1126 unsigned int previous = prb_previous_blk_num(rb);
1127 return prb_lookup_block(po, rb, previous, status);
1128}
1129
eea49cc9 1130static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1131 struct packet_ring_buffer *rb,
1132 int status)
1133{
1134 if (po->tp_version <= TPACKET_V2)
1135 return packet_previous_frame(po, rb, status);
1136
1137 return __prb_previous_block(po, rb, status);
1138}
1139
eea49cc9 1140static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1141 struct packet_ring_buffer *rb)
1142{
1143 switch (po->tp_version) {
1144 case TPACKET_V1:
1145 case TPACKET_V2:
1146 return packet_increment_head(rb);
1147 case TPACKET_V3:
1148 default:
1149 WARN(1, "TPACKET version not supported.\n");
1150 BUG();
1151 return;
1152 }
1153}
1154
eea49cc9 1155static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1156 struct packet_ring_buffer *rb,
1157 int status)
1158{
1159 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1160 return packet_lookup_frame(po, rb, previous, status);
1161}
1162
eea49cc9 1163static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1164{
1165 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1166}
1167
77f65ebd
WB
1168static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1169{
1170 struct sock *sk = &po->sk;
1171 bool has_room;
1172
1173 if (po->prot_hook.func != tpacket_rcv)
1174 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1175 <= sk->sk_rcvbuf;
1176
1177 spin_lock(&sk->sk_receive_queue.lock);
1178 if (po->tp_version == TPACKET_V3)
1179 has_room = prb_lookup_block(po, &po->rx_ring,
1180 po->rx_ring.prb_bdqc.kactive_blk_num,
1181 TP_STATUS_KERNEL);
1182 else
1183 has_room = packet_lookup_frame(po, &po->rx_ring,
1184 po->rx_ring.head,
1185 TP_STATUS_KERNEL);
1186 spin_unlock(&sk->sk_receive_queue.lock);
1187
1188 return has_room;
1189}
1190
1da177e4
LT
1191static void packet_sock_destruct(struct sock *sk)
1192{
ed85b565
RC
1193 skb_queue_purge(&sk->sk_error_queue);
1194
547b792c
IJ
1195 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1196 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1197
1198 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1199 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1200 return;
1201 }
1202
17ab56a2 1203 sk_refcnt_debug_dec(sk);
1da177e4
LT
1204}
1205
dc99f600
DM
1206static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1207{
1208 int x = atomic_read(&f->rr_cur) + 1;
1209
1210 if (x >= num)
1211 x = 0;
1212
1213 return x;
1214}
1215
77f65ebd
WB
1216static unsigned int fanout_demux_hash(struct packet_fanout *f,
1217 struct sk_buff *skb,
1218 unsigned int num)
dc99f600 1219{
f55d112e 1220 return reciprocal_divide(skb->rxhash, num);
dc99f600
DM
1221}
1222
77f65ebd
WB
1223static unsigned int fanout_demux_lb(struct packet_fanout *f,
1224 struct sk_buff *skb,
1225 unsigned int num)
dc99f600
DM
1226{
1227 int cur, old;
1228
1229 cur = atomic_read(&f->rr_cur);
1230 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1231 fanout_rr_next(f, num))) != cur)
1232 cur = old;
77f65ebd
WB
1233 return cur;
1234}
1235
1236static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1237 struct sk_buff *skb,
1238 unsigned int num)
1239{
1240 return smp_processor_id() % num;
dc99f600
DM
1241}
1242
5df0ddfb
DB
1243static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1244 struct sk_buff *skb,
1245 unsigned int num)
1246{
1247 return reciprocal_divide(prandom_u32(), num);
1248}
1249
77f65ebd
WB
1250static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1251 struct sk_buff *skb,
1252 unsigned int idx, unsigned int skip,
1253 unsigned int num)
95ec3eb4 1254{
77f65ebd 1255 unsigned int i, j;
95ec3eb4 1256
77f65ebd
WB
1257 i = j = min_t(int, f->next[idx], num - 1);
1258 do {
1259 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1260 if (i != j)
1261 f->next[idx] = i;
1262 return i;
1263 }
1264 if (++i == num)
1265 i = 0;
1266 } while (i != j);
1267
1268 return idx;
1269}
1270
1271static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1272{
1273 return f->flags & (flag >> 8);
95ec3eb4
DM
1274}
1275
95ec3eb4
DM
1276static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1277 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1278{
1279 struct packet_fanout *f = pt->af_packet_priv;
1280 unsigned int num = f->num_members;
1281 struct packet_sock *po;
77f65ebd 1282 unsigned int idx;
dc99f600
DM
1283
1284 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1285 !num) {
1286 kfree_skb(skb);
1287 return 0;
1288 }
1289
95ec3eb4
DM
1290 switch (f->type) {
1291 case PACKET_FANOUT_HASH:
1292 default:
77f65ebd 1293 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1294 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1295 if (!skb)
1296 return 0;
1297 }
3958afa1 1298 skb_get_hash(skb);
77f65ebd 1299 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1300 break;
1301 case PACKET_FANOUT_LB:
77f65ebd 1302 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1303 break;
1304 case PACKET_FANOUT_CPU:
77f65ebd
WB
1305 idx = fanout_demux_cpu(f, skb, num);
1306 break;
5df0ddfb
DB
1307 case PACKET_FANOUT_RND:
1308 idx = fanout_demux_rnd(f, skb, num);
1309 break;
77f65ebd
WB
1310 case PACKET_FANOUT_ROLLOVER:
1311 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1312 break;
dc99f600
DM
1313 }
1314
77f65ebd
WB
1315 po = pkt_sk(f->arr[idx]);
1316 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1317 unlikely(!packet_rcv_has_room(po, skb))) {
1318 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1319 po = pkt_sk(f->arr[idx]);
1320 }
dc99f600
DM
1321
1322 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1323}
1324
fff3321d
PE
1325DEFINE_MUTEX(fanout_mutex);
1326EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1327static LIST_HEAD(fanout_list);
1328
1329static void __fanout_link(struct sock *sk, struct packet_sock *po)
1330{
1331 struct packet_fanout *f = po->fanout;
1332
1333 spin_lock(&f->lock);
1334 f->arr[f->num_members] = sk;
1335 smp_wmb();
1336 f->num_members++;
1337 spin_unlock(&f->lock);
1338}
1339
1340static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1341{
1342 struct packet_fanout *f = po->fanout;
1343 int i;
1344
1345 spin_lock(&f->lock);
1346 for (i = 0; i < f->num_members; i++) {
1347 if (f->arr[i] == sk)
1348 break;
1349 }
1350 BUG_ON(i >= f->num_members);
1351 f->arr[i] = f->arr[f->num_members - 1];
1352 f->num_members--;
1353 spin_unlock(&f->lock);
1354}
1355
a0dfb263 1356static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1357{
1358 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1359 return true;
1360
1361 return false;
1362}
1363
7736d33f 1364static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1365{
1366 struct packet_sock *po = pkt_sk(sk);
1367 struct packet_fanout *f, *match;
7736d33f 1368 u8 type = type_flags & 0xff;
77f65ebd 1369 u8 flags = type_flags >> 8;
dc99f600
DM
1370 int err;
1371
1372 switch (type) {
77f65ebd
WB
1373 case PACKET_FANOUT_ROLLOVER:
1374 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1375 return -EINVAL;
dc99f600
DM
1376 case PACKET_FANOUT_HASH:
1377 case PACKET_FANOUT_LB:
95ec3eb4 1378 case PACKET_FANOUT_CPU:
5df0ddfb 1379 case PACKET_FANOUT_RND:
dc99f600
DM
1380 break;
1381 default:
1382 return -EINVAL;
1383 }
1384
1385 if (!po->running)
1386 return -EINVAL;
1387
1388 if (po->fanout)
1389 return -EALREADY;
1390
1391 mutex_lock(&fanout_mutex);
1392 match = NULL;
1393 list_for_each_entry(f, &fanout_list, list) {
1394 if (f->id == id &&
1395 read_pnet(&f->net) == sock_net(sk)) {
1396 match = f;
1397 break;
1398 }
1399 }
afe62c68 1400 err = -EINVAL;
77f65ebd 1401 if (match && match->flags != flags)
afe62c68 1402 goto out;
dc99f600 1403 if (!match) {
afe62c68 1404 err = -ENOMEM;
dc99f600 1405 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1406 if (!match)
1407 goto out;
1408 write_pnet(&match->net, sock_net(sk));
1409 match->id = id;
1410 match->type = type;
77f65ebd 1411 match->flags = flags;
afe62c68
ED
1412 atomic_set(&match->rr_cur, 0);
1413 INIT_LIST_HEAD(&match->list);
1414 spin_lock_init(&match->lock);
1415 atomic_set(&match->sk_ref, 0);
1416 match->prot_hook.type = po->prot_hook.type;
1417 match->prot_hook.dev = po->prot_hook.dev;
1418 match->prot_hook.func = packet_rcv_fanout;
1419 match->prot_hook.af_packet_priv = match;
c0de08d0 1420 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1421 dev_add_pack(&match->prot_hook);
1422 list_add(&match->list, &fanout_list);
dc99f600 1423 }
afe62c68
ED
1424 err = -EINVAL;
1425 if (match->type == type &&
1426 match->prot_hook.type == po->prot_hook.type &&
1427 match->prot_hook.dev == po->prot_hook.dev) {
1428 err = -ENOSPC;
1429 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1430 __dev_remove_pack(&po->prot_hook);
1431 po->fanout = match;
1432 atomic_inc(&match->sk_ref);
1433 __fanout_link(sk, po);
1434 err = 0;
dc99f600
DM
1435 }
1436 }
afe62c68 1437out:
dc99f600
DM
1438 mutex_unlock(&fanout_mutex);
1439 return err;
1440}
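/* Illustrative example (group_id is an application-chosen 16-bit value):
 * userspace joins a fanout group by packing the id into the low 16 bits and
 * the type/flags into the high 16 bits of the option value, e.g.
 *
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * which reaches this function as fanout_add(sk, val & 0xffff, val >> 16).
 */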
1441
1442static void fanout_release(struct sock *sk)
1443{
1444 struct packet_sock *po = pkt_sk(sk);
1445 struct packet_fanout *f;
1446
1447 f = po->fanout;
1448 if (!f)
1449 return;
1450
fff3321d 1451 mutex_lock(&fanout_mutex);
dc99f600
DM
1452 po->fanout = NULL;
1453
dc99f600
DM
1454 if (atomic_dec_and_test(&f->sk_ref)) {
1455 list_del(&f->list);
1456 dev_remove_pack(&f->prot_hook);
1457 kfree(f);
1458 }
1459 mutex_unlock(&fanout_mutex);
1460}
1da177e4 1461
90ddc4f0 1462static const struct proto_ops packet_ops;
1da177e4 1463
90ddc4f0 1464static const struct proto_ops packet_ops_spkt;
1da177e4 1465
40d4e3df
ED
1466static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1467 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1468{
1469 struct sock *sk;
1470 struct sockaddr_pkt *spkt;
1471
1472 /*
1473 * When we registered the protocol we saved the socket in the data
1474 * field for just this event.
1475 */
1476
1477 sk = pt->af_packet_priv;
1ce4f28b 1478
1da177e4
LT
1479 /*
1480 * Yank back the headers [hope the device set this
1481 * right or kerboom...]
1482 *
1483 * Incoming packets have ll header pulled,
1484 * push it back.
1485 *
98e399f8 1486 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1487 * so that this procedure is noop.
1488 */
1489
1490 if (skb->pkt_type == PACKET_LOOPBACK)
1491 goto out;
1492
09ad9bc7 1493 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1494 goto out;
1495
40d4e3df
ED
1496 skb = skb_share_check(skb, GFP_ATOMIC);
1497 if (skb == NULL)
1da177e4
LT
1498 goto oom;
1499
1500 /* drop any routing info */
adf30907 1501 skb_dst_drop(skb);
1da177e4 1502
84531c24
PO
1503 /* drop conntrack reference */
1504 nf_reset(skb);
1505
ffbc6111 1506 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1507
98e399f8 1508 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1509
1510 /*
1511 * The SOCK_PACKET socket receives _all_ frames.
1512 */
1513
1514 spkt->spkt_family = dev->type;
1515 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1516 spkt->spkt_protocol = skb->protocol;
1517
1518 /*
1519 * Charge the memory to the socket. This is done specifically
1520 * to prevent sockets using all the memory up.
1521 */
1522
40d4e3df 1523 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1524 return 0;
1525
1526out:
1527 kfree_skb(skb);
1528oom:
1529 return 0;
1530}
1531
1532
1533/*
1534 * Output a raw packet to a device layer. This bypasses all the other
1535 * protocol layers and you must therefore supply it with a complete frame
1536 */
1ce4f28b 1537
1da177e4
LT
1538static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1539 struct msghdr *msg, size_t len)
1540{
1541 struct sock *sk = sock->sk;
40d4e3df 1542 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1543 struct sk_buff *skb = NULL;
1da177e4 1544 struct net_device *dev;
40d4e3df 1545 __be16 proto = 0;
1da177e4 1546 int err;
3bdc0eba 1547 int extra_len = 0;
1ce4f28b 1548
1da177e4 1549 /*
1ce4f28b 1550 * Get and verify the address.
1da177e4
LT
1551 */
1552
40d4e3df 1553 if (saddr) {
1da177e4 1554 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1555 return -EINVAL;
1556 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1557 proto = saddr->spkt_protocol;
1558 } else
1559 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1560
1561 /*
1ce4f28b 1562 * Find the device first to size check it
1da177e4
LT
1563 */
1564
de74e92a 1565 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1566retry:
654d1f8a
ED
1567 rcu_read_lock();
1568 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1569 err = -ENODEV;
1570 if (dev == NULL)
1571 goto out_unlock;
1ce4f28b 1572
d5e76b0a
DM
1573 err = -ENETDOWN;
1574 if (!(dev->flags & IFF_UP))
1575 goto out_unlock;
1576
1da177e4 1577 /*
40d4e3df
ED
1578 * You may not queue a frame bigger than the mtu. This is the lowest level
1579 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1580 */
1ce4f28b 1581
3bdc0eba
BG
1582 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1583 if (!netif_supports_nofcs(dev)) {
1584 err = -EPROTONOSUPPORT;
1585 goto out_unlock;
1586 }
1587 extra_len = 4; /* We're doing our own CRC */
1588 }
1589
1da177e4 1590 err = -EMSGSIZE;
3bdc0eba 1591 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1592 goto out_unlock;
1593
1a35ca80
ED
1594 if (!skb) {
1595 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1596 int tlen = dev->needed_tailroom;
1a35ca80
ED
1597 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1598
1599 rcu_read_unlock();
4ce40912 1600 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1601 if (skb == NULL)
1602 return -ENOBUFS;
1603 /* FIXME: Save some space for broken drivers that write a hard
1604 * header at transmission time by themselves. PPP is the notable
1605 * one here. This should really be fixed at the driver level.
1606 */
1607 skb_reserve(skb, reserved);
1608 skb_reset_network_header(skb);
1609
1610 /* Try to align data part correctly */
1611 if (hhlen) {
1612 skb->data -= hhlen;
1613 skb->tail -= hhlen;
1614 if (len < hhlen)
1615 skb_reset_network_header(skb);
1616 }
1617 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1618 if (err)
1619 goto out_free;
1620 goto retry;
1da177e4
LT
1621 }
1622
3bdc0eba 1623 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1624 /* Earlier code assumed this would be a VLAN pkt,
1625 * double-check this now that we have the actual
1626 * packet in hand.
1627 */
1628 struct ethhdr *ehdr;
1629 skb_reset_mac_header(skb);
1630 ehdr = eth_hdr(skb);
1631 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1632 err = -EMSGSIZE;
1633 goto out_unlock;
1634 }
1635 }
1a35ca80 1636
1da177e4
LT
1637 skb->protocol = proto;
1638 skb->dev = dev;
1639 skb->priority = sk->sk_priority;
2d37a186 1640 skb->mark = sk->sk_mark;
bf84a010
DB
1641
1642 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1643
3bdc0eba
BG
1644 if (unlikely(extra_len == 4))
1645 skb->no_fcs = 1;
1646
40893fd0 1647 skb_probe_transport_header(skb, 0);
c1aad275 1648
1da177e4 1649 dev_queue_xmit(skb);
654d1f8a 1650 rcu_read_unlock();
40d4e3df 1651 return len;
1da177e4 1652
1da177e4 1653out_unlock:
654d1f8a 1654 rcu_read_unlock();
1a35ca80
ED
1655out_free:
1656 kfree_skb(skb);
1da177e4
LT
1657 return err;
1658}
1da177e4 1659
eea49cc9 1660static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1661 const struct sock *sk,
dbcb5855 1662 unsigned int res)
1da177e4
LT
1663{
1664 struct sk_filter *filter;
fda9ef5d 1665
80f8f102
ED
1666 rcu_read_lock();
1667 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1668 if (filter != NULL)
0a14842f 1669 res = SK_RUN_FILTER(filter, skb);
80f8f102 1670 rcu_read_unlock();
1da177e4 1671
dbcb5855 1672 return res;
1da177e4
LT
1673}
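/* Illustrative example (a sketch, not from this file): the filter consulted
 * above is normally installed from userspace with classic BPF, e.g.
 *
 *	struct sock_filter code[] = { ... };	// e.g. output of tcpdump -dd
 *	struct sock_fprog prog = {
 *		.len = sizeof(code) / sizeof(code[0]),
 *		.filter = code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * run_filter() then returns the snap length chosen by that program
 * (0 meaning "drop").
 */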
1674
/*
 * This function makes lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to its original state on exit,
 * we will not harm anyone.
 */
1686
40d4e3df
ED
1687static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1688 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1689{
1690 struct sock *sk;
1691 struct sockaddr_ll *sll;
1692 struct packet_sock *po;
40d4e3df 1693 u8 *skb_head = skb->data;
1da177e4 1694 int skb_len = skb->len;
dbcb5855 1695 unsigned int snaplen, res;
1da177e4
LT
1696
1697 if (skb->pkt_type == PACKET_LOOPBACK)
1698 goto drop;
1699
1700 sk = pt->af_packet_priv;
1701 po = pkt_sk(sk);
1702
09ad9bc7 1703 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1704 goto drop;
1705
1da177e4
LT
1706 skb->dev = dev;
1707
3b04ddde 1708 if (dev->header_ops) {
1da177e4 1709 /* The device has an explicit notion of ll header,
62ab0812
ED
1710 * exported to higher levels.
1711 *
1712 * Otherwise, the device hides details of its frame
1713 * structure, so that corresponding packet head is
1714 * never delivered to user.
1da177e4
LT
1715 */
1716 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1717 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1718 else if (skb->pkt_type == PACKET_OUTGOING) {
1719 /* Special case: outgoing packets have ll header at head */
bbe735e4 1720 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1721 }
1722 }
1723
1724 snaplen = skb->len;
1725
dbcb5855
DM
1726 res = run_filter(skb, sk, snaplen);
1727 if (!res)
fda9ef5d 1728 goto drop_n_restore;
dbcb5855
DM
1729 if (snaplen > res)
1730 snaplen = res;
1da177e4 1731
0fd7bac6 1732 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1733 goto drop_n_acct;
1734
1735 if (skb_shared(skb)) {
1736 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1737 if (nskb == NULL)
1738 goto drop_n_acct;
1739
1740 if (skb_head != skb->data) {
1741 skb->data = skb_head;
1742 skb->len = skb_len;
1743 }
abc4e4fa 1744 consume_skb(skb);
1da177e4
LT
1745 skb = nskb;
1746 }
1747
ffbc6111
HX
1748 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1749 sizeof(skb->cb));
1750
1751 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1752 sll->sll_family = AF_PACKET;
1753 sll->sll_hatype = dev->type;
1754 sll->sll_protocol = skb->protocol;
1755 sll->sll_pkttype = skb->pkt_type;
8032b464 1756 if (unlikely(po->origdev))
80feaacb
PWJ
1757 sll->sll_ifindex = orig_dev->ifindex;
1758 else
1759 sll->sll_ifindex = dev->ifindex;
1da177e4 1760
b95cce35 1761 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1762
ffbc6111 1763 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1764
1da177e4
LT
1765 if (pskb_trim(skb, snaplen))
1766 goto drop_n_acct;
1767
1768 skb_set_owner_r(skb, sk);
1769 skb->dev = NULL;
adf30907 1770 skb_dst_drop(skb);
1da177e4 1771
84531c24
PO
1772 /* drop conntrack reference */
1773 nf_reset(skb);
1774
1da177e4 1775 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1776 po->stats.stats1.tp_packets++;
3b885787 1777 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1778 __skb_queue_tail(&sk->sk_receive_queue, skb);
1779 spin_unlock(&sk->sk_receive_queue.lock);
1780 sk->sk_data_ready(sk, skb->len);
1781 return 0;
1782
1783drop_n_acct:
7091fbd8 1784 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1785 po->stats.stats1.tp_drops++;
7091fbd8
WB
1786 atomic_inc(&sk->sk_drops);
1787 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1788
1789drop_n_restore:
1790 if (skb_head != skb->data && skb_shared(skb)) {
1791 skb->data = skb_head;
1792 skb->len = skb_len;
1793 }
1794drop:
ead2ceb0 1795 consume_skb(skb);
1da177e4
LT
1796 return 0;
1797}
1798
40d4e3df
ED
1799static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1800 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1801{
1802 struct sock *sk;
1803 struct packet_sock *po;
1804 struct sockaddr_ll *sll;
184f489e 1805 union tpacket_uhdr h;
40d4e3df 1806 u8 *skb_head = skb->data;
1da177e4 1807 int skb_len = skb->len;
dbcb5855 1808 unsigned int snaplen, res;
f6fb8f10 1809 unsigned long status = TP_STATUS_USER;
bbd6ef87 1810 unsigned short macoff, netoff, hdrlen;
1da177e4 1811 struct sk_buff *copy_skb = NULL;
bbd6ef87 1812 struct timespec ts;
b9c32fb2 1813 __u32 ts_status;
1da177e4 1814
	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until the current aligned size is used up,
	 * without forcing userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

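	/* Worked example (assuming the uapi definition TPACKET_ALIGNMENT == 16):
	 * TPACKET_ALIGN() rounds sizeof() up to the next multiple of 16, so the
	 * asserts above pin the aligned header sizes at 32 and 48 bytes.  New
	 * members can therefore be appended until sizeof() itself would cross
	 * those bounds, at which point TPACKET2/3_HDRLEN -- and hence the ring
	 * layout userspace computes from PACKET_HDRLEN -- would change.
	 */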
1da177e4
LT
1822 if (skb->pkt_type == PACKET_LOOPBACK)
1823 goto drop;
1824
1825 sk = pt->af_packet_priv;
1826 po = pkt_sk(sk);
1827
09ad9bc7 1828 if (!net_eq(dev_net(dev), sock_net(sk)))
1829 goto drop;
1830
3b04ddde 1831 if (dev->header_ops) {
1da177e4 1832 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1833 skb_push(skb, skb->data - skb_mac_header(skb));
1834 else if (skb->pkt_type == PACKET_OUTGOING) {
1835 /* Special case: outgoing packets have ll header at head */
bbe735e4 1836 skb_pull(skb, skb_network_offset(skb));
1837 }
1838 }
1839
1840 if (skb->ip_summed == CHECKSUM_PARTIAL)
1841 status |= TP_STATUS_CSUMNOTREADY;
1842
1843 snaplen = skb->len;
1844
1845 res = run_filter(skb, sk, snaplen);
1846 if (!res)
fda9ef5d 1847 goto drop_n_restore;
1848 if (snaplen > res)
1849 snaplen = res;
1850
1851 if (sk->sk_type == SOCK_DGRAM) {
1852 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1853 po->tp_reserve;
1da177e4 1854 } else {
95c96174 1855 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1856 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1857 (maclen < 16 ? 16 : maclen)) +
1858 po->tp_reserve;
1859 macoff = netoff - maclen;
1860 }
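	/* netoff/macoff are offsets from the start of the ring frame at which
	 * the network header and link-layer header will be copied; both leave
	 * room for the tpacket header, the sockaddr_ll area and any
	 * user-configured tp_reserve padding.
	 */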
f6fb8f10 1861 if (po->tp_version <= TPACKET_V2) {
1862 if (macoff + snaplen > po->rx_ring.frame_size) {
1863 if (po->copy_thresh &&
0fd7bac6 1864 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1865 if (skb_shared(skb)) {
1866 copy_skb = skb_clone(skb, GFP_ATOMIC);
1867 } else {
1868 copy_skb = skb_get(skb);
1869 skb_head = skb->data;
1870 }
1871 if (copy_skb)
1872 skb_set_owner_r(copy_skb, sk);
1da177e4 1873 }
f6fb8f10 1874 snaplen = po->rx_ring.frame_size - macoff;
1875 if ((int)snaplen < 0)
1876 snaplen = 0;
1da177e4 1877 }
1da177e4 1878 }
1da177e4 1879 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1880 h.raw = packet_current_rx_frame(po, skb,
1881 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1882 if (!h.raw)
1da177e4 1883 goto ring_is_full;
f6fb8f10 1884 if (po->tp_version <= TPACKET_V2) {
1885 packet_increment_rx_head(po, &po->rx_ring);
 1886		/*
 1887		 * TP_STATUS_LOSING will be reported until the statistics are read,
 1888		 * because the drop counter is COR - Clear On Read.
 1889		 * Anyway, this is done for V1/V2 only, as V3 does not need it
 1890		 * at the per-packet level.
 1891		 */
ee80fbf3 1892 if (po->stats.stats1.tp_drops)
f6fb8f10 1893 status |= TP_STATUS_LOSING;
1894 }
ee80fbf3 1895 po->stats.stats1.tp_packets++;
1896 if (copy_skb) {
1897 status |= TP_STATUS_COPY;
1898 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1899 }
1900 spin_unlock(&sk->sk_receive_queue.lock);
1901
bbd6ef87 1902 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1903
1904 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1905 getnstimeofday(&ts);
1da177e4 1906
1907 status |= ts_status;
1908
1909 switch (po->tp_version) {
1910 case TPACKET_V1:
1911 h.h1->tp_len = skb->len;
1912 h.h1->tp_snaplen = snaplen;
1913 h.h1->tp_mac = macoff;
1914 h.h1->tp_net = netoff;
1915 h.h1->tp_sec = ts.tv_sec;
1916 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
1917 hdrlen = sizeof(*h.h1);
1918 break;
1919 case TPACKET_V2:
1920 h.h2->tp_len = skb->len;
1921 h.h2->tp_snaplen = snaplen;
1922 h.h2->tp_mac = macoff;
1923 h.h2->tp_net = netoff;
1924 h.h2->tp_sec = ts.tv_sec;
1925 h.h2->tp_nsec = ts.tv_nsec;
1926 if (vlan_tx_tag_present(skb)) {
1927 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1928 status |= TP_STATUS_VLAN_VALID;
1929 } else {
1930 h.h2->tp_vlan_tci = 0;
1931 }
13fcb7bd 1932 h.h2->tp_padding = 0;
1933 hdrlen = sizeof(*h.h2);
1934 break;
f6fb8f10 1935 case TPACKET_V3:
 1936		/* tp_nxt_offset and vlan are already populated above,
 1937		 * so do NOT clear those fields here.
 1938		 */
1939 h.h3->tp_status |= status;
1940 h.h3->tp_len = skb->len;
1941 h.h3->tp_snaplen = snaplen;
1942 h.h3->tp_mac = macoff;
1943 h.h3->tp_net = netoff;
f6fb8f10 1944 h.h3->tp_sec = ts.tv_sec;
1945 h.h3->tp_nsec = ts.tv_nsec;
1946 hdrlen = sizeof(*h.h3);
1947 break;
1948 default:
1949 BUG();
1950 }
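	/* The link-layer address is stored right behind the (aligned) tpacket
	 * header, which is where mmap()ed readers expect to find the
	 * sockaddr_ll for this frame.
	 */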
1da177e4 1951
bbd6ef87 1952 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1953 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1954 sll->sll_family = AF_PACKET;
1955 sll->sll_hatype = dev->type;
1956 sll->sll_protocol = skb->protocol;
1957 sll->sll_pkttype = skb->pkt_type;
8032b464 1958 if (unlikely(po->origdev))
1959 sll->sll_ifindex = orig_dev->ifindex;
1960 else
1961 sll->sll_ifindex = dev->ifindex;
1da177e4 1962
e16aa207 1963 smp_mb();
f6dafa95 1964#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1965 {
1966 u8 *start, *end;
1967
f6fb8f10 1968 if (po->tp_version <= TPACKET_V2) {
1969 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1970 + macoff + snaplen);
1971 for (start = h.raw; start < end; start += PAGE_SIZE)
1972 flush_dcache_page(pgv_to_page(start));
1973 }
cc9f01b2 1974 smp_wmb();
1da177e4 1975 }
f6dafa95 1976#endif
f6fb8f10 1977 if (po->tp_version <= TPACKET_V2)
1978 __packet_set_status(po, h.raw, status);
1979 else
1980 prb_clear_blk_fill_status(&po->rx_ring);
1981
1982 sk->sk_data_ready(sk, 0);
1983
1984drop_n_restore:
1985 if (skb_head != skb->data && skb_shared(skb)) {
1986 skb->data = skb_head;
1987 skb->len = skb_len;
1988 }
1989drop:
1ce4f28b 1990 kfree_skb(skb);
1991 return 0;
1992
1993ring_is_full:
ee80fbf3 1994 po->stats.stats1.tp_drops++;
1995 spin_unlock(&sk->sk_receive_queue.lock);
1996
1997 sk->sk_data_ready(sk, 0);
acb5d75b 1998 kfree_skb(copy_skb);
1999 goto drop_n_restore;
2000}
2001
69e3c75f
JB
2002static void tpacket_destruct_skb(struct sk_buff *skb)
2003{
2004 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 2005 void *ph;
1da177e4 2006
69e3c75f 2007 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
2008 __u32 ts;
2009
69e3c75f 2010 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
2011 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
2012 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
2013
2014 ts = __packet_set_timestamp(po, ph, skb);
2015 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2016 }
2017
2018 sock_wfree(skb);
2019}
2020
40d4e3df
ED
2021static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2022 void *frame, struct net_device *dev, int size_max,
ae641949 2023 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2024{
184f489e 2025 union tpacket_uhdr ph;
09effa67 2026 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2027 struct socket *sock = po->sk.sk_socket;
2028 struct page *page;
2029 void *data;
2030 int err;
2031
2032 ph.raw = frame;
2033
2034 skb->protocol = proto;
2035 skb->dev = dev;
2036 skb->priority = po->sk.sk_priority;
2d37a186 2037 skb->mark = po->sk.sk_mark;
2e31396f 2038 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2039 skb_shinfo(skb)->destructor_arg = ph.raw;
2040
2041 switch (po->tp_version) {
2042 case TPACKET_V2:
2043 tp_len = ph.h2->tp_len;
2044 break;
2045 default:
2046 tp_len = ph.h1->tp_len;
2047 break;
2048 }
09effa67
DM
2049 if (unlikely(tp_len > size_max)) {
2050 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2051 return -EMSGSIZE;
2052 }
69e3c75f 2053
ae641949 2054 skb_reserve(skb, hlen);
69e3c75f 2055 skb_reset_network_header(skb);
c1aad275 2056
d346a3fa
DB
2057 if (!packet_use_direct_xmit(po))
2058 skb_probe_transport_header(skb, 0);
2059 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2060 int off_min, off_max, off;
2061 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2062 off_max = po->tx_ring.frame_size - tp_len;
2063 if (sock->type == SOCK_DGRAM) {
2064 switch (po->tp_version) {
2065 case TPACKET_V2:
2066 off = ph.h2->tp_net;
2067 break;
2068 default:
2069 off = ph.h1->tp_net;
2070 break;
2071 }
2072 } else {
2073 switch (po->tp_version) {
2074 case TPACKET_V2:
2075 off = ph.h2->tp_mac;
2076 break;
2077 default:
2078 off = ph.h1->tp_mac;
2079 break;
2080 }
2081 }
2082 if (unlikely((off < off_min) || (off_max < off)))
2083 return -EINVAL;
2084 data = ph.raw + off;
2085 } else {
2086 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2087 }
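	/* 'data' now points at the payload supplied by userspace: either at
	 * the user-provided tp_net/tp_mac offset (PACKET_TX_HAS_OFF) or
	 * directly behind the tpacket header.
	 */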
2088 to_write = tp_len;
2089
2090 if (sock->type == SOCK_DGRAM) {
2091 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2092 NULL, tp_len);
2093 if (unlikely(err < 0))
2094 return -EINVAL;
40d4e3df 2095 } else if (dev->hard_header_len) {
69e3c75f
JB
2096 /* net device doesn't like empty head */
2097 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2098 pr_err("packet size is too short (%d < %d)\n",
2099 tp_len, dev->hard_header_len);
69e3c75f
JB
2100 return -EINVAL;
2101 }
2102
2103 skb_push(skb, dev->hard_header_len);
2104 err = skb_store_bits(skb, 0, data,
2105 dev->hard_header_len);
2106 if (unlikely(err))
2107 return err;
2108
2109 data += dev->hard_header_len;
2110 to_write -= dev->hard_header_len;
2111 }
2112
69e3c75f
JB
2113 offset = offset_in_page(data);
2114 len_max = PAGE_SIZE - offset;
2115 len = ((to_write > len_max) ? len_max : to_write);
2116
2117 skb->data_len = to_write;
2118 skb->len += to_write;
2119 skb->truesize += to_write;
2120 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2121
2122 while (likely(to_write)) {
2123 nr_frags = skb_shinfo(skb)->nr_frags;
2124
2125 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2126 pr_err("Packet exceed the number of skb frags(%lu)\n",
2127 MAX_SKB_FRAGS);
69e3c75f
JB
2128 return -EFAULT;
2129 }
2130
0af55bb5
CG
2131 page = pgv_to_page(data);
2132 data += len;
69e3c75f
JB
2133 flush_dcache_page(page);
2134 get_page(page);
0af55bb5 2135 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2136 to_write -= len;
2137 offset = 0;
2138 len_max = PAGE_SIZE;
2139 len = ((to_write > len_max) ? len_max : to_write);
2140 }
2141
2142 return tp_len;
2143}
2144
2145static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2146{
69e3c75f
JB
2147 struct sk_buff *skb;
2148 struct net_device *dev;
2149 __be16 proto;
09effa67 2150 int err, reserve = 0;
40d4e3df
ED
2151 void *ph;
2152 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2153 int tp_len, size_max;
2154 unsigned char *addr;
2155 int len_sum = 0;
9e67030a 2156 int status = TP_STATUS_AVAILABLE;
ae641949 2157 int hlen, tlen;
69e3c75f 2158
69e3c75f
JB
2159 mutex_lock(&po->pg_vec_lock);
2160
66e56cd4 2161 if (likely(saddr == NULL)) {
e40526cb 2162 dev = packet_cached_dev_get(po);
69e3c75f
JB
2163 proto = po->num;
2164 addr = NULL;
2165 } else {
2166 err = -EINVAL;
2167 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2168 goto out;
2169 if (msg->msg_namelen < (saddr->sll_halen
2170 + offsetof(struct sockaddr_ll,
2171 sll_addr)))
2172 goto out;
69e3c75f
JB
2173 proto = saddr->sll_protocol;
2174 addr = saddr->sll_addr;
827d9780 2175 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2176 }
2177
69e3c75f
JB
2178 err = -ENXIO;
2179 if (unlikely(dev == NULL))
2180 goto out;
69e3c75f
JB
2181 err = -ENETDOWN;
2182 if (unlikely(!(dev->flags & IFF_UP)))
2183 goto out_put;
2184
e40526cb
DB
2185 reserve = dev->hard_header_len;
2186
69e3c75f 2187 size_max = po->tx_ring.frame_size
b5dd884e 2188 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2189
09effa67
DM
2190 if (size_max > dev->mtu + reserve)
2191 size_max = dev->mtu + reserve;
2192
69e3c75f
JB
2193 do {
2194 ph = packet_current_frame(po, &po->tx_ring,
2195 TP_STATUS_SEND_REQUEST);
2196
2197 if (unlikely(ph == NULL)) {
2198 schedule();
2199 continue;
2200 }
2201
2202 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2203 hlen = LL_RESERVED_SPACE(dev);
2204 tlen = dev->needed_tailroom;
69e3c75f 2205 skb = sock_alloc_send_skb(&po->sk,
ae641949 2206 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2207 0, &err);
2208
2209 if (unlikely(skb == NULL))
2210 goto out_status;
2211
2212 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2213 addr, hlen);
69e3c75f
JB
2214
2215 if (unlikely(tp_len < 0)) {
2216 if (po->tp_loss) {
2217 __packet_set_status(po, ph,
2218 TP_STATUS_AVAILABLE);
2219 packet_increment_head(&po->tx_ring);
2220 kfree_skb(skb);
2221 continue;
2222 } else {
2223 status = TP_STATUS_WRONG_FORMAT;
2224 err = tp_len;
2225 goto out_status;
2226 }
2227 }
2228
d346a3fa 2229 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
69e3c75f
JB
2230 skb->destructor = tpacket_destruct_skb;
2231 __packet_set_status(po, ph, TP_STATUS_SENDING);
2232 atomic_inc(&po->tx_ring.pending);
2233
2234 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2235 err = po->xmit(skb);
eb70df13
JP
2236 if (unlikely(err > 0)) {
2237 err = net_xmit_errno(err);
2238 if (err && __packet_get_status(po, ph) ==
2239 TP_STATUS_AVAILABLE) {
2240 /* skb was destructed already */
2241 skb = NULL;
2242 goto out_status;
2243 }
2244 /*
2245 * skb was dropped but not destructed yet;
2246 * let's treat it like congestion or err < 0
2247 */
2248 err = 0;
2249 }
69e3c75f
JB
2250 packet_increment_head(&po->tx_ring);
2251 len_sum += tp_len;
f64f9e71
JP
2252 } while (likely((ph != NULL) ||
2253 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2254 (atomic_read(&po->tx_ring.pending))))
2255 );
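	/* The send loop above exits once no frame is left in
	 * TP_STATUS_SEND_REQUEST and, unless MSG_DONTWAIT was given, all
	 * previously queued frames have completed and been released.
	 */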
2256
2257 err = len_sum;
2258 goto out_put;
2259
69e3c75f
JB
2260out_status:
2261 __packet_set_status(po, ph, status);
2262 kfree_skb(skb);
2263out_put:
e40526cb 2264 dev_put(dev);
69e3c75f
JB
2265out:
2266 mutex_unlock(&po->pg_vec_lock);
2267 return err;
2268}
69e3c75f 2269
eea49cc9
OJ
2270static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2271 size_t reserve, size_t len,
2272 size_t linear, int noblock,
2273 int *err)
bfd5f4a3
SS
2274{
2275 struct sk_buff *skb;
2276
2277 /* Under a page? Don't bother with paged skb. */
2278 if (prepad + len < PAGE_SIZE || !linear)
2279 linear = len;
2280
2281 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2282 err, 0);
bfd5f4a3
SS
2283 if (!skb)
2284 return NULL;
2285
2286 skb_reserve(skb, reserve);
2287 skb_put(skb, linear);
2288 skb->data_len = len - linear;
2289 skb->len += len - linear;
2290
2291 return skb;
2292}
2293
d346a3fa 2294static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2295{
2296 struct sock *sk = sock->sk;
40d4e3df 2297 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2298 struct sk_buff *skb;
2299 struct net_device *dev;
0e11c91e 2300 __be16 proto;
1da177e4 2301 unsigned char *addr;
827d9780 2302 int err, reserve = 0;
bfd5f4a3
SS
2303 struct virtio_net_hdr vnet_hdr = { 0 };
2304 int offset = 0;
2305 int vnet_hdr_len;
2306 struct packet_sock *po = pkt_sk(sk);
2307 unsigned short gso_type = 0;
ae641949 2308 int hlen, tlen;
3bdc0eba 2309 int extra_len = 0;
1da177e4
LT
2310
2311 /*
1ce4f28b 2312 * Get and verify the address.
1da177e4 2313 */
1ce4f28b 2314
66e56cd4 2315 if (likely(saddr == NULL)) {
e40526cb 2316 dev = packet_cached_dev_get(po);
1da177e4
LT
2317 proto = po->num;
2318 addr = NULL;
2319 } else {
2320 err = -EINVAL;
2321 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2322 goto out;
0fb375fb
EB
2323 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2324 goto out;
1da177e4
LT
2325 proto = saddr->sll_protocol;
2326 addr = saddr->sll_addr;
827d9780 2327 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2328 }
2329
1da177e4 2330 err = -ENXIO;
e40526cb 2331 if (unlikely(dev == NULL))
1da177e4 2332 goto out_unlock;
d5e76b0a 2333 err = -ENETDOWN;
e40526cb 2334 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2335 goto out_unlock;
2336
e40526cb
DB
2337 if (sock->type == SOCK_RAW)
2338 reserve = dev->hard_header_len;
bfd5f4a3
SS
2339 if (po->has_vnet_hdr) {
2340 vnet_hdr_len = sizeof(vnet_hdr);
2341
2342 err = -EINVAL;
2343 if (len < vnet_hdr_len)
2344 goto out_unlock;
2345
2346 len -= vnet_hdr_len;
2347
2348 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2349 vnet_hdr_len);
2350 if (err < 0)
2351 goto out_unlock;
2352
2353 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2354 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2355 vnet_hdr.hdr_len))
2356 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2357 vnet_hdr.csum_offset + 2;
2358
2359 err = -EINVAL;
2360 if (vnet_hdr.hdr_len > len)
2361 goto out_unlock;
2362
2363 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2364 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2365 case VIRTIO_NET_HDR_GSO_TCPV4:
2366 gso_type = SKB_GSO_TCPV4;
2367 break;
2368 case VIRTIO_NET_HDR_GSO_TCPV6:
2369 gso_type = SKB_GSO_TCPV6;
2370 break;
2371 case VIRTIO_NET_HDR_GSO_UDP:
2372 gso_type = SKB_GSO_UDP;
2373 break;
2374 default:
2375 goto out_unlock;
2376 }
2377
2378 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2379 gso_type |= SKB_GSO_TCP_ECN;
2380
2381 if (vnet_hdr.gso_size == 0)
2382 goto out_unlock;
2383
2384 }
2385 }
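	/* If a virtio_net_hdr was supplied it has now been copied from the
	 * iovec and sanity-checked: hdr_len is extended to cover the checksum
	 * fields if necessary and must not exceed the data length, the GSO
	 * type must be one the stack understands, and a GSO packet must carry
	 * a non-zero gso_size.
	 */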
2386
3bdc0eba
BG
2387 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2388 if (!netif_supports_nofcs(dev)) {
2389 err = -EPROTONOSUPPORT;
2390 goto out_unlock;
2391 }
2392 extra_len = 4; /* We're doing our own CRC */
2393 }
2394
1da177e4 2395 err = -EMSGSIZE;
3bdc0eba 2396 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2397 goto out_unlock;
2398
bfd5f4a3 2399 err = -ENOBUFS;
ae641949
HX
2400 hlen = LL_RESERVED_SPACE(dev);
2401 tlen = dev->needed_tailroom;
2402 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2403 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2404 if (skb == NULL)
1da177e4
LT
2405 goto out_unlock;
2406
bfd5f4a3 2407 skb_set_network_header(skb, reserve);
1da177e4 2408
0c4e8581
SH
2409 err = -EINVAL;
2410 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2411 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2412 goto out_free;
1da177e4
LT
2413
2414 /* Returns -EFAULT on error */
bfd5f4a3 2415 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2416 if (err)
2417 goto out_free;
bf84a010
DB
2418
2419 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2420
3bdc0eba 2421 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2422 /* Earlier code assumed this would be a VLAN pkt,
2423 * double-check this now that we have the actual
2424 * packet in hand.
2425 */
2426 struct ethhdr *ehdr;
2427 skb_reset_mac_header(skb);
2428 ehdr = eth_hdr(skb);
2429 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2430 err = -EMSGSIZE;
2431 goto out_free;
2432 }
57f89bfa
BG
2433 }
2434
09effa67
DM
2435 skb->protocol = proto;
2436 skb->dev = dev;
1da177e4 2437 skb->priority = sk->sk_priority;
2d37a186 2438 skb->mark = sk->sk_mark;
d346a3fa 2439 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
1da177e4 2440
bfd5f4a3
SS
2441 if (po->has_vnet_hdr) {
2442 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2443 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2444 vnet_hdr.csum_offset)) {
2445 err = -EINVAL;
2446 goto out_free;
2447 }
2448 }
2449
2450 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2451 skb_shinfo(skb)->gso_type = gso_type;
2452
2453 /* Header must be checked, and gso_segs computed. */
2454 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2455 skb_shinfo(skb)->gso_segs = 0;
2456
2457 len += vnet_hdr_len;
2458 }
2459
d346a3fa
DB
2460 if (!packet_use_direct_xmit(po))
2461 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2462 if (unlikely(extra_len == 4))
2463 skb->no_fcs = 1;
2464
d346a3fa 2465 err = po->xmit(skb);
1da177e4
LT
2466 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2467 goto out_unlock;
2468
e40526cb 2469 dev_put(dev);
1da177e4 2470
40d4e3df 2471 return len;
1da177e4
LT
2472
2473out_free:
2474 kfree_skb(skb);
2475out_unlock:
e40526cb 2476 if (dev)
1da177e4
LT
2477 dev_put(dev);
2478out:
2479 return err;
2480}
2481
69e3c75f
JB
2482static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2483 struct msghdr *msg, size_t len)
2484{
69e3c75f
JB
2485 struct sock *sk = sock->sk;
2486 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2487
69e3c75f
JB
2488 if (po->tx_ring.pg_vec)
2489 return tpacket_snd(po, msg);
2490 else
69e3c75f
JB
2491 return packet_snd(sock, msg, len);
2492}
2493
1da177e4
LT
2494/*
2495 * Close a PACKET socket. This is fairly simple. We immediately go
2496 * to 'closed' state and remove our protocol entry in the device list.
2497 */
2498
2499static int packet_release(struct socket *sock)
2500{
2501 struct sock *sk = sock->sk;
2502 struct packet_sock *po;
d12d01d6 2503 struct net *net;
f6fb8f10 2504 union tpacket_req_u req_u;
1da177e4
LT
2505
2506 if (!sk)
2507 return 0;
2508
3b1e0a65 2509 net = sock_net(sk);
1da177e4
LT
2510 po = pkt_sk(sk);
2511
0fa7fa98 2512 mutex_lock(&net->packet.sklist_lock);
808f5114 2513 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2514 mutex_unlock(&net->packet.sklist_lock);
2515
2516 preempt_disable();
920de804 2517 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2518 preempt_enable();
1da177e4 2519
808f5114 2520 spin_lock(&po->bind_lock);
ce06b03e 2521 unregister_prot_hook(sk, false);
66e56cd4
DB
2522 packet_cached_dev_reset(po);
2523
160ff18a
BG
2524 if (po->prot_hook.dev) {
2525 dev_put(po->prot_hook.dev);
2526 po->prot_hook.dev = NULL;
2527 }
808f5114 2528 spin_unlock(&po->bind_lock);
1da177e4 2529
1da177e4 2530 packet_flush_mclist(sk);
1da177e4 2531
9665d5d6
PS
2532 if (po->rx_ring.pg_vec) {
2533 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2534 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2535 }
69e3c75f 2536
9665d5d6
PS
2537 if (po->tx_ring.pg_vec) {
2538 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2539 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2540 }
1da177e4 2541
dc99f600
DM
2542 fanout_release(sk);
2543
808f5114 2544 synchronize_net();
1da177e4
LT
2545 /*
2546 * Now the socket is dead. No more input will appear.
2547 */
1da177e4
LT
2548 sock_orphan(sk);
2549 sock->sk = NULL;
2550
2551 /* Purge queues */
2552
2553 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2554 sk_refcnt_debug_release(sk);
1da177e4
LT
2555
2556 sock_put(sk);
2557 return 0;
2558}
2559
2560/*
2561 * Attach a packet hook.
2562 */
2563
0e11c91e 2564static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2565{
2566 struct packet_sock *po = pkt_sk(sk);
dc99f600 2567
aef950b4
WY
2568 if (po->fanout) {
2569 if (dev)
2570 dev_put(dev);
2571
dc99f600 2572 return -EINVAL;
aef950b4 2573 }
1da177e4
LT
2574
2575 lock_sock(sk);
2576
2577 spin_lock(&po->bind_lock);
ce06b03e 2578 unregister_prot_hook(sk, true);
66e56cd4 2579
1da177e4
LT
2580 po->num = protocol;
2581 po->prot_hook.type = protocol;
160ff18a
BG
2582 if (po->prot_hook.dev)
2583 dev_put(po->prot_hook.dev);
1da177e4 2584
66e56cd4 2585 po->prot_hook.dev = dev;
1da177e4
LT
2586 po->ifindex = dev ? dev->ifindex : 0;
2587
66e56cd4
DB
2588 packet_cached_dev_assign(po, dev);
2589
1da177e4
LT
2590 if (protocol == 0)
2591 goto out_unlock;
2592
be85d4ad 2593 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2594 register_prot_hook(sk);
be85d4ad
UT
2595 } else {
2596 sk->sk_err = ENETDOWN;
2597 if (!sock_flag(sk, SOCK_DEAD))
2598 sk->sk_error_report(sk);
1da177e4
LT
2599 }
2600
2601out_unlock:
2602 spin_unlock(&po->bind_lock);
2603 release_sock(sk);
2604 return 0;
2605}
2606
2607/*
2608 * Bind a packet socket to a device
2609 */
2610
40d4e3df
ED
2611static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2612 int addr_len)
1da177e4 2613{
40d4e3df 2614 struct sock *sk = sock->sk;
1da177e4
LT
2615 char name[15];
2616 struct net_device *dev;
2617 int err = -ENODEV;
1ce4f28b 2618
1da177e4
LT
2619 /*
2620 * Check legality
2621 */
1ce4f28b 2622
8ae55f04 2623 if (addr_len != sizeof(struct sockaddr))
1da177e4 2624 return -EINVAL;
40d4e3df 2625 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2626
3b1e0a65 2627 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2628 if (dev)
1da177e4 2629 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2630 return err;
2631}
1da177e4
LT
2632
2633static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2634{
40d4e3df
ED
2635 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2636 struct sock *sk = sock->sk;
1da177e4
LT
2637 struct net_device *dev = NULL;
2638 int err;
2639
2640
2641 /*
2642 * Check legality
2643 */
1ce4f28b 2644
1da177e4
LT
2645 if (addr_len < sizeof(struct sockaddr_ll))
2646 return -EINVAL;
2647 if (sll->sll_family != AF_PACKET)
2648 return -EINVAL;
2649
2650 if (sll->sll_ifindex) {
2651 err = -ENODEV;
3b1e0a65 2652 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2653 if (dev == NULL)
2654 goto out;
2655 }
2656 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2657
2658out:
2659 return err;
2660}
2661
2662static struct proto packet_proto = {
2663 .name = "PACKET",
2664 .owner = THIS_MODULE,
2665 .obj_size = sizeof(struct packet_sock),
2666};
2667
2668/*
1ce4f28b 2669 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2670 */
2671
3f378b68
EP
2672static int packet_create(struct net *net, struct socket *sock, int protocol,
2673 int kern)
1da177e4
LT
2674{
2675 struct sock *sk;
2676 struct packet_sock *po;
0e11c91e 2677 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2678 int err;
2679
df008c91 2680 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2681 return -EPERM;
be02097c
DM
2682 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2683 sock->type != SOCK_PACKET)
1da177e4
LT
2684 return -ESOCKTNOSUPPORT;
2685
2686 sock->state = SS_UNCONNECTED;
2687
2688 err = -ENOBUFS;
6257ff21 2689 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2690 if (sk == NULL)
2691 goto out;
2692
2693 sock->ops = &packet_ops;
1da177e4
LT
2694 if (sock->type == SOCK_PACKET)
2695 sock->ops = &packet_ops_spkt;
be02097c 2696
1da177e4
LT
2697 sock_init_data(sock, sk);
2698
2699 po = pkt_sk(sk);
2700 sk->sk_family = PF_PACKET;
0e11c91e 2701 po->num = proto;
d346a3fa 2702 po->xmit = dev_queue_xmit;
66e56cd4
DB
2703
2704 packet_cached_dev_reset(po);
1da177e4
LT
2705
2706 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2707 sk_refcnt_debug_inc(sk);
1da177e4
LT
2708
2709 /*
2710 * Attach a protocol block
2711 */
2712
2713 spin_lock_init(&po->bind_lock);
905db440 2714 mutex_init(&po->pg_vec_lock);
1da177e4 2715 po->prot_hook.func = packet_rcv;
be02097c 2716
1da177e4
LT
2717 if (sock->type == SOCK_PACKET)
2718 po->prot_hook.func = packet_rcv_spkt;
be02097c 2719
1da177e4
LT
2720 po->prot_hook.af_packet_priv = sk;
2721
0e11c91e
AV
2722 if (proto) {
2723 po->prot_hook.type = proto;
ce06b03e 2724 register_prot_hook(sk);
1da177e4
LT
2725 }
2726
0fa7fa98 2727 mutex_lock(&net->packet.sklist_lock);
808f5114 2728 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2729 mutex_unlock(&net->packet.sklist_lock);
2730
2731 preempt_disable();
3680453c 2732 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2733 preempt_enable();
808f5114 2734
40d4e3df 2735 return 0;
1da177e4
LT
2736out:
2737 return err;
2738}
2739
2740/*
2741 * Pull a packet from our receive queue and hand it to the user.
2742 * If necessary we block.
2743 */
2744
2745static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2746 struct msghdr *msg, size_t len, int flags)
2747{
2748 struct sock *sk = sock->sk;
2749 struct sk_buff *skb;
2750 int copied, err;
bfd5f4a3 2751 int vnet_hdr_len = 0;
1da177e4
LT
2752
2753 err = -EINVAL;
ed85b565 2754 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2755 goto out;
2756
2757#if 0
2758 /* What error should we return now? EUNATTACH? */
2759 if (pkt_sk(sk)->ifindex < 0)
2760 return -ENODEV;
2761#endif
2762
ed85b565 2763 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2764 err = sock_recv_errqueue(sk, msg, len,
2765 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2766 goto out;
2767 }
2768
1da177e4
LT
2769 /*
2770 * Call the generic datagram receiver. This handles all sorts
2771 * of horrible races and re-entrancy so we can forget about it
2772 * in the protocol layers.
2773 *
 2774	 *	Now it will return ENETDOWN, if the device has just gone down,
2775 * but then it will block.
2776 */
2777
40d4e3df 2778 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2779
 2780	/*
 2781	 *	An error occurred, so return it. Because skb_recv_datagram()
 2782	 *	handles the blocking, we don't need to see or worry about
 2783	 *	blocking retries.
 2784	 */
2785
8ae55f04 2786 if (skb == NULL)
1da177e4
LT
2787 goto out;
2788
bfd5f4a3
SS
2789 if (pkt_sk(sk)->has_vnet_hdr) {
2790 struct virtio_net_hdr vnet_hdr = { 0 };
2791
2792 err = -EINVAL;
2793 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2794 if (len < vnet_hdr_len)
bfd5f4a3
SS
2795 goto out_free;
2796
1f18b717
MK
2797 len -= vnet_hdr_len;
2798
bfd5f4a3
SS
2799 if (skb_is_gso(skb)) {
2800 struct skb_shared_info *sinfo = skb_shinfo(skb);
2801
2802 /* This is a hint as to how much should be linear. */
2803 vnet_hdr.hdr_len = skb_headlen(skb);
2804 vnet_hdr.gso_size = sinfo->gso_size;
2805 if (sinfo->gso_type & SKB_GSO_TCPV4)
2806 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2807 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2808 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2809 else if (sinfo->gso_type & SKB_GSO_UDP)
2810 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2811 else if (sinfo->gso_type & SKB_GSO_FCOE)
2812 goto out_free;
2813 else
2814 BUG();
2815 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2816 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2817 } else
2818 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2819
2820 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2821 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2822 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2823 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2824 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2825 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2826 } /* else everything is zero */
2827
2828 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2829 vnet_hdr_len);
2830 if (err < 0)
2831 goto out_free;
2832 }
2833
f3d33426
HFS
 2834	/* Any data beyond the buffer you gave is lost. If that worries
 2835	 * a user program, it can ask the device for its MTU
 2836	 * anyway.
1da177e4 2837 */
1da177e4 2838 copied = skb->len;
40d4e3df
ED
2839 if (copied > len) {
2840 copied = len;
2841 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2842 }
2843
2844 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2845 if (err)
2846 goto out_free;
2847
3b885787 2848 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2849
f3d33426
HFS
2850 if (msg->msg_name) {
2851 /* If the address length field is there to be filled
2852 * in, we fill it in now.
2853 */
2854 if (sock->type == SOCK_PACKET) {
2855 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2856 } else {
2857 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2858 msg->msg_namelen = sll->sll_halen +
2859 offsetof(struct sockaddr_ll, sll_addr);
2860 }
ffbc6111
HX
2861 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2862 msg->msg_namelen);
f3d33426 2863 }
1da177e4 2864
8dc41944 2865 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2866 struct tpacket_auxdata aux;
2867
2868 aux.tp_status = TP_STATUS_USER;
2869 if (skb->ip_summed == CHECKSUM_PARTIAL)
2870 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2871 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2872 aux.tp_snaplen = skb->len;
2873 aux.tp_mac = 0;
bbe735e4 2874 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2875 if (vlan_tx_tag_present(skb)) {
2876 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2877 aux.tp_status |= TP_STATUS_VLAN_VALID;
2878 } else {
2879 aux.tp_vlan_tci = 0;
2880 }
13fcb7bd 2881 aux.tp_padding = 0;
ffbc6111 2882 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2883 }
2884
1da177e4
LT
2885 /*
2886 * Free or return the buffer as appropriate. Again this
2887 * hides all the races and re-entrancy issues from us.
2888 */
bfd5f4a3 2889 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2890
2891out_free:
2892 skb_free_datagram(sk, skb);
2893out:
2894 return err;
2895}
2896
1da177e4
LT
2897static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2898 int *uaddr_len, int peer)
2899{
2900 struct net_device *dev;
2901 struct sock *sk = sock->sk;
2902
2903 if (peer)
2904 return -EOPNOTSUPP;
2905
2906 uaddr->sa_family = AF_PACKET;
2dc85bf3 2907 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2908 rcu_read_lock();
2909 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2910 if (dev)
2dc85bf3 2911 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2912 rcu_read_unlock();
1da177e4
LT
2913 *uaddr_len = sizeof(*uaddr);
2914
2915 return 0;
2916}
1da177e4
LT
2917
2918static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2919 int *uaddr_len, int peer)
2920{
2921 struct net_device *dev;
2922 struct sock *sk = sock->sk;
2923 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2924 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2925
2926 if (peer)
2927 return -EOPNOTSUPP;
2928
2929 sll->sll_family = AF_PACKET;
2930 sll->sll_ifindex = po->ifindex;
2931 sll->sll_protocol = po->num;
67286640 2932 sll->sll_pkttype = 0;
654d1f8a
ED
2933 rcu_read_lock();
2934 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2935 if (dev) {
2936 sll->sll_hatype = dev->type;
2937 sll->sll_halen = dev->addr_len;
2938 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2939 } else {
2940 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2941 sll->sll_halen = 0;
2942 }
654d1f8a 2943 rcu_read_unlock();
0fb375fb 2944 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2945
2946 return 0;
2947}
2948
2aeb0b88
WC
2949static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2950 int what)
1da177e4
LT
2951{
2952 switch (i->type) {
2953 case PACKET_MR_MULTICAST:
1162563f
JP
2954 if (i->alen != dev->addr_len)
2955 return -EINVAL;
1da177e4 2956 if (what > 0)
22bedad3 2957 return dev_mc_add(dev, i->addr);
1da177e4 2958 else
22bedad3 2959 return dev_mc_del(dev, i->addr);
1da177e4
LT
2960 break;
2961 case PACKET_MR_PROMISC:
2aeb0b88 2962 return dev_set_promiscuity(dev, what);
1da177e4
LT
2963 break;
2964 case PACKET_MR_ALLMULTI:
2aeb0b88 2965 return dev_set_allmulti(dev, what);
1da177e4 2966 break;
d95ed927 2967 case PACKET_MR_UNICAST:
1162563f
JP
2968 if (i->alen != dev->addr_len)
2969 return -EINVAL;
d95ed927 2970 if (what > 0)
a748ee24 2971 return dev_uc_add(dev, i->addr);
d95ed927 2972 else
a748ee24 2973 return dev_uc_del(dev, i->addr);
d95ed927 2974 break;
40d4e3df
ED
2975 default:
2976 break;
1da177e4 2977 }
2aeb0b88 2978 return 0;
1da177e4
LT
2979}
2980
2981static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2982{
40d4e3df 2983 for ( ; i; i = i->next) {
1da177e4
LT
2984 if (i->ifindex == dev->ifindex)
2985 packet_dev_mc(dev, i, what);
2986 }
2987}
2988
0fb375fb 2989static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2990{
2991 struct packet_sock *po = pkt_sk(sk);
2992 struct packet_mclist *ml, *i;
2993 struct net_device *dev;
2994 int err;
2995
2996 rtnl_lock();
2997
2998 err = -ENODEV;
3b1e0a65 2999 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3000 if (!dev)
3001 goto done;
3002
3003 err = -EINVAL;
1162563f 3004 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3005 goto done;
3006
3007 err = -ENOBUFS;
8b3a7005 3008 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3009 if (i == NULL)
3010 goto done;
3011
3012 err = 0;
3013 for (ml = po->mclist; ml; ml = ml->next) {
3014 if (ml->ifindex == mreq->mr_ifindex &&
3015 ml->type == mreq->mr_type &&
3016 ml->alen == mreq->mr_alen &&
3017 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3018 ml->count++;
3019 /* Free the new element ... */
3020 kfree(i);
3021 goto done;
3022 }
3023 }
3024
3025 i->type = mreq->mr_type;
3026 i->ifindex = mreq->mr_ifindex;
3027 i->alen = mreq->mr_alen;
3028 memcpy(i->addr, mreq->mr_address, i->alen);
3029 i->count = 1;
3030 i->next = po->mclist;
3031 po->mclist = i;
2aeb0b88
WC
3032 err = packet_dev_mc(dev, i, 1);
3033 if (err) {
3034 po->mclist = i->next;
3035 kfree(i);
3036 }
1da177e4
LT
3037
3038done:
3039 rtnl_unlock();
3040 return err;
3041}
3042
0fb375fb 3043static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3044{
3045 struct packet_mclist *ml, **mlp;
3046
3047 rtnl_lock();
3048
3049 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3050 if (ml->ifindex == mreq->mr_ifindex &&
3051 ml->type == mreq->mr_type &&
3052 ml->alen == mreq->mr_alen &&
3053 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3054 if (--ml->count == 0) {
3055 struct net_device *dev;
3056 *mlp = ml->next;
ad959e76
ED
3057 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3058 if (dev)
1da177e4 3059 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3060 kfree(ml);
3061 }
3062 rtnl_unlock();
3063 return 0;
3064 }
3065 }
3066 rtnl_unlock();
3067 return -EADDRNOTAVAIL;
3068}
3069
3070static void packet_flush_mclist(struct sock *sk)
3071{
3072 struct packet_sock *po = pkt_sk(sk);
3073 struct packet_mclist *ml;
3074
3075 if (!po->mclist)
3076 return;
3077
3078 rtnl_lock();
3079 while ((ml = po->mclist) != NULL) {
3080 struct net_device *dev;
3081
3082 po->mclist = ml->next;
ad959e76
ED
3083 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3084 if (dev != NULL)
1da177e4 3085 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3086 kfree(ml);
3087 }
3088 rtnl_unlock();
3089}
1da177e4
LT
3090
3091static int
b7058842 3092packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3093{
3094 struct sock *sk = sock->sk;
8dc41944 3095 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3096 int ret;
3097
3098 if (level != SOL_PACKET)
3099 return -ENOPROTOOPT;
3100
69e3c75f 3101 switch (optname) {
1ce4f28b 3102 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3103 case PACKET_DROP_MEMBERSHIP:
3104 {
0fb375fb
EB
3105 struct packet_mreq_max mreq;
3106 int len = optlen;
3107 memset(&mreq, 0, sizeof(mreq));
3108 if (len < sizeof(struct packet_mreq))
1da177e4 3109 return -EINVAL;
0fb375fb
EB
3110 if (len > sizeof(mreq))
3111 len = sizeof(mreq);
40d4e3df 3112 if (copy_from_user(&mreq, optval, len))
1da177e4 3113 return -EFAULT;
0fb375fb
EB
3114 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3115 return -EINVAL;
1da177e4
LT
3116 if (optname == PACKET_ADD_MEMBERSHIP)
3117 ret = packet_mc_add(sk, &mreq);
3118 else
3119 ret = packet_mc_drop(sk, &mreq);
3120 return ret;
3121 }
a2efcfa0 3122
1da177e4 3123 case PACKET_RX_RING:
69e3c75f 3124 case PACKET_TX_RING:
1da177e4 3125 {
f6fb8f10 3126 union tpacket_req_u req_u;
3127 int len;
1da177e4 3128
f6fb8f10 3129 switch (po->tp_version) {
3130 case TPACKET_V1:
3131 case TPACKET_V2:
3132 len = sizeof(req_u.req);
3133 break;
3134 case TPACKET_V3:
3135 default:
3136 len = sizeof(req_u.req3);
3137 break;
3138 }
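		/* TPACKET_V3 callers must pass the larger tpacket_req3;
		 * V1/V2 keep accepting the original tpacket_req.
		 */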
3139 if (optlen < len)
1da177e4 3140 return -EINVAL;
bfd5f4a3
SS
3141 if (pkt_sk(sk)->has_vnet_hdr)
3142 return -EINVAL;
f6fb8f10 3143 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3144 return -EFAULT;
f6fb8f10 3145 return packet_set_ring(sk, &req_u, 0,
3146 optname == PACKET_TX_RING);
1da177e4
LT
3147 }
3148 case PACKET_COPY_THRESH:
3149 {
3150 int val;
3151
40d4e3df 3152 if (optlen != sizeof(val))
1da177e4 3153 return -EINVAL;
40d4e3df 3154 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3155 return -EFAULT;
3156
3157 pkt_sk(sk)->copy_thresh = val;
3158 return 0;
3159 }
bbd6ef87
PM
3160 case PACKET_VERSION:
3161 {
3162 int val;
3163
3164 if (optlen != sizeof(val))
3165 return -EINVAL;
69e3c75f 3166 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3167 return -EBUSY;
3168 if (copy_from_user(&val, optval, sizeof(val)))
3169 return -EFAULT;
3170 switch (val) {
3171 case TPACKET_V1:
3172 case TPACKET_V2:
f6fb8f10 3173 case TPACKET_V3:
bbd6ef87
PM
3174 po->tp_version = val;
3175 return 0;
3176 default:
3177 return -EINVAL;
3178 }
3179 }
8913336a
PM
3180 case PACKET_RESERVE:
3181 {
3182 unsigned int val;
3183
3184 if (optlen != sizeof(val))
3185 return -EINVAL;
69e3c75f 3186 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3187 return -EBUSY;
3188 if (copy_from_user(&val, optval, sizeof(val)))
3189 return -EFAULT;
3190 po->tp_reserve = val;
3191 return 0;
3192 }
69e3c75f
JB
3193 case PACKET_LOSS:
3194 {
3195 unsigned int val;
3196
3197 if (optlen != sizeof(val))
3198 return -EINVAL;
3199 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3200 return -EBUSY;
3201 if (copy_from_user(&val, optval, sizeof(val)))
3202 return -EFAULT;
3203 po->tp_loss = !!val;
3204 return 0;
3205 }
8dc41944
HX
3206 case PACKET_AUXDATA:
3207 {
3208 int val;
3209
3210 if (optlen < sizeof(val))
3211 return -EINVAL;
3212 if (copy_from_user(&val, optval, sizeof(val)))
3213 return -EFAULT;
3214
3215 po->auxdata = !!val;
3216 return 0;
3217 }
80feaacb
PWJ
3218 case PACKET_ORIGDEV:
3219 {
3220 int val;
3221
3222 if (optlen < sizeof(val))
3223 return -EINVAL;
3224 if (copy_from_user(&val, optval, sizeof(val)))
3225 return -EFAULT;
3226
3227 po->origdev = !!val;
3228 return 0;
3229 }
bfd5f4a3
SS
3230 case PACKET_VNET_HDR:
3231 {
3232 int val;
3233
3234 if (sock->type != SOCK_RAW)
3235 return -EINVAL;
3236 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3237 return -EBUSY;
3238 if (optlen < sizeof(val))
3239 return -EINVAL;
3240 if (copy_from_user(&val, optval, sizeof(val)))
3241 return -EFAULT;
3242
3243 po->has_vnet_hdr = !!val;
3244 return 0;
3245 }
614f60fa
SM
3246 case PACKET_TIMESTAMP:
3247 {
3248 int val;
3249
3250 if (optlen != sizeof(val))
3251 return -EINVAL;
3252 if (copy_from_user(&val, optval, sizeof(val)))
3253 return -EFAULT;
3254
3255 po->tp_tstamp = val;
3256 return 0;
3257 }
dc99f600
DM
3258 case PACKET_FANOUT:
3259 {
3260 int val;
3261
3262 if (optlen != sizeof(val))
3263 return -EINVAL;
3264 if (copy_from_user(&val, optval, sizeof(val)))
3265 return -EFAULT;
3266
3267 return fanout_add(sk, val & 0xffff, val >> 16);
3268 }
5920cd3a
PC
3269 case PACKET_TX_HAS_OFF:
3270 {
3271 unsigned int val;
3272
3273 if (optlen != sizeof(val))
3274 return -EINVAL;
3275 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3276 return -EBUSY;
3277 if (copy_from_user(&val, optval, sizeof(val)))
3278 return -EFAULT;
3279 po->tp_tx_has_off = !!val;
3280 return 0;
3281 }
d346a3fa
DB
3282 case PACKET_QDISC_BYPASS:
3283 {
3284 int val;
3285
3286 if (optlen != sizeof(val))
3287 return -EINVAL;
3288 if (copy_from_user(&val, optval, sizeof(val)))
3289 return -EFAULT;
3290
3291 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3292 return 0;
3293 }
1da177e4
LT
3294 default:
3295 return -ENOPROTOOPT;
3296 }
3297}
3298
3299static int packet_getsockopt(struct socket *sock, int level, int optname,
3300 char __user *optval, int __user *optlen)
3301{
3302 int len;
c06fff6e 3303 int val, lv = sizeof(val);
1da177e4
LT
3304 struct sock *sk = sock->sk;
3305 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3306 void *data = &val;
ee80fbf3 3307 union tpacket_stats_u st;
1da177e4
LT
3308
3309 if (level != SOL_PACKET)
3310 return -ENOPROTOOPT;
3311
8ae55f04
KK
3312 if (get_user(len, optlen))
3313 return -EFAULT;
1da177e4
LT
3314
3315 if (len < 0)
3316 return -EINVAL;
1ce4f28b 3317
69e3c75f 3318 switch (optname) {
1da177e4 3319 case PACKET_STATISTICS:
1da177e4 3320 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3321 memcpy(&st, &po->stats, sizeof(st));
3322 memset(&po->stats, 0, sizeof(po->stats));
3323 spin_unlock_bh(&sk->sk_receive_queue.lock);
3324
f6fb8f10 3325 if (po->tp_version == TPACKET_V3) {
c06fff6e 3326 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3327 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3328 data = &st.stats3;
f6fb8f10 3329 } else {
c06fff6e 3330 lv = sizeof(struct tpacket_stats);
8bcdeaff 3331 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3332 data = &st.stats1;
f6fb8f10 3333 }
ee80fbf3 3334
8dc41944
HX
3335 break;
3336 case PACKET_AUXDATA:
8dc41944 3337 val = po->auxdata;
80feaacb
PWJ
3338 break;
3339 case PACKET_ORIGDEV:
80feaacb 3340 val = po->origdev;
bfd5f4a3
SS
3341 break;
3342 case PACKET_VNET_HDR:
bfd5f4a3 3343 val = po->has_vnet_hdr;
1da177e4 3344 break;
bbd6ef87 3345 case PACKET_VERSION:
bbd6ef87 3346 val = po->tp_version;
bbd6ef87
PM
3347 break;
3348 case PACKET_HDRLEN:
3349 if (len > sizeof(int))
3350 len = sizeof(int);
3351 if (copy_from_user(&val, optval, len))
3352 return -EFAULT;
3353 switch (val) {
3354 case TPACKET_V1:
3355 val = sizeof(struct tpacket_hdr);
3356 break;
3357 case TPACKET_V2:
3358 val = sizeof(struct tpacket2_hdr);
3359 break;
f6fb8f10 3360 case TPACKET_V3:
3361 val = sizeof(struct tpacket3_hdr);
3362 break;
bbd6ef87
PM
3363 default:
3364 return -EINVAL;
3365 }
bbd6ef87 3366 break;
8913336a 3367 case PACKET_RESERVE:
8913336a 3368 val = po->tp_reserve;
8913336a 3369 break;
69e3c75f 3370 case PACKET_LOSS:
69e3c75f 3371 val = po->tp_loss;
69e3c75f 3372 break;
614f60fa 3373 case PACKET_TIMESTAMP:
614f60fa 3374 val = po->tp_tstamp;
614f60fa 3375 break;
dc99f600 3376 case PACKET_FANOUT:
dc99f600
DM
3377 val = (po->fanout ?
3378 ((u32)po->fanout->id |
77f65ebd
WB
3379 ((u32)po->fanout->type << 16) |
3380 ((u32)po->fanout->flags << 24)) :
dc99f600 3381 0);
dc99f600 3382 break;
5920cd3a
PC
3383 case PACKET_TX_HAS_OFF:
3384 val = po->tp_tx_has_off;
3385 break;
d346a3fa
DB
3386 case PACKET_QDISC_BYPASS:
3387 val = packet_use_direct_xmit(po);
3388 break;
1da177e4
LT
3389 default:
3390 return -ENOPROTOOPT;
3391 }
3392
c06fff6e
ED
3393 if (len > lv)
3394 len = lv;
8ae55f04
KK
3395 if (put_user(len, optlen))
3396 return -EFAULT;
8dc41944
HX
3397 if (copy_to_user(optval, data, len))
3398 return -EFAULT;
8ae55f04 3399 return 0;
1da177e4
LT
3400}
3401
3402
351638e7
JP
3403static int packet_notifier(struct notifier_block *this,
3404 unsigned long msg, void *ptr)
1da177e4
LT
3405{
3406 struct sock *sk;
351638e7 3407 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3408 struct net *net = dev_net(dev);
1da177e4 3409
808f5114 3410 rcu_read_lock();
b67bfe0d 3411 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3412 struct packet_sock *po = pkt_sk(sk);
3413
3414 switch (msg) {
3415 case NETDEV_UNREGISTER:
1da177e4
LT
3416 if (po->mclist)
3417 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3418 /* fallthrough */
3419
1da177e4
LT
3420 case NETDEV_DOWN:
3421 if (dev->ifindex == po->ifindex) {
3422 spin_lock(&po->bind_lock);
3423 if (po->running) {
ce06b03e 3424 __unregister_prot_hook(sk, false);
1da177e4
LT
3425 sk->sk_err = ENETDOWN;
3426 if (!sock_flag(sk, SOCK_DEAD))
3427 sk->sk_error_report(sk);
3428 }
3429 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3430 packet_cached_dev_reset(po);
1da177e4 3431 po->ifindex = -1;
160ff18a
BG
3432 if (po->prot_hook.dev)
3433 dev_put(po->prot_hook.dev);
1da177e4
LT
3434 po->prot_hook.dev = NULL;
3435 }
3436 spin_unlock(&po->bind_lock);
3437 }
3438 break;
3439 case NETDEV_UP:
808f5114 3440 if (dev->ifindex == po->ifindex) {
3441 spin_lock(&po->bind_lock);
ce06b03e
DM
3442 if (po->num)
3443 register_prot_hook(sk);
808f5114 3444 spin_unlock(&po->bind_lock);
1da177e4 3445 }
1da177e4
LT
3446 break;
3447 }
3448 }
808f5114 3449 rcu_read_unlock();
1da177e4
LT
3450 return NOTIFY_DONE;
3451}
3452
3453
3454static int packet_ioctl(struct socket *sock, unsigned int cmd,
3455 unsigned long arg)
3456{
3457 struct sock *sk = sock->sk;
3458
69e3c75f 3459 switch (cmd) {
40d4e3df
ED
3460 case SIOCOUTQ:
3461 {
3462 int amount = sk_wmem_alloc_get(sk);
31e6d363 3463
40d4e3df
ED
3464 return put_user(amount, (int __user *)arg);
3465 }
3466 case SIOCINQ:
3467 {
3468 struct sk_buff *skb;
3469 int amount = 0;
3470
3471 spin_lock_bh(&sk->sk_receive_queue.lock);
3472 skb = skb_peek(&sk->sk_receive_queue);
3473 if (skb)
3474 amount = skb->len;
3475 spin_unlock_bh(&sk->sk_receive_queue.lock);
3476 return put_user(amount, (int __user *)arg);
3477 }
3478 case SIOCGSTAMP:
3479 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3480 case SIOCGSTAMPNS:
3481 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3482
1da177e4 3483#ifdef CONFIG_INET
40d4e3df
ED
3484 case SIOCADDRT:
3485 case SIOCDELRT:
3486 case SIOCDARP:
3487 case SIOCGARP:
3488 case SIOCSARP:
3489 case SIOCGIFADDR:
3490 case SIOCSIFADDR:
3491 case SIOCGIFBRDADDR:
3492 case SIOCSIFBRDADDR:
3493 case SIOCGIFNETMASK:
3494 case SIOCSIFNETMASK:
3495 case SIOCGIFDSTADDR:
3496 case SIOCSIFDSTADDR:
3497 case SIOCSIFFLAGS:
40d4e3df 3498 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3499#endif
3500
40d4e3df
ED
3501 default:
3502 return -ENOIOCTLCMD;
1da177e4
LT
3503 }
3504 return 0;
3505}
3506
40d4e3df 3507static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3508 poll_table *wait)
3509{
3510 struct sock *sk = sock->sk;
3511 struct packet_sock *po = pkt_sk(sk);
3512 unsigned int mask = datagram_poll(file, sock, wait);
3513
3514 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3515 if (po->rx_ring.pg_vec) {
f6fb8f10 3516 if (!packet_previous_rx_frame(po, &po->rx_ring,
3517 TP_STATUS_KERNEL))
1da177e4
LT
3518 mask |= POLLIN | POLLRDNORM;
3519 }
3520 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3521 spin_lock_bh(&sk->sk_write_queue.lock);
3522 if (po->tx_ring.pg_vec) {
3523 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3524 mask |= POLLOUT | POLLWRNORM;
3525 }
3526 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3527 return mask;
3528}
3529
3530
 3531/* Dirty? Well, I still have not found a better way to account
 3532 * for user mmaps.
3533 */
3534
3535static void packet_mm_open(struct vm_area_struct *vma)
3536{
3537 struct file *file = vma->vm_file;
40d4e3df 3538 struct socket *sock = file->private_data;
1da177e4 3539 struct sock *sk = sock->sk;
1ce4f28b 3540
1da177e4
LT
3541 if (sk)
3542 atomic_inc(&pkt_sk(sk)->mapped);
3543}
3544
3545static void packet_mm_close(struct vm_area_struct *vma)
3546{
3547 struct file *file = vma->vm_file;
40d4e3df 3548 struct socket *sock = file->private_data;
1da177e4 3549 struct sock *sk = sock->sk;
1ce4f28b 3550
1da177e4
LT
3551 if (sk)
3552 atomic_dec(&pkt_sk(sk)->mapped);
3553}
3554
f0f37e2f 3555static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3556 .open = packet_mm_open,
3557 .close = packet_mm_close,
1da177e4
LT
3558};
3559
0e3125c7
NH
3560static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3561 unsigned int len)
1da177e4
LT
3562{
3563 int i;
3564
4ebf0ae2 3565 for (i = 0; i < len; i++) {
0e3125c7 3566 if (likely(pg_vec[i].buffer)) {
c56b4d90 3567 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3568 vfree(pg_vec[i].buffer);
3569 else
3570 free_pages((unsigned long)pg_vec[i].buffer,
3571 order);
3572 pg_vec[i].buffer = NULL;
3573 }
1da177e4
LT
3574 }
3575 kfree(pg_vec);
3576}
3577
eea49cc9 3578static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3579{
0e3125c7
NH
3580 char *buffer = NULL;
3581 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3582 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3583
3584 buffer = (char *) __get_free_pages(gfp_flags, order);
3585
3586 if (buffer)
3587 return buffer;
3588
3589 /*
3590 * __get_free_pages failed, fall back to vmalloc
3591 */
bbce5a59 3592 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3593
0e3125c7
NH
3594 if (buffer)
3595 return buffer;
3596
3597 /*
 3598	 * vmalloc failed, let's dig into swap here
3599 */
0e3125c7
NH
3600 gfp_flags &= ~__GFP_NORETRY;
3601 buffer = (char *)__get_free_pages(gfp_flags, order);
3602 if (buffer)
3603 return buffer;
3604
3605 /*
3606 * complete and utter failure
3607 */
3608 return NULL;
4ebf0ae2
DM
3609}
3610
0e3125c7 3611static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3612{
3613 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3614 struct pgv *pg_vec;
4ebf0ae2
DM
3615 int i;
3616
0e3125c7 3617 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3618 if (unlikely(!pg_vec))
3619 goto out;
3620
3621 for (i = 0; i < block_nr; i++) {
c56b4d90 3622 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3623 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3624 goto out_free_pgvec;
3625 }
3626
3627out:
3628 return pg_vec;
3629
3630out_free_pgvec:
3631 free_pg_vec(pg_vec, order, block_nr);
3632 pg_vec = NULL;
3633 goto out;
3634}
1da177e4 3635
f6fb8f10 3636static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3637 int closing, int tx_ring)
1da177e4 3638{
0e3125c7 3639 struct pgv *pg_vec = NULL;
1da177e4 3640 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3641 int was_running, order = 0;
69e3c75f
JB
3642 struct packet_ring_buffer *rb;
3643 struct sk_buff_head *rb_queue;
0e11c91e 3644 __be16 num;
f6fb8f10 3645 int err = -EINVAL;
3646 /* Added to avoid minimal code churn */
3647 struct tpacket_req *req = &req_u->req;
3648
3649 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3650 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3651 WARN(1, "Tx-ring is not supported.\n");
3652 goto out;
3653 }
1ce4f28b 3654
69e3c75f
JB
3655 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3656 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3657
69e3c75f
JB
3658 err = -EBUSY;
3659 if (!closing) {
3660 if (atomic_read(&po->mapped))
3661 goto out;
3662 if (atomic_read(&rb->pending))
3663 goto out;
3664 }
1da177e4 3665
69e3c75f
JB
3666 if (req->tp_block_nr) {
3667 /* Sanity tests and some calculations */
3668 err = -EBUSY;
3669 if (unlikely(rb->pg_vec))
3670 goto out;
1da177e4 3671
bbd6ef87
PM
3672 switch (po->tp_version) {
3673 case TPACKET_V1:
3674 po->tp_hdrlen = TPACKET_HDRLEN;
3675 break;
3676 case TPACKET_V2:
3677 po->tp_hdrlen = TPACKET2_HDRLEN;
3678 break;
f6fb8f10 3679 case TPACKET_V3:
3680 po->tp_hdrlen = TPACKET3_HDRLEN;
3681 break;
bbd6ef87
PM
3682 }
3683
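		/* The requested ring geometry must be self-consistent: blocks
		 * are multiples of PAGE_SIZE, frames are TPACKET_ALIGNMENT
		 * aligned and big enough for the header plus tp_reserve, and
		 * tp_frame_nr must equal frames-per-block times tp_block_nr.
		 */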
69e3c75f 3684 err = -EINVAL;
4ebf0ae2 3685 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3686 goto out;
4ebf0ae2 3687 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3688 goto out;
8913336a 3689 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3690 po->tp_reserve))
3691 goto out;
4ebf0ae2 3692 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3693 goto out;
1da177e4 3694
69e3c75f
JB
3695 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3696 if (unlikely(rb->frames_per_block <= 0))
3697 goto out;
3698 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3699 req->tp_frame_nr))
3700 goto out;
1da177e4
LT
3701
3702 err = -ENOMEM;
4ebf0ae2
DM
3703 order = get_order(req->tp_block_size);
3704 pg_vec = alloc_pg_vec(req, order);
3705 if (unlikely(!pg_vec))
1da177e4 3706 goto out;
f6fb8f10 3707 switch (po->tp_version) {
3708 case TPACKET_V3:
3709 /* Transmit path is not supported. We checked
3710 * it above but just being paranoid
3711 */
3712 if (!tx_ring)
3713 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3714 break;
3715 default:
3716 break;
3717 }
69e3c75f
JB
3718 }
3719 /* Done */
3720 else {
3721 err = -EINVAL;
4ebf0ae2 3722 if (unlikely(req->tp_frame_nr))
69e3c75f 3723 goto out;
1da177e4
LT
3724 }
3725
3726 lock_sock(sk);
3727
3728 /* Detach socket from network */
3729 spin_lock(&po->bind_lock);
3730 was_running = po->running;
3731 num = po->num;
3732 if (was_running) {
1da177e4 3733 po->num = 0;
ce06b03e 3734 __unregister_prot_hook(sk, false);
1da177e4
LT
3735 }
3736 spin_unlock(&po->bind_lock);
1ce4f28b 3737
1da177e4
LT
3738 synchronize_net();
3739
3740 err = -EBUSY;
905db440 3741 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3742 if (closing || atomic_read(&po->mapped) == 0) {
3743 err = 0;
69e3c75f 3744 spin_lock_bh(&rb_queue->lock);
c053fd96 3745 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3746 rb->frame_max = (req->tp_frame_nr - 1);
3747 rb->head = 0;
3748 rb->frame_size = req->tp_frame_size;
3749 spin_unlock_bh(&rb_queue->lock);
3750
c053fd96
CG
3751 swap(rb->pg_vec_order, order);
3752 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3753
3754 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3755 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3756 tpacket_rcv : packet_rcv;
3757 skb_queue_purge(rb_queue);
1da177e4 3758 if (atomic_read(&po->mapped))
40d4e3df
ED
3759 pr_err("packet_mmap: vma is busy: %d\n",
3760 atomic_read(&po->mapped));
1da177e4 3761 }
905db440 3762 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3763
3764 spin_lock(&po->bind_lock);
ce06b03e 3765 if (was_running) {
1da177e4 3766 po->num = num;
ce06b03e 3767 register_prot_hook(sk);
1da177e4
LT
3768 }
3769 spin_unlock(&po->bind_lock);
f6fb8f10 3770 if (closing && (po->tp_version > TPACKET_V2)) {
3771 /* Because we don't support block-based V3 on tx-ring */
3772 if (!tx_ring)
3773 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3774 }
1da177e4
LT
3775 release_sock(sk);
3776
1da177e4
LT
3777 if (pg_vec)
3778 free_pg_vec(pg_vec, order, req->tp_block_nr);
3779out:
3780 return err;
3781}
3782
69e3c75f
JB
3783static int packet_mmap(struct file *file, struct socket *sock,
3784 struct vm_area_struct *vma)
1da177e4
LT
3785{
3786 struct sock *sk = sock->sk;
3787 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3788 unsigned long size, expected_size;
3789 struct packet_ring_buffer *rb;
1da177e4
LT
3790 unsigned long start;
3791 int err = -EINVAL;
3792 int i;
3793
3794 if (vma->vm_pgoff)
3795 return -EINVAL;
3796
905db440 3797 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3798
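/*
 * A single mmap() of the socket has to cover everything that was allocated:
 * the rx ring pages first, then the tx ring pages, starting at offset 0
 * (vm_pgoff was checked above).
 */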
3799 expected_size = 0;
3800 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3801 if (rb->pg_vec) {
3802 expected_size += rb->pg_vec_len
3803 * rb->pg_vec_pages
3804 * PAGE_SIZE;
3805 }
3806 }
3807
3808 if (expected_size == 0)
1da177e4 3809 goto out;
69e3c75f
JB
3810
3811 size = vma->vm_end - vma->vm_start;
3812 if (size != expected_size)
1da177e4
LT
3813 goto out;
3814
1da177e4 3815 start = vma->vm_start;
69e3c75f
JB
3816 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3817 if (rb->pg_vec == NULL)
3818 continue;
3819
3820 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3821 struct page *page;
3822 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3823 int pg_num;
3824
c56b4d90
CG
3825 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3826 page = pgv_to_page(kaddr);
69e3c75f
JB
3827 err = vm_insert_page(vma, start, page);
3828 if (unlikely(err))
3829 goto out;
3830 start += PAGE_SIZE;
0e3125c7 3831 kaddr += PAGE_SIZE;
69e3c75f 3832 }
4ebf0ae2 3833 }
1da177e4 3834 }
69e3c75f 3835
4ebf0ae2 3836 atomic_inc(&po->mapped);
1da177e4
LT
3837 vma->vm_ops = &packet_mmap_ops;
3838 err = 0;
3839
3840out:
905db440 3841 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3842 return err;
3843}
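
/*
 * Userspace maps the ring(s) configured above with one mmap() on the socket
 * and then hands frames back and forth via tp_status.  Rough sketch, assuming
 * only an rx ring was set up and PACKET_VERSION was switched to TPACKET_V2
 * beforehand:
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct tpacket2_hdr *hdr = ring;	// frame 0
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		// frame is owned by userspace: consume it, then give it back
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */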
1da177e4 3844
90ddc4f0 3845static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3846 .family = PF_PACKET,
3847 .owner = THIS_MODULE,
3848 .release = packet_release,
3849 .bind = packet_bind_spkt,
3850 .connect = sock_no_connect,
3851 .socketpair = sock_no_socketpair,
3852 .accept = sock_no_accept,
3853 .getname = packet_getname_spkt,
3854 .poll = datagram_poll,
3855 .ioctl = packet_ioctl,
3856 .listen = sock_no_listen,
3857 .shutdown = sock_no_shutdown,
3858 .setsockopt = sock_no_setsockopt,
3859 .getsockopt = sock_no_getsockopt,
3860 .sendmsg = packet_sendmsg_spkt,
3861 .recvmsg = packet_recvmsg,
3862 .mmap = sock_no_mmap,
3863 .sendpage = sock_no_sendpage,
3864};
1da177e4 3865
90ddc4f0 3866static const struct proto_ops packet_ops = {
1da177e4
LT
3867 .family = PF_PACKET,
3868 .owner = THIS_MODULE,
3869 .release = packet_release,
3870 .bind = packet_bind,
3871 .connect = sock_no_connect,
3872 .socketpair = sock_no_socketpair,
3873 .accept = sock_no_accept,
1ce4f28b 3874 .getname = packet_getname,
1da177e4
LT
3875 .poll = packet_poll,
3876 .ioctl = packet_ioctl,
3877 .listen = sock_no_listen,
3878 .shutdown = sock_no_shutdown,
3879 .setsockopt = packet_setsockopt,
3880 .getsockopt = packet_getsockopt,
3881 .sendmsg = packet_sendmsg,
3882 .recvmsg = packet_recvmsg,
3883 .mmap = packet_mmap,
3884 .sendpage = sock_no_sendpage,
3885};
3886
ec1b4cf7 3887static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3888 .family = PF_PACKET,
3889 .create = packet_create,
3890 .owner = THIS_MODULE,
3891};
3892
3893static struct notifier_block packet_netdev_notifier = {
40d4e3df 3894 .notifier_call = packet_notifier,
1da177e4
LT
3895};
3896
3897#ifdef CONFIG_PROC_FS
1da177e4
LT
3898
3899static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3900 __acquires(RCU)
1da177e4 3901{
e372c414 3902 struct net *net = seq_file_net(seq);
808f5114 3903
3904 rcu_read_lock();
3905 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3906}
3907
3908static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3909{
1bf40954 3910 struct net *net = seq_file_net(seq);
808f5114 3911 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3912}
3913
3914static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3915 __releases(RCU)
1da177e4 3916{
808f5114 3917 rcu_read_unlock();
1da177e4
LT
3918}
3919
1ce4f28b 3920static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3921{
3922 if (v == SEQ_START_TOKEN)
3923 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3924 else {
b7ceabd9 3925 struct sock *s = sk_entry(v);
1da177e4
LT
3926 const struct packet_sock *po = pkt_sk(s);
3927
3928 seq_printf(seq,
71338aa7 3929 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3930 s,
3931 atomic_read(&s->sk_refcnt),
3932 s->sk_type,
3933 ntohs(po->num),
3934 po->ifindex,
3935 po->running,
3936 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3937 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3938 sock_i_ino(s));
1da177e4
LT
3939 }
3940
3941 return 0;
3942}
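
/*
 * The resulting /proc/net/packet output looks roughly like this (values are
 * illustrative):
 *
 *	sk       RefCnt Type Proto Iface R Rmem   User   Inode
 *	ffff8800b0423000 3      3    0003  2     1 0      0      17614
 */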
3943
56b3d975 3944static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3945 .start = packet_seq_start,
3946 .next = packet_seq_next,
3947 .stop = packet_seq_stop,
3948 .show = packet_seq_show,
3949};
3950
3951static int packet_seq_open(struct inode *inode, struct file *file)
3952{
e372c414
DL
3953 return seq_open_net(inode, file, &packet_seq_ops,
3954 sizeof(struct seq_net_private));
1da177e4
LT
3955}
3956
da7071d7 3957static const struct file_operations packet_seq_fops = {
1da177e4
LT
3958 .owner = THIS_MODULE,
3959 .open = packet_seq_open,
3960 .read = seq_read,
3961 .llseek = seq_lseek,
e372c414 3962 .release = seq_release_net,
1da177e4
LT
3963};
3964
3965#endif
3966
2c8c1e72 3967static int __net_init packet_net_init(struct net *net)
d12d01d6 3968{
0fa7fa98 3969 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3970 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3971
d4beaa66 3972 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
3973 return -ENOMEM;
3974
3975 return 0;
3976}
3977
2c8c1e72 3978static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3979{
ece31ffd 3980 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
3981}
3982
3983static struct pernet_operations packet_net_ops = {
3984 .init = packet_net_init,
3985 .exit = packet_net_exit,
3986};
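/*
 * Each network namespace gets its own packet socket list and its own
 * /proc/net/packet entry through these pernet operations.
 */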
3987
3988
1da177e4
LT
3989static void __exit packet_exit(void)
3990{
1da177e4 3991 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3992 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3993 sock_unregister(PF_PACKET);
3994 proto_unregister(&packet_proto);
3995}
3996
3997static int __init packet_init(void)
3998{
3999 int rc = proto_register(&packet_proto, 0);
4000
4001 if (rc != 0)
4002 goto out;
4003
4004 sock_register(&packet_family_ops);
d12d01d6 4005 register_pernet_subsys(&packet_net_ops);
1da177e4 4006 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4007out:
4008 return rc;
4009}
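
/*
 * Init and exit are symmetric: packet_init() registers the proto, the
 * PF_PACKET socket family, the per-namespace ops and the netdevice notifier,
 * and packet_exit() unwinds them in reverse order.
 */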
4010
4011module_init(packet_init);
4012module_exit(packet_exit);
4013MODULE_LICENSE("GPL");
4014MODULE_ALIAS_NETPROTO(PF_PACKET);