/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

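/* For example: on an Ethernet device a SOCK_RAW packet socket sees the
 * 14-byte Ethernet header at the start of each frame, while a SOCK_DGRAM
 * packet socket receives only the payload and gets the link-layer address
 * out of band in sockaddr_ll.
 */
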
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);


#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

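/* Layout note for the accessors above: a TPACKET_V3 block begins with a
 * struct tpacket_block_desc, optionally followed by a per-block private
 * area located via BLOCK_O2PRIV()/BLOCK_PRIV(), and then by the packets
 * themselves; the first packet starts at BLOCK_O2FP() and each
 * struct tpacket3_hdr chains to the next via tp_next_offset.
 */
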
struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;

	spin_lock(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

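/* Worked example for the calculation above (illustrative numbers only):
 * a 4 MiB block is 32 Mbit, so on a 10 Gb/s link (div = 10, msec = 1) it
 * takes roughly 32 / 10 = 3 ms to fill, and the retire timeout returned
 * is tmo + 1 = 4 ms.
 */
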
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats_u.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every 8 msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 * timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and
				 * refreshes the timer as a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effects:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: we DON'T refresh the timer on purpose,
 * because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {

		/* We could have just memset this but we will lose the
		 * flexibility of making the priv area sticky
		 */
		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
		BLOCK_NUM_PKTS(pbd1) = 0;
		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		getnstimeofday(&ts);
		h1->ts_first_pkt.ts_sec = ts.tv_sec;
		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
		pkc1->pkblk_start = (char *)pbd1;
		pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
		pbd1->version = pkc1->version;
		pkc1->prev = pkc1->nxt_offset;
		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
		prb_thaw_queue(pkc1);
		_prb_refresh_rx_retire_blk_timer(pkc1);

		smp_wmb();

		return;
	}

	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
	dump_stack();
	BUG();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats_u.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

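/* For example: with V3_ALIGNMENT == 8, TOTAL_PKT_LEN_INCL_ALIGN(66) == 72;
 * the space reserved for each packet in a block is rounded up so that the
 * next packet starts on an 8-byte boundary.
 */
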
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}

	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
	dump_stack();
	BUG();
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

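/* Overview of the TPACKET_V3 receive fast path below: try to reserve room
 * for the packet in the currently open block; if the block is full, retire
 * it and dispatch the next one; if user space still owns that next block,
 * freeze the queue and drop the packet.
 */
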
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if the last block, which caused the queue to freeze,
		 * is still in use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue;
			 * thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available; user space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

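/* packet_rcv_has_room() answers "can this socket accept one more skb?":
 * for a plain packet socket it compares sk_rmem_alloc + skb->truesize
 * against sk_rcvbuf; for a mapped RX ring it checks whether the current
 * frame (V1/V2) or the active block (V3) is still owned by the kernel.
 */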
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

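/* The hash demux below maps the 32-bit skb->rxhash uniformly onto the
 * range [0, num) without a division: ((u64)hash * num) >> 32.
 */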
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return (((u64)skb->rxhash) * num) >> 32;
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

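/* packet_rcv_fanout() below is the shared protocol hook for a fanout group:
 * it picks one member socket per packet according to the group's demux
 * policy (hash, load-balance, cpu or rollover) and hands the skb to that
 * socket's normal receive handler.
 */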
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		skb_get_rxhash(skb);
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 * When we registered the protocol we saved the socket in the data
	 * field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have the ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 * The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 * Charge the memory to the socket. This is done specifically
	 * to prevent sockets from using up all the memory.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

/*
 * This function does lazy skb cloning in the hope that most of the packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		struct tpacket3_hdr *h3;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0)
				snaplen = 0;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto ring_is_full;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
		/*
		 * LOSING will be reported till you read the stats,
		 * because it's COR - Clear On Read.
		 * Anyways, moving it for V1/V2 only as V3 doesn't need this
		 * at packet level.
		 */
		if (po->stats.tp_drops)
			status |= TP_STATUS_LOSING;
	}
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (vlan_tx_tag_present(skb)) {
			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
			status |= TP_STATUS_VLAN_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
		}
		h.h2->tp_padding = 0;
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset, vlan are already populated above.
		 * So DON'T clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	{
		u8 *start, *end;

		if (po->tp_version <= TPACKET_V2) {
			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
				+ macoff + snaplen);
			for (start = h.raw; start < end; start += PAGE_SIZE)
				flush_dcache_page(pgv_to_page(start));
		}
		smp_wmb();
	}
#endif
	if (po->tp_version <= TPACKET_V2)
		__packet_set_status(po, h.raw, status);
	else
		prb_clear_blk_fill_status(&po->rx_ring);

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}

69e3c75f
JB
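/* Destructor for skbs built from the TX ring: once the device has
 * consumed the data, hand the ring slot back to user space
 * (TP_STATUS_AVAILABLE), decrement the pending counter and fall
 * through to the normal sock_wfree() accounting.
 */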
1893static void tpacket_destruct_skb(struct sk_buff *skb)
1894{
1895 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1896 void *ph;
1da177e4 1897
69e3c75f
JB
1898 if (likely(po->tx_ring.pg_vec)) {
1899 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1900 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1901 atomic_dec(&po->tx_ring.pending);
1902 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1903 }
1904
1905 sock_wfree(skb);
1906}
1907
40d4e3df
ED
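/* Build an skb from one TX ring frame: the link-layer header is either
 * constructed via dev_hard_header() (SOCK_DGRAM) or copied into the
 * linear area from the frame itself (SOCK_RAW), while the remaining
 * payload is attached as page fragments that reference the ring pages
 * directly, so the bulk of the data is not copied.
 */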
1908static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1909 void *frame, struct net_device *dev, int size_max,
ae641949 1910 __be16 proto, unsigned char *addr, int hlen)
69e3c75f
JB
1911{
1912 union {
1913 struct tpacket_hdr *h1;
1914 struct tpacket2_hdr *h2;
1915 void *raw;
1916 } ph;
1917 int to_write, offset, len, tp_len, nr_frags, len_max;
1918 struct socket *sock = po->sk.sk_socket;
1919 struct page *page;
1920 void *data;
1921 int err;
1922
1923 ph.raw = frame;
1924
1925 skb->protocol = proto;
1926 skb->dev = dev;
1927 skb->priority = po->sk.sk_priority;
2d37a186 1928 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1929 skb_shinfo(skb)->destructor_arg = ph.raw;
1930
1931 switch (po->tp_version) {
1932 case TPACKET_V2:
1933 tp_len = ph.h2->tp_len;
1934 break;
1935 default:
1936 tp_len = ph.h1->tp_len;
1937 break;
1938 }
1939 if (unlikely(tp_len > size_max)) {
40d4e3df 1940 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1941 return -EMSGSIZE;
1942 }
1943
ae641949 1944 skb_reserve(skb, hlen);
69e3c75f 1945 skb_reset_network_header(skb);
40893fd0 1946 skb_probe_transport_header(skb, 0);
c1aad275 1947
5920cd3a
PC
1948 if (po->tp_tx_has_off) {
1949 int off_min, off_max, off;
1950 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1951 off_max = po->tx_ring.frame_size - tp_len;
1952 if (sock->type == SOCK_DGRAM) {
1953 switch (po->tp_version) {
1954 case TPACKET_V2:
1955 off = ph.h2->tp_net;
1956 break;
1957 default:
1958 off = ph.h1->tp_net;
1959 break;
1960 }
1961 } else {
1962 switch (po->tp_version) {
1963 case TPACKET_V2:
1964 off = ph.h2->tp_mac;
1965 break;
1966 default:
1967 off = ph.h1->tp_mac;
1968 break;
1969 }
1970 }
1971 if (unlikely((off < off_min) || (off_max < off)))
1972 return -EINVAL;
1973 data = ph.raw + off;
1974 } else {
1975 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1976 }
69e3c75f
JB
1977 to_write = tp_len;
1978
1979 if (sock->type == SOCK_DGRAM) {
1980 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1981 NULL, tp_len);
1982 if (unlikely(err < 0))
1983 return -EINVAL;
40d4e3df 1984 } else if (dev->hard_header_len) {
69e3c75f
JB
1985 /* net device doesn't like empty head */
1986 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1987 pr_err("packet size is too short (%d < %d)\n",
1988 tp_len, dev->hard_header_len);
69e3c75f
JB
1989 return -EINVAL;
1990 }
1991
1992 skb_push(skb, dev->hard_header_len);
1993 err = skb_store_bits(skb, 0, data,
1994 dev->hard_header_len);
1995 if (unlikely(err))
1996 return err;
1997
1998 data += dev->hard_header_len;
1999 to_write -= dev->hard_header_len;
2000 }
2001
69e3c75f
JB
2002 offset = offset_in_page(data);
2003 len_max = PAGE_SIZE - offset;
2004 len = ((to_write > len_max) ? len_max : to_write);
2005
2006 skb->data_len = to_write;
2007 skb->len += to_write;
2008 skb->truesize += to_write;
2009 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2010
2011 while (likely(to_write)) {
2012 nr_frags = skb_shinfo(skb)->nr_frags;
2013
2014 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2015 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2016 MAX_SKB_FRAGS);
69e3c75f
JB
2017 return -EFAULT;
2018 }
2019
0af55bb5
CG
2020 page = pgv_to_page(data);
2021 data += len;
69e3c75f
JB
2022 flush_dcache_page(page);
2023 get_page(page);
0af55bb5 2024 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2025 to_write -= len;
2026 offset = 0;
2027 len_max = PAGE_SIZE;
2028 len = ((to_write > len_max) ? len_max : to_write);
2029 }
2030
2031 return tp_len;
2032}
2033
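/* TX ring send loop: walk frames marked TP_STATUS_SEND_REQUEST, build
 * an skb for each via tpacket_fill_skb() and queue it with
 * dev_queue_xmit(). A frame moves SEND_REQUEST -> SENDING -> AVAILABLE
 * (or WRONG_FORMAT on error); without MSG_DONTWAIT the loop keeps
 * running until all pending transmissions have completed.
 */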
2034static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2035{
69e3c75f
JB
2036 struct sk_buff *skb;
2037 struct net_device *dev;
2038 __be16 proto;
827d9780
BG
2039 bool need_rls_dev = false;
2040 int err, reserve = 0;
40d4e3df
ED
2041 void *ph;
2042 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2043 int tp_len, size_max;
2044 unsigned char *addr;
2045 int len_sum = 0;
9e67030a 2046 int status = TP_STATUS_AVAILABLE;
ae641949 2047 int hlen, tlen;
69e3c75f 2048
69e3c75f
JB
2049 mutex_lock(&po->pg_vec_lock);
2050
69e3c75f 2051 if (saddr == NULL) {
827d9780 2052 dev = po->prot_hook.dev;
69e3c75f
JB
2053 proto = po->num;
2054 addr = NULL;
2055 } else {
2056 err = -EINVAL;
2057 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2058 goto out;
2059 if (msg->msg_namelen < (saddr->sll_halen
2060 + offsetof(struct sockaddr_ll,
2061 sll_addr)))
2062 goto out;
69e3c75f
JB
2063 proto = saddr->sll_protocol;
2064 addr = saddr->sll_addr;
827d9780
BG
2065 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2066 need_rls_dev = true;
69e3c75f
JB
2067 }
2068
69e3c75f
JB
2069 err = -ENXIO;
2070 if (unlikely(dev == NULL))
2071 goto out;
2072
2073 reserve = dev->hard_header_len;
2074
2075 err = -ENETDOWN;
2076 if (unlikely(!(dev->flags & IFF_UP)))
2077 goto out_put;
2078
2079 size_max = po->tx_ring.frame_size
b5dd884e 2080 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2081
2082 if (size_max > dev->mtu + reserve)
2083 size_max = dev->mtu + reserve;
2084
2085 do {
2086 ph = packet_current_frame(po, &po->tx_ring,
2087 TP_STATUS_SEND_REQUEST);
2088
2089 if (unlikely(ph == NULL)) {
2090 schedule();
2091 continue;
2092 }
2093
2094 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2095 hlen = LL_RESERVED_SPACE(dev);
2096 tlen = dev->needed_tailroom;
69e3c75f 2097 skb = sock_alloc_send_skb(&po->sk,
ae641949 2098 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2099 0, &err);
2100
2101 if (unlikely(skb == NULL))
2102 goto out_status;
2103
2104 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2105 addr, hlen);
69e3c75f
JB
2106
2107 if (unlikely(tp_len < 0)) {
2108 if (po->tp_loss) {
2109 __packet_set_status(po, ph,
2110 TP_STATUS_AVAILABLE);
2111 packet_increment_head(&po->tx_ring);
2112 kfree_skb(skb);
2113 continue;
2114 } else {
2115 status = TP_STATUS_WRONG_FORMAT;
2116 err = tp_len;
2117 goto out_status;
2118 }
2119 }
2120
2121 skb->destructor = tpacket_destruct_skb;
2122 __packet_set_status(po, ph, TP_STATUS_SENDING);
2123 atomic_inc(&po->tx_ring.pending);
2124
2125 status = TP_STATUS_SEND_REQUEST;
2126 err = dev_queue_xmit(skb);
eb70df13
JP
2127 if (unlikely(err > 0)) {
2128 err = net_xmit_errno(err);
2129 if (err && __packet_get_status(po, ph) ==
2130 TP_STATUS_AVAILABLE) {
2131 /* skb was destructed already */
2132 skb = NULL;
2133 goto out_status;
2134 }
2135 /*
2136 * skb was dropped but not destructed yet;
2137 * let's treat it like congestion or err < 0
2138 */
2139 err = 0;
2140 }
69e3c75f
JB
2141 packet_increment_head(&po->tx_ring);
2142 len_sum += tp_len;
f64f9e71
JP
2143 } while (likely((ph != NULL) ||
2144 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2145 (atomic_read(&po->tx_ring.pending))))
2146 );
69e3c75f
JB
2147
2148 err = len_sum;
2149 goto out_put;
2150
69e3c75f
JB
2151out_status:
2152 __packet_set_status(po, ph, status);
2153 kfree_skb(skb);
2154out_put:
827d9780
BG
2155 if (need_rls_dev)
2156 dev_put(dev);
69e3c75f
JB
2157out:
2158 mutex_unlock(&po->pg_vec_lock);
2159 return err;
2160}
69e3c75f 2161
eea49cc9
OJ
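/* Allocate an skb with at most 'linear' bytes of linear data and the
 * remainder as paged data; packets smaller than a page stay entirely
 * linear. Used by packet_snd() so that large GSO frames described by a
 * virtio_net_hdr do not require one huge linear buffer.
 */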
2162static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2163 size_t reserve, size_t len,
2164 size_t linear, int noblock,
2165 int *err)
bfd5f4a3
SS
2166{
2167 struct sk_buff *skb;
2168
2169 /* Under a page? Don't bother with paged skb. */
2170 if (prepad + len < PAGE_SIZE || !linear)
2171 linear = len;
2172
2173 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2174 err);
2175 if (!skb)
2176 return NULL;
2177
2178 skb_reserve(skb, reserve);
2179 skb_put(skb, linear);
2180 skb->data_len = len - linear;
2181 skb->len += len - linear;
2182
2183 return skb;
2184}
2185
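/* Non-ring sendmsg path: resolve the output device (the bound device or
 * the one named in msg_name), optionally parse a leading virtio_net_hdr,
 * copy the user iovec into a freshly allocated skb and hand the result
 * to dev_queue_xmit().
 */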
69e3c75f 2186static int packet_snd(struct socket *sock,
1da177e4
LT
2187 struct msghdr *msg, size_t len)
2188{
2189 struct sock *sk = sock->sk;
40d4e3df 2190 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2191 struct sk_buff *skb;
2192 struct net_device *dev;
0e11c91e 2193 __be16 proto;
827d9780 2194 bool need_rls_dev = false;
1da177e4 2195 unsigned char *addr;
827d9780 2196 int err, reserve = 0;
bfd5f4a3
SS
2197 struct virtio_net_hdr vnet_hdr = { 0 };
2198 int offset = 0;
2199 int vnet_hdr_len;
2200 struct packet_sock *po = pkt_sk(sk);
2201 unsigned short gso_type = 0;
ae641949 2202 int hlen, tlen;
3bdc0eba 2203 int extra_len = 0;
1da177e4
LT
2204
2205 /*
1ce4f28b 2206 * Get and verify the address.
1da177e4 2207 */
1ce4f28b 2208
1da177e4 2209 if (saddr == NULL) {
827d9780 2210 dev = po->prot_hook.dev;
1da177e4
LT
2211 proto = po->num;
2212 addr = NULL;
2213 } else {
2214 err = -EINVAL;
2215 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2216 goto out;
0fb375fb
EB
2217 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2218 goto out;
1da177e4
LT
2219 proto = saddr->sll_protocol;
2220 addr = saddr->sll_addr;
827d9780
BG
2221 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2222 need_rls_dev = true;
1da177e4
LT
2223 }
2224
1da177e4
LT
2225 err = -ENXIO;
2226 if (dev == NULL)
2227 goto out_unlock;
2228 if (sock->type == SOCK_RAW)
2229 reserve = dev->hard_header_len;
2230
d5e76b0a
DM
2231 err = -ENETDOWN;
2232 if (!(dev->flags & IFF_UP))
2233 goto out_unlock;
2234
bfd5f4a3
SS
2235 if (po->has_vnet_hdr) {
2236 vnet_hdr_len = sizeof(vnet_hdr);
2237
2238 err = -EINVAL;
2239 if (len < vnet_hdr_len)
2240 goto out_unlock;
2241
2242 len -= vnet_hdr_len;
2243
2244 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2245 vnet_hdr_len);
2246 if (err < 0)
2247 goto out_unlock;
2248
2249 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2250 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2251 vnet_hdr.hdr_len))
2252 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2253 vnet_hdr.csum_offset + 2;
2254
2255 err = -EINVAL;
2256 if (vnet_hdr.hdr_len > len)
2257 goto out_unlock;
2258
2259 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2260 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2261 case VIRTIO_NET_HDR_GSO_TCPV4:
2262 gso_type = SKB_GSO_TCPV4;
2263 break;
2264 case VIRTIO_NET_HDR_GSO_TCPV6:
2265 gso_type = SKB_GSO_TCPV6;
2266 break;
2267 case VIRTIO_NET_HDR_GSO_UDP:
2268 gso_type = SKB_GSO_UDP;
2269 break;
2270 default:
2271 goto out_unlock;
2272 }
2273
2274 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2275 gso_type |= SKB_GSO_TCP_ECN;
2276
2277 if (vnet_hdr.gso_size == 0)
2278 goto out_unlock;
2279
2280 }
2281 }
2282
3bdc0eba
BG
2283 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2284 if (!netif_supports_nofcs(dev)) {
2285 err = -EPROTONOSUPPORT;
2286 goto out_unlock;
2287 }
2288 extra_len = 4; /* We're doing our own CRC */
2289 }
2290
1da177e4 2291 err = -EMSGSIZE;
3bdc0eba 2292 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2293 goto out_unlock;
2294
bfd5f4a3 2295 err = -ENOBUFS;
ae641949
HX
2296 hlen = LL_RESERVED_SPACE(dev);
2297 tlen = dev->needed_tailroom;
2298 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2299 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2300 if (skb == NULL)
1da177e4
LT
2301 goto out_unlock;
2302
bfd5f4a3 2303 skb_set_network_header(skb, reserve);
1da177e4 2304
0c4e8581
SH
2305 err = -EINVAL;
2306 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2307 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2308 goto out_free;
1da177e4
LT
2309
2310 /* Returns -EFAULT on error */
bfd5f4a3 2311 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2312 if (err)
2313 goto out_free;
bf84a010
DB
2314
2315 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2316
3bdc0eba 2317 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2318 /* Earlier code assumed this would be a VLAN pkt,
2319 * double-check this now that we have the actual
2320 * packet in hand.
2321 */
2322 struct ethhdr *ehdr;
2323 skb_reset_mac_header(skb);
2324 ehdr = eth_hdr(skb);
2325 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2326 err = -EMSGSIZE;
2327 goto out_free;
2328 }
2329 }
2330
1da177e4
LT
2331 skb->protocol = proto;
2332 skb->dev = dev;
2333 skb->priority = sk->sk_priority;
2d37a186 2334 skb->mark = sk->sk_mark;
1da177e4 2335
bfd5f4a3
SS
2336 if (po->has_vnet_hdr) {
2337 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2338 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2339 vnet_hdr.csum_offset)) {
2340 err = -EINVAL;
2341 goto out_free;
2342 }
2343 }
2344
2345 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2346 skb_shinfo(skb)->gso_type = gso_type;
2347
2348 /* Header must be checked, and gso_segs computed. */
2349 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2350 skb_shinfo(skb)->gso_segs = 0;
2351
2352 len += vnet_hdr_len;
2353 }
2354
40893fd0 2355 skb_probe_transport_header(skb, reserve);
c1aad275 2356
3bdc0eba
BG
2357 if (unlikely(extra_len == 4))
2358 skb->no_fcs = 1;
2359
1da177e4
LT
2360 /*
2361 * Now send it
2362 */
2363
2364 err = dev_queue_xmit(skb);
2365 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2366 goto out_unlock;
2367
827d9780
BG
2368 if (need_rls_dev)
2369 dev_put(dev);
1da177e4 2370
40d4e3df 2371 return len;
1da177e4
LT
2372
2373out_free:
2374 kfree_skb(skb);
2375out_unlock:
827d9780 2376 if (dev && need_rls_dev)
1da177e4
LT
2377 dev_put(dev);
2378out:
2379 return err;
2380}
2381
69e3c75f
JB
2382static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2383 struct msghdr *msg, size_t len)
2384{
69e3c75f
JB
2385 struct sock *sk = sock->sk;
2386 struct packet_sock *po = pkt_sk(sk);
2387 if (po->tx_ring.pg_vec)
2388 return tpacket_snd(po, msg);
2389 else
69e3c75f
JB
2390 return packet_snd(sock, msg, len);
2391}
2392
1da177e4
LT
2393/*
2394 * Close a PACKET socket. This is fairly simple. We immediately go
2395 * to 'closed' state and remove our protocol entry in the device list.
2396 */
2397
2398static int packet_release(struct socket *sock)
2399{
2400 struct sock *sk = sock->sk;
2401 struct packet_sock *po;
d12d01d6 2402 struct net *net;
f6fb8f10 2403 union tpacket_req_u req_u;
1da177e4
LT
2404
2405 if (!sk)
2406 return 0;
2407
3b1e0a65 2408 net = sock_net(sk);
1da177e4
LT
2409 po = pkt_sk(sk);
2410
0fa7fa98 2411 mutex_lock(&net->packet.sklist_lock);
808f5114 2412 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2413 mutex_unlock(&net->packet.sklist_lock);
2414
2415 preempt_disable();
920de804 2416 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2417 preempt_enable();
1da177e4 2418
808f5114 2419 spin_lock(&po->bind_lock);
ce06b03e 2420 unregister_prot_hook(sk, false);
160ff18a
BG
2421 if (po->prot_hook.dev) {
2422 dev_put(po->prot_hook.dev);
2423 po->prot_hook.dev = NULL;
2424 }
808f5114 2425 spin_unlock(&po->bind_lock);
1da177e4 2426
1da177e4 2427 packet_flush_mclist(sk);
1da177e4 2428
9665d5d6
PS
2429 if (po->rx_ring.pg_vec) {
2430 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2431 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2432 }
69e3c75f 2433
9665d5d6
PS
2434 if (po->tx_ring.pg_vec) {
2435 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2436 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2437 }
1da177e4 2438
dc99f600
DM
2439 fanout_release(sk);
2440
808f5114 2441 synchronize_net();
1da177e4
LT
2442 /*
2443 * Now the socket is dead. No more input will appear.
2444 */
1da177e4
LT
2445 sock_orphan(sk);
2446 sock->sk = NULL;
2447
2448 /* Purge queues */
2449
2450 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2451 sk_refcnt_debug_release(sk);
1da177e4
LT
2452
2453 sock_put(sk);
2454 return 0;
2455}
2456
2457/*
2458 * Attach a packet hook.
2459 */
2460
0e11c91e 2461static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2462{
2463 struct packet_sock *po = pkt_sk(sk);
dc99f600 2464
aef950b4
WY
2465 if (po->fanout) {
2466 if (dev)
2467 dev_put(dev);
2468
dc99f600 2469 return -EINVAL;
aef950b4 2470 }
1da177e4
LT
2471
2472 lock_sock(sk);
2473
2474 spin_lock(&po->bind_lock);
ce06b03e 2475 unregister_prot_hook(sk, true);
1da177e4
LT
2476 po->num = protocol;
2477 po->prot_hook.type = protocol;
160ff18a
BG
2478 if (po->prot_hook.dev)
2479 dev_put(po->prot_hook.dev);
1da177e4
LT
2480 po->prot_hook.dev = dev;
2481
2482 po->ifindex = dev ? dev->ifindex : 0;
2483
2484 if (protocol == 0)
2485 goto out_unlock;
2486
be85d4ad 2487 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2488 register_prot_hook(sk);
be85d4ad
UT
2489 } else {
2490 sk->sk_err = ENETDOWN;
2491 if (!sock_flag(sk, SOCK_DEAD))
2492 sk->sk_error_report(sk);
1da177e4
LT
2493 }
2494
2495out_unlock:
2496 spin_unlock(&po->bind_lock);
2497 release_sock(sk);
2498 return 0;
2499}
2500
2501/*
2502 * Bind a packet socket to a device
2503 */
2504
40d4e3df
ED
2505static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2506 int addr_len)
1da177e4 2507{
40d4e3df 2508 struct sock *sk = sock->sk;
1da177e4
LT
2509 char name[15];
2510 struct net_device *dev;
2511 int err = -ENODEV;
1ce4f28b 2512
1da177e4
LT
2513 /*
2514 * Check legality
2515 */
1ce4f28b 2516
8ae55f04 2517 if (addr_len != sizeof(struct sockaddr))
1da177e4 2518 return -EINVAL;
40d4e3df 2519 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2520
3b1e0a65 2521 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2522 if (dev)
1da177e4 2523 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2524 return err;
2525}
1da177e4
LT
2526
2527static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2528{
40d4e3df
ED
2529 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2530 struct sock *sk = sock->sk;
1da177e4
LT
2531 struct net_device *dev = NULL;
2532 int err;
2533
2534
2535 /*
2536 * Check legality
2537 */
1ce4f28b 2538
1da177e4
LT
2539 if (addr_len < sizeof(struct sockaddr_ll))
2540 return -EINVAL;
2541 if (sll->sll_family != AF_PACKET)
2542 return -EINVAL;
2543
2544 if (sll->sll_ifindex) {
2545 err = -ENODEV;
3b1e0a65 2546 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2547 if (dev == NULL)
2548 goto out;
2549 }
2550 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2551
2552out:
2553 return err;
2554}
2555
2556static struct proto packet_proto = {
2557 .name = "PACKET",
2558 .owner = THIS_MODULE,
2559 .obj_size = sizeof(struct packet_sock),
2560};
2561
2562/*
1ce4f28b 2563 * Create a packet socket (SOCK_DGRAM, SOCK_RAW or SOCK_PACKET).
1da177e4
LT
2564 */
2565
3f378b68
EP
2566static int packet_create(struct net *net, struct socket *sock, int protocol,
2567 int kern)
1da177e4
LT
2568{
2569 struct sock *sk;
2570 struct packet_sock *po;
0e11c91e 2571 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2572 int err;
2573
df008c91 2574 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2575 return -EPERM;
be02097c
DM
2576 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2577 sock->type != SOCK_PACKET)
1da177e4
LT
2578 return -ESOCKTNOSUPPORT;
2579
2580 sock->state = SS_UNCONNECTED;
2581
2582 err = -ENOBUFS;
6257ff21 2583 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2584 if (sk == NULL)
2585 goto out;
2586
2587 sock->ops = &packet_ops;
1da177e4
LT
2588 if (sock->type == SOCK_PACKET)
2589 sock->ops = &packet_ops_spkt;
be02097c 2590
1da177e4
LT
2591 sock_init_data(sock, sk);
2592
2593 po = pkt_sk(sk);
2594 sk->sk_family = PF_PACKET;
0e11c91e 2595 po->num = proto;
1da177e4
LT
2596
2597 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2598 sk_refcnt_debug_inc(sk);
1da177e4
LT
2599
2600 /*
2601 * Attach a protocol block
2602 */
2603
2604 spin_lock_init(&po->bind_lock);
905db440 2605 mutex_init(&po->pg_vec_lock);
1da177e4 2606 po->prot_hook.func = packet_rcv;
be02097c 2607
1da177e4
LT
2608 if (sock->type == SOCK_PACKET)
2609 po->prot_hook.func = packet_rcv_spkt;
be02097c 2610
1da177e4
LT
2611 po->prot_hook.af_packet_priv = sk;
2612
0e11c91e
AV
2613 if (proto) {
2614 po->prot_hook.type = proto;
ce06b03e 2615 register_prot_hook(sk);
1da177e4
LT
2616 }
2617
0fa7fa98 2618 mutex_lock(&net->packet.sklist_lock);
808f5114 2619 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2620 mutex_unlock(&net->packet.sklist_lock);
2621
2622 preempt_disable();
3680453c 2623 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2624 preempt_enable();
808f5114 2625
40d4e3df 2626 return 0;
1da177e4
LT
2627out:
2628 return err;
2629}
2630
ed85b565
RC
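/* MSG_ERRQUEUE receive path: dequeue one skb from the socket error
 * queue, copy its payload to the caller, deliver the attached extended
 * error as a PACKET_TX_TIMESTAMP control message and regenerate sk_err
 * from whatever is still queued.
 */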
2631static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2632{
2633 struct sock_exterr_skb *serr;
2634 struct sk_buff *skb, *skb2;
2635 int copied, err;
2636
2637 err = -EAGAIN;
2638 skb = skb_dequeue(&sk->sk_error_queue);
2639 if (skb == NULL)
2640 goto out;
2641
2642 copied = skb->len;
2643 if (copied > len) {
2644 msg->msg_flags |= MSG_TRUNC;
2645 copied = len;
2646 }
2647 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2648 if (err)
2649 goto out_free_skb;
2650
2651 sock_recv_timestamp(msg, sk, skb);
2652
2653 serr = SKB_EXT_ERR(skb);
2654 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2655 sizeof(serr->ee), &serr->ee);
2656
2657 msg->msg_flags |= MSG_ERRQUEUE;
2658 err = copied;
2659
2660 /* Reset and regenerate socket error */
2661 spin_lock_bh(&sk->sk_error_queue.lock);
2662 sk->sk_err = 0;
2663 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2664 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2665 spin_unlock_bh(&sk->sk_error_queue.lock);
2666 sk->sk_error_report(sk);
2667 } else
2668 spin_unlock_bh(&sk->sk_error_queue.lock);
2669
2670out_free_skb:
2671 kfree_skb(skb);
2672out:
2673 return err;
2674}
2675
1da177e4
LT
2676/*
2677 * Pull a packet from our receive queue and hand it to the user.
2678 * If necessary we block.
2679 */
2680
2681static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2682 struct msghdr *msg, size_t len, int flags)
2683{
2684 struct sock *sk = sock->sk;
2685 struct sk_buff *skb;
2686 int copied, err;
0fb375fb 2687 struct sockaddr_ll *sll;
bfd5f4a3 2688 int vnet_hdr_len = 0;
1da177e4
LT
2689
2690 err = -EINVAL;
ed85b565 2691 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2692 goto out;
2693
2694#if 0
2695 /* What error should we return now? EUNATTACH? */
2696 if (pkt_sk(sk)->ifindex < 0)
2697 return -ENODEV;
2698#endif
2699
ed85b565
RC
2700 if (flags & MSG_ERRQUEUE) {
2701 err = packet_recv_error(sk, msg, len);
2702 goto out;
2703 }
2704
1da177e4
LT
2705 /*
2706 * Call the generic datagram receiver. This handles all sorts
2707 * of horrible races and re-entrancy so we can forget about it
2708 * in the protocol layers.
2709 *
2710 * Now it will return ENETDOWN if the device has just gone down,
2711 * but then it will block.
2712 */
2713
40d4e3df 2714 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2715
2716 /*
1ce4f28b 2717 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
2718 * handles the blocking, we don't need to see or worry about
2719 * blocking retries.
2720 */
2721
8ae55f04 2722 if (skb == NULL)
1da177e4
LT
2723 goto out;
2724
bfd5f4a3
SS
2725 if (pkt_sk(sk)->has_vnet_hdr) {
2726 struct virtio_net_hdr vnet_hdr = { 0 };
2727
2728 err = -EINVAL;
2729 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2730 if (len < vnet_hdr_len)
bfd5f4a3
SS
2731 goto out_free;
2732
1f18b717
MK
2733 len -= vnet_hdr_len;
2734
bfd5f4a3
SS
2735 if (skb_is_gso(skb)) {
2736 struct skb_shared_info *sinfo = skb_shinfo(skb);
2737
2738 /* This is a hint as to how much should be linear. */
2739 vnet_hdr.hdr_len = skb_headlen(skb);
2740 vnet_hdr.gso_size = sinfo->gso_size;
2741 if (sinfo->gso_type & SKB_GSO_TCPV4)
2742 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2743 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2744 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2745 else if (sinfo->gso_type & SKB_GSO_UDP)
2746 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2747 else if (sinfo->gso_type & SKB_GSO_FCOE)
2748 goto out_free;
2749 else
2750 BUG();
2751 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2752 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2753 } else
2754 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2755
2756 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2757 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2758 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2759 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2760 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2761 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2762 } /* else everything is zero */
2763
2764 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2765 vnet_hdr_len);
2766 if (err < 0)
2767 goto out_free;
2768 }
2769
0fb375fb
EB
2770 /*
2771 * If the address length field is there to be filled in, we fill
2772 * it in now.
2773 */
2774
ffbc6111 2775 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2776 if (sock->type == SOCK_PACKET)
2777 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2778 else
2779 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2780
1da177e4
LT
2781 /*
2782 * You lose any data beyond the buffer you gave. If it worries a
2783 * user program, it can ask the device for its MTU anyway.
2784 */
2785
2786 copied = skb->len;
40d4e3df
ED
2787 if (copied > len) {
2788 copied = len;
2789 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2790 }
2791
2792 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2793 if (err)
2794 goto out_free;
2795
3b885787 2796 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2797
2798 if (msg->msg_name)
ffbc6111
HX
2799 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2800 msg->msg_namelen);
1da177e4 2801
8dc41944 2802 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2803 struct tpacket_auxdata aux;
2804
2805 aux.tp_status = TP_STATUS_USER;
2806 if (skb->ip_summed == CHECKSUM_PARTIAL)
2807 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2808 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2809 aux.tp_snaplen = skb->len;
2810 aux.tp_mac = 0;
bbe735e4 2811 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2812 if (vlan_tx_tag_present(skb)) {
2813 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2814 aux.tp_status |= TP_STATUS_VLAN_VALID;
2815 } else {
2816 aux.tp_vlan_tci = 0;
2817 }
13fcb7bd 2818 aux.tp_padding = 0;
ffbc6111 2819 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2820 }
2821
1da177e4
LT
2822 /*
2823 * Free or return the buffer as appropriate. Again this
2824 * hides all the races and re-entrancy issues from us.
2825 */
bfd5f4a3 2826 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2827
2828out_free:
2829 skb_free_datagram(sk, skb);
2830out:
2831 return err;
2832}
2833
1da177e4
LT
2834static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2835 int *uaddr_len, int peer)
2836{
2837 struct net_device *dev;
2838 struct sock *sk = sock->sk;
2839
2840 if (peer)
2841 return -EOPNOTSUPP;
2842
2843 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2844 rcu_read_lock();
2845 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2846 if (dev)
67286640 2847 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2848 else
1da177e4 2849 memset(uaddr->sa_data, 0, 14);
654d1f8a 2850 rcu_read_unlock();
1da177e4
LT
2851 *uaddr_len = sizeof(*uaddr);
2852
2853 return 0;
2854}
1da177e4
LT
2855
2856static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2857 int *uaddr_len, int peer)
2858{
2859 struct net_device *dev;
2860 struct sock *sk = sock->sk;
2861 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2862 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2863
2864 if (peer)
2865 return -EOPNOTSUPP;
2866
2867 sll->sll_family = AF_PACKET;
2868 sll->sll_ifindex = po->ifindex;
2869 sll->sll_protocol = po->num;
67286640 2870 sll->sll_pkttype = 0;
654d1f8a
ED
2871 rcu_read_lock();
2872 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2873 if (dev) {
2874 sll->sll_hatype = dev->type;
2875 sll->sll_halen = dev->addr_len;
2876 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2877 } else {
2878 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2879 sll->sll_halen = 0;
2880 }
654d1f8a 2881 rcu_read_unlock();
0fb375fb 2882 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2883
2884 return 0;
2885}
2886
2aeb0b88
WC
2887static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2888 int what)
1da177e4
LT
2889{
2890 switch (i->type) {
2891 case PACKET_MR_MULTICAST:
1162563f
JP
2892 if (i->alen != dev->addr_len)
2893 return -EINVAL;
1da177e4 2894 if (what > 0)
22bedad3 2895 return dev_mc_add(dev, i->addr);
1da177e4 2896 else
22bedad3 2897 return dev_mc_del(dev, i->addr);
1da177e4
LT
2898 break;
2899 case PACKET_MR_PROMISC:
2aeb0b88 2900 return dev_set_promiscuity(dev, what);
1da177e4
LT
2901 break;
2902 case PACKET_MR_ALLMULTI:
2aeb0b88 2903 return dev_set_allmulti(dev, what);
1da177e4 2904 break;
d95ed927 2905 case PACKET_MR_UNICAST:
1162563f
JP
2906 if (i->alen != dev->addr_len)
2907 return -EINVAL;
d95ed927 2908 if (what > 0)
a748ee24 2909 return dev_uc_add(dev, i->addr);
d95ed927 2910 else
a748ee24 2911 return dev_uc_del(dev, i->addr);
d95ed927 2912 break;
40d4e3df
ED
2913 default:
2914 break;
1da177e4 2915 }
2aeb0b88 2916 return 0;
1da177e4
LT
2917}
2918
2919static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2920{
40d4e3df 2921 for ( ; i; i = i->next) {
1da177e4
LT
2922 if (i->ifindex == dev->ifindex)
2923 packet_dev_mc(dev, i, what);
2924 }
2925}
2926
0fb375fb 2927static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2928{
2929 struct packet_sock *po = pkt_sk(sk);
2930 struct packet_mclist *ml, *i;
2931 struct net_device *dev;
2932 int err;
2933
2934 rtnl_lock();
2935
2936 err = -ENODEV;
3b1e0a65 2937 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2938 if (!dev)
2939 goto done;
2940
2941 err = -EINVAL;
1162563f 2942 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2943 goto done;
2944
2945 err = -ENOBUFS;
8b3a7005 2946 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2947 if (i == NULL)
2948 goto done;
2949
2950 err = 0;
2951 for (ml = po->mclist; ml; ml = ml->next) {
2952 if (ml->ifindex == mreq->mr_ifindex &&
2953 ml->type == mreq->mr_type &&
2954 ml->alen == mreq->mr_alen &&
2955 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2956 ml->count++;
2957 /* Free the new element ... */
2958 kfree(i);
2959 goto done;
2960 }
2961 }
2962
2963 i->type = mreq->mr_type;
2964 i->ifindex = mreq->mr_ifindex;
2965 i->alen = mreq->mr_alen;
2966 memcpy(i->addr, mreq->mr_address, i->alen);
2967 i->count = 1;
2968 i->next = po->mclist;
2969 po->mclist = i;
2aeb0b88
WC
2970 err = packet_dev_mc(dev, i, 1);
2971 if (err) {
2972 po->mclist = i->next;
2973 kfree(i);
2974 }
1da177e4
LT
2975
2976done:
2977 rtnl_unlock();
2978 return err;
2979}
2980
0fb375fb 2981static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2982{
2983 struct packet_mclist *ml, **mlp;
2984
2985 rtnl_lock();
2986
2987 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2988 if (ml->ifindex == mreq->mr_ifindex &&
2989 ml->type == mreq->mr_type &&
2990 ml->alen == mreq->mr_alen &&
2991 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2992 if (--ml->count == 0) {
2993 struct net_device *dev;
2994 *mlp = ml->next;
ad959e76
ED
2995 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2996 if (dev)
1da177e4 2997 packet_dev_mc(dev, ml, -1);
1da177e4
LT
2998 kfree(ml);
2999 }
3000 rtnl_unlock();
3001 return 0;
3002 }
3003 }
3004 rtnl_unlock();
3005 return -EADDRNOTAVAIL;
3006}
3007
3008static void packet_flush_mclist(struct sock *sk)
3009{
3010 struct packet_sock *po = pkt_sk(sk);
3011 struct packet_mclist *ml;
3012
3013 if (!po->mclist)
3014 return;
3015
3016 rtnl_lock();
3017 while ((ml = po->mclist) != NULL) {
3018 struct net_device *dev;
3019
3020 po->mclist = ml->next;
ad959e76
ED
3021 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3022 if (dev != NULL)
1da177e4 3023 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3024 kfree(ml);
3025 }
3026 rtnl_unlock();
3027}
1da177e4
LT
3028
3029static int
b7058842 3030packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3031{
3032 struct sock *sk = sock->sk;
8dc41944 3033 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3034 int ret;
3035
3036 if (level != SOL_PACKET)
3037 return -ENOPROTOOPT;
3038
69e3c75f 3039 switch (optname) {
1ce4f28b 3040 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3041 case PACKET_DROP_MEMBERSHIP:
3042 {
0fb375fb
EB
3043 struct packet_mreq_max mreq;
3044 int len = optlen;
3045 memset(&mreq, 0, sizeof(mreq));
3046 if (len < sizeof(struct packet_mreq))
1da177e4 3047 return -EINVAL;
0fb375fb
EB
3048 if (len > sizeof(mreq))
3049 len = sizeof(mreq);
40d4e3df 3050 if (copy_from_user(&mreq, optval, len))
1da177e4 3051 return -EFAULT;
0fb375fb
EB
3052 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3053 return -EINVAL;
1da177e4
LT
3054 if (optname == PACKET_ADD_MEMBERSHIP)
3055 ret = packet_mc_add(sk, &mreq);
3056 else
3057 ret = packet_mc_drop(sk, &mreq);
3058 return ret;
3059 }
a2efcfa0 3060
1da177e4 3061 case PACKET_RX_RING:
69e3c75f 3062 case PACKET_TX_RING:
1da177e4 3063 {
f6fb8f10 3064 union tpacket_req_u req_u;
3065 int len;
1da177e4 3066
f6fb8f10 3067 switch (po->tp_version) {
3068 case TPACKET_V1:
3069 case TPACKET_V2:
3070 len = sizeof(req_u.req);
3071 break;
3072 case TPACKET_V3:
3073 default:
3074 len = sizeof(req_u.req3);
3075 break;
3076 }
3077 if (optlen < len)
1da177e4 3078 return -EINVAL;
bfd5f4a3
SS
3079 if (pkt_sk(sk)->has_vnet_hdr)
3080 return -EINVAL;
f6fb8f10 3081 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3082 return -EFAULT;
f6fb8f10 3083 return packet_set_ring(sk, &req_u, 0,
3084 optname == PACKET_TX_RING);
1da177e4
LT
3085 }
3086 case PACKET_COPY_THRESH:
3087 {
3088 int val;
3089
40d4e3df 3090 if (optlen != sizeof(val))
1da177e4 3091 return -EINVAL;
40d4e3df 3092 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3093 return -EFAULT;
3094
3095 pkt_sk(sk)->copy_thresh = val;
3096 return 0;
3097 }
bbd6ef87
PM
3098 case PACKET_VERSION:
3099 {
3100 int val;
3101
3102 if (optlen != sizeof(val))
3103 return -EINVAL;
69e3c75f 3104 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3105 return -EBUSY;
3106 if (copy_from_user(&val, optval, sizeof(val)))
3107 return -EFAULT;
3108 switch (val) {
3109 case TPACKET_V1:
3110 case TPACKET_V2:
f6fb8f10 3111 case TPACKET_V3:
bbd6ef87
PM
3112 po->tp_version = val;
3113 return 0;
3114 default:
3115 return -EINVAL;
3116 }
3117 }
8913336a
PM
3118 case PACKET_RESERVE:
3119 {
3120 unsigned int val;
3121
3122 if (optlen != sizeof(val))
3123 return -EINVAL;
69e3c75f 3124 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3125 return -EBUSY;
3126 if (copy_from_user(&val, optval, sizeof(val)))
3127 return -EFAULT;
3128 po->tp_reserve = val;
3129 return 0;
3130 }
69e3c75f
JB
3131 case PACKET_LOSS:
3132 {
3133 unsigned int val;
3134
3135 if (optlen != sizeof(val))
3136 return -EINVAL;
3137 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3138 return -EBUSY;
3139 if (copy_from_user(&val, optval, sizeof(val)))
3140 return -EFAULT;
3141 po->tp_loss = !!val;
3142 return 0;
3143 }
8dc41944
HX
3144 case PACKET_AUXDATA:
3145 {
3146 int val;
3147
3148 if (optlen < sizeof(val))
3149 return -EINVAL;
3150 if (copy_from_user(&val, optval, sizeof(val)))
3151 return -EFAULT;
3152
3153 po->auxdata = !!val;
3154 return 0;
3155 }
80feaacb
PWJ
3156 case PACKET_ORIGDEV:
3157 {
3158 int val;
3159
3160 if (optlen < sizeof(val))
3161 return -EINVAL;
3162 if (copy_from_user(&val, optval, sizeof(val)))
3163 return -EFAULT;
3164
3165 po->origdev = !!val;
3166 return 0;
3167 }
bfd5f4a3
SS
3168 case PACKET_VNET_HDR:
3169 {
3170 int val;
3171
3172 if (sock->type != SOCK_RAW)
3173 return -EINVAL;
3174 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3175 return -EBUSY;
3176 if (optlen < sizeof(val))
3177 return -EINVAL;
3178 if (copy_from_user(&val, optval, sizeof(val)))
3179 return -EFAULT;
3180
3181 po->has_vnet_hdr = !!val;
3182 return 0;
3183 }
614f60fa
SM
3184 case PACKET_TIMESTAMP:
3185 {
3186 int val;
3187
3188 if (optlen != sizeof(val))
3189 return -EINVAL;
3190 if (copy_from_user(&val, optval, sizeof(val)))
3191 return -EFAULT;
3192
3193 po->tp_tstamp = val;
3194 return 0;
3195 }
dc99f600
DM
3196 case PACKET_FANOUT:
3197 {
3198 int val;
3199
3200 if (optlen != sizeof(val))
3201 return -EINVAL;
3202 if (copy_from_user(&val, optval, sizeof(val)))
3203 return -EFAULT;
3204
3205 return fanout_add(sk, val & 0xffff, val >> 16);
3206 }
5920cd3a
PC
3207 case PACKET_TX_HAS_OFF:
3208 {
3209 unsigned int val;
3210
3211 if (optlen != sizeof(val))
3212 return -EINVAL;
3213 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3214 return -EBUSY;
3215 if (copy_from_user(&val, optval, sizeof(val)))
3216 return -EFAULT;
3217 po->tp_tx_has_off = !!val;
3218 return 0;
3219 }
1da177e4
LT
3220 default:
3221 return -ENOPROTOOPT;
3222 }
3223}
3224
3225static int packet_getsockopt(struct socket *sock, int level, int optname,
3226 char __user *optval, int __user *optlen)
3227{
3228 int len;
c06fff6e 3229 int val, lv = sizeof(val);
1da177e4
LT
3230 struct sock *sk = sock->sk;
3231 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3232 void *data = &val;
8dc41944 3233 struct tpacket_stats st;
f6fb8f10 3234 union tpacket_stats_u st_u;
1da177e4
LT
3235
3236 if (level != SOL_PACKET)
3237 return -ENOPROTOOPT;
3238
8ae55f04
KK
3239 if (get_user(len, optlen))
3240 return -EFAULT;
1da177e4
LT
3241
3242 if (len < 0)
3243 return -EINVAL;
1ce4f28b 3244
69e3c75f 3245 switch (optname) {
1da177e4 3246 case PACKET_STATISTICS:
1da177e4 3247 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3248 if (po->tp_version == TPACKET_V3) {
c06fff6e 3249 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3250 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3251 sizeof(struct tpacket_stats));
f6fb8f10 3252 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3253 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3254 st_u.stats3.tp_packets += po->stats.tp_drops;
3255 data = &st_u.stats3;
3256 } else {
c06fff6e 3257 lv = sizeof(struct tpacket_stats);
f6fb8f10 3258 st = po->stats;
3259 st.tp_packets += st.tp_drops;
3260 data = &st;
3261 }
1da177e4
LT
3262 memset(&po->stats, 0, sizeof(st));
3263 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3264 break;
3265 case PACKET_AUXDATA:
8dc41944 3266 val = po->auxdata;
80feaacb
PWJ
3267 break;
3268 case PACKET_ORIGDEV:
80feaacb 3269 val = po->origdev;
bfd5f4a3
SS
3270 break;
3271 case PACKET_VNET_HDR:
bfd5f4a3 3272 val = po->has_vnet_hdr;
1da177e4 3273 break;
bbd6ef87 3274 case PACKET_VERSION:
bbd6ef87 3275 val = po->tp_version;
bbd6ef87
PM
3276 break;
3277 case PACKET_HDRLEN:
3278 if (len > sizeof(int))
3279 len = sizeof(int);
3280 if (copy_from_user(&val, optval, len))
3281 return -EFAULT;
3282 switch (val) {
3283 case TPACKET_V1:
3284 val = sizeof(struct tpacket_hdr);
3285 break;
3286 case TPACKET_V2:
3287 val = sizeof(struct tpacket2_hdr);
3288 break;
f6fb8f10 3289 case TPACKET_V3:
3290 val = sizeof(struct tpacket3_hdr);
3291 break;
bbd6ef87
PM
3292 default:
3293 return -EINVAL;
3294 }
bbd6ef87 3295 break;
8913336a 3296 case PACKET_RESERVE:
8913336a 3297 val = po->tp_reserve;
8913336a 3298 break;
69e3c75f 3299 case PACKET_LOSS:
69e3c75f 3300 val = po->tp_loss;
69e3c75f 3301 break;
614f60fa 3302 case PACKET_TIMESTAMP:
614f60fa 3303 val = po->tp_tstamp;
614f60fa 3304 break;
dc99f600 3305 case PACKET_FANOUT:
dc99f600
DM
3306 val = (po->fanout ?
3307 ((u32)po->fanout->id |
77f65ebd
WB
3308 ((u32)po->fanout->type << 16) |
3309 ((u32)po->fanout->flags << 24)) :
dc99f600 3310 0);
dc99f600 3311 break;
5920cd3a
PC
3312 case PACKET_TX_HAS_OFF:
3313 val = po->tp_tx_has_off;
3314 break;
1da177e4
LT
3315 default:
3316 return -ENOPROTOOPT;
3317 }
3318
c06fff6e
ED
3319 if (len > lv)
3320 len = lv;
8ae55f04
KK
3321 if (put_user(len, optlen))
3322 return -EFAULT;
8dc41944
HX
3323 if (copy_to_user(optval, data, len))
3324 return -EFAULT;
8ae55f04 3325 return 0;
1da177e4
LT
3326}
3327
3328
3329static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3330{
3331 struct sock *sk;
ad930650 3332 struct net_device *dev = data;
c346dca1 3333 struct net *net = dev_net(dev);
1da177e4 3334
808f5114 3335 rcu_read_lock();
b67bfe0d 3336 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3337 struct packet_sock *po = pkt_sk(sk);
3338
3339 switch (msg) {
3340 case NETDEV_UNREGISTER:
1da177e4
LT
3341 if (po->mclist)
3342 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3343 /* fallthrough */
3344
1da177e4
LT
3345 case NETDEV_DOWN:
3346 if (dev->ifindex == po->ifindex) {
3347 spin_lock(&po->bind_lock);
3348 if (po->running) {
ce06b03e 3349 __unregister_prot_hook(sk, false);
1da177e4
LT
3350 sk->sk_err = ENETDOWN;
3351 if (!sock_flag(sk, SOCK_DEAD))
3352 sk->sk_error_report(sk);
3353 }
3354 if (msg == NETDEV_UNREGISTER) {
3355 po->ifindex = -1;
160ff18a
BG
3356 if (po->prot_hook.dev)
3357 dev_put(po->prot_hook.dev);
1da177e4
LT
3358 po->prot_hook.dev = NULL;
3359 }
3360 spin_unlock(&po->bind_lock);
3361 }
3362 break;
3363 case NETDEV_UP:
808f5114 3364 if (dev->ifindex == po->ifindex) {
3365 spin_lock(&po->bind_lock);
ce06b03e
DM
3366 if (po->num)
3367 register_prot_hook(sk);
808f5114 3368 spin_unlock(&po->bind_lock);
1da177e4 3369 }
1da177e4
LT
3370 break;
3371 }
3372 }
808f5114 3373 rcu_read_unlock();
1da177e4
LT
3374 return NOTIFY_DONE;
3375}
3376
3377
3378static int packet_ioctl(struct socket *sock, unsigned int cmd,
3379 unsigned long arg)
3380{
3381 struct sock *sk = sock->sk;
3382
69e3c75f 3383 switch (cmd) {
40d4e3df
ED
3384 case SIOCOUTQ:
3385 {
3386 int amount = sk_wmem_alloc_get(sk);
31e6d363 3387
40d4e3df
ED
3388 return put_user(amount, (int __user *)arg);
3389 }
3390 case SIOCINQ:
3391 {
3392 struct sk_buff *skb;
3393 int amount = 0;
3394
3395 spin_lock_bh(&sk->sk_receive_queue.lock);
3396 skb = skb_peek(&sk->sk_receive_queue);
3397 if (skb)
3398 amount = skb->len;
3399 spin_unlock_bh(&sk->sk_receive_queue.lock);
3400 return put_user(amount, (int __user *)arg);
3401 }
3402 case SIOCGSTAMP:
3403 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3404 case SIOCGSTAMPNS:
3405 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3406
1da177e4 3407#ifdef CONFIG_INET
40d4e3df
ED
3408 case SIOCADDRT:
3409 case SIOCDELRT:
3410 case SIOCDARP:
3411 case SIOCGARP:
3412 case SIOCSARP:
3413 case SIOCGIFADDR:
3414 case SIOCSIFADDR:
3415 case SIOCGIFBRDADDR:
3416 case SIOCSIFBRDADDR:
3417 case SIOCGIFNETMASK:
3418 case SIOCSIFNETMASK:
3419 case SIOCGIFDSTADDR:
3420 case SIOCSIFDSTADDR:
3421 case SIOCSIFFLAGS:
40d4e3df 3422 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3423#endif
3424
40d4e3df
ED
3425 default:
3426 return -ENOIOCTLCMD;
1da177e4
LT
3427 }
3428 return 0;
3429}
3430
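/* poll: on top of the usual datagram semantics, report POLLIN when the
 * RX ring holds a frame for user space and POLLOUT when a TX ring frame
 * is available to be filled.
 */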
40d4e3df 3431static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3432 poll_table *wait)
3433{
3434 struct sock *sk = sock->sk;
3435 struct packet_sock *po = pkt_sk(sk);
3436 unsigned int mask = datagram_poll(file, sock, wait);
3437
3438 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3439 if (po->rx_ring.pg_vec) {
f6fb8f10 3440 if (!packet_previous_rx_frame(po, &po->rx_ring,
3441 TP_STATUS_KERNEL))
1da177e4
LT
3442 mask |= POLLIN | POLLRDNORM;
3443 }
3444 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3445 spin_lock_bh(&sk->sk_write_queue.lock);
3446 if (po->tx_ring.pg_vec) {
3447 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3448 mask |= POLLOUT | POLLWRNORM;
3449 }
3450 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3451 return mask;
3452}
3453
3454
3455/* Dirty? Well, I still have not found a better way to account
3456 * for user mmaps.
3457 */
3458
3459static void packet_mm_open(struct vm_area_struct *vma)
3460{
3461 struct file *file = vma->vm_file;
40d4e3df 3462 struct socket *sock = file->private_data;
1da177e4 3463 struct sock *sk = sock->sk;
1ce4f28b 3464
1da177e4
LT
3465 if (sk)
3466 atomic_inc(&pkt_sk(sk)->mapped);
3467}
3468
3469static void packet_mm_close(struct vm_area_struct *vma)
3470{
3471 struct file *file = vma->vm_file;
40d4e3df 3472 struct socket *sock = file->private_data;
1da177e4 3473 struct sock *sk = sock->sk;
1ce4f28b 3474
1da177e4
LT
3475 if (sk)
3476 atomic_dec(&pkt_sk(sk)->mapped);
3477}
3478
f0f37e2f 3479static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3480 .open = packet_mm_open,
3481 .close = packet_mm_close,
1da177e4
LT
3482};
3483
0e3125c7
NH
3484static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3485 unsigned int len)
1da177e4
LT
3486{
3487 int i;
3488
4ebf0ae2 3489 for (i = 0; i < len; i++) {
0e3125c7 3490 if (likely(pg_vec[i].buffer)) {
c56b4d90 3491 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3492 vfree(pg_vec[i].buffer);
3493 else
3494 free_pages((unsigned long)pg_vec[i].buffer,
3495 order);
3496 pg_vec[i].buffer = NULL;
3497 }
1da177e4
LT
3498 }
3499 kfree(pg_vec);
3500}
3501
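/* Allocate the backing store for one ring block: try physically
 * contiguous pages first, fall back to vzalloc(), and as a last resort
 * retry the page allocator with __GFP_NORETRY cleared.
 */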
eea49cc9 3502static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3503{
0e3125c7
NH
3504 char *buffer = NULL;
3505 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3506 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3507
3508 buffer = (char *) __get_free_pages(gfp_flags, order);
3509
3510 if (buffer)
3511 return buffer;
3512
3513 /*
3514 * __get_free_pages failed, fall back to vmalloc
3515 */
bbce5a59 3516 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3517
0e3125c7
NH
3518 if (buffer)
3519 return buffer;
3520
3521 /*
3522 * vmalloc failed, let's dig into swap here
3523 */
0e3125c7
NH
3524 gfp_flags &= ~__GFP_NORETRY;
3525 buffer = (char *)__get_free_pages(gfp_flags, order);
3526 if (buffer)
3527 return buffer;
3528
3529 /*
3530 * complete and utter failure
3531 */
3532 return NULL;
4ebf0ae2
DM
3533}
3534
0e3125c7 3535static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3536{
3537 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3538 struct pgv *pg_vec;
4ebf0ae2
DM
3539 int i;
3540
0e3125c7 3541 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3542 if (unlikely(!pg_vec))
3543 goto out;
3544
3545 for (i = 0; i < block_nr; i++) {
c56b4d90 3546 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3547 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3548 goto out_free_pgvec;
3549 }
3550
3551out:
3552 return pg_vec;
3553
3554out_free_pgvec:
3555 free_pg_vec(pg_vec, order, block_nr);
3556 pg_vec = NULL;
3557 goto out;
3558}
1da177e4 3559
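/* Create or tear down an RX/TX ring (req->tp_block_nr != 0 creates,
 * == 0 destroys): validate the block/frame geometry, allocate the page
 * vector, then swap it in under pg_vec_lock with the protocol hook
 * temporarily unregistered so no packets are delivered while the ring
 * changes.
 */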
f6fb8f10 3560static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3561 int closing, int tx_ring)
1da177e4 3562{
0e3125c7 3563 struct pgv *pg_vec = NULL;
1da177e4 3564 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3565 int was_running, order = 0;
69e3c75f
JB
3566 struct packet_ring_buffer *rb;
3567 struct sk_buff_head *rb_queue;
0e11c91e 3568 __be16 num;
f6fb8f10 3569 int err = -EINVAL;
3570 /* Alias added to keep the code churn minimal */
3571 struct tpacket_req *req = &req_u->req;
3572
3573 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3574 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3575 WARN(1, "Tx-ring is not supported.\n");
3576 goto out;
3577 }
1ce4f28b 3578
69e3c75f
JB
3579 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3580 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3581
69e3c75f
JB
3582 err = -EBUSY;
3583 if (!closing) {
3584 if (atomic_read(&po->mapped))
3585 goto out;
3586 if (atomic_read(&rb->pending))
3587 goto out;
3588 }
1da177e4 3589
69e3c75f
JB
3590 if (req->tp_block_nr) {
3591 /* Sanity tests and some calculations */
3592 err = -EBUSY;
3593 if (unlikely(rb->pg_vec))
3594 goto out;
1da177e4 3595
bbd6ef87
PM
3596 switch (po->tp_version) {
3597 case TPACKET_V1:
3598 po->tp_hdrlen = TPACKET_HDRLEN;
3599 break;
3600 case TPACKET_V2:
3601 po->tp_hdrlen = TPACKET2_HDRLEN;
3602 break;
f6fb8f10 3603 case TPACKET_V3:
3604 po->tp_hdrlen = TPACKET3_HDRLEN;
3605 break;
bbd6ef87
PM
3606 }
3607
69e3c75f 3608 err = -EINVAL;
4ebf0ae2 3609 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3610 goto out;
4ebf0ae2 3611 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3612 goto out;
8913336a 3613 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3614 po->tp_reserve))
3615 goto out;
4ebf0ae2 3616 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3617 goto out;
1da177e4 3618
69e3c75f
JB
3619 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3620 if (unlikely(rb->frames_per_block <= 0))
3621 goto out;
3622 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3623 req->tp_frame_nr))
3624 goto out;
1da177e4
LT
3625
3626 err = -ENOMEM;
4ebf0ae2
DM
3627 order = get_order(req->tp_block_size);
3628 pg_vec = alloc_pg_vec(req, order);
3629 if (unlikely(!pg_vec))
1da177e4 3630 goto out;
f6fb8f10 3631 switch (po->tp_version) {
3632 case TPACKET_V3:
3633 /* Transmit path is not supported. We checked
3634 * it above, but just being paranoid.
3635 */
3636 if (!tx_ring)
3637 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3638 break;
3639 default:
3640 break;
3641 }
69e3c75f
JB
3642 }
3643 /* Done */
3644 else {
3645 err = -EINVAL;
4ebf0ae2 3646 if (unlikely(req->tp_frame_nr))
69e3c75f 3647 goto out;
1da177e4
LT
3648 }
3649
3650 lock_sock(sk);
3651
3652 /* Detach socket from network */
3653 spin_lock(&po->bind_lock);
3654 was_running = po->running;
3655 num = po->num;
3656 if (was_running) {
1da177e4 3657 po->num = 0;
ce06b03e 3658 __unregister_prot_hook(sk, false);
1da177e4
LT
3659 }
3660 spin_unlock(&po->bind_lock);
1ce4f28b 3661
1da177e4
LT
3662 synchronize_net();
3663
3664 err = -EBUSY;
905db440 3665 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3666 if (closing || atomic_read(&po->mapped) == 0) {
3667 err = 0;
69e3c75f 3668 spin_lock_bh(&rb_queue->lock);
c053fd96 3669 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3670 rb->frame_max = (req->tp_frame_nr - 1);
3671 rb->head = 0;
3672 rb->frame_size = req->tp_frame_size;
3673 spin_unlock_bh(&rb_queue->lock);
3674
c053fd96
CG
3675 swap(rb->pg_vec_order, order);
3676 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3677
3678 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3679 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3680 tpacket_rcv : packet_rcv;
3681 skb_queue_purge(rb_queue);
1da177e4 3682 if (atomic_read(&po->mapped))
40d4e3df
ED
3683 pr_err("packet_mmap: vma is busy: %d\n",
3684 atomic_read(&po->mapped));
1da177e4 3685 }
905db440 3686 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3687
3688 spin_lock(&po->bind_lock);
ce06b03e 3689 if (was_running) {
1da177e4 3690 po->num = num;
ce06b03e 3691 register_prot_hook(sk);
1da177e4
LT
3692 }
3693 spin_unlock(&po->bind_lock);
f6fb8f10 3694 if (closing && (po->tp_version > TPACKET_V2)) {
3695 /* Because we don't support block-based V3 on tx-ring */
3696 if (!tx_ring)
3697 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3698 }
1da177e4
LT
3699 release_sock(sk);
3700
1da177e4
LT
3701 if (pg_vec)
3702 free_pg_vec(pg_vec, order, req->tp_block_nr);
3703out:
3704 return err;
3705}
3706
69e3c75f
JB
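/* Map the ring buffers into user space: the RX ring followed by the TX
 * ring are inserted page by page into a single VMA, whose size must
 * match the combined size of both rings exactly.
 */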
3707static int packet_mmap(struct file *file, struct socket *sock,
3708 struct vm_area_struct *vma)
1da177e4
LT
3709{
3710 struct sock *sk = sock->sk;
3711 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3712 unsigned long size, expected_size;
3713 struct packet_ring_buffer *rb;
1da177e4
LT
3714 unsigned long start;
3715 int err = -EINVAL;
3716 int i;
3717
3718 if (vma->vm_pgoff)
3719 return -EINVAL;
3720
905db440 3721 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3722
3723 expected_size = 0;
3724 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3725 if (rb->pg_vec) {
3726 expected_size += rb->pg_vec_len
3727 * rb->pg_vec_pages
3728 * PAGE_SIZE;
3729 }
3730 }
3731
3732 if (expected_size == 0)
1da177e4 3733 goto out;
69e3c75f
JB
3734
3735 size = vma->vm_end - vma->vm_start;
3736 if (size != expected_size)
1da177e4
LT
3737 goto out;
3738
1da177e4 3739 start = vma->vm_start;
69e3c75f
JB
3740 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3741 if (rb->pg_vec == NULL)
3742 continue;
3743
3744 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3745 struct page *page;
3746 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3747 int pg_num;
3748
c56b4d90
CG
3749 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3750 page = pgv_to_page(kaddr);
69e3c75f
JB
3751 err = vm_insert_page(vma, start, page);
3752 if (unlikely(err))
3753 goto out;
3754 start += PAGE_SIZE;
0e3125c7 3755 kaddr += PAGE_SIZE;
69e3c75f 3756 }
4ebf0ae2 3757 }
1da177e4 3758 }
69e3c75f 3759
4ebf0ae2 3760 atomic_inc(&po->mapped);
1da177e4
LT
3761 vma->vm_ops = &packet_mmap_ops;
3762 err = 0;
3763
3764out:
905db440 3765 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3766 return err;
3767}
1da177e4 3768
90ddc4f0 3769static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3770 .family = PF_PACKET,
3771 .owner = THIS_MODULE,
3772 .release = packet_release,
3773 .bind = packet_bind_spkt,
3774 .connect = sock_no_connect,
3775 .socketpair = sock_no_socketpair,
3776 .accept = sock_no_accept,
3777 .getname = packet_getname_spkt,
3778 .poll = datagram_poll,
3779 .ioctl = packet_ioctl,
3780 .listen = sock_no_listen,
3781 .shutdown = sock_no_shutdown,
3782 .setsockopt = sock_no_setsockopt,
3783 .getsockopt = sock_no_getsockopt,
3784 .sendmsg = packet_sendmsg_spkt,
3785 .recvmsg = packet_recvmsg,
3786 .mmap = sock_no_mmap,
3787 .sendpage = sock_no_sendpage,
3788};
1da177e4 3789
90ddc4f0 3790static const struct proto_ops packet_ops = {
1da177e4
LT
3791 .family = PF_PACKET,
3792 .owner = THIS_MODULE,
3793 .release = packet_release,
3794 .bind = packet_bind,
3795 .connect = sock_no_connect,
3796 .socketpair = sock_no_socketpair,
3797 .accept = sock_no_accept,
1ce4f28b 3798 .getname = packet_getname,
1da177e4
LT
3799 .poll = packet_poll,
3800 .ioctl = packet_ioctl,
3801 .listen = sock_no_listen,
3802 .shutdown = sock_no_shutdown,
3803 .setsockopt = packet_setsockopt,
3804 .getsockopt = packet_getsockopt,
3805 .sendmsg = packet_sendmsg,
3806 .recvmsg = packet_recvmsg,
3807 .mmap = packet_mmap,
3808 .sendpage = sock_no_sendpage,
3809};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}
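
/*
 * Example of a resulting /proc/net/packet entry (illustrative values):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	ffff8800b9f3c000 3      3    0003  2     1 0      0      16063
 *
 * "Proto" is the bound protocol in hex (0003 == ETH_P_ALL), "R" is the
 * running flag, and "User" is the owning uid translated into the
 * reader's user namespace.  The socket address printed via %pK may be
 * censored depending on the kptr_restrict sysctl.
 */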

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
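
/*
 * packet_net_ops above gives every network namespace its own packet
 * socket list and its own /proc/net/packet listing: packet_net_init()
 * runs when a namespace is created, packet_net_exit() when it is torn
 * down.
 */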

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}
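
/*
 * packet_exit() unwinds the registrations made in packet_init() in
 * reverse order; of the four registration steps only proto_register()
 * has its return value checked here.  The MODULE_ALIAS_NETPROTO()
 * alias below lets the socket core auto-load this code on the first
 * PF_PACKET socket() call when it is built as a module.
 */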

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);