packet: Report rings cfg via diag engine
[deliverable/linux.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
100 - if device has no dev->hard_header routine, it adds and removes ll header
101 inside itself. In this case ll header is invisible outside of device,
102 but higher levels still should reserve dev->hard_header_len.
103 Some devices are enough clever to reallocate skb, when header
104 will not fit to reserved space (tunnel), another ones are silly
105 (PPP).
106 - packet socket receives packets with pulled ll header,
107 so that SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
121 mac_header -> UNKNOWN position. It is very likely, that it points to ll
122 header. PPP makes it, that is wrong, because introduce
db0c58f9 123 assymetry between rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
130Resume
131 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
145 We should set nh.raw on output to correct posistion,
146 packet classifier depends on it.
147 */
148
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
a2efcfa0 160
f6fb8f10 161static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
162 int closing, int tx_ring);
163
f6fb8f10 164
165#define V3_ALIGNMENT (8)
166
bc59ba39 167#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 168
169#define BLK_PLUS_PRIV(sz_of_priv) \
170 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
171
f6fb8f10 172#define PGV_FROM_VMALLOC 1
69e3c75f 173
f6fb8f10 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181
69e3c75f
JB
182struct packet_sock;
183static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4 184
f6fb8f10 185static void *packet_previous_frame(struct packet_sock *po,
186 struct packet_ring_buffer *rb,
187 int status);
188static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 189static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
190 struct tpacket_block_desc *);
191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 192 struct packet_sock *);
bc59ba39 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 194 struct packet_sock *, unsigned int status);
bc59ba39 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
f6fb8f10 198static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
200static void prb_init_blk_timer(struct packet_sock *,
201 struct tpacket_kbdq_core *,
202 void (*func) (unsigned long));
203static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
204static void prb_clear_rxhash(struct tpacket_kbdq_core *,
205 struct tpacket3_hdr *);
206static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
207 struct tpacket3_hdr *);
1da177e4
LT
208static void packet_flush_mclist(struct sock *sk);
209
dc99f600
DM
210#define PACKET_FANOUT_MAX 256
211
212struct packet_fanout {
213#ifdef CONFIG_NET_NS
214 struct net *net;
215#endif
216 unsigned int num_members;
217 u16 id;
218 u8 type;
7736d33f 219 u8 defrag;
dc99f600
DM
220 atomic_t rr_cur;
221 struct list_head list;
222 struct sock *arr[PACKET_FANOUT_MAX];
223 spinlock_t lock;
224 atomic_t sk_ref;
225 struct packet_type prot_hook ____cacheline_aligned_in_smp;
226};
227
ffbc6111
HX
228struct packet_skb_cb {
229 unsigned int origlen;
230 union {
231 struct sockaddr_pkt pkt;
232 struct sockaddr_ll ll;
233 } sa;
234};
235
236#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 237
bc59ba39 238#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 239#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 240 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 241#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 242 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 243#define GET_NEXT_PRB_BLK_NUM(x) \
244 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
245 ((x)->kactive_blk_num+1) : 0)
246
dc99f600
DM
247static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
248static void __fanout_link(struct sock *sk, struct packet_sock *po);
249
ce06b03e
DM
250/* register_prot_hook must be invoked with the po->bind_lock held,
251 * or from a context in which asynchronous accesses to the packet
252 * socket is not possible (packet_create()).
253 */
254static void register_prot_hook(struct sock *sk)
255{
256 struct packet_sock *po = pkt_sk(sk);
257 if (!po->running) {
dc99f600
DM
258 if (po->fanout)
259 __fanout_link(sk, po);
260 else
261 dev_add_pack(&po->prot_hook);
ce06b03e
DM
262 sock_hold(sk);
263 po->running = 1;
264 }
265}
266
267/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
268 * held. If the sync parameter is true, we will temporarily drop
269 * the po->bind_lock and do a synchronize_net to make sure no
270 * asynchronous packet processing paths still refer to the elements
271 * of po->prot_hook. If the sync parameter is false, it is the
272 * callers responsibility to take care of this.
273 */
274static void __unregister_prot_hook(struct sock *sk, bool sync)
275{
276 struct packet_sock *po = pkt_sk(sk);
277
278 po->running = 0;
dc99f600
DM
279 if (po->fanout)
280 __fanout_unlink(sk, po);
281 else
282 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
283 __sock_put(sk);
284
285 if (sync) {
286 spin_unlock(&po->bind_lock);
287 synchronize_net();
288 spin_lock(&po->bind_lock);
289 }
290}
291
292static void unregister_prot_hook(struct sock *sk, bool sync)
293{
294 struct packet_sock *po = pkt_sk(sk);
295
296 if (po->running)
297 __unregister_prot_hook(sk, sync);
298}
299
f6dafa95 300static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
301{
302 if (is_vmalloc_addr(addr))
303 return vmalloc_to_page(addr);
304 return virt_to_page(addr);
305}
306
69e3c75f 307static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 308{
bbd6ef87
PM
309 union {
310 struct tpacket_hdr *h1;
311 struct tpacket2_hdr *h2;
312 void *raw;
313 } h;
1da177e4 314
69e3c75f 315 h.raw = frame;
bbd6ef87
PM
316 switch (po->tp_version) {
317 case TPACKET_V1:
69e3c75f 318 h.h1->tp_status = status;
0af55bb5 319 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
320 break;
321 case TPACKET_V2:
69e3c75f 322 h.h2->tp_status = status;
0af55bb5 323 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 324 break;
f6fb8f10 325 case TPACKET_V3:
69e3c75f 326 default:
f6fb8f10 327 WARN(1, "TPACKET version not supported.\n");
69e3c75f 328 BUG();
bbd6ef87 329 }
69e3c75f
JB
330
331 smp_wmb();
bbd6ef87
PM
332}
333
69e3c75f 334static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87
PM
335{
336 union {
337 struct tpacket_hdr *h1;
338 struct tpacket2_hdr *h2;
339 void *raw;
340 } h;
341
69e3c75f
JB
342 smp_rmb();
343
bbd6ef87
PM
344 h.raw = frame;
345 switch (po->tp_version) {
346 case TPACKET_V1:
0af55bb5 347 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 348 return h.h1->tp_status;
bbd6ef87 349 case TPACKET_V2:
0af55bb5 350 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 351 return h.h2->tp_status;
f6fb8f10 352 case TPACKET_V3:
69e3c75f 353 default:
f6fb8f10 354 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
355 BUG();
356 return 0;
bbd6ef87 357 }
1da177e4 358}
69e3c75f
JB
359
360static void *packet_lookup_frame(struct packet_sock *po,
361 struct packet_ring_buffer *rb,
362 unsigned int position,
363 int status)
364{
365 unsigned int pg_vec_pos, frame_offset;
366 union {
367 struct tpacket_hdr *h1;
368 struct tpacket2_hdr *h2;
369 void *raw;
370 } h;
371
372 pg_vec_pos = position / rb->frames_per_block;
373 frame_offset = position % rb->frames_per_block;
374
0e3125c7
NH
375 h.raw = rb->pg_vec[pg_vec_pos].buffer +
376 (frame_offset * rb->frame_size);
69e3c75f
JB
377
378 if (status != __packet_get_status(po, h.raw))
379 return NULL;
380
381 return h.raw;
382}
383
eea49cc9 384static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
385 struct packet_ring_buffer *rb,
386 int status)
387{
388 return packet_lookup_frame(po, rb, rb->head, status);
389}
390
bc59ba39 391static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 392{
393 del_timer_sync(&pkc->retire_blk_timer);
394}
395
396static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
397 int tx_ring,
398 struct sk_buff_head *rb_queue)
399{
bc59ba39 400 struct tpacket_kbdq_core *pkc;
f6fb8f10 401
402 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
403
404 spin_lock(&rb_queue->lock);
405 pkc->delete_blk_timer = 1;
406 spin_unlock(&rb_queue->lock);
407
408 prb_del_retire_blk_timer(pkc);
409}
410
411static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 412 struct tpacket_kbdq_core *pkc,
f6fb8f10 413 void (*func) (unsigned long))
414{
415 init_timer(&pkc->retire_blk_timer);
416 pkc->retire_blk_timer.data = (long)po;
417 pkc->retire_blk_timer.function = func;
418 pkc->retire_blk_timer.expires = jiffies;
419}
420
421static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
422{
bc59ba39 423 struct tpacket_kbdq_core *pkc;
f6fb8f10 424
425 if (tx_ring)
426 BUG();
427
428 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
429 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
430}
431
432static int prb_calc_retire_blk_tmo(struct packet_sock *po,
433 int blk_size_in_bytes)
434{
435 struct net_device *dev;
436 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
437 struct ethtool_cmd ecmd;
438 int err;
e440cf2c 439 u32 speed;
f6fb8f10 440
4bc71cb9
JP
441 rtnl_lock();
442 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
443 if (unlikely(!dev)) {
444 rtnl_unlock();
f6fb8f10 445 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
446 }
447 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 448 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
449 rtnl_unlock();
450 if (!err) {
4bc71cb9
JP
451 /*
452 * If the link speed is so slow you don't really
453 * need to worry about perf anyways
454 */
e440cf2c 455 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 456 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 457 } else {
458 msec = 1;
459 div = speed / 1000;
f6fb8f10 460 }
461 }
462
463 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
464
465 if (div)
466 mbits /= div;
467
468 tmo = mbits * msec;
469
470 if (div)
471 return tmo+1;
472 return tmo;
473}
474
bc59ba39 475static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 476 union tpacket_req_u *req_u)
477{
478 p1->feature_req_word = req_u->req3.tp_feature_req_word;
479}
480
481static void init_prb_bdqc(struct packet_sock *po,
482 struct packet_ring_buffer *rb,
483 struct pgv *pg_vec,
484 union tpacket_req_u *req_u, int tx_ring)
485{
bc59ba39 486 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
487 struct tpacket_block_desc *pbd;
f6fb8f10 488
489 memset(p1, 0x0, sizeof(*p1));
490
491 p1->knxt_seq_num = 1;
492 p1->pkbdq = pg_vec;
bc59ba39 493 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 494 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 495 p1->kblk_size = req_u->req3.tp_block_size;
496 p1->knum_blocks = req_u->req3.tp_block_nr;
497 p1->hdrlen = po->tp_hdrlen;
498 p1->version = po->tp_version;
499 p1->last_kactive_blk_num = 0;
500 po->stats_u.stats3.tp_freeze_q_cnt = 0;
501 if (req_u->req3.tp_retire_blk_tov)
502 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
503 else
504 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
505 req_u->req3.tp_block_size);
506 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
507 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
508
509 prb_init_ft_ops(p1, req_u);
510 prb_setup_retire_blk_timer(po, tx_ring);
511 prb_open_block(p1, pbd);
512}
513
514/* Do NOT update the last_blk_num first.
515 * Assumes sk_buff_head lock is held.
516 */
bc59ba39 517static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 518{
519 mod_timer(&pkc->retire_blk_timer,
520 jiffies + pkc->tov_in_jiffies);
521 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
522}
523
524/*
525 * Timer logic:
526 * 1) We refresh the timer only when we open a block.
527 * By doing this we don't waste cycles refreshing the timer
528 * on packet-by-packet basis.
529 *
530 * With a 1MB block-size, on a 1Gbps line, it will take
531 * i) ~8 ms to fill a block + ii) memcpy etc.
532 * In this cut we are not accounting for the memcpy time.
533 *
534 * So, if the user sets the 'tmo' to 10ms then the timer
535 * will never fire while the block is still getting filled
536 * (which is what we want). However, the user could choose
537 * to close a block early and that's fine.
538 *
539 * But when the timer does fire, we check whether or not to refresh it.
540 * Since the tmo granularity is in msecs, it is not too expensive
541 * to refresh the timer, lets say every '8' msecs.
542 * Either the user can set the 'tmo' or we can derive it based on
543 * a) line-speed and b) block-size.
544 * prb_calc_retire_blk_tmo() calculates the tmo.
545 *
546 */
547static void prb_retire_rx_blk_timer_expired(unsigned long data)
548{
549 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 550 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 551 unsigned int frozen;
bc59ba39 552 struct tpacket_block_desc *pbd;
f6fb8f10 553
554 spin_lock(&po->sk.sk_receive_queue.lock);
555
556 frozen = prb_queue_frozen(pkc);
557 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
558
559 if (unlikely(pkc->delete_blk_timer))
560 goto out;
561
562 /* We only need to plug the race when the block is partially filled.
563 * tpacket_rcv:
564 * lock(); increment BLOCK_NUM_PKTS; unlock()
565 * copy_bits() is in progress ...
566 * timer fires on other cpu:
567 * we can't retire the current block because copy_bits
568 * is in progress.
569 *
570 */
571 if (BLOCK_NUM_PKTS(pbd)) {
572 while (atomic_read(&pkc->blk_fill_in_prog)) {
573 /* Waiting for skb_copy_bits to finish... */
574 cpu_relax();
575 }
576 }
577
578 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
579 if (!frozen) {
580 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
581 if (!prb_dispatch_next_block(pkc, po))
582 goto refresh_timer;
583 else
584 goto out;
585 } else {
586 /* Case 1. Queue was frozen because user-space was
587 * lagging behind.
588 */
589 if (prb_curr_blk_in_use(pkc, pbd)) {
590 /*
591 * Ok, user-space is still behind.
592 * So just refresh the timer.
593 */
594 goto refresh_timer;
595 } else {
596 /* Case 2. queue was frozen,user-space caught up,
597 * now the link went idle && the timer fired.
598 * We don't have a block to close.So we open this
599 * block and restart the timer.
600 * opening a block thaws the queue,restarts timer
601 * Thawing/timer-refresh is a side effect.
602 */
603 prb_open_block(pkc, pbd);
604 goto out;
605 }
606 }
607 }
608
609refresh_timer:
610 _prb_refresh_rx_retire_blk_timer(pkc);
611
612out:
613 spin_unlock(&po->sk.sk_receive_queue.lock);
614}
615
eea49cc9 616static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 617 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 618{
619 /* Flush everything minus the block header */
620
621#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
622 u8 *start, *end;
623
624 start = (u8 *)pbd1;
625
626 /* Skip the block header(we know header WILL fit in 4K) */
627 start += PAGE_SIZE;
628
629 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
630 for (; start < end; start += PAGE_SIZE)
631 flush_dcache_page(pgv_to_page(start));
632
633 smp_wmb();
634#endif
635
636 /* Now update the block status. */
637
638 BLOCK_STATUS(pbd1) = status;
639
640 /* Flush the block header */
641
642#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
643 start = (u8 *)pbd1;
644 flush_dcache_page(pgv_to_page(start));
645
646 smp_wmb();
647#endif
648}
649
650/*
651 * Side effect:
652 *
653 * 1) flush the block
654 * 2) Increment active_blk_num
655 *
656 * Note:We DONT refresh the timer on purpose.
657 * Because almost always the next block will be opened.
658 */
bc59ba39 659static void prb_close_block(struct tpacket_kbdq_core *pkc1,
660 struct tpacket_block_desc *pbd1,
f6fb8f10 661 struct packet_sock *po, unsigned int stat)
662{
663 __u32 status = TP_STATUS_USER | stat;
664
665 struct tpacket3_hdr *last_pkt;
bc59ba39 666 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 667
668 if (po->stats.tp_drops)
669 status |= TP_STATUS_LOSING;
670
671 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
672 last_pkt->tp_next_offset = 0;
673
674 /* Get the ts of the last pkt */
675 if (BLOCK_NUM_PKTS(pbd1)) {
676 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
677 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
678 } else {
679 /* Ok, we tmo'd - so get the current time */
680 struct timespec ts;
681 getnstimeofday(&ts);
682 h1->ts_last_pkt.ts_sec = ts.tv_sec;
683 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
684 }
685
686 smp_wmb();
687
688 /* Flush the block */
689 prb_flush_block(pkc1, pbd1, status);
690
691 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
692}
693
eea49cc9 694static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 695{
696 pkc->reset_pending_on_curr_blk = 0;
697}
698
699/*
700 * Side effect of opening a block:
701 *
702 * 1) prb_queue is thawed.
703 * 2) retire_blk_timer is refreshed.
704 *
705 */
bc59ba39 706static void prb_open_block(struct tpacket_kbdq_core *pkc1,
707 struct tpacket_block_desc *pbd1)
f6fb8f10 708{
709 struct timespec ts;
bc59ba39 710 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 711
712 smp_rmb();
713
714 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
715
716 /* We could have just memset this but we will lose the
717 * flexibility of making the priv area sticky
718 */
719 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
720 BLOCK_NUM_PKTS(pbd1) = 0;
721 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
722 getnstimeofday(&ts);
723 h1->ts_first_pkt.ts_sec = ts.tv_sec;
724 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
725 pkc1->pkblk_start = (char *)pbd1;
e3192690 726 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 727 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
728 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
729 pbd1->version = pkc1->version;
730 pkc1->prev = pkc1->nxt_offset;
731 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
732 prb_thaw_queue(pkc1);
733 _prb_refresh_rx_retire_blk_timer(pkc1);
734
735 smp_wmb();
736
737 return;
738 }
739
740 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
741 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
742 dump_stack();
743 BUG();
744}
745
746/*
747 * Queue freeze logic:
748 * 1) Assume tp_block_nr = 8 blocks.
749 * 2) At time 't0', user opens Rx ring.
750 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
751 * 4) user-space is either sleeping or processing block '0'.
752 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
753 * it will close block-7,loop around and try to fill block '0'.
754 * call-flow:
755 * __packet_lookup_frame_in_block
756 * prb_retire_current_block()
757 * prb_dispatch_next_block()
758 * |->(BLOCK_STATUS == USER) evaluates to true
759 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
760 * 6) Now there are two cases:
761 * 6.1) Link goes idle right after the queue is frozen.
762 * But remember, the last open_block() refreshed the timer.
763 * When this timer expires,it will refresh itself so that we can
764 * re-open block-0 in near future.
765 * 6.2) Link is busy and keeps on receiving packets. This is a simple
766 * case and __packet_lookup_frame_in_block will check if block-0
767 * is free and can now be re-used.
768 */
eea49cc9 769static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 770 struct packet_sock *po)
771{
772 pkc->reset_pending_on_curr_blk = 1;
773 po->stats_u.stats3.tp_freeze_q_cnt++;
774}
775
776#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
777
778/*
779 * If the next block is free then we will dispatch it
780 * and return a good offset.
781 * Else, we will freeze the queue.
782 * So, caller must check the return value.
783 */
bc59ba39 784static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 785 struct packet_sock *po)
786{
bc59ba39 787 struct tpacket_block_desc *pbd;
f6fb8f10 788
789 smp_rmb();
790
791 /* 1. Get current block num */
792 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
793
794 /* 2. If this block is currently in_use then freeze the queue */
795 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
796 prb_freeze_queue(pkc, po);
797 return NULL;
798 }
799
800 /*
801 * 3.
802 * open this block and return the offset where the first packet
803 * needs to get stored.
804 */
805 prb_open_block(pkc, pbd);
806 return (void *)pkc->nxt_offset;
807}
808
bc59ba39 809static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 810 struct packet_sock *po, unsigned int status)
811{
bc59ba39 812 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 813
814 /* retire/close the current block */
815 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
816 /*
817 * Plug the case where copy_bits() is in progress on
818 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
819 * have space to copy the pkt in the current block and
820 * called prb_retire_current_block()
821 *
822 * We don't need to worry about the TMO case because
823 * the timer-handler already handled this case.
824 */
825 if (!(status & TP_STATUS_BLK_TMO)) {
826 while (atomic_read(&pkc->blk_fill_in_prog)) {
827 /* Waiting for skb_copy_bits to finish... */
828 cpu_relax();
829 }
830 }
831 prb_close_block(pkc, pbd, po, status);
832 return;
833 }
834
835 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
836 dump_stack();
837 BUG();
838}
839
eea49cc9 840static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 841 struct tpacket_block_desc *pbd)
f6fb8f10 842{
843 return TP_STATUS_USER & BLOCK_STATUS(pbd);
844}
845
eea49cc9 846static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 847{
848 return pkc->reset_pending_on_curr_blk;
849}
850
eea49cc9 851static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 852{
bc59ba39 853 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 854 atomic_dec(&pkc->blk_fill_in_prog);
855}
856
eea49cc9 857static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 858 struct tpacket3_hdr *ppd)
859{
860 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
861}
862
eea49cc9 863static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 864 struct tpacket3_hdr *ppd)
865{
866 ppd->hv1.tp_rxhash = 0;
867}
868
eea49cc9 869static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 870 struct tpacket3_hdr *ppd)
871{
872 if (vlan_tx_tag_present(pkc->skb)) {
873 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
874 ppd->tp_status = TP_STATUS_VLAN_VALID;
875 } else {
876 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
877 }
878}
879
bc59ba39 880static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 881 struct tpacket3_hdr *ppd)
882{
883 prb_fill_vlan_info(pkc, ppd);
884
885 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
886 prb_fill_rxhash(pkc, ppd);
887 else
888 prb_clear_rxhash(pkc, ppd);
889}
890
eea49cc9 891static void prb_fill_curr_block(char *curr,
bc59ba39 892 struct tpacket_kbdq_core *pkc,
893 struct tpacket_block_desc *pbd,
f6fb8f10 894 unsigned int len)
895{
896 struct tpacket3_hdr *ppd;
897
898 ppd = (struct tpacket3_hdr *)curr;
899 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
900 pkc->prev = curr;
901 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
902 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
903 BLOCK_NUM_PKTS(pbd) += 1;
904 atomic_inc(&pkc->blk_fill_in_prog);
905 prb_run_all_ft_ops(pkc, ppd);
906}
907
908/* Assumes caller has the sk->rx_queue.lock */
909static void *__packet_lookup_frame_in_block(struct packet_sock *po,
910 struct sk_buff *skb,
911 int status,
912 unsigned int len
913 )
914{
bc59ba39 915 struct tpacket_kbdq_core *pkc;
916 struct tpacket_block_desc *pbd;
f6fb8f10 917 char *curr, *end;
918
e3192690 919 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 920 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
921
922 /* Queue is frozen when user space is lagging behind */
923 if (prb_queue_frozen(pkc)) {
924 /*
925 * Check if that last block which caused the queue to freeze,
926 * is still in_use by user-space.
927 */
928 if (prb_curr_blk_in_use(pkc, pbd)) {
929 /* Can't record this packet */
930 return NULL;
931 } else {
932 /*
933 * Ok, the block was released by user-space.
934 * Now let's open that block.
935 * opening a block also thaws the queue.
936 * Thawing is a side effect.
937 */
938 prb_open_block(pkc, pbd);
939 }
940 }
941
942 smp_mb();
943 curr = pkc->nxt_offset;
944 pkc->skb = skb;
e3192690 945 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 946
947 /* first try the current block */
948 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
949 prb_fill_curr_block(curr, pkc, pbd, len);
950 return (void *)curr;
951 }
952
953 /* Ok, close the current block */
954 prb_retire_current_block(pkc, po, 0);
955
956 /* Now, try to dispatch the next block */
957 curr = (char *)prb_dispatch_next_block(pkc, po);
958 if (curr) {
959 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
960 prb_fill_curr_block(curr, pkc, pbd, len);
961 return (void *)curr;
962 }
963
964 /*
965 * No free blocks are available.user_space hasn't caught up yet.
966 * Queue was just frozen and now this packet will get dropped.
967 */
968 return NULL;
969}
970
eea49cc9 971static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 972 struct sk_buff *skb,
973 int status, unsigned int len)
974{
975 char *curr = NULL;
976 switch (po->tp_version) {
977 case TPACKET_V1:
978 case TPACKET_V2:
979 curr = packet_lookup_frame(po, &po->rx_ring,
980 po->rx_ring.head, status);
981 return curr;
982 case TPACKET_V3:
983 return __packet_lookup_frame_in_block(po, skb, status, len);
984 default:
985 WARN(1, "TPACKET version not supported\n");
986 BUG();
987 return 0;
988 }
989}
990
eea49cc9 991static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 992 struct packet_ring_buffer *rb,
993 unsigned int previous,
994 int status)
995{
bc59ba39 996 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
997 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
f6fb8f10 998
999 if (status != BLOCK_STATUS(pbd))
1000 return NULL;
1001 return pbd;
1002}
1003
eea49cc9 1004static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1005{
1006 unsigned int prev;
1007 if (rb->prb_bdqc.kactive_blk_num)
1008 prev = rb->prb_bdqc.kactive_blk_num-1;
1009 else
1010 prev = rb->prb_bdqc.knum_blocks-1;
1011 return prev;
1012}
1013
1014/* Assumes caller has held the rx_queue.lock */
eea49cc9 1015static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1016 struct packet_ring_buffer *rb,
1017 int status)
1018{
1019 unsigned int previous = prb_previous_blk_num(rb);
1020 return prb_lookup_block(po, rb, previous, status);
1021}
1022
eea49cc9 1023static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1024 struct packet_ring_buffer *rb,
1025 int status)
1026{
1027 if (po->tp_version <= TPACKET_V2)
1028 return packet_previous_frame(po, rb, status);
1029
1030 return __prb_previous_block(po, rb, status);
1031}
1032
eea49cc9 1033static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1034 struct packet_ring_buffer *rb)
1035{
1036 switch (po->tp_version) {
1037 case TPACKET_V1:
1038 case TPACKET_V2:
1039 return packet_increment_head(rb);
1040 case TPACKET_V3:
1041 default:
1042 WARN(1, "TPACKET version not supported.\n");
1043 BUG();
1044 return;
1045 }
1046}
1047
eea49cc9 1048static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1049 struct packet_ring_buffer *rb,
1050 int status)
1051{
1052 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1053 return packet_lookup_frame(po, rb, previous, status);
1054}
1055
eea49cc9 1056static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1057{
1058 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1059}
1060
1da177e4
LT
1061static void packet_sock_destruct(struct sock *sk)
1062{
ed85b565
RC
1063 skb_queue_purge(&sk->sk_error_queue);
1064
547b792c
IJ
1065 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1066 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1067
1068 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1069 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1070 return;
1071 }
1072
17ab56a2 1073 sk_refcnt_debug_dec(sk);
1da177e4
LT
1074}
1075
dc99f600
DM
1076static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1077{
1078 int x = atomic_read(&f->rr_cur) + 1;
1079
1080 if (x >= num)
1081 x = 0;
1082
1083 return x;
1084}
1085
1086static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1087{
1088 u32 idx, hash = skb->rxhash;
1089
1090 idx = ((u64)hash * num) >> 32;
1091
1092 return f->arr[idx];
1093}
1094
1095static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1096{
1097 int cur, old;
1098
1099 cur = atomic_read(&f->rr_cur);
1100 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1101 fanout_rr_next(f, num))) != cur)
1102 cur = old;
1103 return f->arr[cur];
1104}
1105
95ec3eb4
DM
1106static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1107{
1108 unsigned int cpu = smp_processor_id();
1109
1110 return f->arr[cpu % num];
1111}
1112
95ec3eb4
DM
1113static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1114 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1115{
1116 struct packet_fanout *f = pt->af_packet_priv;
1117 unsigned int num = f->num_members;
1118 struct packet_sock *po;
1119 struct sock *sk;
1120
1121 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1122 !num) {
1123 kfree_skb(skb);
1124 return 0;
1125 }
1126
95ec3eb4
DM
1127 switch (f->type) {
1128 case PACKET_FANOUT_HASH:
1129 default:
1130 if (f->defrag) {
bc416d97 1131 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1132 if (!skb)
1133 return 0;
1134 }
1135 skb_get_rxhash(skb);
1136 sk = fanout_demux_hash(f, skb, num);
1137 break;
1138 case PACKET_FANOUT_LB:
1139 sk = fanout_demux_lb(f, skb, num);
1140 break;
1141 case PACKET_FANOUT_CPU:
1142 sk = fanout_demux_cpu(f, skb, num);
1143 break;
dc99f600
DM
1144 }
1145
dc99f600
DM
1146 po = pkt_sk(sk);
1147
1148 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1149}
1150
1151static DEFINE_MUTEX(fanout_mutex);
1152static LIST_HEAD(fanout_list);
1153
1154static void __fanout_link(struct sock *sk, struct packet_sock *po)
1155{
1156 struct packet_fanout *f = po->fanout;
1157
1158 spin_lock(&f->lock);
1159 f->arr[f->num_members] = sk;
1160 smp_wmb();
1161 f->num_members++;
1162 spin_unlock(&f->lock);
1163}
1164
1165static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1166{
1167 struct packet_fanout *f = po->fanout;
1168 int i;
1169
1170 spin_lock(&f->lock);
1171 for (i = 0; i < f->num_members; i++) {
1172 if (f->arr[i] == sk)
1173 break;
1174 }
1175 BUG_ON(i >= f->num_members);
1176 f->arr[i] = f->arr[f->num_members - 1];
1177 f->num_members--;
1178 spin_unlock(&f->lock);
1179}
1180
7736d33f 1181static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1182{
1183 struct packet_sock *po = pkt_sk(sk);
1184 struct packet_fanout *f, *match;
7736d33f
DM
1185 u8 type = type_flags & 0xff;
1186 u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
dc99f600
DM
1187 int err;
1188
1189 switch (type) {
1190 case PACKET_FANOUT_HASH:
1191 case PACKET_FANOUT_LB:
95ec3eb4 1192 case PACKET_FANOUT_CPU:
dc99f600
DM
1193 break;
1194 default:
1195 return -EINVAL;
1196 }
1197
1198 if (!po->running)
1199 return -EINVAL;
1200
1201 if (po->fanout)
1202 return -EALREADY;
1203
1204 mutex_lock(&fanout_mutex);
1205 match = NULL;
1206 list_for_each_entry(f, &fanout_list, list) {
1207 if (f->id == id &&
1208 read_pnet(&f->net) == sock_net(sk)) {
1209 match = f;
1210 break;
1211 }
1212 }
afe62c68 1213 err = -EINVAL;
7736d33f 1214 if (match && match->defrag != defrag)
afe62c68 1215 goto out;
dc99f600 1216 if (!match) {
afe62c68 1217 err = -ENOMEM;
dc99f600 1218 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1219 if (!match)
1220 goto out;
1221 write_pnet(&match->net, sock_net(sk));
1222 match->id = id;
1223 match->type = type;
1224 match->defrag = defrag;
1225 atomic_set(&match->rr_cur, 0);
1226 INIT_LIST_HEAD(&match->list);
1227 spin_lock_init(&match->lock);
1228 atomic_set(&match->sk_ref, 0);
1229 match->prot_hook.type = po->prot_hook.type;
1230 match->prot_hook.dev = po->prot_hook.dev;
1231 match->prot_hook.func = packet_rcv_fanout;
1232 match->prot_hook.af_packet_priv = match;
1233 dev_add_pack(&match->prot_hook);
1234 list_add(&match->list, &fanout_list);
dc99f600 1235 }
afe62c68
ED
1236 err = -EINVAL;
1237 if (match->type == type &&
1238 match->prot_hook.type == po->prot_hook.type &&
1239 match->prot_hook.dev == po->prot_hook.dev) {
1240 err = -ENOSPC;
1241 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1242 __dev_remove_pack(&po->prot_hook);
1243 po->fanout = match;
1244 atomic_inc(&match->sk_ref);
1245 __fanout_link(sk, po);
1246 err = 0;
dc99f600
DM
1247 }
1248 }
afe62c68 1249out:
dc99f600
DM
1250 mutex_unlock(&fanout_mutex);
1251 return err;
1252}
1253
1254static void fanout_release(struct sock *sk)
1255{
1256 struct packet_sock *po = pkt_sk(sk);
1257 struct packet_fanout *f;
1258
1259 f = po->fanout;
1260 if (!f)
1261 return;
1262
1263 po->fanout = NULL;
1264
1265 mutex_lock(&fanout_mutex);
1266 if (atomic_dec_and_test(&f->sk_ref)) {
1267 list_del(&f->list);
1268 dev_remove_pack(&f->prot_hook);
1269 kfree(f);
1270 }
1271 mutex_unlock(&fanout_mutex);
1272}
1da177e4 1273
90ddc4f0 1274static const struct proto_ops packet_ops;
1da177e4 1275
90ddc4f0 1276static const struct proto_ops packet_ops_spkt;
1da177e4 1277
40d4e3df
ED
1278static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1279 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1280{
1281 struct sock *sk;
1282 struct sockaddr_pkt *spkt;
1283
1284 /*
1285 * When we registered the protocol we saved the socket in the data
1286 * field for just this event.
1287 */
1288
1289 sk = pt->af_packet_priv;
1ce4f28b 1290
1da177e4
LT
1291 /*
1292 * Yank back the headers [hope the device set this
1293 * right or kerboom...]
1294 *
1295 * Incoming packets have ll header pulled,
1296 * push it back.
1297 *
98e399f8 1298 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1299 * so that this procedure is noop.
1300 */
1301
1302 if (skb->pkt_type == PACKET_LOOPBACK)
1303 goto out;
1304
09ad9bc7 1305 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1306 goto out;
1307
40d4e3df
ED
1308 skb = skb_share_check(skb, GFP_ATOMIC);
1309 if (skb == NULL)
1da177e4
LT
1310 goto oom;
1311
1312 /* drop any routing info */
adf30907 1313 skb_dst_drop(skb);
1da177e4 1314
84531c24
PO
1315 /* drop conntrack reference */
1316 nf_reset(skb);
1317
ffbc6111 1318 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1319
98e399f8 1320 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1321
1322 /*
1323 * The SOCK_PACKET socket receives _all_ frames.
1324 */
1325
1326 spkt->spkt_family = dev->type;
1327 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1328 spkt->spkt_protocol = skb->protocol;
1329
1330 /*
1331 * Charge the memory to the socket. This is done specifically
1332 * to prevent sockets using all the memory up.
1333 */
1334
40d4e3df 1335 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1336 return 0;
1337
1338out:
1339 kfree_skb(skb);
1340oom:
1341 return 0;
1342}
1343
1344
1345/*
1346 * Output a raw packet to a device layer. This bypasses all the other
1347 * protocol layers and you must therefore supply it with a complete frame
1348 */
1ce4f28b 1349
1da177e4
LT
1350static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1351 struct msghdr *msg, size_t len)
1352{
1353 struct sock *sk = sock->sk;
40d4e3df 1354 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1355 struct sk_buff *skb = NULL;
1da177e4 1356 struct net_device *dev;
40d4e3df 1357 __be16 proto = 0;
1da177e4 1358 int err;
3bdc0eba 1359 int extra_len = 0;
1ce4f28b 1360
1da177e4 1361 /*
1ce4f28b 1362 * Get and verify the address.
1da177e4
LT
1363 */
1364
40d4e3df 1365 if (saddr) {
1da177e4 1366 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1367 return -EINVAL;
1368 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1369 proto = saddr->spkt_protocol;
1370 } else
1371 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1372
1373 /*
1ce4f28b 1374 * Find the device first to size check it
1da177e4
LT
1375 */
1376
de74e92a 1377 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1378retry:
654d1f8a
ED
1379 rcu_read_lock();
1380 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1381 err = -ENODEV;
1382 if (dev == NULL)
1383 goto out_unlock;
1ce4f28b 1384
d5e76b0a
DM
1385 err = -ENETDOWN;
1386 if (!(dev->flags & IFF_UP))
1387 goto out_unlock;
1388
1da177e4 1389 /*
40d4e3df
ED
1390 * You may not queue a frame bigger than the mtu. This is the lowest level
1391 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1392 */
1ce4f28b 1393
3bdc0eba
BG
1394 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1395 if (!netif_supports_nofcs(dev)) {
1396 err = -EPROTONOSUPPORT;
1397 goto out_unlock;
1398 }
1399 extra_len = 4; /* We're doing our own CRC */
1400 }
1401
1da177e4 1402 err = -EMSGSIZE;
3bdc0eba 1403 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1404 goto out_unlock;
1405
1a35ca80
ED
1406 if (!skb) {
1407 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1408 int tlen = dev->needed_tailroom;
1a35ca80
ED
1409 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1410
1411 rcu_read_unlock();
4ce40912 1412 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1413 if (skb == NULL)
1414 return -ENOBUFS;
1415 /* FIXME: Save some space for broken drivers that write a hard
1416 * header at transmission time by themselves. PPP is the notable
1417 * one here. This should really be fixed at the driver level.
1418 */
1419 skb_reserve(skb, reserved);
1420 skb_reset_network_header(skb);
1421
1422 /* Try to align data part correctly */
1423 if (hhlen) {
1424 skb->data -= hhlen;
1425 skb->tail -= hhlen;
1426 if (len < hhlen)
1427 skb_reset_network_header(skb);
1428 }
1429 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1430 if (err)
1431 goto out_free;
1432 goto retry;
1da177e4
LT
1433 }
1434
3bdc0eba 1435 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1436 /* Earlier code assumed this would be a VLAN pkt,
1437 * double-check this now that we have the actual
1438 * packet in hand.
1439 */
1440 struct ethhdr *ehdr;
1441 skb_reset_mac_header(skb);
1442 ehdr = eth_hdr(skb);
1443 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1444 err = -EMSGSIZE;
1445 goto out_unlock;
1446 }
1447 }
1a35ca80 1448
1da177e4
LT
1449 skb->protocol = proto;
1450 skb->dev = dev;
1451 skb->priority = sk->sk_priority;
2d37a186 1452 skb->mark = sk->sk_mark;
2244d07b 1453 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
1454 if (err < 0)
1455 goto out_unlock;
1da177e4 1456
3bdc0eba
BG
1457 if (unlikely(extra_len == 4))
1458 skb->no_fcs = 1;
1459
1da177e4 1460 dev_queue_xmit(skb);
654d1f8a 1461 rcu_read_unlock();
40d4e3df 1462 return len;
1da177e4 1463
1da177e4 1464out_unlock:
654d1f8a 1465 rcu_read_unlock();
1a35ca80
ED
1466out_free:
1467 kfree_skb(skb);
1da177e4
LT
1468 return err;
1469}
1da177e4 1470
eea49cc9 1471static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1472 const struct sock *sk,
dbcb5855 1473 unsigned int res)
1da177e4
LT
1474{
1475 struct sk_filter *filter;
fda9ef5d 1476
80f8f102
ED
1477 rcu_read_lock();
1478 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1479 if (filter != NULL)
0a14842f 1480 res = SK_RUN_FILTER(filter, skb);
80f8f102 1481 rcu_read_unlock();
1da177e4 1482
dbcb5855 1483 return res;
1da177e4
LT
1484}
1485
1486/*
62ab0812
ED
1487 * This function makes lazy skb cloning in hope that most of packets
1488 * are discarded by BPF.
1489 *
1490 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1491 * and skb->cb are mangled. It works because (and until) packets
1492 * falling here are owned by current CPU. Output packets are cloned
1493 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1494 * sequencially, so that if we return skb to original state on exit,
1495 * we will not harm anyone.
1da177e4
LT
1496 */
1497
40d4e3df
ED
1498static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1499 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1500{
1501 struct sock *sk;
1502 struct sockaddr_ll *sll;
1503 struct packet_sock *po;
40d4e3df 1504 u8 *skb_head = skb->data;
1da177e4 1505 int skb_len = skb->len;
dbcb5855 1506 unsigned int snaplen, res;
1da177e4
LT
1507
1508 if (skb->pkt_type == PACKET_LOOPBACK)
1509 goto drop;
1510
1511 sk = pt->af_packet_priv;
1512 po = pkt_sk(sk);
1513
09ad9bc7 1514 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1515 goto drop;
1516
1da177e4
LT
1517 skb->dev = dev;
1518
3b04ddde 1519 if (dev->header_ops) {
1da177e4 1520 /* The device has an explicit notion of ll header,
62ab0812
ED
1521 * exported to higher levels.
1522 *
1523 * Otherwise, the device hides details of its frame
1524 * structure, so that corresponding packet head is
1525 * never delivered to user.
1da177e4
LT
1526 */
1527 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1528 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1529 else if (skb->pkt_type == PACKET_OUTGOING) {
1530 /* Special case: outgoing packets have ll header at head */
bbe735e4 1531 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1532 }
1533 }
1534
1535 snaplen = skb->len;
1536
dbcb5855
DM
1537 res = run_filter(skb, sk, snaplen);
1538 if (!res)
fda9ef5d 1539 goto drop_n_restore;
dbcb5855
DM
1540 if (snaplen > res)
1541 snaplen = res;
1da177e4 1542
0fd7bac6 1543 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1544 goto drop_n_acct;
1545
1546 if (skb_shared(skb)) {
1547 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1548 if (nskb == NULL)
1549 goto drop_n_acct;
1550
1551 if (skb_head != skb->data) {
1552 skb->data = skb_head;
1553 skb->len = skb_len;
1554 }
abc4e4fa 1555 consume_skb(skb);
1da177e4
LT
1556 skb = nskb;
1557 }
1558
ffbc6111
HX
1559 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1560 sizeof(skb->cb));
1561
1562 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1563 sll->sll_family = AF_PACKET;
1564 sll->sll_hatype = dev->type;
1565 sll->sll_protocol = skb->protocol;
1566 sll->sll_pkttype = skb->pkt_type;
8032b464 1567 if (unlikely(po->origdev))
80feaacb
PWJ
1568 sll->sll_ifindex = orig_dev->ifindex;
1569 else
1570 sll->sll_ifindex = dev->ifindex;
1da177e4 1571
b95cce35 1572 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1573
ffbc6111 1574 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1575
1da177e4
LT
1576 if (pskb_trim(skb, snaplen))
1577 goto drop_n_acct;
1578
1579 skb_set_owner_r(skb, sk);
1580 skb->dev = NULL;
adf30907 1581 skb_dst_drop(skb);
1da177e4 1582
84531c24
PO
1583 /* drop conntrack reference */
1584 nf_reset(skb);
1585
1da177e4
LT
1586 spin_lock(&sk->sk_receive_queue.lock);
1587 po->stats.tp_packets++;
3b885787 1588 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1589 __skb_queue_tail(&sk->sk_receive_queue, skb);
1590 spin_unlock(&sk->sk_receive_queue.lock);
1591 sk->sk_data_ready(sk, skb->len);
1592 return 0;
1593
1594drop_n_acct:
7091fbd8
WB
1595 spin_lock(&sk->sk_receive_queue.lock);
1596 po->stats.tp_drops++;
1597 atomic_inc(&sk->sk_drops);
1598 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1599
1600drop_n_restore:
1601 if (skb_head != skb->data && skb_shared(skb)) {
1602 skb->data = skb_head;
1603 skb->len = skb_len;
1604 }
1605drop:
ead2ceb0 1606 consume_skb(skb);
1da177e4
LT
1607 return 0;
1608}
1609
40d4e3df
ED
1610static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1611 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1612{
1613 struct sock *sk;
1614 struct packet_sock *po;
1615 struct sockaddr_ll *sll;
bbd6ef87
PM
1616 union {
1617 struct tpacket_hdr *h1;
1618 struct tpacket2_hdr *h2;
f6fb8f10 1619 struct tpacket3_hdr *h3;
bbd6ef87
PM
1620 void *raw;
1621 } h;
40d4e3df 1622 u8 *skb_head = skb->data;
1da177e4 1623 int skb_len = skb->len;
dbcb5855 1624 unsigned int snaplen, res;
f6fb8f10 1625 unsigned long status = TP_STATUS_USER;
bbd6ef87 1626 unsigned short macoff, netoff, hdrlen;
1da177e4 1627 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1628 struct timeval tv;
bbd6ef87 1629 struct timespec ts;
614f60fa 1630 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1631
1632 if (skb->pkt_type == PACKET_LOOPBACK)
1633 goto drop;
1634
1635 sk = pt->af_packet_priv;
1636 po = pkt_sk(sk);
1637
09ad9bc7 1638 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1639 goto drop;
1640
3b04ddde 1641 if (dev->header_ops) {
1da177e4 1642 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1643 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1644 else if (skb->pkt_type == PACKET_OUTGOING) {
1645 /* Special case: outgoing packets have ll header at head */
bbe735e4 1646 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1647 }
1648 }
1649
8dc41944
HX
1650 if (skb->ip_summed == CHECKSUM_PARTIAL)
1651 status |= TP_STATUS_CSUMNOTREADY;
1652
1da177e4
LT
1653 snaplen = skb->len;
1654
dbcb5855
DM
1655 res = run_filter(skb, sk, snaplen);
1656 if (!res)
fda9ef5d 1657 goto drop_n_restore;
dbcb5855
DM
1658 if (snaplen > res)
1659 snaplen = res;
1da177e4
LT
1660
1661 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1662 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1663 po->tp_reserve;
1da177e4 1664 } else {
95c96174 1665 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1666 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1667 (maclen < 16 ? 16 : maclen)) +
1668 po->tp_reserve;
1da177e4
LT
1669 macoff = netoff - maclen;
1670 }
f6fb8f10 1671 if (po->tp_version <= TPACKET_V2) {
1672 if (macoff + snaplen > po->rx_ring.frame_size) {
1673 if (po->copy_thresh &&
0fd7bac6 1674 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1675 if (skb_shared(skb)) {
1676 copy_skb = skb_clone(skb, GFP_ATOMIC);
1677 } else {
1678 copy_skb = skb_get(skb);
1679 skb_head = skb->data;
1680 }
1681 if (copy_skb)
1682 skb_set_owner_r(copy_skb, sk);
1da177e4 1683 }
f6fb8f10 1684 snaplen = po->rx_ring.frame_size - macoff;
1685 if ((int)snaplen < 0)
1686 snaplen = 0;
1da177e4 1687 }
1da177e4 1688 }
1da177e4 1689 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1690 h.raw = packet_current_rx_frame(po, skb,
1691 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1692 if (!h.raw)
1da177e4 1693 goto ring_is_full;
f6fb8f10 1694 if (po->tp_version <= TPACKET_V2) {
1695 packet_increment_rx_head(po, &po->rx_ring);
1696 /*
1697 * LOSING will be reported till you read the stats,
1698 * because it's COR - Clear On Read.
1699 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1700 * at packet level.
1701 */
1702 if (po->stats.tp_drops)
1703 status |= TP_STATUS_LOSING;
1704 }
1da177e4
LT
1705 po->stats.tp_packets++;
1706 if (copy_skb) {
1707 status |= TP_STATUS_COPY;
1708 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1709 }
1da177e4
LT
1710 spin_unlock(&sk->sk_receive_queue.lock);
1711
bbd6ef87 1712 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1713
bbd6ef87
PM
1714 switch (po->tp_version) {
1715 case TPACKET_V1:
1716 h.h1->tp_len = skb->len;
1717 h.h1->tp_snaplen = snaplen;
1718 h.h1->tp_mac = macoff;
1719 h.h1->tp_net = netoff;
614f60fa
SM
1720 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1721 && shhwtstamps->syststamp.tv64)
1722 tv = ktime_to_timeval(shhwtstamps->syststamp);
1723 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1724 && shhwtstamps->hwtstamp.tv64)
1725 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1726 else if (skb->tstamp.tv64)
bbd6ef87
PM
1727 tv = ktime_to_timeval(skb->tstamp);
1728 else
1729 do_gettimeofday(&tv);
1730 h.h1->tp_sec = tv.tv_sec;
1731 h.h1->tp_usec = tv.tv_usec;
1732 hdrlen = sizeof(*h.h1);
1733 break;
1734 case TPACKET_V2:
1735 h.h2->tp_len = skb->len;
1736 h.h2->tp_snaplen = snaplen;
1737 h.h2->tp_mac = macoff;
1738 h.h2->tp_net = netoff;
614f60fa
SM
1739 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1740 && shhwtstamps->syststamp.tv64)
1741 ts = ktime_to_timespec(shhwtstamps->syststamp);
1742 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1743 && shhwtstamps->hwtstamp.tv64)
1744 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1745 else if (skb->tstamp.tv64)
bbd6ef87
PM
1746 ts = ktime_to_timespec(skb->tstamp);
1747 else
1748 getnstimeofday(&ts);
1749 h.h2->tp_sec = ts.tv_sec;
1750 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1751 if (vlan_tx_tag_present(skb)) {
1752 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1753 status |= TP_STATUS_VLAN_VALID;
1754 } else {
1755 h.h2->tp_vlan_tci = 0;
1756 }
13fcb7bd 1757 h.h2->tp_padding = 0;
bbd6ef87
PM
1758 hdrlen = sizeof(*h.h2);
1759 break;
f6fb8f10 1760 case TPACKET_V3:
1761 /* tp_nxt_offset,vlan are already populated above.
1762 * So DONT clear those fields here
1763 */
1764 h.h3->tp_status |= status;
1765 h.h3->tp_len = skb->len;
1766 h.h3->tp_snaplen = snaplen;
1767 h.h3->tp_mac = macoff;
1768 h.h3->tp_net = netoff;
1769 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1770 && shhwtstamps->syststamp.tv64)
1771 ts = ktime_to_timespec(shhwtstamps->syststamp);
1772 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1773 && shhwtstamps->hwtstamp.tv64)
1774 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1775 else if (skb->tstamp.tv64)
1776 ts = ktime_to_timespec(skb->tstamp);
1777 else
1778 getnstimeofday(&ts);
1779 h.h3->tp_sec = ts.tv_sec;
1780 h.h3->tp_nsec = ts.tv_nsec;
1781 hdrlen = sizeof(*h.h3);
1782 break;
bbd6ef87
PM
1783 default:
1784 BUG();
1785 }
1da177e4 1786
bbd6ef87 1787 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1788 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1789 sll->sll_family = AF_PACKET;
1790 sll->sll_hatype = dev->type;
1791 sll->sll_protocol = skb->protocol;
1792 sll->sll_pkttype = skb->pkt_type;
8032b464 1793 if (unlikely(po->origdev))
80feaacb
PWJ
1794 sll->sll_ifindex = orig_dev->ifindex;
1795 else
1796 sll->sll_ifindex = dev->ifindex;
1da177e4 1797
e16aa207 1798 smp_mb();
f6dafa95 1799#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1800 {
0af55bb5
CG
1801 u8 *start, *end;
1802
f6fb8f10 1803 if (po->tp_version <= TPACKET_V2) {
1804 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1805 + macoff + snaplen);
1806 for (start = h.raw; start < end; start += PAGE_SIZE)
1807 flush_dcache_page(pgv_to_page(start));
1808 }
cc9f01b2 1809 smp_wmb();
1da177e4 1810 }
f6dafa95 1811#endif
f6fb8f10 1812 if (po->tp_version <= TPACKET_V2)
1813 __packet_set_status(po, h.raw, status);
1814 else
1815 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1816
1817 sk->sk_data_ready(sk, 0);
1818
1819drop_n_restore:
1820 if (skb_head != skb->data && skb_shared(skb)) {
1821 skb->data = skb_head;
1822 skb->len = skb_len;
1823 }
1824drop:
1ce4f28b 1825 kfree_skb(skb);
1da177e4
LT
1826 return 0;
1827
1828ring_is_full:
1829 po->stats.tp_drops++;
1830 spin_unlock(&sk->sk_receive_queue.lock);
1831
1832 sk->sk_data_ready(sk, 0);
acb5d75b 1833 kfree_skb(copy_skb);
1da177e4
LT
1834 goto drop_n_restore;
1835}
1836
69e3c75f
JB
1837static void tpacket_destruct_skb(struct sk_buff *skb)
1838{
1839 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1840 void *ph;
1da177e4 1841
69e3c75f
JB
1842 if (likely(po->tx_ring.pg_vec)) {
1843 ph = skb_shinfo(skb)->destructor_arg;
1844 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1845 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1846 atomic_dec(&po->tx_ring.pending);
1847 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1848 }
1849
1850 sock_wfree(skb);
1851}
1852
40d4e3df
ED
1853static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1854 void *frame, struct net_device *dev, int size_max,
ae641949 1855 __be16 proto, unsigned char *addr, int hlen)
69e3c75f
JB
1856{
1857 union {
1858 struct tpacket_hdr *h1;
1859 struct tpacket2_hdr *h2;
1860 void *raw;
1861 } ph;
1862 int to_write, offset, len, tp_len, nr_frags, len_max;
1863 struct socket *sock = po->sk.sk_socket;
1864 struct page *page;
1865 void *data;
1866 int err;
1867
1868 ph.raw = frame;
1869
1870 skb->protocol = proto;
1871 skb->dev = dev;
1872 skb->priority = po->sk.sk_priority;
2d37a186 1873 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1874 skb_shinfo(skb)->destructor_arg = ph.raw;
1875
1876 switch (po->tp_version) {
1877 case TPACKET_V2:
1878 tp_len = ph.h2->tp_len;
1879 break;
1880 default:
1881 tp_len = ph.h1->tp_len;
1882 break;
1883 }
1884 if (unlikely(tp_len > size_max)) {
40d4e3df 1885 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1886 return -EMSGSIZE;
1887 }
1888
ae641949 1889 skb_reserve(skb, hlen);
69e3c75f
JB
1890 skb_reset_network_header(skb);
1891
1892 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1893 to_write = tp_len;
1894
1895 if (sock->type == SOCK_DGRAM) {
1896 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1897 NULL, tp_len);
1898 if (unlikely(err < 0))
1899 return -EINVAL;
40d4e3df 1900 } else if (dev->hard_header_len) {
69e3c75f
JB
1901 /* net device doesn't like empty head */
1902 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1903 pr_err("packet size is too short (%d < %d)\n",
1904 tp_len, dev->hard_header_len);
69e3c75f
JB
1905 return -EINVAL;
1906 }
1907
1908 skb_push(skb, dev->hard_header_len);
1909 err = skb_store_bits(skb, 0, data,
1910 dev->hard_header_len);
1911 if (unlikely(err))
1912 return err;
1913
1914 data += dev->hard_header_len;
1915 to_write -= dev->hard_header_len;
1916 }
1917
1918 err = -EFAULT;
69e3c75f
JB
1919 offset = offset_in_page(data);
1920 len_max = PAGE_SIZE - offset;
1921 len = ((to_write > len_max) ? len_max : to_write);
1922
1923 skb->data_len = to_write;
1924 skb->len += to_write;
1925 skb->truesize += to_write;
1926 atomic_add(to_write, &po->sk.sk_wmem_alloc);
1927
1928 while (likely(to_write)) {
1929 nr_frags = skb_shinfo(skb)->nr_frags;
1930
1931 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
1932 pr_err("Packet exceed the number of skb frags(%lu)\n",
1933 MAX_SKB_FRAGS);
69e3c75f
JB
1934 return -EFAULT;
1935 }
1936
0af55bb5
CG
1937 page = pgv_to_page(data);
1938 data += len;
69e3c75f
JB
1939 flush_dcache_page(page);
1940 get_page(page);
0af55bb5 1941 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
1942 to_write -= len;
1943 offset = 0;
1944 len_max = PAGE_SIZE;
1945 len = ((to_write > len_max) ? len_max : to_write);
1946 }
1947
1948 return tp_len;
1949}
1950
1951static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
1952{
69e3c75f
JB
1953 struct sk_buff *skb;
1954 struct net_device *dev;
1955 __be16 proto;
827d9780
BG
1956 bool need_rls_dev = false;
1957 int err, reserve = 0;
40d4e3df
ED
1958 void *ph;
1959 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
1960 int tp_len, size_max;
1961 unsigned char *addr;
1962 int len_sum = 0;
1963 int status = 0;
ae641949 1964 int hlen, tlen;
69e3c75f 1965
69e3c75f
JB
1966 mutex_lock(&po->pg_vec_lock);
1967
1968 err = -EBUSY;
1969 if (saddr == NULL) {
827d9780 1970 dev = po->prot_hook.dev;
69e3c75f
JB
1971 proto = po->num;
1972 addr = NULL;
1973 } else {
1974 err = -EINVAL;
1975 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1976 goto out;
1977 if (msg->msg_namelen < (saddr->sll_halen
1978 + offsetof(struct sockaddr_ll,
1979 sll_addr)))
1980 goto out;
69e3c75f
JB
1981 proto = saddr->sll_protocol;
1982 addr = saddr->sll_addr;
827d9780
BG
1983 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
1984 need_rls_dev = true;
69e3c75f
JB
1985 }
1986
69e3c75f
JB
1987 err = -ENXIO;
1988 if (unlikely(dev == NULL))
1989 goto out;
1990
1991 reserve = dev->hard_header_len;
1992
1993 err = -ENETDOWN;
1994 if (unlikely(!(dev->flags & IFF_UP)))
1995 goto out_put;
1996
1997 size_max = po->tx_ring.frame_size
b5dd884e 1998 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
1999
2000 if (size_max > dev->mtu + reserve)
2001 size_max = dev->mtu + reserve;
2002
2003 do {
2004 ph = packet_current_frame(po, &po->tx_ring,
2005 TP_STATUS_SEND_REQUEST);
2006
2007 if (unlikely(ph == NULL)) {
2008 schedule();
2009 continue;
2010 }
2011
2012 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2013 hlen = LL_RESERVED_SPACE(dev);
2014 tlen = dev->needed_tailroom;
69e3c75f 2015 skb = sock_alloc_send_skb(&po->sk,
ae641949 2016 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2017 0, &err);
2018
2019 if (unlikely(skb == NULL))
2020 goto out_status;
2021
2022 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2023 addr, hlen);
69e3c75f
JB
2024
2025 if (unlikely(tp_len < 0)) {
2026 if (po->tp_loss) {
2027 __packet_set_status(po, ph,
2028 TP_STATUS_AVAILABLE);
2029 packet_increment_head(&po->tx_ring);
2030 kfree_skb(skb);
2031 continue;
2032 } else {
2033 status = TP_STATUS_WRONG_FORMAT;
2034 err = tp_len;
2035 goto out_status;
2036 }
2037 }
2038
2039 skb->destructor = tpacket_destruct_skb;
2040 __packet_set_status(po, ph, TP_STATUS_SENDING);
2041 atomic_inc(&po->tx_ring.pending);
2042
2043 status = TP_STATUS_SEND_REQUEST;
2044 err = dev_queue_xmit(skb);
eb70df13
JP
2045 if (unlikely(err > 0)) {
2046 err = net_xmit_errno(err);
2047 if (err && __packet_get_status(po, ph) ==
2048 TP_STATUS_AVAILABLE) {
2049 /* skb was destructed already */
2050 skb = NULL;
2051 goto out_status;
2052 }
2053 /*
2054 * skb was dropped but not destructed yet;
2055 * let's treat it like congestion or err < 0
2056 */
2057 err = 0;
2058 }
69e3c75f
JB
2059 packet_increment_head(&po->tx_ring);
2060 len_sum += tp_len;
f64f9e71
JP
2061 } while (likely((ph != NULL) ||
2062 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2063 (atomic_read(&po->tx_ring.pending))))
2064 );
69e3c75f
JB
2065
2066 err = len_sum;
2067 goto out_put;
2068
69e3c75f
JB
2069out_status:
2070 __packet_set_status(po, ph, status);
2071 kfree_skb(skb);
2072out_put:
827d9780
BG
2073 if (need_rls_dev)
2074 dev_put(dev);
69e3c75f
JB
2075out:
2076 mutex_unlock(&po->pg_vec_lock);
2077 return err;
2078}
69e3c75f 2079
eea49cc9
OJ
2080static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2081 size_t reserve, size_t len,
2082 size_t linear, int noblock,
2083 int *err)
bfd5f4a3
SS
2084{
2085 struct sk_buff *skb;
2086
2087 /* Under a page? Don't bother with paged skb. */
2088 if (prepad + len < PAGE_SIZE || !linear)
2089 linear = len;
2090
2091 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2092 err);
2093 if (!skb)
2094 return NULL;
2095
2096 skb_reserve(skb, reserve);
2097 skb_put(skb, linear);
2098 skb->data_len = len - linear;
2099 skb->len += len - linear;
2100
2101 return skb;
2102}
2103
69e3c75f 2104static int packet_snd(struct socket *sock,
1da177e4
LT
2105 struct msghdr *msg, size_t len)
2106{
2107 struct sock *sk = sock->sk;
40d4e3df 2108 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2109 struct sk_buff *skb;
2110 struct net_device *dev;
0e11c91e 2111 __be16 proto;
827d9780 2112 bool need_rls_dev = false;
1da177e4 2113 unsigned char *addr;
827d9780 2114 int err, reserve = 0;
bfd5f4a3
SS
2115 struct virtio_net_hdr vnet_hdr = { 0 };
2116 int offset = 0;
2117 int vnet_hdr_len;
2118 struct packet_sock *po = pkt_sk(sk);
2119 unsigned short gso_type = 0;
ae641949 2120 int hlen, tlen;
3bdc0eba 2121 int extra_len = 0;
1da177e4
LT
2122
2123 /*
1ce4f28b 2124 * Get and verify the address.
1da177e4 2125 */
1ce4f28b 2126
1da177e4 2127 if (saddr == NULL) {
827d9780 2128 dev = po->prot_hook.dev;
1da177e4
LT
2129 proto = po->num;
2130 addr = NULL;
2131 } else {
2132 err = -EINVAL;
2133 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2134 goto out;
0fb375fb
EB
2135 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2136 goto out;
1da177e4
LT
2137 proto = saddr->sll_protocol;
2138 addr = saddr->sll_addr;
827d9780
BG
2139 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2140 need_rls_dev = true;
1da177e4
LT
2141 }
2142
1da177e4
LT
2143 err = -ENXIO;
2144 if (dev == NULL)
2145 goto out_unlock;
2146 if (sock->type == SOCK_RAW)
2147 reserve = dev->hard_header_len;
2148
d5e76b0a
DM
2149 err = -ENETDOWN;
2150 if (!(dev->flags & IFF_UP))
2151 goto out_unlock;
2152
bfd5f4a3
SS
2153 if (po->has_vnet_hdr) {
2154 vnet_hdr_len = sizeof(vnet_hdr);
2155
2156 err = -EINVAL;
2157 if (len < vnet_hdr_len)
2158 goto out_unlock;
2159
2160 len -= vnet_hdr_len;
2161
2162 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2163 vnet_hdr_len);
2164 if (err < 0)
2165 goto out_unlock;
2166
2167 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2168 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2169 vnet_hdr.hdr_len))
2170 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2171 vnet_hdr.csum_offset + 2;
2172
2173 err = -EINVAL;
2174 if (vnet_hdr.hdr_len > len)
2175 goto out_unlock;
2176
2177 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2178 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2179 case VIRTIO_NET_HDR_GSO_TCPV4:
2180 gso_type = SKB_GSO_TCPV4;
2181 break;
2182 case VIRTIO_NET_HDR_GSO_TCPV6:
2183 gso_type = SKB_GSO_TCPV6;
2184 break;
2185 case VIRTIO_NET_HDR_GSO_UDP:
2186 gso_type = SKB_GSO_UDP;
2187 break;
2188 default:
2189 goto out_unlock;
2190 }
2191
2192 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2193 gso_type |= SKB_GSO_TCP_ECN;
2194
2195 if (vnet_hdr.gso_size == 0)
2196 goto out_unlock;
2197
2198 }
2199 }
2200
3bdc0eba
BG
2201 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2202 if (!netif_supports_nofcs(dev)) {
2203 err = -EPROTONOSUPPORT;
2204 goto out_unlock;
2205 }
2206 extra_len = 4; /* We're doing our own CRC */
2207 }
2208
1da177e4 2209 err = -EMSGSIZE;
3bdc0eba 2210 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2211 goto out_unlock;
2212
bfd5f4a3 2213 err = -ENOBUFS;
ae641949
HX
2214 hlen = LL_RESERVED_SPACE(dev);
2215 tlen = dev->needed_tailroom;
2216 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2217 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2218 if (skb == NULL)
1da177e4
LT
2219 goto out_unlock;
2220
bfd5f4a3 2221 skb_set_network_header(skb, reserve);
1da177e4 2222
0c4e8581
SH
2223 err = -EINVAL;
2224 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2225 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2226 goto out_free;
1da177e4
LT
2227
2228 /* Returns -EFAULT on error */
bfd5f4a3 2229 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2230 if (err)
2231 goto out_free;
2244d07b 2232 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2233 if (err < 0)
2234 goto out_free;
1da177e4 2235
3bdc0eba 2236 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2237 /* Earlier code assumed this would be a VLAN pkt,
2238 * double-check this now that we have the actual
2239 * packet in hand.
2240 */
2241 struct ethhdr *ehdr;
2242 skb_reset_mac_header(skb);
2243 ehdr = eth_hdr(skb);
2244 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2245 err = -EMSGSIZE;
2246 goto out_free;
2247 }
2248 }
2249
1da177e4
LT
2250 skb->protocol = proto;
2251 skb->dev = dev;
2252 skb->priority = sk->sk_priority;
2d37a186 2253 skb->mark = sk->sk_mark;
1da177e4 2254
bfd5f4a3
SS
2255 if (po->has_vnet_hdr) {
2256 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2257 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2258 vnet_hdr.csum_offset)) {
2259 err = -EINVAL;
2260 goto out_free;
2261 }
2262 }
2263
2264 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2265 skb_shinfo(skb)->gso_type = gso_type;
2266
2267 /* Header must be checked, and gso_segs computed. */
2268 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2269 skb_shinfo(skb)->gso_segs = 0;
2270
2271 len += vnet_hdr_len;
2272 }
2273
3bdc0eba
BG
2274 if (unlikely(extra_len == 4))
2275 skb->no_fcs = 1;
2276
1da177e4
LT
2277 /*
2278 * Now send it
2279 */
2280
2281 err = dev_queue_xmit(skb);
2282 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2283 goto out_unlock;
2284
827d9780
BG
2285 if (need_rls_dev)
2286 dev_put(dev);
1da177e4 2287
40d4e3df 2288 return len;
1da177e4
LT
2289
2290out_free:
2291 kfree_skb(skb);
2292out_unlock:
827d9780 2293 if (dev && need_rls_dev)
1da177e4
LT
2294 dev_put(dev);
2295out:
2296 return err;
2297}
2298
69e3c75f
JB
2299static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2300 struct msghdr *msg, size_t len)
2301{
69e3c75f
JB
2302 struct sock *sk = sock->sk;
2303 struct packet_sock *po = pkt_sk(sk);
2304 if (po->tx_ring.pg_vec)
2305 return tpacket_snd(po, msg);
2306 else
69e3c75f
JB
2307 return packet_snd(sock, msg, len);
2308}
2309
1da177e4
LT
2310/*
2311 * Close a PACKET socket. This is fairly simple. We immediately go
2312 * to 'closed' state and remove our protocol entry in the device list.
2313 */
2314
2315static int packet_release(struct socket *sock)
2316{
2317 struct sock *sk = sock->sk;
2318 struct packet_sock *po;
d12d01d6 2319 struct net *net;
f6fb8f10 2320 union tpacket_req_u req_u;
1da177e4
LT
2321
2322 if (!sk)
2323 return 0;
2324
3b1e0a65 2325 net = sock_net(sk);
1da177e4
LT
2326 po = pkt_sk(sk);
2327
808f5114 2328 spin_lock_bh(&net->packet.sklist_lock);
2329 sk_del_node_init_rcu(sk);
920de804 2330 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2331 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2332
808f5114 2333 spin_lock(&po->bind_lock);
ce06b03e 2334 unregister_prot_hook(sk, false);
160ff18a
BG
2335 if (po->prot_hook.dev) {
2336 dev_put(po->prot_hook.dev);
2337 po->prot_hook.dev = NULL;
2338 }
808f5114 2339 spin_unlock(&po->bind_lock);
1da177e4 2340
1da177e4 2341 packet_flush_mclist(sk);
1da177e4 2342
f6fb8f10 2343 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2344
2345 if (po->rx_ring.pg_vec)
f6fb8f10 2346 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2347
2348 if (po->tx_ring.pg_vec)
f6fb8f10 2349 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2350
dc99f600
DM
2351 fanout_release(sk);
2352
808f5114 2353 synchronize_net();
1da177e4
LT
2354 /*
2355 * Now the socket is dead. No more input will appear.
2356 */
1da177e4
LT
2357 sock_orphan(sk);
2358 sock->sk = NULL;
2359
2360 /* Purge queues */
2361
2362 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2363 sk_refcnt_debug_release(sk);
1da177e4
LT
2364
2365 sock_put(sk);
2366 return 0;
2367}
2368
2369/*
2370 * Attach a packet hook.
2371 */
2372
0e11c91e 2373static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2374{
2375 struct packet_sock *po = pkt_sk(sk);
dc99f600 2376
aef950b4
WY
2377 if (po->fanout) {
2378 if (dev)
2379 dev_put(dev);
2380
dc99f600 2381 return -EINVAL;
aef950b4 2382 }
1da177e4
LT
2383
2384 lock_sock(sk);
2385
2386 spin_lock(&po->bind_lock);
ce06b03e 2387 unregister_prot_hook(sk, true);
1da177e4
LT
2388 po->num = protocol;
2389 po->prot_hook.type = protocol;
160ff18a
BG
2390 if (po->prot_hook.dev)
2391 dev_put(po->prot_hook.dev);
1da177e4
LT
2392 po->prot_hook.dev = dev;
2393
2394 po->ifindex = dev ? dev->ifindex : 0;
2395
2396 if (protocol == 0)
2397 goto out_unlock;
2398
be85d4ad 2399 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2400 register_prot_hook(sk);
be85d4ad
UT
2401 } else {
2402 sk->sk_err = ENETDOWN;
2403 if (!sock_flag(sk, SOCK_DEAD))
2404 sk->sk_error_report(sk);
1da177e4
LT
2405 }
2406
2407out_unlock:
2408 spin_unlock(&po->bind_lock);
2409 release_sock(sk);
2410 return 0;
2411}
2412
2413/*
2414 * Bind a packet socket to a device
2415 */
2416
40d4e3df
ED
2417static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2418 int addr_len)
1da177e4 2419{
40d4e3df 2420 struct sock *sk = sock->sk;
1da177e4
LT
2421 char name[15];
2422 struct net_device *dev;
2423 int err = -ENODEV;
1ce4f28b 2424
1da177e4
LT
2425 /*
2426 * Check legality
2427 */
1ce4f28b 2428
8ae55f04 2429 if (addr_len != sizeof(struct sockaddr))
1da177e4 2430 return -EINVAL;
40d4e3df 2431 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2432
3b1e0a65 2433 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2434 if (dev)
1da177e4 2435 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2436 return err;
2437}
1da177e4
LT
2438
2439static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2440{
40d4e3df
ED
2441 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2442 struct sock *sk = sock->sk;
1da177e4
LT
2443 struct net_device *dev = NULL;
2444 int err;
2445
2446
2447 /*
2448 * Check legality
2449 */
1ce4f28b 2450
1da177e4
LT
2451 if (addr_len < sizeof(struct sockaddr_ll))
2452 return -EINVAL;
2453 if (sll->sll_family != AF_PACKET)
2454 return -EINVAL;
2455
2456 if (sll->sll_ifindex) {
2457 err = -ENODEV;
3b1e0a65 2458 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2459 if (dev == NULL)
2460 goto out;
2461 }
2462 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2463
2464out:
2465 return err;
2466}
2467
2468static struct proto packet_proto = {
2469 .name = "PACKET",
2470 .owner = THIS_MODULE,
2471 .obj_size = sizeof(struct packet_sock),
2472};
2473
2474/*
1ce4f28b 2475 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2476 */
2477
3f378b68
EP
2478static int packet_create(struct net *net, struct socket *sock, int protocol,
2479 int kern)
1da177e4
LT
2480{
2481 struct sock *sk;
2482 struct packet_sock *po;
0e11c91e 2483 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2484 int err;
2485
2486 if (!capable(CAP_NET_RAW))
2487 return -EPERM;
be02097c
DM
2488 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2489 sock->type != SOCK_PACKET)
1da177e4
LT
2490 return -ESOCKTNOSUPPORT;
2491
2492 sock->state = SS_UNCONNECTED;
2493
2494 err = -ENOBUFS;
6257ff21 2495 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2496 if (sk == NULL)
2497 goto out;
2498
2499 sock->ops = &packet_ops;
1da177e4
LT
2500 if (sock->type == SOCK_PACKET)
2501 sock->ops = &packet_ops_spkt;
be02097c 2502
1da177e4
LT
2503 sock_init_data(sock, sk);
2504
2505 po = pkt_sk(sk);
2506 sk->sk_family = PF_PACKET;
0e11c91e 2507 po->num = proto;
1da177e4
LT
2508
2509 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2510 sk_refcnt_debug_inc(sk);
1da177e4
LT
2511
2512 /*
2513 * Attach a protocol block
2514 */
2515
2516 spin_lock_init(&po->bind_lock);
905db440 2517 mutex_init(&po->pg_vec_lock);
1da177e4 2518 po->prot_hook.func = packet_rcv;
be02097c 2519
1da177e4
LT
2520 if (sock->type == SOCK_PACKET)
2521 po->prot_hook.func = packet_rcv_spkt;
be02097c 2522
1da177e4
LT
2523 po->prot_hook.af_packet_priv = sk;
2524
0e11c91e
AV
2525 if (proto) {
2526 po->prot_hook.type = proto;
ce06b03e 2527 register_prot_hook(sk);
1da177e4
LT
2528 }
2529
808f5114 2530 spin_lock_bh(&net->packet.sklist_lock);
2531 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2532 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2533 spin_unlock_bh(&net->packet.sklist_lock);
2534
40d4e3df 2535 return 0;
1da177e4
LT
2536out:
2537 return err;
2538}
2539
ed85b565
RC
2540static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2541{
2542 struct sock_exterr_skb *serr;
2543 struct sk_buff *skb, *skb2;
2544 int copied, err;
2545
2546 err = -EAGAIN;
2547 skb = skb_dequeue(&sk->sk_error_queue);
2548 if (skb == NULL)
2549 goto out;
2550
2551 copied = skb->len;
2552 if (copied > len) {
2553 msg->msg_flags |= MSG_TRUNC;
2554 copied = len;
2555 }
2556 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2557 if (err)
2558 goto out_free_skb;
2559
2560 sock_recv_timestamp(msg, sk, skb);
2561
2562 serr = SKB_EXT_ERR(skb);
2563 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2564 sizeof(serr->ee), &serr->ee);
2565
2566 msg->msg_flags |= MSG_ERRQUEUE;
2567 err = copied;
2568
2569 /* Reset and regenerate socket error */
2570 spin_lock_bh(&sk->sk_error_queue.lock);
2571 sk->sk_err = 0;
2572 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2573 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2574 spin_unlock_bh(&sk->sk_error_queue.lock);
2575 sk->sk_error_report(sk);
2576 } else
2577 spin_unlock_bh(&sk->sk_error_queue.lock);
2578
2579out_free_skb:
2580 kfree_skb(skb);
2581out:
2582 return err;
2583}
2584
1da177e4
LT
2585/*
2586 * Pull a packet from our receive queue and hand it to the user.
2587 * If necessary we block.
2588 */
2589
2590static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2591 struct msghdr *msg, size_t len, int flags)
2592{
2593 struct sock *sk = sock->sk;
2594 struct sk_buff *skb;
2595 int copied, err;
0fb375fb 2596 struct sockaddr_ll *sll;
bfd5f4a3 2597 int vnet_hdr_len = 0;
1da177e4
LT
2598
2599 err = -EINVAL;
ed85b565 2600 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2601 goto out;
2602
2603#if 0
2604 /* What error should we return now? EUNATTACH? */
2605 if (pkt_sk(sk)->ifindex < 0)
2606 return -ENODEV;
2607#endif
2608
ed85b565
RC
2609 if (flags & MSG_ERRQUEUE) {
2610 err = packet_recv_error(sk, msg, len);
2611 goto out;
2612 }
2613
1da177e4
LT
2614 /*
2615 * Call the generic datagram receiver. This handles all sorts
2616 * of horrible races and re-entrancy so we can forget about it
2617 * in the protocol layers.
2618 *
2619 * Now it will return ENETDOWN, if device have just gone down,
2620 * but then it will block.
2621 */
2622
40d4e3df 2623 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2624
2625 /*
1ce4f28b 2626 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2627 * handles the blocking we don't see and worry about blocking
2628 * retries.
2629 */
2630
8ae55f04 2631 if (skb == NULL)
1da177e4
LT
2632 goto out;
2633
bfd5f4a3
SS
2634 if (pkt_sk(sk)->has_vnet_hdr) {
2635 struct virtio_net_hdr vnet_hdr = { 0 };
2636
2637 err = -EINVAL;
2638 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2639 if (len < vnet_hdr_len)
bfd5f4a3
SS
2640 goto out_free;
2641
1f18b717
MK
2642 len -= vnet_hdr_len;
2643
bfd5f4a3
SS
2644 if (skb_is_gso(skb)) {
2645 struct skb_shared_info *sinfo = skb_shinfo(skb);
2646
2647 /* This is a hint as to how much should be linear. */
2648 vnet_hdr.hdr_len = skb_headlen(skb);
2649 vnet_hdr.gso_size = sinfo->gso_size;
2650 if (sinfo->gso_type & SKB_GSO_TCPV4)
2651 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2652 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2653 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2654 else if (sinfo->gso_type & SKB_GSO_UDP)
2655 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2656 else if (sinfo->gso_type & SKB_GSO_FCOE)
2657 goto out_free;
2658 else
2659 BUG();
2660 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2661 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2662 } else
2663 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2664
2665 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2666 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2667 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2668 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2669 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2670 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2671 } /* else everything is zero */
2672
2673 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2674 vnet_hdr_len);
2675 if (err < 0)
2676 goto out_free;
2677 }
2678
0fb375fb
EB
2679 /*
2680 * If the address length field is there to be filled in, we fill
2681 * it in now.
2682 */
2683
ffbc6111 2684 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2685 if (sock->type == SOCK_PACKET)
2686 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2687 else
2688 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2689
1da177e4
LT
2690 /*
2691 * You lose any data beyond the buffer you gave. If it worries a
2692 * user program they can ask the device for its MTU anyway.
2693 */
2694
2695 copied = skb->len;
40d4e3df
ED
2696 if (copied > len) {
2697 copied = len;
2698 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2699 }
2700
2701 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2702 if (err)
2703 goto out_free;
2704
3b885787 2705 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2706
2707 if (msg->msg_name)
ffbc6111
HX
2708 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2709 msg->msg_namelen);
1da177e4 2710
8dc41944 2711 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2712 struct tpacket_auxdata aux;
2713
2714 aux.tp_status = TP_STATUS_USER;
2715 if (skb->ip_summed == CHECKSUM_PARTIAL)
2716 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2717 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2718 aux.tp_snaplen = skb->len;
2719 aux.tp_mac = 0;
bbe735e4 2720 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2721 if (vlan_tx_tag_present(skb)) {
2722 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2723 aux.tp_status |= TP_STATUS_VLAN_VALID;
2724 } else {
2725 aux.tp_vlan_tci = 0;
2726 }
13fcb7bd 2727 aux.tp_padding = 0;
ffbc6111 2728 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2729 }
2730
1da177e4
LT
2731 /*
2732 * Free or return the buffer as appropriate. Again this
2733 * hides all the races and re-entrancy issues from us.
2734 */
bfd5f4a3 2735 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2736
2737out_free:
2738 skb_free_datagram(sk, skb);
2739out:
2740 return err;
2741}
2742
1da177e4
LT
2743static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2744 int *uaddr_len, int peer)
2745{
2746 struct net_device *dev;
2747 struct sock *sk = sock->sk;
2748
2749 if (peer)
2750 return -EOPNOTSUPP;
2751
2752 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2753 rcu_read_lock();
2754 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2755 if (dev)
67286640 2756 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2757 else
1da177e4 2758 memset(uaddr->sa_data, 0, 14);
654d1f8a 2759 rcu_read_unlock();
1da177e4
LT
2760 *uaddr_len = sizeof(*uaddr);
2761
2762 return 0;
2763}
1da177e4
LT
2764
2765static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2766 int *uaddr_len, int peer)
2767{
2768 struct net_device *dev;
2769 struct sock *sk = sock->sk;
2770 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2771 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2772
2773 if (peer)
2774 return -EOPNOTSUPP;
2775
2776 sll->sll_family = AF_PACKET;
2777 sll->sll_ifindex = po->ifindex;
2778 sll->sll_protocol = po->num;
67286640 2779 sll->sll_pkttype = 0;
654d1f8a
ED
2780 rcu_read_lock();
2781 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2782 if (dev) {
2783 sll->sll_hatype = dev->type;
2784 sll->sll_halen = dev->addr_len;
2785 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2786 } else {
2787 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2788 sll->sll_halen = 0;
2789 }
654d1f8a 2790 rcu_read_unlock();
0fb375fb 2791 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2792
2793 return 0;
2794}
2795
2aeb0b88
WC
2796static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2797 int what)
1da177e4
LT
2798{
2799 switch (i->type) {
2800 case PACKET_MR_MULTICAST:
1162563f
JP
2801 if (i->alen != dev->addr_len)
2802 return -EINVAL;
1da177e4 2803 if (what > 0)
22bedad3 2804 return dev_mc_add(dev, i->addr);
1da177e4 2805 else
22bedad3 2806 return dev_mc_del(dev, i->addr);
1da177e4
LT
2807 break;
2808 case PACKET_MR_PROMISC:
2aeb0b88 2809 return dev_set_promiscuity(dev, what);
1da177e4
LT
2810 break;
2811 case PACKET_MR_ALLMULTI:
2aeb0b88 2812 return dev_set_allmulti(dev, what);
1da177e4 2813 break;
d95ed927 2814 case PACKET_MR_UNICAST:
1162563f
JP
2815 if (i->alen != dev->addr_len)
2816 return -EINVAL;
d95ed927 2817 if (what > 0)
a748ee24 2818 return dev_uc_add(dev, i->addr);
d95ed927 2819 else
a748ee24 2820 return dev_uc_del(dev, i->addr);
d95ed927 2821 break;
40d4e3df
ED
2822 default:
2823 break;
1da177e4 2824 }
2aeb0b88 2825 return 0;
1da177e4
LT
2826}
2827
2828static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2829{
40d4e3df 2830 for ( ; i; i = i->next) {
1da177e4
LT
2831 if (i->ifindex == dev->ifindex)
2832 packet_dev_mc(dev, i, what);
2833 }
2834}
2835
0fb375fb 2836static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2837{
2838 struct packet_sock *po = pkt_sk(sk);
2839 struct packet_mclist *ml, *i;
2840 struct net_device *dev;
2841 int err;
2842
2843 rtnl_lock();
2844
2845 err = -ENODEV;
3b1e0a65 2846 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2847 if (!dev)
2848 goto done;
2849
2850 err = -EINVAL;
1162563f 2851 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2852 goto done;
2853
2854 err = -ENOBUFS;
8b3a7005 2855 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2856 if (i == NULL)
2857 goto done;
2858
2859 err = 0;
2860 for (ml = po->mclist; ml; ml = ml->next) {
2861 if (ml->ifindex == mreq->mr_ifindex &&
2862 ml->type == mreq->mr_type &&
2863 ml->alen == mreq->mr_alen &&
2864 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2865 ml->count++;
2866 /* Free the new element ... */
2867 kfree(i);
2868 goto done;
2869 }
2870 }
2871
2872 i->type = mreq->mr_type;
2873 i->ifindex = mreq->mr_ifindex;
2874 i->alen = mreq->mr_alen;
2875 memcpy(i->addr, mreq->mr_address, i->alen);
2876 i->count = 1;
2877 i->next = po->mclist;
2878 po->mclist = i;
2aeb0b88
WC
2879 err = packet_dev_mc(dev, i, 1);
2880 if (err) {
2881 po->mclist = i->next;
2882 kfree(i);
2883 }
1da177e4
LT
2884
2885done:
2886 rtnl_unlock();
2887 return err;
2888}
2889
0fb375fb 2890static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2891{
2892 struct packet_mclist *ml, **mlp;
2893
2894 rtnl_lock();
2895
2896 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2897 if (ml->ifindex == mreq->mr_ifindex &&
2898 ml->type == mreq->mr_type &&
2899 ml->alen == mreq->mr_alen &&
2900 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2901 if (--ml->count == 0) {
2902 struct net_device *dev;
2903 *mlp = ml->next;
ad959e76
ED
2904 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2905 if (dev)
1da177e4 2906 packet_dev_mc(dev, ml, -1);
1da177e4
LT
2907 kfree(ml);
2908 }
2909 rtnl_unlock();
2910 return 0;
2911 }
2912 }
2913 rtnl_unlock();
2914 return -EADDRNOTAVAIL;
2915}
2916
2917static void packet_flush_mclist(struct sock *sk)
2918{
2919 struct packet_sock *po = pkt_sk(sk);
2920 struct packet_mclist *ml;
2921
2922 if (!po->mclist)
2923 return;
2924
2925 rtnl_lock();
2926 while ((ml = po->mclist) != NULL) {
2927 struct net_device *dev;
2928
2929 po->mclist = ml->next;
ad959e76
ED
2930 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2931 if (dev != NULL)
1da177e4 2932 packet_dev_mc(dev, ml, -1);
1da177e4
LT
2933 kfree(ml);
2934 }
2935 rtnl_unlock();
2936}
1da177e4
LT
2937
2938static int
b7058842 2939packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
2940{
2941 struct sock *sk = sock->sk;
8dc41944 2942 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
2943 int ret;
2944
2945 if (level != SOL_PACKET)
2946 return -ENOPROTOOPT;
2947
69e3c75f 2948 switch (optname) {
1ce4f28b 2949 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
2950 case PACKET_DROP_MEMBERSHIP:
2951 {
0fb375fb
EB
2952 struct packet_mreq_max mreq;
2953 int len = optlen;
2954 memset(&mreq, 0, sizeof(mreq));
2955 if (len < sizeof(struct packet_mreq))
1da177e4 2956 return -EINVAL;
0fb375fb
EB
2957 if (len > sizeof(mreq))
2958 len = sizeof(mreq);
40d4e3df 2959 if (copy_from_user(&mreq, optval, len))
1da177e4 2960 return -EFAULT;
0fb375fb
EB
2961 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
2962 return -EINVAL;
1da177e4
LT
2963 if (optname == PACKET_ADD_MEMBERSHIP)
2964 ret = packet_mc_add(sk, &mreq);
2965 else
2966 ret = packet_mc_drop(sk, &mreq);
2967 return ret;
2968 }
a2efcfa0 2969
1da177e4 2970 case PACKET_RX_RING:
69e3c75f 2971 case PACKET_TX_RING:
1da177e4 2972 {
f6fb8f10 2973 union tpacket_req_u req_u;
2974 int len;
1da177e4 2975
f6fb8f10 2976 switch (po->tp_version) {
2977 case TPACKET_V1:
2978 case TPACKET_V2:
2979 len = sizeof(req_u.req);
2980 break;
2981 case TPACKET_V3:
2982 default:
2983 len = sizeof(req_u.req3);
2984 break;
2985 }
2986 if (optlen < len)
1da177e4 2987 return -EINVAL;
bfd5f4a3
SS
2988 if (pkt_sk(sk)->has_vnet_hdr)
2989 return -EINVAL;
f6fb8f10 2990 if (copy_from_user(&req_u.req, optval, len))
1da177e4 2991 return -EFAULT;
f6fb8f10 2992 return packet_set_ring(sk, &req_u, 0,
2993 optname == PACKET_TX_RING);
1da177e4
LT
2994 }
2995 case PACKET_COPY_THRESH:
2996 {
2997 int val;
2998
40d4e3df 2999 if (optlen != sizeof(val))
1da177e4 3000 return -EINVAL;
40d4e3df 3001 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3002 return -EFAULT;
3003
3004 pkt_sk(sk)->copy_thresh = val;
3005 return 0;
3006 }
bbd6ef87
PM
3007 case PACKET_VERSION:
3008 {
3009 int val;
3010
3011 if (optlen != sizeof(val))
3012 return -EINVAL;
69e3c75f 3013 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3014 return -EBUSY;
3015 if (copy_from_user(&val, optval, sizeof(val)))
3016 return -EFAULT;
3017 switch (val) {
3018 case TPACKET_V1:
3019 case TPACKET_V2:
f6fb8f10 3020 case TPACKET_V3:
bbd6ef87
PM
3021 po->tp_version = val;
3022 return 0;
3023 default:
3024 return -EINVAL;
3025 }
3026 }
8913336a
PM
3027 case PACKET_RESERVE:
3028 {
3029 unsigned int val;
3030
3031 if (optlen != sizeof(val))
3032 return -EINVAL;
69e3c75f 3033 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3034 return -EBUSY;
3035 if (copy_from_user(&val, optval, sizeof(val)))
3036 return -EFAULT;
3037 po->tp_reserve = val;
3038 return 0;
3039 }
69e3c75f
JB
3040 case PACKET_LOSS:
3041 {
3042 unsigned int val;
3043
3044 if (optlen != sizeof(val))
3045 return -EINVAL;
3046 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3047 return -EBUSY;
3048 if (copy_from_user(&val, optval, sizeof(val)))
3049 return -EFAULT;
3050 po->tp_loss = !!val;
3051 return 0;
3052 }
8dc41944
HX
3053 case PACKET_AUXDATA:
3054 {
3055 int val;
3056
3057 if (optlen < sizeof(val))
3058 return -EINVAL;
3059 if (copy_from_user(&val, optval, sizeof(val)))
3060 return -EFAULT;
3061
3062 po->auxdata = !!val;
3063 return 0;
3064 }
80feaacb
PWJ
3065 case PACKET_ORIGDEV:
3066 {
3067 int val;
3068
3069 if (optlen < sizeof(val))
3070 return -EINVAL;
3071 if (copy_from_user(&val, optval, sizeof(val)))
3072 return -EFAULT;
3073
3074 po->origdev = !!val;
3075 return 0;
3076 }
bfd5f4a3
SS
3077 case PACKET_VNET_HDR:
3078 {
3079 int val;
3080
3081 if (sock->type != SOCK_RAW)
3082 return -EINVAL;
3083 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3084 return -EBUSY;
3085 if (optlen < sizeof(val))
3086 return -EINVAL;
3087 if (copy_from_user(&val, optval, sizeof(val)))
3088 return -EFAULT;
3089
3090 po->has_vnet_hdr = !!val;
3091 return 0;
3092 }
614f60fa
SM
3093 case PACKET_TIMESTAMP:
3094 {
3095 int val;
3096
3097 if (optlen != sizeof(val))
3098 return -EINVAL;
3099 if (copy_from_user(&val, optval, sizeof(val)))
3100 return -EFAULT;
3101
3102 po->tp_tstamp = val;
3103 return 0;
3104 }
dc99f600
DM
3105 case PACKET_FANOUT:
3106 {
3107 int val;
3108
3109 if (optlen != sizeof(val))
3110 return -EINVAL;
3111 if (copy_from_user(&val, optval, sizeof(val)))
3112 return -EFAULT;
3113
3114 return fanout_add(sk, val & 0xffff, val >> 16);
3115 }
1da177e4
LT
3116 default:
3117 return -ENOPROTOOPT;
3118 }
3119}
3120
3121static int packet_getsockopt(struct socket *sock, int level, int optname,
3122 char __user *optval, int __user *optlen)
3123{
3124 int len;
c06fff6e 3125 int val, lv = sizeof(val);
1da177e4
LT
3126 struct sock *sk = sock->sk;
3127 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3128 void *data = &val;
8dc41944 3129 struct tpacket_stats st;
f6fb8f10 3130 union tpacket_stats_u st_u;
1da177e4
LT
3131
3132 if (level != SOL_PACKET)
3133 return -ENOPROTOOPT;
3134
8ae55f04
KK
3135 if (get_user(len, optlen))
3136 return -EFAULT;
1da177e4
LT
3137
3138 if (len < 0)
3139 return -EINVAL;
1ce4f28b 3140
69e3c75f 3141 switch (optname) {
1da177e4 3142 case PACKET_STATISTICS:
1da177e4 3143 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3144 if (po->tp_version == TPACKET_V3) {
c06fff6e 3145 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3146 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3147 sizeof(struct tpacket_stats));
f6fb8f10 3148 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3149 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3150 st_u.stats3.tp_packets += po->stats.tp_drops;
3151 data = &st_u.stats3;
3152 } else {
c06fff6e 3153 lv = sizeof(struct tpacket_stats);
f6fb8f10 3154 st = po->stats;
3155 st.tp_packets += st.tp_drops;
3156 data = &st;
3157 }
1da177e4
LT
3158 memset(&po->stats, 0, sizeof(st));
3159 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3160 break;
3161 case PACKET_AUXDATA:
8dc41944 3162 val = po->auxdata;
80feaacb
PWJ
3163 break;
3164 case PACKET_ORIGDEV:
80feaacb 3165 val = po->origdev;
bfd5f4a3
SS
3166 break;
3167 case PACKET_VNET_HDR:
bfd5f4a3 3168 val = po->has_vnet_hdr;
1da177e4 3169 break;
bbd6ef87 3170 case PACKET_VERSION:
bbd6ef87 3171 val = po->tp_version;
bbd6ef87
PM
3172 break;
3173 case PACKET_HDRLEN:
3174 if (len > sizeof(int))
3175 len = sizeof(int);
3176 if (copy_from_user(&val, optval, len))
3177 return -EFAULT;
3178 switch (val) {
3179 case TPACKET_V1:
3180 val = sizeof(struct tpacket_hdr);
3181 break;
3182 case TPACKET_V2:
3183 val = sizeof(struct tpacket2_hdr);
3184 break;
f6fb8f10 3185 case TPACKET_V3:
3186 val = sizeof(struct tpacket3_hdr);
3187 break;
bbd6ef87
PM
3188 default:
3189 return -EINVAL;
3190 }
bbd6ef87 3191 break;
8913336a 3192 case PACKET_RESERVE:
8913336a 3193 val = po->tp_reserve;
8913336a 3194 break;
69e3c75f 3195 case PACKET_LOSS:
69e3c75f 3196 val = po->tp_loss;
69e3c75f 3197 break;
614f60fa 3198 case PACKET_TIMESTAMP:
614f60fa 3199 val = po->tp_tstamp;
614f60fa 3200 break;
dc99f600 3201 case PACKET_FANOUT:
dc99f600
DM
3202 val = (po->fanout ?
3203 ((u32)po->fanout->id |
3204 ((u32)po->fanout->type << 16)) :
3205 0);
dc99f600 3206 break;
1da177e4
LT
3207 default:
3208 return -ENOPROTOOPT;
3209 }
3210
c06fff6e
ED
3211 if (len > lv)
3212 len = lv;
8ae55f04
KK
3213 if (put_user(len, optlen))
3214 return -EFAULT;
8dc41944
HX
3215 if (copy_to_user(optval, data, len))
3216 return -EFAULT;
8ae55f04 3217 return 0;
1da177e4
LT
3218}
3219
3220
3221static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3222{
3223 struct sock *sk;
3224 struct hlist_node *node;
ad930650 3225 struct net_device *dev = data;
c346dca1 3226 struct net *net = dev_net(dev);
1da177e4 3227
808f5114 3228 rcu_read_lock();
3229 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3230 struct packet_sock *po = pkt_sk(sk);
3231
3232 switch (msg) {
3233 case NETDEV_UNREGISTER:
1da177e4
LT
3234 if (po->mclist)
3235 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3236 /* fallthrough */
3237
1da177e4
LT
3238 case NETDEV_DOWN:
3239 if (dev->ifindex == po->ifindex) {
3240 spin_lock(&po->bind_lock);
3241 if (po->running) {
ce06b03e 3242 __unregister_prot_hook(sk, false);
1da177e4
LT
3243 sk->sk_err = ENETDOWN;
3244 if (!sock_flag(sk, SOCK_DEAD))
3245 sk->sk_error_report(sk);
3246 }
3247 if (msg == NETDEV_UNREGISTER) {
3248 po->ifindex = -1;
160ff18a
BG
3249 if (po->prot_hook.dev)
3250 dev_put(po->prot_hook.dev);
1da177e4
LT
3251 po->prot_hook.dev = NULL;
3252 }
3253 spin_unlock(&po->bind_lock);
3254 }
3255 break;
3256 case NETDEV_UP:
808f5114 3257 if (dev->ifindex == po->ifindex) {
3258 spin_lock(&po->bind_lock);
ce06b03e
DM
3259 if (po->num)
3260 register_prot_hook(sk);
808f5114 3261 spin_unlock(&po->bind_lock);
1da177e4 3262 }
1da177e4
LT
3263 break;
3264 }
3265 }
808f5114 3266 rcu_read_unlock();
1da177e4
LT
3267 return NOTIFY_DONE;
3268}
3269
3270
3271static int packet_ioctl(struct socket *sock, unsigned int cmd,
3272 unsigned long arg)
3273{
3274 struct sock *sk = sock->sk;
3275
69e3c75f 3276 switch (cmd) {
40d4e3df
ED
3277 case SIOCOUTQ:
3278 {
3279 int amount = sk_wmem_alloc_get(sk);
31e6d363 3280
40d4e3df
ED
3281 return put_user(amount, (int __user *)arg);
3282 }
3283 case SIOCINQ:
3284 {
3285 struct sk_buff *skb;
3286 int amount = 0;
3287
3288 spin_lock_bh(&sk->sk_receive_queue.lock);
3289 skb = skb_peek(&sk->sk_receive_queue);
3290 if (skb)
3291 amount = skb->len;
3292 spin_unlock_bh(&sk->sk_receive_queue.lock);
3293 return put_user(amount, (int __user *)arg);
3294 }
3295 case SIOCGSTAMP:
3296 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3297 case SIOCGSTAMPNS:
3298 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3299
1da177e4 3300#ifdef CONFIG_INET
40d4e3df
ED
3301 case SIOCADDRT:
3302 case SIOCDELRT:
3303 case SIOCDARP:
3304 case SIOCGARP:
3305 case SIOCSARP:
3306 case SIOCGIFADDR:
3307 case SIOCSIFADDR:
3308 case SIOCGIFBRDADDR:
3309 case SIOCSIFBRDADDR:
3310 case SIOCGIFNETMASK:
3311 case SIOCSIFNETMASK:
3312 case SIOCGIFDSTADDR:
3313 case SIOCSIFDSTADDR:
3314 case SIOCSIFFLAGS:
40d4e3df 3315 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3316#endif
3317
40d4e3df
ED
3318 default:
3319 return -ENOIOCTLCMD;
1da177e4
LT
3320 }
3321 return 0;
3322}
3323
40d4e3df 3324static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3325 poll_table *wait)
3326{
3327 struct sock *sk = sock->sk;
3328 struct packet_sock *po = pkt_sk(sk);
3329 unsigned int mask = datagram_poll(file, sock, wait);
3330
3331 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3332 if (po->rx_ring.pg_vec) {
f6fb8f10 3333 if (!packet_previous_rx_frame(po, &po->rx_ring,
3334 TP_STATUS_KERNEL))
1da177e4
LT
3335 mask |= POLLIN | POLLRDNORM;
3336 }
3337 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3338 spin_lock_bh(&sk->sk_write_queue.lock);
3339 if (po->tx_ring.pg_vec) {
3340 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3341 mask |= POLLOUT | POLLWRNORM;
3342 }
3343 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3344 return mask;
3345}
3346
3347
3348/* Dirty? Well, I still did not learn better way to account
3349 * for user mmaps.
3350 */
3351
3352static void packet_mm_open(struct vm_area_struct *vma)
3353{
3354 struct file *file = vma->vm_file;
40d4e3df 3355 struct socket *sock = file->private_data;
1da177e4 3356 struct sock *sk = sock->sk;
1ce4f28b 3357
1da177e4
LT
3358 if (sk)
3359 atomic_inc(&pkt_sk(sk)->mapped);
3360}
3361
3362static void packet_mm_close(struct vm_area_struct *vma)
3363{
3364 struct file *file = vma->vm_file;
40d4e3df 3365 struct socket *sock = file->private_data;
1da177e4 3366 struct sock *sk = sock->sk;
1ce4f28b 3367
1da177e4
LT
3368 if (sk)
3369 atomic_dec(&pkt_sk(sk)->mapped);
3370}
3371
f0f37e2f 3372static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3373 .open = packet_mm_open,
3374 .close = packet_mm_close,
1da177e4
LT
3375};
3376
0e3125c7
NH
3377static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3378 unsigned int len)
1da177e4
LT
3379{
3380 int i;
3381
4ebf0ae2 3382 for (i = 0; i < len; i++) {
0e3125c7 3383 if (likely(pg_vec[i].buffer)) {
c56b4d90 3384 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3385 vfree(pg_vec[i].buffer);
3386 else
3387 free_pages((unsigned long)pg_vec[i].buffer,
3388 order);
3389 pg_vec[i].buffer = NULL;
3390 }
1da177e4
LT
3391 }
3392 kfree(pg_vec);
3393}
3394
eea49cc9 3395static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3396{
0e3125c7
NH
3397 char *buffer = NULL;
3398 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3399 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3400
3401 buffer = (char *) __get_free_pages(gfp_flags, order);
3402
3403 if (buffer)
3404 return buffer;
3405
3406 /*
3407 * __get_free_pages failed, fall back to vmalloc
3408 */
bbce5a59 3409 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3410
0e3125c7
NH
3411 if (buffer)
3412 return buffer;
3413
3414 /*
3415 * vmalloc failed, lets dig into swap here
3416 */
0e3125c7
NH
3417 gfp_flags &= ~__GFP_NORETRY;
3418 buffer = (char *)__get_free_pages(gfp_flags, order);
3419 if (buffer)
3420 return buffer;
3421
3422 /*
3423 * complete and utter failure
3424 */
3425 return NULL;
4ebf0ae2
DM
3426}
3427
0e3125c7 3428static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3429{
3430 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3431 struct pgv *pg_vec;
4ebf0ae2
DM
3432 int i;
3433
0e3125c7 3434 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3435 if (unlikely(!pg_vec))
3436 goto out;
3437
3438 for (i = 0; i < block_nr; i++) {
c56b4d90 3439 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3440 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3441 goto out_free_pgvec;
3442 }
3443
3444out:
3445 return pg_vec;
3446
3447out_free_pgvec:
3448 free_pg_vec(pg_vec, order, block_nr);
3449 pg_vec = NULL;
3450 goto out;
3451}
1da177e4 3452
f6fb8f10 3453static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3454 int closing, int tx_ring)
1da177e4 3455{
0e3125c7 3456 struct pgv *pg_vec = NULL;
1da177e4 3457 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3458 int was_running, order = 0;
69e3c75f
JB
3459 struct packet_ring_buffer *rb;
3460 struct sk_buff_head *rb_queue;
0e11c91e 3461 __be16 num;
f6fb8f10 3462 int err = -EINVAL;
3463 /* Added to avoid minimal code churn */
3464 struct tpacket_req *req = &req_u->req;
3465
3466 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3467 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3468 WARN(1, "Tx-ring is not supported.\n");
3469 goto out;
3470 }
1ce4f28b 3471
69e3c75f
JB
3472 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3473 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3474
69e3c75f
JB
3475 err = -EBUSY;
3476 if (!closing) {
3477 if (atomic_read(&po->mapped))
3478 goto out;
3479 if (atomic_read(&rb->pending))
3480 goto out;
3481 }
1da177e4 3482
69e3c75f
JB
3483 if (req->tp_block_nr) {
3484 /* Sanity tests and some calculations */
3485 err = -EBUSY;
3486 if (unlikely(rb->pg_vec))
3487 goto out;
1da177e4 3488
bbd6ef87
PM
3489 switch (po->tp_version) {
3490 case TPACKET_V1:
3491 po->tp_hdrlen = TPACKET_HDRLEN;
3492 break;
3493 case TPACKET_V2:
3494 po->tp_hdrlen = TPACKET2_HDRLEN;
3495 break;
f6fb8f10 3496 case TPACKET_V3:
3497 po->tp_hdrlen = TPACKET3_HDRLEN;
3498 break;
bbd6ef87
PM
3499 }
3500
69e3c75f 3501 err = -EINVAL;
4ebf0ae2 3502 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3503 goto out;
4ebf0ae2 3504 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3505 goto out;
8913336a 3506 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3507 po->tp_reserve))
3508 goto out;
4ebf0ae2 3509 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3510 goto out;
1da177e4 3511
69e3c75f
JB
3512 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3513 if (unlikely(rb->frames_per_block <= 0))
3514 goto out;
3515 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3516 req->tp_frame_nr))
3517 goto out;
1da177e4
LT
3518
3519 err = -ENOMEM;
4ebf0ae2
DM
3520 order = get_order(req->tp_block_size);
3521 pg_vec = alloc_pg_vec(req, order);
3522 if (unlikely(!pg_vec))
1da177e4 3523 goto out;
f6fb8f10 3524 switch (po->tp_version) {
3525 case TPACKET_V3:
3526 /* Transmit path is not supported. We checked
3527 * it above but just being paranoid
3528 */
3529 if (!tx_ring)
3530 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3531 break;
3532 default:
3533 break;
3534 }
69e3c75f
JB
3535 }
3536 /* Done */
3537 else {
3538 err = -EINVAL;
4ebf0ae2 3539 if (unlikely(req->tp_frame_nr))
69e3c75f 3540 goto out;
1da177e4
LT
3541 }
3542
3543 lock_sock(sk);
3544
3545 /* Detach socket from network */
3546 spin_lock(&po->bind_lock);
3547 was_running = po->running;
3548 num = po->num;
3549 if (was_running) {
1da177e4 3550 po->num = 0;
ce06b03e 3551 __unregister_prot_hook(sk, false);
1da177e4
LT
3552 }
3553 spin_unlock(&po->bind_lock);
1ce4f28b 3554
1da177e4
LT
3555 synchronize_net();
3556
3557 err = -EBUSY;
905db440 3558 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3559 if (closing || atomic_read(&po->mapped) == 0) {
3560 err = 0;
69e3c75f 3561 spin_lock_bh(&rb_queue->lock);
c053fd96 3562 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3563 rb->frame_max = (req->tp_frame_nr - 1);
3564 rb->head = 0;
3565 rb->frame_size = req->tp_frame_size;
3566 spin_unlock_bh(&rb_queue->lock);
3567
c053fd96
CG
3568 swap(rb->pg_vec_order, order);
3569 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3570
3571 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3572 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3573 tpacket_rcv : packet_rcv;
3574 skb_queue_purge(rb_queue);
1da177e4 3575 if (atomic_read(&po->mapped))
40d4e3df
ED
3576 pr_err("packet_mmap: vma is busy: %d\n",
3577 atomic_read(&po->mapped));
1da177e4 3578 }
905db440 3579 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3580
3581 spin_lock(&po->bind_lock);
ce06b03e 3582 if (was_running) {
1da177e4 3583 po->num = num;
ce06b03e 3584 register_prot_hook(sk);
1da177e4
LT
3585 }
3586 spin_unlock(&po->bind_lock);
f6fb8f10 3587 if (closing && (po->tp_version > TPACKET_V2)) {
3588 /* Because we don't support block-based V3 on tx-ring */
3589 if (!tx_ring)
3590 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3591 }
1da177e4
LT
3592 release_sock(sk);
3593
1da177e4
LT
3594 if (pg_vec)
3595 free_pg_vec(pg_vec, order, req->tp_block_nr);
3596out:
3597 return err;
3598}
3599
69e3c75f
JB
3600static int packet_mmap(struct file *file, struct socket *sock,
3601 struct vm_area_struct *vma)
1da177e4
LT
3602{
3603 struct sock *sk = sock->sk;
3604 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3605 unsigned long size, expected_size;
3606 struct packet_ring_buffer *rb;
1da177e4
LT
3607 unsigned long start;
3608 int err = -EINVAL;
3609 int i;
3610
3611 if (vma->vm_pgoff)
3612 return -EINVAL;
3613
905db440 3614 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3615
3616 expected_size = 0;
3617 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3618 if (rb->pg_vec) {
3619 expected_size += rb->pg_vec_len
3620 * rb->pg_vec_pages
3621 * PAGE_SIZE;
3622 }
3623 }
3624
3625 if (expected_size == 0)
1da177e4 3626 goto out;
69e3c75f
JB
3627
3628 size = vma->vm_end - vma->vm_start;
3629 if (size != expected_size)
1da177e4
LT
3630 goto out;
3631
1da177e4 3632 start = vma->vm_start;
69e3c75f
JB
3633 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3634 if (rb->pg_vec == NULL)
3635 continue;
3636
3637 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3638 struct page *page;
3639 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3640 int pg_num;
3641
c56b4d90
CG
3642 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3643 page = pgv_to_page(kaddr);
69e3c75f
JB
3644 err = vm_insert_page(vma, start, page);
3645 if (unlikely(err))
3646 goto out;
3647 start += PAGE_SIZE;
0e3125c7 3648 kaddr += PAGE_SIZE;
69e3c75f 3649 }
4ebf0ae2 3650 }
1da177e4 3651 }
69e3c75f 3652
4ebf0ae2 3653 atomic_inc(&po->mapped);
1da177e4
LT
3654 vma->vm_ops = &packet_mmap_ops;
3655 err = 0;
3656
3657out:
905db440 3658 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3659 return err;
3660}
1da177e4 3661
90ddc4f0 3662static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3663 .family = PF_PACKET,
3664 .owner = THIS_MODULE,
3665 .release = packet_release,
3666 .bind = packet_bind_spkt,
3667 .connect = sock_no_connect,
3668 .socketpair = sock_no_socketpair,
3669 .accept = sock_no_accept,
3670 .getname = packet_getname_spkt,
3671 .poll = datagram_poll,
3672 .ioctl = packet_ioctl,
3673 .listen = sock_no_listen,
3674 .shutdown = sock_no_shutdown,
3675 .setsockopt = sock_no_setsockopt,
3676 .getsockopt = sock_no_getsockopt,
3677 .sendmsg = packet_sendmsg_spkt,
3678 .recvmsg = packet_recvmsg,
3679 .mmap = sock_no_mmap,
3680 .sendpage = sock_no_sendpage,
3681};
1da177e4 3682
90ddc4f0 3683static const struct proto_ops packet_ops = {
1da177e4
LT
3684 .family = PF_PACKET,
3685 .owner = THIS_MODULE,
3686 .release = packet_release,
3687 .bind = packet_bind,
3688 .connect = sock_no_connect,
3689 .socketpair = sock_no_socketpair,
3690 .accept = sock_no_accept,
1ce4f28b 3691 .getname = packet_getname,
1da177e4
LT
3692 .poll = packet_poll,
3693 .ioctl = packet_ioctl,
3694 .listen = sock_no_listen,
3695 .shutdown = sock_no_shutdown,
3696 .setsockopt = packet_setsockopt,
3697 .getsockopt = packet_getsockopt,
3698 .sendmsg = packet_sendmsg,
3699 .recvmsg = packet_recvmsg,
3700 .mmap = packet_mmap,
3701 .sendpage = sock_no_sendpage,
3702};
3703
ec1b4cf7 3704static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3705 .family = PF_PACKET,
3706 .create = packet_create,
3707 .owner = THIS_MODULE,
3708};
3709
3710static struct notifier_block packet_netdev_notifier = {
40d4e3df 3711 .notifier_call = packet_notifier,
1da177e4
LT
3712};
3713
3714#ifdef CONFIG_PROC_FS
1da177e4
LT
3715
3716static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3717 __acquires(RCU)
1da177e4 3718{
e372c414 3719 struct net *net = seq_file_net(seq);
808f5114 3720
3721 rcu_read_lock();
3722 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3723}
3724
3725static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3726{
1bf40954 3727 struct net *net = seq_file_net(seq);
808f5114 3728 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3729}
3730
3731static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3732 __releases(RCU)
1da177e4 3733{
808f5114 3734 rcu_read_unlock();
1da177e4
LT
3735}
3736
1ce4f28b 3737static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3738{
3739 if (v == SEQ_START_TOKEN)
3740 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3741 else {
b7ceabd9 3742 struct sock *s = sk_entry(v);
1da177e4
LT
3743 const struct packet_sock *po = pkt_sk(s);
3744
3745 seq_printf(seq,
71338aa7 3746 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3747 s,
3748 atomic_read(&s->sk_refcnt),
3749 s->sk_type,
3750 ntohs(po->num),
3751 po->ifindex,
3752 po->running,
3753 atomic_read(&s->sk_rmem_alloc),
3754 sock_i_uid(s),
40d4e3df 3755 sock_i_ino(s));
1da177e4
LT
3756 }
3757
3758 return 0;
3759}
3760
56b3d975 3761static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3762 .start = packet_seq_start,
3763 .next = packet_seq_next,
3764 .stop = packet_seq_stop,
3765 .show = packet_seq_show,
3766};
3767
3768static int packet_seq_open(struct inode *inode, struct file *file)
3769{
e372c414
DL
3770 return seq_open_net(inode, file, &packet_seq_ops,
3771 sizeof(struct seq_net_private));
1da177e4
LT
3772}
3773
da7071d7 3774static const struct file_operations packet_seq_fops = {
1da177e4
LT
3775 .owner = THIS_MODULE,
3776 .open = packet_seq_open,
3777 .read = seq_read,
3778 .llseek = seq_lseek,
e372c414 3779 .release = seq_release_net,
1da177e4
LT
3780};
3781
3782#endif
3783
2c8c1e72 3784static int __net_init packet_net_init(struct net *net)
d12d01d6 3785{
808f5114 3786 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3787 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3788
3789 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3790 return -ENOMEM;
3791
3792 return 0;
3793}
3794
2c8c1e72 3795static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3796{
3797 proc_net_remove(net, "packet");
3798}
3799
3800static struct pernet_operations packet_net_ops = {
3801 .init = packet_net_init,
3802 .exit = packet_net_exit,
3803};
3804
3805
1da177e4
LT
3806static void __exit packet_exit(void)
3807{
1da177e4 3808 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3809 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3810 sock_unregister(PF_PACKET);
3811 proto_unregister(&packet_proto);
3812}
3813
3814static int __init packet_init(void)
3815{
3816 int rc = proto_register(&packet_proto, 0);
3817
3818 if (rc != 0)
3819 goto out;
3820
3821 sock_register(&packet_family_ops);
d12d01d6 3822 register_pernet_subsys(&packet_net_ops);
1da177e4 3823 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3824out:
3825 return rc;
3826}
3827
3828module_init(packet_init);
3829module_exit(packet_exit);
3830MODULE_LICENSE("GPL");
3831MODULE_ALIAS_NETPROTO(PF_PACKET);
This page took 1.355956 seconds and 5 git commands to generate.