1 /*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
6 *
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
22 */
23
24 #include <linux/module.h>
25 #include <linux/types.h>
26 #include <linux/mm.h>
27 #include <linux/fcntl.h>
28 #include <linux/socket.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_packet.h>
33 #include <linux/gfp.h>
34 #include <net/ip.h>
35 #include <net/protocol.h>
36 #include <net/netlink.h>
37 #include <linux/skbuff.h>
38 #include <net/sock.h>
39 #include <linux/errno.h>
40 #include <linux/timer.h>
41 #include <asm/uaccess.h>
42 #include <asm/unaligned.h>
43 #include <linux/filter.h>
44 #include <linux/ratelimit.h>
45 #include <linux/seccomp.h>
46 #include <linux/if_vlan.h>
47
48 /* No hurry in this branch
49 *
50 * Exported for the bpf jit load helper.
51 */
52 void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
53 {
54 u8 *ptr = NULL;
55
56 if (k >= SKF_NET_OFF)
57 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
58 else if (k >= SKF_LL_OFF)
59 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
60
61 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
62 return ptr;
63 return NULL;
64 }
65
66 static inline void *load_pointer(const struct sk_buff *skb, int k,
67 unsigned int size, void *buffer)
68 {
69 if (k >= 0)
70 return skb_header_pointer(skb, k, size, buffer);
71 return bpf_internal_load_pointer_neg_helper(skb, k, size);
72 }
73
74 /**
75 * sk_filter - run a packet through a socket filter
76 * @sk: sock associated with &sk_buff
77 * @skb: buffer to filter
78 *
79 * Run the filter code and then cut skb->data to the correct size returned by
80 * sk_run_filter. If pkt_len is 0 we toss the packet. If skb->len is smaller
81 * than pkt_len we keep the whole skb->data. This is the socket-level
82 * wrapper to sk_run_filter. It returns 0 if the packet should
83 * be accepted or -EPERM if the packet should be tossed.
84 *
85 */
86 int sk_filter(struct sock *sk, struct sk_buff *skb)
87 {
88 int err;
89 struct sk_filter *filter;
90
91 /*
92 * If the skb was allocated from pfmemalloc reserves, only
93 * allow SOCK_MEMALLOC sockets to use it as this socket is
94 * helping free memory
95 */
96 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
97 return -ENOMEM;
98
99 err = security_sock_rcv_skb(sk, skb);
100 if (err)
101 return err;
102
103 rcu_read_lock();
104 filter = rcu_dereference(sk->sk_filter);
105 if (filter) {
106 unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
107
108 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
109 }
110 rcu_read_unlock();
111
112 return err;
113 }
114 EXPORT_SYMBOL(sk_filter);
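/* Illustrative sketch only (added, not part of the original file): a protocol
 * receive path typically consumes sk_filter() like this; my_proto_rcv is a
 * hypothetical callback name, the drop-on-nonzero-return pattern mirrors how
 * in-tree callers use it.
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sk_filter(sk, skb)) {
 *			kfree_skb(skb);	// rejected: filter, LSM hook or pfmemalloc
 *			return 0;
 *		}
 *		// otherwise skb may have been trimmed to pkt_len; queue it for the socket
 *		...
 *	}
 */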
115
116 /* Base function for offset calculation. Needs to go into .text section,
117 * therefore keeping it non-static as well; will also be used by JITs
118 * anyway later on, so do not let the compiler omit it.
119 */
120 noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
121 {
122 return 0;
123 }
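/* Illustrative note (added): call targets in the internal insns are not
 * absolute pointers but 32-bit deltas relative to __bpf_call_base. The
 * converter below emits, for example,
 *
 *	insn->imm = __get_raw_cpu_id - __bpf_call_base;
 *
 * and JMP_CALL_0 in the interpreter undoes it with
 *
 *	R0 = (__bpf_call_base + insn->imm)(regs[1], regs[2], regs[3],
 *					   regs[4], regs[5]);
 *
 * which is why this function must never be optimized away.
 */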
124
125 /* Register mappings for user programs. */
126 #define A_REG 0
127 #define X_REG 7
128 #define TMP_REG 8
129 #define ARG2_REG 2
130 #define ARG3_REG 3
131
132 /**
133 * __sk_run_filter - run a filter on a given context
134 * @ctx: buffer to run the filter on
135 * @insn: filter to apply
136 *
137 * Decode and apply filter instructions to the skb->data. Return length to
138 * keep, 0 for none. @ctx is the data we are operating on, @insn is the
139 * array of filter instructions.
140 */
141 unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
142 {
143 u64 stack[MAX_BPF_STACK / sizeof(u64)];
144 u64 regs[MAX_BPF_REG], tmp;
145 void *ptr;
146 int off;
147
148 #define K insn->imm
149 #define A regs[insn->a_reg]
150 #define X regs[insn->x_reg]
151 #define R0 regs[0]
152
153 #define CONT ({insn++; goto select_insn; })
154 #define CONT_JMP ({insn++; goto select_insn; })
155
156 static const void *jumptable[256] = {
157 [0 ... 255] = &&default_label,
158 /* Now overwrite non-defaults ... */
159 #define DL(A, B, C) [BPF_##A|BPF_##B|BPF_##C] = &&A##_##B##_##C
160 DL(ALU, ADD, X),
161 DL(ALU, ADD, K),
162 DL(ALU, SUB, X),
163 DL(ALU, SUB, K),
164 DL(ALU, AND, X),
165 DL(ALU, AND, K),
166 DL(ALU, OR, X),
167 DL(ALU, OR, K),
168 DL(ALU, LSH, X),
169 DL(ALU, LSH, K),
170 DL(ALU, RSH, X),
171 DL(ALU, RSH, K),
172 DL(ALU, XOR, X),
173 DL(ALU, XOR, K),
174 DL(ALU, MUL, X),
175 DL(ALU, MUL, K),
176 DL(ALU, MOV, X),
177 DL(ALU, MOV, K),
178 DL(ALU, DIV, X),
179 DL(ALU, DIV, K),
180 DL(ALU, MOD, X),
181 DL(ALU, MOD, K),
182 DL(ALU, NEG, 0),
183 DL(ALU, END, TO_BE),
184 DL(ALU, END, TO_LE),
185 DL(ALU64, ADD, X),
186 DL(ALU64, ADD, K),
187 DL(ALU64, SUB, X),
188 DL(ALU64, SUB, K),
189 DL(ALU64, AND, X),
190 DL(ALU64, AND, K),
191 DL(ALU64, OR, X),
192 DL(ALU64, OR, K),
193 DL(ALU64, LSH, X),
194 DL(ALU64, LSH, K),
195 DL(ALU64, RSH, X),
196 DL(ALU64, RSH, K),
197 DL(ALU64, XOR, X),
198 DL(ALU64, XOR, K),
199 DL(ALU64, MUL, X),
200 DL(ALU64, MUL, K),
201 DL(ALU64, MOV, X),
202 DL(ALU64, MOV, K),
203 DL(ALU64, ARSH, X),
204 DL(ALU64, ARSH, K),
205 DL(ALU64, DIV, X),
206 DL(ALU64, DIV, K),
207 DL(ALU64, MOD, X),
208 DL(ALU64, MOD, K),
209 DL(ALU64, NEG, 0),
210 DL(JMP, CALL, 0),
211 DL(JMP, JA, 0),
212 DL(JMP, JEQ, X),
213 DL(JMP, JEQ, K),
214 DL(JMP, JNE, X),
215 DL(JMP, JNE, K),
216 DL(JMP, JGT, X),
217 DL(JMP, JGT, K),
218 DL(JMP, JGE, X),
219 DL(JMP, JGE, K),
220 DL(JMP, JSGT, X),
221 DL(JMP, JSGT, K),
222 DL(JMP, JSGE, X),
223 DL(JMP, JSGE, K),
224 DL(JMP, JSET, X),
225 DL(JMP, JSET, K),
226 DL(JMP, EXIT, 0),
227 DL(STX, MEM, B),
228 DL(STX, MEM, H),
229 DL(STX, MEM, W),
230 DL(STX, MEM, DW),
231 DL(STX, XADD, W),
232 DL(STX, XADD, DW),
233 DL(ST, MEM, B),
234 DL(ST, MEM, H),
235 DL(ST, MEM, W),
236 DL(ST, MEM, DW),
237 DL(LDX, MEM, B),
238 DL(LDX, MEM, H),
239 DL(LDX, MEM, W),
240 DL(LDX, MEM, DW),
241 DL(LD, ABS, W),
242 DL(LD, ABS, H),
243 DL(LD, ABS, B),
244 DL(LD, IND, W),
245 DL(LD, IND, H),
246 DL(LD, IND, B),
247 #undef DL
248 };
249
250 regs[FP_REG] = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
251 regs[ARG1_REG] = (u64) (unsigned long) ctx;
252 regs[A_REG] = 0;
253 regs[X_REG] = 0;
254
255 select_insn:
256 goto *jumptable[insn->code];
257
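/* Added explanatory note: this is direct-threaded dispatch. Every handler
 * below ends in CONT, i.e. "insn++; goto select_insn;", so an opcode such
 * as BPF_ALU64 | BPF_ADD | BPF_X jumps straight to the ALU64_ADD_X label
 * through the jumptable, with no central switch statement in the hot loop.
 */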
258 /* ALU */
259 #define ALU(OPCODE, OP) \
260 ALU64_##OPCODE##_X: \
261 A = A OP X; \
262 CONT; \
263 ALU_##OPCODE##_X: \
264 A = (u32) A OP (u32) X; \
265 CONT; \
266 ALU64_##OPCODE##_K: \
267 A = A OP K; \
268 CONT; \
269 ALU_##OPCODE##_K: \
270 A = (u32) A OP (u32) K; \
271 CONT;
272
273 ALU(ADD, +)
274 ALU(SUB, -)
275 ALU(AND, &)
276 ALU(OR, |)
277 ALU(LSH, <<)
278 ALU(RSH, >>)
279 ALU(XOR, ^)
280 ALU(MUL, *)
281 #undef ALU
282 ALU_NEG_0:
283 A = (u32) -A;
284 CONT;
285 ALU64_NEG_0:
286 A = -A;
287 CONT;
288 ALU_MOV_X:
289 A = (u32) X;
290 CONT;
291 ALU_MOV_K:
292 A = (u32) K;
293 CONT;
294 ALU64_MOV_X:
295 A = X;
296 CONT;
297 ALU64_MOV_K:
298 A = K;
299 CONT;
300 ALU64_ARSH_X:
301 (*(s64 *) &A) >>= X;
302 CONT;
303 ALU64_ARSH_K:
304 (*(s64 *) &A) >>= K;
305 CONT;
306 ALU64_MOD_X:
307 if (unlikely(X == 0))
308 return 0;
309 tmp = A;
310 A = do_div(tmp, X);
311 CONT;
312 ALU_MOD_X:
313 if (unlikely(X == 0))
314 return 0;
315 tmp = (u32) A;
316 A = do_div(tmp, (u32) X);
317 CONT;
318 ALU64_MOD_K:
319 tmp = A;
320 A = do_div(tmp, K);
321 CONT;
322 ALU_MOD_K:
323 tmp = (u32) A;
324 A = do_div(tmp, (u32) K);
325 CONT;
326 ALU64_DIV_X:
327 if (unlikely(X == 0))
328 return 0;
329 do_div(A, X);
330 CONT;
331 ALU_DIV_X:
332 if (unlikely(X == 0))
333 return 0;
334 tmp = (u32) A;
335 do_div(tmp, (u32) X);
336 A = (u32) tmp;
337 CONT;
338 ALU64_DIV_K:
339 do_div(A, K);
340 CONT;
341 ALU_DIV_K:
342 tmp = (u32) A;
343 do_div(tmp, (u32) K);
344 A = (u32) tmp;
345 CONT;
346 ALU_END_TO_BE:
347 switch (K) {
348 case 16:
349 A = (__force u16) cpu_to_be16(A);
350 break;
351 case 32:
352 A = (__force u32) cpu_to_be32(A);
353 break;
354 case 64:
355 A = (__force u64) cpu_to_be64(A);
356 break;
357 }
358 CONT;
359 ALU_END_TO_LE:
360 switch (K) {
361 case 16:
362 A = (__force u16) cpu_to_le16(A);
363 break;
364 case 32:
365 A = (__force u32) cpu_to_le32(A);
366 break;
367 case 64:
368 A = (__force u64) cpu_to_le64(A);
369 break;
370 }
371 CONT;
372
373 /* CALL */
374 JMP_CALL_0:
375 /* Function call scratches R1-R5 registers, preserves R6-R9,
376 * and stores return value into R0.
377 */
378 R0 = (__bpf_call_base + insn->imm)(regs[1], regs[2], regs[3],
379 regs[4], regs[5]);
380 CONT;
381
382 /* JMP */
383 JMP_JA_0:
384 insn += insn->off;
385 CONT;
386 JMP_JEQ_X:
387 if (A == X) {
388 insn += insn->off;
389 CONT_JMP;
390 }
391 CONT;
392 JMP_JEQ_K:
393 if (A == K) {
394 insn += insn->off;
395 CONT_JMP;
396 }
397 CONT;
398 JMP_JNE_X:
399 if (A != X) {
400 insn += insn->off;
401 CONT_JMP;
402 }
403 CONT;
404 JMP_JNE_K:
405 if (A != K) {
406 insn += insn->off;
407 CONT_JMP;
408 }
409 CONT;
410 JMP_JGT_X:
411 if (A > X) {
412 insn += insn->off;
413 CONT_JMP;
414 }
415 CONT;
416 JMP_JGT_K:
417 if (A > K) {
418 insn += insn->off;
419 CONT_JMP;
420 }
421 CONT;
422 JMP_JGE_X:
423 if (A >= X) {
424 insn += insn->off;
425 CONT_JMP;
426 }
427 CONT;
428 JMP_JGE_K:
429 if (A >= K) {
430 insn += insn->off;
431 CONT_JMP;
432 }
433 CONT;
434 JMP_JSGT_X:
435 if (((s64) A) > ((s64) X)) {
436 insn += insn->off;
437 CONT_JMP;
438 }
439 CONT;
440 JMP_JSGT_K:
441 if (((s64) A) > ((s64) K)) {
442 insn += insn->off;
443 CONT_JMP;
444 }
445 CONT;
446 JMP_JSGE_X:
447 if (((s64) A) >= ((s64) X)) {
448 insn += insn->off;
449 CONT_JMP;
450 }
451 CONT;
452 JMP_JSGE_K:
453 if (((s64) A) >= ((s64) K)) {
454 insn += insn->off;
455 CONT_JMP;
456 }
457 CONT;
458 JMP_JSET_X:
459 if (A & X) {
460 insn += insn->off;
461 CONT_JMP;
462 }
463 CONT;
464 JMP_JSET_K:
465 if (A & K) {
466 insn += insn->off;
467 CONT_JMP;
468 }
469 CONT;
470 JMP_EXIT_0:
471 return R0;
472
473 /* STX and ST and LDX*/
474 #define LDST(SIZEOP, SIZE) \
475 STX_MEM_##SIZEOP: \
476 *(SIZE *)(unsigned long) (A + insn->off) = X; \
477 CONT; \
478 ST_MEM_##SIZEOP: \
479 *(SIZE *)(unsigned long) (A + insn->off) = K; \
480 CONT; \
481 LDX_MEM_##SIZEOP: \
482 A = *(SIZE *)(unsigned long) (X + insn->off); \
483 CONT;
484
485 LDST(B, u8)
486 LDST(H, u16)
487 LDST(W, u32)
488 LDST(DW, u64)
489 #undef LDST
490 STX_XADD_W: /* lock xadd *(u32 *)(A + insn->off) += X */
491 atomic_add((u32) X, (atomic_t *)(unsigned long)
492 (A + insn->off));
493 CONT;
494 STX_XADD_DW: /* lock xadd *(u64 *)(A + insn->off) += X */
495 atomic64_add((u64) X, (atomic64_t *)(unsigned long)
496 (A + insn->off));
497 CONT;
498 LD_ABS_W: /* R0 = ntohl(*(u32 *) (skb->data + K)) */
499 off = K;
500 load_word:
501 * BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns only appear
502 * in programs where ctx == skb. All programs keep 'ctx'
503 * in regs[CTX_REG] == R6; sk_convert_filter() saves it
504 * in R6, and the internal BPF verifier will check that
505 * R6 == ctx.
506 *
507 * BPF_ABS and BPF_IND are wrappers of function calls, so
508 * they scratch R1-R5 registers, preserve R6-R9, and store
509 * return value into R0.
510 *
511 * Implicit input:
512 * ctx
513 *
514 * Explicit input:
515 * X == any register
516 * K == 32-bit immediate
517 *
518 * Output:
519 * R0 - 8/16/32-bit skb data converted to cpu endianness
520 */
521 ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp);
522 if (likely(ptr != NULL)) {
523 R0 = get_unaligned_be32(ptr);
524 CONT;
525 }
526 return 0;
527 LD_ABS_H: /* R0 = ntohs(*(u16 *) (skb->data + K)) */
528 off = K;
529 load_half:
530 ptr = load_pointer((struct sk_buff *) ctx, off, 2, &tmp);
531 if (likely(ptr != NULL)) {
532 R0 = get_unaligned_be16(ptr);
533 CONT;
534 }
535 return 0;
536 LD_ABS_B: /* R0 = *(u8 *) (ctx + K) */
537 off = K;
538 load_byte:
539 ptr = load_pointer((struct sk_buff *) ctx, off, 1, &tmp);
540 if (likely(ptr != NULL)) {
541 R0 = *(u8 *)ptr;
542 CONT;
543 }
544 return 0;
545 LD_IND_W: /* R0 = ntohl(*(u32 *) (skb->data + X + K)) */
546 off = K + X;
547 goto load_word;
548 LD_IND_H: /* R0 = ntohs(*(u16 *) (skb->data + X + K)) */
549 off = K + X;
550 goto load_half;
551 LD_IND_B: /* R0 = *(u8 *) (skb->data + X + K) */
552 off = K + X;
553 goto load_byte;
554
555 default_label:
556 /* If we ever reach this, we have a bug somewhere. */
557 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
558 return 0;
559 #undef CONT_JMP
560 #undef CONT
561
562 #undef R0
563 #undef X
564 #undef A
565 #undef K
566 }
567
568 u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
569 const struct sock_filter_int *insni)
570 __attribute__ ((alias ("__sk_run_filter")));
571
572 u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
573 const struct sock_filter_int *insni)
574 __attribute__ ((alias ("__sk_run_filter")));
575 EXPORT_SYMBOL_GPL(sk_run_filter_int_skb);
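/* Illustrative only (hypothetical snippet, not part of the original file):
 * a minimal internal BPF program fed straight to the interpreter. It moves
 * the immediate 42 into R0 and exits, so the run returns 42 for any ctx:
 *
 *	static const struct sock_filter_int demo[] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .a_reg = 0, .imm = 42 },
 *		{ .code = BPF_JMP | BPF_EXIT },
 *	};
 *	u32 ret = __sk_run_filter(NULL, demo);	// ret == 42
 */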
576
577 /* Helper to find the offset of pkt_type in the sk_buff structure. We want
578 * to make sure it's still a 3-bit field starting at a byte boundary;
579 * taken from arch/x86/net/bpf_jit_comp.c.
580 */
581 #define PKT_TYPE_MAX 7
582 static unsigned int pkt_type_offset(void)
583 {
584 struct sk_buff skb_probe = { .pkt_type = ~0, };
585 u8 *ct = (u8 *) &skb_probe;
586 unsigned int off;
587
588 for (off = 0; off < sizeof(struct sk_buff); off++) {
589 if (ct[off] == PKT_TYPE_MAX)
590 return off;
591 }
592
593 pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
594 return -1;
595 }
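/* Added note: the probe works because skb_probe is zeroed except for
 * pkt_type, a 3-bit bitfield set to all ones, so exactly one byte of the
 * struct compares equal to PKT_TYPE_MAX (7) - the byte whose low bits hold
 * pkt_type. If the bitfield is allocated elsewhere in its byte, the scan
 * finds nothing and -1 is returned. With the offset found,
 * convert_bpf_extensions() below emits the equivalent of
 *
 *	A = *(u8 *)(ctx + pkt_type_offset());
 *	A &= PKT_TYPE_MAX;	// == skb->pkt_type
 *
 * (see the SKF_AD_PKTTYPE case).
 */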
596
597 static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
598 {
599 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
600
601 return __skb_get_poff(skb);
602 }
603
604 static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
605 {
606 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
607 struct nlattr *nla;
608
609 if (skb_is_nonlinear(skb))
610 return 0;
611
612 if (skb->len < sizeof(struct nlattr))
613 return 0;
614
615 if (A > skb->len - sizeof(struct nlattr))
616 return 0;
617
618 nla = nla_find((struct nlattr *) &skb->data[A], skb->len - A, X);
619 if (nla)
620 return (void *) nla - (void *) skb->data;
621
622 return 0;
623 }
624
625 static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
626 {
627 struct sk_buff *skb = (struct sk_buff *)(long) ctx;
628 struct nlattr *nla;
629
630 if (skb_is_nonlinear(skb))
631 return 0;
632
633 if (skb->len < sizeof(struct nlattr))
634 return 0;
635
636 if (A > skb->len - sizeof(struct nlattr))
637 return 0;
638
639 nla = (struct nlattr *) &skb->data[A];
640 if (nla->nla_len > skb->len - A)
641 return 0;
642
643 nla = nla_find_nested(nla, X);
644 if (nla)
645 return (void *) nla - (void *) skb->data;
646
647 return 0;
648 }
649
650 static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
651 {
652 return raw_smp_processor_id();
653 }
654
655 /* note that this only generates 32-bit random numbers */
656 static u64 __get_random_u32(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
657 {
658 return (u64)prandom_u32();
659 }
660
661 static bool convert_bpf_extensions(struct sock_filter *fp,
662 struct sock_filter_int **insnp)
663 {
664 struct sock_filter_int *insn = *insnp;
665
666 switch (fp->k) {
667 case SKF_AD_OFF + SKF_AD_PROTOCOL:
668 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
669
670 insn->code = BPF_LDX | BPF_MEM | BPF_H;
671 insn->a_reg = A_REG;
672 insn->x_reg = CTX_REG;
673 insn->off = offsetof(struct sk_buff, protocol);
674 insn++;
675
676 /* A = ntohs(A) [emitting a nop or swap16] */
677 insn->code = BPF_ALU | BPF_END | BPF_FROM_BE;
678 insn->a_reg = A_REG;
679 insn->imm = 16;
680 break;
681
682 case SKF_AD_OFF + SKF_AD_PKTTYPE:
683 insn->code = BPF_LDX | BPF_MEM | BPF_B;
684 insn->a_reg = A_REG;
685 insn->x_reg = CTX_REG;
686 insn->off = pkt_type_offset();
687 if (insn->off < 0)
688 return false;
689 insn++;
690
691 insn->code = BPF_ALU | BPF_AND | BPF_K;
692 insn->a_reg = A_REG;
693 insn->imm = PKT_TYPE_MAX;
694 break;
695
696 case SKF_AD_OFF + SKF_AD_IFINDEX:
697 case SKF_AD_OFF + SKF_AD_HATYPE:
698 if (FIELD_SIZEOF(struct sk_buff, dev) == 8)
699 insn->code = BPF_LDX | BPF_MEM | BPF_DW;
700 else
701 insn->code = BPF_LDX | BPF_MEM | BPF_W;
702 insn->a_reg = TMP_REG;
703 insn->x_reg = CTX_REG;
704 insn->off = offsetof(struct sk_buff, dev);
705 insn++;
706
707 insn->code = BPF_JMP | BPF_JNE | BPF_K;
708 insn->a_reg = TMP_REG;
709 insn->imm = 0;
710 insn->off = 1;
711 insn++;
712
713 insn->code = BPF_JMP | BPF_EXIT;
714 insn++;
715
716 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
717 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
718
719 insn->a_reg = A_REG;
720 insn->x_reg = TMP_REG;
721
722 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) {
723 insn->code = BPF_LDX | BPF_MEM | BPF_W;
724 insn->off = offsetof(struct net_device, ifindex);
725 } else {
726 insn->code = BPF_LDX | BPF_MEM | BPF_H;
727 insn->off = offsetof(struct net_device, type);
728 }
729 break;
730
731 case SKF_AD_OFF + SKF_AD_MARK:
732 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
733
734 insn->code = BPF_LDX | BPF_MEM | BPF_W;
735 insn->a_reg = A_REG;
736 insn->x_reg = CTX_REG;
737 insn->off = offsetof(struct sk_buff, mark);
738 break;
739
740 case SKF_AD_OFF + SKF_AD_RXHASH:
741 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
742
743 insn->code = BPF_LDX | BPF_MEM | BPF_W;
744 insn->a_reg = A_REG;
745 insn->x_reg = CTX_REG;
746 insn->off = offsetof(struct sk_buff, hash);
747 break;
748
749 case SKF_AD_OFF + SKF_AD_QUEUE:
750 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
751
752 insn->code = BPF_LDX | BPF_MEM | BPF_H;
753 insn->a_reg = A_REG;
754 insn->x_reg = CTX_REG;
755 insn->off = offsetof(struct sk_buff, queue_mapping);
756 break;
757
758 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
759 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
760 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
761
762 insn->code = BPF_LDX | BPF_MEM | BPF_H;
763 insn->a_reg = A_REG;
764 insn->x_reg = CTX_REG;
765 insn->off = offsetof(struct sk_buff, vlan_tci);
766 insn++;
767
768 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
769
770 if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
771 insn->code = BPF_ALU | BPF_AND | BPF_K;
772 insn->a_reg = A_REG;
773 insn->imm = ~VLAN_TAG_PRESENT;
774 } else {
775 insn->code = BPF_ALU | BPF_RSH | BPF_K;
776 insn->a_reg = A_REG;
777 insn->imm = 12;
778 insn++;
779
780 insn->code = BPF_ALU | BPF_AND | BPF_K;
781 insn->a_reg = A_REG;
782 insn->imm = 1;
783 }
784 break;
785
786 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
787 case SKF_AD_OFF + SKF_AD_NLATTR:
788 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
789 case SKF_AD_OFF + SKF_AD_CPU:
790 case SKF_AD_OFF + SKF_AD_RANDOM:
791 /* arg1 = ctx */
792 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
793 insn->a_reg = ARG1_REG;
794 insn->x_reg = CTX_REG;
795 insn++;
796
797 /* arg2 = A */
798 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
799 insn->a_reg = ARG2_REG;
800 insn->x_reg = A_REG;
801 insn++;
802
803 /* arg3 = X */
804 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
805 insn->a_reg = ARG3_REG;
806 insn->x_reg = X_REG;
807 insn++;
808
809 /* Emit call(ctx, arg2=A, arg3=X) */
810 insn->code = BPF_JMP | BPF_CALL;
811 switch (fp->k) {
812 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
813 insn->imm = __skb_get_pay_offset - __bpf_call_base;
814 break;
815 case SKF_AD_OFF + SKF_AD_NLATTR:
816 insn->imm = __skb_get_nlattr - __bpf_call_base;
817 break;
818 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
819 insn->imm = __skb_get_nlattr_nest - __bpf_call_base;
820 break;
821 case SKF_AD_OFF + SKF_AD_CPU:
822 insn->imm = __get_raw_cpu_id - __bpf_call_base;
823 break;
824 case SKF_AD_OFF + SKF_AD_RANDOM:
825 insn->imm = __get_random_u32 - __bpf_call_base;
826 break;
827 }
828 break;
829
830 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
831 insn->code = BPF_ALU | BPF_XOR | BPF_X;
832 insn->a_reg = A_REG;
833 insn->x_reg = X_REG;
834 break;
835
836 default:
837 /* This is just a dummy call to avoid letting the compiler
838 * evict __bpf_call_base() as an optimization. Placed here
839 * where no-one bothers.
840 */
841 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
842 return false;
843 }
844
845 *insnp = insn;
846 return true;
847 }
848
849 /**
850 * sk_convert_filter - convert filter program
851 * @prog: the user passed filter program
852 * @len: the length of the user passed filter program
853 * @new_prog: buffer where converted program will be stored
854 * @new_len: pointer to store length of converted program
855 *
856 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_int' style.
857 * Conversion workflow:
858 *
859 * 1) First pass for calculating the new program length:
860 * sk_convert_filter(old_prog, old_len, NULL, &new_len)
861 *
862 * 2) 2nd pass to do the remapping; internally this runs in two
863 * sub-passes: the 1st finds the new jump offsets, the 2nd remaps:
864 * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
865 * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
866 *
867 * User BPF's register A is mapped to our BPF register 0 (A_REG), user BPF
868 * register X is mapped to BPF register 7 (X_REG); the frame pointer is always
869 * register 10. Context 'void *ctx' arrives in register 1 and is kept in R6, that is,
870 * for socket filters: ctx == 'struct sk_buff *', for seccomp:
871 * ctx == 'struct seccomp_data *'.
872 */
873 int sk_convert_filter(struct sock_filter *prog, int len,
874 struct sock_filter_int *new_prog, int *new_len)
875 {
876 int new_flen = 0, pass = 0, target, i;
877 struct sock_filter_int *new_insn;
878 struct sock_filter *fp;
879 int *addrs = NULL;
880 u8 bpf_src;
881
882 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
883 BUILD_BUG_ON(FP_REG + 1 != MAX_BPF_REG);
884
885 if (len <= 0 || len >= BPF_MAXINSNS)
886 return -EINVAL;
887
888 if (new_prog) {
889 addrs = kzalloc(len * sizeof(*addrs), GFP_KERNEL);
890 if (!addrs)
891 return -ENOMEM;
892 }
893
894 do_pass:
895 new_insn = new_prog;
896 fp = prog;
897
898 if (new_insn) {
899 new_insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
900 new_insn->a_reg = CTX_REG;
901 new_insn->x_reg = ARG1_REG;
902 }
903 new_insn++;
904
905 for (i = 0; i < len; fp++, i++) {
906 struct sock_filter_int tmp_insns[6] = { };
907 struct sock_filter_int *insn = tmp_insns;
908
909 if (addrs)
910 addrs[i] = new_insn - new_prog;
911
912 switch (fp->code) {
913 /* All arithmetic insns and skb loads map as-is. */
914 case BPF_ALU | BPF_ADD | BPF_X:
915 case BPF_ALU | BPF_ADD | BPF_K:
916 case BPF_ALU | BPF_SUB | BPF_X:
917 case BPF_ALU | BPF_SUB | BPF_K:
918 case BPF_ALU | BPF_AND | BPF_X:
919 case BPF_ALU | BPF_AND | BPF_K:
920 case BPF_ALU | BPF_OR | BPF_X:
921 case BPF_ALU | BPF_OR | BPF_K:
922 case BPF_ALU | BPF_LSH | BPF_X:
923 case BPF_ALU | BPF_LSH | BPF_K:
924 case BPF_ALU | BPF_RSH | BPF_X:
925 case BPF_ALU | BPF_RSH | BPF_K:
926 case BPF_ALU | BPF_XOR | BPF_X:
927 case BPF_ALU | BPF_XOR | BPF_K:
928 case BPF_ALU | BPF_MUL | BPF_X:
929 case BPF_ALU | BPF_MUL | BPF_K:
930 case BPF_ALU | BPF_DIV | BPF_X:
931 case BPF_ALU | BPF_DIV | BPF_K:
932 case BPF_ALU | BPF_MOD | BPF_X:
933 case BPF_ALU | BPF_MOD | BPF_K:
934 case BPF_ALU | BPF_NEG:
935 case BPF_LD | BPF_ABS | BPF_W:
936 case BPF_LD | BPF_ABS | BPF_H:
937 case BPF_LD | BPF_ABS | BPF_B:
938 case BPF_LD | BPF_IND | BPF_W:
939 case BPF_LD | BPF_IND | BPF_H:
940 case BPF_LD | BPF_IND | BPF_B:
941 /* Check for overloaded BPF extension and
942 * directly convert it if found, otherwise
943 * just move on with mapping.
944 */
945 if (BPF_CLASS(fp->code) == BPF_LD &&
946 BPF_MODE(fp->code) == BPF_ABS &&
947 convert_bpf_extensions(fp, &insn))
948 break;
949
950 insn->code = fp->code;
951 insn->a_reg = A_REG;
952 insn->x_reg = X_REG;
953 insn->imm = fp->k;
954 break;
955
956 /* Jump opcodes map as-is, but offsets need adjustment. */
957 case BPF_JMP | BPF_JA:
958 target = i + fp->k + 1;
959 insn->code = fp->code;
960 #define EMIT_JMP \
961 do { \
962 if (target >= len || target < 0) \
963 goto err; \
964 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
965 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
966 insn->off -= insn - tmp_insns; \
967 } while (0)
968
969 EMIT_JMP;
970 break;
971
972 case BPF_JMP | BPF_JEQ | BPF_K:
973 case BPF_JMP | BPF_JEQ | BPF_X:
974 case BPF_JMP | BPF_JSET | BPF_K:
975 case BPF_JMP | BPF_JSET | BPF_X:
976 case BPF_JMP | BPF_JGT | BPF_K:
977 case BPF_JMP | BPF_JGT | BPF_X:
978 case BPF_JMP | BPF_JGE | BPF_K:
979 case BPF_JMP | BPF_JGE | BPF_X:
980 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
981 /* BPF immediates are signed, zero extend
982 * immediate into tmp register and use it
983 * in compare insn.
984 */
985 insn->code = BPF_ALU | BPF_MOV | BPF_K;
986 insn->a_reg = TMP_REG;
987 insn->imm = fp->k;
988 insn++;
989
990 insn->a_reg = A_REG;
991 insn->x_reg = TMP_REG;
992 bpf_src = BPF_X;
993 } else {
994 insn->a_reg = A_REG;
995 insn->x_reg = X_REG;
996 insn->imm = fp->k;
997 bpf_src = BPF_SRC(fp->code);
998 }
999
1000 /* Common case where 'jump_false' is next insn. */
1001 if (fp->jf == 0) {
1002 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
1003 target = i + fp->jt + 1;
1004 EMIT_JMP;
1005 break;
1006 }
1007
1008 /* Convert JEQ into JNE when 'jump_true' is next insn. */
1009 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
1010 insn->code = BPF_JMP | BPF_JNE | bpf_src;
1011 target = i + fp->jf + 1;
1012 EMIT_JMP;
1013 break;
1014 }
1015
1016 /* Other jumps are mapped into two insns: Jxx and JA. */
1017 target = i + fp->jt + 1;
1018 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
1019 EMIT_JMP;
1020 insn++;
1021
1022 insn->code = BPF_JMP | BPF_JA;
1023 target = i + fp->jf + 1;
1024 EMIT_JMP;
1025 break;
1026
1027 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
1028 case BPF_LDX | BPF_MSH | BPF_B:
1029 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
1030 insn->a_reg = TMP_REG;
1031 insn->x_reg = A_REG;
1032 insn++;
1033
1034 insn->code = BPF_LD | BPF_ABS | BPF_B;
1035 insn->a_reg = A_REG;
1036 insn->imm = fp->k;
1037 insn++;
1038
1039 insn->code = BPF_ALU | BPF_AND | BPF_K;
1040 insn->a_reg = A_REG;
1041 insn->imm = 0xf;
1042 insn++;
1043
1044 insn->code = BPF_ALU | BPF_LSH | BPF_K;
1045 insn->a_reg = A_REG;
1046 insn->imm = 2;
1047 insn++;
1048
1049 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
1050 insn->a_reg = X_REG;
1051 insn->x_reg = A_REG;
1052 insn++;
1053
1054 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
1055 insn->a_reg = A_REG;
1056 insn->x_reg = TMP_REG;
1057 break;
1058
1059 /* RET_K, RET_A are remapped into 2 insns. */
1060 case BPF_RET | BPF_A:
1061 case BPF_RET | BPF_K:
1062 insn->code = BPF_ALU | BPF_MOV |
1063 (BPF_RVAL(fp->code) == BPF_K ?
1064 BPF_K : BPF_X);
1065 insn->a_reg = 0;
1066 insn->x_reg = A_REG;
1067 insn->imm = fp->k;
1068 insn++;
1069
1070 insn->code = BPF_JMP | BPF_EXIT;
1071 break;
1072
1073 /* Store to stack. */
1074 case BPF_ST:
1075 case BPF_STX:
1076 insn->code = BPF_STX | BPF_MEM | BPF_W;
1077 insn->a_reg = FP_REG;
1078 insn->x_reg = fp->code == BPF_ST ? A_REG : X_REG;
1079 insn->off = -(BPF_MEMWORDS - fp->k) * 4;
1080 break;
1081
1082 /* Load from stack. */
1083 case BPF_LD | BPF_MEM:
1084 case BPF_LDX | BPF_MEM:
1085 insn->code = BPF_LDX | BPF_MEM | BPF_W;
1086 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
1087 A_REG : X_REG;
1088 insn->x_reg = FP_REG;
1089 insn->off = -(BPF_MEMWORDS - fp->k) * 4;
1090 break;
1091
1092 /* A = K or X = K */
1093 case BPF_LD | BPF_IMM:
1094 case BPF_LDX | BPF_IMM:
1095 insn->code = BPF_ALU | BPF_MOV | BPF_K;
1096 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
1097 A_REG : X_REG;
1098 insn->imm = fp->k;
1099 break;
1100
1101 /* X = A */
1102 case BPF_MISC | BPF_TAX:
1103 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
1104 insn->a_reg = X_REG;
1105 insn->x_reg = A_REG;
1106 break;
1107
1108 /* A = X */
1109 case BPF_MISC | BPF_TXA:
1110 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
1111 insn->a_reg = A_REG;
1112 insn->x_reg = X_REG;
1113 break;
1114
1115 /* A = skb->len or X = skb->len */
1116 case BPF_LD | BPF_W | BPF_LEN:
1117 case BPF_LDX | BPF_W | BPF_LEN:
1118 insn->code = BPF_LDX | BPF_MEM | BPF_W;
1119 insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
1120 A_REG : X_REG;
1121 insn->x_reg = CTX_REG;
1122 insn->off = offsetof(struct sk_buff, len);
1123 break;
1124
1125 /* access seccomp_data fields */
1126 case BPF_LDX | BPF_ABS | BPF_W:
1127 insn->code = BPF_LDX | BPF_MEM | BPF_W;
1128 insn->a_reg = A_REG;
1129 insn->x_reg = CTX_REG;
1130 insn->off = fp->k;
1131 break;
1132
1133 default:
1134 goto err;
1135 }
1136
1137 insn++;
1138 if (new_prog)
1139 memcpy(new_insn, tmp_insns,
1140 sizeof(*insn) * (insn - tmp_insns));
1141
1142 new_insn += insn - tmp_insns;
1143 }
1144
1145 if (!new_prog) {
1146 /* Only calculating new length. */
1147 *new_len = new_insn - new_prog;
1148 return 0;
1149 }
1150
1151 pass++;
1152 if (new_flen != new_insn - new_prog) {
1153 new_flen = new_insn - new_prog;
1154 if (pass > 2)
1155 goto err;
1156
1157 goto do_pass;
1158 }
1159
1160 kfree(addrs);
1161 BUG_ON(*new_len != new_flen);
1162 return 0;
1163 err:
1164 kfree(addrs);
1165 return -EINVAL;
1166 }
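/* Worked example (added for illustration): a classic insn at index i,
 * "jeq #ETH_P_IP, jt 0, jf 3", has its true branch fall through, so the
 * converter above turns it into a single JNE whose target is classic index
 * i + 3 + 1; EMIT_JMP then rewrites that into a new-program offset using
 * addrs[]:
 *
 *	insn->code = BPF_JMP | BPF_JNE | BPF_K;
 *	insn->imm  = ETH_P_IP;
 *	insn->off  = addrs[i + 3 + 1] - addrs[i] - 1;
 */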
1167
1168 /* Security:
1169 *
1170 * A BPF program is able to use 16 cells of memory to store intermediate
1171 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
1172 *
1173 * As we don't want to clear the mem[] array for each packet going through
1174 * sk_run_filter(), we check that a filter loaded by a user never tries to
1175 * read a cell that was not previously written, and we check all branches
1176 * to be sure a malicious user doesn't try to abuse us.
1177 */
1178 static int check_load_and_stores(struct sock_filter *filter, int flen)
1179 {
1180 u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
1181 int pc, ret = 0;
1182
1183 BUILD_BUG_ON(BPF_MEMWORDS > 16);
1184 masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
1185 if (!masks)
1186 return -ENOMEM;
1187 memset(masks, 0xff, flen * sizeof(*masks));
1188
1189 for (pc = 0; pc < flen; pc++) {
1190 memvalid &= masks[pc];
1191
1192 switch (filter[pc].code) {
1193 case BPF_S_ST:
1194 case BPF_S_STX:
1195 memvalid |= (1 << filter[pc].k);
1196 break;
1197 case BPF_S_LD_MEM:
1198 case BPF_S_LDX_MEM:
1199 if (!(memvalid & (1 << filter[pc].k))) {
1200 ret = -EINVAL;
1201 goto error;
1202 }
1203 break;
1204 case BPF_S_JMP_JA:
1205 /* a jump must set masks on target */
1206 masks[pc + 1 + filter[pc].k] &= memvalid;
1207 memvalid = ~0;
1208 break;
1209 case BPF_S_JMP_JEQ_K:
1210 case BPF_S_JMP_JEQ_X:
1211 case BPF_S_JMP_JGE_K:
1212 case BPF_S_JMP_JGE_X:
1213 case BPF_S_JMP_JGT_K:
1214 case BPF_S_JMP_JGT_X:
1215 case BPF_S_JMP_JSET_X:
1216 case BPF_S_JMP_JSET_K:
1217 /* a jump must set masks on targets */
1218 masks[pc + 1 + filter[pc].jt] &= memvalid;
1219 masks[pc + 1 + filter[pc].jf] &= memvalid;
1220 memvalid = ~0;
1221 break;
1222 }
1223 }
1224 error:
1225 kfree(masks);
1226 return ret;
1227 }
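/* Illustrative only (added): a classic filter that ends up rejected here,
 * because it reads scratch cell M[0] before anything was ever stored there:
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_LD | BPF_MEM, 0),	// A = M[0], never written
 *		BPF_STMT(BPF_RET | BPF_A, 0),
 *	};
 *
 * Prefixing it with BPF_STMT(BPF_ST, 0) (store A into M[0]) makes the same
 * program pass.
 */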
1228
1229 /**
1230 * sk_chk_filter - verify socket filter code
1231 * @filter: filter to verify
1232 * @flen: length of filter
1233 *
1234 * Check the user's filter code. If we let some ugly
1235 * filter code slip through, kaboom! The filter must contain
1236 * no references or jumps that are out of range, no illegal
1237 * instructions, and must end with a RET instruction.
1238 *
1239 * All jumps are forward as they are not signed.
1240 *
1241 * Returns 0 if the rule set is legal or -EINVAL if not.
1242 */
1243 int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
1244 {
1245 /*
1246 * Valid instructions are initialized to non-0.
1247 * Invalid instructions are initialized to 0.
1248 */
1249 static const u8 codes[] = {
1250 [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K,
1251 [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X,
1252 [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K,
1253 [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X,
1254 [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
1255 [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
1256 [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
1257 [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K,
1258 [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X,
1259 [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
1260 [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
1261 [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
1262 [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
1263 [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K,
1264 [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X,
1265 [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
1266 [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
1267 [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
1268 [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X,
1269 [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG,
1270 [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS,
1271 [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS,
1272 [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS,
1273 [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN,
1274 [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND,
1275 [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND,
1276 [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND,
1277 [BPF_LD|BPF_IMM] = BPF_S_LD_IMM,
1278 [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN,
1279 [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH,
1280 [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM,
1281 [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX,
1282 [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA,
1283 [BPF_RET|BPF_K] = BPF_S_RET_K,
1284 [BPF_RET|BPF_A] = BPF_S_RET_A,
1285 [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K,
1286 [BPF_LD|BPF_MEM] = BPF_S_LD_MEM,
1287 [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM,
1288 [BPF_ST] = BPF_S_ST,
1289 [BPF_STX] = BPF_S_STX,
1290 [BPF_JMP|BPF_JA] = BPF_S_JMP_JA,
1291 [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K,
1292 [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X,
1293 [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K,
1294 [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X,
1295 [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K,
1296 [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X,
1297 [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
1298 [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
1299 };
1300 int pc;
1301 bool anc_found;
1302
1303 if (flen == 0 || flen > BPF_MAXINSNS)
1304 return -EINVAL;
1305
1306 /* check the filter code now */
1307 for (pc = 0; pc < flen; pc++) {
1308 struct sock_filter *ftest = &filter[pc];
1309 u16 code = ftest->code;
1310
1311 if (code >= ARRAY_SIZE(codes))
1312 return -EINVAL;
1313 code = codes[code];
1314 if (!code)
1315 return -EINVAL;
1316 /* Some instructions need special checks */
1317 switch (code) {
1318 case BPF_S_ALU_DIV_K:
1319 case BPF_S_ALU_MOD_K:
1320 /* check for division by zero */
1321 if (ftest->k == 0)
1322 return -EINVAL;
1323 break;
1324 case BPF_S_LD_MEM:
1325 case BPF_S_LDX_MEM:
1326 case BPF_S_ST:
1327 case BPF_S_STX:
1328 /* check for invalid memory addresses */
1329 if (ftest->k >= BPF_MEMWORDS)
1330 return -EINVAL;
1331 break;
1332 case BPF_S_JMP_JA:
1333 /*
1334 * Note, the large ftest->k might cause loops.
1335 * Compare this with conditional jumps below,
1336 * where offsets are limited. --ANK (981016)
1337 */
1338 if (ftest->k >= (unsigned int)(flen-pc-1))
1339 return -EINVAL;
1340 break;
1341 case BPF_S_JMP_JEQ_K:
1342 case BPF_S_JMP_JEQ_X:
1343 case BPF_S_JMP_JGE_K:
1344 case BPF_S_JMP_JGE_X:
1345 case BPF_S_JMP_JGT_K:
1346 case BPF_S_JMP_JGT_X:
1347 case BPF_S_JMP_JSET_X:
1348 case BPF_S_JMP_JSET_K:
1349 /* for conditionals both must be safe */
1350 if (pc + ftest->jt + 1 >= flen ||
1351 pc + ftest->jf + 1 >= flen)
1352 return -EINVAL;
1353 break;
1354 case BPF_S_LD_W_ABS:
1355 case BPF_S_LD_H_ABS:
1356 case BPF_S_LD_B_ABS:
1357 anc_found = false;
1358 #define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \
1359 code = BPF_S_ANC_##CODE; \
1360 anc_found = true; \
1361 break
1362 switch (ftest->k) {
1363 ANCILLARY(PROTOCOL);
1364 ANCILLARY(PKTTYPE);
1365 ANCILLARY(IFINDEX);
1366 ANCILLARY(NLATTR);
1367 ANCILLARY(NLATTR_NEST);
1368 ANCILLARY(MARK);
1369 ANCILLARY(QUEUE);
1370 ANCILLARY(HATYPE);
1371 ANCILLARY(RXHASH);
1372 ANCILLARY(CPU);
1373 ANCILLARY(ALU_XOR_X);
1374 ANCILLARY(VLAN_TAG);
1375 ANCILLARY(VLAN_TAG_PRESENT);
1376 ANCILLARY(PAY_OFFSET);
1377 ANCILLARY(RANDOM);
1378 }
1379
1380 /* ancillary operation unknown or unsupported */
1381 if (anc_found == false && ftest->k >= SKF_AD_OFF)
1382 return -EINVAL;
1383 }
1384 ftest->code = code;
1385 }
1386
1387 /* last instruction must be a RET code */
1388 switch (filter[flen - 1].code) {
1389 case BPF_S_RET_K:
1390 case BPF_S_RET_A:
1391 return check_load_and_stores(filter, flen);
1392 }
1393 return -EINVAL;
1394 }
1395 EXPORT_SYMBOL(sk_chk_filter);
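/* Illustrative only (added): the smallest filter sk_chk_filter() accepts is
 * a single RET, e.g. the accept-everything program
 *
 *	struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *
 * flen is 1, the opcode is known, the last (and only) insn is a RET, and
 * check_load_and_stores() trivially succeeds.
 */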
1396
1397 static int sk_store_orig_filter(struct sk_filter *fp,
1398 const struct sock_fprog *fprog)
1399 {
1400 unsigned int fsize = sk_filter_proglen(fprog);
1401 struct sock_fprog_kern *fkprog;
1402
1403 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1404 if (!fp->orig_prog)
1405 return -ENOMEM;
1406
1407 fkprog = fp->orig_prog;
1408 fkprog->len = fprog->len;
1409 fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
1410 if (!fkprog->filter) {
1411 kfree(fp->orig_prog);
1412 return -ENOMEM;
1413 }
1414
1415 return 0;
1416 }
1417
1418 static void sk_release_orig_filter(struct sk_filter *fp)
1419 {
1420 struct sock_fprog_kern *fprog = fp->orig_prog;
1421
1422 if (fprog) {
1423 kfree(fprog->filter);
1424 kfree(fprog);
1425 }
1426 }
1427
1428 /**
1429 * sk_filter_release_rcu - Release a socket filter by rcu_head
1430 * @rcu: rcu_head that contains the sk_filter to free
1431 */
1432 static void sk_filter_release_rcu(struct rcu_head *rcu)
1433 {
1434 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1435
1436 sk_release_orig_filter(fp);
1437 bpf_jit_free(fp);
1438 }
1439
1440 /**
1441 * sk_filter_release - release a socket filter
1442 * @fp: filter to remove
1443 *
1444 * Remove a filter from a socket and release its resources.
1445 */
1446 static void sk_filter_release(struct sk_filter *fp)
1447 {
1448 if (atomic_dec_and_test(&fp->refcnt))
1449 call_rcu(&fp->rcu, sk_filter_release_rcu);
1450 }
1451
1452 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1453 {
1454 atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1455 sk_filter_release(fp);
1456 }
1457
1458 void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1459 {
1460 atomic_inc(&fp->refcnt);
1461 atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1462 }
1463
1464 static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
1465 struct sock *sk,
1466 unsigned int len)
1467 {
1468 struct sk_filter *fp_new;
1469
1470 if (sk == NULL)
1471 return krealloc(fp, len, GFP_KERNEL);
1472
1473 fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
1474 if (fp_new) {
1475 memcpy(fp_new, fp, sizeof(struct sk_filter));
1476 /* As we're keeping orig_prog along in fp_new,
1477 * we need to make sure we're not evicting it
1478 * from the old fp.
1479 */
1480 fp->orig_prog = NULL;
1481 sk_filter_uncharge(sk, fp);
1482 }
1483
1484 return fp_new;
1485 }
1486
1487 static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
1488 struct sock *sk)
1489 {
1490 struct sock_filter *old_prog;
1491 struct sk_filter *old_fp;
1492 int i, err, new_len, old_len = fp->len;
1493
1494 /* We are free to overwrite insns et al right here as they
1495 * won't be used internally anymore after the migration to
1496 * the internal BPF instruction
1497 * representation.
1498 */
1499 BUILD_BUG_ON(sizeof(struct sock_filter) !=
1500 sizeof(struct sock_filter_int));
1501
1502 /* For now, we need to unfiddle BPF_S_* identifiers in place.
1503 * This can sooner or later be subject to removal, e.g. when
1504 * JITs have been converted.
1505 */
1506 for (i = 0; i < fp->len; i++)
1507 sk_decode_filter(&fp->insns[i], &fp->insns[i]);
1508
1509 /* Conversion cannot happen on overlapping memory areas,
1510 * so we need to keep the user BPF around until the 2nd
1511 * pass. At this time, the user BPF is stored in fp->insns.
1512 */
1513 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1514 GFP_KERNEL);
1515 if (!old_prog) {
1516 err = -ENOMEM;
1517 goto out_err;
1518 }
1519
1520 /* 1st pass: calculate the new program length. */
1521 err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
1522 if (err)
1523 goto out_err_free;
1524
1525 /* Expand fp for appending the new filter representation. */
1526 old_fp = fp;
1527 fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
1528 if (!fp) {
1529 /* The old_fp is still around in case we couldn't
1530 * allocate new memory, so uncharge on that one.
1531 */
1532 fp = old_fp;
1533 err = -ENOMEM;
1534 goto out_err_free;
1535 }
1536
1537 fp->bpf_func = sk_run_filter_int_skb;
1538 fp->len = new_len;
1539
1540 /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
1541 err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1542 if (err)
1543 /* 2nd sk_convert_filter() can fail only if it fails
1544 * to allocate memory; the remapping itself must succeed. Note
1545 * that at this time old_fp has already been released
1546 * by __sk_migrate_realloc().
1547 */
1548 goto out_err_free;
1549
1550 kfree(old_prog);
1551 return fp;
1552
1553 out_err_free:
1554 kfree(old_prog);
1555 out_err:
1556 /* Rollback filter setup. */
1557 if (sk != NULL)
1558 sk_filter_uncharge(sk, fp);
1559 else
1560 kfree(fp);
1561 return ERR_PTR(err);
1562 }
1563
1564 static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
1565 struct sock *sk)
1566 {
1567 int err;
1568
1569 fp->bpf_func = NULL;
1570 fp->jited = 0;
1571
1572 err = sk_chk_filter(fp->insns, fp->len);
1573 if (err)
1574 return ERR_PTR(err);
1575
1576 /* Probe whether we can JIT compile the filter and, if so,
1577 * do the compilation of the filter.
1578 */
1579 bpf_jit_compile(fp);
1580
1581 /* If the JIT compiler couldn't process this filter, do the
1582 * internal BPF translation for the optimized interpreter.
1583 */
1584 if (!fp->jited)
1585 fp = __sk_migrate_filter(fp, sk);
1586
1587 return fp;
1588 }
1589
1590 /**
1591 * sk_unattached_filter_create - create an unattached filter
1592 * @fprog: the filter program
1593 * @pfp: the unattached filter that is created
1594 *
1595 * Create a filter independent of any socket. We first run some
1596 * sanity checks on it to make sure it does not explode on us later.
1597 * If an error occurs or there is insufficient memory for the filter,
1598 * a negative errno code is returned. On success the return is zero.
1599 */
1600 int sk_unattached_filter_create(struct sk_filter **pfp,
1601 struct sock_fprog *fprog)
1602 {
1603 unsigned int fsize = sk_filter_proglen(fprog);
1604 struct sk_filter *fp;
1605
1606 /* Make sure new filter is there and in the right amounts. */
1607 if (fprog->filter == NULL)
1608 return -EINVAL;
1609
1610 fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
1611 if (!fp)
1612 return -ENOMEM;
1613
1614 memcpy(fp->insns, fprog->filter, fsize);
1615
1616 atomic_set(&fp->refcnt, 1);
1617 fp->len = fprog->len;
1618 /* Since unattached filters are not copied back to user
1619 * space through sk_get_filter(), we do not need to hold
1620 * a copy here, and can spare ourselves the work.
1621 */
1622 fp->orig_prog = NULL;
1623
1624 /* __sk_prepare_filter() already takes care of uncharging
1625 * memory in case something goes wrong.
1626 */
1627 fp = __sk_prepare_filter(fp, NULL);
1628 if (IS_ERR(fp))
1629 return PTR_ERR(fp);
1630
1631 *pfp = fp;
1632 return 0;
1633 }
1634 EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
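/* Illustrative sketch only (hypothetical caller, not part of the file):
 * in-kernel users pair the create/destroy helpers above roughly like this,
 * with insns pointing at kernel memory:
 *
 *	struct sock_fprog fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct sk_filter *fp;
 *	int err = sk_unattached_filter_create(&fp, &fprog);
 *	if (err)
 *		return err;
 *	unsigned int pkt_len = SK_RUN_FILTER(fp, skb);	// 0 means drop
 *	sk_unattached_filter_destroy(fp);
 */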
1635
1636 void sk_unattached_filter_destroy(struct sk_filter *fp)
1637 {
1638 sk_filter_release(fp);
1639 }
1640 EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
1641
1642 /**
1643 * sk_attach_filter - attach a socket filter
1644 * @fprog: the filter program
1645 * @sk: the socket to use
1646 *
1647 * Attach the user's filter code. We first run some sanity checks on
1648 * it to make sure it does not explode on us later. If an error
1649 * occurs or there is insufficient memory for the filter, a negative
1650 * errno code is returned. On success the return is zero.
1651 */
1652 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1653 {
1654 struct sk_filter *fp, *old_fp;
1655 unsigned int fsize = sk_filter_proglen(fprog);
1656 unsigned int sk_fsize = sk_filter_size(fprog->len);
1657 int err;
1658
1659 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1660 return -EPERM;
1661
1662 /* Make sure new filter is there and in the right amounts. */
1663 if (fprog->filter == NULL)
1664 return -EINVAL;
1665
1666 fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
1667 if (!fp)
1668 return -ENOMEM;
1669
1670 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1671 sock_kfree_s(sk, fp, sk_fsize);
1672 return -EFAULT;
1673 }
1674
1675 atomic_set(&fp->refcnt, 1);
1676 fp->len = fprog->len;
1677
1678 err = sk_store_orig_filter(fp, fprog);
1679 if (err) {
1680 sk_filter_uncharge(sk, fp);
1681 return -ENOMEM;
1682 }
1683
1684 /* __sk_prepare_filter() already takes care of uncharging
1685 * memory in case something goes wrong.
1686 */
1687 fp = __sk_prepare_filter(fp, sk);
1688 if (IS_ERR(fp))
1689 return PTR_ERR(fp);
1690
1691 old_fp = rcu_dereference_protected(sk->sk_filter,
1692 sock_owned_by_user(sk));
1693 rcu_assign_pointer(sk->sk_filter, fp);
1694
1695 if (old_fp)
1696 sk_filter_uncharge(sk, old_fp);
1697
1698 return 0;
1699 }
1700 EXPORT_SYMBOL_GPL(sk_attach_filter);
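/* Illustrative only (added): sk_attach_filter() is reached from user space
 * via setsockopt(SO_ATTACH_FILTER); a minimal user-side sketch looks like:
 *
 *	struct sock_filter insns[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	// accept everything
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = insns };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * sk_detach_filter() below backs the matching SO_DETACH_FILTER.
 */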
1701
1702 int sk_detach_filter(struct sock *sk)
1703 {
1704 int ret = -ENOENT;
1705 struct sk_filter *filter;
1706
1707 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1708 return -EPERM;
1709
1710 filter = rcu_dereference_protected(sk->sk_filter,
1711 sock_owned_by_user(sk));
1712 if (filter) {
1713 RCU_INIT_POINTER(sk->sk_filter, NULL);
1714 sk_filter_uncharge(sk, filter);
1715 ret = 0;
1716 }
1717
1718 return ret;
1719 }
1720 EXPORT_SYMBOL_GPL(sk_detach_filter);
1721
1722 void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
1723 {
1724 static const u16 decodes[] = {
1725 [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K,
1726 [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X,
1727 [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K,
1728 [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X,
1729 [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K,
1730 [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X,
1731 [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X,
1732 [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K,
1733 [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X,
1734 [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K,
1735 [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X,
1736 [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K,
1737 [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X,
1738 [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K,
1739 [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X,
1740 [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K,
1741 [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X,
1742 [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K,
1743 [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X,
1744 [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG,
1745 [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS,
1746 [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS,
1747 [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS,
1748 [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS,
1749 [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS,
1750 [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS,
1751 [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS,
1752 [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS,
1753 [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS,
1754 [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS,
1755 [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS,
1756 [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS,
1757 [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS,
1758 [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS,
1759 [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS,
1760 [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
1761 [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS,
1762 [BPF_S_ANC_RANDOM] = BPF_LD|BPF_B|BPF_ABS,
1763 [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN,
1764 [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND,
1765 [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND,
1766 [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND,
1767 [BPF_S_LD_IMM] = BPF_LD|BPF_IMM,
1768 [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN,
1769 [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH,
1770 [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM,
1771 [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX,
1772 [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA,
1773 [BPF_S_RET_K] = BPF_RET|BPF_K,
1774 [BPF_S_RET_A] = BPF_RET|BPF_A,
1775 [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K,
1776 [BPF_S_LD_MEM] = BPF_LD|BPF_MEM,
1777 [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM,
1778 [BPF_S_ST] = BPF_ST,
1779 [BPF_S_STX] = BPF_STX,
1780 [BPF_S_JMP_JA] = BPF_JMP|BPF_JA,
1781 [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K,
1782 [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X,
1783 [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K,
1784 [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X,
1785 [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K,
1786 [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X,
1787 [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K,
1788 [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X,
1789 };
1790 u16 code;
1791
1792 code = filt->code;
1793
1794 to->code = decodes[code];
1795 to->jt = filt->jt;
1796 to->jf = filt->jf;
1797 to->k = filt->k;
1798 }
1799
1800 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
1801 unsigned int len)
1802 {
1803 struct sock_fprog_kern *fprog;
1804 struct sk_filter *filter;
1805 int ret = 0;
1806
1807 lock_sock(sk);
1808 filter = rcu_dereference_protected(sk->sk_filter,
1809 sock_owned_by_user(sk));
1810 if (!filter)
1811 goto out;
1812
1813 /* We're copying the filter that was originally attached,
1814 * so no conversion/decode is needed anymore.
1815 */
1816 fprog = filter->orig_prog;
1817
1818 ret = fprog->len;
1819 if (!len)
1820 /* User space only enquires number of filter blocks. */
1821 goto out;
1822
1823 ret = -EINVAL;
1824 if (len < fprog->len)
1825 goto out;
1826
1827 ret = -EFAULT;
1828 if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
1829 goto out;
1830
1831 /* Instead of bytes, the API expects us to return the number
1832 * of filter blocks.
1833 */
1834 ret = fprog->len;
1835 out:
1836 release_sock(sk);
1837 return ret;
1838 }