/*
 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
 * Copyright (c) 2019 The strace developers.
 * All rights reserved.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#include "defs.h"

#include <signal.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <linux/filter.h>

#include "filter_seccomp.h"
#include "number_set.h"
#include "syscall.h"
#include "scno.h"

bool seccomp_filtering;
bool seccomp_before_sysentry;

#ifdef HAVE_LINUX_SECCOMP_H

# include <linux/seccomp.h>

/* PERSONALITY*_AUDIT_ARCH definitions depend on AUDIT_ARCH_* constants. */
# ifdef PERSONALITY0_AUDIT_ARCH
#  include <linux/audit.h>
#  define XLAT_MACROS_ONLY
#  include "xlat/elf_em.h"
#  include "xlat/audit_arch.h"
#  undef XLAT_MACROS_ONLY
# endif

# ifndef BPF_MAXINSNS
#  define BPF_MAXINSNS 4096
# endif

# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)

# define SET_BPF(filter, code, jt, jf, k) \
	(*(filter) = (struct sock_filter) { code, jt, jf, k })

# define SET_BPF_STMT(filter, code, k) \
	SET_BPF(filter, code, 0, 0, k)

# define SET_BPF_JUMP(filter, code, k, jt, jf) \
	SET_BPF(filter, BPF_JMP | code, jt, jf, k)

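/*
 * Illustrative sketch (not part of the generated program): how the helpers
 * above expand.  SET_BPF_JUMP(p, BPF_JEQ | BPF_K, 59, JMP_PLACEHOLDER_TRACE, 0)
 * stores
 *	(struct sock_filter) { BPF_JMP | BPF_JEQ | BPF_K,
 *			       JMP_PLACEHOLDER_TRACE, 0, 59 }
 * into *p, i.e. "if (nr == 59) goto trace, else fall through".  The
 * placeholder jt/jf bytes are patched to real offsets later by
 * replace_jmp_placeholders().
 */
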
static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
# if SUPPORTED_PERSONALITIES > 1
	PERSONALITY0_AUDIT_ARCH,
	PERSONALITY1_AUDIT_ARCH,
#  if SUPPORTED_PERSONALITIES > 2
	PERSONALITY2_AUDIT_ARCH,
#  endif
# endif
};

# ifdef HAVE_FORK

#  ifdef ENABLE_COVERAGE_GCOV
extern void __gcov_flush(void);
#  endif

static void ATTRIBUTE_NORETURN
check_seccomp_order_do_child(void)
{
	static const struct sock_filter filter[] = {
		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
	};
	static const struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = (struct sock_filter *) filter
	};

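	/*
	 * Note on the probe filter above: the BPF_JUMP instruction uses
	 * jt = 0 and jf = 1, so a gettid call falls through to the
	 * SECCOMP_RET_TRACE statement while any other syscall skips one
	 * instruction ahead to SECCOMP_RET_ALLOW.
	 */
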
	/* Get everything ready before PTRACE_TRACEME. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1)");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
	int pid = getpid();

	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
		/* Exit with a nonzero exit status. */
		perror_func_msg_and_die("PTRACE_TRACEME");
	}

#  ifdef ENABLE_COVERAGE_GCOV
	__gcov_flush();
#  endif

	kill(pid, SIGSTOP);
	syscall(__NR_gettid);
	_exit(0);
}

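/*
 * Stop sequence the tracer below expects from the child (steps #0..#5):
 *   #0  the self-delivered SIGSTOP;
 *   #1, #2  the seccomp stop and the gettid syscall-entry stop, in either
 *	     order -- their relative order is exactly what this probe
 *	     determines;
 *   #3  the gettid syscall-exit stop;
 *   #4  the exit_group syscall-entry stop;
 *   #5  WIFEXITED with a zero exit status.
 */
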
static int
check_seccomp_order_tracer(int pid)
{
	unsigned int step;

	for (step = 0; ; ++step) {
		int status;

		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen. */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more. */
			pid = 0;

			int exitstatus = WEXITSTATUS(status);
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more. */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen. */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid. */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid. */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			switch (step) {
			case 3: /* Exiting gettid. */
			case 4: /* Entering exit_group. */
				break;
			case 1: /* Entering gettid before seccomp stop. */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop. */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen. */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			return pid;
		}
	}

	return pid;
}

# endif /* HAVE_FORK */

static void
check_seccomp_order(void)
{
	seccomp_filtering = false;

	/* NOMMU systems provide no fork(2), which this test requires. */
# ifdef HAVE_FORK
	int pid = fork();
	if (pid < 0) {
		perror_func_msg("fork");
		return;
	}

	if (pid == 0)
		check_seccomp_order_do_child();

	pid = check_seccomp_order_tracer(pid);
	if (pid) {
		kill(pid, SIGKILL);

		for (;;) {
			long rc = waitpid(pid, NULL, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			break;
		}
	}
# endif /* HAVE_FORK */
}

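/*
 * Note: besides syscalls explicitly selected by the trace set, the filter
 * must also trap syscalls that cannot be filtered on the syscall number
 * alone, e.g. indirect subcall multiplexers such as socketcall(2)
 * (TRACE_INDIRECT_SUBCALL), as well as syscalls flagged
 * TRACE_SECCOMP_DEFAULT.
 */
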
static bool
traced_by_seccomp(unsigned int scno, unsigned int p)
{
	if (is_number_in_set_array(scno, trace_set, p)
	    || sysent_vec[p][scno].sys_flags
	       & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
		return true;
	return false;
}

static void
check_bpf_program_size(void)
{
	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;

	/*
	 * Implements a simplified form of init_sock_filter()'s bytecode
	 * generation algorithm, to count the number of instructions that
	 * will be generated.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1;
	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
		unsigned int nb_insns_personality = 0;
		unsigned int lower = UINT_MAX;

		nb_insns_personality++;
# if SUPPORTED_PERSONALITIES > 1
		nb_insns_personality++;
		if (audit_arch_vec[p].flag)
			nb_insns_personality += 3;
# endif

		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			if (lower + 1 == i)
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX) {
			if (lower + 1 == nsyscall_vec[p])
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
		}

		nb_insns_personality += 3;

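		/*
		 * The three instructions accounted for just above correspond
		 * to the trailing "if (nr >= nsyscall_vec[p]) return
		 * RET_TRACE;" check and the RET_ALLOW and RET_TRACE
		 * statements that close each personality section in
		 * init_sock_filter().
		 */
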
		/*
		 * Within generated BPF programs, the origin and destination
		 * of jumps are always in the same personality section.  The
		 * largest jump is therefore the jump from the first
		 * instruction of the section to the last, to skip the
		 * personality and try to compare .arch to the next
		 * personality's.
		 * If we have a personality section with more than 255
		 * instructions, the jump offset will overflow.  Such a
		 * program is unlikely, so we simply disable seccomp
		 * filtering in that case.
		 */
		if (nb_insns_personality > UCHAR_MAX) {
			debug_msg("seccomp filter disabled due to "
				  "possibility of overflow");
			seccomp_filtering = false;
			return;
		}
		nb_insns += nb_insns_personality;
	}

# if SUPPORTED_PERSONALITIES > 1
	nb_insns++;
# endif

	if (nb_insns > BPF_MAXINSNS) {
		debug_msg("seccomp filter disabled due to BPF program being "
			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
		seccomp_filtering = false;
	}
}

static void
check_seccomp_filter_properties(void)
{
	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
	seccomp_filtering = rc < 0 && errno != EINVAL;
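	/*
	 * Interpretation of the probe above: a kernel that supports seccomp
	 * filtering rejects the NULL filter argument with EFAULT, whereas a
	 * kernel built without SECCOMP_MODE_FILTER support fails with EINVAL.
	 */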
	if (!seccomp_filtering)
		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");

	if (seccomp_filtering)
		check_bpf_program_size();
	if (seccomp_filtering)
		check_seccomp_order();
}

static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
	for (unsigned int i = 0; i < len; ++i) {
		switch (filter[i].code) {
		case BPF_LD | BPF_W | BPF_ABS:
			switch (filter[i].k) {
			case offsetof(struct seccomp_data, arch):
				error_msg("STMT(BPF_LDWABS, data->arch)");
				break;
			case offsetof(struct seccomp_data, nr):
				error_msg("STMT(BPF_LDWABS, data->nr)");
				break;
			default:
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  filter[i].k);
			}
			break;
		case BPF_RET | BPF_K:
			switch (filter[i].k) {
			case SECCOMP_RET_TRACE:
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
				break;
			case SECCOMP_RET_ALLOW:
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
				break;
			default:
				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
			}
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
				  filter[i].jt, filter[i].jf, filter[i].k);
		}
	}
}

static void
replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
			 unsigned char jmp_trace)
{
	switch (*jmp_offset) {
	case JMP_PLACEHOLDER_NEXT:
		*jmp_offset = jmp_next;
		break;
	case JMP_PLACEHOLDER_TRACE:
		*jmp_offset = jmp_trace;
		break;
	default:
		break;
	}
}

static unsigned short
bpf_syscalls_cmp(struct sock_filter *filter,
		 unsigned int lower, unsigned int upper)
{
	if (lower + 1 == upper) {
		/* if (nr == lower) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
			     JMP_PLACEHOLDER_TRACE, 0);
		return 1;
	} else {
		/* if (nr >= lower && nr < upper) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
			     JMP_PLACEHOLDER_TRACE);
		return 2;
	}
}

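/*
 * Illustrative sketch of the instructions bpf_syscalls_cmp() emits.
 * For a single traced syscall, say nr 59:
 *	JEQ 59, jt = TRACE placeholder, jf = 0 (fall through);
 * for a traced range [100, 105):
 *	JGE 100, jt = 0 (fall through), jf = 1 (skip next);
 *	JGE 105, jt = 0 (fall through), jf = TRACE placeholder;
 * where the TRACE placeholders are resolved to real offsets once the whole
 * personality section has been generated.
 */
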
static unsigned short
init_sock_filter(struct sock_filter *filter)
{
	/*
	 * Generated program looks like:
	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
	 *	if (nr == 59)
	 *		return SECCOMP_RET_TRACE;
	 *	if (nr >= 321 && nr <= 323)
	 *		return SECCOMP_RET_TRACE;
	 *	...
	 *	return SECCOMP_RET_ALLOW;
	 * }
	 * if (arch == AUDIT_ARCH_A) {
	 *	...
	 * }
	 * if (arch == AUDIT_ARCH_B) {
	 *	...
	 * }
	 * return SECCOMP_RET_TRACE;
	 */
	unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
# endif

	/*
	 * Personalities are iterated in reverse order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and
	 * third personalities have the same arch identifier.  The third can
	 * be distinguished based on its associated syscall flag, so we check
	 * it first.  The only drawback here is that the first personality is
	 * more common, which may make the BPF program slower to match
	 * syscalls on average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
		unsigned int lower = UINT_MAX;
		unsigned short start = pos, end;

# if SUPPORTED_PERSONALITIES > 1
		/* if (arch != audit_arch_vec[p].arch) goto next; */
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			/* if (nr < audit_arch_vec[p].flag) goto next; */
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
		}
# endif

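		/*
		 * The reload of data->arch in the flag block above is needed
		 * because the accumulator holds the syscall number at that
		 * point, while the next personality section starts with a
		 * comparison against .arch.
		 */
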
		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						i | audit_arch_vec[p].flag);
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX)
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						nsyscall_vec[p]
						| audit_arch_vec[p].flag);

		/* if (nr >= max_nr) return RET_TRACE; */
		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		end = pos;

		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

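	/*
	 * Offset arithmetic for the loop above: classic BPF jumps are
	 * relative, the destination being i + 1 + offset.  At this point pos
	 * indexes one past the section, whose last two instructions are
	 * RET_ALLOW at pos - 2 and RET_TRACE at pos - 1.  Hence jmp_next =
	 * pos - i - 1 lands on the first instruction after the section (the
	 * next arch check, or the final RET_TRACE), and jmp_trace =
	 * pos - i - 2 lands on the section's RET_TRACE.
	 */
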
# if SUPPORTED_PERSONALITIES > 1
	/* Jumps conditioned on .arch default to this RET_TRACE. */
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

	if (debug_flag)
		dump_seccomp_bpf(filter, pos);

	return pos;
}

void
init_seccomp_filter(void)
{
	struct sock_filter filter[BPF_MAXINSNS];
	unsigned short len;

	len = init_sock_filter(filter);

	struct sock_fprog prog = {
		.len = len,
		.filter = filter
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

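	/*
	 * PR_SET_NO_NEW_PRIVS is what allows an unprivileged process (one
	 * without CAP_SYS_ADMIN) to install a seccomp filter; without it,
	 * the PR_SET_SECCOMP call below would fail with EACCES.
	 */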
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}

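/*
 * Reading of the operator below: while the tracee is inside a syscall whose
 * exit stop strace still has to observe (the syscall is traced by the
 * seccomp filter), it is restarted with PTRACE_SYSCALL; otherwise it runs
 * freely under PTRACE_CONT until the next seccomp stop.
 */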
unsigned int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
	    && traced_by_seccomp(tcp->scno, current_personality))
		return PTRACE_SYSCALL;
	return PTRACE_CONT;
}

#else /* !HAVE_LINUX_SECCOMP_H */

# warning <linux/seccomp.h> is not available, seccomp filtering is not supported

static void
check_seccomp_filter_properties(void)
{
	seccomp_filtering = false;
}

void
init_seccomp_filter(void)
{
}

unsigned int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	return PTRACE_SYSCALL;
}

#endif /* HAVE_LINUX_SECCOMP_H */

void
check_seccomp_filter(void)
{
	/* Let's avoid enabling seccomp if all syscalls are traced. */
	seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
						   SUPPORTED_PERSONALITIES);
	if (!seccomp_filtering) {
		error_msg("Seccomp filter is requested "
			  "but there are no syscalls to filter.  "
			  "See -e trace to filter syscalls.");
		return;
	}

	check_seccomp_filter_properties();

	if (!seccomp_filtering)
		error_msg("seccomp filter is requested but unavailable");
}