2 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4 * Copyright (c) 2019 The strace developers.
7 * SPDX-License-Identifier: LGPL-2.1-or-later
14 #include <sys/prctl.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
19 #include "filter_seccomp.h"
20 #include "number_set.h"
/* Whether syscall trapping is delegated to a seccomp BPF filter; set by the
 * capability probes below and cleared whenever a limit check fails. */
24 bool seccomp_filtering;
/* True when the kernel reports the seccomp stop before the syscall-entry
 * stop (ordering detected empirically by check_seccomp_order()). */
25 bool seccomp_before_sysentry;
27 #ifdef HAVE_LINUX_SECCOMP_H
29 # include <linux/seccomp.h>
/* Classic-BPF program size limit; NOTE(review): presumably wrapped in an
 * #ifndef guard in the full file in case <linux/filter.h> already defines it. */
32 # define BPF_MAXINSNS 4096
/* Sentinel jump offsets stored in jt/jf while the program is being generated;
 * patched to real offsets by replace_jmp_placeholders() once the position of
 * the "next personality section" and "return RET_TRACE" targets is known. */
35 # define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
36 # define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
/* Write one struct sock_filter instruction in place. */
38 # define SET_BPF(filter, code, jt, jf, k) \
39 (*(filter) = (struct sock_filter) { code, jt, jf, k })
/* Non-jump statement: both jump offsets are zero. */
41 # define SET_BPF_STMT(filter, code, k) \
42 SET_BPF(filter, code, 0, 0, k)
/* Jump instruction: BPF_JMP class is OR-ed in automatically. */
44 # define SET_BPF_JUMP(filter, code, k, jt, jf) \
45 SET_BPF(filter, BPF_JMP | code, jt, jf, k)
/*
 * AUDIT_ARCH identifier plus an optional syscall-number flag bit (.flag,
 * used below to distinguish personalities sharing one arch id, e.g. x86's
 * __X32_SYSCALL_BIT-style marker) for each supported personality, indexed
 * by personality number.  NOTE(review): the PERSONALITY*_AUDIT_ARCH
 * initializers presumably come from per-arch headers — confirm in the
 * full tree.
 */
52 static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
53 # if SUPPORTED_PERSONALITIES > 1
54 PERSONALITY0_AUDIT_ARCH,
55 PERSONALITY1_AUDIT_ARCH,
56 # if SUPPORTED_PERSONALITIES > 2
57 PERSONALITY2_AUDIT_ARCH,
64 # ifdef ENABLE_COVERAGE_GCOV
65 extern void __gcov_flush(void);
68 static void ATTRIBUTE_NORETURN
69 check_seccomp_order_do_child(void)
71 static const struct sock_filter filter[] = {
72 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
73 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
74 offsetof(struct seccomp_data, nr)),
75 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
76 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
77 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
79 static const struct sock_fprog prog = {
80 .len = ARRAY_SIZE(filter),
81 .filter = (struct sock_filter *) filter
84 /* Get everything ready before PTRACE_TRACEME. */
85 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
86 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
87 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
88 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
91 if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
92 /* Exit with a nonzero exit status. */
93 perror_func_msg_and_die("PTRACE_TRACEME");
96 # ifdef ENABLE_COVERAGE_GCOV
101 syscall(__NR_gettid);
/*
 * Tracer side of the seccomp-ordering probe.  Walks the child created by
 * check_seccomp_order() through a fixed sequence of ptrace stops, counting
 * them in `step`, and records the outcome in two globals:
 *   - seccomp_filtering: the child exited cleanly after the expected number
 *     of stops (step == 5, exit status 0);
 *   - seccomp_before_sysentry: whether PTRACE_EVENT_SECCOMP arrived before
 *     or after the syscall-entry stop for gettid.
 * The caller assigns the return value back to pid, so the function returns
 * a pid (0 once the tracee is gone, judging by the reaping loop in
 * check_seccomp_order()) — NOTE(review): signature line elided, confirm.
 */
106 check_seccomp_order_tracer(int pid)
110 for (step = 0; ; ++step) {
/* Retry wait on signal interruption; any other failure aborts the probe. */
114 long rc = waitpid(pid, &status, 0);
115 if (rc < 0 && errno == EINTR)
120 perror_func_msg("#%d: unexpected wait result %ld",
125 if (WIFEXITED(status)) {
126 /* The tracee is no more. */
/* Probe succeeds only on a clean exit after exactly 5 observed stops. */
129 int exitstatus = WEXITSTATUS(status);
130 if (step == 5 && exitstatus == 0) {
131 seccomp_filtering = true;
133 error_func_msg("#%d: unexpected exit status %u",
139 if (WIFSIGNALED(status)) {
140 /* The tracee is no more. */
143 error_func_msg("#%d: unexpected signal %u",
144 step, WTERMSIG(status));
148 if (!WIFSTOPPED(status)) {
150 error_func_msg("#%d: unexpected wait status %#x",
/* High 16 bits of the wait status carry the ptrace event number. */
155 unsigned int event = (unsigned int) status >> 16;
157 switch (WSTOPSIG(status)) {
160 error_func_msg("#%d: unexpected signal stop",
/* First stop (the child's SIGSTOP): enable sysgood + seccomp events. */
164 if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
165 PTRACE_O_TRACESYSGOOD|
166 PTRACE_O_TRACESECCOMP) < 0) {
167 perror_func_msg("PTRACE_SETOPTIONS");
173 if (event != PTRACE_EVENT_SECCOMP) {
174 error_func_msg("#%d: unexpected trap %#x",
/* SIGTRAP branch: a PTRACE_EVENT_SECCOMP stop; its step number tells
 * whether it preceded (step 1) or followed (step 2) syscall entry. */
180 case 1: /* Seccomp stop before entering gettid. */
181 seccomp_before_sysentry = true;
183 case 2: /* Seccomp stop after entering gettid. */
184 if (!seccomp_before_sysentry)
186 ATTRIBUTE_FALLTHROUGH;
188 error_func_msg("#%d: unexpected seccomp stop",
/* Syscall stops (SIGTRAP|0x80 thanks to TRACESYSGOOD): step numbers
 * must mirror the seccomp-stop ordering seen above. */
196 case 3: /* Exiting gettid. */
197 case 4: /* Entering exit_group. */
199 case 1: /* Entering gettid before seccomp stop. */
200 seccomp_before_sysentry = false;
202 case 2: /* Entering gettid after seccomp stop. */
203 if (seccomp_before_sysentry)
205 ATTRIBUTE_FALLTHROUGH;
207 error_func_msg("#%d: unexpected syscall stop",
214 error_func_msg("#%d: unexpected stop signal %#x",
215 step, WSTOPSIG(status));
/* Restart the tracee up to the next syscall-related stop. */
219 if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
221 perror_func_msg("#%d: PTRACE_SYSCALL", step);
228 # endif /* HAVE_FORK */
/*
 * Run the seccomp-ordering probe: fork a child that installs the probe
 * filter (check_seccomp_order_do_child) and trace it to completion
 * (check_seccomp_order_tracer), then reap whatever is left of the child.
 * seccomp_filtering is pessimistically reset before probing and only set
 * back to true by the tracer on full success.
 */
231 check_seccomp_order(void)
233 seccomp_filtering = false;
235 /* NOMMU provides no forks necessary for the test. */
239 perror_func_msg("fork");
/* In the child: never returns. */
244 check_seccomp_order_do_child();
/* In the parent: trace the child; a nonzero return means it still
 * needs to be reaped below. */
246 pid = check_seccomp_order_tracer(pid);
250 long rc = waitpid(pid, NULL, 0);
251 if (rc < 0 && errno == EINTR)
256 # endif /* HAVE_FORK */
/*
 * Whether syscall number scno of personality p must be reported to strace
 * by the seccomp filter: either the user asked for it (-e trace set), or
 * it belongs to a class that must always be visible (indirect subcalls,
 * syscalls traced by default under seccomp).
 */
260 traced_by_seccomp(unsigned int scno, unsigned int p)
262 if (is_number_in_set_array(scno, trace_set, p)
263 || sysent_vec[p][scno].sys_flags
264 & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
/*
 * Dry-run of init_sock_filter(): count how many BPF instructions the real
 * generator would emit, without emitting any.  Disables seccomp_filtering
 * when either a single personality section exceeds UCHAR_MAX instructions
 * (8-bit jump offsets would overflow) or the whole program exceeds
 * BPF_MAXINSNS.  Must be kept in sync with init_sock_filter().
 */
270 check_bpf_program_size(void)
/* One extra leading instruction (the arch load) when multi-personality. */
272 unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
275 * Implements a simplified form of init_sock_filter()'s bytecode
276 * generation algorithm, to count the number of instructions that will
279 for (int p = SUPPORTED_PERSONALITIES - 1;
280 p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
281 unsigned int nb_insns_personality = 0;
/* lower tracks the start of the current run of traced syscalls;
 * UINT_MAX means "not inside a run". */
282 unsigned int lower = UINT_MAX;
284 nb_insns_personality++;
285 # if SUPPORTED_PERSONALITIES > 1
/* Arch-compare jump, plus 3 more when the personality needs the
 * flag-bit disambiguation sequence. */
286 nb_insns_personality++;
287 if (audit_arch_vec[p].flag)
288 nb_insns_personality += 3;
/* Mirror of the run-length encoding loop in init_sock_filter():
 * 1 instruction for a single-syscall run, 2 for a range. */
291 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
292 if (traced_by_seccomp(i, p)) {
293 if (lower == UINT_MAX)
297 if (lower == UINT_MAX)
300 nb_insns_personality++;
302 nb_insns_personality += 2;
/* Close a run still open at the end of the table. */
305 if (lower != UINT_MAX) {
306 if (lower + 1 == nsyscall_vec[p])
307 nb_insns_personality++;
309 nb_insns_personality += 2;
/* Trailing "nr >= max_nr" check plus the two RET instructions. */
312 nb_insns_personality += 3;
315 * Within generated BPF programs, the origin and destination of
316 * jumps are always in the same personality section. The
317 * largest jump is therefore the jump from the first
318 * instruction of the section to the last, to skip the
319 * personality and try to compare .arch to the next
321 * If we have a personality section with more than 255
322 * instructions, the jump offset will overflow. Such program
323 * is unlikely to happen, so we simply disable seccomp filter
326 if (nb_insns_personality > UCHAR_MAX) {
327 debug_msg("seccomp filter disabled due to "
328 "possibility of overflow");
329 seccomp_filtering = false;
332 nb_insns += nb_insns_personality;
335 # if SUPPORTED_PERSONALITIES > 1
339 if (nb_insns > BPF_MAXINSNS) {
340 debug_msg("seccomp filter disabled due to BPF program being "
341 "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
342 seccomp_filtering = false;
/*
 * Probe kernel support for SECCOMP_MODE_FILTER and validate the would-be
 * filter.  prctl with a NULL filter pointer must always fail: EINVAL means
 * the filter mode (or prctl op) is unsupported, any other errno (typically
 * EFAULT) means the mode exists.  Only if supported do we check program
 * size limits and then the kernel's seccomp/sysentry stop ordering; each
 * check may clear seccomp_filtering, short-circuiting the next.
 */
347 check_seccomp_filter_properties(void)
349 int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
350 seccomp_filtering = rc < 0 && errno != EINVAL;
351 if (!seccomp_filtering)
352 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
354 if (seccomp_filtering)
355 check_bpf_program_size();
356 if (seccomp_filtering)
357 check_seccomp_order();
/*
 * Debugging aid: disassemble the generated BPF program to the error
 * output, one instruction per line, recognizing the handful of opcodes
 * init_sock_filter() emits and falling back to a raw dump for anything
 * else.
 */
361 dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
363 for (unsigned int i = 0; i < len; ++i) {
364 switch (filter[i].code) {
/* Loads of seccomp_data fields: print which field by its offset. */
365 case BPF_LD | BPF_W | BPF_ABS:
366 switch (filter[i].k) {
367 case offsetof(struct seccomp_data, arch):
368 error_msg("STMT(BPF_LDWABS, data->arch)");
370 case offsetof(struct seccomp_data, nr):
371 error_msg("STMT(BPF_LDWABS, data->nr)");
374 error_msg("STMT(BPF_LDWABS, 0x%x)",
378 case BPF_RET | BPF_K:
379 switch (filter[i].k) {
380 case SECCOMP_RET_TRACE:
381 error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
383 case SECCOMP_RET_ALLOW:
384 error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
387 error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
390 case BPF_JMP | BPF_JEQ | BPF_K:
391 error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
392 filter[i].jt, filter[i].jf,
395 case BPF_JMP | BPF_JGE | BPF_K:
396 error_msg("JUMP(BPF_JGE, %u, %u, %u)",
397 filter[i].jt, filter[i].jf,
400 case BPF_JMP | BPF_JA:
401 error_msg("JUMP(BPF_JA, %u)", filter[i].k);
/* Unrecognized opcode: raw code/jt/jf/k dump. */
404 error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
405 filter[i].jt, filter[i].jf, filter[i].k);
/*
 * Patch one 8-bit jump-offset slot (jt or jf): translate the two sentinel
 * values written during generation into the now-known concrete offsets of
 * the "next personality section" and "return RET_TRACE" targets.  Real
 * (non-placeholder) offsets are left untouched.
 */
411 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
412 unsigned char jmp_trace)
414 switch (*jmp_offset) {
415 case JMP_PLACEHOLDER_NEXT:
416 *jmp_offset = jmp_next;
418 case JMP_PLACEHOLDER_TRACE:
419 *jmp_offset = jmp_trace;
/*
 * Emit the instruction(s) matching the half-open syscall-number range
 * [lower, upper): a single JEQ when the range holds one number, otherwise
 * a JGE pair.  Jump targets use JMP_PLACEHOLDER_TRACE, patched later.
 * Returns the number of instructions written (callers advance pos by it).
 */
426 static unsigned short
427 bpf_syscalls_cmp(struct sock_filter *filter,
428 unsigned int lower, unsigned int upper)
430 if (lower + 1 == upper) {
431 /* if (nr == lower) return RET_TRACE; */
432 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
433 JMP_PLACEHOLDER_TRACE, 0);
436 /* if (nr >= lower && nr < upper) return RET_TRACE; */
437 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
438 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
439 JMP_PLACEHOLDER_TRACE);
/*
 * Generate the seccomp BPF program into `filter` (caller provides at least
 * BPF_MAXINSNS slots) and return its length in instructions.  One section
 * is emitted per personality, in reverse order; traced syscalls are
 * run-length encoded into JEQ/JGE comparisons whose jump offsets are
 * first written as placeholders and patched in a second pass.  Any change
 * to the emission logic must be mirrored in check_bpf_program_size().
 */
444 static unsigned short
445 init_sock_filter(struct sock_filter *filter)
448 * Generated program looks like:
449 * if (arch == AUDIT_ARCH_A && nr >= flag) {
451 * return SECCOMP_RET_TRACE;
452 * if (nr >= 321 && nr <= 323)
453 * return SECCOMP_RET_TRACE;
455 * return SECCOMP_RET_ALLOW;
457 * if (arch == AUDIT_ARCH_A) {
460 * if (arch == AUDIT_ARCH_B) {
463 * return SECCOMP_RET_TRACE;
465 unsigned short pos = 0;
467 # if SUPPORTED_PERSONALITIES > 1
/* Load data->arch once up front; each section compares against it. */
468 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
469 offsetof(struct seccomp_data, arch));
473 * Personalities are iterated in reverse-order in the BPF program so
474 * that the x86 case is naturally handled. On x86, the first and third
475 * personalities have the same arch identifier. The third can be
476 * distinguished based on its associated syscall flag, so we check it
477 * first. The only drawback here is that the first personality is more
478 * common, which may make the BPF program slower to match syscalls on
481 for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
/* lower == UINT_MAX means "not inside a run of traced syscalls". */
482 unsigned int lower = UINT_MAX;
483 unsigned short start = pos, end;
485 # if SUPPORTED_PERSONALITIES > 1
486 /* if (arch != audit_arch_vec[p].arch) goto next; */
487 SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
488 audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
490 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
491 offsetof(struct seccomp_data, nr));
493 # if SUPPORTED_PERSONALITIES > 1
/* Shared-arch disambiguation: when this personality's syscalls carry
 * a flag bit, numbers below the flag belong to the other personality,
 * so reload data->arch and skip to the next section. */
494 if (audit_arch_vec[p].flag) {
495 /* if (nr < audit_arch_vec[p].flag) goto next; */
496 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
497 audit_arch_vec[p].flag, 2, 0);
498 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
499 offsetof(struct seccomp_data, arch));
500 SET_BPF_JUMP(&filter[pos++], BPF_JA,
501 JMP_PLACEHOLDER_NEXT, 0, 0);
/* Run-length encode traced syscalls into [lower, i) ranges. */
505 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
506 if (traced_by_seccomp(i, p)) {
507 if (lower == UINT_MAX)
511 if (lower == UINT_MAX)
513 pos += bpf_syscalls_cmp(filter + pos,
514 lower | audit_arch_vec[p].flag,
515 i | audit_arch_vec[p].flag);
/* Flush a run still open at the end of the syscall table. */
518 if (lower != UINT_MAX)
519 pos += bpf_syscalls_cmp(filter + pos,
520 lower | audit_arch_vec[p].flag,
522 | audit_arch_vec[p].flag);
525 /* if (nr >= max_nr) return RET_TRACE; */
526 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
527 nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
529 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
531 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
/* Second pass over this section: resolve placeholder jt/jf offsets
 * now that the section's end (pos) is known. */
534 for (unsigned int i = start; i < end; ++i) {
535 if (BPF_CLASS(filter[i].code) != BPF_JMP)
537 unsigned char jmp_next = pos - i - 1;
538 unsigned char jmp_trace = pos - i - 2;
539 replace_jmp_placeholders(&filter[i].jt, jmp_next,
541 replace_jmp_placeholders(&filter[i].jf, jmp_next,
/* BPF_JA keeps its (32-bit) target in k, not jt/jf. */
543 if (BPF_OP(filter[i].code) == BPF_JA)
544 filter[i].k = (unsigned int) jmp_next;
548 # if SUPPORTED_PERSONALITIES > 1
549 /* Jumps conditioned on .arch default to this RET_TRACE. */
550 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
554 dump_seccomp_bpf(filter, pos);
/*
 * Build the BPF program and install it: PR_SET_NO_NEW_PRIVS is required
 * for an unprivileged process to call PR_SET_SECCOMP.  Both failures are
 * fatal — by this point the capability probes have promised it works.
 */
560 init_seccomp_filter(void)
562 struct sock_filter filter[BPF_MAXINSNS];
565 len = init_sock_filter(filter);
567 struct sock_fprog prog = {
572 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
573 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
575 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
576 perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
/*
 * Pick the ptrace restart request for a tracee: when the tracee is at
 * syscall entry of a seccomp-traced syscall (scno validated against the
 * current personality's table), restart with PTRACE_SYSCALL so the
 * syscall-exit stop is also delivered.  NOTE(review): the non-traced
 * branch is elided here — presumably returns PTRACE_CONT; confirm in the
 * full file.
 */
580 seccomp_filter_restart_operator(const struct tcb *tcp)
582 if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
583 && traced_by_seccomp(tcp->scno, current_personality))
584 return PTRACE_SYSCALL;
588 #else /* !HAVE_LINUX_SECCOMP_H */
590 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
/* Fallback stubs: report seccomp filtering as unavailable and always
 * restart tracees with PTRACE_SYSCALL, preserving the callers' contract. */
593 check_seccomp_filter_properties(void)
595 seccomp_filtering = false;
/* No-op: nothing to install without <linux/seccomp.h>. */
599 init_seccomp_filter(void)
604 seccomp_filter_restart_operator(const struct tcb *tcp)
606 return PTRACE_SYSCALL;
/*
 * Entry point for the user's seccomp-filter request.  Refuse when every
 * syscall is being traced anyway (the filter could suppress nothing),
 * otherwise run the kernel capability probes; warn when they leave
 * seccomp_filtering disabled.
 */
612 check_seccomp_filter(void)
614 /* Let's avoid enabling seccomp if all syscalls are traced. */
615 seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
616 SUPPORTED_PERSONALITIES);
617 if (!seccomp_filtering) {
618 error_msg("Seccomp filter is requested "
619 "but there are no syscalls to filter. "
620 "See -e trace to filter syscalls.");
624 check_seccomp_filter_properties();
626 if (!seccomp_filtering)
627 error_msg("seccomp filter is requested but unavailable");