2 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4 * Copyright (c) 2019 The strace developers.
7 * SPDX-License-Identifier: LGPL-2.1-or-later
14 #include <sys/prctl.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
19 #include "filter_seccomp.h"
20 #include "number_set.h"
24 bool seccomp_filtering;
25 bool seccomp_before_sysentry;
27 #ifdef HAVE_LINUX_SECCOMP_H
29 # include <linux/seccomp.h>
32 # define BPF_MAXINSNS 4096
35 # define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
36 # define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
38 # define SET_BPF(filter, code, jt, jf, k) \
39 (*(filter) = (struct sock_filter) { code, jt, jf, k })
41 # define SET_BPF_STMT(filter, code, k) \
42 SET_BPF(filter, code, 0, 0, k)
44 # define SET_BPF_JUMP(filter, code, k, jt, jf) \
45 SET_BPF(filter, BPF_JMP | code, jt, jf, k)
52 static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
53 # if SUPPORTED_PERSONALITIES > 1
54 PERSONALITY0_AUDIT_ARCH,
55 PERSONALITY1_AUDIT_ARCH,
56 # if SUPPORTED_PERSONALITIES > 2
57 PERSONALITY2_AUDIT_ARCH,
# ifdef ENABLE_COVERAGE_GCOV
/* Flush coverage counters before the probe child _exit()s (see below). */
extern void __gcov_flush(void);
# endif
66 static void ATTRIBUTE_NORETURN
67 check_seccomp_order_do_child(void)
69 static const struct sock_filter filter[] = {
70 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
71 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
72 offsetof(struct seccomp_data, nr)),
73 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
74 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
75 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
77 static const struct sock_fprog prog = {
78 .len = ARRAY_SIZE(filter),
79 .filter = (struct sock_filter *) filter
82 /* Get everything ready before PTRACE_TRACEME. */
83 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
84 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
85 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
86 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
89 if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
90 /* Exit with a nonzero exit status. */
91 perror_func_msg_and_die("PTRACE_TRACEME");
94 # ifdef ENABLE_COVERAGE_GCOV
104 check_seccomp_order_tracer(int pid)
108 for (step = 0; ; ++step) {
112 long rc = waitpid(pid, &status, 0);
113 if (rc < 0 && errno == EINTR)
118 perror_func_msg("#%d: unexpected wait result %ld",
123 if (WIFEXITED(status)) {
124 /* The tracee is no more. */
127 int exitstatus = WEXITSTATUS(status);
128 if (step == 5 && exitstatus == 0) {
129 seccomp_filtering = true;
131 error_func_msg("#%d: unexpected exit status %u",
137 if (WIFSIGNALED(status)) {
138 /* The tracee is no more. */
141 error_func_msg("#%d: unexpected signal %u",
142 step, WTERMSIG(status));
146 if (!WIFSTOPPED(status)) {
148 error_func_msg("#%d: unexpected wait status %#x",
153 unsigned int event = (unsigned int) status >> 16;
155 switch (WSTOPSIG(status)) {
158 error_func_msg("#%d: unexpected signal stop",
162 if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
163 PTRACE_O_TRACESYSGOOD|
164 PTRACE_O_TRACESECCOMP) < 0) {
165 perror_func_msg("PTRACE_SETOPTIONS");
171 if (event != PTRACE_EVENT_SECCOMP) {
172 error_func_msg("#%d: unexpected trap %#x",
178 case 1: /* Seccomp stop before entering gettid. */
179 seccomp_before_sysentry = true;
181 case 2: /* Seccomp stop after entering gettid. */
182 if (!seccomp_before_sysentry)
184 ATTRIBUTE_FALLTHROUGH;
186 error_func_msg("#%d: unexpected seccomp stop",
194 case 3: /* Exiting gettid. */
195 case 4: /* Entering exit_group. */
197 case 1: /* Entering gettid before seccomp stop. */
198 seccomp_before_sysentry = false;
200 case 2: /* Entering gettid after seccomp stop. */
201 if (seccomp_before_sysentry)
203 ATTRIBUTE_FALLTHROUGH;
205 error_func_msg("#%d: unexpected syscall stop",
212 error_func_msg("#%d: unexpected stop signal %#x",
213 step, WSTOPSIG(status));
217 if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
219 perror_func_msg("#%d: PTRACE_SYSCALL", step);
228 check_seccomp_order(void)
230 seccomp_filtering = false;
234 perror_func_msg("fork");
239 check_seccomp_order_do_child();
241 pid = check_seccomp_order_tracer(pid);
245 long rc = waitpid(pid, NULL, 0);
246 if (rc < 0 && errno == EINTR)
254 traced_by_seccomp(unsigned int scno, unsigned int p)
256 if (is_number_in_set_array(scno, trace_set, p)
257 || sysent_vec[p][scno].sys_flags
258 & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
264 check_bpf_program_size(void)
266 unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
269 * Implements a simplified form of init_sock_filter()'s bytecode
270 * generation algorithm, to count the number of instructions that will
273 for (int p = SUPPORTED_PERSONALITIES - 1;
274 p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
275 unsigned int nb_insns_personality = 0;
276 unsigned int lower = UINT_MAX;
278 nb_insns_personality++;
279 # if SUPPORTED_PERSONALITIES > 1
280 nb_insns_personality++;
281 if (audit_arch_vec[p].flag)
282 nb_insns_personality += 3;
285 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
286 if (traced_by_seccomp(i, p)) {
287 if (lower == UINT_MAX)
291 if (lower == UINT_MAX)
294 nb_insns_personality++;
296 nb_insns_personality += 2;
299 if (lower != UINT_MAX) {
300 if (lower + 1 == nsyscall_vec[p])
301 nb_insns_personality++;
303 nb_insns_personality += 2;
306 nb_insns_personality += 3;
309 * Within generated BPF programs, the origin and destination of
310 * jumps are always in the same personality section. The
311 * largest jump is therefore the jump from the first
312 * instruction of the section to the last, to skip the
313 * personality and try to compare .arch to the next
315 * If we have a personality section with more than 255
316 * instructions, the jump offset will overflow. Such program
317 * is unlikely to happen, so we simply disable seccomp filter
320 if (nb_insns_personality > UCHAR_MAX) {
321 debug_msg("seccomp filter disabled due to "
322 "possibility of overflow");
323 seccomp_filtering = false;
326 nb_insns += nb_insns_personality;
329 # if SUPPORTED_PERSONALITIES > 1
333 if (nb_insns > BPF_MAXINSNS) {
334 debug_msg("seccomp filter disabled due to BPF program being "
335 "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
336 seccomp_filtering = false;
341 check_seccomp_filter_properties(void)
344 seccomp_filtering = false;
348 int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
349 seccomp_filtering = rc < 0 && errno != EINVAL;
350 if (!seccomp_filtering)
351 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
353 if (seccomp_filtering)
354 check_bpf_program_size();
355 if (seccomp_filtering)
356 check_seccomp_order();
/*
 * Debug helper: print a human-readable disassembly of the generated BPF
 * program, one instruction per line, via error_msg().
 */
static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
	for (unsigned int i = 0; i < len; ++i) {
		switch (filter[i].code) {
		case BPF_LD | BPF_W | BPF_ABS:
			switch (filter[i].k) {
			case offsetof(struct seccomp_data, arch):
				error_msg("STMT(BPF_LDWABS, data->arch)");
				break;
			case offsetof(struct seccomp_data, nr):
				error_msg("STMT(BPF_LDWABS, data->nr)");
				break;
			default:
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  filter[i].k);
			}
			break;
		case BPF_RET | BPF_K:
			switch (filter[i].k) {
			case SECCOMP_RET_TRACE:
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
				break;
			case SECCOMP_RET_ALLOW:
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
				break;
			default:
				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
			}
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
				  filter[i].jt, filter[i].jf, filter[i].k);
		}
	}
}
410 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
411 unsigned char jmp_trace)
413 switch (*jmp_offset) {
414 case JMP_PLACEHOLDER_NEXT:
415 *jmp_offset = jmp_next;
417 case JMP_PLACEHOLDER_TRACE:
418 *jmp_offset = jmp_trace;
425 static unsigned short
426 bpf_syscalls_cmp(struct sock_filter *filter,
427 unsigned int lower, unsigned int upper)
429 if (lower + 1 == upper) {
430 /* if (nr == lower) return RET_TRACE; */
431 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
432 JMP_PLACEHOLDER_TRACE, 0);
435 /* if (nr >= lower && nr < upper) return RET_TRACE; */
436 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
437 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
438 JMP_PLACEHOLDER_TRACE);
443 static unsigned short
444 init_sock_filter(struct sock_filter *filter)
447 * Generated program looks like:
448 * if (arch == AUDIT_ARCH_A && nr >= flag) {
450 * return SECCOMP_RET_TRACE;
451 * if (nr >= 321 && nr <= 323)
452 * return SECCOMP_RET_TRACE;
454 * return SECCOMP_RET_ALLOW;
456 * if (arch == AUDIT_ARCH_A) {
459 * if (arch == AUDIT_ARCH_B) {
462 * return SECCOMP_RET_TRACE;
464 unsigned short pos = 0;
466 # if SUPPORTED_PERSONALITIES > 1
467 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
468 offsetof(struct seccomp_data, arch));
472 * Personalities are iterated in reverse-order in the BPF program so
473 * that the x86 case is naturally handled. On x86, the first and third
474 * personalities have the same arch identifier. The third can be
475 * distinguished based on its associated syscall flag, so we check it
476 * first. The only drawback here is that the first personality is more
477 * common, which may make the BPF program slower to match syscalls on
480 for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
481 unsigned int lower = UINT_MAX;
482 unsigned short start = pos, end;
484 # if SUPPORTED_PERSONALITIES > 1
485 /* if (arch != audit_arch_vec[p].arch) goto next; */
486 SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
487 audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
489 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
490 offsetof(struct seccomp_data, nr));
492 # if SUPPORTED_PERSONALITIES > 1
493 if (audit_arch_vec[p].flag) {
494 /* if (nr < audit_arch_vec[p].flag) goto next; */
495 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
496 audit_arch_vec[p].flag, 2, 0);
497 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
498 offsetof(struct seccomp_data, arch));
499 SET_BPF_JUMP(&filter[pos++], BPF_JA,
500 JMP_PLACEHOLDER_NEXT, 0, 0);
504 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
505 if (traced_by_seccomp(i, p)) {
506 if (lower == UINT_MAX)
510 if (lower == UINT_MAX)
512 pos += bpf_syscalls_cmp(filter + pos,
513 lower | audit_arch_vec[p].flag,
514 i | audit_arch_vec[p].flag);
517 if (lower != UINT_MAX)
518 pos += bpf_syscalls_cmp(filter + pos,
519 lower | audit_arch_vec[p].flag,
521 | audit_arch_vec[p].flag);
524 /* if (nr >= max_nr) return RET_TRACE; */
525 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
526 nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
528 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
530 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
533 for (unsigned int i = start; i < end; ++i) {
534 if (BPF_CLASS(filter[i].code) != BPF_JMP)
536 unsigned char jmp_next = pos - i - 1;
537 unsigned char jmp_trace = pos - i - 2;
538 replace_jmp_placeholders(&filter[i].jt, jmp_next,
540 replace_jmp_placeholders(&filter[i].jf, jmp_next,
542 if (BPF_OP(filter[i].code) == BPF_JA)
543 filter[i].k = (unsigned int) jmp_next;
547 # if SUPPORTED_PERSONALITIES > 1
548 /* Jumps conditioned on .arch default to this RET_TRACE. */
549 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
553 dump_seccomp_bpf(filter, pos);
/*
 * Generate and install the seccomp BPF program for the current tracee.
 * Called in the child after fork, before exec; dies on failure.
 */
void
init_seccomp_filter(void)
{
	struct sock_filter filter[BPF_MAXINSNS];
	unsigned short len;

	len = init_sock_filter(filter);

	struct sock_fprog prog = {
		.len = len,
		.filter = filter
	};

	/* Required to install a filter without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}
579 seccomp_filter_restart_operator(const struct tcb *tcp)
581 if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
582 && traced_by_seccomp(tcp->scno, current_personality))
583 return PTRACE_SYSCALL;
587 #else /* !HAVE_LINUX_SECCOMP_H */
589 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
592 check_seccomp_filter_properties(void)
594 seccomp_filtering = false;
598 init_seccomp_filter(void)
603 seccomp_filter_restart_operator(const struct tcb *tcp)
605 return PTRACE_SYSCALL;
611 check_seccomp_filter(void)
613 /* Let's avoid enabling seccomp if all syscalls are traced. */
614 seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
615 SUPPORTED_PERSONALITIES);
616 if (!seccomp_filtering) {
617 error_msg("Seccomp filter is requested "
618 "but there are no syscalls to filter. "
619 "See -e trace to filter syscalls.");
623 check_seccomp_filter_properties();
625 if (!seccomp_filtering)
626 error_msg("seccomp filter is requested but unavailable");