granicus.if.org Git - strace/commitdiff
Introduce seccomp-assisted syscall filtering
authorChen Jingpiao <chenjingpiao@gmail.com>
Thu, 3 May 2018 13:00:38 +0000 (21:00 +0800)
committerDmitry V. Levin <ldv@altlinux.org>
Wed, 25 Sep 2019 01:02:03 +0000 (01:02 +0000)
With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the --seccomp-bpf
option.  Kernel support is first checked with check_seccomp_filter(),
which also ensures the BPF program derived from the syscalls to filter
is not larger than the kernel's limit.

The --seccomp-bpf option implies -f, but a warning is emitted if -f is not
explicitly specified.  Since a task's children inherit its seccomp
filters, we want to ensure all children are also traced to avoid their
syscalls failing with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

Fork/vfork/clone children of traced processes are marked as not having a
seccomp filter until we receive a first seccomp-stop.  They are therefore
stopped at every syscall entry and exit until that first seccomp-stop.

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved
or replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8
(cf. PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and
filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document --seccomp-bpf option.
(startup_child): Mark process as having seccomp filter.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle --seccomp-bpf option and check that seccomp can be
enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new --seccomp-bpf option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon@gmail.com>
Co-Authored-by: Dmitry V. Levin <ldv@altlinux.org>
16 files changed:
Makefile.am
NEWS
defs.h
filter_seccomp.c [new file with mode: 0644]
filter_seccomp.h [new file with mode: 0644]
linux/aarch64/arch_defs_.h
linux/ia64/arch_defs_.h
linux/powerpc64/arch_defs_.h
linux/s390x/arch_defs_.h
linux/sparc64/arch_defs_.h
linux/tile/arch_defs_.h
linux/x32/arch_defs_.h
linux/x86_64/arch_defs_.h
strace.1.in
strace.c
trace_event.h

index 2dbc806f57f28c149f011fefda5390bea4365d0f..9e0d2ff051a8d34774f8d9c7ea0aa3396bb1cdd3 100644 (file)
@@ -129,6 +129,8 @@ strace_SOURCES =    \
        file_ioctl.c    \
        filter.h        \
        filter_qualify.c \
+       filter_seccomp.c \
+       filter_seccomp.h \
        flock.c         \
        flock.h         \
        fs_x_ioctl.c    \
diff --git a/NEWS b/NEWS
index 867cfdcbe7873ab07dc5ee8c098449df57a5faac..7c1fc0fd021678c26d53cf95c95a61a08a509159 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,8 @@ Noteworthy changes in release ?.? (????-??-??)
 ==============================================
 
 * Improvements
+  * Implemented usage of seccomp-bpf for stopping tracees only for filtered
+    syscalls.  Use --seccomp-bpf option to enable.
   * Implemented decoding of pidfd_open and clone3 syscalls.
   * Enhanced decoding of NETLINK_ROUTE protocol.
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
diff --git a/defs.h b/defs.h
index 7278bc36cc75bdee2775bd6758ada24aaede5e4f..119ed515e4cc3cec46d00269d600fba6fb0000ad 100644 (file)
--- a/defs.h
+++ b/defs.h
@@ -332,6 +332,9 @@ struct tcb {
 # define TCB_DELAYED   0x2000  /* Current syscall has been delayed */
 # define TCB_TAMPERED_NO_FAIL 0x4000   /* We tamper tcb with syscall
                                           that should not fail. */
+# define TCB_SECCOMP_FILTER    0x8000  /* This process has a seccomp filter
+                                        * attached.
+                                        */
 
 /* qualifier flags */
 # define QUAL_TRACE    0x001   /* this system call should be traced */
@@ -358,6 +361,7 @@ struct tcb {
 # define inject_delay_exit(tcp)        ((tcp)->flags & TCB_INJECT_DELAY_EXIT)
 # define syscall_delayed(tcp)  ((tcp)->flags & TCB_DELAYED)
 # define syscall_tampered_nofail(tcp) ((tcp)->flags & TCB_TAMPERED_NO_FAIL)
+# define has_seccomp_filter(tcp)       ((tcp)->flags & TCB_SECCOMP_FILTER)
 
 extern const struct_sysent stub_sysent;
 # define tcp_sysent(tcp) (tcp->s_ent ?: &stub_sysent)
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644 (file)
index 0000000..dd3aa17
--- /dev/null
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
+ * Copyright (c) 2019 The strace developers.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <signal.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+#include "scno.h"
+
+bool seccomp_filtering;
+bool seccomp_before_sysentry;
+
+#ifdef HAVE_LINUX_SECCOMP_H
+
+# include <linux/seccomp.h>
+
+# ifndef BPF_MAXINSNS
+#  define BPF_MAXINSNS 4096
+# endif
+
+# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+# define SET_BPF(filter, code, jt, jf, k) \
+       (*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+# define SET_BPF_STMT(filter, code, k) \
+       SET_BPF(filter, code, 0, 0, k)
+
+# define SET_BPF_JUMP(filter, code, k, jt, jf) \
+       SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+       unsigned int arch;
+       unsigned int flag;
+};
+
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+# if SUPPORTED_PERSONALITIES > 1
+       PERSONALITY0_AUDIT_ARCH,
+       PERSONALITY1_AUDIT_ARCH,
+#  if SUPPORTED_PERSONALITIES > 2
+       PERSONALITY2_AUDIT_ARCH,
+#  endif
+# endif
+};
+
+# ifdef ENABLE_COVERAGE_GCOV
+extern void __gcov_flush(void);
+# endif
+
+static void ATTRIBUTE_NORETURN
+check_seccomp_order_do_child(void)
+{
+       static const struct sock_filter filter[] = {
+               /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
+               BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                        offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+               BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+               BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+       };
+       static const struct sock_fprog prog = {
+               .len = ARRAY_SIZE(filter),
+               .filter = (struct sock_filter *) filter
+       };
+
+       /* Get everything ready before PTRACE_TRACEME.  */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+       int pid = getpid();
+
+       if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
+               /* Exit with a nonzero exit status.  */
+               perror_func_msg_and_die("PTRACE_TRACEME");
+       }
+
+# ifdef ENABLE_COVERAGE_GCOV
+       __gcov_flush();
+# endif
+
+       kill(pid, SIGSTOP);
+       syscall(__NR_gettid);
+       _exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{
+       unsigned int step;
+
+       for (step = 0; ; ++step) {
+               int status;
+
+               for (;;) {
+                       long rc = waitpid(pid, &status, 0);
+                       if (rc < 0 && errno == EINTR)
+                               continue;
+                       if (rc == pid)
+                               break;
+                       /* Cannot happen.  */
+                       perror_func_msg("#%d: unexpected wait result %ld",
+                                       step, rc);
+                       return pid;
+               }
+
+               if (WIFEXITED(status)) {
+                       /* The tracee is no more.  */
+                       pid = 0;
+
+                       int exitstatus = WEXITSTATUS(status);
+                       if (step == 5 && exitstatus == 0) {
+                               seccomp_filtering = true;
+                       } else {
+                               error_func_msg("#%d: unexpected exit status %u",
+                                              step, exitstatus);
+                       }
+                       break;
+               }
+
+               if (WIFSIGNALED(status)) {
+                       /* The tracee is no more.  */
+                       pid = 0;
+
+                       error_func_msg("#%d: unexpected signal %u",
+                                      step, WTERMSIG(status));
+                       break;
+               }
+
+               if (!WIFSTOPPED(status)) {
+                       /* Cannot happen.  */
+                       error_func_msg("#%d: unexpected wait status %#x",
+                                      step, status);
+                       break;
+               }
+
+               unsigned int event = (unsigned int) status >> 16;
+
+               switch (WSTOPSIG(status)) {
+               case SIGSTOP:
+                       if (step != 0) {
+                               error_func_msg("#%d: unexpected signal stop",
+                                              step);
+                               return pid;
+                       }
+                       if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
+                                  PTRACE_O_TRACESYSGOOD|
+                                  PTRACE_O_TRACESECCOMP) < 0) {
+                               perror_func_msg("PTRACE_SETOPTIONS");
+                               return pid;
+                       }
+                       break;
+
+               case SIGTRAP:
+                       if (event != PTRACE_EVENT_SECCOMP) {
+                               error_func_msg("#%d: unexpected trap %#x",
+                                              step, event);
+                               return pid;
+                       }
+
+                       switch (step) {
+                       case 1: /* Seccomp stop before entering gettid.  */
+                               seccomp_before_sysentry = true;
+                               break;
+                       case 2: /* Seccomp stop after entering gettid.  */
+                               if (!seccomp_before_sysentry)
+                                       break;
+                               ATTRIBUTE_FALLTHROUGH;
+                       default:
+                               error_func_msg("#%d: unexpected seccomp stop",
+                                              step);
+                               return pid;
+                       }
+                       break;
+
+               case SIGTRAP | 0x80:
+                       switch (step) {
+                       case 3: /* Exiting gettid.  */
+                       case 4: /* Entering exit_group.  */
+                               break;
+                       case 1: /* Entering gettid before seccomp stop.  */
+                               seccomp_before_sysentry = false;
+                               break;
+                       case 2: /* Entering gettid after seccomp stop.  */
+                               if (seccomp_before_sysentry)
+                                       break;
+                               ATTRIBUTE_FALLTHROUGH;
+                       default:
+                               error_func_msg("#%d: unexpected syscall stop",
+                                              step);
+                               return pid;
+                       }
+                       break;
+
+               default:
+                       error_func_msg("#%d: unexpected stop signal %#x",
+                                      step, WSTOPSIG(status));
+                       return pid;
+               }
+
+               if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
+                       /* Cannot happen.  */
+                       perror_func_msg("#%d: PTRACE_SYSCALL", step);
+                       break;
+               }
+       }
+
+       return pid;
+}
+
+static void
+check_seccomp_order(void)
+{
+       seccomp_filtering = false;
+
+       int pid = fork();
+       if (pid < 0) {
+               perror_func_msg("fork");
+               return;
+       }
+
+       if (pid == 0)
+               check_seccomp_order_do_child();
+
+       pid = check_seccomp_order_tracer(pid);
+       if (pid) {
+               kill(pid, SIGKILL);
+               for (;;) {
+                       long rc = waitpid(pid, NULL, 0);
+                       if (rc < 0 && errno == EINTR)
+                               continue;
+                       break;
+               }
+       }
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+       if (is_number_in_set_array(scno, trace_set, p)
+           || sysent_vec[p][scno].sys_flags
+           & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
+               return true;
+       return false;
+}
+
+static void
+check_bpf_program_size(void)
+{
+       unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+
+       /*
+        * Implements a simplified form of init_sock_filter()'s bytecode
+        * generation algorithm, to count the number of instructions that will
+        * be generated.
+        */
+       for (int p = SUPPORTED_PERSONALITIES - 1;
+            p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+               unsigned int nb_insns_personality = 0;
+               unsigned int lower = UINT_MAX;
+
+               nb_insns_personality++;
+# if SUPPORTED_PERSONALITIES > 1
+               nb_insns_personality++;
+               if (audit_arch_vec[p].flag)
+                       nb_insns_personality += 3;
+# endif
+
+               for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+                       if (traced_by_seccomp(i, p)) {
+                               if (lower == UINT_MAX)
+                                       lower = i;
+                               continue;
+                       }
+                       if (lower == UINT_MAX)
+                               continue;
+                       if (lower + 1 == i)
+                               nb_insns_personality++;
+                       else
+                               nb_insns_personality += 2;
+                       lower = UINT_MAX;
+               }
+               if (lower != UINT_MAX) {
+                       if (lower + 1 == nsyscall_vec[p])
+                               nb_insns_personality++;
+                       else
+                               nb_insns_personality += 2;
+               }
+
+               nb_insns_personality += 3;
+
+               /*
+                * Within generated BPF programs, the origin and destination of
+                * jumps are always in the same personality section.  The
+                * largest jump is therefore the jump from the first
+                * instruction of the section to the last, to skip the
+                * personality and try to compare .arch to the next
+                * personality.
+                * If we have a personality section with more than 255
+                * instructions, the jump offset will overflow.  Such a program
+                * is unlikely to happen, so we simply disable the seccomp filter
+                * in such a case.
+                */
+               if (nb_insns_personality > UCHAR_MAX) {
+                       debug_msg("seccomp filter disabled due to "
+                                 "possibility of overflow");
+                       seccomp_filtering = false;
+                       return;
+               }
+               nb_insns += nb_insns_personality;
+       }
+
+# if SUPPORTED_PERSONALITIES > 1
+       nb_insns++;
+# endif
+
+       if (nb_insns > BPF_MAXINSNS) {
+               debug_msg("seccomp filter disabled due to BPF program being "
+                         "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+               seccomp_filtering = false;
+       }
+}
+
+static void
+check_seccomp_filter_properties(void)
+{
+       if (NOMMU_SYSTEM) {
+               seccomp_filtering = false;
+               return;
+       }
+
+       int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+       seccomp_filtering = rc < 0 && errno != EINVAL;
+       if (!seccomp_filtering)
+               debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+
+       if (seccomp_filtering)
+               check_bpf_program_size();
+       if (seccomp_filtering)
+               check_seccomp_order();
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+       for (unsigned int i = 0; i < len; ++i) {
+               switch (filter[i].code) {
+               case BPF_LD | BPF_W | BPF_ABS:
+                       switch (filter[i].k) {
+                       case offsetof(struct seccomp_data, arch):
+                               error_msg("STMT(BPF_LDWABS, data->arch)");
+                               break;
+                       case offsetof(struct seccomp_data, nr):
+                               error_msg("STMT(BPF_LDWABS, data->nr)");
+                               break;
+                       default:
+                               error_msg("STMT(BPF_LDWABS, 0x%x)",
+                                         filter[i].k);
+                       }
+                       break;
+               case BPF_RET | BPF_K:
+                       switch (filter[i].k) {
+                       case SECCOMP_RET_TRACE:
+                               error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+                               break;
+                       case SECCOMP_RET_ALLOW:
+                               error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+                               break;
+                       default:
+                               error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+                       }
+                       break;
+               case BPF_JMP | BPF_JEQ | BPF_K:
+                       error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+                                 filter[i].jt, filter[i].jf,
+                                 filter[i].k);
+                       break;
+               case BPF_JMP | BPF_JGE | BPF_K:
+                       error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+                                 filter[i].jt, filter[i].jf,
+                                 filter[i].k);
+                       break;
+               case BPF_JMP | BPF_JA:
+                       error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+                       break;
+               default:
+                       error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+                                 filter[i].jt, filter[i].jf, filter[i].k);
+               }
+       }
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+                        unsigned char jmp_trace)
+{
+       switch (*jmp_offset) {
+       case JMP_PLACEHOLDER_NEXT:
+               *jmp_offset = jmp_next;
+               break;
+       case JMP_PLACEHOLDER_TRACE:
+               *jmp_offset = jmp_trace;
+               break;
+       default:
+               break;
+       }
+}
+
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+                unsigned int lower, unsigned int upper)
+{
+       if (lower + 1 == upper) {
+               /* if (nr == lower) return RET_TRACE; */
+               SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
+                            JMP_PLACEHOLDER_TRACE, 0);
+               return 1;
+       } else {
+               /* if (nr >= lower && nr < upper) return RET_TRACE; */
+               SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
+               SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
+                            JMP_PLACEHOLDER_TRACE);
+               return 2;
+       }
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+       /*
+        * Generated program looks like:
+        * if (arch == AUDIT_ARCH_A && nr >= flag) {
+        *      if (nr == 59)
+        *              return SECCOMP_RET_TRACE;
+        *      if (nr >= 321 && nr <= 323)
+        *              return SECCOMP_RET_TRACE;
+        *      ...
+        *      return SECCOMP_RET_ALLOW;
+        * }
+        * if (arch == AUDIT_ARCH_A) {
+        *      ...
+        * }
+        * if (arch == AUDIT_ARCH_B) {
+        *      ...
+        * }
+        * return SECCOMP_RET_TRACE;
+        */
+       unsigned short pos = 0;
+
+# if SUPPORTED_PERSONALITIES > 1
+       SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                    offsetof(struct seccomp_data, arch));
+# endif
+
+       /*
+        * Personalities are iterated in reverse-order in the BPF program so
+        * that the x86 case is naturally handled.  On x86, the first and third
+        * personalities have the same arch identifier.  The third can be
+        * distinguished based on its associated syscall flag, so we check it
+        * first.  The only drawback here is that the first personality is more
+        * common, which may make the BPF program slower to match syscalls on
+        * average.
+        */
+       for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+               unsigned int lower = UINT_MAX;
+               unsigned short start = pos, end;
+
+# if SUPPORTED_PERSONALITIES > 1
+               /* if (arch != audit_arch_vec[p].arch) goto next; */
+               SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+                            audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+# endif
+               SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                            offsetof(struct seccomp_data, nr));
+
+# if SUPPORTED_PERSONALITIES > 1
+               if (audit_arch_vec[p].flag) {
+                       /* if (nr < audit_arch_vec[p].flag) goto next; */
+                       SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+                                    audit_arch_vec[p].flag, 2, 0);
+                       SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                                    offsetof(struct seccomp_data, arch));
+                       SET_BPF_JUMP(&filter[pos++], BPF_JA,
+                                    JMP_PLACEHOLDER_NEXT, 0, 0);
+               }
+# endif
+
+               for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+                       if (traced_by_seccomp(i, p)) {
+                               if (lower == UINT_MAX)
+                                       lower = i;
+                               continue;
+                       }
+                       if (lower == UINT_MAX)
+                               continue;
+                       pos += bpf_syscalls_cmp(filter + pos,
+                                               lower | audit_arch_vec[p].flag,
+                                               i | audit_arch_vec[p].flag);
+                       lower = UINT_MAX;
+               }
+               if (lower != UINT_MAX)
+                       pos += bpf_syscalls_cmp(filter + pos,
+                                               lower | audit_arch_vec[p].flag,
+                                               nsyscall_vec[p]
+                                               | audit_arch_vec[p].flag);
+               end = pos;
+
+               /* if (nr >= max_nr) return RET_TRACE; */
+               SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+                            nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+               SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+                            SECCOMP_RET_ALLOW);
+               SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+                            SECCOMP_RET_TRACE);
+
+               for (unsigned int i = start; i < end; ++i) {
+                       if (BPF_CLASS(filter[i].code) != BPF_JMP)
+                               continue;
+                       unsigned char jmp_next = pos - i - 1;
+                       unsigned char jmp_trace = pos - i - 2;
+                       replace_jmp_placeholders(&filter[i].jt, jmp_next,
+                                                jmp_trace);
+                       replace_jmp_placeholders(&filter[i].jf, jmp_next,
+                                                jmp_trace);
+                       if (BPF_OP(filter[i].code) == BPF_JA)
+                               filter[i].k = (unsigned int) jmp_next;
+               }
+       }
+
+# if SUPPORTED_PERSONALITIES > 1
+       /* Jumps conditioned on .arch default to this RET_TRACE. */
+       SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+# endif
+
+       if (debug_flag)
+               dump_seccomp_bpf(filter, pos);
+
+       return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+       struct sock_filter filter[BPF_MAXINSNS];
+       unsigned short len;
+
+       len = init_sock_filter(filter);
+
+       struct sock_fprog prog = {
+               .len = len,
+               .filter = filter
+       };
+
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
+
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+       if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
+           && traced_by_seccomp(tcp->scno, current_personality))
+               return PTRACE_SYSCALL;
+       return PTRACE_CONT;
+}
+
+#else /* !HAVE_LINUX_SECCOMP_H */
+
+# warning <linux/seccomp.h> is not available, seccomp filtering is not supported
+
+static void
+check_seccomp_filter_properties(void)
+{
+       seccomp_filtering = false;
+}
+
+void
+init_seccomp_filter(void)
+{
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+       return PTRACE_SYSCALL;
+}
+
+#endif
+
+void
+check_seccomp_filter(void)
+{
+       check_seccomp_filter_properties();
+
+       if (!seccomp_filtering)
+               error_msg("seccomp filter is requested but unavailable");
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644 (file)
index 0000000..bc06c8c
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+# define STRACE_SECCOMP_FILTER_H
+
+# include "defs.h"
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
index ed9261f5aff258b345df5044b7df73324a1939d3..fb75722f6bf712fa9ad310d6c2c960a21214898b 100644 (file)
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
index 87ca0cdbdb8fb9bfca8c23034aa644c284c3a9d6..107a74df2ffac1786d7a40b76a7053d961534e39 100644 (file)
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
index 871f4109d51ee02758b714138f7790c7abb34344..a4ac007efeab39831f414435aed926fa0f73244a 100644 (file)
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
index 1e520761d68737d9dffb17eb2ed4bd147d1812a8..750ab51214f9909560a9b91f8159770ea8940af2 100644 (file)
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
index 68eef4fcedcc31d157002bcf0c477812157e2d2d..9eacaa4013e6aee413232faf495554e7000ac5e0 100644 (file)
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
index a781208c2ca07a3c67b2a1683022659a7de2a35a..12ba0d8b6524b9fbcf9f398a2cde01565098dbfa 100644 (file)
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
index 1055de123d7d60319cb69283ce2267d45237d691..9f48d3137b42fa251fd8bf3be891f958ea3d7c45 100644 (file)
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
index a8c1d99188b7561cc4e43e1306b717717ba3733c..c2924ac21bdeaa06c24f39f95381553d55ae199a 100644 (file)
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
index bba3bc3b612d11f6d3ae1eb283336fcb08dae6f5..531779c6a34e11fd002380289e0592dd0a055fd4 100644 (file)
@@ -49,6 +49,7 @@ strace \- trace system calls and signals
 .OP \-X format
 .OM \-P path
 .OM \-p pid
+.OP \-\-seccomp\-bpf
 .BR "" {
 .OR \-p pid
 .BR "" |
@@ -68,6 +69,7 @@ strace \- trace system calls and signals
 .OP \-S sortby
 .OM \-P path
 .OM \-p pid
+.OP \-\-seccomp\-bpf
 .BR "" {
 .OR \-p pid
 .BR "" |
@@ -970,6 +972,23 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-\-seccomp\-bpf
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Implies the
+.B \-f
+option.
+An attempt to rely on seccomp-bpf to filter system calls may fail for various
+reasons, e.g. when there are too many system calls to filter, the seccomp API
+is not available, or
+.B strace
+itself is being traced.
+.B \-\-seccomp\-bpf
+is also ineffective on processes attached using
+.BR \-p .
+In cases when the seccomp-bpf filter setup fails,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
index 701e57393ef51ca1231edb1f24600b8e678fa056..b52a3db3412a92d88a3df4f76a6344107968cb6f 100644 (file)
--- a/strace.c
+++ b/strace.c
@@ -31,6 +31,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -239,10 +240,10 @@ usage(void)
        printf("\
 usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
-              [-p pid]...\n\
+              [-p pid]... [--seccomp-bpf]\n\
              { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
    or: strace -c[dfwzZ] [-I n] [-b execve] [-e expr]... [-O overhead]\n\
-              [-S sortby] [-P path]... [-p pid]...\n\
+              [-S sortby] [-P path]... [-p pid]... [--seccomp-bpf]\n\
               { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
 \n\
 Output format:\n\
@@ -308,6 +309,7 @@ Startup:\n\
   -u username    run command as username handling setuid and/or setgid\n\
 \n\
 Miscellaneous:\n\
+  --seccomp-bpf  enable seccomp-bpf filtering\n\
   -d             enable debug output to stderr\n\
   -h, --help     print help message\n\
   -V, --version  print version\n\
@@ -1232,6 +1234,10 @@ exec_or_die(void)
        if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
                sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+       debug_msg("seccomp filter %s",
+                 seccomp_filtering ? "enabled" : "disabled");
+       if (seccomp_filtering)
+               init_seccomp_filter();
        execv(params->pathname, params->argv);
        perror_msg_and_die("exec");
 }
@@ -1470,6 +1476,10 @@ startup_child(char **argv)
                 * to create a genuine separate stack and execute on it.
                 */
        }
+
+       if (seccomp_filtering)
+               tcp->flags |= TCB_SECCOMP_FILTER;
+
        /*
         * A case where straced process is part of a pipe:
         * { sleep 1; yes | head -n99999; } | strace -o/dev/null sh -c 'exec <&-; sleep 9'
@@ -1609,7 +1619,12 @@ init(int argc, char *argv[])
            "k"
 #endif
            "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ";
+
+       enum {
+               SECCOMP_OPTION = 0x100
+       };
        static const struct option longopts[] = {
+               { "seccomp-bpf", no_argument, 0, SECCOMP_OPTION },
                { "help", no_argument, 0, 'h' },
                { "version", no_argument, 0, 'V' },
                { 0, 0, 0, 0 }
@@ -1751,6 +1766,9 @@ init(int argc, char *argv[])
                        add_number_to_set(STATUS_FAILED, status_set);
                        zflags++;
                        break;
+               case SECCOMP_OPTION:
+                       seccomp_filtering = true;
+                       break;
                default:
                        error_msg_and_help(NULL);
                        break;
@@ -1768,6 +1786,16 @@ init(int argc, char *argv[])
                error_msg_and_help("PROG [ARGS] must be specified with -D");
        }
 
+       if (seccomp_filtering) {
+               if (nprocs && (!argc || debug_flag))
+                       error_msg("--seccomp-bpf is not enabled for processes"
+                                 " attached with -p");
+               if (!followfork) {
+                       error_msg("--seccomp-bpf implies -f");
+                       followfork = 1;
+               }
+       }
+
        if (optF) {
                if (followfork) {
                        error_msg("deprecated option -F ignored");
@@ -1843,6 +1871,12 @@ init(int argc, char *argv[])
                ptrace_setoptions |= PTRACE_O_TRACECLONE |
                                     PTRACE_O_TRACEFORK |
                                     PTRACE_O_TRACEVFORK;
+
+       if (seccomp_filtering)
+               check_seccomp_filter();
+       if (seccomp_filtering)
+               ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
        debug_msg("ptrace_setoptions = %#x", ptrace_setoptions);
        test_ptrace_seize();
        test_ptrace_get_syscall_info();
@@ -2030,6 +2064,7 @@ print_debug_info(const int pid, int status)
                        [PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
                        [PTRACE_EVENT_EXEC]  = "EXEC",
                        [PTRACE_EVENT_EXIT]  = "EXIT",
+                       [PTRACE_EVENT_SECCOMP]  = "SECCOMP",
                        /* [PTRACE_EVENT_STOP (=128)] would make biggish array */
                };
                const char *e = "??";
@@ -2555,6 +2590,9 @@ next_event(void)
                        case PTRACE_EVENT_EXIT:
                                wd->te = TE_STOP_BEFORE_EXIT;
                                break;
+                       case PTRACE_EVENT_SECCOMP:
+                               wd->te = TE_SECCOMP;
+                               break;
                        default:
                                wd->te = TE_RESTART;
                        }
@@ -2640,7 +2678,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-       unsigned int restart_op = PTRACE_SYSCALL;
+       unsigned int restart_op;
        unsigned int restart_sig = 0;
        enum trace_event te = wd ? wd->te : TE_BREAK;
        /*
@@ -2649,6 +2687,11 @@ dispatch_event(const struct tcb_wait_data *wd)
         */
        int status = wd ? wd->status : 0;
 
+       if (current_tcp && has_seccomp_filter(current_tcp))
+               restart_op = seccomp_filter_restart_operator(current_tcp);
+       else
+               restart_op = PTRACE_SYSCALL;
+
        switch (te) {
        case TE_BREAK:
                return false;
@@ -2659,6 +2702,27 @@ dispatch_event(const struct tcb_wait_data *wd)
        case TE_RESTART:
                break;
 
+       case TE_SECCOMP:
+               if (!has_seccomp_filter(current_tcp)) {
+                       /*
+                        * We don't know if forks/clones have a seccomp filter
+                        * when they are created, but we can detect it when we
+                        * have a seccomp-stop.
+                        * In such a case, if !seccomp_before_sysentry, we have
+                        * already processed the syscall entry, so we avoid
+                        * processing it a second time.
+                        */
+                       current_tcp->flags |= TCB_SECCOMP_FILTER;
+                       restart_op = PTRACE_SYSCALL;
+                       break;
+               }
+
+               if (seccomp_before_sysentry) {
+                       restart_op = PTRACE_SYSCALL;
+                       break;
+               }
+               ATTRIBUTE_FALLTHROUGH;
+
        case TE_SYSCALL_STOP:
                if (trace_syscall(current_tcp, &restart_sig) < 0) {
                        /*
@@ -2674,6 +2738,42 @@ dispatch_event(const struct tcb_wait_data *wd)
                         */
                        return true;
                }
+               if (has_seccomp_filter(current_tcp)) {
+                       /*
+                        * Syscall and seccomp stops can happen in different
+                        * orders depending on kernel.  strace tests this in
+                        * check_seccomp_order_tracer().
+                        *
+                        * Linux 3.5--4.7:
+                        * (seccomp-stop before syscall-entry-stop)
+                        *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+                        *         |                                     |
+                        *     PTRACE_CONT                   syscall-entry-stop
+                        *         |                                     |
+                        * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+                        *
+                        * Linux 4.8+:
+                        * (seccomp-stop after syscall-entry-stop)
+                        *                 syscall-entry-stop
+                        *
+                        *         +---->-----PTRACE_CONT---->----+
+                        *         |                              |
+                        *  syscall-exit-stop               seccomp-stop
+                        *         |                              |
+                        *         +----<----PTRACE_SYSCALL---<---+
+                        *
+                        * Note in Linux 4.8+, we restart in PTRACE_CONT
+                        * after syscall-exit to skip the syscall-entry-stop.
+                        * The next seccomp-stop will be treated as a syscall
+                        * entry.
+                        *
+                        * The line below implements this behavior.
+                        * Note that exiting(current_tcp) actually marks
+                        * a syscall-entry-stop because the flag was inverted
+                        * in the above call to trace_syscall.
+                        */
+                       restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+               }
                break;
 
        case TE_SIGNAL_DELIVERY_STOP:
index 53a711b821e5f0cee3168967485a680bf80c5c12..9021fc5503f3adec64ee3626667a0d63af499033 100644 (file)
@@ -66,6 +66,11 @@ enum trace_event {
         * Restart the tracee with signal 0.
         */
        TE_STOP_BEFORE_EXIT,
+
+       /*
+        * SECCOMP_RET_TRACE rule is triggered.
+        */
+       TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */