granicus.if.org Git - strace/commitdiff
Introduce seccomp-assisted syscall filtering
authorChen Jingpiao <chenjingpiao@gmail.com>
Thu, 3 May 2018 13:00:38 +0000 (21:00 +0800)
committerDmitry V. Levin <ldv@altlinux.org>
Wed, 25 Sep 2019 01:02:03 +0000 (01:02 +0000)
With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the --seccomp-bpf
option.  Kernel support is first checked with check_seccomp_filter(),
which also ensures the BPF program derived from the syscalls to filter
is not larger than the kernel's limit.

The --seccomp-bpf option implies -f, but a warning is emitted if -f is not
explicitly specified.  Since a task's children inherit its seccomp
filters, we want to ensure all children are also traced to avoid their
syscalls failing with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

Fork/vfork/clone children of traced processes are marked as not having a
seccomp filter until we receive a first seccomp-stop.  They are therefore
stopped at every syscall entry and exit until that first seccomp-stop.

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved
or replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8
(cf. PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and
filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document --seccomp-bpf option.
(startup_child): Mark process as having seccomp filter.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle --seccomp-bpf option and check that seccomp can be
enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new --seccomp-bpf option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon@gmail.com>
Co-Authored-by: Dmitry V. Levin <ldv@altlinux.org>
16 files changed:
Makefile.am
NEWS
defs.h
filter_seccomp.c [new file with mode: 0644]
filter_seccomp.h [new file with mode: 0644]
linux/aarch64/arch_defs_.h
linux/ia64/arch_defs_.h
linux/powerpc64/arch_defs_.h
linux/s390x/arch_defs_.h
linux/sparc64/arch_defs_.h
linux/tile/arch_defs_.h
linux/x32/arch_defs_.h
linux/x86_64/arch_defs_.h
strace.1.in
strace.c
trace_event.h

index 2dbc806f57f28c149f011fefda5390bea4365d0f..9e0d2ff051a8d34774f8d9c7ea0aa3396bb1cdd3 100644 (file)
@@ -129,6 +129,8 @@ strace_SOURCES =    \
        file_ioctl.c    \
        filter.h        \
        filter_qualify.c \
+       filter_seccomp.c \
+       filter_seccomp.h \
        flock.c         \
        flock.h         \
        fs_x_ioctl.c    \
diff --git a/NEWS b/NEWS
index 867cfdcbe7873ab07dc5ee8c098449df57a5faac..7c1fc0fd021678c26d53cf95c95a61a08a509159 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,8 @@ Noteworthy changes in release ?.? (????-??-??)
 ==============================================
 
 * Improvements
+  * Implemented usage of seccomp-bpf for stopping tracees only for filtered
+    syscalls.  Use --seccomp-bpf option to enable.
   * Implemented decoding of pidfd_open and clone3 syscalls.
   * Enhanced decoding of NETLINK_ROUTE protocol.
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.
diff --git a/defs.h b/defs.h
index 7278bc36cc75bdee2775bd6758ada24aaede5e4f..119ed515e4cc3cec46d00269d600fba6fb0000ad 100644 (file)
--- a/defs.h
+++ b/defs.h
@@ -332,6 +332,9 @@ struct tcb {
 # define TCB_DELAYED   0x2000  /* Current syscall has been delayed */
 # define TCB_TAMPERED_NO_FAIL 0x4000   /* We tamper tcb with syscall
                                           that should not fail. */
+# define TCB_SECCOMP_FILTER    0x8000  /* This process has a seccomp filter
+                                        * attached.
+                                        */
 
 /* qualifier flags */
 # define QUAL_TRACE    0x001   /* this system call should be traced */
@@ -358,6 +361,7 @@ struct tcb {
 # define inject_delay_exit(tcp)        ((tcp)->flags & TCB_INJECT_DELAY_EXIT)
 # define syscall_delayed(tcp)  ((tcp)->flags & TCB_DELAYED)
 # define syscall_tampered_nofail(tcp) ((tcp)->flags & TCB_TAMPERED_NO_FAIL)
+# define has_seccomp_filter(tcp)       ((tcp)->flags & TCB_SECCOMP_FILTER)
 
 extern const struct_sysent stub_sysent;
 # define tcp_sysent(tcp) (tcp->s_ent ?: &stub_sysent)
diff --git a/filter_seccomp.c b/filter_seccomp.c
new file mode 100644 (file)
index 0000000..dd3aa17
--- /dev/null
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
+ * Copyright (c) 2019 The strace developers.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include "defs.h"
+
+#include "ptrace.h"
+#include <signal.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+
+#include "filter_seccomp.h"
+#include "number_set.h"
+#include "syscall.h"
+#include "scno.h"
+
+bool seccomp_filtering;
+bool seccomp_before_sysentry;
+
+#ifdef HAVE_LINUX_SECCOMP_H
+
+# include <linux/seccomp.h>
+
+# ifndef BPF_MAXINSNS
+#  define BPF_MAXINSNS 4096
+# endif
+
+# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
+# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
+
+# define SET_BPF(filter, code, jt, jf, k) \
+       (*(filter) = (struct sock_filter) { code, jt, jf, k })
+
+# define SET_BPF_STMT(filter, code, k) \
+       SET_BPF(filter, code, 0, 0, k)
+
+# define SET_BPF_JUMP(filter, code, k, jt, jf) \
+       SET_BPF(filter, BPF_JMP | code, jt, jf, k)
+
+struct audit_arch_t {
+       unsigned int arch;
+       unsigned int flag;
+};
+
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
+# if SUPPORTED_PERSONALITIES > 1
+       PERSONALITY0_AUDIT_ARCH,
+       PERSONALITY1_AUDIT_ARCH,
+#  if SUPPORTED_PERSONALITIES > 2
+       PERSONALITY2_AUDIT_ARCH,
+#  endif
+# endif
+};
+
+# ifdef ENABLE_COVERAGE_GCOV
+extern void __gcov_flush(void);
+# endif
+
+static void ATTRIBUTE_NORETURN
+check_seccomp_order_do_child(void)
+{
+       static const struct sock_filter filter[] = {
+               /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
+               BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                        offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
+               BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
+               BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+       };
+       static const struct sock_fprog prog = {
+               .len = ARRAY_SIZE(filter),
+               .filter = (struct sock_filter *) filter
+       };
+
+       /* Get everything ready before PTRACE_TRACEME.  */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
+       int pid = getpid();
+
+       if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
+               /* Exit with a nonzero exit status.  */
+               perror_func_msg_and_die("PTRACE_TRACEME");
+       }
+
+# ifdef ENABLE_COVERAGE_GCOV
+       __gcov_flush();
+# endif
+
+       kill(pid, SIGSTOP);
+       syscall(__NR_gettid);
+       _exit(0);
+}
+
+static int
+check_seccomp_order_tracer(int pid)
+{
+       unsigned int step;
+
+       for (step = 0; ; ++step) {
+               int status;
+
+               for (;;) {
+                       long rc = waitpid(pid, &status, 0);
+                       if (rc < 0 && errno == EINTR)
+                               continue;
+                       if (rc == pid)
+                               break;
+                       /* Cannot happen.  */
+                       perror_func_msg("#%d: unexpected wait result %ld",
+                                       step, rc);
+                       return pid;
+               }
+
+               if (WIFEXITED(status)) {
+                       /* The tracee is no more.  */
+                       pid = 0;
+
+                       int exitstatus = WEXITSTATUS(status);
+                       if (step == 5 && exitstatus == 0) {
+                               seccomp_filtering = true;
+                       } else {
+                               error_func_msg("#%d: unexpected exit status %u",
+                                              step, exitstatus);
+                       }
+                       break;
+               }
+
+               if (WIFSIGNALED(status)) {
+                       /* The tracee is no more.  */
+                       pid = 0;
+
+                       error_func_msg("#%d: unexpected signal %u",
+                                      step, WTERMSIG(status));
+                       break;
+               }
+
+               if (!WIFSTOPPED(status)) {
+                       /* Cannot happen.  */
+                       error_func_msg("#%d: unexpected wait status %#x",
+                                      step, status);
+                       break;
+               }
+
+               unsigned int event = (unsigned int) status >> 16;
+
+               switch (WSTOPSIG(status)) {
+               case SIGSTOP:
+                       if (step != 0) {
+                               error_func_msg("#%d: unexpected signal stop",
+                                              step);
+                               return pid;
+                       }
+                       if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
+                                  PTRACE_O_TRACESYSGOOD|
+                                  PTRACE_O_TRACESECCOMP) < 0) {
+                               perror_func_msg("PTRACE_SETOPTIONS");
+                               return pid;
+                       }
+                       break;
+
+               case SIGTRAP:
+                       if (event != PTRACE_EVENT_SECCOMP) {
+                               error_func_msg("#%d: unexpected trap %#x",
+                                              step, event);
+                               return pid;
+                       }
+
+                       switch (step) {
+                       case 1: /* Seccomp stop before entering gettid.  */
+                               seccomp_before_sysentry = true;
+                               break;
+                       case 2: /* Seccomp stop after entering gettid.  */
+                               if (!seccomp_before_sysentry)
+                                       break;
+                               ATTRIBUTE_FALLTHROUGH;
+                       default:
+                               error_func_msg("#%d: unexpected seccomp stop",
+                                              step);
+                               return pid;
+                       }
+                       break;
+
+               case SIGTRAP | 0x80:
+                       switch (step) {
+                       case 3: /* Exiting gettid.  */
+                       case 4: /* Entering exit_group.  */
+                               break;
+                       case 1: /* Entering gettid before seccomp stop.  */
+                               seccomp_before_sysentry = false;
+                               break;
+                       case 2: /* Entering gettid after seccomp stop.  */
+                               if (seccomp_before_sysentry)
+                                       break;
+                               ATTRIBUTE_FALLTHROUGH;
+                       default:
+                               error_func_msg("#%d: unexpected syscall stop",
+                                              step);
+                               return pid;
+                       }
+                       break;
+
+               default:
+                       error_func_msg("#%d: unexpected stop signal %#x",
+                                      step, WSTOPSIG(status));
+                       return pid;
+               }
+
+               if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
+                       /* Cannot happen.  */
+                       perror_func_msg("#%d: PTRACE_SYSCALL", step);
+                       break;
+               }
+       }
+
+       return pid;
+}
+
+static void
+check_seccomp_order(void)
+{
+       seccomp_filtering = false;
+
+       int pid = fork();
+       if (pid < 0) {
+               perror_func_msg("fork");
+               return;
+       }
+
+       if (pid == 0)
+               check_seccomp_order_do_child();
+
+       pid = check_seccomp_order_tracer(pid);
+       if (pid) {
+               kill(pid, SIGKILL);
+               for (;;) {
+                       long rc = waitpid(pid, NULL, 0);
+                       if (rc < 0 && errno == EINTR)
+                               continue;
+                       break;
+               }
+       }
+}
+
+static bool
+traced_by_seccomp(unsigned int scno, unsigned int p)
+{
+       if (is_number_in_set_array(scno, trace_set, p)
+           || sysent_vec[p][scno].sys_flags
+           & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
+               return true;
+       return false;
+}
+
+static void
+check_bpf_program_size(void)
+{
+       unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
+
+       /*
+        * Implements a simplified form of init_sock_filter()'s bytecode
+        * generation algorithm, to count the number of instructions that will
+        * be generated.
+        */
+       for (int p = SUPPORTED_PERSONALITIES - 1;
+            p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
+               unsigned int nb_insns_personality = 0;
+               unsigned int lower = UINT_MAX;
+
+               nb_insns_personality++;
+# if SUPPORTED_PERSONALITIES > 1
+               nb_insns_personality++;
+               if (audit_arch_vec[p].flag)
+                       nb_insns_personality += 3;
+# endif
+
+               for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+                       if (traced_by_seccomp(i, p)) {
+                               if (lower == UINT_MAX)
+                                       lower = i;
+                               continue;
+                       }
+                       if (lower == UINT_MAX)
+                               continue;
+                       if (lower + 1 == i)
+                               nb_insns_personality++;
+                       else
+                               nb_insns_personality += 2;
+                       lower = UINT_MAX;
+               }
+               if (lower != UINT_MAX) {
+                       if (lower + 1 == nsyscall_vec[p])
+                               nb_insns_personality++;
+                       else
+                               nb_insns_personality += 2;
+               }
+
+               nb_insns_personality += 3;
+
+               /*
+                * Within generated BPF programs, the origin and destination of
+                * jumps are always in the same personality section.  The
+                * largest jump is therefore the jump from the first
+                * instruction of the section to the last, to skip the
+                * personality and try to compare .arch to the next
+                * personality.
+                * If we have a personality section with more than 255
+                * instructions, the jump offset will overflow.  Such a program
+                * is unlikely to happen, so we simply disable the seccomp filter
+                * in such a case.
+                */
+               if (nb_insns_personality > UCHAR_MAX) {
+                       debug_msg("seccomp filter disabled due to "
+                                 "possibility of overflow");
+                       seccomp_filtering = false;
+                       return;
+               }
+               nb_insns += nb_insns_personality;
+       }
+
+# if SUPPORTED_PERSONALITIES > 1
+       nb_insns++;
+# endif
+
+       if (nb_insns > BPF_MAXINSNS) {
+               debug_msg("seccomp filter disabled due to BPF program being "
+                         "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
+               seccomp_filtering = false;
+       }
+}
+
+static void
+check_seccomp_filter_properties(void)
+{
+       if (NOMMU_SYSTEM) {
+               seccomp_filtering = false;
+               return;
+       }
+
+       int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
+       seccomp_filtering = rc < 0 && errno != EINVAL;
+       if (!seccomp_filtering)
+               debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+
+       if (seccomp_filtering)
+               check_bpf_program_size();
+       if (seccomp_filtering)
+               check_seccomp_order();
+}
+
+static void
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
+{
+       for (unsigned int i = 0; i < len; ++i) {
+               switch (filter[i].code) {
+               case BPF_LD | BPF_W | BPF_ABS:
+                       switch (filter[i].k) {
+                       case offsetof(struct seccomp_data, arch):
+                               error_msg("STMT(BPF_LDWABS, data->arch)");
+                               break;
+                       case offsetof(struct seccomp_data, nr):
+                               error_msg("STMT(BPF_LDWABS, data->nr)");
+                               break;
+                       default:
+                               error_msg("STMT(BPF_LDWABS, 0x%x)",
+                                         filter[i].k);
+                       }
+                       break;
+               case BPF_RET | BPF_K:
+                       switch (filter[i].k) {
+                       case SECCOMP_RET_TRACE:
+                               error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
+                               break;
+                       case SECCOMP_RET_ALLOW:
+                               error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
+                               break;
+                       default:
+                               error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
+                       }
+                       break;
+               case BPF_JMP | BPF_JEQ | BPF_K:
+                       error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
+                                 filter[i].jt, filter[i].jf,
+                                 filter[i].k);
+                       break;
+               case BPF_JMP | BPF_JGE | BPF_K:
+                       error_msg("JUMP(BPF_JGE, %u, %u, %u)",
+                                 filter[i].jt, filter[i].jf,
+                                 filter[i].k);
+                       break;
+               case BPF_JMP | BPF_JA:
+                       error_msg("JUMP(BPF_JA, %u)", filter[i].k);
+                       break;
+               default:
+                       error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
+                                 filter[i].jt, filter[i].jf, filter[i].k);
+               }
+       }
+}
+
+static void
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
+                        unsigned char jmp_trace)
+{
+       switch (*jmp_offset) {
+       case JMP_PLACEHOLDER_NEXT:
+               *jmp_offset = jmp_next;
+               break;
+       case JMP_PLACEHOLDER_TRACE:
+               *jmp_offset = jmp_trace;
+               break;
+       default:
+               break;
+       }
+}
+
+static unsigned short
+bpf_syscalls_cmp(struct sock_filter *filter,
+                unsigned int lower, unsigned int upper)
+{
+       if (lower + 1 == upper) {
+               /* if (nr == lower) return RET_TRACE; */
+               SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
+                            JMP_PLACEHOLDER_TRACE, 0);
+               return 1;
+       } else {
+               /* if (nr >= lower && nr < upper) return RET_TRACE; */
+               SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
+               SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
+                            JMP_PLACEHOLDER_TRACE);
+               return 2;
+       }
+}
+
+static unsigned short
+init_sock_filter(struct sock_filter *filter)
+{
+       /*
+        * Generated program looks like:
+        * if (arch == AUDIT_ARCH_A && nr >= flag) {
+        *      if (nr == 59)
+        *              return SECCOMP_RET_TRACE;
+        *      if (nr >= 321 && nr <= 323)
+        *              return SECCOMP_RET_TRACE;
+        *      ...
+        *      return SECCOMP_RET_ALLOW;
+        * }
+        * if (arch == AUDIT_ARCH_A) {
+        *      ...
+        * }
+        * if (arch == AUDIT_ARCH_B) {
+        *      ...
+        * }
+        * return SECCOMP_RET_TRACE;
+        */
+       unsigned short pos = 0;
+
+# if SUPPORTED_PERSONALITIES > 1
+       SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                    offsetof(struct seccomp_data, arch));
+# endif
+
+       /*
+        * Personalities are iterated in reverse-order in the BPF program so
+        * that the x86 case is naturally handled.  On x86, the first and third
+        * personalities have the same arch identifier.  The third can be
+        * distinguished based on its associated syscall flag, so we check it
+        * first.  The only drawback here is that the first personality is more
+        * common, which may make the BPF program slower to match syscalls on
+        * average.
+        */
+       for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
+               unsigned int lower = UINT_MAX;
+               unsigned short start = pos, end;
+
+# if SUPPORTED_PERSONALITIES > 1
+               /* if (arch != audit_arch_vec[p].arch) goto next; */
+               SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
+                            audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
+# endif
+               SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                            offsetof(struct seccomp_data, nr));
+
+# if SUPPORTED_PERSONALITIES > 1
+               if (audit_arch_vec[p].flag) {
+                       /* if (nr < audit_arch_vec[p].flag) goto next; */
+                       SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+                                    audit_arch_vec[p].flag, 2, 0);
+                       SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
+                                    offsetof(struct seccomp_data, arch));
+                       SET_BPF_JUMP(&filter[pos++], BPF_JA,
+                                    JMP_PLACEHOLDER_NEXT, 0, 0);
+               }
+# endif
+
+               for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
+                       if (traced_by_seccomp(i, p)) {
+                               if (lower == UINT_MAX)
+                                       lower = i;
+                               continue;
+                       }
+                       if (lower == UINT_MAX)
+                               continue;
+                       pos += bpf_syscalls_cmp(filter + pos,
+                                               lower | audit_arch_vec[p].flag,
+                                               i | audit_arch_vec[p].flag);
+                       lower = UINT_MAX;
+               }
+               if (lower != UINT_MAX)
+                       pos += bpf_syscalls_cmp(filter + pos,
+                                               lower | audit_arch_vec[p].flag,
+                                               nsyscall_vec[p]
+                                               | audit_arch_vec[p].flag);
+               end = pos;
+
+               /* if (nr >= max_nr) return RET_TRACE; */
+               SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
+                            nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
+
+               SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+                            SECCOMP_RET_ALLOW);
+               SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
+                            SECCOMP_RET_TRACE);
+
+               for (unsigned int i = start; i < end; ++i) {
+                       if (BPF_CLASS(filter[i].code) != BPF_JMP)
+                               continue;
+                       unsigned char jmp_next = pos - i - 1;
+                       unsigned char jmp_trace = pos - i - 2;
+                       replace_jmp_placeholders(&filter[i].jt, jmp_next,
+                                                jmp_trace);
+                       replace_jmp_placeholders(&filter[i].jf, jmp_next,
+                                                jmp_trace);
+                       if (BPF_OP(filter[i].code) == BPF_JA)
+                               filter[i].k = (unsigned int) jmp_next;
+               }
+       }
+
+# if SUPPORTED_PERSONALITIES > 1
+       /* Jumps conditioned on .arch default to this RET_TRACE. */
+       SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
+# endif
+
+       if (debug_flag)
+               dump_seccomp_bpf(filter, pos);
+
+       return pos;
+}
+
+void
+init_seccomp_filter(void)
+{
+       struct sock_filter filter[BPF_MAXINSNS];
+       unsigned short len;
+
+       len = init_sock_filter(filter);
+
+       struct sock_fprog prog = {
+               .len = len,
+               .filter = filter
+       };
+
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
+
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
+               perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+       if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
+           && traced_by_seccomp(tcp->scno, current_personality))
+               return PTRACE_SYSCALL;
+       return PTRACE_CONT;
+}
+
+#else /* !HAVE_LINUX_SECCOMP_H */
+
+# warning <linux/seccomp.h> is not available, seccomp filtering is not supported
+
+static void
+check_seccomp_filter_properties(void)
+{
+       seccomp_filtering = false;
+}
+
+void
+init_seccomp_filter(void)
+{
+}
+
+int
+seccomp_filter_restart_operator(const struct tcb *tcp)
+{
+       return PTRACE_SYSCALL;
+}
+
+#endif
+
+void
+check_seccomp_filter(void)
+{
+       check_seccomp_filter_properties();
+
+       if (!seccomp_filtering)
+               error_msg("seccomp filter is requested but unavailable");
+}
diff --git a/filter_seccomp.h b/filter_seccomp.h
new file mode 100644 (file)
index 0000000..bc06c8c
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef STRACE_SECCOMP_FILTER_H
+# define STRACE_SECCOMP_FILTER_H
+
+# include "defs.h"
+
+extern bool seccomp_filtering;
+extern bool seccomp_before_sysentry;
+
+extern void check_seccomp_filter(void);
+extern void init_seccomp_filter(void);
+extern int seccomp_filter_restart_operator(const struct tcb *);
+
+#endif /* !STRACE_SECCOMP_FILTER_H */
index ed9261f5aff258b345df5044b7df73324a1939d3..fb75722f6bf712fa9ad310d6c2c960a21214898b 100644 (file)
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }
index 87ca0cdbdb8fb9bfca8c23034aa644c284c3a9d6..107a74df2ffac1786d7a40b76a7053d961534e39 100644 (file)
@@ -9,3 +9,4 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 0
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }
index 871f4109d51ee02758b714138f7790c7abb34344..a4ac007efeab39831f414435aed926fa0f73244a 100644 (file)
@@ -8,3 +8,5 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define SUPPORTED_PERSONALITIES 2
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }
index 1e520761d68737d9dffb17eb2ed4bd147d1812a8..750ab51214f9909560a9b91f8159770ea8940af2 100644 (file)
@@ -9,3 +9,5 @@
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }
index 68eef4fcedcc31d157002bcf0c477812157e2d2d..9eacaa4013e6aee413232faf495554e7000ac5e0 100644 (file)
@@ -9,4 +9,6 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_SA_RESTORER 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
index a781208c2ca07a3c67b2a1683022659a7de2a35a..12ba0d8b6524b9fbcf9f398a2cde01565098dbfa 100644 (file)
@@ -6,6 +6,8 @@
  */
 
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
 
 #ifdef __tilepro__
index 1055de123d7d60319cb69283ce2267d45237d691..9f48d3137b42fa251fd8bf3be891f958ea3d7c45 100644 (file)
@@ -11,3 +11,5 @@
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 2
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
index a8c1d99188b7561cc4e43e1306b717717ba3733c..c2924ac21bdeaa06c24f39f95381553d55ae199a 100644 (file)
@@ -9,3 +9,6 @@
 #define HAVE_ARCH_OLD_SELECT 1
 #define HAVE_ARCH_UID16_SYSCALLS 1
 #define SUPPORTED_PERSONALITIES 3
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
index bba3bc3b612d11f6d3ae1eb283336fcb08dae6f5..531779c6a34e11fd002380289e0592dd0a055fd4 100644 (file)
@@ -49,6 +49,7 @@ strace \- trace system calls and signals
 .OP \-X format
 .OM \-P path
 .OM \-p pid
+.OP \-\-seccomp\-bpf
 .BR "" {
 .OR \-p pid
 .BR "" |
@@ -68,6 +69,7 @@ strace \- trace system calls and signals
 .OP \-S sortby
 .OM \-P path
 .OM \-p pid
+.OP \-\-seccomp\-bpf
 .BR "" {
 .OR \-p pid
 .BR "" |
@@ -970,6 +972,23 @@ Show some debugging output of
 .B strace
 itself on the standard error.
 .TP
+.B \-\-seccomp\-bpf
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
+system calls that are being traced occur in the traced processes.  Implies the
+.B \-f
+option.
+An attempt to rely on seccomp-bpf to filter system calls may fail for various
+reasons, e.g. when there are too many system calls to filter, the seccomp API
+is not available, or
+.B strace
+itself is being traced.
+.B \-\-seccomp\-bpf
+is also ineffective on processes attached using
+.BR \-p .
+In cases when the seccomp-bpf filter setup fails,
+.B strace
+proceeds as usual and stops traced processes on every system call.
+.TP
 .B \-F
 This option is deprecated.  It is retained for backward compatibility only
 and may be removed in future releases.
index 701e57393ef51ca1231edb1f24600b8e678fa056..b52a3db3412a92d88a3df4f76a6344107968cb6f 100644 (file)
--- a/strace.c
+++ b/strace.c
@@ -31,6 +31,7 @@
 #endif
 
 #include "kill_save_errno.h"
+#include "filter_seccomp.h"
 #include "largefile_wrappers.h"
 #include "mmap_cache.h"
 #include "number_set.h"
@@ -239,10 +240,10 @@ usage(void)
        printf("\
 usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
-              [-p pid]...\n\
+              [-p pid]... [--seccomp-bpf]\n\
              { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
    or: strace -c[dfwzZ] [-I n] [-b execve] [-e expr]... [-O overhead]\n\
-              [-S sortby] [-P path]... [-p pid]...\n\
+              [-S sortby] [-P path]... [-p pid]... [--seccomp-bpf]\n\
               { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
 \n\
 Output format:\n\
@@ -308,6 +309,7 @@ Startup:\n\
   -u username    run command as username handling setuid and/or setgid\n\
 \n\
 Miscellaneous:\n\
+  --seccomp-bpf  enable seccomp-bpf filtering\n\
   -d             enable debug output to stderr\n\
   -h, --help     print help message\n\
   -V, --version  print version\n\
@@ -1232,6 +1234,10 @@ exec_or_die(void)
        if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
                sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
 
+       debug_msg("seccomp filter %s",
+                 seccomp_filtering ? "enabled" : "disabled");
+       if (seccomp_filtering)
+               init_seccomp_filter();
        execv(params->pathname, params->argv);
        perror_msg_and_die("exec");
 }
@@ -1470,6 +1476,10 @@ startup_child(char **argv)
                 * to create a genuine separate stack and execute on it.
                 */
        }
+
+       if (seccomp_filtering)
+               tcp->flags |= TCB_SECCOMP_FILTER;
+
        /*
         * A case where straced process is part of a pipe:
         * { sleep 1; yes | head -n99999; } | strace -o/dev/null sh -c 'exec <&-; sleep 9'
@@ -1609,7 +1619,12 @@ init(int argc, char *argv[])
            "k"
 #endif
            "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ";
+
+       enum {
+               SECCOMP_OPTION = 0x100
+       };
        static const struct option longopts[] = {
+               { "seccomp-bpf", no_argument, 0, SECCOMP_OPTION },
                { "help", no_argument, 0, 'h' },
                { "version", no_argument, 0, 'V' },
                { 0, 0, 0, 0 }
@@ -1751,6 +1766,9 @@ init(int argc, char *argv[])
                        add_number_to_set(STATUS_FAILED, status_set);
                        zflags++;
                        break;
+               case SECCOMP_OPTION:
+                       seccomp_filtering = true;
+                       break;
                default:
                        error_msg_and_help(NULL);
                        break;
@@ -1768,6 +1786,16 @@ init(int argc, char *argv[])
                error_msg_and_help("PROG [ARGS] must be specified with -D");
        }
 
+       if (seccomp_filtering) {
+               if (nprocs && (!argc || debug_flag))
+                       error_msg("--seccomp-bpf is not enabled for processes"
+                                 " attached with -p");
+               if (!followfork) {
+                       error_msg("--seccomp-bpf implies -f");
+                       followfork = 1;
+               }
+       }
+
        if (optF) {
                if (followfork) {
                        error_msg("deprecated option -F ignored");
@@ -1843,6 +1871,12 @@ init(int argc, char *argv[])
                ptrace_setoptions |= PTRACE_O_TRACECLONE |
                                     PTRACE_O_TRACEFORK |
                                     PTRACE_O_TRACEVFORK;
+
+       if (seccomp_filtering)
+               check_seccomp_filter();
+       if (seccomp_filtering)
+               ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
+
        debug_msg("ptrace_setoptions = %#x", ptrace_setoptions);
        test_ptrace_seize();
        test_ptrace_get_syscall_info();
@@ -2030,6 +2064,7 @@ print_debug_info(const int pid, int status)
                        [PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
                        [PTRACE_EVENT_EXEC]  = "EXEC",
                        [PTRACE_EVENT_EXIT]  = "EXIT",
+                       [PTRACE_EVENT_SECCOMP]  = "SECCOMP",
                        /* [PTRACE_EVENT_STOP (=128)] would make biggish array */
                };
                const char *e = "??";
@@ -2555,6 +2590,9 @@ next_event(void)
                        case PTRACE_EVENT_EXIT:
                                wd->te = TE_STOP_BEFORE_EXIT;
                                break;
+                       case PTRACE_EVENT_SECCOMP:
+                               wd->te = TE_SECCOMP;
+                               break;
                        default:
                                wd->te = TE_RESTART;
                        }
@@ -2640,7 +2678,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
 static bool
 dispatch_event(const struct tcb_wait_data *wd)
 {
-       unsigned int restart_op = PTRACE_SYSCALL;
+       unsigned int restart_op;
        unsigned int restart_sig = 0;
        enum trace_event te = wd ? wd->te : TE_BREAK;
        /*
@@ -2649,6 +2687,11 @@ dispatch_event(const struct tcb_wait_data *wd)
         */
        int status = wd ? wd->status : 0;
 
+       if (current_tcp && has_seccomp_filter(current_tcp))
+               restart_op = seccomp_filter_restart_operator(current_tcp);
+       else
+               restart_op = PTRACE_SYSCALL;
+
        switch (te) {
        case TE_BREAK:
                return false;
@@ -2659,6 +2702,27 @@ dispatch_event(const struct tcb_wait_data *wd)
        case TE_RESTART:
                break;
 
+       case TE_SECCOMP:
+               if (!has_seccomp_filter(current_tcp)) {
+                       /*
+                        * We don't know if forks/clones have a seccomp filter
+                        * when they are created, but we can detect it when we
+                        * have a seccomp-stop.
+                        * In such a case, if !seccomp_before_sysentry, we have
+                        * already processed the syscall entry, so we avoid
+                        * processing it a second time.
+                        */
+                       current_tcp->flags |= TCB_SECCOMP_FILTER;
+                       restart_op = PTRACE_SYSCALL;
+                       break;
+               }
+
+               if (seccomp_before_sysentry) {
+                       restart_op = PTRACE_SYSCALL;
+                       break;
+               }
+               ATTRIBUTE_FALLTHROUGH;
+
        case TE_SYSCALL_STOP:
                if (trace_syscall(current_tcp, &restart_sig) < 0) {
                        /*
@@ -2674,6 +2738,42 @@ dispatch_event(const struct tcb_wait_data *wd)
                         */
                        return true;
                }
+               if (has_seccomp_filter(current_tcp)) {
+                       /*
+                        * Syscall and seccomp stops can happen in different
+                        * orders depending on kernel.  strace tests this in
+                        * check_seccomp_order_tracer().
+                        *
+                        * Linux 3.5--4.7:
+                        * (seccomp-stop before syscall-entry-stop)
+                        *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
+                        *         |                                     |
+                        *     PTRACE_CONT                   syscall-entry-stop
+                        *         |                                     |
+                        * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
+                        *
+                        * Linux 4.8+:
+                        * (seccomp-stop after syscall-entry-stop)
+                        *                 syscall-entry-stop
+                        *
+                        *         +---->-----PTRACE_CONT---->----+
+                        *         |                              |
+                        *  syscall-exit-stop               seccomp-stop
+                        *         |                              |
+                        *         +----<----PTRACE_SYSCALL---<---+
+                        *
+                        * Note in Linux 4.8+, we restart in PTRACE_CONT
+                        * after syscall-exit to skip the syscall-entry-stop.
+                        * The next seccomp-stop will be treated as a syscall
+                        * entry.
+                        *
+                        * The line below implements this behavior.
+                        * Note that exiting(current_tcp) actually marks
+                        * a syscall-entry-stop because the flag was inverted
+                        * in the above call to trace_syscall.
+                        */
+                       restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
+               }
                break;
 
        case TE_SIGNAL_DELIVERY_STOP:
index 53a711b821e5f0cee3168967485a680bf80c5c12..9021fc5503f3adec64ee3626667a0d63af499033 100644 (file)
@@ -66,6 +66,11 @@ enum trace_event {
         * Restart the tracee with signal 0.
         */
        TE_STOP_BEFORE_EXIT,
+
+       /*
+        * SECCOMP_RET_TRACE rule is triggered.
+        */
+       TE_SECCOMP,
 };
 
 #endif /* !STRACE_TRACE_EVENT_H */