2 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4 * Copyright (c) 2019 The strace developers.
7 * SPDX-License-Identifier: LGPL-2.1-or-later
14 #include <sys/prctl.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
19 #include "filter_seccomp.h"
20 #include "number_set.h"
/* Whether syscall trapping is delegated to a seccomp BPF filter; set by the
 * capability probes below and cleared whenever a limit check fails. */
24 bool seccomp_filtering;
/* True when the kernel reports the seccomp stop before the syscall-entry
 * stop (ordering detected empirically by check_seccomp_order()). */
25 bool seccomp_before_sysentry;
27 #ifdef HAVE_LINUX_SECCOMP_H
29 # include <linux/seccomp.h>
/* Classic-BPF program size limit; NOTE(review): presumably wrapped in an
 * #ifndef guard in the full file in case <linux/filter.h> already defines it. */
32 # define BPF_MAXINSNS 4096
/* Sentinel jump offsets stored in jt/jf while the program is being generated;
 * patched to real offsets by replace_jmp_placeholders() once the position of
 * the "next personality section" and "return RET_TRACE" targets is known. */
35 # define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
36 # define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
/* Write one struct sock_filter instruction in place. */
38 # define SET_BPF(filter, code, jt, jf, k) \
39 (*(filter) = (struct sock_filter) { code, jt, jf, k })
/* Non-jump statement: both jump offsets are zero. */
41 # define SET_BPF_STMT(filter, code, k) \
42 SET_BPF(filter, code, 0, 0, k)
/* Jump instruction: BPF_JMP class is OR-ed in automatically. */
44 # define SET_BPF_JUMP(filter, code, k, jt, jf) \
45 SET_BPF(filter, BPF_JMP | code, jt, jf, k)
/*
 * AUDIT_ARCH identifier plus an optional syscall-number flag bit (.flag,
 * used below to distinguish personalities sharing one arch id, e.g. x86's
 * __X32_SYSCALL_BIT-style marker) for each supported personality, indexed
 * by personality number.  NOTE(review): the PERSONALITY*_AUDIT_ARCH
 * initializers presumably come from per-arch headers — confirm in the
 * full tree.
 */
52 static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
53 # if SUPPORTED_PERSONALITIES > 1
54 PERSONALITY0_AUDIT_ARCH,
55 PERSONALITY1_AUDIT_ARCH,
56 # if SUPPORTED_PERSONALITIES > 2
57 PERSONALITY2_AUDIT_ARCH,
64 # ifdef ENABLE_COVERAGE_GCOV
65 extern void __gcov_flush(void);
68 static void ATTRIBUTE_NORETURN
69 check_seccomp_order_do_child(void)
71 static const struct sock_filter filter[] = {
72 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
73 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
74 offsetof(struct seccomp_data, nr)),
75 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
76 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
77 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
79 static const struct sock_fprog prog = {
80 .len = ARRAY_SIZE(filter),
81 .filter = (struct sock_filter *) filter
84 /* Get everything ready before PTRACE_TRACEME. */
85 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
86 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
87 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
88 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
91 if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
92 /* Exit with a nonzero exit status. */
93 perror_func_msg_and_die("PTRACE_TRACEME");
96 # ifdef ENABLE_COVERAGE_GCOV
101 syscall(__NR_gettid);
/*
 * Tracer side of the seccomp-ordering probe.  Walks the child created by
 * check_seccomp_order() through a fixed sequence of ptrace stops, counting
 * them in `step`, and records the outcome in two globals:
 *   - seccomp_filtering: the child exited cleanly after the expected number
 *     of stops (step == 5, exit status 0);
 *   - seccomp_before_sysentry: whether PTRACE_EVENT_SECCOMP arrived before
 *     or after the syscall-entry stop for gettid.
 * The caller assigns the return value back to pid, so the function returns
 * a pid (0 once the tracee is gone, judging by the reaping loop in
 * check_seccomp_order()) — NOTE(review): signature line elided, confirm.
 */
106 check_seccomp_order_tracer(int pid)
110 for (step = 0; ; ++step) {
/* Retry wait on signal interruption; any other failure aborts the probe. */
114 long rc = waitpid(pid, &status, 0);
115 if (rc < 0 && errno == EINTR)
120 perror_func_msg("#%d: unexpected wait result %ld",
125 if (WIFEXITED(status)) {
126 /* The tracee is no more. */
/* Probe succeeds only on a clean exit after exactly 5 observed stops. */
129 int exitstatus = WEXITSTATUS(status);
130 if (step == 5 && exitstatus == 0) {
131 seccomp_filtering = true;
133 error_func_msg("#%d: unexpected exit status %u",
139 if (WIFSIGNALED(status)) {
140 /* The tracee is no more. */
143 error_func_msg("#%d: unexpected signal %u",
144 step, WTERMSIG(status));
148 if (!WIFSTOPPED(status)) {
150 error_func_msg("#%d: unexpected wait status %#x",
/* High 16 bits of the wait status carry the ptrace event number. */
155 unsigned int event = (unsigned int) status >> 16;
157 switch (WSTOPSIG(status)) {
160 error_func_msg("#%d: unexpected signal stop",
/* First stop (the child's SIGSTOP): enable sysgood + seccomp events. */
164 if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
165 PTRACE_O_TRACESYSGOOD|
166 PTRACE_O_TRACESECCOMP) < 0) {
167 perror_func_msg("PTRACE_SETOPTIONS");
173 if (event != PTRACE_EVENT_SECCOMP) {
174 error_func_msg("#%d: unexpected trap %#x",
/* SIGTRAP branch: a PTRACE_EVENT_SECCOMP stop; its step number tells
 * whether it preceded (step 1) or followed (step 2) syscall entry. */
180 case 1: /* Seccomp stop before entering gettid. */
181 seccomp_before_sysentry = true;
183 case 2: /* Seccomp stop after entering gettid. */
184 if (!seccomp_before_sysentry)
186 ATTRIBUTE_FALLTHROUGH;
188 error_func_msg("#%d: unexpected seccomp stop",
/* Syscall stops (SIGTRAP|0x80 thanks to TRACESYSGOOD): step numbers
 * must mirror the seccomp-stop ordering seen above. */
196 case 3: /* Exiting gettid. */
197 case 4: /* Entering exit_group. */
199 case 1: /* Entering gettid before seccomp stop. */
200 seccomp_before_sysentry = false;
202 case 2: /* Entering gettid after seccomp stop. */
203 if (seccomp_before_sysentry)
205 ATTRIBUTE_FALLTHROUGH;
207 error_func_msg("#%d: unexpected syscall stop",
214 error_func_msg("#%d: unexpected stop signal %#x",
215 step, WSTOPSIG(status));
/* Restart the tracee up to the next syscall-related stop. */
219 if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
221 perror_func_msg("#%d: PTRACE_SYSCALL", step);
228 # endif /* HAVE_FORK */
/*
 * Run the seccomp-ordering probe: fork a child that installs the probe
 * filter (check_seccomp_order_do_child) and trace it to completion
 * (check_seccomp_order_tracer), then reap whatever is left of the child.
 * seccomp_filtering is pessimistically reset before probing and only set
 * back to true by the tracer on full success.
 */
231 check_seccomp_order(void)
233 seccomp_filtering = false;
235 /* NOMMU provides no forks necessary for the test. */
239 perror_func_msg("fork");
/* In the child: never returns. */
244 check_seccomp_order_do_child();
/* In the parent: trace the child; a nonzero return means it still
 * needs to be reaped below. */
246 pid = check_seccomp_order_tracer(pid);
250 long rc = waitpid(pid, NULL, 0);
251 if (rc < 0 && errno == EINTR)
256 # endif /* HAVE_FORK */
/*
 * Whether syscall number scno of personality p must be reported to strace
 * by the seccomp filter: either the user asked for it (-e trace set), or
 * it belongs to a class that must always be visible (indirect subcalls,
 * syscalls traced by default under seccomp).
 */
260 traced_by_seccomp(unsigned int scno, unsigned int p)
262 if (is_number_in_set_array(scno, trace_set, p)
263 || sysent_vec[p][scno].sys_flags
264 & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
/*
 * Dry-run of init_sock_filter(): count how many BPF instructions the real
 * generator would emit, without emitting any.  Disables seccomp_filtering
 * when either a single personality section exceeds UCHAR_MAX instructions
 * (8-bit jump offsets would overflow) or the whole program exceeds
 * BPF_MAXINSNS.  Must be kept in sync with init_sock_filter().
 */
270 check_bpf_program_size(void)
/* One extra leading instruction (the arch load) when multi-personality. */
272 unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
275 * Implements a simplified form of init_sock_filter()'s bytecode
276 * generation algorithm, to count the number of instructions that will
279 for (int p = SUPPORTED_PERSONALITIES - 1;
280 p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
281 unsigned int nb_insns_personality = 0;
/* lower tracks the start of the current run of traced syscalls;
 * UINT_MAX means "not inside a run". */
282 unsigned int lower = UINT_MAX;
284 nb_insns_personality++;
285 # if SUPPORTED_PERSONALITIES > 1
/* Arch-compare jump, plus 3 more when the personality needs the
 * flag-bit disambiguation sequence. */
286 nb_insns_personality++;
287 if (audit_arch_vec[p].flag)
288 nb_insns_personality += 3;
/* Mirror of the run-length encoding loop in init_sock_filter():
 * 1 instruction for a single-syscall run, 2 for a range. */
291 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
292 if (traced_by_seccomp(i, p)) {
293 if (lower == UINT_MAX)
297 if (lower == UINT_MAX)
300 nb_insns_personality++;
302 nb_insns_personality += 2;
/* Close a run still open at the end of the table. */
305 if (lower != UINT_MAX) {
306 if (lower + 1 == nsyscall_vec[p])
307 nb_insns_personality++;
309 nb_insns_personality += 2;
/* Trailing "nr >= max_nr" check plus the two RET instructions. */
312 nb_insns_personality += 3;
315 * Within generated BPF programs, the origin and destination of
316 * jumps are always in the same personality section. The
317 * largest jump is therefore the jump from the first
318 * instruction of the section to the last, to skip the
319 * personality and try to compare .arch to the next
321 * If we have a personality section with more than 255
322 * instructions, the jump offset will overflow. Such program
323 * is unlikely to happen, so we simply disable seccomp filter
326 if (nb_insns_personality > UCHAR_MAX) {
327 debug_msg("seccomp filter disabled due to "
328 "possibility of overflow");
329 seccomp_filtering = false;
332 nb_insns += nb_insns_personality;
335 # if SUPPORTED_PERSONALITIES > 1
339 if (nb_insns > BPF_MAXINSNS) {
340 debug_msg("seccomp filter disabled due to BPF program being "
341 "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
342 seccomp_filtering = false;
/*
 * Probe kernel support for SECCOMP_MODE_FILTER and validate the would-be
 * filter.  prctl with a NULL filter pointer must always fail: EINVAL means
 * the filter mode (or prctl op) is unsupported, any other errno (typically
 * EFAULT) means the mode exists.  Only if supported do we check program
 * size limits and then the kernel's seccomp/sysentry stop ordering; each
 * check may clear seccomp_filtering, short-circuiting the next.
 */
347 check_seccomp_filter_properties(void)
349 int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
350 seccomp_filtering = rc < 0 && errno != EINVAL;
351 if (!seccomp_filtering)
352 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
354 if (seccomp_filtering)
355 check_bpf_program_size();
356 if (seccomp_filtering)
357 check_seccomp_order();
/*
 * Debugging aid: disassemble the generated BPF program to the error
 * output, one instruction per line, recognizing the handful of opcodes
 * init_sock_filter() emits and falling back to a raw dump for anything
 * else.
 */
361 dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
363 for (unsigned int i = 0; i < len; ++i) {
364 switch (filter[i].code) {
/* Loads of seccomp_data fields: print which field by its offset. */
365 case BPF_LD | BPF_W | BPF_ABS:
366 switch (filter[i].k) {
367 case offsetof(struct seccomp_data, arch):
368 error_msg("STMT(BPF_LDWABS, data->arch)");
370 case offsetof(struct seccomp_data, nr):
371 error_msg("STMT(BPF_LDWABS, data->nr)");
374 error_msg("STMT(BPF_LDWABS, 0x%x)",
378 case BPF_RET | BPF_K:
379 switch (filter[i].k) {
380 case SECCOMP_RET_TRACE:
381 error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
383 case SECCOMP_RET_ALLOW:
384 error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
387 error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
390 case BPF_JMP | BPF_JEQ | BPF_K:
391 error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
392 filter[i].jt, filter[i].jf,
395 case BPF_JMP | BPF_JGE | BPF_K:
396 error_msg("JUMP(BPF_JGE, %u, %u, %u)",
397 filter[i].jt, filter[i].jf,
400 case BPF_JMP | BPF_JA:
401 error_msg("JUMP(BPF_JA, %u)", filter[i].k);
/* Unrecognized opcode: raw code/jt/jf/k dump. */
404 error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
405 filter[i].jt, filter[i].jf, filter[i].k);
/*
 * Patch one 8-bit jump-offset slot (jt or jf): translate the two sentinel
 * values written during generation into the now-known concrete offsets of
 * the "next personality section" and "return RET_TRACE" targets.  Real
 * (non-placeholder) offsets are left untouched.
 */
411 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
412 unsigned char jmp_trace)
414 switch (*jmp_offset) {
415 case JMP_PLACEHOLDER_NEXT:
416 *jmp_offset = jmp_next;
418 case JMP_PLACEHOLDER_TRACE:
419 *jmp_offset = jmp_trace;
/*
 * Emit the instruction(s) matching the half-open syscall-number range
 * [lower, upper): a single JEQ when the range holds one number, otherwise
 * a JGE pair.  Jump targets use JMP_PLACEHOLDER_TRACE, patched later.
 * Returns the number of instructions written (callers advance pos by it).
 */
426 static unsigned short
427 bpf_syscalls_cmp(struct sock_filter *filter,
428 unsigned int lower, unsigned int upper)
430 if (lower + 1 == upper) {
431 /* if (nr == lower) return RET_TRACE; */
432 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
433 JMP_PLACEHOLDER_TRACE, 0);
436 /* if (nr >= lower && nr < upper) return RET_TRACE; */
437 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
438 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
439 JMP_PLACEHOLDER_TRACE);
/*
 * Generate the seccomp BPF program into `filter` (caller provides at least
 * BPF_MAXINSNS slots) and return its length in instructions.  One section
 * is emitted per personality, in reverse order; traced syscalls are
 * run-length encoded into JEQ/JGE comparisons whose jump offsets are
 * first written as placeholders and patched in a second pass.  Any change
 * to the emission logic must be mirrored in check_bpf_program_size().
 */
444 static unsigned short
445 init_sock_filter(struct sock_filter *filter)
448 * Generated program looks like:
449 * if (arch == AUDIT_ARCH_A && nr >= flag) {
451 * return SECCOMP_RET_TRACE;
452 * if (nr >= 321 && nr <= 323)
453 * return SECCOMP_RET_TRACE;
455 * return SECCOMP_RET_ALLOW;
457 * if (arch == AUDIT_ARCH_A) {
460 * if (arch == AUDIT_ARCH_B) {
463 * return SECCOMP_RET_TRACE;
465 unsigned short pos = 0;
467 # if SUPPORTED_PERSONALITIES > 1
/* Load data->arch once up front; each section compares against it. */
468 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
469 offsetof(struct seccomp_data, arch));
473 * Personalities are iterated in reverse-order in the BPF program so
474 * that the x86 case is naturally handled. On x86, the first and third
475 * personalities have the same arch identifier. The third can be
476 * distinguished based on its associated syscall flag, so we check it
477 * first. The only drawback here is that the first personality is more
478 * common, which may make the BPF program slower to match syscalls on
481 for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
/* lower == UINT_MAX means "not inside a run of traced syscalls". */
482 unsigned int lower = UINT_MAX;
483 unsigned short start = pos, end;
485 # if SUPPORTED_PERSONALITIES > 1
486 /* if (arch != audit_arch_vec[p].arch) goto next; */
487 SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
488 audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
490 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
491 offsetof(struct seccomp_data, nr));
493 # if SUPPORTED_PERSONALITIES > 1
/* Shared-arch disambiguation: when this personality's syscalls carry
 * a flag bit, numbers below the flag belong to the other personality,
 * so reload data->arch and skip to the next section. */
494 if (audit_arch_vec[p].flag) {
495 /* if (nr < audit_arch_vec[p].flag) goto next; */
496 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
497 audit_arch_vec[p].flag, 2, 0);
498 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
499 offsetof(struct seccomp_data, arch));
500 SET_BPF_JUMP(&filter[pos++], BPF_JA,
501 JMP_PLACEHOLDER_NEXT, 0, 0);
/* Run-length encode traced syscalls into [lower, i) ranges. */
505 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
506 if (traced_by_seccomp(i, p)) {
507 if (lower == UINT_MAX)
511 if (lower == UINT_MAX)
513 pos += bpf_syscalls_cmp(filter + pos,
514 lower | audit_arch_vec[p].flag,
515 i | audit_arch_vec[p].flag);
/* Flush a run still open at the end of the syscall table. */
518 if (lower != UINT_MAX)
519 pos += bpf_syscalls_cmp(filter + pos,
520 lower | audit_arch_vec[p].flag,
522 | audit_arch_vec[p].flag);
525 /* if (nr >= max_nr) return RET_TRACE; */
526 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
527 nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
529 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
531 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
/* Second pass over this section: resolve placeholder jt/jf offsets
 * now that the section's end (pos) is known. */
534 for (unsigned int i = start; i < end; ++i) {
535 if (BPF_CLASS(filter[i].code) != BPF_JMP)
537 unsigned char jmp_next = pos - i - 1;
538 unsigned char jmp_trace = pos - i - 2;
539 replace_jmp_placeholders(&filter[i].jt, jmp_next,
541 replace_jmp_placeholders(&filter[i].jf, jmp_next,
/* BPF_JA keeps its (32-bit) target in k, not jt/jf. */
543 if (BPF_OP(filter[i].code) == BPF_JA)
544 filter[i].k = (unsigned int) jmp_next;
548 # if SUPPORTED_PERSONALITIES > 1
549 /* Jumps conditioned on .arch default to this RET_TRACE. */
550 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
554 dump_seccomp_bpf(filter, pos);
/*
 * Build the BPF program and install it: PR_SET_NO_NEW_PRIVS is required
 * for an unprivileged process to call PR_SET_SECCOMP.  Both failures are
 * fatal — by this point the capability probes have promised it works.
 */
560 init_seccomp_filter(void)
562 struct sock_filter filter[BPF_MAXINSNS];
565 len = init_sock_filter(filter);
567 struct sock_fprog prog = {
572 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
573 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
575 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
576 perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
/*
 * Pick the ptrace restart request for a tracee: when the tracee is at
 * syscall entry of a seccomp-traced syscall (scno validated against the
 * current personality's table), restart with PTRACE_SYSCALL so the
 * syscall-exit stop is also delivered.  NOTE(review): the non-traced
 * branch is elided here — presumably returns PTRACE_CONT; confirm in the
 * full file.
 */
580 seccomp_filter_restart_operator(const struct tcb *tcp)
582 if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
583 && traced_by_seccomp(tcp->scno, current_personality))
584 return PTRACE_SYSCALL;
588 #else /* !HAVE_LINUX_SECCOMP_H */
590 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
/* Fallback stubs: report seccomp filtering as unavailable and always
 * restart tracees with PTRACE_SYSCALL, preserving the callers' contract. */
593 check_seccomp_filter_properties(void)
595 seccomp_filtering = false;
/* No-op: nothing to install without <linux/seccomp.h>. */
599 init_seccomp_filter(void)
604 seccomp_filter_restart_operator(const struct tcb *tcp)
606 return PTRACE_SYSCALL;
/*
 * Entry point for the user's seccomp-filter request.  Refuse when every
 * syscall is being traced anyway (the filter could suppress nothing),
 * otherwise run the kernel capability probes; warn when they leave
 * seccomp_filtering disabled.
 */
612 check_seccomp_filter(void)
614 /* Let's avoid enabling seccomp if all syscalls are traced. */
615 seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
616 SUPPORTED_PERSONALITIES);
617 if (!seccomp_filtering) {
618 error_msg("Seccomp filter is requested "
619 "but there are no syscalls to filter. "
620 "See -e trace to filter syscalls.");
624 check_seccomp_filter_properties();
626 if (!seccomp_filtering)
627 error_msg("seccomp filter is requested but unavailable");