2 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4 * Copyright (c) 2019 The strace developers.
7 * SPDX-License-Identifier: LGPL-2.1-or-later
14 #include <sys/prctl.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
19 #include "filter_seccomp.h"
20 #include "number_set.h"
24 bool seccomp_filtering;
25 bool seccomp_before_sysentry;
27 #ifdef HAVE_LINUX_SECCOMP_H
29 # include <linux/seccomp.h>
32 # define BPF_MAXINSNS 4096
35 # define JMP_PLACEHOLDER_NEXT ((unsigned char) -1)
36 # define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
38 # define SET_BPF(filter, code, jt, jf, k) \
39 (*(filter) = (struct sock_filter) { code, jt, jf, k })
41 # define SET_BPF_STMT(filter, code, k) \
42 SET_BPF(filter, code, 0, 0, k)
44 # define SET_BPF_JUMP(filter, code, k, jt, jf) \
45 SET_BPF(filter, BPF_JMP | code, jt, jf, k)
52 static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
53 # if SUPPORTED_PERSONALITIES > 1
54 PERSONALITY0_AUDIT_ARCH,
55 PERSONALITY1_AUDIT_ARCH,
56 # if SUPPORTED_PERSONALITIES > 2
57 PERSONALITY2_AUDIT_ARCH,
# ifdef ENABLE_COVERAGE_GCOV
/* Flush coverage counters before the probe child _exit()s (see below). */
extern void __gcov_flush(void);
# endif
66 static void ATTRIBUTE_NORETURN
67 check_seccomp_order_do_child(void)
69 static const struct sock_filter filter[] = {
70 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
71 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
72 offsetof(struct seccomp_data, nr)),
73 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
74 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
75 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
77 static const struct sock_fprog prog = {
78 .len = ARRAY_SIZE(filter),
79 .filter = (struct sock_filter *) filter
82 /* Get everything ready before PTRACE_TRACEME. */
83 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
84 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
85 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
86 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
89 if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
90 /* Exit with a nonzero exit status. */
91 perror_func_msg_and_die("PTRACE_TRACEME");
94 # ifdef ENABLE_COVERAGE_GCOV
104 check_seccomp_order_tracer(int pid)
108 for (step = 0; ; ++step) {
112 long rc = waitpid(pid, &status, 0);
113 if (rc < 0 && errno == EINTR)
118 perror_func_msg("#%d: unexpected wait result %ld",
123 if (WIFEXITED(status)) {
124 /* The tracee is no more. */
127 int exitstatus = WEXITSTATUS(status);
128 if (step == 5 && exitstatus == 0) {
129 seccomp_filtering = true;
131 error_func_msg("#%d: unexpected exit status %u",
137 if (WIFSIGNALED(status)) {
138 /* The tracee is no more. */
141 error_func_msg("#%d: unexpected signal %u",
142 step, WTERMSIG(status));
146 if (!WIFSTOPPED(status)) {
148 error_func_msg("#%d: unexpected wait status %#x",
153 unsigned int event = (unsigned int) status >> 16;
155 switch (WSTOPSIG(status)) {
158 error_func_msg("#%d: unexpected signal stop",
162 if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
163 PTRACE_O_TRACESYSGOOD|
164 PTRACE_O_TRACESECCOMP) < 0) {
165 perror_func_msg("PTRACE_SETOPTIONS");
171 if (event != PTRACE_EVENT_SECCOMP) {
172 error_func_msg("#%d: unexpected trap %#x",
178 case 1: /* Seccomp stop before entering gettid. */
179 seccomp_before_sysentry = true;
181 case 2: /* Seccomp stop after entering gettid. */
182 if (!seccomp_before_sysentry)
184 ATTRIBUTE_FALLTHROUGH;
186 error_func_msg("#%d: unexpected seccomp stop",
194 case 3: /* Exiting gettid. */
195 case 4: /* Entering exit_group. */
197 case 1: /* Entering gettid before seccomp stop. */
198 seccomp_before_sysentry = false;
200 case 2: /* Entering gettid after seccomp stop. */
201 if (seccomp_before_sysentry)
203 ATTRIBUTE_FALLTHROUGH;
205 error_func_msg("#%d: unexpected syscall stop",
212 error_func_msg("#%d: unexpected stop signal %#x",
213 step, WSTOPSIG(status));
217 if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
219 perror_func_msg("#%d: PTRACE_SYSCALL", step);
228 check_seccomp_order(void)
230 seccomp_filtering = false;
234 perror_func_msg("fork");
239 check_seccomp_order_do_child();
241 pid = check_seccomp_order_tracer(pid);
245 long rc = waitpid(pid, NULL, 0);
246 if (rc < 0 && errno == EINTR)
254 traced_by_seccomp(unsigned int scno, unsigned int p)
256 if (is_number_in_set_array(scno, trace_set, p)
257 || sysent_vec[p][scno].sys_flags
258 & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
264 check_bpf_program_size(void)
266 unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
269 * Implements a simplified form of init_sock_filter()'s bytecode
270 * generation algorithm, to count the number of instructions that will
273 for (int p = SUPPORTED_PERSONALITIES - 1;
274 p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
275 unsigned int nb_insns_personality = 0;
276 unsigned int lower = UINT_MAX;
278 nb_insns_personality++;
279 # if SUPPORTED_PERSONALITIES > 1
280 nb_insns_personality++;
281 if (audit_arch_vec[p].flag)
282 nb_insns_personality += 3;
285 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
286 if (traced_by_seccomp(i, p)) {
287 if (lower == UINT_MAX)
291 if (lower == UINT_MAX)
294 nb_insns_personality++;
296 nb_insns_personality += 2;
299 if (lower != UINT_MAX) {
300 if (lower + 1 == nsyscall_vec[p])
301 nb_insns_personality++;
303 nb_insns_personality += 2;
306 nb_insns_personality += 3;
309 * Within generated BPF programs, the origin and destination of
310 * jumps are always in the same personality section. The
311 * largest jump is therefore the jump from the first
312 * instruction of the section to the last, to skip the
313 * personality and try to compare .arch to the next
315 * If we have a personality section with more than 255
316 * instructions, the jump offset will overflow. Such program
317 * is unlikely to happen, so we simply disable seccomp filter
320 if (nb_insns_personality > UCHAR_MAX) {
321 debug_msg("seccomp filter disabled due to "
322 "possibility of overflow");
323 seccomp_filtering = false;
326 nb_insns += nb_insns_personality;
329 # if SUPPORTED_PERSONALITIES > 1
333 if (nb_insns > BPF_MAXINSNS) {
334 debug_msg("seccomp filter disabled due to BPF program being "
335 "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
336 seccomp_filtering = false;
341 check_seccomp_filter_properties(void)
344 seccomp_filtering = false;
348 int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
349 seccomp_filtering = rc < 0 && errno != EINVAL;
350 if (!seccomp_filtering)
351 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
353 if (seccomp_filtering)
354 check_bpf_program_size();
355 if (seccomp_filtering)
356 check_seccomp_order();
/*
 * Debug helper: print a human-readable disassembly of the generated BPF
 * program, one instruction per line, via error_msg().
 */
static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
	for (unsigned int i = 0; i < len; ++i) {
		switch (filter[i].code) {
		case BPF_LD | BPF_W | BPF_ABS:
			switch (filter[i].k) {
			case offsetof(struct seccomp_data, arch):
				error_msg("STMT(BPF_LDWABS, data->arch)");
				break;
			case offsetof(struct seccomp_data, nr):
				error_msg("STMT(BPF_LDWABS, data->nr)");
				break;
			default:
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  filter[i].k);
			}
			break;
		case BPF_RET | BPF_K:
			switch (filter[i].k) {
			case SECCOMP_RET_TRACE:
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
				break;
			case SECCOMP_RET_ALLOW:
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
				break;
			default:
				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
			}
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
				  filter[i].jt, filter[i].jf, filter[i].k);
		}
	}
}
410 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
411 unsigned char jmp_trace)
413 switch (*jmp_offset) {
414 case JMP_PLACEHOLDER_NEXT:
415 *jmp_offset = jmp_next;
417 case JMP_PLACEHOLDER_TRACE:
418 *jmp_offset = jmp_trace;
425 static unsigned short
426 bpf_syscalls_cmp(struct sock_filter *filter,
427 unsigned int lower, unsigned int upper)
429 if (lower + 1 == upper) {
430 /* if (nr == lower) return RET_TRACE; */
431 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
432 JMP_PLACEHOLDER_TRACE, 0);
435 /* if (nr >= lower && nr < upper) return RET_TRACE; */
436 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
437 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
438 JMP_PLACEHOLDER_TRACE);
443 static unsigned short
444 init_sock_filter(struct sock_filter *filter)
447 * Generated program looks like:
448 * if (arch == AUDIT_ARCH_A && nr >= flag) {
450 * return SECCOMP_RET_TRACE;
451 * if (nr >= 321 && nr <= 323)
452 * return SECCOMP_RET_TRACE;
454 * return SECCOMP_RET_ALLOW;
456 * if (arch == AUDIT_ARCH_A) {
459 * if (arch == AUDIT_ARCH_B) {
462 * return SECCOMP_RET_TRACE;
464 unsigned short pos = 0;
466 # if SUPPORTED_PERSONALITIES > 1
467 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
468 offsetof(struct seccomp_data, arch));
472 * Personalities are iterated in reverse-order in the BPF program so
473 * that the x86 case is naturally handled. On x86, the first and third
474 * personalities have the same arch identifier. The third can be
475 * distinguished based on its associated syscall flag, so we check it
476 * first. The only drawback here is that the first personality is more
477 * common, which may make the BPF program slower to match syscalls on
480 for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
481 unsigned int lower = UINT_MAX;
482 unsigned short start = pos, end;
484 # if SUPPORTED_PERSONALITIES > 1
485 /* if (arch != audit_arch_vec[p].arch) goto next; */
486 SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
487 audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
489 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
490 offsetof(struct seccomp_data, nr));
492 # if SUPPORTED_PERSONALITIES > 1
493 if (audit_arch_vec[p].flag) {
494 /* if (nr < audit_arch_vec[p].flag) goto next; */
495 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
496 audit_arch_vec[p].flag, 2, 0);
497 SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
498 offsetof(struct seccomp_data, arch));
499 SET_BPF_JUMP(&filter[pos++], BPF_JA,
500 JMP_PLACEHOLDER_NEXT, 0, 0);
504 for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
505 if (traced_by_seccomp(i, p)) {
506 if (lower == UINT_MAX)
510 if (lower == UINT_MAX)
512 pos += bpf_syscalls_cmp(filter + pos,
513 lower | audit_arch_vec[p].flag,
514 i | audit_arch_vec[p].flag);
517 if (lower != UINT_MAX)
518 pos += bpf_syscalls_cmp(filter + pos,
519 lower | audit_arch_vec[p].flag,
521 | audit_arch_vec[p].flag);
524 /* if (nr >= max_nr) return RET_TRACE; */
525 SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
526 nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
528 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
530 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
533 for (unsigned int i = start; i < end; ++i) {
534 if (BPF_CLASS(filter[i].code) != BPF_JMP)
536 unsigned char jmp_next = pos - i - 1;
537 unsigned char jmp_trace = pos - i - 2;
538 replace_jmp_placeholders(&filter[i].jt, jmp_next,
540 replace_jmp_placeholders(&filter[i].jf, jmp_next,
542 if (BPF_OP(filter[i].code) == BPF_JA)
543 filter[i].k = (unsigned int) jmp_next;
547 # if SUPPORTED_PERSONALITIES > 1
548 /* Jumps conditioned on .arch default to this RET_TRACE. */
549 SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
553 dump_seccomp_bpf(filter, pos);
/*
 * Generate and install the seccomp BPF program for the current tracee.
 * Called in the child after fork, before exec; dies on failure.
 */
void
init_seccomp_filter(void)
{
	struct sock_filter filter[BPF_MAXINSNS];
	unsigned short len;

	len = init_sock_filter(filter);

	struct sock_fprog prog = {
		.len = len,
		.filter = filter
	};

	/* Required to install a filter without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}
579 seccomp_filter_restart_operator(const struct tcb *tcp)
581 if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
582 && traced_by_seccomp(tcp->scno, current_personality))
583 return PTRACE_SYSCALL;
587 #else /* !HAVE_LINUX_SECCOMP_H */
589 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
592 check_seccomp_filter_properties(void)
594 seccomp_filtering = false;
598 init_seccomp_filter(void)
603 seccomp_filter_restart_operator(const struct tcb *tcp)
605 return PTRACE_SYSCALL;
611 check_seccomp_filter(void)
613 /* Let's avoid enabling seccomp if all syscalls are traced. */
614 seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
615 SUPPORTED_PERSONALITIES);
616 if (!seccomp_filtering) {
617 error_msg("Seccomp filter is requested "
618 "but there are no syscalls to filter. "
619 "See -e trace to filter syscalls.");
623 check_seccomp_filter_properties();
625 if (!seccomp_filtering)
626 error_msg("seccomp filter is requested but unavailable");