/*
 * NOTE(review): removed a scraped gitweb navigation header that is not
 * part of the source; file is strace's filter_seccomp.c.
 */
1 /*
2  * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3  * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4  * Copyright (c) 2019 The strace developers.
5  * All rights reserved.
6  *
7  * SPDX-License-Identifier: LGPL-2.1-or-later
8  */
9
10 #include "defs.h"
11
12 #include "ptrace.h"
13 #include <signal.h>
14 #include <sys/prctl.h>
15 #include <sys/wait.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
18
19 #include "filter_seccomp.h"
20 #include "number_set.h"
21 #include "syscall.h"
22 #include "scno.h"
23
24 bool seccomp_filtering;
25 bool seccomp_before_sysentry;
26
27 #ifdef HAVE_LINUX_SECCOMP_H
28
29 # include <linux/seccomp.h>
30
# ifndef BPF_MAXINSNS
#  define BPF_MAXINSNS 4096
# endif

/*
 * Placeholder jump offsets stored in jt/jf (or k for BPF_JA) while a
 * personality section is being generated; they are patched to real
 * offsets once the section length is known (see replace_jmp_placeholders
 * and the fix-up loop in init_sock_filter).
 */
# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)

/* Write a single BPF instruction at *filter.  */
# define SET_BPF(filter, code, jt, jf, k) \
        (*(filter) = (struct sock_filter) { code, jt, jf, k })

/* Non-jump statement: jt/jf are unused and set to 0.  */
# define SET_BPF_STMT(filter, code, k) \
        SET_BPF(filter, code, 0, 0, k)

/* Jump instruction: the BPF_JMP class bit is implied.  */
# define SET_BPF_JUMP(filter, code, k, jt, jf) \
        SET_BPF(filter, BPF_JMP | code, jt, jf, k)

/*
 * Per-personality AUDIT_ARCH_* identifier plus a flag OR-ed into syscall
 * numbers (used to tell apart personalities sharing an arch identifier,
 * e.g. x32 on x86_64 — see the reverse iteration note in init_sock_filter).
 */
struct audit_arch_t {
        unsigned int arch;
        unsigned int flag;
};

static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
# if SUPPORTED_PERSONALITIES > 1
        PERSONALITY0_AUDIT_ARCH,
        PERSONALITY1_AUDIT_ARCH,
#  if SUPPORTED_PERSONALITIES > 2
        PERSONALITY2_AUDIT_ARCH,
#  endif
# endif
};

# ifdef ENABLE_COVERAGE_GCOV
extern void __gcov_flush(void);
# endif
65
66 static void ATTRIBUTE_NORETURN
67 check_seccomp_order_do_child(void)
68 {
69         static const struct sock_filter filter[] = {
70                 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
71                 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
72                          offsetof(struct seccomp_data, nr)),
73                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
74                 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
75                 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
76         };
77         static const struct sock_fprog prog = {
78                 .len = ARRAY_SIZE(filter),
79                 .filter = (struct sock_filter *) filter
80         };
81
82         /* Get everything ready before PTRACE_TRACEME.  */
83         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
84                 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
85         if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
86                 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
87         int pid = getpid();
88
89         if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
90                 /* Exit with a nonzero exit status.  */
91                 perror_func_msg_and_die("PTRACE_TRACEME");
92         }
93
94 # ifdef ENABLE_COVERAGE_GCOV
95         __gcov_flush();
96 # endif
97
98         kill(pid, SIGSTOP);
99         syscall(__NR_gettid);
100         _exit(0);
101 }
102
/*
 * Tracer side of the seccomp stop ordering probe.  Walks the tracee
 * through the expected sequence of stops — initial SIGSTOP (step 0),
 * a seccomp stop and two syscall stops for gettid (steps 1-3, in either
 * order for the first two), the exit_group entry (step 4), and a clean
 * exit at step 5 — recording in seccomp_before_sysentry whether the
 * seccomp stop precedes the syscall-entry stop, and setting
 * seccomp_filtering when the whole sequence matches.
 * Returns the pid if the tracee may still be alive (so the caller must
 * kill and reap it), or 0 once the tracee has terminated.
 */
static int
check_seccomp_order_tracer(int pid)
{
	unsigned int step;

	for (step = 0; ; ++step) {
		int status;

		/* Wait for the next event, retrying on EINTR.  */
		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen.  */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			/* A clean exit at step 5 means the probe succeeded. */
			int exitstatus = WEXITSTATUS(status);
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen.  */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		/* The upper 16 bits of the status hold the ptrace event.  */
		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			/* Only the initial self-stop is expected.  */
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			/* Only seccomp events are expected here.  */
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid.  */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid.  */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			/* Syscall stop (PTRACE_O_TRACESYSGOOD sets 0x80).  */
			switch (step) {
			case 3: /* Exiting gettid.  */
			case 4: /* Entering exit_group.  */
				break;
			case 1: /* Entering gettid before seccomp stop.  */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop.  */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen.  */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			break;
		}
	}

	return pid;
}
226
227 static void
228 check_seccomp_order(void)
229 {
230         seccomp_filtering = false;
231
232         int pid = fork();
233         if (pid < 0) {
234                 perror_func_msg("fork");
235                 return;
236         }
237
238         if (pid == 0)
239                 check_seccomp_order_do_child();
240
241         pid = check_seccomp_order_tracer(pid);
242         if (pid) {
243                 kill(pid, SIGKILL);
244                 for (;;) {
245                         long rc = waitpid(pid, NULL, 0);
246                         if (rc < 0 && errno == EINTR)
247                                 continue;
248                         break;
249                 }
250         }
251 }
252
253 static bool
254 traced_by_seccomp(unsigned int scno, unsigned int p)
255 {
256         if (is_number_in_set_array(scno, trace_set, p)
257             || sysent_vec[p][scno].sys_flags
258             & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
259                 return true;
260         return false;
261 }
262
/*
 * Dry run of init_sock_filter()'s code generation: count how many BPF
 * instructions the filter would need, and clear seccomp_filtering when
 * a personality section's jumps could overflow or the whole program
 * would exceed BPF_MAXINSNS.
 */
static void
check_bpf_program_size(void)
{
	/* Initial load of data->arch, emitted only when multi-personality. */
	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;

	/*
	 * Implements a simplified form of init_sock_filter()'s bytecode
	 * generation algorithm, to count the number of instructions that will
	 * be generated.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1;
	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
		unsigned int nb_insns_personality = 0;
		unsigned int lower = UINT_MAX;

		/* Load of data->nr.  */
		nb_insns_personality++;
# if SUPPORTED_PERSONALITIES > 1
		/* Comparison against the personality's arch identifier.  */
		nb_insns_personality++;
		/* Flag check, arch reload, and jump to the next section.  */
		if (audit_arch_vec[p].flag)
			nb_insns_personality += 3;
# endif

		/* One insn per single traced syscall, two per longer run.  */
		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			if (lower + 1 == i)
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX) {
			if (lower + 1 == nsyscall_vec[p])
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
		}

		/* Trailing bound check plus the RET_ALLOW/RET_TRACE pair.  */
		nb_insns_personality += 3;

		/*
		 * Within generated BPF programs, the origin and destination of
		 * jumps are always in the same personality section.  The
		 * largest jump is therefore the jump from the first
		 * instruction of the section to the last, to skip the
		 * personality and try to compare .arch to the next
		 * personality.
		 * If we have a personality section with more than 255
		 * instructions, the jump offset will overflow.  Such program
		 * is unlikely to happen, so we simply disable seccomp filter
		 * is such a case.
		 */
		if (nb_insns_personality > UCHAR_MAX) {
			debug_msg("seccomp filter disabled due to "
				  "possibility of overflow");
			seccomp_filtering = false;
			return;
		}
		nb_insns += nb_insns_personality;
	}

# if SUPPORTED_PERSONALITIES > 1
	/* Final RET_TRACE fallback instruction.  */
	nb_insns++;
# endif

	if (nb_insns > BPF_MAXINSNS) {
		debug_msg("seccomp filter disabled due to BPF program being "
			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
		seccomp_filtering = false;
	}
}
339
340 static void
341 check_seccomp_filter_properties(void)
342 {
343         if (NOMMU_SYSTEM) {
344                 seccomp_filtering = false;
345                 return;
346         }
347
348         int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
349         seccomp_filtering = rc < 0 && errno != EINVAL;
350         if (!seccomp_filtering)
351                 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
352
353         if (seccomp_filtering)
354                 check_bpf_program_size();
355         if (seccomp_filtering)
356                 check_seccomp_order();
357 }
358
/*
 * Print a human-readable listing of the generated BPF program, one
 * error_msg() line per instruction.  Only used with -d (debug_flag).
 */
static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
	for (unsigned int i = 0; i < len; ++i) {
		const struct sock_filter *insn = filter + i;

		if (insn->code == (BPF_LD | BPF_W | BPF_ABS)) {
			if (insn->k == offsetof(struct seccomp_data, arch))
				error_msg("STMT(BPF_LDWABS, data->arch)");
			else if (insn->k == offsetof(struct seccomp_data, nr))
				error_msg("STMT(BPF_LDWABS, data->nr)");
			else
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  insn->k);
		} else if (insn->code == (BPF_RET | BPF_K)) {
			if (insn->k == SECCOMP_RET_TRACE)
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
			else if (insn->k == SECCOMP_RET_ALLOW)
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
			else
				error_msg("STMT(BPF_RET, 0x%x)", insn->k);
		} else if (insn->code == (BPF_JMP | BPF_JEQ | BPF_K)) {
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  insn->jt, insn->jf, insn->k);
		} else if (insn->code == (BPF_JMP | BPF_JGE | BPF_K)) {
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  insn->jt, insn->jf, insn->k);
		} else if (insn->code == (BPF_JMP | BPF_JA)) {
			error_msg("JUMP(BPF_JA, %u)", insn->k);
		} else {
			error_msg("STMT(0x%x, %u, %u, 0x%x)", insn->code,
				  insn->jt, insn->jf, insn->k);
		}
	}
}
408
409 static void
410 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
411                          unsigned char jmp_trace)
412 {
413         switch (*jmp_offset) {
414         case JMP_PLACEHOLDER_NEXT:
415                 *jmp_offset = jmp_next;
416                 break;
417         case JMP_PLACEHOLDER_TRACE:
418                 *jmp_offset = jmp_trace;
419                 break;
420         default:
421                 break;
422         }
423 }
424
425 static unsigned short
426 bpf_syscalls_cmp(struct sock_filter *filter,
427                  unsigned int lower, unsigned int upper)
428 {
429         if (lower + 1 == upper) {
430                 /* if (nr == lower) return RET_TRACE; */
431                 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
432                              JMP_PLACEHOLDER_TRACE, 0);
433                 return 1;
434         } else {
435                 /* if (nr >= lower && nr < upper) return RET_TRACE; */
436                 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
437                 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
438                              JMP_PLACEHOLDER_TRACE);
439                 return 2;
440         }
441 }
442
/*
 * Generate the seccomp BPF program into filter and return the number of
 * instructions written.  check_bpf_program_size() has already ensured
 * the result fits in BPF_MAXINSNS instructions and that no personality
 * section exceeds the one-byte jump offset range.
 */
static unsigned short
init_sock_filter(struct sock_filter *filter)
{
	/*
	 * Generated program looks like:
	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
	 *	if (nr == 59)
	 *		return SECCOMP_RET_TRACE;
	 *	if (nr >= 321 && nr <= 323)
	 *		return SECCOMP_RET_TRACE;
	 *	...
	 *	return SECCOMP_RET_ALLOW;
	 * }
	 * if (arch == AUDIT_ARCH_A) {
	 *	...
	 * }
	 * if (arch == AUDIT_ARCH_B) {
	 *	...
	 * }
	 * return SECCOMP_RET_TRACE;
	 */
	unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
# endif

	/*
	 * Personalities are iterated in reverse-order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and third
	 * personalities have the same arch identifier.  The third can be
	 * distinguished based on its associated syscall flag, so we check it
	 * first.  The only drawback here is that the first personality is more
	 * common, which may make the BPF program slower to match syscalls on
	 * average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
		unsigned int lower = UINT_MAX;
		unsigned short start = pos, end;

# if SUPPORTED_PERSONALITIES > 1
		/* if (arch != audit_arch_vec[p].arch) goto next; */
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			/* if (nr < audit_arch_vec[p].flag) goto next; */
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			/* Reload data->arch before jumping to the next
			 * section, which compares the accumulator to its
			 * own arch identifier.  */
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
		}
# endif

		/* Emit one comparison per contiguous run of traced syscalls,
		 * OR-ing the personality flag into the compared numbers.  */
		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						i | audit_arch_vec[p].flag);
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX)
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						nsyscall_vec[p]
						| audit_arch_vec[p].flag);
		end = pos;

		/* if (nr >= max_nr) return RET_TRACE; */
		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		/*
		 * Resolve placeholders now that the section is complete:
		 * BPF jump offsets are relative to the following insn, so
		 * jmp_next targets pos (the next personality section) and
		 * jmp_trace targets pos - 1 (the RET_TRACE just emitted).
		 */
		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace);
			/* BPF_JA carries its offset in k, not jt/jf.  */
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

# if SUPPORTED_PERSONALITIES > 1
	/* Jumps conditioned on .arch default to this RET_TRACE. */
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

	if (debug_flag)
		dump_seccomp_bpf(filter, pos);

	return pos;
}
557
/*
 * Generate the seccomp BPF program and attach it to the current
 * process; dies on failure.
 */
void
init_seccomp_filter(void)
{
	struct sock_filter filter[BPF_MAXINSNS];
	struct sock_fprog prog = {
		.filter = filter
	};

	prog.len = init_sock_filter(filter);

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}
577
578 int
579 seccomp_filter_restart_operator(const struct tcb *tcp)
580 {
581         if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
582             && traced_by_seccomp(tcp->scno, current_personality))
583                 return PTRACE_SYSCALL;
584         return PTRACE_CONT;
585 }
586
587 #else /* !HAVE_LINUX_SECCOMP_H */
588
589 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
590
static void
check_seccomp_filter_properties(void)
{
	/* Without <linux/seccomp.h>, seccomp filtering is never available. */
	seccomp_filtering = false;
}
596
void
init_seccomp_filter(void)
{
	/* No-op: seccomp filtering is unsupported in this build.  */
}
601
int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	/* Without seccomp support, always restart with PTRACE_SYSCALL.  */
	return PTRACE_SYSCALL;
}
607
608 #endif
609
610 void
611 check_seccomp_filter(void)
612 {
613         /* Let's avoid enabling seccomp if all syscalls are traced. */
614         seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
615                                                    SUPPORTED_PERSONALITIES);
616         if (!seccomp_filtering) {
617                 error_msg("Seccomp filter is requested "
618                           "but there are no syscalls to filter.  "
619                           "See -e trace to filter syscalls.");
620                 return;
621         }
622
623         check_seccomp_filter_properties();
624
625         if (!seccomp_filtering)
626                 error_msg("seccomp filter is requested but unavailable");
627 }