/*
 * filter_seccomp.c - seccomp-assisted syscall filtering support for strace
 * (includes the fix for building on no-MMU targets).
 */
1 /*
2  * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3  * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4  * Copyright (c) 2019 The strace developers.
5  * All rights reserved.
6  *
7  * SPDX-License-Identifier: LGPL-2.1-or-later
8  */
9
10 #include "defs.h"
11
12 #include "ptrace.h"
13 #include <signal.h>
14 #include <sys/prctl.h>
15 #include <sys/wait.h>
16 #include <linux/audit.h>
17 #include <linux/filter.h>
18
19 #include "filter_seccomp.h"
20 #include "number_set.h"
21 #include "syscall.h"
22 #include "scno.h"
23
/* True when seccomp-assisted syscall filtering is enabled and usable. */
bool seccomp_filtering;
/* True when the kernel delivers the seccomp stop before the
 * syscall-entry stop (probed at startup by check_seccomp_order()). */
bool seccomp_before_sysentry;
26
27 #ifdef HAVE_LINUX_SECCOMP_H
28
29 # include <linux/seccomp.h>
30
/* Fallback for libc headers that do not define the classic BPF limit. */
# ifndef BPF_MAXINSNS
#  define BPF_MAXINSNS 4096
# endif

/*
 * Sentinel jump offsets emitted during BPF generation; they are patched
 * to real offsets by replace_jmp_placeholders() once the end of the
 * personality section is known.
 */
# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)

/* Helpers to write a single struct sock_filter instruction in place. */
# define SET_BPF(filter, code, jt, jf, k) \
	(*(filter) = (struct sock_filter) { code, jt, jf, k })

# define SET_BPF_STMT(filter, code, k) \
	SET_BPF(filter, code, 0, 0, k)

# define SET_BPF_JUMP(filter, code, k, jt, jf) \
	SET_BPF(filter, BPF_JMP | code, jt, jf, k)

struct audit_arch_t {
	/* AUDIT_ARCH_* identifier matched against seccomp_data.arch. */
	unsigned int arch;
	/* Bit OR'ed into syscall numbers to distinguish personalities
	 * that share the same arch identifier; zero when unused. */
	unsigned int flag;
};

/* One entry per supported personality, indexed like sysent_vec. */
static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
# if SUPPORTED_PERSONALITIES > 1
	PERSONALITY0_AUDIT_ARCH,
	PERSONALITY1_AUDIT_ARCH,
#  if SUPPORTED_PERSONALITIES > 2
	PERSONALITY2_AUDIT_ARCH,
#  endif
# endif
};
61
62 # ifdef HAVE_FORK
63
64 #  ifdef ENABLE_COVERAGE_GCOV
65 extern void __gcov_flush(void);
66 #  endif
67
68 static void ATTRIBUTE_NORETURN
69 check_seccomp_order_do_child(void)
70 {
71         static const struct sock_filter filter[] = {
72                 /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
73                 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
74                          offsetof(struct seccomp_data, nr)),
75                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
76                 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
77                 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
78         };
79         static const struct sock_fprog prog = {
80                 .len = ARRAY_SIZE(filter),
81                 .filter = (struct sock_filter *) filter
82         };
83
84         /* Get everything ready before PTRACE_TRACEME.  */
85         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
86                 perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
87         if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
88                 perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
89         int pid = getpid();
90
91         if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
92                 /* Exit with a nonzero exit status.  */
93                 perror_func_msg_and_die("PTRACE_TRACEME");
94         }
95
96 #  ifdef ENABLE_COVERAGE_GCOV
97         __gcov_flush();
98 #  endif
99
100         kill(pid, SIGSTOP);
101         syscall(__NR_gettid);
102         _exit(0);
103 }
104
/*
 * Tracer side of the seccomp stop ordering probe.  Walks the child
 * created by check_seccomp_order() through its expected sequence of
 * stops, recording the observed behavior in seccomp_filtering and
 * seccomp_before_sysentry.  Returns 0 once the tracee has terminated,
 * or its pid if it is still alive (caller must kill and reap it).
 */
static int
check_seccomp_order_tracer(int pid)
{
	unsigned int step;

	for (step = 0; ; ++step) {
		int status;

		/* Wait for the next tracee event, retrying on EINTR. */
		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen.  */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			int exitstatus = WEXITSTATUS(status);
			/* The expected stop sequence is exactly 5 steps. */
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen.  */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		/* The ptrace event is encoded in bits 16-23 of the status. */
		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			/* Initial stop raised by the child itself. */
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid.  */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid.  */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			/* Syscall stop (bit 7 set by PTRACE_O_TRACESYSGOOD). */
			switch (step) {
			case 3: /* Exiting gettid.  */
			case 4: /* Entering exit_group.  */
				break;
			case 1: /* Entering gettid before seccomp stop.  */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop.  */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen.  */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			break;
		}
	}

	return pid;
}
228 # endif /* HAVE_FORK */
229
230 static void
231 check_seccomp_order(void)
232 {
233         seccomp_filtering = false;
234
235         /* NOMMU provides no forks necessary for the test.  */
236 # ifdef HAVE_FORK
237         int pid = fork();
238         if (pid < 0) {
239                 perror_func_msg("fork");
240                 return;
241         }
242
243         if (pid == 0)
244                 check_seccomp_order_do_child();
245
246         pid = check_seccomp_order_tracer(pid);
247         if (pid) {
248                 kill(pid, SIGKILL);
249                 for (;;) {
250                         long rc = waitpid(pid, NULL, 0);
251                         if (rc < 0 && errno == EINTR)
252                                 continue;
253                         break;
254                 }
255         }
256 # endif /* HAVE_FORK */
257 }
258
259 static bool
260 traced_by_seccomp(unsigned int scno, unsigned int p)
261 {
262         if (is_number_in_set_array(scno, trace_set, p)
263             || sysent_vec[p][scno].sys_flags
264             & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
265                 return true;
266         return false;
267 }
268
/*
 * Predict the size of the BPF program that init_sock_filter() would
 * generate for the current trace set, and clear seccomp_filtering if it
 * would exceed either BPF_MAXINSNS or the 255-instruction jump range of
 * a personality section.
 */
static void
check_bpf_program_size(void)
{
	/* One extra instruction for the initial .arch load. */
	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;

	/*
	 * Implements a simplified form of init_sock_filter()'s bytecode
	 * generation algorithm, to count the number of instructions that will
	 * be generated.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1;
	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
		unsigned int nb_insns_personality = 0;
		/* Lower bound of the current run of traced syscalls;
		 * UINT_MAX means no run is open. */
		unsigned int lower = UINT_MAX;

		/* Load of seccomp_data.nr. */
		nb_insns_personality++;
# if SUPPORTED_PERSONALITIES > 1
		/* The .arch comparison jump. */
		nb_insns_personality++;
		/* Flagged personality: bound check, .arch reload, jump. */
		if (audit_arch_vec[p].flag)
			nb_insns_personality += 3;
# endif

		/* One instruction per single traced syscall, two per range,
		 * mirroring bpf_syscalls_cmp(). */
		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			if (lower + 1 == i)
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX) {
			if (lower + 1 == nsyscall_vec[p])
				nb_insns_personality++;
			else
				nb_insns_personality += 2;
		}

		/* Trailing bound check plus the RET_ALLOW/RET_TRACE pair. */
		nb_insns_personality += 3;

		/*
		 * Within generated BPF programs, the origin and destination of
		 * jumps are always in the same personality section.  The
		 * largest jump is therefore the jump from the first
		 * instruction of the section to the last, to skip the
		 * personality and try to compare .arch to the next
		 * personality.
		 * If we have a personality section with more than 255
		 * instructions, the jump offset will overflow.  Such program
		 * is unlikely to happen, so we simply disable seccomp filter
		 * is such a case.
		 */
		if (nb_insns_personality > UCHAR_MAX) {
			debug_msg("seccomp filter disabled due to "
				  "possibility of overflow");
			seccomp_filtering = false;
			return;
		}
		nb_insns += nb_insns_personality;
	}

# if SUPPORTED_PERSONALITIES > 1
	/* Final RET_TRACE fallthrough instruction. */
	nb_insns++;
# endif

	if (nb_insns > BPF_MAXINSNS) {
		debug_msg("seccomp filter disabled due to BPF program being "
			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
		seccomp_filtering = false;
	}
}
345
346 static void
347 check_seccomp_filter_properties(void)
348 {
349         int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
350         seccomp_filtering = rc < 0 && errno != EINVAL;
351         if (!seccomp_filtering)
352                 debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
353
354         if (seccomp_filtering)
355                 check_bpf_program_size();
356         if (seccomp_filtering)
357                 check_seccomp_order();
358 }
359
/*
 * Print a human-readable rendition of the generated BPF program, one
 * line per instruction, for debugging purposes.
 */
static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
	for (unsigned int i = 0; i < len; ++i) {
		const struct sock_filter *insn = &filter[i];

		switch (insn->code) {
		case BPF_LD | BPF_W | BPF_ABS:
			if (insn->k == offsetof(struct seccomp_data, arch))
				error_msg("STMT(BPF_LDWABS, data->arch)");
			else if (insn->k == offsetof(struct seccomp_data, nr))
				error_msg("STMT(BPF_LDWABS, data->nr)");
			else
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  insn->k);
			break;
		case BPF_RET | BPF_K:
			if (insn->k == SECCOMP_RET_TRACE)
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
			else if (insn->k == SECCOMP_RET_ALLOW)
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
			else
				error_msg("STMT(BPF_RET, 0x%x)", insn->k);
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  insn->jt, insn->jf, insn->k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  insn->jt, insn->jf, insn->k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", insn->k);
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", insn->code,
				  insn->jt, insn->jf, insn->k);
		}
	}
}
409
410 static void
411 replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
412                          unsigned char jmp_trace)
413 {
414         switch (*jmp_offset) {
415         case JMP_PLACEHOLDER_NEXT:
416                 *jmp_offset = jmp_next;
417                 break;
418         case JMP_PLACEHOLDER_TRACE:
419                 *jmp_offset = jmp_trace;
420                 break;
421         default:
422                 break;
423         }
424 }
425
426 static unsigned short
427 bpf_syscalls_cmp(struct sock_filter *filter,
428                  unsigned int lower, unsigned int upper)
429 {
430         if (lower + 1 == upper) {
431                 /* if (nr == lower) return RET_TRACE; */
432                 SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
433                              JMP_PLACEHOLDER_TRACE, 0);
434                 return 1;
435         } else {
436                 /* if (nr >= lower && nr < upper) return RET_TRACE; */
437                 SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
438                 SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
439                              JMP_PLACEHOLDER_TRACE);
440                 return 2;
441         }
442 }
443
/*
 * Generate the seccomp BPF program into filter (which must hold at
 * least BPF_MAXINSNS instructions, as guaranteed by
 * check_bpf_program_size()).  Returns the number of instructions
 * written.
 */
static unsigned short
init_sock_filter(struct sock_filter *filter)
{
	/*
	 * Generated program looks like:
	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
	 *	if (nr == 59)
	 *		return SECCOMP_RET_TRACE;
	 *	if (nr >= 321 && nr <= 323)
	 *		return SECCOMP_RET_TRACE;
	 *	...
	 *	return SECCOMP_RET_ALLOW;
	 * }
	 * if (arch == AUDIT_ARCH_A) {
	 *	...
	 * }
	 * if (arch == AUDIT_ARCH_B) {
	 *	...
	 * }
	 * return SECCOMP_RET_TRACE;
	 */
	unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
	/* Load the audit architecture identifier of the tracee first. */
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
# endif

	/*
	 * Personalities are iterated in reverse-order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and third
	 * personalities have the same arch identifier.  The third can be
	 * distinguished based on its associated syscall flag, so we check it
	 * first.  The only drawback here is that the first personality is more
	 * common, which may make the BPF program slower to match syscalls on
	 * average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
		/* Lower bound of the open run of traced syscalls. */
		unsigned int lower = UINT_MAX;
		/* start/end delimit this personality's jump instructions,
		 * whose placeholder offsets are patched below. */
		unsigned short start = pos, end;

# if SUPPORTED_PERSONALITIES > 1
		/* if (arch != audit_arch_vec[p].arch) goto next; */
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			/* if (nr < audit_arch_vec[p].flag) goto next; */
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			/* Reload .arch before jumping to the next
			 * personality's comparison. */
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
		}
# endif

		/* Emit one comparison per contiguous run of traced
		 * syscall numbers. */
		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						i | audit_arch_vec[p].flag);
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX)
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						nsyscall_vec[p]
						| audit_arch_vec[p].flag);
		end = pos;

		/* if (nr >= max_nr) return RET_TRACE; */
		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		/* Back-patch the placeholder jump offsets now that the
		 * positions of RET_ALLOW and RET_TRACE are known. */
		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

# if SUPPORTED_PERSONALITIES > 1
	/* Jumps conditioned on .arch default to this RET_TRACE. */
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

	if (debug_flag)
		dump_seccomp_bpf(filter, pos);

	return pos;
}
558
/*
 * Generate the BPF program and install it as the seccomp filter of the
 * current process.  Dies on failure.
 */
void
init_seccomp_filter(void)
{
	struct sock_filter filter[BPF_MAXINSNS];
	const struct sock_fprog prog = {
		.len = init_sock_filter(filter),
		.filter = filter
	};

	/* Mandatory for installing a seccomp filter without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}
578
579 int
580 seccomp_filter_restart_operator(const struct tcb *tcp)
581 {
582         if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
583             && traced_by_seccomp(tcp->scno, current_personality))
584                 return PTRACE_SYSCALL;
585         return PTRACE_CONT;
586 }
587
588 #else /* !HAVE_LINUX_SECCOMP_H */
589
590 # warning <linux/seccomp.h> is not available, seccomp filtering is not supported
591
static void
check_seccomp_filter_properties(void)
{
	/* Without <linux/seccomp.h>, seccomp filtering is never available. */
	seccomp_filtering = false;
}
597
/* No-op stub: nothing to install when seccomp is unsupported. */
void
init_seccomp_filter(void)
{
}
602
int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	/* Without a seccomp filter, every syscall stop must be requested. */
	return PTRACE_SYSCALL;
}
608
609 #endif
610
611 void
612 check_seccomp_filter(void)
613 {
614         /* Let's avoid enabling seccomp if all syscalls are traced. */
615         seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
616                                                    SUPPORTED_PERSONALITIES);
617         if (!seccomp_filtering) {
618                 error_msg("Seccomp filter is requested "
619                           "but there are no syscalls to filter.  "
620                           "See -e trace to filter syscalls.");
621                 return;
622         }
623
624         check_seccomp_filter_properties();
625
626         if (!seccomp_filtering)
627                 error_msg("seccomp filter is requested but unavailable");
628 }