/*
 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
 * Copyright (c) 2019 The strace developers.
 * All rights reserved.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#include "defs.h"

#include "ptrace.h"
#include <signal.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <linux/filter.h>

#include "filter_seccomp.h"
#include "number_set.h"
#include "syscall.h"
#include "scno.h"
bool seccomp_filtering;
bool seccomp_before_sysentry;

#ifdef HAVE_LINUX_SECCOMP_H

# include <linux/seccomp.h>

/* PERSONALITY*_AUDIT_ARCH definitions depend on AUDIT_ARCH_* constants.  */
# ifdef PERSONALITY0_AUDIT_ARCH
#  include <linux/audit.h>
#  define XLAT_MACROS_ONLY
#   include "xlat/elf_em.h"
#   include "xlat/audit_arch.h"
#  undef XLAT_MACROS_ONLY
# endif
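
/*
 * As an illustration (the actual values live in the per-architecture
 * headers): on x86_64, PERSONALITY0_AUDIT_ARCH is expected to expand to
 * something like { AUDIT_ARCH_X86_64, 0 }, while the x32 personality
 * pairs AUDIT_ARCH_X86_64 with __X32_SYSCALL_BIT so that two
 * personalities sharing an audit arch can be told apart.
 */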

# ifndef BPF_MAXINSNS
#  define BPF_MAXINSNS 4096
# endif

# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)

# define SET_BPF(filter, code, jt, jf, k) \
        (*(filter) = (struct sock_filter) { code, jt, jf, k })

# define SET_BPF_STMT(filter, code, k) \
        SET_BPF(filter, code, 0, 0, k)

# define SET_BPF_JUMP(filter, code, k, jt, jf) \
        SET_BPF(filter, BPF_JMP | code, jt, jf, k)
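
/*
 * Note the argument order: SET_BPF_JUMP takes the constant k before the
 * jt/jf offsets and ORs in the BPF_JMP class itself, so e.g.
 * SET_BPF_JUMP(f, BPF_JEQ | BPF_K, 42, 1, 0) emits
 * "if (A == 42) skip one instruction".
 */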

struct audit_arch_t {
        unsigned int arch;
        unsigned int flag;
};

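/*
 * One entry per supported personality: the AUDIT_ARCH_* identifier
 * reported in seccomp_data.arch, plus an optional syscall-number flag
 * (such as __X32_SYSCALL_BIT) that distinguishes personalities sharing
 * the same arch identifier.
 */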
static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
# if SUPPORTED_PERSONALITIES > 1
        PERSONALITY0_AUDIT_ARCH,
        PERSONALITY1_AUDIT_ARCH,
#  if SUPPORTED_PERSONALITIES > 2
        PERSONALITY2_AUDIT_ARCH,
#  endif
# endif
};

# ifdef HAVE_FORK

#  ifdef ENABLE_COVERAGE_GCOV
extern void __gcov_flush(void);
#  endif

static void ATTRIBUTE_NORETURN
check_seccomp_order_do_child(void)
{
        static const struct sock_filter filter[] = {
                /* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
        };
        static const struct sock_fprog prog = {
                .len = ARRAY_SIZE(filter),
                .filter = (struct sock_filter *) filter
        };

        /* Get everything ready before PTRACE_TRACEME.  */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1)");
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
                perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
        int pid = getpid();

        if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
                /* Exit with a nonzero exit status.  */
                perror_func_msg_and_die("PTRACE_TRACEME");
        }

#  ifdef ENABLE_COVERAGE_GCOV
        __gcov_flush();
#  endif

        kill(pid, SIGSTOP);
        syscall(__NR_gettid);
        _exit(0);
}

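/*
 * Expected sequence of wait events, as counted by "step" below:
 *   #0      the initial SIGSTOP,
 *   #1, #2  the seccomp stop and the gettid syscall-entry stop, in an
 *           order that depends on the kernel version,
 *   #3      the gettid syscall-exit stop,
 *   #4      the exit_group syscall-entry stop,
 *   #5      the child exit with status 0.
 */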
static int
check_seccomp_order_tracer(int pid)
{
        unsigned int step;

        for (step = 0; ; ++step) {
                int status;

                for (;;) {
                        long rc = waitpid(pid, &status, 0);
                        if (rc < 0 && errno == EINTR)
                                continue;
                        if (rc == pid)
                                break;
                        /* Cannot happen.  */
                        perror_func_msg("#%d: unexpected wait result %ld",
                                        step, rc);
                        return pid;
                }

                if (WIFEXITED(status)) {
                        /* The tracee is no more.  */
                        pid = 0;

                        int exitstatus = WEXITSTATUS(status);
                        if (step == 5 && exitstatus == 0) {
                                seccomp_filtering = true;
                        } else {
                                error_func_msg("#%d: unexpected exit status %u",
                                               step, exitstatus);
                        }
                        break;
                }

                if (WIFSIGNALED(status)) {
                        /* The tracee is no more.  */
                        pid = 0;

                        error_func_msg("#%d: unexpected signal %u",
                                       step, WTERMSIG(status));
                        break;
                }

                if (!WIFSTOPPED(status)) {
                        /* Cannot happen.  */
                        error_func_msg("#%d: unexpected wait status %#x",
                                       step, status);
                        break;
                }

                unsigned int event = (unsigned int) status >> 16; /* PTRACE_EVENT_* */

                switch (WSTOPSIG(status)) {
                case SIGSTOP:
                        if (step != 0) {
                                error_func_msg("#%d: unexpected signal stop",
                                               step);
                                return pid;
                        }
                        if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
                                   PTRACE_O_TRACESYSGOOD|
                                   PTRACE_O_TRACESECCOMP) < 0) {
                                perror_func_msg("PTRACE_SETOPTIONS");
                                return pid;
                        }
                        break;

                case SIGTRAP:
                        if (event != PTRACE_EVENT_SECCOMP) {
                                error_func_msg("#%d: unexpected trap %#x",
                                               step, event);
                                return pid;
                        }

                        switch (step) {
                        case 1: /* Seccomp stop before entering gettid.  */
                                seccomp_before_sysentry = true;
                                break;
                        case 2: /* Seccomp stop after entering gettid.  */
                                if (!seccomp_before_sysentry)
                                        break;
                                ATTRIBUTE_FALLTHROUGH;
                        default:
                                error_func_msg("#%d: unexpected seccomp stop",
                                               step);
                                return pid;
                        }
                        break;

                case SIGTRAP | 0x80:
                        switch (step) {
                        case 3: /* Exiting gettid.  */
                        case 4: /* Entering exit_group.  */
                                break;
                        case 1: /* Entering gettid before seccomp stop.  */
                                seccomp_before_sysentry = false;
                                break;
                        case 2: /* Entering gettid after seccomp stop.  */
                                if (seccomp_before_sysentry)
                                        break;
                                ATTRIBUTE_FALLTHROUGH;
                        default:
                                error_func_msg("#%d: unexpected syscall stop",
                                               step);
                                return pid;
                        }
                        break;

                default:
                        error_func_msg("#%d: unexpected stop signal %#x",
                                       step, WSTOPSIG(status));
                        return pid;
                }

                if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
                        /* Cannot happen.  */
                        perror_func_msg("#%d: PTRACE_SYSCALL", step);
                        break;
                }
        }

        return pid;
}
# endif /* HAVE_FORK */

static void
check_seccomp_order(void)
{
        seccomp_filtering = false;

        /* No-MMU systems lack the fork() this test requires.  */
# ifdef HAVE_FORK
        int pid = fork();
        if (pid < 0) {
                perror_func_msg("fork");
                return;
        }

        if (pid == 0)
                check_seccomp_order_do_child();

        pid = check_seccomp_order_tracer(pid);
        if (pid) {
                kill(pid, SIGKILL);
                for (;;) {
                        long rc = waitpid(pid, NULL, 0);
                        if (rc < 0 && errno == EINTR)
                                continue;
                        break;
                }
        }
# endif /* HAVE_FORK */
}

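/*
 * Returns true when syscall scno of personality p has to be reported to
 * the tracer: either it belongs to the requested trace set, or it
 * carries a flag (indirect subcall, seccomp default) that requires
 * tracing regardless of the set.
 */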
static bool
traced_by_seccomp(unsigned int scno, unsigned int p)
{
        if (is_number_in_set_array(scno, trace_set, p)
            || sysent_vec[p][scno].sys_flags
            & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
                return true;
        return false;
}

static void
check_bpf_program_size(void)
{
        unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;

        /*
         * Implements a simplified form of init_sock_filter()'s bytecode
         * generation algorithm, to count the number of instructions that will
         * be generated.
         */
        for (int p = SUPPORTED_PERSONALITIES - 1;
             p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
                unsigned int nb_insns_personality = 0;
                unsigned int lower = UINT_MAX;

                /* One insn to load seccomp_data.nr.  */
                nb_insns_personality++;
# if SUPPORTED_PERSONALITIES > 1
                /* One insn to compare seccomp_data.arch.  */
                nb_insns_personality++;
                /* Three insns for the syscall-number flag check.  */
                if (audit_arch_vec[p].flag)
                        nb_insns_personality += 3;
# endif

                for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
                        if (traced_by_seccomp(i, p)) {
                                if (lower == UINT_MAX)
                                        lower = i;
                                continue;
                        }
                        if (lower == UINT_MAX)
                                continue;
                        if (lower + 1 == i)
                                nb_insns_personality++;
                        else
                                nb_insns_personality += 2;
                        lower = UINT_MAX;
                }
                if (lower != UINT_MAX) {
                        if (lower + 1 == nsyscall_vec[p])
                                nb_insns_personality++;
                        else
                                nb_insns_personality += 2;
                }

                /* The trailing JGE bound check, RET_ALLOW, and RET_TRACE.  */
                nb_insns_personality += 3;

                /*
                 * Within generated BPF programs, the origin and destination of
                 * jumps are always in the same personality section.  The
                 * largest jump is therefore the jump from the first
                 * instruction of the section to the last, to skip the
                 * personality and try to compare .arch to the next
                 * personality.
                 * If we have a personality section with more than 255
                 * instructions, the jump offset will overflow.  Such a
                 * program is unlikely to happen, so we simply disable the
                 * seccomp filter in such a case.
                 */
                if (nb_insns_personality > UCHAR_MAX) {
                        debug_msg("seccomp filter disabled due to "
                                  "possibility of overflow");
                        seccomp_filtering = false;
                        return;
                }
                nb_insns += nb_insns_personality;
        }

# if SUPPORTED_PERSONALITIES > 1
        /* The final catch-all RET_TRACE.  */
        nb_insns++;
# endif

        if (nb_insns > BPF_MAXINSNS) {
                debug_msg("seccomp filter disabled due to BPF program being "
                          "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
                seccomp_filtering = false;
        }
}

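/*
 * prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) serves as a probe:
 * a kernel built with seccomp filter support rejects the NULL pointer
 * with EFAULT, while a kernel without it fails with EINVAL.
 */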
static void
check_seccomp_filter_properties(void)
{
        int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
        seccomp_filtering = rc < 0 && errno != EINVAL;
        if (!seccomp_filtering)
                debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");

        if (seccomp_filtering)
                check_bpf_program_size();
        if (seccomp_filtering)
                check_seccomp_order();
}

static void
dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
{
        for (unsigned int i = 0; i < len; ++i) {
                switch (filter[i].code) {
                case BPF_LD | BPF_W | BPF_ABS:
                        switch (filter[i].k) {
                        case offsetof(struct seccomp_data, arch):
                                error_msg("STMT(BPF_LDWABS, data->arch)");
                                break;
                        case offsetof(struct seccomp_data, nr):
                                error_msg("STMT(BPF_LDWABS, data->nr)");
                                break;
                        default:
                                error_msg("STMT(BPF_LDWABS, 0x%x)",
                                          filter[i].k);
                        }
                        break;
                case BPF_RET | BPF_K:
                        switch (filter[i].k) {
                        case SECCOMP_RET_TRACE:
                                error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
                                break;
                        case SECCOMP_RET_ALLOW:
                                error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
                                break;
                        default:
                                error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
                        }
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                        error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
                                  filter[i].jt, filter[i].jf,
                                  filter[i].k);
                        break;
                case BPF_JMP | BPF_JGE | BPF_K:
                        error_msg("JUMP(BPF_JGE, %u, %u, %u)",
                                  filter[i].jt, filter[i].jf,
                                  filter[i].k);
                        break;
                case BPF_JMP | BPF_JA:
                        error_msg("JUMP(BPF_JA, %u)", filter[i].k);
                        break;
                default:
                        error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
                                  filter[i].jt, filter[i].jf, filter[i].k);
                }
        }
}

static void
replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
                         unsigned char jmp_trace)
{
        switch (*jmp_offset) {
        case JMP_PLACEHOLDER_NEXT:
                *jmp_offset = jmp_next;
                break;
        case JMP_PLACEHOLDER_TRACE:
                *jmp_offset = jmp_trace;
                break;
        default:
                break;
        }
}

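/*
 * Emit a comparison matching the syscall-number range [lower, upper):
 * a single JEQ when the range holds one number, a JGE pair otherwise.
 * Returns the number of instructions written.
 */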
static unsigned short
bpf_syscalls_cmp(struct sock_filter *filter,
                 unsigned int lower, unsigned int upper)
{
        if (lower + 1 == upper) {
                /* if (nr == lower) return RET_TRACE; */
                SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
                             JMP_PLACEHOLDER_TRACE, 0);
                return 1;
        } else {
                /* if (nr >= lower && nr < upper) return RET_TRACE; */
                SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
                SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
                             JMP_PLACEHOLDER_TRACE);
                return 2;
        }
}

static unsigned short
init_sock_filter(struct sock_filter *filter)
{
        /*
         * Generated program looks like:
         * if (arch == AUDIT_ARCH_A && nr >= flag) {
         *      if (nr == 59)
         *              return SECCOMP_RET_TRACE;
         *      if (nr >= 321 && nr <= 323)
         *              return SECCOMP_RET_TRACE;
         *      ...
         *      return SECCOMP_RET_ALLOW;
         * }
         * if (arch == AUDIT_ARCH_A) {
         *      ...
         * }
         * if (arch == AUDIT_ARCH_B) {
         *      ...
         * }
         * return SECCOMP_RET_TRACE;
         */
        unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
        SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
                     offsetof(struct seccomp_data, arch));
# endif

        /*
         * Personalities are iterated in reverse order in the BPF program so
         * that the x86_64 case is naturally handled.  On x86_64, the first
         * and third personalities share the same arch identifier.  The third
         * can be distinguished based on its associated syscall flag, so we
         * check it first.  The only drawback here is that the first
         * personality is more common, which may make the BPF program slower
         * to match syscalls on average.
         */
        for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
                unsigned int lower = UINT_MAX;
                unsigned short start = pos, end;

# if SUPPORTED_PERSONALITIES > 1
                /* if (arch != audit_arch_vec[p].arch) goto next; */
                SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
                             audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
                SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
                             offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
                if (audit_arch_vec[p].flag) {
                        /* if (nr < audit_arch_vec[p].flag) goto next; */
                        SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
                                     audit_arch_vec[p].flag, 2, 0);
                        SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
                                     offsetof(struct seccomp_data, arch));
                        SET_BPF_JUMP(&filter[pos++], BPF_JA,
                                     JMP_PLACEHOLDER_NEXT, 0, 0);
                }
# endif

                for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
                        if (traced_by_seccomp(i, p)) {
                                if (lower == UINT_MAX)
                                        lower = i;
                                continue;
                        }
                        if (lower == UINT_MAX)
                                continue;
                        pos += bpf_syscalls_cmp(filter + pos,
                                                lower | audit_arch_vec[p].flag,
                                                i | audit_arch_vec[p].flag);
                        lower = UINT_MAX;
                }
                if (lower != UINT_MAX)
                        pos += bpf_syscalls_cmp(filter + pos,
                                                lower | audit_arch_vec[p].flag,
                                                nsyscall_vec[p]
                                                | audit_arch_vec[p].flag);
                end = pos;

                /* if (nr >= max_nr) return RET_TRACE; */
                SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
                             nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

                SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
                             SECCOMP_RET_ALLOW);
                SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
                             SECCOMP_RET_TRACE);

                for (unsigned int i = start; i < end; ++i) {
                        if (BPF_CLASS(filter[i].code) != BPF_JMP)
                                continue;
                        /* Resolve placeholders: jmp_next skips past this
                         * personality section, jmp_trace lands on the
                         * section's RET_TRACE.  */
                        unsigned char jmp_next = pos - i - 1;
                        unsigned char jmp_trace = pos - i - 2;
                        replace_jmp_placeholders(&filter[i].jt, jmp_next,
                                                 jmp_trace);
                        replace_jmp_placeholders(&filter[i].jf, jmp_next,
                                                 jmp_trace);
                        if (BPF_OP(filter[i].code) == BPF_JA)
                                filter[i].k = (unsigned int) jmp_next;
                }
        }

# if SUPPORTED_PERSONALITIES > 1
        /* Jumps conditioned on .arch default to this RET_TRACE. */
        SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

        if (debug_flag)
                dump_seccomp_bpf(filter, pos);

        return pos;
}

void
init_seccomp_filter(void)
{
        struct sock_filter filter[BPF_MAXINSNS];
        unsigned short len;

        len = init_sock_filter(filter);

        struct sock_fprog prog = {
                .len = len,
                .filter = filter
        };

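        /*
         * PR_SET_NO_NEW_PRIVS is a prerequisite for installing a seccomp
         * filter without CAP_SYS_ADMIN.
         */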
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
                perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}

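/*
 * Decide how to restart the tracee: PTRACE_SYSCALL when the exit of a
 * seccomp-traced syscall still has to be observed, PTRACE_CONT
 * otherwise, letting the filter raise the next event.
 */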
int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
        if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
            && traced_by_seccomp(tcp->scno, current_personality))
                return PTRACE_SYSCALL;
        return PTRACE_CONT;
}

#else /* !HAVE_LINUX_SECCOMP_H */

# warning <linux/seccomp.h> is not available, seccomp filtering is not supported

static void
check_seccomp_filter_properties(void)
{
        seccomp_filtering = false;
}

void
init_seccomp_filter(void)
{
}

int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
        return PTRACE_SYSCALL;
}

#endif

void
check_seccomp_filter(void)
{
        /* Let's avoid enabling seccomp if all syscalls are traced. */
        seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
                                                   SUPPORTED_PERSONALITIES);
        if (!seccomp_filtering) {
                error_msg("Seccomp filter is requested "
                          "but there are no syscalls to filter.  "
                          "See -e trace to filter syscalls.");
                return;
        }

        check_seccomp_filter_properties();

        if (!seccomp_filtering)
                error_msg("seccomp filter is requested but unavailable");
}