jail: add option to provide /dev/console to containers
[oweals/procd.git] / jail / jail.c
1 /*
2  * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU Lesser General Public License version 2.1
6  * as published by the Free Software Foundation
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  */
13
14 #define _GNU_SOURCE
15 #include <sys/mount.h>
16 #include <sys/prctl.h>
17 #include <sys/wait.h>
18 #include <sys/types.h>
19
20 #include <stdlib.h>
21 #include <unistd.h>
22 #include <errno.h>
23 #include <pwd.h>
24 #include <grp.h>
25 #include <string.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <libgen.h>
29 #include <sched.h>
30 #include <linux/limits.h>
31 #include <signal.h>
32
33 #include "capabilities.h"
34 #include "elf.h"
35 #include "fs.h"
36 #include "jail.h"
37 #include "log.h"
38
39 #include <libubox/uloop.h>
40 #include <libubus.h>
41
/* Size in bytes of child_stack, the stack main() hands to clone(). */
#define STACK_SIZE      (1024 * 1024)
/* getopt() option string; a ':' after a letter means that option takes an argument. */
#define OPT_ARGS        "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:Ey"
44
/* Jail configuration, filled in by main() while parsing the command line. */
static struct {
        char *name;             /* -n: name of the jail */
        char *hostname;         /* -h: hostname to set inside the jail */
        char **jail_argv;       /* argv of the jailed binary (first non-option argument onwards) */
        char *seccomp;          /* -S: seccomp filter config file */
        char *capabilities;     /* -C: capabilities drop config file */
        char *user;             /* -U: user name for the jailed process */
        char *group;            /* -G: group name for the jailed process */
        char *extroot;          /* -R: external directory bind-mounted as jail root */
        char *overlaydir;       /* -O: directory holding the r/w overlayfs layers */
        char *tmpoverlaysize;   /* -T: size of the tmpfs used for the r/w overlayfs */
        int no_new_privs;       /* -c: set PR_SET_NO_NEW_PRIVS before exec */
        int namespace;          /* bitmask of CLONE_NEW* flags handed to clone() */
        int procfs;             /* -p: mount /proc inside the jail */
        int ronly;              /* -o: remount the jail root read-only */
        int sysfs;              /* -s: mount /sys inside the jail */
        int console;            /* -y: provide /dev/console inside the jail */
        int pw_uid;             /* uid resolved from `user`, or -1 */
        int pw_gid;             /* primary gid resolved from `user`, or -1 */
        int gr_gid;             /* gid resolved from `group`, or -1 */
        int require_jail;       /* -E: fail instead of continuing when jail setup fails */
} opts;
67
68
/* Explicit prototype for pivot_root(), which build_jail_fs() calls. */
extern int pivot_root(const char *new_root, const char *put_old);

/* Verbosity set with -d; build_envp() adds LD_DEBUG=all when it is above 1. */
int debug = 0;

/* Stack for the clone()d child; main() passes its top end to clone(). */
static char child_stack[STACK_SIZE];

/* PTY master opened by create_dev_console(). */
int console_fd;
76
/*
 * Create directory `dir` together with any missing parent directories.
 *
 * `dir` is edited in place while walking up the path (the last '/' is
 * temporarily replaced by NUL), but it always holds its original contents
 * again when the function returns, on the failure path as well.
 * A component that already exists is not treated as an error.
 *
 * Returns 0 on success and -1 if a component could not be created.
 */
static int mkdir_p(char *dir, mode_t mask)
{
        char *l = strrchr(dir, '/');
        int ret;

        /* no separator left: nothing to create at this level */
        if (!l)
                return 0;

        /* create everything in front of the last separator first */
        *l = '\0';
        ret = mkdir_p(dir, mask);
        /* put the separator back before looking at the result */
        *l = '/';

        if (ret)
                return -1;

        ret = mkdir(dir, mask);
        if (ret && errno == EEXIST)
                return 0;

        if (ret)
                ERROR("mkdir(%s, %d) failed: %m\n", dir, mask);

        return ret;
}
101
/*
 * Bind-mount the host object `path` into the jail tree rooted at `root`.
 *
 * root:     directory the jail is being assembled in
 * path:     existing file or directory to bind
 * target:   location inside the jail, or NULL to reuse `path`
 * readonly: remount the bind read-only when non-zero
 * strict:   remount the bind with noexec, nosuid and nodev when non-zero
 * error:    value returned when `path` cannot be stat()ed
 *
 * Returns 0 on success, `error` if stat() fails and -1 on any other failure.
 */
static int _mount_bind(const char *root, const char *path, const char *target, int readonly, int strict, int error)
{
        const char *inside = target ? target : path;
        const char *mode = readonly ? "ro" : "rw";
        const char *sep = (readonly && strict) ? ", " : "";
        const char *strictness = strict ? "strict" : "";
        int remount_flags = MS_BIND | MS_REMOUNT;
        char dest[PATH_MAX];
        struct stat st;

        if (stat(path, &st) != 0) {
                ERROR("stat(%s) failed: %m\n", path);
                return error;
        }

        snprintf(dest, sizeof(dest), "%s%s", root, inside);

        if (!S_ISDIR(st.st_mode)) {
                int fd;

                /* dirname() may edit its argument, so dest is rebuilt afterwards */
                mkdir_p(dirname(dest), 0755);
                snprintf(dest, sizeof(dest), "%s%s", root, inside);

                /* a regular file is needed as the mount point for a file bind */
                fd = creat(dest, 0644);
                if (fd == -1) {
                        ERROR("creat(%s) failed: %m\n", dest);
                        return -1;
                }
                close(fd);
        } else {
                mkdir_p(dest, 0755);
        }

        if (mount(path, dest, NULL, MS_BIND, NULL) != 0) {
                ERROR("failed to mount -B %s %s: %m\n", path, dest);
                return -1;
        }

        if (readonly)
                remount_flags |= MS_RDONLY;
        if (strict)
                remount_flags |= MS_NOEXEC | MS_NOSUID | MS_NODEV;

        /* the extra flags are applied with a second, remounting call */
        if (strict || readonly) {
                if (mount(NULL, dest, NULL, remount_flags, NULL) != 0) {
                        ERROR("failed to remount (%s%s%s) %s: %m\n", mode,
                              sep, strictness, dest);
                        return -1;
                }
        }

        DEBUG("mount -B %s %s (%s%s%s)\n", path, dest,
              mode, sep, strictness);

        return 0;
}
151
/*
 * Bind-mount `path` at the same location below `root`, without the
 * noexec/nosuid/nodev remount. See _mount_bind() for the return values.
 */
int mount_bind(const char *root, const char *path, int readonly, int error)
{
        const char *same_location = NULL;
        const int strict = 0;

        return _mount_bind(root, path, same_location, readonly, strict, error);
}
155
/*
 * Mount an overlayfs on top of `jail_root`, using `jail_root` itself as
 * the lower layer and `<overlaydir>/upper` and `<overlaydir>/work` as the
 * upper and work directories (both are created if missing).
 *
 * Returns 0 on success and -1 on any failure.
 */
static int mount_overlay(char *jail_root, char *overlaydir)
{
        char *upper, *work, *mntdata;
        int rc = -1;

        if (asprintf(&upper, "%s/upper", overlaydir) < 0)
                return rc;

        if (asprintf(&work, "%s/work", overlaydir) < 0)
                goto free_upper;

        if (asprintf(&mntdata, "lowerdir=%s,upperdir=%s,workdir=%s",
                     jail_root, upper, work) < 0)
                goto free_work;

        if (mkdir_p(upper, 0755) == 0 && mkdir_p(work, 0755) == 0) {
                DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, mntdata);

                if (mount(jail_root, jail_root, "overlay", MS_NOATIME, mntdata) == 0)
                        rc = 0;
        }

        free(mntdata);
free_work:
        free(work);
free_upper:
        free(upper);
        return rc;
}
189
190 static void pass_console(int console_fd)
191 {
192         struct ubus_context *ctx = ubus_connect(NULL);
193         static struct blob_buf req;
194         uint32_t id;
195
196         if (!ctx)
197                 return;
198
199         blob_buf_init(&req, 0);
200         blobmsg_add_string(&req, "name", opts.name);
201
202         if (ubus_lookup_id(ctx, "service", &id) ||
203             ubus_invoke_fd(ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
204                 INFO("ubus request failed\n");
205         else
206                 close(console_fd);
207
208         blob_buf_free(&req);
209         ubus_free(ctx);
210 }
211
212 static int create_dev_console(const char *jail_root)
213 {
214         char *console_fname;
215         char dev_console_path[PATH_MAX];
216         int slave_console_fd;
217
218         /* Open UNIX/98 virtual console */
219         console_fd = posix_openpt(O_RDWR | O_NOCTTY);
220         if (console_fd == -1)
221                 return -1;
222
223         console_fname = ptsname(console_fd);
224         DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
225         if (!console_fname)
226                 goto no_console;
227
228         grantpt(console_fd);
229         unlockpt(console_fd);
230
231         /* pass PTY master to procd */
232         pass_console(console_fd);
233
234         /* mount-bind PTY slave to /dev/console in jail */
235         snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
236         close(creat(dev_console_path, 0620));
237
238         if (mount(console_fname, dev_console_path, NULL, MS_BIND, NULL))
239                 goto no_console;
240
241         /* use PTY slave for stdio */
242         slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
243         dup2(slave_console_fd, 0);
244         dup2(slave_console_fd, 1);
245         dup2(slave_console_fd, 2);
246         close(slave_console_fd);
247
248         INFO("using guest console %s\n", console_fname);
249
250         return 0;
251
252 no_console:
253         close(console_fd);
254         return 1;
255 }
256
257 static int build_jail_fs(void)
258 {
259         char jail_root[] = "/tmp/ujail-XXXXXX";
260         char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
261         char tmpdevdir[] = "/tmp/ujail-XXXXXX/dev";
262         char tmpdevptsdir[] = "/tmp/ujail-XXXXXX/dev/pts";
263         char *overlaydir = NULL;
264
265         if (mkdtemp(jail_root) == NULL) {
266                 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
267                 return -1;
268         }
269
270         /* oldroot can't be MS_SHARED else pivot_root() fails */
271         if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
272                 ERROR("private mount failed %m\n");
273                 return -1;
274         }
275
276         if (opts.extroot) {
277                 if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) {
278                         ERROR("extroot mount failed %m\n");
279                         return -1;
280                 }
281         } else {
282                 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
283                         ERROR("tmpfs mount failed %m\n");
284                         return -1;
285                 }
286         }
287
288         if (opts.tmpoverlaysize) {
289                 char mountoptsstr[] = "mode=0755,size=XXXXXXXX";
290
291                 snprintf(mountoptsstr, sizeof(mountoptsstr),
292                          "mode=0755,size=%s", opts.tmpoverlaysize);
293                 if (mkdtemp(tmpovdir) == NULL) {
294                         ERROR("mkdtemp(%s) failed: %m\n", jail_root);
295                         return -1;
296                 }
297                 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
298                           mountoptsstr)) {
299                         ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
300                         return -1;
301                 }
302                 overlaydir = tmpovdir;
303         }
304
305         if (opts.overlaydir)
306                 overlaydir = opts.overlaydir;
307
308         if (overlaydir)
309                 mount_overlay(jail_root, overlaydir);
310
311         if (chdir(jail_root)) {
312                 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
313                 return -1;
314         }
315
316         snprintf(tmpdevdir, sizeof(tmpdevdir), "%s/dev", jail_root);
317         mkdir_p(tmpdevdir, 0755);
318         if (mount(NULL, tmpdevdir, "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M"))
319                 return -1;
320
321         snprintf(tmpdevptsdir, sizeof(tmpdevptsdir), "%s/dev/pts", jail_root);
322         mkdir_p(tmpdevptsdir, 0755);
323         if (mount(NULL, tmpdevptsdir, "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, NULL))
324                 return -1;
325
326         if (opts.console)
327                 create_dev_console(jail_root);
328
329         if (mount_all(jail_root)) {
330                 ERROR("mount_all() failed\n");
331                 return -1;
332         }
333
334         if (opts.namespace & CLONE_NEWNET) {
335                 char hostdir[PATH_MAX], jailetc[PATH_MAX], jaillink[PATH_MAX];
336
337                 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
338                 mkdir_p(hostdir, 0755);
339                 _mount_bind(jail_root, hostdir, "/tmp/resolv.conf.d", 1, 1, -1);
340                 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
341                 mkdir_p(jailetc, 0755);
342                 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
343                 if (overlaydir)
344                         unlink(jaillink);
345                 symlink("../tmp/resolv.conf.d/resolv.conf.auto", jaillink);
346         }
347
348         char dirbuf[sizeof(jail_root) + 4];
349         snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
350         mkdir(dirbuf, 0755);
351
352         if (pivot_root(jail_root, dirbuf) == -1) {
353                 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
354                 return -1;
355         }
356         if (chdir("/")) {
357                 ERROR("chdir(/) (after pivot_root) failed: %m\n");
358                 return -1;
359         }
360
361         snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
362         umount2(dirbuf, MNT_DETACH);
363         rmdir(dirbuf);
364         if (opts.tmpoverlaysize) {
365                 char tmpdirbuf[sizeof(tmpovdir) + 4];
366                 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
367                 umount2(tmpdirbuf, MNT_DETACH);
368                 rmdir(tmpdirbuf);
369         }
370
371         umount2("/old", MNT_DETACH);
372         rmdir("/old");
373
374         if (opts.procfs) {
375                 mkdir("/proc", 0755);
376                 mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
377                 /*
378                  * make /proc/sys read-only while keeping read-write to
379                  * /proc/sys/net if CLONE_NEWNET is set.
380                  */
381                 if (opts.namespace & CLONE_NEWNET)
382                         mount("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0);
383
384                 mount("/proc/sys", "/proc/sys", NULL, MS_BIND, 0);
385                 mount(NULL, "/proc/sys", NULL, MS_REMOUNT | MS_RDONLY, 0);
386                 mount(NULL, "/proc", NULL, MS_REMOUNT, 0);
387
388                 if (opts.namespace & CLONE_NEWNET)
389                         mount("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0);
390         }
391         if (opts.sysfs) {
392                 mkdir("/sys", 0755);
393                 mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0);
394         }
395         if (opts.ronly)
396                 mount(NULL, "/", NULL, MS_RDONLY | MS_REMOUNT, 0);
397
398         return 0;
399 }
400
/*
 * Write a single-entry id mapping for process `child_pid`: id 0 inside
 * the user namespace maps to `id` outside, for a range of one id.
 *
 * gidmap selects /proc/<pid>/gid_map when true, /proc/<pid>/uid_map
 * otherwise.
 *
 * Returns 0 on success and -1 if the file cannot be opened or written.
 */
static int write_uid_gid_map(pid_t child_pid, bool gidmap, int id)
{
        const char *which = gidmap ? "gid_map" : "uid_map";
        char path[64];
        int fd;
        int written;

        if (snprintf(path, sizeof(path), "/proc/%d/%s", child_pid, which) < 0)
                return -1;

        fd = open(path, O_WRONLY);
        if (fd == -1)
                return -1;

        written = dprintf(fd, "%d %d %d\n", 0, id, 1);
        close(fd);

        return (written == -1) ? -1 : 0;
}
421
/*
 * Write "allow" or "deny" to /proc/<child_pid>/setgroups.
 *
 * Returns 0 on success and -1 if the file cannot be opened or written.
 */
static int write_setgroups(pid_t child_pid, bool allow)
{
        const char *verdict = allow ? "allow" : "deny";
        char path[64];
        int fd;
        int written;

        if (snprintf(path, sizeof(path), "/proc/%d/setgroups", child_pid) < 0)
                return -1;

        fd = open(path, O_WRONLY);
        if (fd == -1)
                return -1;

        written = dprintf(fd, "%s", verdict);
        close(fd);

        return (written == -1) ? -1 : 0;
}
444
445 static void get_jail_user(int *user, int *user_gid, int *gr_gid)
446 {
447         struct passwd *p = NULL;
448         struct group *g = NULL;
449
450         if (opts.user) {
451                 p = getpwnam(opts.user);
452                 if (!p) {
453                         ERROR("failed to get uid/gid for user %s: %d (%s)\n",
454                               opts.user, errno, strerror(errno));
455                         exit(EXIT_FAILURE);
456                 }
457                 *user = p->pw_uid;
458                 *user_gid = p->pw_gid;
459         } else {
460                 *user = -1;
461                 *user_gid = -1;
462         }
463
464         if (opts.group) {
465                 g = getgrnam(opts.group);
466                 if (!g) {
467                         ERROR("failed to get gid for group %s: %m\n", opts.group);
468                         exit(EXIT_FAILURE);
469                 }
470                 *gr_gid = g->gr_gid;
471         } else {
472                 *gr_gid = -1;
473         }
474 };
475
476 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
477 {
478         if ((user_gid != -1) && initgroups(opts.user, user_gid)) {
479                 ERROR("failed to initgroups() for user %s: %m\n", opts.user);
480                 exit(EXIT_FAILURE);
481         }
482
483         if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) {
484                 ERROR("failed to set group id %d: %m\n", gr_gid);
485                 exit(EXIT_FAILURE);
486         }
487
488         if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) {
489                 ERROR("failed to set user id %d: %m\n", pw_uid);
490                 exit(EXIT_FAILURE);
491         }
492 }
493
494 #define MAX_ENVP        8
495 static char** build_envp(const char *seccomp)
496 {
497         static char *envp[MAX_ENVP];
498         static char preload_var[PATH_MAX];
499         static char seccomp_var[PATH_MAX];
500         static char debug_var[] = "LD_DEBUG=all";
501         static char container_var[] = "container=ujail";
502         const char *preload_lib = find_lib("libpreload-seccomp.so");
503         int count = 0;
504
505         if (seccomp && !preload_lib) {
506                 ERROR("failed to add preload-lib to env\n");
507                 return NULL;
508         }
509         if (seccomp) {
510                 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
511                 envp[count++] = seccomp_var;
512                 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
513                 envp[count++] = preload_var;
514         }
515
516         envp[count++] = container_var;
517
518         if (debug > 1)
519                 envp[count++] = debug_var;
520
521         return envp;
522 }
523
/* Print the command line help, followed by a security warning, to stderr. */
static void usage(void)
{
        fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
        fprintf(stderr, "  -d <num>\tshow debug log (increase num to increase verbosity)\n");
        fprintf(stderr, "  -S <file>\tseccomp filter config\n");
        fprintf(stderr, "  -C <file>\tcapabilities drop config\n");
        fprintf(stderr, "  -c\t\tset PR_SET_NO_NEW_PRIVS\n");
        fprintf(stderr, "  -n <name>\tthe name of the jail\n");
        fprintf(stderr, "namespace jail options:\n");
        fprintf(stderr, "  -h <hostname>\tchange the hostname of the jail\n");
        fprintf(stderr, "  -N\t\tjail has network namespace\n");
        fprintf(stderr, "  -f\t\tjail has user namespace\n");
        fprintf(stderr, "  -F\t\tjail has cgroups namespace\n");
        fprintf(stderr, "  -r <file>\treadonly files that should be staged\n");
        fprintf(stderr, "  -w <file>\twriteable files that should be staged\n");
        fprintf(stderr, "  -p\t\tjail has /proc\n");
        fprintf(stderr, "  -s\t\tjail has /sys\n");
        fprintf(stderr, "  -l\t\tjail has /dev/log\n");
        fprintf(stderr, "  -u\t\tjail has a ubus socket\n");
        fprintf(stderr, "  -U <name>\tuser to run jailed process\n");
        fprintf(stderr, "  -G <name>\tgroup to run jailed process\n");
        /* help text previously read "remont" */
        fprintf(stderr, "  -o\t\tremount jail root (/) read only\n");
        fprintf(stderr, "  -R <dir>\texternal jail rootfs (system container)\n");
        fprintf(stderr, "  -O <dir>\tdirectory for r/w overlayfs\n");
        fprintf(stderr, "  -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
        fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
        fprintf(stderr, "  -y\t\tprovide jail console\n");
        fprintf(stderr, "\nWarning: by default root inside the jail is the same\n"
                        "and he has the same powers as root outside the jail,\n"
                        "thus he can escape the jail and/or break stuff.\n"
                        "Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n"
                        "If you use none of the namespace jail options,\n"
                        "ujail will not use namespace/build a jail,\n"
                        "and will only drop capabilities/apply seccomp filter.\n\n");
}
559
/*
 * Entry point of the jailed child: synchronise with the parent, apply
 * the jail restrictions and execve() the jailed binary.
 *
 * pipes_ptr: array of four pipe descriptors shared with the parent.
 *            This side writes to pipes[1] and reads from pipes[2];
 *            pipes[0] and pipes[3] are the parent's ends and are closed
 *            here. The pointer is dereferenced unconditionally.
 *
 * Never returns: every path ends in execve() or exit(EXIT_FAILURE).
 */
static int exec_jail(void *pipes_ptr)
{
        int *pipes = (int*)pipes_ptr;
        char buf[1];
        int pw_uid, pw_gid, gr_gid;

        close(pipes[0]);
        close(pipes[3]);

        /* tell the parent we are running, then wait for its go-ahead ('O') */
        buf[0] = 'i';
        if (write(pipes[1], buf, 1) < 1) {
                ERROR("can't write to parent\n");
                exit(EXIT_FAILURE);
        }
        if (read(pipes[2], buf, 1) < 1) {
                ERROR("can't read from parent\n");
                exit(EXIT_FAILURE);
        }
        if (buf[0] != 'O') {
                ERROR("parent had an error, child exiting\n");
                exit(EXIT_FAILURE);
        }

        close(pipes[1]);
        close(pipes[2]);

        /* main() writes the uid/gid maps before sending 'O', so id 0 is mapped by now */
        if (opts.namespace & CLONE_NEWUSER) {
                if (setgid(0) < 0) {
                        ERROR("setgid\n");
                        exit(EXIT_FAILURE);
                }
                if (setuid(0) < 0) {
                        ERROR("setuid\n");
                        exit(EXIT_FAILURE);
                }
//              if (setgroups(0, NULL) < 0) {
//                      ERROR("setgroups\n");
//                      exit(EXIT_FAILURE);
//              }
        }

        /* the hostname is only changed when some namespace is in use */
        if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
                        && sethostname(opts.hostname, strlen(opts.hostname))) {
                ERROR("sethostname(%s) failed: %m\n", opts.hostname);
                exit(EXIT_FAILURE);
        }

        if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
                ERROR("failed to build jail fs\n");
                exit(EXIT_FAILURE);
        }

        if (opts.capabilities && drop_capabilities(opts.capabilities))
                exit(EXIT_FAILURE);

        if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
                ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
                exit(EXIT_FAILURE);
        }

        /* without a user namespace, switch to the requested user/group directly */
        if (!(opts.namespace & CLONE_NEWUSER)) {
                get_jail_user(&pw_uid, &pw_gid, &gr_gid);
                set_jail_user(pw_uid, pw_gid, gr_gid);
        }

        char **envp = build_envp(opts.seccomp);
        if (!envp)
                exit(EXIT_FAILURE);

        INFO("exec-ing %s\n", *opts.jail_argv);
        execve(*opts.jail_argv, opts.jail_argv, envp);
        /* we get there only if execve fails */
        ERROR("failed to execve %s: %m\n", *opts.jail_argv);
        exit(EXIT_FAILURE);
}
635
/* Non-zero until jail_process_handler() has seen the jailed process terminate. */
static int jail_running = 1;
/* Exit status or terminating signal of the jailed process; returned by main(). */
static int jail_return_code = 0;

static void jail_process_timeout_cb(struct uloop_timeout *t);
/* Timer main() arms after sending SIGTERM; its callback sends SIGKILL. */
static struct uloop_timeout jail_process_timeout = {
        .cb = jail_process_timeout_cb,
};
643
644 static void jail_process_handler(struct uloop_process *c, int ret)
645 {
646         uloop_timeout_cancel(&jail_process_timeout);
647         if (WIFEXITED(ret)) {
648                 jail_return_code = WEXITSTATUS(ret);
649                 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
650         } else {
651                 jail_return_code = WTERMSIG(ret);
652                 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
653         }
654         jail_running = 0;
655         uloop_end();
656 }
657
/* uloop entry for the jailed child; main() stores the child's pid in it. */
static struct uloop_process jail_process = {
        .cb = jail_process_handler,
};
661
662 static void jail_process_timeout_cb(struct uloop_timeout *t)
663 {
664         DEBUG("jail process failed to stop, sending SIGKILL\n");
665         kill(jail_process.pid, SIGKILL);
666 }
667
668 static void jail_handle_signal(int signo)
669 {
670         DEBUG("forwarding signal %d to the jailed process\n", signo);
671         kill(jail_process.pid, signo);
672 }
673
/*
 * Open the network namespace of process `target_ns`.
 *
 * Returns a read-only descriptor for /proc/<pid>/ns/net, or -1 if it
 * cannot be opened.
 */
static int netns_open_pid(const pid_t target_ns)
{
        char pid_net_path[PATH_MAX];

        /* pid_t is signed: print it with %d, as the other /proc paths in this file do */
        snprintf(pid_net_path, sizeof(pid_net_path), "/proc/%d/ns/net", (int)target_ns);

        return open(pid_net_path, O_RDONLY);
}
682
683 static void netns_updown(pid_t pid, bool start)
684 {
685         struct ubus_context *ctx = ubus_connect(NULL);
686         static struct blob_buf req;
687         uint32_t id;
688
689         if (!ctx)
690                 return;
691
692         blob_buf_init(&req, 0);
693         blobmsg_add_string(&req, "jail", opts.name);
694         blobmsg_add_u32(&req, "pid", pid);
695         blobmsg_add_u8(&req, "start", start);
696
697         if (ubus_lookup_id(ctx, "network", &id) ||
698             ubus_invoke(ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
699                 INFO("ubus request failed\n");
700
701         blob_buf_free(&req);
702         ubus_free(ctx);
703 }
704
705 int main(int argc, char **argv)
706 {
707         sigset_t sigmask;
708         uid_t uid = getuid();
709         char log[] = "/dev/log";
710         char ubus[] = "/var/run/ubus.sock";
711         int ch, i;
712         int pipes[4];
713         char sig_buf[1];
714         int netns_fd;
715
716         if (uid) {
717                 ERROR("not root, aborting: %m\n");
718                 return EXIT_FAILURE;
719         }
720
721         umask(022);
722         mount_list_init();
723         init_library_search();
724
725         while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
726                 switch (ch) {
727                 case 'd':
728                         debug = atoi(optarg);
729                         break;
730                 case 'p':
731                         opts.namespace |= CLONE_NEWNS;
732                         opts.procfs = 1;
733                         break;
734                 case 'o':
735                         opts.namespace |= CLONE_NEWNS;
736                         opts.ronly = 1;
737                         break;
738                 case 'f':
739                         opts.namespace |= CLONE_NEWUSER;
740                         break;
741                 case 'F':
742                         opts.namespace |= CLONE_NEWCGROUP;
743                         break;
744                 case 'R':
745                         opts.extroot = optarg;
746                         break;
747                 case 's':
748                         opts.namespace |= CLONE_NEWNS;
749                         opts.sysfs = 1;
750                         break;
751                 case 'S':
752                         opts.seccomp = optarg;
753                         add_mount(optarg, 1, -1);
754                         break;
755                 case 'C':
756                         opts.capabilities = optarg;
757                         break;
758                 case 'c':
759                         opts.no_new_privs = 1;
760                         break;
761                 case 'n':
762                         opts.name = optarg;
763                         break;
764                 case 'N':
765                         opts.namespace |= CLONE_NEWNET;
766                         break;
767                 case 'h':
768                         opts.namespace |= CLONE_NEWUTS;
769                         opts.hostname = optarg;
770                         break;
771                 case 'r':
772                         opts.namespace |= CLONE_NEWNS;
773                         add_path_and_deps(optarg, 1, 0, 0);
774                         break;
775                 case 'w':
776                         opts.namespace |= CLONE_NEWNS;
777                         add_path_and_deps(optarg, 0, 0, 0);
778                         break;
779                 case 'u':
780                         opts.namespace |= CLONE_NEWNS;
781                         add_mount(ubus, 0, -1);
782                         break;
783                 case 'l':
784                         opts.namespace |= CLONE_NEWNS;
785                         add_mount(log, 0, -1);
786                         break;
787                 case 'U':
788                         opts.user = optarg;
789                         break;
790                 case 'G':
791                         opts.group = optarg;
792                         break;
793                 case 'O':
794                         opts.overlaydir = optarg;
795                         break;
796                 case 'T':
797                         opts.tmpoverlaysize = optarg;
798                         break;
799                 case 'E':
800                         opts.require_jail = 1;
801                         break;
802                 case 'y':
803                         opts.console = 1;
804                         break;
805                 }
806         }
807
808         if (opts.namespace)
809                 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
810
811         if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
812                 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
813                 return -1;
814         }
815
816         /* no <binary> param found */
817         if (argc - optind < 1) {
818                 usage();
819                 return EXIT_FAILURE;
820         }
821         if (!(opts.namespace||opts.capabilities||opts.seccomp)) {
822                 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
823                 usage();
824                 return EXIT_FAILURE;
825         }
826         DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
827                 opts.namespace,
828                 opts.capabilities != 0,
829                 opts.seccomp != 0);
830
831         opts.jail_argv = &argv[optind];
832
833         get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
834
835         if (!opts.extroot) {
836                 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
837                         ERROR("failed to load dependencies\n");
838                         return -1;
839                 }
840         }
841
842         if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
843                 ERROR("failed to load libpreload-seccomp.so\n");
844                 opts.seccomp = 0;
845                 if (opts.require_jail)
846                         return -1;
847         }
848
849         if (opts.name)
850                 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
851
852         uloop_init();
853
854         sigfillset(&sigmask);
855         for (i = 0; i < _NSIG; i++) {
856                 struct sigaction s = { 0 };
857
858                 if (!sigismember(&sigmask, i))
859                         continue;
860                 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
861                         continue;
862
863                 s.sa_handler = jail_handle_signal;
864                 sigaction(i, &s, NULL);
865         }
866
867         if (opts.namespace) {
868                 if (opts.namespace & CLONE_NEWNS) {
869                         add_mount("/dev/full", 0, -1);
870                         add_mount("/dev/null", 0, -1);
871                         add_mount("/dev/random", 0, -1);
872                         add_mount("/dev/urandom", 0, -1);
873                         add_mount("/dev/zero", 0, -1);
874                         add_mount("/dev/ptmx", 0, -1);
875                         add_mount("/dev/tty", 0, -1);
876
877                         if (!opts.extroot && (opts.user || opts.group)) {
878                                 add_mount("/etc/passwd", 0, -1);
879                                 add_mount("/etc/group", 0, -1);
880                         }
881
882 #if defined(__GLIBC__)
883                         if (!opts.extroot)
884                                 add_mount("/etc/nsswitch.conf", 0, -1);
885 #endif
886
887                         if (!(opts.namespace & CLONE_NEWNET)) {
888                                 add_mount("/etc/resolv.conf", 0, -1);
889                         }
890                 }
891
892                 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
893                         return -1;
894
895                 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes);
896         } else {
897                 jail_process.pid = fork();
898         }
899
900         if (jail_process.pid > 0) {
901                 seteuid(0);
902                 /* parent process */
903                 close(pipes[1]);
904                 close(pipes[2]);
905                 if (read(pipes[0], sig_buf, 1) < 1) {
906                         ERROR("can't read from child\n");
907                         return -1;
908                 }
909                 close(pipes[0]);
910                 if (opts.namespace & CLONE_NEWUSER) {
911                         bool has_gr = (opts.gr_gid != -1);
912                         if (write_setgroups(jail_process.pid, false)) {
913                                 ERROR("can't write setgroups\n");
914                                 return -1;
915                         }
916                         if (opts.pw_uid != -1) {
917                                 write_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
918                                 write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
919                         } else {
920                                 write_uid_gid_map(jail_process.pid, 0, 65534);
921                                 write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
922                         }
923                 }
924
925                 if (opts.namespace & CLONE_NEWNET) {
926                         if (!opts.name) {
927                                 ERROR("netns needs a named jail\n");
928                                 return -1;
929                         }
930                         netns_fd = netns_open_pid(jail_process.pid);
931                         netns_updown(jail_process.pid, true);
932                 }
933
934                 sig_buf[0] = 'O';
935                 if (write(pipes[3], sig_buf, 1) < 0) {
936                         ERROR("can't write to child\n");
937                         return -1;
938                 }
939                 close(pipes[3]);
940                 uloop_process_add(&jail_process);
941                 uloop_run();
942                 if (jail_running) {
943                         DEBUG("uloop interrupted, killing jail process\n");
944                         kill(jail_process.pid, SIGTERM);
945                         uloop_timeout_set(&jail_process_timeout, 1000);
946                         uloop_run();
947                 }
948                 uloop_done();
949                 if (opts.namespace & CLONE_NEWNET) {
950                         setns(netns_fd, CLONE_NEWNET);
951                         netns_updown(getpid(), false);
952                         close(netns_fd);
953                 }
954                 return jail_return_code;
955         } else if (jail_process.pid == 0) {
956                 /* fork child process */
957                 return exec_jail(NULL);
958         } else {
959                 ERROR("failed to clone/fork: %m\n");
960                 return EXIT_FAILURE;
961         }
962 }