jail: handle containers separately
[oweals/procd.git] / jail / jail.c
index 56dc9cab7337d829e5b10b192cdb50fb04f3ef88..45906904451bee621108f7e5df3fb12930fe2538 100644 (file)
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/wait.h>
+#include <sys/types.h>
 
 #include <stdlib.h>
 #include <unistd.h>
-#include <values.h>
 #include <errno.h>
-#include <stdio.h>
+#include <pwd.h>
+#include <grp.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <libgen.h>
 #include <sched.h>
+#include <linux/limits.h>
+#include <signal.h>
 
-#include "elf.h"
 #include "capabilities.h"
+#include "elf.h"
+#include "fs.h"
+#include "jail.h"
+#include "log.h"
 
-#include <libubox/list.h>
 #include <libubox/uloop.h>
+#include <libubus.h>
 
 #define STACK_SIZE     (1024 * 1024)
-#define OPT_ARGS       "P:S:C:n:r:w:d:psulo"
+#define OPT_ARGS       "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:Ey"
 
 static struct {
-       char *path;
        char *name;
+       char *hostname;
        char **jail_argv;
        char *seccomp;
        char *capabilities;
+       char *user;
+       char *group;
+       char *extroot;
+       char *overlaydir;
+       char *tmpoverlaysize;
+       int no_new_privs;
        int namespace;
        int procfs;
        int ronly;
        int sysfs;
+       int console;
+       int pw_uid;
+       int pw_gid;
+       int gr_gid;
+       int require_jail;
 } opts;
 
-struct extra {
-       struct list_head list;
-
-       const char *path;
-       const char *name;
-       int readonly;
-};
-
-static LIST_HEAD(extras);
 
 extern int pivot_root(const char *new_root, const char *put_old);
 
@@ -64,6 +72,8 @@ int debug = 0;
 
 static char child_stack[STACK_SIZE];
 
+int console_fd;
+
 static int mkdir_p(char *dir, mode_t mask)
 {
        char *l = strrchr(dir, '/');
@@ -84,120 +94,303 @@ static int mkdir_p(char *dir, mode_t mask)
                return 0;
 
        if (ret)
-               ERROR("mkdir failed on %s: %s\n", dir, strerror(errno));
+               ERROR("mkdir(%s, %d) failed: %m\n", dir, mask);
 
        return ret;
 }
 
-static int mount_bind(const char *root, const char *path, const char *name, int readonly, int error)
+static int _mount_bind(const char *root, const char *path, const char *target, int readonly, int strict, int error)
 {
-       const char *p = path;
        struct stat s;
-       char old[256];
-       char new[256];
+       char new[PATH_MAX];
        int fd;
+       int remount_flags = MS_BIND | MS_REMOUNT;
 
-       if (strstr(p, "local"))
-               p = "/lib";
-
-       snprintf(old, sizeof(old), "%s/%s", path, name);
-       snprintf(new, sizeof(new), "%s%s", root, p);
-
-       mkdir_p(new, 0755);
-
-       snprintf(new, sizeof(new), "%s%s/%s", root, p, name);
-
-       if (stat(old, &s)) {
-               ERROR("%s does not exist\n", old);
+       if (stat(path, &s)) {
+               ERROR("stat(%s) failed: %m\n", path);
                return error;
        }
 
+       snprintf(new, sizeof(new), "%s%s", root, target?target:path);
+
        if (S_ISDIR(s.st_mode)) {
                mkdir_p(new, 0755);
        } else {
+               mkdir_p(dirname(new), 0755);
+               snprintf(new, sizeof(new), "%s%s", root, target?target:path);
                fd = creat(new, 0644);
                if (fd == -1) {
-                       ERROR("failed to create %s: %s\n", new, strerror(errno));
+                       ERROR("creat(%s) failed: %m\n", new);
                        return -1;
                }
                close(fd);
        }
 
-       if (mount(old, new, NULL, MS_BIND, NULL)) {
-               ERROR("failed to mount -B %s %s: %s\n", old, new, strerror(errno));
+       if (mount(path, new, NULL, MS_BIND, NULL)) {
+               ERROR("failed to mount -B %s %s: %m\n", path, new);
                return -1;
        }
 
-       if (readonly && mount(NULL, new, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL)) {
-               ERROR("failed to remount ro %s: %s\n", new, strerror(errno));
+       if (readonly)
+               remount_flags |= MS_RDONLY;
+
+       if (strict)
+               remount_flags |= MS_NOEXEC | MS_NOSUID | MS_NODEV;
+
+       if ((strict || readonly) && mount(NULL, new, NULL, remount_flags, NULL)) {
+               ERROR("failed to remount (%s%s%s) %s: %m\n", readonly?"ro":"rw",
+                     (readonly && strict)?", ":"", strict?"strict":"", new);
                return -1;
        }
 
-       DEBUG("mount -B %s %s\n", old, new);
+       DEBUG("mount -B %s %s (%s%s%s)\n", path, new,
+             readonly?"ro":"rw", (readonly && strict)?", ":"", strict?"strict":"");
 
        return 0;
 }
 
-static int build_jail_fs()
+/*
+ * Bind-mount path from the host to the same location below root.
+ *
+ * Thin wrapper around _mount_bind(): no alternative target is given, so
+ * the mount point is root + path, and the strict remount flags are not
+ * requested. readonly and error are passed through unchanged.
+ */
+int mount_bind(const char *root, const char *path, int readonly, int error)
+{
+       return _mount_bind(root, path, NULL, readonly, 0, error);
+}
+
+/*
+ * Mount an overlayfs on top of jail_root.
+ *
+ * jail_root is used both as the lower layer and as the mount point; the
+ * upper and work directories are <overlaydir>/upper and <overlaydir>/work
+ * and are created if needed.
+ *
+ * Returns 0 on success, -1 if a path string could not be allocated, a
+ * directory could not be created or the mount itself failed.
+ */
+static int mount_overlay(char *jail_root, char *overlaydir)
+{
+       char *upperdir, *workdir, *optsstr;
+       const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
+       int ret = -1;
+
+       if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
+               goto out;
+
+       if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0)
+               goto upper_printf;
+
+       if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0)
+               goto work_printf;
+
+       /* mkdir_p() logs its own failures */
+       if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
+               goto opts_printf;
+
+       DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);
+
+       if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr)) {
+               /* report here: a bare -1 gives no hint about what went wrong */
+               ERROR("overlay mount on %s (%s) failed: %m\n", jail_root, optsstr);
+               goto opts_printf;
+       }
+
+       ret = 0;
+
+       /* unwind in reverse order of allocation */
+opts_printf:
+       free(optsstr);
+work_printf:
+       free(workdir);
+upper_printf:
+       free(upperdir);
+out:
+       return ret;
+}
+
+/*
+ * Hand console_fd over to the ubus object "container" by invoking its
+ * "console_set" method with the jail name (opts.name) as argument.
+ *
+ * The descriptor is closed locally only when both the object lookup and
+ * the invocation succeeded; otherwise the failure is logged and the
+ * descriptor is left open. Nothing happens if ubus cannot be reached.
+ */
+static void pass_console(int console_fd)
+{
+       static struct blob_buf req;
+       struct ubus_context *ctx;
+       uint32_t id;
+
+       ctx = ubus_connect(NULL);
+       if (ctx == NULL)
+               return;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", opts.name);
+
+       if (!ubus_lookup_id(ctx, "container", &id) &&
+           !ubus_invoke_fd(ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
+               close(console_fd);
+       else
+               INFO("ubus request failed\n");
+
+       blob_buf_free(&req);
+       ubus_free(ctx);
+}
+
+static int create_dev_console(const char *jail_root)
 {
-       struct library *l;
-       struct extra *m;
+       char *console_fname;
+       char dev_console_path[PATH_MAX];
+       int slave_console_fd;
+
+       /* Open UNIX/98 virtual console */
+       console_fd = posix_openpt(O_RDWR | O_NOCTTY);
+       if (console_fd == -1)
+               return -1;
+
+       console_fname = ptsname(console_fd);
+       DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
+       if (!console_fname)
+               goto no_console;
+
+       grantpt(console_fd);
+       unlockpt(console_fd);
+
+       /* pass PTY master to procd */
+       pass_console(console_fd);
+
+       /* mount-bind PTY slave to /dev/console in jail */
+       snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
+       close(creat(dev_console_path, 0620));
+
+       if (mount(console_fname, dev_console_path, NULL, MS_BIND, NULL))
+               goto no_console;
+
+       /* use PTY slave for stdio */
+       slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
+       dup2(slave_console_fd, 0);
+       dup2(slave_console_fd, 1);
+       dup2(slave_console_fd, 2);
+       close(slave_console_fd);
 
-       if (mount("tmpfs", opts.path, "tmpfs", MS_NOATIME, "mode=0755")) {
-               ERROR("tmpfs mount failed %s\n", strerror(errno));
+       INFO("using guest console %s\n", console_fname);
+
+       return 0;
+
+no_console:
+       close(console_fd);
+       return 1;
+}
+
+static int build_jail_fs(void)
+{
+       char jail_root[] = "/tmp/ujail-XXXXXX";
+       char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
+       char tmpdevdir[] = "/tmp/ujail-XXXXXX/dev";
+       char tmpdevptsdir[] = "/tmp/ujail-XXXXXX/dev/pts";
+       char *overlaydir = NULL;
+
+       if (mkdtemp(jail_root) == NULL) {
+               ERROR("mkdtemp(%s) failed: %m\n", jail_root);
                return -1;
        }
 
-       if (chdir(opts.path)) {
-               ERROR("failed to chdir() in the jail root\n");
+       /* oldroot can't be MS_SHARED else pivot_root() fails */
+       if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
+               ERROR("private mount failed %m\n");
                return -1;
        }
 
-       avl_init(&libraries, avl_strcmp, false, NULL);
-       alloc_library_path("/lib64");
-       alloc_library_path("/lib");
-       alloc_library_path("/usr/lib");
-       load_ldso_conf("/etc/ld.so.conf");
+       if (opts.extroot) {
+               if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) {
+                       ERROR("extroot mount failed %m\n");
+                       return -1;
+               }
+       } else {
+               if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
+                       ERROR("tmpfs mount failed %m\n");
+                       return -1;
+               }
+       }
+
+       if (opts.tmpoverlaysize) {
+               char mountoptsstr[] = "mode=0755,size=XXXXXXXX";
+
+               snprintf(mountoptsstr, sizeof(mountoptsstr),
+                        "mode=0755,size=%s", opts.tmpoverlaysize);
+               if (mkdtemp(tmpovdir) == NULL) {
+                       ERROR("mkdtemp(%s) failed: %m\n", jail_root);
+                       return -1;
+               }
+               if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
+                         mountoptsstr)) {
+                       ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
+                       return -1;
+               }
+               overlaydir = tmpovdir;
+       }
+
+       if (opts.overlaydir)
+               overlaydir = opts.overlaydir;
+
+       if (overlaydir)
+               mount_overlay(jail_root, overlaydir);
 
-       if (elf_load_deps(*opts.jail_argv)) {
-               ERROR("failed to load dependencies\n");
+       if (chdir(jail_root)) {
+               ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
                return -1;
        }
 
-       if (opts.seccomp && elf_load_deps("libpreload-seccomp.so")) {
-               ERROR("failed to load libpreload-seccomp.so\n");
+       snprintf(tmpdevdir, sizeof(tmpdevdir), "%s/dev", jail_root);
+       mkdir_p(tmpdevdir, 0755);
+       if (mount(NULL, tmpdevdir, "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M"))
+               return -1;
+
+       snprintf(tmpdevptsdir, sizeof(tmpdevptsdir), "%s/dev/pts", jail_root);
+       mkdir_p(tmpdevptsdir, 0755);
+       if (mount(NULL, tmpdevptsdir, "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, NULL))
+               return -1;
+
+       if (opts.console)
+               create_dev_console(jail_root);
+
+       if (mount_all(jail_root)) {
+               ERROR("mount_all() failed\n");
                return -1;
        }
 
-       avl_for_each_element(&libraries, l, avl)
-               if (mount_bind(opts.path, l->path, l->name, 1, -1))
-                       return -1;
+       if (opts.namespace & CLONE_NEWNET) {
+               char hostdir[PATH_MAX], jailetc[PATH_MAX], jaillink[PATH_MAX];
+
+               snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
+               mkdir_p(hostdir, 0755);
+               _mount_bind(jail_root, hostdir, "/tmp/resolv.conf.d", 1, 1, -1);
+               snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
+               mkdir_p(jailetc, 0755);
+               snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
+               if (overlaydir)
+                       unlink(jaillink);
+               symlink("../tmp/resolv.conf.d/resolv.conf.auto", jaillink);
+       }
 
-       list_for_each_entry(m, &extras, list)
-               if (mount_bind(opts.path, m->path, m->name, m->readonly, 0))
-                       return -1;
+       char dirbuf[sizeof(jail_root) + 4];
+       snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
+       mkdir(dirbuf, 0755);
 
-       char *mpoint;
-       if (asprintf(&mpoint, "%s/old", opts.path) < 0) {
-               ERROR("failed to alloc pivot path: %s\n", strerror(errno));
+       if (pivot_root(jail_root, dirbuf) == -1) {
+               ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
                return -1;
        }
-       mkdir_p(mpoint, 0755);
-       if (pivot_root(opts.path, mpoint) == -1) {
-               ERROR("pivot_root failed:%s\n", strerror(errno));
-               free(mpoint);
+       if (chdir("/")) {
+               ERROR("chdir(/) (after pivot_root) failed: %m\n");
                return -1;
        }
-       free(mpoint);
+
+       snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
+       umount2(dirbuf, MNT_DETACH);
+       rmdir(dirbuf);
+       if (opts.tmpoverlaysize) {
+               char tmpdirbuf[sizeof(tmpovdir) + 4];
+               snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
+               umount2(tmpdirbuf, MNT_DETACH);
+               rmdir(tmpdirbuf);
+       }
+
        umount2("/old", MNT_DETACH);
        rmdir("/old");
+
        if (opts.procfs) {
                mkdir("/proc", 0755);
                mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+               /*
+                * make /proc/sys read-only while keeping read-write to
+                * /proc/sys/net if CLONE_NEWNET is set.
+                */
+               if (opts.namespace & CLONE_NEWNET)
+                       mount("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0);
+
+               mount("/proc/sys", "/proc/sys", NULL, MS_BIND, 0);
+               mount(NULL, "/proc/sys", NULL, MS_REMOUNT | MS_RDONLY, 0);
+               mount(NULL, "/proc", NULL, MS_REMOUNT, 0);
+
+               if (opts.namespace & CLONE_NEWNET)
+                       mount("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0);
        }
        if (opts.sysfs) {
                mkdir("/sys", 0755);
-               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0);
        }
        if (opts.ronly)
                mount(NULL, "/", NULL, MS_RDONLY | MS_REMOUNT, 0);
@@ -205,14 +398,108 @@ static int build_jail_fs()
        return 0;
 }
 
+/*
+ * Write the single line "0 <id> 1" to the ID map of process child_pid:
+ * /proc/<child_pid>/gid_map when gidmap is true, otherwise
+ * /proc/<child_pid>/uid_map.
+ *
+ * Returns 0 on success, -1 if the path could not be formatted, the file
+ * could not be opened or the write failed.
+ */
+static int write_uid_gid_map(pid_t child_pid, bool gidmap, int id)
+{
+       const char *which = gidmap ? "gid_map" : "uid_map";
+       char map_path[64];
+       int map_fd;
+       int ret = 0;
+
+       if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", child_pid, which) < 0)
+               return -1;
+
+       map_fd = open(map_path, O_WRONLY);
+       if (map_fd == -1)
+               return -1;
+
+       if (dprintf(map_fd, "%d %d %d\n", 0, id, 1) == -1)
+               ret = -1;
+
+       /* the descriptor is released on both the success and the error path */
+       close(map_fd);
+       return ret;
+}
+
+/*
+ * Write "allow" (allow == true) or "deny" (allow == false) to
+ * /proc/<child_pid>/setgroups.
+ *
+ * Returns 0 on success, -1 if the path could not be formatted, the file
+ * could not be opened or the write failed.
+ */
+static int write_setgroups(pid_t child_pid, bool allow)
+{
+       int setgroups_file;
+       char setgroups_path[64];
+
+       if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups",
+               child_pid) < 0) {
+               return -1;
+       }
+
+       if ((setgroups_file = open(setgroups_path, O_WRONLY)) == -1) {
+               return -1;
+       }
+
+       /* explicit "%s": the payload must never be interpreted as a format string */
+       if (dprintf(setgroups_file, "%s", allow?"allow":"deny") < 0) {
+               close(setgroups_file);
+               return -1;
+       }
+
+       close(setgroups_file);
+       return 0;
+}
+
+/*
+ * Resolve the names in opts.user and opts.group into numeric IDs.
+ *
+ * user     - receives the uid of opts.user, or -1 if opts.user is unset
+ * user_gid - receives the primary gid of opts.user, or -1 if opts.user is unset
+ * gr_gid   - receives the gid of opts.group, or -1 if opts.group is unset
+ *
+ * Logs an error and exits with EXIT_FAILURE when a requested name cannot
+ * be looked up.
+ */
+static void get_jail_user(int *user, int *user_gid, int *gr_gid)
+{
+       struct passwd *p = NULL;
+       struct group *g = NULL;
+
+       if (opts.user) {
+               /* clear errno so the message below cannot report a stale value */
+               errno = 0;
+               p = getpwnam(opts.user);
+               if (!p) {
+                       ERROR("failed to get uid/gid for user %s: %d (%s)\n",
+                             opts.user, errno, strerror(errno));
+                       exit(EXIT_FAILURE);
+               }
+               *user = p->pw_uid;
+               *user_gid = p->pw_gid;
+       } else {
+               *user = -1;
+               *user_gid = -1;
+       }
+
+       if (opts.group) {
+               /* same reasoning as above: %m prints the current errno */
+               errno = 0;
+               g = getgrnam(opts.group);
+               if (!g) {
+                       ERROR("failed to get gid for group %s: %m\n", opts.group);
+                       exit(EXIT_FAILURE);
+               }
+               *gr_gid = g->gr_gid;
+       } else {
+               *gr_gid = -1;
+       }
+}
+
+/*
+ * Switch the calling process to the requested identity. An ID of -1
+ * means "not requested" and skips the corresponding step.
+ *
+ * Steps, in this order:
+ *   1. initgroups() for opts.user with user_gid
+ *   2. setregid() to gr_gid (real and effective)
+ *   3. setreuid() to pw_uid (real and effective)
+ *
+ * A failing step is logged and ends the process with EXIT_FAILURE.
+ */
+static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
+{
+       if (user_gid != -1) {
+               if (initgroups(opts.user, user_gid) != 0) {
+                       ERROR("failed to initgroups() for user %s: %m\n", opts.user);
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       if (gr_gid != -1) {
+               if (setregid(gr_gid, gr_gid) != 0) {
+                       ERROR("failed to set group id %d: %m\n", gr_gid);
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       if (pw_uid != -1) {
+               if (setreuid(pw_uid, pw_uid) != 0) {
+                       ERROR("failed to set user id %d: %m\n", pw_uid);
+                       exit(EXIT_FAILURE);
+               }
+       }
+}
+
 #define MAX_ENVP       8
 static char** build_envp(const char *seccomp)
 {
        static char *envp[MAX_ENVP];
-       static char preload_var[64];
-       static char seccomp_var[64];
+       static char preload_var[PATH_MAX];
+       static char seccomp_var[PATH_MAX];
        static char debug_var[] = "LD_DEBUG=all";
-       char *preload_lib = find_lib("libpreload-seccomp.so");
+       static char container_var[] = "container=ujail";
+       const char *preload_lib = find_lib("libpreload-seccomp.so");
        int count = 0;
 
        if (seccomp && !preload_lib) {
@@ -225,6 +512,9 @@ static char** build_envp(const char *seccomp)
                snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
                envp[count++] = preload_var;
        }
+
+       envp[count++] = container_var;
+
        if (debug > 1)
                envp[count++] = debug_var;
 
@@ -237,16 +527,27 @@ static void usage(void)
        fprintf(stderr, "  -d <num>\tshow debug log (increase num to increase verbosity)\n");
        fprintf(stderr, "  -S <file>\tseccomp filter config\n");
        fprintf(stderr, "  -C <file>\tcapabilities drop config\n");
+       fprintf(stderr, "  -c\t\tset PR_SET_NO_NEW_PRIVS\n");
        fprintf(stderr, "  -n <name>\tthe name of the jail\n");
        fprintf(stderr, "namespace jail options:\n");
-       fprintf(stderr, "  -P <path>\tpath where the jail will be staged\n");
+       fprintf(stderr, "  -h <hostname>\tchange the hostname of the jail\n");
+       fprintf(stderr, "  -N\t\tjail has network namespace\n");
+       fprintf(stderr, "  -f\t\tjail has user namespace\n");
+       fprintf(stderr, "  -F\t\tjail has cgroups namespace\n");
        fprintf(stderr, "  -r <file>\treadonly files that should be staged\n");
        fprintf(stderr, "  -w <file>\twriteable files that should be staged\n");
        fprintf(stderr, "  -p\t\tjail has /proc\n");
        fprintf(stderr, "  -s\t\tjail has /sys\n");
        fprintf(stderr, "  -l\t\tjail has /dev/log\n");
        fprintf(stderr, "  -u\t\tjail has a ubus socket\n");
+       fprintf(stderr, "  -U <name>\tuser to run jailed process\n");
+       fprintf(stderr, "  -G <name>\tgroup to run jailed process\n");
        fprintf(stderr, "  -o\t\tremont jail root (/) read only\n");
+       fprintf(stderr, "  -R <dir>\texternal jail rootfs (system container)\n");
+       fprintf(stderr, "  -O <dir>\tdirectory for r/w overlayfs\n");
+       fprintf(stderr, "  -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
+       fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
+       fprintf(stderr, "  -y\t\tprovide jail console\n");
        fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
 and he has the same powers as root outside the jail,\n\
 thus he can escape the jail and/or break stuff.\n\
@@ -256,41 +557,93 @@ ujail will not use namespace/build a jail,\n\
 and will only drop capabilities/apply seccomp filter.\n\n");
 }
 
-static int exec_jail()
+static int exec_jail(void *pipes_ptr)
 {
-       char **envp = build_envp(opts.seccomp);
-       if (!envp)
+       int *pipes = (int*)pipes_ptr;
+       char buf[1];
+       int pw_uid, pw_gid, gr_gid;
+
+       close(pipes[0]);
+       close(pipes[3]);
+
+       buf[0] = 'i';
+       if (write(pipes[1], buf, 1) < 1) {
+               ERROR("can't write to parent\n");
+               exit(EXIT_FAILURE);
+       }
+       if (read(pipes[2], buf, 1) < 1) {
+               ERROR("can't read from parent\n");
+               exit(EXIT_FAILURE);
+       }
+       if (buf[0] != 'O') {
+               ERROR("parent had an error, child exiting\n");
+               exit(EXIT_FAILURE);
+       }
+
+       close(pipes[1]);
+       close(pipes[2]);
+
+       if (opts.namespace & CLONE_NEWUSER) {
+               if (setgid(0) < 0) {
+                       ERROR("setgid\n");
+                       exit(EXIT_FAILURE);
+               }
+               if (setuid(0) < 0) {
+                       ERROR("setuid\n");
+                       exit(EXIT_FAILURE);
+               }
+//             if (setgroups(0, NULL) < 0) {
+//                     ERROR("setgroups\n");
+//                     exit(EXIT_FAILURE);
+//             }
+       }
+
+       if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
+                       && sethostname(opts.hostname, strlen(opts.hostname))) {
+               ERROR("sethostname(%s) failed: %m\n", opts.hostname);
                exit(EXIT_FAILURE);
+       }
+
+       if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
+               ERROR("failed to build jail fs\n");
+               exit(EXIT_FAILURE);
+       }
 
        if (opts.capabilities && drop_capabilities(opts.capabilities))
                exit(EXIT_FAILURE);
 
-       INFO("exec-ing %s\n", *opts.jail_argv);
-       execve(*opts.jail_argv, opts.jail_argv, envp);
-       //we get there only if execve fails
-       ERROR("failed to execve %s: %s\n", *opts.jail_argv, strerror(errno));
-       exit(EXIT_FAILURE);
-}
+       if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+                ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+               exit(EXIT_FAILURE);
+       }
 
-static int spawn_jail(void *arg)
-{
-       if (opts.name && sethostname(opts.name, strlen(opts.name))) {
-               ERROR("failed to sethostname: %s\n", strerror(errno));
+       if (!(opts.namespace & CLONE_NEWUSER)) {
+               get_jail_user(&pw_uid, &pw_gid, &gr_gid);
+               set_jail_user(pw_uid, pw_gid, gr_gid);
        }
 
-       if (build_jail_fs()) {
-               ERROR("failed to build jail fs");
+       char **envp = build_envp(opts.seccomp);
+       if (!envp)
                exit(EXIT_FAILURE);
-       }
 
-       return exec_jail();
+       INFO("exec-ing %s\n", *opts.jail_argv);
+       execve(*opts.jail_argv, opts.jail_argv, envp);
+       /* we get there only if execve fails */
+       ERROR("failed to execve %s: %m\n", *opts.jail_argv);
+       exit(EXIT_FAILURE);
 }
 
 static int jail_running = 1;
 static int jail_return_code = 0;
 
+static void jail_process_timeout_cb(struct uloop_timeout *t);
+static struct uloop_timeout jail_process_timeout = {
+       .cb = jail_process_timeout_cb,
+};
+
 static void jail_process_handler(struct uloop_process *c, int ret)
 {
+       uloop_timeout_cancel(&jail_process_timeout);
        if (WIFEXITED(ret)) {
                jail_return_code = WEXITSTATUS(ret);
                INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
@@ -306,38 +659,68 @@ static struct uloop_process jail_process = {
        .cb = jail_process_handler,
 };
 
-static void add_extra(char *name, int readonly)
+static void jail_process_timeout_cb(struct uloop_timeout *t) /* .cb of jail_process_timeout; that timer is cancelled by jail_process_handler() */
+{
+       DEBUG("jail process failed to stop, sending SIGKILL\n");
+       kill(jail_process.pid, SIGKILL); /* t is not used: the target is always the jailed child process */
+}
+
+static void jail_handle_signal(int signo) /* sa_handler that main() installs via sigaction() */
+{
+       DEBUG("forwarding signal %d to the jailed process\n", signo); /* NOTE(review): logging from a signal handler may not be async-signal-safe — confirm what DEBUG expands to */
+       kill(jail_process.pid, signo); /* relay the received signal unchanged to the jailed child */
+}
+
+static int netns_open_pid(const pid_t target_ns)
 {
-       struct extra *f;
+       char pid_net_path[PATH_MAX];
+
+       snprintf(pid_net_path, sizeof(pid_net_path), "/proc/%u/ns/net", target_ns);
+
+       return open(pid_net_path, O_RDONLY);
+}
 
-       if (*name != '/') {
-               ERROR("%s is not an absolute path\n", name);
+static void netns_updown(pid_t pid, bool start)
+{
+       struct ubus_context *ctx = ubus_connect(NULL);
+       static struct blob_buf req;
+       uint32_t id;
+
+       if (!ctx)
                return;
-       }
 
-       f = calloc(1, sizeof(struct extra));
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "jail", opts.name);
+       blobmsg_add_u32(&req, "pid", pid);
+       blobmsg_add_u8(&req, "start", start);
 
-       f->name = basename(name);
-       f->path = dirname(strdup(name));
-       f->readonly = readonly;
+       if (ubus_lookup_id(ctx, "network", &id) ||
+           ubus_invoke(ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
+               INFO("ubus request failed\n");
 
-       list_add_tail(&f->list, &extras);
+       blob_buf_free(&req);
+       ubus_free(ctx);
 }
 
 int main(int argc, char **argv)
 {
+       sigset_t sigmask;
        uid_t uid = getuid();
        char log[] = "/dev/log";
        char ubus[] = "/var/run/ubus.sock";
-       int ret = EXIT_SUCCESS;
-       int ch;
+       int ch, i;
+       int pipes[4];
+       char sig_buf[1];
+       int netns_fd;
 
        if (uid) {
-               ERROR("not root, aborting: %s\n", strerror(errno));
+               ERROR("not root, aborting: %m\n");
                return EXIT_FAILURE;
        }
 
        umask(022);
+       mount_list_init();
+       init_library_search();
 
        while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
                switch (ch) {
@@ -345,52 +728,92 @@ int main(int argc, char **argv)
                        debug = atoi(optarg);
                        break;
                case 'p':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.procfs = 1;
                        break;
                case 'o':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.ronly = 1;
                        break;
+               case 'f':
+                       opts.namespace |= CLONE_NEWUSER;
+                       break;
+               case 'F':
+                       opts.namespace |= CLONE_NEWCGROUP;
+                       break;
+               case 'R':
+                       opts.extroot = optarg;
+                       break;
                case 's':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.sysfs = 1;
                        break;
                case 'S':
                        opts.seccomp = optarg;
-                       add_extra(optarg, 1);
+                       add_mount(optarg, 1, -1);
                        break;
                case 'C':
                        opts.capabilities = optarg;
-                       add_extra(optarg, 1);
                        break;
-               case 'P':
-                       opts.namespace = 1;
-                       opts.path = optarg;
+               case 'c':
+                       opts.no_new_privs = 1;
                        break;
                case 'n':
                        opts.name = optarg;
                        break;
+               case 'N':
+                       opts.namespace |= CLONE_NEWNET;
+                       break;
+               case 'h':
+                       opts.namespace |= CLONE_NEWUTS;
+                       opts.hostname = optarg;
+                       break;
                case 'r':
-                       opts.namespace = 1;
-                       add_extra(optarg, 1);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_path_and_deps(optarg, 1, 0, 0);
                        break;
                case 'w':
-                       opts.namespace = 1;
-                       add_extra(optarg, 0);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_path_and_deps(optarg, 0, 0, 0);
                        break;
                case 'u':
-                       opts.namespace = 1;
-                       add_extra(ubus, 0);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_mount(ubus, 0, -1);
                        break;
                case 'l':
-                       opts.namespace = 1;
-                       add_extra(log, 0);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_mount(log, 0, -1);
+                       break;
+               case 'U':
+                       opts.user = optarg;
+                       break;
+               case 'G':
+                       opts.group = optarg;
+                       break;
+               case 'O':
+                       opts.overlaydir = optarg;
+                       break;
+               case 'T':
+                       opts.tmpoverlaysize = optarg;
+                       break;
+               case 'E':
+                       opts.require_jail = 1;
+                       break;
+               case 'y':
+                       opts.console = 1;
                        break;
                }
        }
 
-       //no <binary> param found
+       if (opts.namespace)
+               opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
+
+       if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
+               ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
+               return -1;
+       }
+
+       /* no <binary> param found */
        if (argc - optind < 1) {
                usage();
                return EXIT_FAILURE;
@@ -400,60 +823,140 @@ int main(int argc, char **argv)
                usage();
                return EXIT_FAILURE;
        }
-       DEBUG("Using namespaces(%d), capabilities(%d), seccomp(%d)\n",
+       DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
                opts.namespace,
                opts.capabilities != 0,
                opts.seccomp != 0);
 
        opts.jail_argv = &argv[optind];
 
-       if (opts.name)
-               prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
+       get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
 
-       if (opts.namespace && !opts.path && asprintf(&opts.path, "/tmp/%s", basename(*opts.jail_argv)) == -1) {
-               ERROR("failed to asprintf root path: %s\n", strerror(errno));
-               return EXIT_FAILURE;
+       if (!opts.extroot) {
+               if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
+                       ERROR("failed to load dependencies\n");
+                       return -1;
+               }
        }
 
-       if (opts.namespace && mkdir(opts.path, 0755)) {
-               ERROR("unable to create root path: %s (%s)\n", opts.path, strerror(errno));
-               return EXIT_FAILURE;
+       if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
+               ERROR("failed to load libpreload-seccomp.so\n");
+               opts.seccomp = 0;
+               if (opts.require_jail)
+                       return -1;
        }
 
+       if (opts.name)
+               prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
+
        uloop_init();
+
+       sigfillset(&sigmask);
+       for (i = 0; i < _NSIG; i++) {
+               struct sigaction s = { 0 };
+
+               if (!sigismember(&sigmask, i))
+                       continue;
+               if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
+                       continue;
+
+               s.sa_handler = jail_handle_signal;
+               sigaction(i, &s, NULL);
+       }
+
        if (opts.namespace) {
-               jail_process.pid = clone(spawn_jail,
-                       child_stack + STACK_SIZE,
-                       CLONE_NEWUTS | CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | SIGCHLD, argv);
+               if (opts.namespace & CLONE_NEWNS) {
+                       add_mount("/dev/full", 0, -1);
+                       add_mount("/dev/null", 0, -1);
+                       add_mount("/dev/random", 0, -1);
+                       add_mount("/dev/urandom", 0, -1);
+                       add_mount("/dev/zero", 0, -1);
+                       add_mount("/dev/ptmx", 0, -1);
+                       add_mount("/dev/tty", 0, -1);
+
+                       if (!opts.extroot && (opts.user || opts.group)) {
+                               add_mount("/etc/passwd", 0, -1);
+                               add_mount("/etc/group", 0, -1);
+                       }
+
+#if defined(__GLIBC__)
+                       if (!opts.extroot)
+                               add_mount("/etc/nsswitch.conf", 0, -1);
+#endif
+
+                       if (!(opts.namespace & CLONE_NEWNET)) {
+                               add_mount("/etc/resolv.conf", 0, -1);
+                       }
+               }
+
+               if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
+                       return -1;
+
+               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes);
        } else {
                jail_process.pid = fork();
        }
 
        if (jail_process.pid > 0) {
-               //parent process
+               seteuid(0);
+               /* parent process */
+               close(pipes[1]);
+               close(pipes[2]);
+               if (read(pipes[0], sig_buf, 1) < 1) {
+                       ERROR("can't read from child\n");
+                       return -1;
+               }
+               close(pipes[0]);
+               if (opts.namespace & CLONE_NEWUSER) {
+                       bool has_gr = (opts.gr_gid != -1);
+                       if (write_setgroups(jail_process.pid, false)) {
+                               ERROR("can't write setgroups\n");
+                               return -1;
+                       }
+                       if (opts.pw_uid != -1) {
+                               write_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
+                               write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+                       } else {
+                               write_uid_gid_map(jail_process.pid, 0, 65534);
+                               write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+                       }
+               }
+
+               if (opts.namespace & CLONE_NEWNET) {
+                       if (!opts.name) {
+                               ERROR("netns needs a named jail\n");
+                               return -1;
+                       }
+                       netns_fd = netns_open_pid(jail_process.pid);
+                       netns_updown(jail_process.pid, true);
+               }
+
+               sig_buf[0] = 'O';
+               if (write(pipes[3], sig_buf, 1) < 0) {
+                       ERROR("can't write to child\n");
+                       return -1;
+               }
+               close(pipes[3]);
                uloop_process_add(&jail_process);
                uloop_run();
-               uloop_done();
                if (jail_running) {
                        DEBUG("uloop interrupted, killing jail process\n");
                        kill(jail_process.pid, SIGTERM);
-                       waitpid(jail_process.pid, NULL, 0);
+                       uloop_timeout_set(&jail_process_timeout, 1000);
+                       uloop_run();
+               }
+               uloop_done();
+               if (opts.namespace & CLONE_NEWNET) {
+                       setns(netns_fd, CLONE_NEWNET);
+                       netns_updown(getpid(), false);
+                       close(netns_fd);
                }
+               return jail_return_code;
        } else if (jail_process.pid == 0) {
-               //fork child process
-               return exec_jail();
+               /* fork child process */
+               return exec_jail(NULL);
        } else {
-               ERROR("failed to clone/fork: %s\n", strerror(errno));
-               ret = EXIT_FAILURE;
-       }
-
-       if (opts.namespace && rmdir(opts.path)) {
-               ERROR("Unable to remove root path: %s (%s)\n", opts.path, strerror(errno));
-               ret = EXIT_FAILURE;
+               ERROR("failed to clone/fork: %m\n");
+               return EXIT_FAILURE;
        }
-
-       if (ret)
-               return ret;
-
-       return jail_return_code;
 }