From 6f3dbd283bbdc6f981be5d36d609ee682051f856 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Fri, 20 Mar 2020 18:20:51 +0000 Subject: [PATCH] jail: add support for userns and cgroupsns Add options to have jailed process inside new user namespace and cgroups namespace. Currently only the root user inside the container is mapped. Also, mounting /proc currently still fails in the new user namespace with permission denied for unknown reasons. Signed-off-by: Daniel Golle --- jail/jail.c | 291 ++++++++++++++++++++++++++++++++++----------- service/instance.c | 43 +++++-- service/instance.h | 4 +- 3 files changed, 255 insertions(+), 83 deletions(-) diff --git a/jail/jail.c b/jail/jail.c index df1bda7..78f0c6b 100644 --- a/jail/jail.c +++ b/jail/jail.c @@ -40,15 +40,7 @@ #include #define STACK_SIZE (1024 * 1024) -#define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:" - -#define NAMESPACE_MOUNT (1U << 0) -#define NAMESPACE_IPC (1U << 1) -#define NAMESPACE_NET (1U << 2) -#define NAMESPACE_PID (1U << 3) -#define NAMESPACE_USER (1U << 4) -#define NAMESPACE_UTS (1U << 5) -#define NAMESPACE_CGROUP (1U << 6) +#define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:fF" static struct { char *name; @@ -64,8 +56,12 @@ static struct { int procfs; int ronly; int sysfs; + int pw_uid; + int pw_gid; + int gr_gid; } opts; + extern int pivot_root(const char *new_root, const char *put_old); int debug = 0; @@ -166,7 +162,7 @@ static int build_jail_fs(void) } if (opts.extroot) { - if (mount(opts.extroot, jail_root, NULL, MS_BIND | MS_REC, NULL)) { + if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) { ERROR("extroot mount failed %m\n"); return -1; } @@ -187,7 +183,7 @@ static int build_jail_fs(void) return -1; } - if (opts.namespace & NAMESPACE_NET) { + if (opts.namespace & CLONE_NEWNET) { char hostdir[PATH_MAX], jailetc[PATH_MAX], jaillink[PATH_MAX]; snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name); @@ -231,6 +227,99 @@ static int build_jail_fs(void) return 0; } +static int write_uid_gid_map(pid_t child_pid, bool gidmap, int id) +{ + int map_file; + char map_path[64]; + const char *map_format = "%d %d %d\n"; + if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", + child_pid, gidmap?"gid_map":"uid_map") < 0) + return -1; + + if ((map_file = open(map_path, O_WRONLY)) == -1) + return -1; + + if (dprintf(map_file, map_format, 0, id, 1) == -1) { + close(map_file); + return -1; + } + + close(map_file); + return 0; +} + +static int write_setgroups(pid_t child_pid, bool allow) +{ + int setgroups_file; + char setgroups_path[64]; + + if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups", + child_pid) < 0) { + return -1; + } + + if ((setgroups_file = open(setgroups_path, O_WRONLY)) == -1) { + return -1; + } + + if (dprintf(setgroups_file, allow?"allow":"deny") == -1) { + close(setgroups_file); + return -1; + } + + close(setgroups_file); + return 0; +} + +static void get_jail_user(int *user, int *user_gid, int *gr_gid) +{ + struct passwd *p = NULL; + struct group *g = NULL; + + if (opts.user) { + p = getpwnam(opts.user); + if (!p) { + ERROR("failed to get uid/gid for user %s: %d (%s)\n", + opts.user, errno, strerror(errno)); + exit(EXIT_FAILURE); + } + *user = p->pw_uid; + *user_gid = p->pw_gid; + } else { + *user = -1; + *user_gid = -1; + } + + if (opts.group) { + g = getgrnam(opts.group); + if (!g) { + ERROR("failed to get gid for group %s: %m\n", opts.group); + exit(EXIT_FAILURE); + } + *gr_gid = g->gr_gid; + } else { + *gr_gid = -1; + } +}; + +static void set_jail_user(int pw_uid, int user_gid, int gr_gid) +{ + if ((user_gid != -1) && initgroups(opts.user, user_gid)) { + ERROR("failed to initgroups() for user %s: %m\n", opts.user); + exit(EXIT_FAILURE); + } + + if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) { + ERROR("failed to set group id %d: %m\n", gr_gid); + exit(EXIT_FAILURE); + } + + if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) { + ERROR("failed to set user id %d: %m\n", pw_uid); + exit(EXIT_FAILURE); + } +} + #define MAX_ENVP 8 static char** build_envp(const char *seccomp) { @@ -253,8 +342,7 @@ static char** build_envp(const char *seccomp) envp[count++] = preload_var; } - if (is_extroot) - envp[count++] = container_var; + envp[count++] = container_var; if (debug > 1) envp[count++] = debug_var; @@ -292,62 +380,71 @@ ujail will not use namespace/build a jail,\n\ and will only drop capabilities/apply seccomp filter.\n\n"); } -static int exec_jail(void *_notused) +static int exec_jail(void *pipes_ptr) { - struct passwd *p = NULL; - struct group *g = NULL; + int *pipes = (int*)pipes_ptr; + char buf[1]; + int pw_uid, pw_gid, gr_gid; - if (opts.capabilities && drop_capabilities(opts.capabilities)) - exit(EXIT_FAILURE); + close(pipes[0]); + close(pipes[3]); - if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { - ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n"); + + buf[0] = 'i'; + if (write(pipes[1], buf, 1) < 1) { + ERROR("can't write to parent\n"); exit(EXIT_FAILURE); } - - if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0 - && sethostname(opts.hostname, strlen(opts.hostname))) { - ERROR("sethostname(%s) failed: %m\n", opts.hostname); + if (read(pipes[2], buf, 1) < 1) { + ERROR("can't read from parent\n"); exit(EXIT_FAILURE); } - - if (opts.namespace && build_jail_fs()) { - ERROR("failed to build jail fs\n"); + if (buf[0] != 'O') { + ERROR("parent had an error, child exiting\n"); exit(EXIT_FAILURE); } - if (opts.user) { - p = getpwnam(opts.user); - if (!p) { - ERROR("failed to get uid/gid for user %s: %d (%s)\n", - opts.user, errno, strerror(errno)); + close(pipes[1]); + close(pipes[2]); + + if (opts.namespace & CLONE_NEWUSER) { + if (setgid(0) < 0) { + ERROR("setgid\n"); exit(EXIT_FAILURE); } - } - - if (opts.group) { - g = getgrnam(opts.group); - if (!g) { - ERROR("failed to get gid for group %s: %m\n", opts.group); + if (setuid(0) < 0) { + ERROR("setuid\n"); exit(EXIT_FAILURE); } +// if (setgroups(0, NULL) < 0) { +// ERROR("setgroups\n"); +// exit(EXIT_FAILURE); +// } } - if (p && p->pw_gid && initgroups(opts.user, p->pw_gid)) { - ERROR("failed to initgroups() for user %s: %m\n", opts.user); + if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0 + && sethostname(opts.hostname, strlen(opts.hostname))) { + ERROR("sethostname(%s) failed: %m\n", opts.hostname); exit(EXIT_FAILURE); } - if (g && g->gr_gid && setgid(g->gr_gid)) { - ERROR("failed to set group id %d: %m\n", g?g->gr_gid:p->pw_gid); + if (opts.namespace && build_jail_fs()) { + ERROR("failed to build jail fs\n"); exit(EXIT_FAILURE); } - if (p && p->pw_uid && setuid(p->pw_uid)) { - ERROR("failed to set user id %d: %m\n", p->pw_uid); + if (opts.capabilities && drop_capabilities(opts.capabilities)) + exit(EXIT_FAILURE); + + if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n"); exit(EXIT_FAILURE); } + if (!(opts.namespace & CLONE_NEWUSER)) { + get_jail_user(&pw_uid, &pw_gid, &gr_gid); + set_jail_user(pw_uid, pw_gid, gr_gid); + } char **envp = build_envp(opts.seccomp); if (!envp) @@ -398,12 +495,20 @@ static void jail_handle_signal(int signo) kill(jail_process.pid, signo); } -static void netns_updown(bool start) +static int netns_open_pid(const pid_t target_ns) +{ + char pid_net_path[PATH_MAX]; + + snprintf(pid_net_path, sizeof(pid_net_path), "/proc/%u/ns/net", target_ns); + + return open(pid_net_path, O_RDONLY); +} + +static void netns_updown(pid_t pid, bool start) { struct ubus_context *ctx = ubus_connect(NULL); static struct blob_buf req; uint32_t id; - pid_t pid = getpid(); if (!ctx) return; @@ -428,6 +533,9 @@ int main(int argc, char **argv) char log[] = "/dev/log"; char ubus[] = "/var/run/ubus.sock"; int ch, i; + int pipes[4]; + char sig_buf[1]; + int netns_fd; if (uid) { ERROR("not root, aborting: %m\n"); @@ -444,19 +552,24 @@ int main(int argc, char **argv) debug = atoi(optarg); break; case 'p': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; opts.procfs = 1; break; case 'o': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; opts.ronly = 1; break; + case 'f': + opts.namespace |= CLONE_NEWUSER; + break; + case 'F': + opts.namespace |= CLONE_NEWCGROUP; + break; case 'R': - opts.namespace |= NAMESPACE_MOUNT | NAMESPACE_UTS; opts.extroot = optarg; break; case 's': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; opts.sysfs = 1; break; case 'S': @@ -473,25 +586,26 @@ int main(int argc, char **argv) opts.name = optarg; break; case 'N': - opts.namespace |= NAMESPACE_NET; + opts.namespace |= CLONE_NEWNET; break; case 'h': + opts.namespace |= CLONE_NEWUTS; opts.hostname = optarg; break; case 'r': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; add_path_and_deps(optarg, 1, 0, 0); break; case 'w': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; add_path_and_deps(optarg, 0, 0, 0); break; case 'u': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; add_mount(ubus, 0, -1); break; case 'l': - opts.namespace |= NAMESPACE_MOUNT; + opts.namespace |= CLONE_NEWNS; add_mount(log, 0, -1); break; case 'U': @@ -503,6 +617,9 @@ int main(int argc, char **argv) } } + if (opts.namespace) + opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID; + /* no param found */ if (argc - optind < 1) { usage(); @@ -513,13 +630,15 @@ int main(int argc, char **argv) usage(); return EXIT_FAILURE; } - DEBUG("Using namespaces(%d), capabilities(%d), seccomp(%d)\n", + DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n", opts.namespace, opts.capabilities != 0, opts.seccomp != 0); opts.jail_argv = &argv[optind]; + get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid); + if (!opts.extroot) { if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) { ERROR("failed to load dependencies\n"); @@ -551,10 +670,7 @@ int main(int argc, char **argv) } if (opts.namespace) { - int flags = SIGCHLD | CLONE_NEWPID | CLONE_NEWIPC; - - if (opts.namespace & NAMESPACE_MOUNT) { - flags |= CLONE_NEWNS; + if (opts.namespace & CLONE_NEWNS) { add_mount("/dev/full", 0, -1); add_mount("/dev/null", 0, -1); add_mount("/dev/random", 0, -1); @@ -563,7 +679,7 @@ int main(int argc, char **argv) add_mount("/dev/zero", 0, -1); add_mount("/dev/console", 0, -1); - if (opts.user || opts.group) { + if (!opts.extroot && (opts.user || opts.group)) { add_mount("/etc/passwd", 0, -1); add_mount("/etc/group", 0, -1); } @@ -573,21 +689,50 @@ int main(int argc, char **argv) } } - if (opts.hostname) - flags |= CLONE_NEWUTS; - - if (opts.namespace & NAMESPACE_NET) { - unshare(CLONE_NEWNET); - netns_updown(true); - }; + if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0) + return -1; - jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, flags, NULL); + jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes); } else { jail_process.pid = fork(); } if (jail_process.pid > 0) { + seteuid(0); /* parent process */ + close(pipes[1]); + close(pipes[2]); + if (read(pipes[0], sig_buf, 1) < 1) { + ERROR("can't read from child\n"); + return -1; + } + close(pipes[0]); + if (opts.namespace & CLONE_NEWUSER) { + bool has_gr = (opts.gr_gid != -1); + if (write_setgroups(jail_process.pid, false)) { + ERROR("can't write setgroups\n"); + return -1; + } + if (opts.pw_uid != -1) { + write_uid_gid_map(jail_process.pid, 0, opts.pw_uid); + write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid); + } else { + write_uid_gid_map(jail_process.pid, 0, 65534); + write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534); + } + } + + if (opts.namespace & CLONE_NEWNET) { + netns_fd = netns_open_pid(jail_process.pid); + netns_updown(jail_process.pid, true); + } + + sig_buf[0] = 'O'; + if (write(pipes[3], sig_buf, 1) < 0) { + ERROR("can't write to child\n"); + return -1; + } + close(pipes[3]); uloop_process_add(&jail_process); uloop_run(); if (jail_running) { @@ -597,9 +742,11 @@ int main(int argc, char **argv) uloop_run(); } uloop_done(); - if (opts.namespace & NAMESPACE_NET) - netns_updown(false); - + if (opts.namespace & CLONE_NEWNET) { + setns(netns_fd, CLONE_NEWNET); + netns_updown(getpid(), false); + close(netns_fd); + } return jail_return_code; } else if (jail_process.pid == 0) { /* fork child process */ diff --git a/service/instance.c b/service/instance.c index 47b7d09..d29aa0d 100644 --- a/service/instance.c +++ b/service/instance.c @@ -62,6 +62,7 @@ enum { INSTANCE_ATTR_RELOADSIG, INSTANCE_ATTR_TERMTIMEOUT, INSTANCE_ATTR_FACILITY, + INSTANCE_ATTR_EXTROOT, __INSTANCE_ATTR_MAX }; @@ -89,6 +90,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = { [INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 }, [INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 }, [INSTANCE_ATTR_FACILITY] = { "facility", BLOBMSG_TYPE_STRING }, + [INSTANCE_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING }, }; enum { @@ -101,8 +103,9 @@ enum { JAIL_ATTR_RONLY, JAIL_ATTR_MOUNT, JAIL_ATTR_NETNS, + JAIL_ATTR_USERNS, + JAIL_ATTR_CGROUPSNS, JAIL_ATTR_REQUIREJAIL, - JAIL_ATTR_EXTROOT, __JAIL_ATTR_MAX, }; @@ -116,8 +119,9 @@ static const struct blobmsg_policy jail_attr[__JAIL_ATTR_MAX] = { [JAIL_ATTR_RONLY] = { "ronly", BLOBMSG_TYPE_BOOL }, [JAIL_ATTR_MOUNT] = { "mount", BLOBMSG_TYPE_TABLE }, [JAIL_ATTR_NETNS] = { "netns", BLOBMSG_TYPE_BOOL }, + [JAIL_ATTR_USERNS] = { "userns", BLOBMSG_TYPE_BOOL }, + [JAIL_ATTR_CGROUPSNS] = { "cgroupsns", BLOBMSG_TYPE_BOOL }, [JAIL_ATTR_REQUIREJAIL] = { "requirejail", BLOBMSG_TYPE_BOOL }, - [JAIL_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING }, }; struct instance_netdev { @@ -260,9 +264,15 @@ jail_run(struct service_instance *in, char **argv) if (jail->netns) argv[argc++] = "-N"; - if (jail->extroot) { + if (jail->userns) + argv[argc++] = "-f"; + + if (jail->cgroupsns) + argv[argc++] = "-F"; + + if (in->extroot) { argv[argc++] = "-R"; - argv[argc++] = jail->extroot; + argv[argc++] = in->extroot; } blobmsg_list_for_each(&jail->mount, var) { @@ -870,9 +880,13 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr) jail->netns = blobmsg_get_bool(tb[JAIL_ATTR_NETNS]); jail->argc++; } - if (tb[JAIL_ATTR_EXTROOT]) { - jail->extroot = strdup(blobmsg_get_string(tb[JAIL_ATTR_EXTROOT])); - jail->argc += 2; + if (tb[JAIL_ATTR_USERNS]) { + jail->userns = blobmsg_get_bool(tb[JAIL_ATTR_USERNS]); + jail->argc++; + } + if (tb[JAIL_ATTR_CGROUPSNS]) { + jail->cgroupsns = blobmsg_get_bool(tb[JAIL_ATTR_CGROUPSNS]); + jail->argc++; } if (tb[JAIL_ATTR_MOUNT]) { @@ -892,6 +906,10 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr) if (in->group) jail->argc += 2; + if (in->extroot) { + jail->argc += 2; + } + if (in->no_new_privs) jail->argc++; @@ -1003,6 +1021,9 @@ instance_config_parse(struct service_instance *in) if (!in->trace && tb[INSTANCE_ATTR_SECCOMP]) in->seccomp = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_SECCOMP])); + if (tb[INSTANCE_ATTR_EXTROOT]) + in->extroot = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_EXTROOT])); + if (tb[INSTANCE_ATTR_PIDFILE]) { char *pidfile = blobmsg_get_string(tb[INSTANCE_ATTR_PIDFILE]); if (pidfile) @@ -1151,7 +1172,7 @@ instance_free(struct service_instance *in) free(in->config); free(in->user); free(in->group); - free(in->jail.extroot); + free(in->extroot); free(in->jail.name); free(in->jail.hostname); free(in->seccomp); @@ -1275,14 +1296,16 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose) blobmsg_add_string(b, "name", in->jail.name); if (in->jail.hostname) blobmsg_add_string(b, "hostname", in->jail.hostname); - if (in->jail.extroot) - blobmsg_add_string(b, "extroot", in->jail.extroot); + if (in->extroot) + blobmsg_add_string(b, "extroot", in->extroot); blobmsg_add_u8(b, "procfs", in->jail.procfs); blobmsg_add_u8(b, "sysfs", in->jail.sysfs); blobmsg_add_u8(b, "ubus", in->jail.ubus); blobmsg_add_u8(b, "log", in->jail.log); blobmsg_add_u8(b, "ronly", in->jail.ronly); blobmsg_add_u8(b, "netns", in->jail.netns); + blobmsg_add_u8(b, "userns", in->jail.userns); + blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns); blobmsg_close_table(b, r); if (!avl_is_empty(&in->jail.mount.avl)) { struct blobmsg_list_node *var; diff --git a/service/instance.h b/service/instance.h index 29406c9..7e80c61 100644 --- a/service/instance.h +++ b/service/instance.h @@ -30,9 +30,10 @@ struct jail { bool log; bool ronly; bool netns; + bool userns; + bool cgroupsns; char *name; char *hostname; - char *extroot; struct blobmsg_list mount; int argc; }; @@ -65,6 +66,7 @@ struct service_instance { struct jail jail; char *seccomp; char *pidfile; + char *extroot; int syslog_facility; int exit_code; -- 2.25.1