From 32d40e541ba635d115b903a3613a4f7581ab26ee Mon Sep 17 00:00:00 2001
From: Johannes 'josch' Schauer <josch@mister-muffin.de>
Date: Sun, 25 Oct 2015 18:32:45 +0100
Subject: [PATCH] initial commit

---
 user-unshare | 436 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100755 user-unshare

diff --git a/user-unshare b/user-unshare
new file mode 100755
index 0000000..edf2ac2
--- /dev/null
+++ b/user-unshare
@@ -0,0 +1,436 @@
+#!/usr/bin/perl
+#
+# Copyright: 2015 Johannes Schauer <josch@mister-muffin.de>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+#
+# This tool tries to be like unshare(1) but with the power of lxc-usernsexec(1)
+# to map more than one id into the new user namespace by using the programs
+# newgidmap and newuidmap. Or in other words: This tool tries to be like
+# lxc-usernsexec(1) but with the power of unshare(1) to unshare more than just
+# the user and mount namespaces.
+#
+# I arbitrarily called it user-unshare because it's like unshare(1) but
+# doesn't require root (except for the suid root new[ug]idmap programs).
+#
+# It is essentially equal to calling:
+#
+#  $ lxc-usernsexec [opts] -- unshare [opts] -- COMMAND
+#
+# Its main points of existence are:
+#
+#  - as a project for me to learn how unprivileged namespaces work
+#  - written in Perl which means:
+#       - architecture independent (same executable on any architecture)
+#       - easily inspectable by other curious minds
+#  - tons of code comments to let others understand how things work
+#  - no need to install the lxc package in a minimal environment (perl itself
+#    might not be called minimal either but is present in every Debian
+#    installation)
+#
+# Further differences to unshare(1):
+#
+#  - No --setgroups and --map-root-user options. These were only useful when
+#    not more than a single uid and gid was available inside the user
+#    namespace
+#  - CLONE_NEWUSER is always enabled (so there is no --user option)
+#  - persistent namespaces to be used with nsenter cannot be supported because
+#    they require that you have permissions to do `mount --bind` in the host
+#    namespace which requires root privileges
+#  - the --mount-proc=XXX option actually works with XXX being other
+#    directories than /proc
+#  - you are immediately root in the new user namespace and can then switch to
+#    another user with runuser(1) if you so wish
+#
+# Further differences to lxc-usernsexec(1):
+#
+#  - there is no parent that is just waiting for the child to exit and
+#    otherwise wasting pid space (unless you request --fork)
+#  - requires only one pipe instead of two for IPC
+#  - the -m option is now uppercase -M because -m was already the short option
+#    for --mount in unshare(1)
+#
+# How it differs from other tools:
+#
+#  - systemd-nspawn requires to be executed by root and this does not seem
+#    likely to change any time soon:
+#    http://lists.freedesktop.org/archives/systemd-devel/2015-February/028139.html
+#  - linux-user-chroot cheats by being suid root
+#
+#
+# Debian kernels carry a patch named
+# add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch
+# which disables unprivileged usernamespaces by default, to enable it do:
+#  $ echo 1 | sudo tee /proc/sys/kernel/unprivileged_userns_clone > /dev/null
+# or
+#  $ sysctl -w kernel.unprivileged_userns_clone=1
+
+use warnings;
+use strict;
+
+require 'syscall.ph';
+
+use Getopt::Long;
+use Pod::Usage;
+
+# Read /etc/subuid and /etc/subgid and return the id mappings for $username as
+# a list of ["u"|"g", nsid, hostid, count] array refs which map the first
+# subordinate id range of the user to the ids starting at 0 in the namespace.
+# A missing file is skipped; dies if an existing file lacks $username.
+sub read_subuid_subgid($) {
+	my $username = shift;
+	defined $username or die "cannot determine the name of the current user";
+	my @result;
+
+	foreach my $spec (["u", "/etc/subuid"], ["g", "/etc/subgid"]) {
+		my ($type, $path) = @{$spec};
+		next unless -f $path;
+		open my $fh, "<", $path or die "cannot open $path for reading: $!";
+		my $found;
+		while (my $line = <$fh>) {
+			# drop the newline so it does not end up in the count field
+			chomp $line;
+			my ($n, $subid, $num_subid) = split(/:/, $line, 3);
+			next unless defined $num_subid and $n eq $username;
+			$found = [$type, 0, $subid, $num_subid];
+			last;
+		}
+		close $fh;
+		defined $found or die "no entry for user $username in $path";
+		push @result, $found;
+	}
+
+	return @result;
+}
+
+# FIXME: get rid of these constants
+# namespace flags for unshare(), values taken from sched.h
+my $CLONE_NEWNS   = 1 << 17;
+my $CLONE_NEWUTS  = 1 << 26;
+my $CLONE_NEWIPC  = 1 << 27;
+my $CLONE_NEWUSER = 1 << 28;
+my $CLONE_NEWPID  = 1 << 29;
+my $CLONE_NEWNET  = 1 << 30;
+# mount flags for mount(), values taken from sys/mount.h
+my $MS_NOSUID     = 1 << 1;
+my $MS_NODEV      = 1 << 2;
+my $MS_NOEXEC     = 1 << 3;
+my $MS_BIND       = 1 << 12;
+my $MS_REC        = 1 << 14;
+my $MS_PRIVATE    = 1 << 18;
+my $MS_SLAVE      = 1 << 19;
+
+my $unshare_flags = $CLONE_NEWUSER; # the user namespace is always unshared
+my @idmap;                          # [type, nsid, hostid, count] per -M option
+my ($procmnt, $fork);               # --mount-proc target and --fork flag
+
+Getopt::Long::Configure ("bundling");
+GetOptions(
+	'h|help'   => sub { pod2usage(-exitval => 0, -verbose => 2) },
+	"i|ipc"    => sub { $unshare_flags |= $CLONE_NEWIPC },
+	"m|mount"  => sub { $unshare_flags |= $CLONE_NEWNS },
+	"n|net"    => sub { $unshare_flags |= $CLONE_NEWNET },
+	"p|pid"    => sub { $unshare_flags |= $CLONE_NEWPID },
+	"u|uts"    => sub { $unshare_flags |= $CLONE_NEWUTS },
+	"mount-proc:s" => sub {
+		$fork = 1; # mounting proc requires a forked child or otherwise EPERM
+		$procmnt = $_[1] eq "" ? "/proc" : $_[1]; # /proc is the default
+		$unshare_flags |= $CLONE_NEWNS; },        # implicitly enable --mount
+	"f|fork" => \$fork,
+	"M=s"    => sub { # parse -M option and make sure it consists of four parts
+		my @v = split /:/, $_[1], 4;
+		die "invalid format for -M option" if scalar @v != 4;
+		push @idmap, \@v },
+) or pod2usage(-exitval => 2, -verbose => 1);
+@ARGV or pod2usage(-exitval => 2, -verbose => 1, -message => "no command given");
+
+# If the -M option was not given, read /etc/sub[ug]id to get the right
+# sub[ug]ids for the new[gu]idmap calls later by the child process.
+#
+# new[ug]idmap is called with the exact values from /etc/sub[ug]id,
+# respectively.
+if (scalar @idmap == 0) {
+	# getpwuid returns undef if the real uid has no passwd entry, in which
+	# case there is no name to look up in /etc/sub[ug]id
+	my $username = getpwuid($<) // die "cannot find a username for uid $<";
+	@idmap = read_subuid_subgid $username;
+}
+
+# Create a pipe for the parent process to signal the child process that it is
+# done with calling unshare() so that the child can go ahead setting up
+# uid_map and gid_map. Without the pipe there is no way to synchronize.
+pipe my $rfh, my $wfh or die "pipe() failed: $!";
+
+# We have to do this dance with forking a process and then modifying the
+# parent from the child because:
+#  - new[ug]idmap can only be called on a process id after that process has
+#    unshared the user namespace
+#  - a process loses its capabilities if it performs an execve() with nonzero
+#    user ids, see the capabilities(7) man page for details.
+#  - a process that unshared the user namespace by default does not have the
+#    privileges to call new[ug]idmap on itself
+#
+# this also works the other way around (the child setting up a user namespace
+# and being modified from the parent) but that way, the parent would have to
+# stay around until the child exited (so a pid would be wasted). Additionally,
+# that variant would require an additional pipe to let the parent signal the
+# child that it is done with calling new[ug]idmap. The way it is done here,
+# this signaling can instead be done by wait()-ing for the exit of the child.
+my $ppid = $$;
+my $cpid = fork() // die "fork() failed: $!";
+if ($cpid == 0) {
+	# child
+
+	# Close the writing descriptor at our end of the pipe so that we see EOF
+	# when parent closes its descriptor.
+	close $wfh;
+
+	# Wait for the parent process to finish its unshare() call by waiting for
+	# an EOF. sysread returns undef on error which must not be taken for EOF.
+	my $nread = sysread $rfh, my $c, 1;
+	defined $nread and $nread == 0 or die "read() did not receive EOF";
+
+	# The programs new[ug]idmap have to be used because they are setuid root.
+	# These privileges are needed to map the ids from /etc/sub[ug]id to the
+	# user namespace set up by the parent. Without these privileges, only the
+	# id of the user itself can be mapped into the new namespace.
+	#
+	# Since new[ug]idmap is setuid root we also don't need to write "deny" to
+	# /proc/$$/setgroups beforehand (otherwise required since kernel 3.19 for
+	# unprivileged processes writing to /proc/$$/gid_map) and therefore the
+	# parent process keeps its ability to change its own group here.
+	#
+	# /proc/$ppid/[ug]id_map can only be written to once each, so all ranges
+	# go into a single new[ug]idmap call; list-form system() avoids a shell.
+	my (@uidmap, @gidmap);
+	foreach my $map (@idmap) {
+		my ($t, $nsid, $hostid, $range) = @{$map};
+		if ($t ne "u" and $t ne "g" and $t ne "b") {
+			die "invalid idmap type: $t";
+		}
+		my @ids = ($nsid, $hostid, $range);
+		foreach my $id (@ids) {
+			# digits only; a trailing newline from /etc/sub[ug]id is dropped
+			defined $id and $id =~ /\A(\d+)\s*\z/ or die "invalid idmap value";
+			$id = $1;
+		}
+		push @uidmap, @ids if $t eq "u" or $t eq "b";
+		push @gidmap, @ids if $t eq "g" or $t eq "b";
+	}
+	foreach my $cmd (["newuidmap", @uidmap], ["newgidmap", @gidmap]) {
+		my ($prog, @ranges) = @{$cmd};
+		next unless @ranges;
+		system($prog, $ppid, @ranges) == 0
+			or die "$prog failed: " . ($? == -1 ? $! : "wait status $?");
+	}
+	exit 0;
+}
+
+# parent
+
+# After fork()-ing, the parent immediately unshares the requested namespaces.
+# The child is blocked on the pipe until this has happened.
+die "unshare() failed: $!" if syscall(&SYS_unshare, $unshare_flags) != 0;
+
+# Closing our write end makes the child read EOF, which is its signal that
+# the unshare() call is done.
+close $wfh;
+
+# The child exits once it is finished with the id maps, so waiting for it
+# tells us when (and whether) that setup succeeded.
+die "waitpid() failed: $!" unless waitpid($cpid, 0) == $cpid;
+die "child had a non-zero exit status: $?" if $? != 0;
+
+# At this point we are still nobody (uid and gid are 65534), so switch to the
+# root user and the root group of the new namespace.
+#
+# Raw syscalls are used rather than assigning to $(, $), $< and $> because
+# perl would then do extra work that is neither needed nor wanted, such as
+# consulting /proc/sys/kernel/ngroups_max (which might not exist), and would
+# call setgroups() in a way that leaves root a member of the group unknown.
+# Here setgroups() is called last, with a count of 0 and a null list.
+syscall(&SYS_setgid, 0) == 0 or die "setgid failed: $!";
+syscall(&SYS_setuid, 0) == 0 or die "setuid failed: $!";
+syscall(&SYS_setgroups, 0, 0) == 0 or die "setgroups failed: $!";
+
+# At this point lxc-usernsexec is checking whether / is mounted MS_SHARED and
+# if yes, will mount / as MS_SLAVE|MS_REC - not sure why we have to do this
+# and it seems to work without so we don't do it.
+#0 == syscall &SYS_mount, 0, my $t = "/", 0, $MS_SLAVE | $MS_REC, 0 or die "mount() failed: $!";
+
+# When the pid namespace is unshared as well, the command must run in a
+# forked child: unshare(CLONE_NEWPID) does not move the calling process into
+# the new pid namespace, only the children it creates afterwards, and the
+# first of them becomes pid 1 there (see pid_namespaces(7)). So instead of
+# exec()-ing directly we fork() here and let this process wait for the
+# child. This is also what the unshare tool does when used with the --fork
+# option. Once pid 1 of the namespace has exited, nothing new can be forked
+# in it anymore.
+if ($fork) {
+	my $cpid = fork() // die "fork() failed: $!";
+	if ($cpid != 0) {
+		# parent: stay alive until the child finishes executing and then
+		# hand its result on to our own caller.
+		$cpid == waitpid $cpid, 0 or die "waitpid() failed: $!";
+		# $? is a raw wait status (exit code << 8 | signal); passing it to
+		# exit() directly would turn e.g. exit code 1 (status 256) into 0.
+		exit(128 + ($? & 127)) if $? & 127;
+		exit($? >> 8);
+	}
+}
+
+if (defined $procmnt) {
+	# Directly bind-mounting the system's /proc into $procmnt would be wrong
+	# because we want to restrict proc to the new pid namespace. Thus we need
+	# to mount a new proc.
+	#
+	# FIXME: I wasn't able to figure out how to directly mount a new proc
+	# into any other location than the old /proc without getting EINVAL.
+	# The workaround is to mount the new proc over the old /proc and then
+	# bind mount that into the new location.
+	#
+	# This operation also fails with:
+	#    $ lxc-usernsexec -- unshare --mount-proc=... --fork --pid --mount -- ...
+	# or even with:
+	#    $ unshare --mount-proc=... --fork --pid --mount --user -- ...
+	# the authors of unshare(1) should thus be made aware of the workaround
+	# below (or whatever the real fix ends up being):
+	#
+	# First make /proc a private mount. The "my $x = ..." constructs are needed
+	# because string literals cannot be passed as arguments to syscall.
+	0 == syscall &SYS_mount, my $s1 = "none", my $t1 = "/proc", 0,
+		$MS_PRIVATE | $MS_REC, 0 or die "mount() failed: $!";
+	# This fails if we are not a forked child, hence --mount-proc sets $fork.
+	0 == syscall &SYS_mount, my $s2 = "proc", my $t2 = "/proc", my $t = "proc",
+		$MS_NOSUID|$MS_NOEXEC|$MS_NODEV, 0 or die "mount() failed: $!";
+	# Only do a bindmount if proc was required to be mounted elsewhere than
+	# /proc
+	if ($procmnt ne "/proc") {
+		0 == syscall &SYS_mount, my $s = "/proc", $procmnt, 0,
+			$MS_BIND | $MS_REC, 0 or die "mount failed: $!";
+	}
+}
+
+## having this variable set could cause programs looking for unreachable
+## machines # see http://bugs.debian.org/780587
+#delete $ENV{http_proxy};
+
+# Everything is set up: replace this process with the requested command.
+exec(@ARGV) or die "exec() failed: $!";
+
+__END__
+=head1 NAME
+
+user-unshare - unprivileged linux namespaces with uid and gid maps
+
+=head1 SYNOPSIS
+
+user-unshare [options] [--] command
+
+=head1 DESCRIPTION
+
+This tool combines the ability of lxc-usernsexec(1) to map ranges of user and
+group ids into a new user namespace with the ability of unshare(1) to unshare
+several different kinds of namespaces. The options of this command are nearly
+a plain copy of lxc-usernsexec(1) and unshare(1).
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<-h, --help>
+
+Print a brief help message and exit.
+
+=item B<-M> I<uidmap>
+
+The id map to use in the user namespace. Each map consists of four
+colon-separated values. First a character 'u', 'g' or 'b' to specify whether
+this map pertains to user ids, group ids, or both; next the first id in
+the user namespace; next the first id as seen on the host; and finally
+the number of ids to be mapped.
+
+More than one map can be specified. If no map is specified, then by default
+the full uid and gid ranges granted by /etc/subuid and /etc/subgid will be
+mapped to the uids and gids starting at 0 in the container.
+
+Note that user-unshare always tries to setuid and setgid to 0 in the
+namespace. Therefore uid 0 and gid 0 in the namespace must be mapped.
+
+=item B<-i, --ipc>
+
+Unshare the IPC namespace.
+
+=item B<-m, --mount>
+
+Unshare the mount namespace.
+
+=item B<-n, --net>
+
+Unshare the network namespace.
+
+=item B<-p, --pid>
+
+Unshare the pid namespace. See also the --fork and --mount-proc options.
+
+=item B<-u, --uts>
+
+Unshare the UTS namespace.
+
+=item B<-f, --fork>
+
+Fork the specified program as a child process of user-unshare rather than
+running it directly. This is useful when creating a new pid namespace.
+
+=item B<--mount-proc>I<[=mountpoint]>
+
+Just before running the program, mount the proc filesystem at I<mountpoint>
+(default is /proc). This is useful when creating a new pid namespace. It
+implies B<--fork> as well as creating a new mount namespace since the /proc
+mount would otherwise mess up existing programs on the system. The new proc
+filesystem is explicitly mounted as private (by MS_PRIVATE|MS_REC).
+
+=back
+
+=head1 EXAMPLES
+
+This:
+
+	lxc-usernsexec -- unshare --mount-proc --fork --ipc --pid --net --mount \
+		-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
+
+is equivalent to:
+
+	user-unshare --mount-proc --fork --ipc --pid --net --mount \
+		-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
+
+Notice that the options are exactly identical.
+
+If your user id is 1000, root in a container is mapped to 190000, and you
+wish to chown a file you own to root in the container, you can use:
+
+	lxc-usernsexec -m b:0:1000:1 -m b:1:190000:1 -- /bin/chown 1:1 $file
+
+which is equivalent to:
+
+	user-unshare -M b:0:1000:1 -M b:1:190000:1 -- /bin/chown 1:1 $file
+
+This maps your userid to root in the user namespace, and 190000 to uid 1.
+Since root in the user namespace is privileged over all userids mapped into
+the namespace, you are allowed to change the file ownership, which you could
+not do on the host using a simple chown.
+
+=head1 SEE ALSO
+
+	unshare(1), lxc-usernsexec(1), user_namespaces(7), newgidmap(1), newuidmap(1)
+
+=cut