initial commit
This commit is contained in:
commit
32d40e541b
1 changed files with 436 additions and 0 deletions
436
user-unshare
Executable file
436
user-unshare
Executable file
|
@ -0,0 +1,436 @@
|
|||
#!/usr/bin/perl
|
||||
#
|
||||
# Copyright: 2015 Johannes Schauer <josch@mister-muffin.de>
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to
|
||||
# deal in the Software without restriction, including without limitation the
|
||||
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
# sell copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
#
|
||||
# This tool tries to be like unshare(1) but with the power of lxc-usernsexec(1)
|
||||
# to map more than one id into the new user namespace by using the programs
|
||||
# newgidmap and newuidmap. Or in other words: This tool tries to be like
|
||||
# lxc-usernsexec(1) but with the power of unshare(1) to unshare more than just
|
||||
# the user and mount namespaces.
|
||||
#
|
||||
# I arbitrarily called it user-unshare because it's like unshare(1) but
|
||||
# doesn't require root (except for the suid root new[ug]idmap programs).
|
||||
#
|
||||
# It is essentially equal to calling:
|
||||
#
|
||||
# $ lxc-usernsexec [opts] -- unshare [opts] -- COMMAND
|
||||
#
|
||||
# Its main points of existence are:
|
||||
#
|
||||
# - as a project for me to learn how unprivileged namespaces work
|
||||
# - written in Perl which means:
|
||||
# - architecture independent (same executable on any architecture)
|
||||
# - easily inspectable by other curious minds
|
||||
# - tons of code comments to let others understand how things work
|
||||
# - no need to install the lxc package in a minimal environment (perl itself
|
||||
# might not be called minimal either but is present in every Debian
|
||||
# installation)
|
||||
#
|
||||
# Further differences to unshare(1):
|
||||
#
|
||||
# - No --setgroups and --map-root-user options. These were only useful when
|
||||
# not more than a single uid and gid was available inside the user
|
||||
# namespace
|
||||
# - CLONE_NEWUSER is always enabled (so there is no --user option)
|
||||
# - persistent namespaces to be used with nsenter cannot be supported because
|
||||
# they require that you have permissions to do `mount --bind` in the host
|
||||
# namespace which requires root privileges
|
||||
# - the --mount-proc=XXX option actually works with XXX being other
|
||||
# directories than /proc
|
||||
# - you are immediately root in the new user namespace and can then switch to
|
||||
# another user with runuser(1) if you so wish
|
||||
#
|
||||
# Further differences to lxc-usernsexec(1):
|
||||
#
|
||||
# - there is no parent that is just waiting for the child to exit and
|
||||
# otherwise wasting pid space (unless you request --fork)
|
||||
# - requires only one pipe instead of two for IPC
|
||||
# - the -m option is now uppercase -M because -m was already the short option
|
||||
# for --mount in unshare(1)
|
||||
#
|
||||
# How it differs from other tools:
|
||||
#
|
||||
# - systemd-nspawn requires to be executed by root and this does not seem
|
||||
# likely to change any time soon:
|
||||
# http://lists.freedesktop.org/archives/systemd-devel/2015-February/028139.html
|
||||
# - linux-user-chroot cheats by being suid root
|
||||
#
|
||||
#
|
||||
# Debian kernels carry a patch named
|
||||
# add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch
|
||||
# which disables unprivileged usernamespaces by default, to enable it do:
|
||||
# $ echo 1 | sudo tee /proc/sys/kernel/unprivileged_userns_clone > /dev/null
|
||||
# or
|
||||
# $ sysctl -w kernel.unprivileged_userns_clone=1
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
require 'syscall.ph';
|
||||
|
||||
use Getopt::Long;
|
||||
use Pod::Usage;
|
||||
|
||||
# read the files /etc/subuid and /etc/subgid and return the mapping of user and
|
||||
# group ids, respectively for the given username
|
||||
sub read_subuid_subgid($) {
|
||||
my $username = shift;
|
||||
my ($subid, $num_subid, $fh, $n);
|
||||
my @result;
|
||||
|
||||
if (-f "/etc/subuid") {
|
||||
open $fh, "<", "/etc/subuid" or die "cannot open /etc/subuid for reading: $!";
|
||||
while (my $line = <$fh>) {
|
||||
($n, $subid, $num_subid) = split(/:/, $line, 3);
|
||||
last if ($n eq $username);
|
||||
}
|
||||
close $fh;
|
||||
push @result, ["u", 0, $subid, $num_subid];
|
||||
}
|
||||
|
||||
if ( -f "/etc/subgid") {
|
||||
open $fh, "<", "/etc/subgid" or die "cannot open /etc/subgid for reading: $!";
|
||||
while (my $line = <$fh>) {
|
||||
($n, $subid, $num_subid) = split(/:/, $line, 3);
|
||||
last if ($n eq $username);
|
||||
}
|
||||
close $fh;
|
||||
push @result, ["g", 0, $subid, $num_subid];
|
||||
}
|
||||
|
||||
return @result;
|
||||
}
|
||||
|
||||
# FIXME: get rid of these constants
|
||||
# from sched.h
|
||||
my $CLONE_NEWNS = 0x20000;
|
||||
my $CLONE_NEWUTS = 0x4000000;
|
||||
my $CLONE_NEWIPC = 0x8000000;
|
||||
my $CLONE_NEWUSER = 0x10000000;
|
||||
my $CLONE_NEWPID = 0x20000000;
|
||||
my $CLONE_NEWNET = 0x40000000;
|
||||
# from sys/mount.h
|
||||
my $MS_NOSUID = 0x2;
|
||||
my $MS_NODEV = 0x4;
|
||||
my $MS_NOEXEC = 0x8;
|
||||
my $MS_BIND = 0x1000;
|
||||
my $MS_REC = 0x4000;
|
||||
my $MS_PRIVATE = 0x40000;
|
||||
my $MS_SLAVE = 0x80000;
|
||||
|
||||
my $unshare_flags = $CLONE_NEWUSER;
|
||||
my @idmap;
|
||||
my $procmnt = undef;
|
||||
my $fork;
|
||||
|
||||
Getopt::Long::Configure ("bundling");
|
||||
GetOptions(
|
||||
'h|help' => sub { pod2usage(-exitval => 0, -verbose => 2) },
|
||||
"i|ipc" => sub { $unshare_flags |= $CLONE_NEWIPC },
|
||||
"m|mount" => sub { $unshare_flags |= $CLONE_NEWNS },
|
||||
"n|net" => sub { $unshare_flags |= $CLONE_NEWNET },
|
||||
"p|pid" => sub { $unshare_flags |= $CLONE_NEWPID },
|
||||
"u|uts" => sub { $unshare_flags |= $CLONE_NEWUTS },
|
||||
"mount-proc:s" => sub {
|
||||
$fork = 1; # mounting proc requires a forked child or otherwise EPERM
|
||||
$procmnt = $_[1] eq "" ? "/proc" : $_[1]; # /proc is the default
|
||||
$unshare_flags |= $CLONE_NEWNS; }, # implicitly enable --mount
|
||||
"f|fork" => \$fork,
|
||||
"M=s" => sub { # parse -M option and make sure it consists of four parts
|
||||
my @v = split /:/, $_[1], 4;
|
||||
die "invalid format for -m option" if scalar @v != 4;
|
||||
push @idmap, \@v },
|
||||
) or pod2usage(-exitval => 2, -verbose => 1);
|
||||
|
||||
my $origuid = $<;
|
||||
my $origgid = $(;
|
||||
|
||||
# If the -M option was not given, read /etc/sub[ug]id to get the right
|
||||
# sub[ug]ids for the new[gu]idmap calls later by the child process.
|
||||
#
|
||||
# new[ug]idmap is called with the exact values from /etc/sub[ug]id,
|
||||
# respectively.
|
||||
if (scalar @idmap == 0) {
|
||||
@idmap = read_subuid_subgid getpwuid $<;
|
||||
}
|
||||
|
||||
# Create a pipe for the parent process to signal the child process that it is
|
||||
# done with calling unshare() so that the child can go ahead setting up
|
||||
# uid_map and gid_map.
|
||||
pipe my $rfh, my $wfh;
|
||||
|
||||
# We have to do this dance with forking a process and then modifying the
|
||||
# parent from the child because:
|
||||
# - new[ug]idmap can only be called on a process id after that process has
|
||||
# unshared the user namespace
|
||||
# - a process looses its capabilities if it performs an execve() with nonzero
|
||||
# user ids see the capabilities(7) man page for details.
|
||||
# - a process that unshared the user namespace by default does not have the
|
||||
# privileges to call new[ug]idmap on itself
|
||||
#
|
||||
# this also works the other way around (the child setting up a user namespace
|
||||
# and being modified from the parent) but that way, the parent would have to
|
||||
# stay around until the child exited (so a pid would be wasted). Additionally,
|
||||
# that variant would require an additional pipe to let the parent signal the
|
||||
# child that it is done with calling new[ug]idmap. The way it is done here,
|
||||
# this signaling can instead be done by wait()-ing for the exit of the child.
|
||||
my $ppid = $$;
|
||||
my $cpid = fork() // die "fork() failed: $!";
|
||||
if ($cpid == 0) {
|
||||
# child
|
||||
|
||||
# Close the writing descriptor at our end of the pipe so that we see EOF
|
||||
# when parent closes its descriptor.
|
||||
close $wfh;
|
||||
|
||||
# Wait for the parent process to finish its unshare() call by waiting for
|
||||
# an EOF.
|
||||
0 == sysread $rfh, my $c, 1 or die "read() did not receive EOF";
|
||||
|
||||
# The program's new[ug]idmap have to be used because they are setuid root.
|
||||
# These privileges are needed to map the ids from /etc/sub[ug]id to the
|
||||
# user namespace set up by the parent. Without these privileges, only the
|
||||
# id of the user itself can be mapped into the new namespace.
|
||||
#
|
||||
# Since new[ug]idmap is setuid root we also don't need to write "deny" to
|
||||
# /proc/$$/setgroups beforehand (this is otherwise required for
|
||||
# unprivileged processes trying to write to /proc/$$/gid_map since kernel
|
||||
# version 3.19 for security reasons) and therefore the parent process
|
||||
# keeps its ability to change its own group here.
|
||||
#
|
||||
# Since /proc/$ppid/[ug]id_map can only be written to once, respectively,
|
||||
# instead of making multiple calls to new[ug]idmap, we assemble a command
|
||||
# line that makes one call each.
|
||||
my $uidmapcmd = "";
|
||||
my $gidmapcmd = "";
|
||||
foreach (@idmap) {
|
||||
my ($t, $hostid, $nsid, $range) = @{$_};
|
||||
if ($t ne "u" and $t ne "g" and $t ne "b") {
|
||||
die "invalid idmap type: $t";
|
||||
}
|
||||
if ($t eq "u" or $t eq "b") {
|
||||
$uidmapcmd .= " $hostid $nsid $range";
|
||||
}
|
||||
if ($t eq "g" or $t eq "b") {
|
||||
$gidmapcmd .= " $hostid $nsid $range";
|
||||
}
|
||||
}
|
||||
if ($uidmapcmd ne "") {
|
||||
0 == system "newuidmap $ppid $uidmapcmd" or die "newuidmap failed: $!";
|
||||
}
|
||||
if ($gidmapcmd ne "") {
|
||||
0 == system "newgidmap $ppid $gidmapcmd" or die "newgidmap failed: $!";
|
||||
}
|
||||
exit 0;
|
||||
}
|
||||
|
||||
# parent
|
||||
|
||||
# After fork()-ing, the parent immediately calls unshare...
|
||||
0 == syscall &SYS_unshare, $unshare_flags or die "unshare() failed: $!";
|
||||
|
||||
# .. and then signals the child process that we are done with the unshare()
|
||||
# call by sending an EOF.
|
||||
close $wfh;
|
||||
|
||||
# Wait for the child process to finish its setup by waiting for its exit.
|
||||
$cpid == waitpid $cpid, 0 or die "waitpid() failed: $!";
|
||||
if ($? != 0) {
|
||||
die "child had a non-zero exit status: $?";
|
||||
}
|
||||
|
||||
# Currently we are nobody (uid and gid are 65534). So we become root user and
|
||||
# group instead.
|
||||
#
|
||||
# We are using direct syscalls instead of setting $(, $), $< and $> because
|
||||
# then perl would do additional stuff which we don't need or want here, like
|
||||
# checking /proc/sys/kernel/ngroups_max (which might not exist). It would also
|
||||
# also call setgroups() in a way that makes the root user be part of the
|
||||
# group unknown.
|
||||
0 == syscall &SYS_setgid, 0 or die "setgid failed: $!";
|
||||
0 == syscall &SYS_setuid, 0 or die "setuid failed: $!";
|
||||
0 == syscall &SYS_setgroups, 0, 0 or die "setgroups failed: $!";
|
||||
|
||||
# At this point lxc-usernsexec is checking whether / is mounted MS_SHARED and
|
||||
# if yes, will mount / as MS_SLAVE|MS_REC - not sure why we have to do this
|
||||
# and it seems to work without so we don't do it.
|
||||
#0 == syscall &SYS_mount, 0, my $t = "/", 0, $MS_SLAVE | $MS_REC, 0 or die "mount() failed: $!";
|
||||
|
||||
# When the pid namespace is also unshared, then processes expect a master pid
|
||||
# to always be alive within the namespace. To achieve this, we fork() here
|
||||
# instead of exec() to always have one dummy process running as pid 1 inside
|
||||
# the namespace. This is also what the unshare tool does when used with the
|
||||
# --fork option.
|
||||
#
|
||||
# Otherwise, without a pid 1, new processes cannot be forked anymore after pid
|
||||
# 1 finished.
|
||||
if ($fork) {
|
||||
my $cpid = fork() // die "fork() failed: $!";
|
||||
if ($cpid != 0) {
|
||||
# parent
|
||||
|
||||
# The parent process will stay alive as pid 1 in this namespace until
|
||||
# the child finishes executing. This is important because pid 1 must
|
||||
# never die or otherwise nothing new can be forked.
|
||||
$cpid == waitpid $cpid, 0 or die "waitpid() failed: $!";
|
||||
exit $?;
|
||||
}
|
||||
}
|
||||
|
||||
if (defined $procmnt) {
|
||||
# Directly bind-mounting the system's /proc into $procmnt would be wrong
|
||||
# because we want to restrict proc to the new pid namespace. Thus we need
|
||||
# to mount a new proc.
|
||||
#
|
||||
# FIXME: Funnily, I wasn't able to figure out how to directly mount a new
|
||||
# proc into any other location than the old /proc without getting EINVAL,
|
||||
# therefore, the workaround is to remount the old /proc and then bind mount
|
||||
# that into the new location.
|
||||
#
|
||||
# This operation also fails with:
|
||||
# $ lxc-usernsexec -- unshare --mount-proc=... --fork --pid --mount -- ...
|
||||
# or even with:
|
||||
# $ unshare --mount-proc=... --fork --pid --mount --user -- ...
|
||||
# the authors of unshare(1) should thus be made aware of the workaround
|
||||
# below (or whatever the real fix ends up being):
|
||||
#
|
||||
# The "my $s = ..." and "my $t = ..." constructs are necessary because
|
||||
# string literals cannot be passed as arguments to the syscall function.
|
||||
0 == syscall &SYS_mount, my $s1 = "none", my $t1 = "/proc", 0,
|
||||
$MS_PRIVATE | $MS_REC, 0 or die "mount() failed: $!";
|
||||
# FIXME: The following line will fail if --fork wasn't passed. Why???
|
||||
0 == syscall &SYS_mount, my $s2 = "proc", my $t2 = "/proc", my $t = "proc",
|
||||
$MS_NOSUID|$MS_NOEXEC|$MS_NODEV, 0 or die "mount() failed: $!";
|
||||
# Only do a bindmount if proc was required to be mounted elsewhere than
|
||||
# /proc
|
||||
if ($procmnt ne "/proc") {
|
||||
0 == syscall &SYS_mount, my $s = "/proc", $procmnt, 0,
|
||||
$MS_BIND | $MS_REC, 0 or die "mount failed: $!";
|
||||
}
|
||||
}
|
||||
|
||||
## having this variable set could cause programs looking for unreachable
|
||||
## machines # see http://bugs.debian.org/780587
|
||||
#delete $ENV{http_proxy};
|
||||
|
||||
# finally, exec our program
|
||||
exec @ARGV or die "exec() failed: $!";
|
||||
|
||||
__END__
|
||||
=head1 NAME
|
||||
|
||||
user-unshare - unprivileged linux namespaces with uid and gid maps
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
user-unshare [options] [--] command
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This tool combines the ability of lxc-usernsexec(1) to map ranges of user and
|
||||
group ids into a new user namespace with the ability of unshare(1) to unshare
|
||||
several different kinds of namespaces. The options of this command are nearly
|
||||
a plain copy of lxc-usernsexec(1) and unshare(1).
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 8
|
||||
|
||||
=item B<-h, --help>
|
||||
|
||||
Print a brief help message and exits.
|
||||
|
||||
=item B<-M> I<uidmap>
|
||||
|
||||
The uid map to use in the user namespace. Each map consists of four
|
||||
colon-separate values. First a character 'u', 'g' or 'b' to specify whether
|
||||
this map pertains to user ids, group ids, or both; next the first userid in
|
||||
the user namespace; next the first userid as seen on the host; and finally
|
||||
the number of ids to be mapped.
|
||||
|
||||
More than one map can be specified. If no map is specified, then by default
|
||||
the full uid and gid ranges granted by /etc/subuid and /etc/subgid will be
|
||||
mapped to the uids and gids starting at 0 in the container.
|
||||
|
||||
Note that lxc-usernsexec always tries to setuid and setgid to 0 in the
|
||||
namespace. Therefore uid 0 in the namespace must be mapped.
|
||||
|
||||
=item B<-i, --ipc>
|
||||
|
||||
Unshare the IPC namespace.
|
||||
|
||||
=item B<-m, --mount>
|
||||
|
||||
Unshare the mount namespace.
|
||||
|
||||
=item B<-n, --net>
|
||||
|
||||
Unshare the network namespace.
|
||||
|
||||
=item B<-p, --pid>
|
||||
|
||||
Unshare the pid namespace. See also the --fork and --mount-proc options.
|
||||
|
||||
=item B<-u, --uts>
|
||||
|
||||
Unshare the UTS namespace.
|
||||
|
||||
=item B<-f, --fork>
|
||||
|
||||
Fork the specified program as a child process of unshare rather than running
|
||||
it directly. This is useful when creating a new pid namespace.
|
||||
|
||||
=item B<--mount-proc>I<[=mountpoint]>
|
||||
|
||||
Just before running the program, mount the proc filesystem at I<mountpoint>
|
||||
(default is /proc). This is useful when creating a new pid namespace. It
|
||||
also implies creating a new mount namespace since the /proc mount would
|
||||
otherwise mess up existing programs on the system. The new proc filesystem
|
||||
is explicitly mounted as private (by MS_PRIVATE|MS_REC).
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
This:
|
||||
|
||||
lxc-usernsexec -- unshare --mount-proc --fork --ipc --pid --net --mount \
|
||||
-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
|
||||
|
||||
is equivalent to:
|
||||
|
||||
user-unshare --mount-proc --fork --ipc --pid --net --mount \
|
||||
-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
|
||||
|
||||
Notice that the options are exactly identical.
|
||||
|
||||
If your user id is 1000, root in a container is mapped to 190000, and you
|
||||
wish to chown a file you own to root in the container, you can use:
|
||||
|
||||
lxc-usernsexec -m b:0:1000:1 -m b:1:190000:1 -- /bin/chown 1:1 $file
|
||||
|
||||
which is equivalent to:
|
||||
|
||||
user-unshare -M b:0:1000:1 -M b:1:190000:1 -- /bin/chown 1:1 $file
|
||||
|
||||
This maps your userid to root in the user namespace, and 190000 to uid 1.
|
||||
Since root in the user namespace is privileged over all userids mapped into
|
||||
the namespace, you are allowed to change the file ownership, which you could
|
||||
not do on the host using a simple chown.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
unshare(1), lxc-usernsexec(1), user_namespaces(7), newgidmap(1), newuidmap(1)
|
||||
|
||||
=cut
|
Loading…
Reference in a new issue