initial commit
This commit is contained in:
commit
32d40e541b
1 changed files with 436 additions and 0 deletions
436
user-unshare
Executable file
436
user-unshare
Executable file
|
@ -0,0 +1,436 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
#
|
||||||
|
# Copyright: 2015 Johannes Schauer <josch@mister-muffin.de>
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
# of this software and associated documentation files (the "Software"), to
|
||||||
|
# deal in the Software without restriction, including without limitation the
|
||||||
|
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||||
|
# sell copies of the Software, and to permit persons to whom the Software is
|
||||||
|
# furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# This tool tries to be like unshare(1) but with the power of lxc-usernsexec(1)
|
||||||
|
# to map more than one id into the new user namespace by using the programs
|
||||||
|
# newgidmap and newuidmap. Or in other words: This tool tries to be like
|
||||||
|
# lxc-usernsexec(1) but with the power of unshare(1) to unshare more than just
|
||||||
|
# the user and mount namespaces.
|
||||||
|
#
|
||||||
|
# I arbitrarily called it user-unshare because it's like unshare(1) but
|
||||||
|
# doesn't require root (except for the suid root new[ug]idmap programs).
|
||||||
|
#
|
||||||
|
# It is essentially equal to calling:
|
||||||
|
#
|
||||||
|
# $ lxc-usernsexec [opts] -- unshare [opts] -- COMMAND
|
||||||
|
#
|
||||||
|
# Its main points of existence are:
|
||||||
|
#
|
||||||
|
# - as a project for me to learn how unprivileged namespaces work
|
||||||
|
# - written in Perl which means:
|
||||||
|
# - architecture independent (same executable on any architecture)
|
||||||
|
# - easily inspectable by other curious minds
|
||||||
|
# - tons of code comments to let others understand how things work
|
||||||
|
# - no need to install the lxc package in a minimal environment (perl itself
|
||||||
|
# might not be called minimal either but is present in every Debian
|
||||||
|
# installation)
|
||||||
|
#
|
||||||
|
# Further differences to unshare(1):
|
||||||
|
#
|
||||||
|
# - No --setgroups and --map-root-user options. These were only useful when
|
||||||
|
# not more than a single uid and gid was available inside the user
|
||||||
|
# namespace
|
||||||
|
# - CLONE_NEWUSER is always enabled (so there is no --user option)
|
||||||
|
# - persistent namespaces to be used with nsenter cannot be supported because
|
||||||
|
# they require that you have permissions to do `mount --bind` in the host
|
||||||
|
# namespace which requires root privileges
|
||||||
|
# - the --mount-proc=XXX option actually works with XXX being other
|
||||||
|
# directories than /proc
|
||||||
|
# - you are immediately root in the new user namespace and can then switch to
|
||||||
|
# another user with runuser(1) if you so wish
|
||||||
|
#
|
||||||
|
# Further differences to lxc-usernsexec(1):
|
||||||
|
#
|
||||||
|
# - there is no parent that is just waiting for the child to exit and
|
||||||
|
# otherwise wasting pid space (unless you request --fork)
|
||||||
|
# - requires only one pipe instead of two for IPC
|
||||||
|
# - the -m option is now uppercase -M because -m was already the short option
|
||||||
|
# for --mount in unshare(1)
|
||||||
|
#
|
||||||
|
# How it differs from other tools:
|
||||||
|
#
|
||||||
|
# - systemd-nspawn requires to be executed by root and this does not seem
|
||||||
|
# likely to change any time soon:
|
||||||
|
# http://lists.freedesktop.org/archives/systemd-devel/2015-February/028139.html
|
||||||
|
# - linux-user-chroot cheats by being suid root
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Debian kernels carry a patch named
|
||||||
|
# add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch
|
||||||
|
# which disables unprivileged usernamespaces by default, to enable it do:
|
||||||
|
# $ echo 1 | sudo tee /proc/sys/kernel/unprivileged_userns_clone > /dev/null
|
||||||
|
# or
|
||||||
|
# $ sysctl -w kernel.unprivileged_userns_clone=1
|
||||||
|
|
||||||
|
use warnings;
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
require 'syscall.ph';
|
||||||
|
|
||||||
|
use Getopt::Long;
|
||||||
|
use Pod::Usage;
|
||||||
|
|
||||||
|
# Look up the subordinate id ranges of a user in /etc/subuid and /etc/subgid.
#
# Argument:
#   $username - login name to look up (first field of each line)
#
# Returns a list of id map entries in the same layout that the -M option
# produces: [ type, first id inside the namespace, first id on the host,
# number of ids ]. The type is "u" for the entry found in /etc/subuid and "g"
# for the entry found in /etc/subgid; the range always starts at id 0 inside
# the namespace.
#
# A file that does not exist contributes nothing. A file that exists but has
# no usable entry for $username also contributes nothing and a warning is
# printed (instead of silently handing out the range of whichever user
# happens to be listed last). Only the first matching line of each file is
# used.
#
# Dies if an existing file cannot be opened.
sub read_subuid_subgid($) {
    my $username = shift;
    my @result;

    # Without a name there is nothing that could match.
    return @result unless defined $username and length $username;

    foreach my $source (["u", "/etc/subuid"], ["g", "/etc/subgid"]) {
        my ($type, $path) = @{$source};
        next unless -f $path;

        open my $fh, "<", $path or die "cannot open $path for reading: $!";
        my ($subid, $num_subid);
        while (my $line = <$fh>) {
            # Strip the line terminator so that it does not end up as part
            # of the count field.
            chomp $line;
            my ($name, $first, $count) = split(/:/, $line, 3);
            # Skip blank, truncated or otherwise malformed lines.
            next unless defined $count;
            next unless $first =~ /\A\d+\z/ and $count =~ /\A\d+\z/;
            next unless $name eq $username;
            ($subid, $num_subid) = ($first, $count);
            last;
        }
        close $fh;

        if (defined $subid) {
            push @result, [$type, 0, $subid, $num_subid];
        } else {
            warn "no entry for user $username in $path\n";
        }
    }

    return @result;
}
|
||||||
|
|
||||||
|
# FIXME: get rid of these constants
# from sched.h
my $CLONE_NEWNS   = 0x20000;
my $CLONE_NEWUTS  = 0x4000000;
my $CLONE_NEWIPC  = 0x8000000;
my $CLONE_NEWUSER = 0x10000000;
my $CLONE_NEWPID  = 0x20000000;
my $CLONE_NEWNET  = 0x40000000;
# from sys/mount.h
my $MS_NOSUID  = 0x2;
my $MS_NODEV   = 0x4;
my $MS_NOEXEC  = 0x8;
my $MS_BIND    = 0x1000;
my $MS_REC     = 0x4000;
my $MS_PRIVATE = 0x40000;
my $MS_SLAVE   = 0x80000;

# The user namespace is always unshared, everything else is opt-in.
my $unshare_flags = $CLONE_NEWUSER;
# List of [type, first id in namespace, first id on host, count] entries.
my @idmap;
# Directory to mount a fresh proc on (undef: do not mount proc at all).
my $procmnt = undef;
# Whether to fork before exec()-ing the command.
my $fork;

Getopt::Long::Configure ("bundling");
GetOptions(
    'h|help' => sub { pod2usage(-exitval => 0, -verbose => 2) },
    "i|ipc" => sub { $unshare_flags |= $CLONE_NEWIPC },
    "m|mount" => sub { $unshare_flags |= $CLONE_NEWNS },
    "n|net" => sub { $unshare_flags |= $CLONE_NEWNET },
    "p|pid" => sub { $unshare_flags |= $CLONE_NEWPID },
    "u|uts" => sub { $unshare_flags |= $CLONE_NEWUTS },
    "mount-proc:s" => sub {
        $fork = 1; # mounting proc requires a forked child or otherwise EPERM
        $procmnt = $_[1] eq "" ? "/proc" : $_[1]; # /proc is the default
        $unshare_flags |= $CLONE_NEWNS; # implicitly enable --mount
    },
    "f|fork" => \$fork,
    "M=s" => sub {
        # Parse the -M option and make sure it consists of four parts: a
        # type followed by three non-negative integers. A die() inside a
        # Getopt::Long handler is reported as an option error.
        my @v = split /:/, $_[1], 4;
        die "invalid format for -M option" if scalar @v != 4;
        my ($type, @numbers) = @v;
        die "invalid idmap type in -M option: $type"
            unless $type =~ /\A[ugb]\z/;
        foreach my $number (@numbers) {
            die "invalid number in -M option: $number"
                unless $number =~ /\A\d+\z/;
        }
        push @idmap, \@v;
    },
) or pod2usage(-exitval => 2, -verbose => 1);

# Complain about a missing command now instead of after all the namespace
# setup has been done, where it would only surface as a failing exec().
pod2usage(-message => "no command specified", -exitval => 2, -verbose => 1)
    unless @ARGV;

my $origuid = $<;
my $origgid = $(;

# If the -M option was not given, read /etc/sub[ug]id to get the right
# sub[ug]ids for the new[gu]idmap calls later by the child process.
#
# new[ug]idmap is called with the exact values from /etc/sub[ug]id,
# respectively.
if (scalar @idmap == 0) {
    @idmap = read_subuid_subgid(scalar getpwuid($origuid));
}
|
||||||
|
|
||||||
|
# Create a pipe for the parent process to signal the child process that it is
# done with calling unshare() so that the child can go ahead setting up
# uid_map and gid_map.
pipe(my $rfh, my $wfh) or die "pipe() failed: $!";

# We have to do this dance with forking a process and then modifying the
# parent from the child because:
#  - new[ug]idmap can only be called on a process id after that process has
#    unshared the user namespace
#  - a process loses its capabilities if it performs an execve() with nonzero
#    user ids, see the capabilities(7) man page for details.
#  - a process that unshared the user namespace by default does not have the
#    privileges to call new[ug]idmap on itself
#
# this also works the other way around (the child setting up a user namespace
# and being modified from the parent) but that way, the parent would have to
# stay around until the child exited (so a pid would be wasted). Additionally,
# that variant would require an additional pipe to let the parent signal the
# child that it is done with calling new[ug]idmap. The way it is done here,
# this signaling can instead be done by wait()-ing for the exit of the child.
my $ppid = $$;
my $cpid = fork() // die "fork() failed: $!";
if ($cpid == 0) {
    # child

    # Close the writing descriptor at our end of the pipe so that we see EOF
    # when parent closes its descriptor.
    close $wfh;

    # Wait for the parent process to finish its unshare() call by waiting for
    # an EOF. A read error (undef) must not be mistaken for the EOF.
    my $nread = sysread $rfh, my $c, 1;
    defined $nread or die "read() from pipe failed: $!";
    $nread == 0 or die "read() did not receive EOF";

    # The programs new[ug]idmap have to be used because they are setuid root.
    # These privileges are needed to map the ids from /etc/sub[ug]id to the
    # user namespace set up by the parent. Without these privileges, only the
    # id of the user itself can be mapped into the new namespace.
    #
    # Since new[ug]idmap is setuid root we also don't need to write "deny" to
    # /proc/$$/setgroups beforehand (this is otherwise required for
    # unprivileged processes trying to write to /proc/$$/gid_map since kernel
    # version 3.19 for security reasons) and therefore the parent process
    # keeps its ability to change its own group here.
    #
    # Since /proc/$ppid/[ug]id_map can only be written to once, respectively,
    # instead of making multiple calls to new[ug]idmap, we assemble an
    # argument list that makes one call each.
    my @uidmapargs;
    my @gidmapargs;
    foreach my $map (@idmap) {
        # Each entry is: type, first id inside the namespace, first id on the
        # host, number of ids - which is also the order new[ug]idmap expects.
        my ($t, $nsid, $hostid, $range) = @{$map};
        if (grep { !defined } $t, $nsid, $hostid, $range) {
            die "incomplete idmap entry";
        }
        if ($t ne "u" and $t ne "g" and $t ne "b") {
            die "invalid idmap type: $t";
        }
        if ($t eq "u" or $t eq "b") {
            push @uidmapargs, $nsid, $hostid, $range;
        }
        if ($t eq "g" or $t eq "b") {
            push @gidmapargs, $nsid, $hostid, $range;
        }
    }
    foreach my $cmd (["newuidmap", @uidmapargs], ["newgidmap", @gidmapargs]) {
        my ($prog, @args) = @{$cmd};
        next unless @args;
        # The list form of system() bypasses the shell, so the map values are
        # passed on verbatim as separate arguments.
        if (system($prog, $ppid, @args) != 0) {
            # $! is only meaningful if the program could not be run at all
            # ($? == -1); otherwise report the wait status.
            die "$prog failed: " . ($? == -1 ? $! : "wait status $?");
        }
    }
    exit 0;
}
|
||||||
|
|
||||||
|
# parent

# Right after the fork() the parent leaves the selected namespaces...
my $unshare_rc = syscall(SYS_unshare(), $unshare_flags);
die "unshare() failed: $!" unless $unshare_rc == 0;

# ...and tells the child about it: closing our write end of the pipe makes
# the child's read return EOF.
close $wfh;

# The child sets up the id maps and then exits, so its exit doubles as the
# "setup done" notification.
my $reaped = waitpid $cpid, 0;
die "waitpid() failed: $!" unless $reaped == $cpid;
die "child had a non-zero exit status: $?" if $? != 0;

# At this point we are nobody (uid and gid are 65534), so switch to the root
# user and group.
#
# This is done with raw syscalls rather than by assigning to $(, $), $< and
# $> because perl would then do extra work that is neither needed nor wanted
# here, such as looking at /proc/sys/kernel/ngroups_max (which might not
# exist). It would also call setgroups() in a way that makes the root user a
# member of the group unknown.
foreach my $step (
    ["setgid",    SYS_setgid(),    0],
    ["setuid",    SYS_setuid(),    0],
    ["setgroups", SYS_setgroups(), 0, 0],
) {
    my ($name, $number, @args) = @{$step};
    die "$name failed: $!" unless syscall($number, @args) == 0;
}
||||||
|
|
||||||
|
# At this point lxc-usernsexec is checking whether / is mounted MS_SHARED and
# if yes, will mount / as MS_SLAVE|MS_REC - not sure why we have to do this
# and it seems to work without so we don't do it.
#0 == syscall &SYS_mount, 0, my $t = "/", 0, $MS_SLAVE | $MS_REC, 0 or die "mount() failed: $!";

# When the pid namespace is also unshared, then processes expect a master pid
# to always be alive within the namespace. To achieve this, we fork() here
# instead of exec() to always have one dummy process running as pid 1 inside
# the namespace. This is also what the unshare tool does when used with the
# --fork option.
#
# Otherwise, without a pid 1, new processes cannot be forked anymore after pid
# 1 finished.
if ($fork) {
    my $cpid = fork() // die "fork() failed: $!";
    if ($cpid != 0) {
        # parent

        # The parent process will stay alive as pid 1 in this namespace until
        # the child finishes executing. This is important because pid 1 must
        # never die or otherwise nothing new can be forked.
        $cpid == waitpid $cpid, 0 or die "waitpid() failed: $!";

        # $? is the raw wait status: the exit code lives in the high byte and
        # the terminating signal in the low seven bits. Passing it to exit()
        # unchanged would truncate e.g. exit code 1 (status 256) to 0, so
        # decode it first. A child killed by a signal is reported the way
        # shells do it, as 128 + signal number.
        my $status = $?;
        my $signal = $status & 127;
        exit 128 + $signal if $signal;
        exit $status >> 8;
    }
}
|
||||||
|
|
||||||
|
if (defined $procmnt) {
    # Bind-mounting the system's /proc straight onto $procmnt would be wrong:
    # proc has to be restricted to the new pid namespace, which takes a newly
    # mounted proc.
    #
    # FIXME: Funnily, I wasn't able to figure out how to directly mount a new
    # proc into any other location than the old /proc without getting EINVAL,
    # therefore, the workaround is to remount the old /proc and then bind
    # mount that into the new location.
    #
    # This operation also fails with:
    #   $ lxc-usernsexec -- unshare --mount-proc=... --fork --pid --mount -- ...
    # or even with:
    #   $ unshare --mount-proc=... --fork --pid --mount --user -- ...
    # the authors of unshare(1) should thus be made aware of the workaround
    # below (or whatever the real fix ends up being):
    #
    # All strings handed to syscall() live in variables because string
    # literals cannot be passed as arguments to the syscall function.
    my $no_source   = "none";
    my $proc_source = "proc";
    my $proc_fstype = "proc";
    my $proc_dir    = "/proc";

    # Step 1: make the existing /proc mount private (recursively).
    my $rc = syscall(SYS_mount(), $no_source, $proc_dir, 0,
        $MS_PRIVATE | $MS_REC, 0);
    die "mount() failed: $!" unless $rc == 0;

    # Step 2: mount a new proc on top of /proc.
    # FIXME: The following call will fail if --fork wasn't passed. Why???
    $rc = syscall(SYS_mount(), $proc_source, $proc_dir, $proc_fstype,
        $MS_NOSUID | $MS_NOEXEC | $MS_NODEV, 0);
    die "mount() failed: $!" unless $rc == 0;

    # Step 3: the bind mount is only needed if proc was requested somewhere
    # other than /proc.
    unless ($procmnt eq "/proc") {
        $rc = syscall(SYS_mount(), $proc_dir, $procmnt, 0,
            $MS_BIND | $MS_REC, 0);
        die "mount failed: $!" unless $rc == 0;
    }
}

## having this variable set could cause programs looking for unreachable
## machines # see http://bugs.debian.org/780587
#delete $ENV{http_proxy};

# finally, replace ourselves with the requested program
unless (exec @ARGV) {
    die "exec() failed: $!";
}
|
||||||
|
|
||||||
|
__END__
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
user-unshare - unprivileged linux namespaces with uid and gid maps
|
||||||
|
|
||||||
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
|
user-unshare [options] [--] command
|
||||||
|
|
||||||
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
This tool combines the ability of lxc-usernsexec(1) to map ranges of user and
|
||||||
|
group ids into a new user namespace with the ability of unshare(1) to unshare
|
||||||
|
several different kinds of namespaces. The options of this command are nearly
|
||||||
|
a plain copy of lxc-usernsexec(1) and unshare(1).
|
||||||
|
|
||||||
|
=head1 OPTIONS
|
||||||
|
|
||||||
|
=over 8
|
||||||
|
|
||||||
|
=item B<-h, --help>
|
||||||
|
|
||||||
|
Print this manual page and exit.
|
||||||
|
|
||||||
|
=item B<-M> I<uidmap>
|
||||||
|
|
||||||
|
The uid map to use in the user namespace. Each map consists of four
|
||||||
|
colon-separated values. First a character 'u', 'g' or 'b' to specify whether
|
||||||
|
this map pertains to user ids, group ids, or both; next the first userid in
|
||||||
|
the user namespace; next the first userid as seen on the host; and finally
|
||||||
|
the number of ids to be mapped.
|
||||||
|
|
||||||
|
More than one map can be specified. If no map is specified, then by default
|
||||||
|
the full uid and gid ranges granted by /etc/subuid and /etc/subgid will be
|
||||||
|
mapped to the uids and gids starting at 0 in the container.
|
||||||
|
|
||||||
|
Note that user-unshare always tries to setuid and setgid to 0 in the
|
||||||
|
namespace. Therefore uid 0 in the namespace must be mapped.
|
||||||
|
|
||||||
|
=item B<-i, --ipc>
|
||||||
|
|
||||||
|
Unshare the IPC namespace.
|
||||||
|
|
||||||
|
=item B<-m, --mount>
|
||||||
|
|
||||||
|
Unshare the mount namespace.
|
||||||
|
|
||||||
|
=item B<-n, --net>
|
||||||
|
|
||||||
|
Unshare the network namespace.
|
||||||
|
|
||||||
|
=item B<-p, --pid>
|
||||||
|
|
||||||
|
Unshare the pid namespace. See also the --fork and --mount-proc options.
|
||||||
|
|
||||||
|
=item B<-u, --uts>
|
||||||
|
|
||||||
|
Unshare the UTS namespace.
|
||||||
|
|
||||||
|
=item B<-f, --fork>
|
||||||
|
|
||||||
|
Fork the specified program as a child process of unshare rather than running
|
||||||
|
it directly. This is useful when creating a new pid namespace.
|
||||||
|
|
||||||
|
=item B<--mount-proc>I<[=mountpoint]>
|
||||||
|
|
||||||
|
Just before running the program, mount the proc filesystem at I<mountpoint>
|
||||||
|
(default is /proc). This is useful when creating a new pid namespace. It
|
||||||
|
also implies creating a new mount namespace since the /proc mount would
|
||||||
|
otherwise mess up existing programs on the system. The new proc filesystem
|
||||||
|
is explicitly mounted as private (by MS_PRIVATE|MS_REC).
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=head1 EXAMPLES
|
||||||
|
|
||||||
|
This:
|
||||||
|
|
||||||
|
lxc-usernsexec -- unshare --mount-proc --fork --ipc --pid --net --mount \
|
||||||
|
-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
|
||||||
|
|
||||||
|
is equivalent to:
|
||||||
|
|
||||||
|
user-unshare --mount-proc --fork --ipc --pid --net --mount \
|
||||||
|
-- sh -c "ls /proc | head && whoami && groups && ip link set lo up && ip addr && /sbin/runuser -u josch whoami"
|
||||||
|
|
||||||
|
Notice that the options are exactly identical.
|
||||||
|
|
||||||
|
If your user id is 1000, root in a container is mapped to 190000, and you
|
||||||
|
wish to chown a file you own to root in the container, you can use:
|
||||||
|
|
||||||
|
lxc-usernsexec -m b:0:1000:1 -m b:1:190000:1 -- /bin/chown 1:1 $file
|
||||||
|
|
||||||
|
which is equivalent to:
|
||||||
|
|
||||||
|
user-unshare -M b:0:1000:1 -M b:1:190000:1 -- /bin/chown 1:1 $file
|
||||||
|
|
||||||
|
This maps your userid to root in the user namespace, and 190000 to uid 1.
|
||||||
|
Since root in the user namespace is privileged over all userids mapped into
|
||||||
|
the namespace, you are allowed to change the file ownership, which you could
|
||||||
|
not do on the host using a simple chown.
|
||||||
|
|
||||||
|
=head1 SEE ALSO
|
||||||
|
|
||||||
|
unshare(1), lxc-usernsexec(1), user_namespaces(7), newgidmap(1), newuidmap(1)
|
||||||
|
|
||||||
|
=cut
|
Loading…
Reference in a new issue