deb-shrink-packages/shrink.pl

#!/usr/bin/perl

# this program uses Dpkg::Index to randomly remove packages from the index
# until a minimal test case is found.
#
# Another way to solve this problem is:
#
# 09:02 < helmut> josch: a tool that has worked well for me for a number of
#                 problems was rolling an exponentially distributed number of
#                 lines from a random point. the lambda parameter was adapted
#                 with the success.
# 09:03 < helmut> josch: after finding a minimal set of lines, a
#                 language-specific tool can explode lines. for c that can be
#                 almost as simple as replacing space with newlines. then start
#                 over
# 09:04 < helmut> removing a sequence of lines has the benefit of removing
#                 whole functions/paragraphs/etc. and thus producing
#                 syntactically valid stuff occasionally
# 09:06 < helmut> I recommend that you spend a bit of time looking into haskell
#                 quickcheck's Arbitrary type class. it has a shrink member,
#                 which essentially does the same thing on haskell data
#                 structures
# 09:07 < helmut> drawing inspiration from there might help

use strict;
use warnings;

use Dpkg::Index;
use List::Util 'shuffle';

sub check {
    my $wantedexit = shift;
    print "running: ";
    print join " ", @_;
    print "\n";
    system @_;
    my $exit = $? >> 8;
    if ($exit == $wantedexit) {
	return 1;
    } else {
	print "got exit $exit but wanted exit $wantedexit\n";
	return 0;
    }
}

sub write_enabled_pkgs {
    my $index = shift;
    my $enabled = shift;
    my $fname = shift;
    print "writing to $fname\n";
    # write out all the still enabled packages
    open(my $fh, '>', $fname);
    foreach my $k ($index->get_keys()) {
	if (!$enabled->{$k}) {
	    next;
	}
	print { $fh } $index->get_by_key($k) . "\n";
    }
    close $fh;
}

sub randomize_pkgs {
    my $index = shift;
    my $enabled = shift;
    my $id_to_key = shift;
    my $tmpname = shift;
    my $wantedexit = shift;
    my @checkcmd = @_;
    # randomly delete 0.5% of the packages until we failed to produce the
    # expected result as many times as 1% of the package are.
    my $num_failures = 0;
    while (1) {
	# fill an array with the remaining enabled indices
	my @indices = ();
	my $numpkgs = scalar keys %{$enabled};
	for (my $i = 0; $i < $numpkgs; $i++) {
	    if (!$enabled->{$id_to_key->{$i}}) {
		next;
	    }
	    push @indices, $i;
	}
	my @shuffled_indices = shuffle(@indices);
	printf "1st phase: %d/%d\n", scalar @indices, $numpkgs;
	my $tenpercent = (scalar @indices)*0.005;
	my @to_disable = @shuffled_indices[ 0 .. $tenpercent - 1 ];
	my $num_to_disable = scalar @to_disable;
	print "disabling: $num_to_disable\n";
	print "acc. failures: $num_failures\n";
	for my $i (@to_disable) {
	    $enabled->{$id_to_key->{$i}} = 0;
	}
	write_enabled_pkgs($index, $enabled, $tmpname);
	if (check($wantedexit, @checkcmd)) {
	    # the error we want to see is still there so we keep these keys
	    # disabled and reset the number of failures
	    $num_failures = 0;
	} else {
	    # the error vanished
	    # re-enable the formerly disabled ones and increment the number of
	    # failures
	    for my $i (@to_disable) {
		$enabled->{$id_to_key->{$i}} = 1;
	    }
	    $num_failures += 1;
	}
	# if we have unsuccessfully tried running the check as many times as
	# the current percentage is removing at once, then it starts making
	# sense to remove packages one-by-one
	#
	# In other words: It does not make sense to remove X packages at once
	# if we are are running check() more than X times.
	#
	# We add the factor 2 because the threshold might be reached randomly
	# as well.
	if ($num_failures > $num_to_disable*2) {
	    last;
	}
    }
}

sub iterate_pkgs {
    my $index = shift;
    my $enabled = shift;
    my $id_to_key = shift;
    my $tmpname = shift;
    my $wantedexit = shift;
    my @checkcmd = @_;
    # we extract the array of enabled indices for better progress reporting
    # later on
    my $numpkgs = scalar keys %{$enabled};
    my @indices = ();
    for (my $i = 0; $i < $numpkgs; $i++) {
	if (!$enabled->{$id_to_key->{$i}}) {
	    next;
	}
	push @indices, $i;
    }

    # now delete single packages
    # we need this stage to make sure to have looked at all packages at least
    # once. Otherwise it can be that by chance a package was not randomly
    # selected by the last stage.
    my $i = 0;
    foreach my $idx (@indices) {
	my $key = $id_to_key->{$idx};
	printf "2nd phase: %d/%d\n", $i, scalar @indices;
	$i += 1;
	# disable the current key
	$enabled->{$key} = 0;
	write_enabled_pkgs($index, $enabled, $tmpname);
	if (check($wantedexit, @checkcmd)) {
	    # the error we want to see is still there so we keep this key disabled
	} else {
	    # the error vanished
	    # re-enable the current key
	    $enabled->{$key} = 1;
	}
    }
}

sub delete_field {
    my $index = shift;
    my $enabled = shift;
    my $id_to_key = shift;
    my $field = shift;
    my $tmpname = shift;
    my $wantedexit = shift;
    my @checkcmd = @_;
    # we extract the array of enabled indices for better progress reporting
    # later on
    my $numpkgs = scalar keys %{$enabled};
    my @indices = ();
    for (my $i = 0; $i < $numpkgs; $i++) {
	if (!$enabled->{$id_to_key->{$i}}) {
	    next;
	}
	push @indices, $i;
    }

    # now delete single packages
    # we need this stage to make sure to have looked at all packages at least
    # once. Otherwise it can be that by chance a package was not randomly
    # selected by the last stage.
    my $i = 0;
    foreach my $idx (@indices) {
	my $key = $id_to_key->{$idx};
	printf "3nd phase (deleting $field): %d/%d\n", $i, scalar @indices;
	$i += 1;
	# skip if this package doesn't have such a key
	my $pkg = $index->get_by_key($key);
	if (!exists $pkg->{$field}) {
	    next;
	}
	my $fieldvalue = $pkg->{$field};
	# disable the current key
	delete $pkg->{$field};
	write_enabled_pkgs($index, $enabled, $tmpname);
	if (check($wantedexit, @checkcmd)) {
	    # the error we want to see is still there so we keep this field deleted
	} else {
	    # the error vanished
	    # restore the package
	    $index->get_by_key($key)->{$field} = $fieldvalue;
	}
    }
}

sub main {
    my @srcbinlisttype = ();

    # first argument is the desired exit code
    my $wantedexit = shift @ARGV;

    # the following argument are paths to Packages/Sources files
    # to find out how many of those we expect, we scan the rest of ARGV for
    # the placeholders
    foreach my $arg (@ARGV) {
	if ($arg eq '__src__') {
	    push @srcbinlisttype, "src";
	}
	if ($arg eq '__bin__') {
	    push @srcbinlisttype, "bin";
	}
    }
    # now we now which parts are the actual input file names
    my @srcbinlistcontent = splice @ARGV, 0, (scalar @srcbinlisttype);
    # the rest is the command to execute
    my @execcmd = @ARGV;
    my $i = 0;
    my @tmpfiles = ();
    foreach my $val (@execcmd) {
	if ($val eq '__src__') {
	    $val = "src.$i.tmp";
	    push @tmpfiles, $val;
	    $i += 1;
	}
	if ($val eq '__bin__') {
	    $val = "bin.$i.tmp";
	    push @tmpfiles, $val;
	    $i += 1;
	}
    }

    my @data = ();
    foreach( 0 .. $#srcbinlistcontent ) {
	my $f = $srcbinlistcontent[$_];
	my $t = $srcbinlisttype[$_];
	my $r = $tmpfiles[$_];

	print "reading in $f\n";

	my $key_func = sub {
	    return $_[0]->{Package} . ' ' . $_[0]->{Version} . ' ' . $_[0]->{Architecture};
	};

	my $index = Dpkg::Index->new(get_key_func=>$key_func);

	$index->load($f);
	my %allowed_fields = (
	    'Version' => 1,
	    'Package' => 1,
	    'Architecture' => 1,
	);
	if ($t eq "bin") {
	    $allowed_fields{'Depends'} = 1;
	    # 'Multi-Arch' => 1, # not needed with only one architecture
	    $allowed_fields{'Provides'} = 1;
	    $allowed_fields{'Breaks'} = 1;
	    $allowed_fields{'Conflicts'} = 1;
	    $allowed_fields{'Pre-Depends'} = 1;
	    $allowed_fields{'Source'} = 1;
	    $allowed_fields{'Essential'} = 1; # probably not needed
	} else {
	    $allowed_fields{'Build-Depends'} = 1;
	    $allowed_fields{'Build-Depends-Arch'} = 1;
	    $allowed_fields{'Build-Depends-Indep'} = 1;
	    $allowed_fields{'Build-Conflicts'} = 1;
	    $allowed_fields{'Build-Conflicts-Arch'} = 1;
	    $allowed_fields{'Build-Conflicts-Indep'} = 1;
	}

	my %enabled = ();
	my %id_to_key = ();
	{
	    my $i = 0;
	    foreach my $key ($index->get_keys()) {
		$enabled{$key} = 1;
		$id_to_key{$i} = $key;
		$i += 1;
		# delete unnecessary fields
		my $cdata = $index->get_by_key($key);
		foreach my $field (keys %{$cdata}) {
		    if (! exists $allowed_fields{$field}) {
			delete $cdata->{$field};
		    }
		}
	    }
	}

	write_enabled_pkgs($index, \%enabled, $r);

	push @data, [$r, $t, $index, \%enabled, \%id_to_key];
    }

    foreach my $v (@data) {
	my ($tmpname, $t, $index, $enabled, $id_to_key) = @{$v};
	randomize_pkgs($index, $enabled, $id_to_key, $tmpname, $wantedexit, @execcmd);
    }
    foreach my $v (@data) {
	my ($tmpname, $t, $index, $enabled, $id_to_key) = @{$v};
	iterate_pkgs($index, $enabled, $id_to_key, $tmpname, $wantedexit, @execcmd);
    }

    foreach my $v (@data) {
	my ($tmpname, $t, $index, $enabled, $id_to_key) = @{$v};
	if ($t eq "bin") {
	    delete_field($index, $enabled, $id_to_key, 'Depends', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Pre-Depends', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Conflicts', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Breaks', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Provides', $tmpname, $wantedexit, @execcmd);
	} else {
	    delete_field($index, $enabled, $id_to_key, 'Build-Depends', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Build-Depends-Indep', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Build-Depends-Arch', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Build-Conflicts', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Build-Conflicts-Indep', $tmpname, $wantedexit, @execcmd);
	    delete_field($index, $enabled, $id_to_key, 'Build-Conflicts-Arch', $tmpname, $wantedexit, @execcmd);
	}
    }

    # somehow, calling randomize_pkgs might remove build-essential
    #randomize_pkgs($index, \%enabled, \%id_to_key);
    foreach my $v (@data) {
	my ($tmpname, $t, $index, $enabled, $id_to_key) = @{$v};
	iterate_pkgs($index, $enabled, $id_to_key, $tmpname, $wantedexit, @execcmd);
    }
}

main();