#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Cwd;

# don't bother displaying info for cumulative sizes smaller than total
# divided by this:
my $detail = 30;

#< options
my $divfactor  = 0;
my $help       = 0;
my $Links      = 0;
my $extensions = 1;
my @skips;
GetOptions('bytes!'        => sub { $divfactor = 1; },
           'kilobytes|kb!' => sub { $divfactor = 1024; },
           'megabytes|mb!' => sub { $divfactor = 1024 * 1024; },
           'gigabytes|gb!' => sub { $divfactor = 1024 * 1024 * 1024; },
           'extensions+'   => sub { $extensions = !$extensions; },
           'help!'         => \$help,
           'Links!'        => \$Links,
           'detail=i'      => \$detail,
           'skip=s@'       => \@skips);
#(b)ytes, (k)ilobytes, (m)egabytes, (g)igabytes, (h)uman-readable
#(d)etail
if ($help) {
    print qq(usage: duke [-bkmg] [-d n] [-s dir] [-e] [-L] [dir]
  -bkmg  - show sizes in bytes, kb, Mb, or Gb, respectively.
           (default is human-readable)
  -d n   - amount of detail: show entities at most total/n in size
           (default $detail -- by necessity this is a bit more than the
           number of items displayed)
  -s dir - skip dir.  This option may be used multiple times.
  -L     - follow symbolic links when counting sizes.
  -e     - accumulate info for files by filename extension
           (on by default; give it again to turn it off)\n);
    exit 0;
}
my %skip = ();
foreach my $s (@skips) {
    $s =~ s|/$||;
    $skip{$s} = 1;
}
#>

#< human_readable
sub human_readable {
    my $bytes = shift;
    unless ($divfactor == 0) {
        return sprintf "%1.0f", ($bytes / $divfactor);
    }
    my $kb = $bytes / 1024;
    if    ($bytes < 1000) { return sprintf " %d",    $bytes; }
    elsif ($bytes < 1024) { return sprintf "%0.2fk", $kb; }
    elsif ($kb < 10)      { return sprintf "%1.1fk", $kb; }
    else {
        my $mb = $kb / 1024;
        if    ($kb < 1000) { return sprintf "%3.0fk", $kb; }
        elsif ($kb < 1024) { return sprintf "%0.2fM", $mb; }
        elsif ($mb < 10)   { return sprintf "%1.1fM", $mb; }
        else {
            my $gb = $mb / 1024;
            if    ($mb < 1000) { return sprintf "%3.0fM", $mb; }
            elsif ($mb < 1024) { return sprintf "%0.2fG", $gb; }
            elsif ($gb < 10)   { return sprintf "%1.1fG", $gb; }
            else               { return sprintf "%3.0fG", $gb; }
        }
    }
}
#>

#< walk(dir)
sub walk {
    # for each directory I'm given, I want to return an arrayref of the form
    # [ sum_of_content_sizes, age, name,
    #   [biggest_constituent], [next_biggest_constituent], ...
    #   ... [smallest_constituent]
    # ]
    my $dirname = shift;
    my $dirsize = -s $dirname;
    my $dirage  = -M _;
    local *DIR;
    opendir DIR, $dirname or do {
        warn "WARNING: cannot open directory [$dirname]: $!\n";
        warn " this directory will be skipped: the report will be wrong.\n";
        return [$dirsize, $dirage, $dirname];
    };
    my @sizes = ();
    my @files = readdir(DIR);
    closedir DIR;
    @files = grep !/^\.\.?$/, @files;    # skip . and ..
    foreach my $file (@files) {
        # get the stat info:
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size,
            $atime, $mtime, $ctime, $blksize, $blocks)
            = lstat qq($dirname/$file);
        # we won't use most of it.  The real size is returned in $size,
        # but this isn't what actually matters to counting disk usage,
        # because the part of the block not filled is still considered
        # "used" for the purposes of storage.  So instead of the exact
        # $size, we charge the file for whole $blksize-sized blocks below.
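        # Note on the accounting: the rounding below always adds one extra
        # block, even when $size is already an exact multiple of $blksize.
        # A closer match to what du(1) reports -- assuming a POSIX-style
        # lstat where $blocks counts 512-byte units, which holds on most
        # Unix systems -- would be to trust the kernel's block count:
        #
        #     $size = $blocks * 512 if defined $blocks;
        #
        # duke uses the $blksize rounding below instead.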
        # $size = $blksize if $blksize > $size;
        unless ($blksize) {
            warn "block size is zero for $dirname/$file!\n";
            warn " skipping it...\n";
            next;
        }
        $size = $blksize * (1 + (int($size / $blksize)));
        if (not -l _ and -d _ and not $skip{$file}) {
            my $subdir = walk(qq($dirname/$file));
            push @sizes, $subdir;
            $dirsize += $subdir->[0];
            $dirage = $subdir->[1] if $subdir->[1] < $dirage;
        }
        else {
            my $age = -M _;
            push @sizes, [ $size, $age, $file ];
            $dirsize += $size;
            $dirage = $age if $age < $dirage;
        }
    }
    return [ $dirsize, $dirage, $dirname,
             sort { $b->[0] <=> $a->[0] } @sizes ];
}
#>

#< Lwalk(dir)
sub Lwalk {
    # Like walk, but follows symbolic links instead of counting their actual
    # sizes.
    # for each directory I'm given, I want to return an arrayref of the form
    # [ sum_of_content_sizes, age, name,
    #   [biggest_constituent], [next_biggest_constituent], ...
    #   ... [smallest_constituent]
    # ]
    my $dirname = shift;
    my $dirsize = -s $dirname;
    my $dirage  = -M _;
    local *DIR;
    opendir DIR, $dirname or do {
        warn "WARNING: cannot open directory [$dirname]: $!\n";
        warn " this directory will be skipped: the report will be wrong.\n";
        return [$dirsize, $dirage, $dirname];
    };
    my @sizes = ();
    my @files = readdir(DIR);
    closedir DIR;
    @files = grep !/^\.\.?$/, @files;    # skip . and ..
    foreach my $file (@files) {
        # get the stat info:
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size,
            $atime, $mtime, $ctime, $blksize, $blocks)
            = stat qq($dirname/$file);
        # we won't use most of it; see the block-size note in walk() above.
        # $size = $blksize if $blksize > $size;
        unless ($blksize) {
            warn "block size is zero for $dirname/$file!\n";
            warn " skipping it...\n";
            next;
        }
        $size = $blksize * (1 + (int($size / $blksize)));
        if (-d _ and not $skip{$file}) {
            my $subdir = Lwalk(qq($dirname/$file));
            push @sizes, $subdir;
            $dirsize += $subdir->[0];
            $dirage = $subdir->[1] if $subdir->[1] < $dirage;
        }
        else {
            my $age = -M _;
            push @sizes, [ $size, $age, $file ];
            $dirsize += $size;
            $dirage = $age if $age < $dirage;
        }
    }
    return [ $dirsize, $dirage, $dirname,
             sort { $b->[0] <=> $a->[0] } @sizes ];
}
#>

#< display(sizes)
sub display {
    my $sizes = shift;
    my $depth = shift;
    my $total = shift;
    $depth ||= 0;
    # return if $depth > 1;
    my ($size, $age, $name, @contents) = @$sizes;
    return unless $size;    # in case we had perm problems or anything.
    $name =~ s|/$||;
    $name =~ s|.*/|| if scalar @contents;
    $total ||= $size;
    return if ($size * $detail < $total);
    print " " x $depth;
    printf "%s %s (%1.1f)\n", human_readable($size), $name, $age;
    foreach my $ref (@contents) {
        last unless display($ref, $depth+1, $total);
    }
    return 1;
}
#>

#< extdisplay(sizes)
sub extdisplay {
    my $sizes = shift;
    my $depth = shift;
    my $total = shift;
    $depth ||= 0;
    # return if $depth > 1;
    my ($size, $age, $name, @contents) = @$sizes;
    return unless $size;    # in case we had perm problems or anything.
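    # From here on extdisplay does two jobs: it prints entries that meet
    # the detail cutoff, and it sums up the files that don't (those with a
    # filename extension), keyed by extension.  The summary is returned to
    # the caller as a hash of  key => { size => ..., count => ..., age => ... },
    # where a file contributes a '*.ext' key and each directory level it
    # bubbles up through prepends another '*/' (see merge() below).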
    $total ||= $size;
    $name =~ s|/$||;
    $name =~ s|.*/||;    # remove directory parts of name
    if (not scalar @contents) {
        # this is a file, not a directory
        if ($size * $detail >= $total) {
            # file is big enough to get printed
            print " " x $depth;
            printf "%s %s (%1.1f)\n", human_readable($size), $name, $age;
            return ();
        }
        elsif ($name =~ /.\./) {
            # has an extension, contribute upwards
            my $ext = $name;
            $ext =~ s/.*\.//;
            return ("*.$ext", { size => $size, count => 1, age => $age });
        }
        else {
            # file too small, doesn't have an extension.  nothing useful.
            return ();
        }
    }
    else {
        # a directory
        my %extensions = ();
        if ($size * $detail >= $total) {
            print " " x $depth;
            printf "%s %s (%1.1f)\n", human_readable($size), $name, $age;
        }
        foreach my $ref (@contents) {
            &merge(\%extensions, extdisplay($ref, $depth+1, $total));
        }
        foreach my $ext (keys %extensions) {
            if ($extensions{$ext}->{size} * $detail >= $total) {
                print " " x ($depth+1);
                printf("%s %s (%1.1f) %1d files\n",
                       human_readable($extensions{$ext}->{size}),
                       substr($ext, 2),
                       $extensions{$ext}->{age},
                       $extensions{$ext}->{count});
                delete $extensions{$ext};
            }
        }
        return %extensions;
    }
}

#< merge(\%hashref, %otherhash)
# requires: all values are of the form { size => number, count => number,
#                                        age => number }
# modifies: hashref
# effects:  for each key, adds the sizes and counts of hashref{"*/key"}
#           and otherhash{key} and takes the minimum of the ages, and
#           stores the result in hashref{"*/key"}.
sub merge {
    my $hashref = shift;
    my (%otherhash) = @_;
    foreach my $key (keys %otherhash) {
        if (defined $hashref->{"*/$key"}) {
            my $left = $hashref->{"*/$key"};
            $left->{size}  += $otherhash{$key}->{size};
            $left->{count} += $otherhash{$key}->{count};
            $left->{age} = $otherhash{$key}->{age}
                if $otherhash{$key}->{age} < $left->{age};
        }
        else {
            $hashref->{"*/$key"} = $otherhash{$key};
        }
    }
}
#>
#>

print "#size name (age in days)\n";
my $target = (shift(@ARGV) or cwd());
my $result;
if ($Links) {
    $result = Lwalk($target);
}
else {
    $result = walk($target);
}
if ($extensions) {
    extdisplay($result);
}
else {
    display($result);
}

#< documentation

=pod

=head1 Name

duke - short multilevel size and age summary

=head1 Synopsis

duke [-bkmg] [-d n] [-s dir] [-e] [-L] [dir]

=head1 Description

Most people use L<du> to see which files and subdirectories are the
biggest to find where their disk space crunch comes from.  This is only
one level deep, and often takes a long time for each level where you do
it.  Another piece of relevant information when deciding what to move
to a different storage medium is the age of the offending files.
B<duke> offers a short multilevel size and age summary to help you
quickly find the largest, oldest stuff on your disk.

=head1 Options

=over

=item C<-bkmg>

show sizes in bytes, kb, Mb, or Gb, respectively.
(default is human-readable)

=item C<-d> I<n>

amount of detail: show entities at most total/I<n> in size
(by necessity this is a bit more than the number of items displayed)

=item C<-L>

like du, dereference symbolic links when counting size.

=item C<-s> I<dir>

skip I<dir>.  This option may be used multiple times.

=item C<-e>

aggregate files by I<e>xtension.  On by default, use again to disable
(e.g. for faster evaluation of results).  If a file that's not big
enough to display on its own has a period in the name (but not at the
beginning), then everything after the last period is assumed to be a
filename extension (often indicating type -- e.g.
pdf or jpg).

After listing all of the files and directories which did make the
detail cutoff, duke will then list aggregates of file extensions which
did not make the cutoff individually, but do make it together.  These
extensions can propagate up the directory hierarchy, and "*/"-es will
be displayed for each layer that they propagate.  Once displayed, they
will not be taken into account for shallower directory entries.
Displayed single-file entries also do not count toward the aggregate
for that directory.

It might be considered a bug that these appear after all single-file
entries for a directory even when in aggregate they are larger than
some of the single-file entries that appeared.  In reality it is
probably more worthwhile to deal with large single files first, so I'm
tempted to call this a feature.

=back

=head1 See Also

L<du>

=head1 Author

Gregory A. Marton

=begin html

=head1 Download

This program is distributed under the GNU General Public License.

=end html

=cut

;
#>
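
# Example invocations (illustrative only -- the paths are made up, and
# what gets printed depends entirely on your filesystem):
#
#   duke                 # summarize the current directory, human-readable sizes
#   duke -k -d 50 /var   # report in kilobytes, with a finer cutoff (total/50)
#   duke -e -s tmp ~     # turn extension grouping off, skip directories named tmp
#   duke -L /usr/local   # follow symbolic links, like du -L
#
# Each flag corresponds to an entry in the GetOptions call near the top of
# this script (-k/--kb, -d/--detail, -e/--extensions, -s/--skip, -L/--Links).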