#!/usr/bin/perl
#
# oss-license-extract.pl
#
# Copyright (c) 2001 Arbor Networks, Inc.
#
# Written by Scott Iekel-Johnson <scottij@arbor.net>.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. The names of the authors may not be used to endorse or promote products
# derived from this software without specific prior written permission
#
# THIS SOFTWARE IS PROVIDED BY ARBOR NETWORKS, INC. ``AS IS'' AND ANY EXPRESS
# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL ARBOR NETWORKS, INC. BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $Id: oss-license-extract.pl,v 1.5 2001/08/14 15:17:24 scottij Exp $

# Command-line synopsis.  Printed by -h, after an illegal option, and when
# no input files remain to scan.  Lists every option the parser accepts.
$usage = 
"Usage: $0 [-h] [-v] [-f listfile] [-x] [-o outfile] file|dir[...]\n";

# Version banner printed by -v.
$version =
"oss-license-extract version 1.1 (8/13/01 17:00)\n";

# Long help text printed by -h, directly after $usage.
$help =
"oss-license-extract version 1.1.

This program scans through the files and directories given on the
command line looking for copyright and license notices.  For each
directory given, it recursively scans all files matching *.c, *.h, 
*.sh, *.pl, *.py in that directory and all of its subdirectories.

    -f listfile    Reads the file listfile, and treats it as a list of
                   files and directories to scan as above.  Can be used
		   in addition to other command-line files and directories.
    -h             Prints this help message.
    -o outfile	   Use outfile for output as the filename of the final 
    		   license and copyright notices.  Default is stdout.
    -v		   Print version information.
    -x             Normally, this program will include any text between the
                   copyright notice and the start of the actual license as
                   a set of addendums at the end of the output.  This 
		   disables the inclusion of that text.

";

# Count of distinct license texts recorded so far.
$lic_num = 0;

# When true, text found between a copyright notice and the start of its
# license is kept and written out as an addendum.  Cleared by -x.
$include_extras = 1;

# Distinct license texts; each element is a reference to an array of lines.
@licenses = ();

# Output destination.  "-" selects standard output; -o overrides it.
$outfile = "-";

# Standard input is closed up front; every input this script reads is named
# on the command line or in a -f list file.
close STDIN;

# Read command-line options.  Arguments are consumed from the front of @ARGV
# for as long as the next one starts with '-'; whatever is left afterwards is
# the list of files and directories to scan.
#
# The per-option patterns below are left unanchored on purpose: any argument
# that merely contains the flag (for example --help or --file) has always
# been accepted, and tightening the match would reject those spellings.
while (@ARGV && $ARGV[0] =~ /^-/) {
    $_ = shift;
    if (/-f/) {
        # -f listfile: read a file of file/directory names, one per line.
        my $listfile = shift;
        if (!defined($listfile)) {
            print STDERR "$0: option -f requires a list file\n";
            print STDERR $usage;
            exit 1;
        }

        # Three-argument open so that a list file name containing leading
        # whitespace or mode characters is taken literally.
        open(FILLIST, '<', $listfile)
            || die "$0: Can't open filelist $listfile: $!\n";
        @fillist = <FILLIST>;
        close FILLIST;

        # chomp (not chop) so the final entry survives when the list file
        # does not end with a newline.
        chomp(@fillist);

        # Append the names to the end of @ARGV, skipping blank lines, so
        # they are scanned after any names given directly.
        push(@ARGV, grep { length($_) } @fillist);
    } elsif (/-h/) {
        print STDERR $usage;
        print STDERR $help;
        exit 0;
    } elsif (/-o/) {
        # -o outfile: the following argument names the output file.
        $outfile = shift;
        if (!defined($outfile)) {
            print STDERR "$0: option -o requires an output file name\n";
            print STDERR $usage;
            exit 1;
        }
    } elsif (/-v/) {
        print STDERR $version;
        exit 0;
    } elsif (/-x/) {
        # -x: do not collect the text between copyright notice and license.
        $include_extras = 0;
    } else {
        # Report the option without its leading dash.
        my $bad = /-(.+)/ ? $1 : '';
        print STDERR "$0: illegal option \'$bad\'\n";
        print STDERR $usage;
        exit 1;
    }
}

# Determine list of files to scan.
#
# Walks @ARGV in place.  Every directory entry is replaced by the files that
# find(1) reports beneath it with one of the names of interest; entries that
# are neither a directory nor a plain file are reported on stderr but left in
# the list.  The script exits with status 1 if @ARGV ends up empty.
for ($i = 0; $i <= $#ARGV; $i++) {
    if (-d $ARGV[$i]) {
        @files = ();

        # List-form pipe open (perl 5.8 or later): find is run directly,
        # without a shell, so directory names containing spaces or shell
        # metacharacters are passed through untouched.
        open(FILES, '-|', 'find', $ARGV[$i],
             '(', '-name', '*.c',
                  '-or', '-name', '*.sh',
                  '-or', '-name', '*.h',
                  '-or', '-name', '*.pl',
                  '-or', '-name', '*.py',
                  '-or', '-name', 'LICENSE', ')')
            || die __FILE__, ": can't open directory $ARGV[$i]\n";

        while (<FILES>) {
            chomp;
            # Get all filenames.  Assume error messages have a ':'
            push(@files, $_) unless /:/;
        }

        close FILES;

        if (!@files) {
            die __FILE__, ": no files found in directory $ARGV[$i].\n";
        }

        # Replace the directory entry with the files found under it.  The
        # loop index is not advanced past them, so each one is itself
        # checked on a later iteration.
        splice(@ARGV, $i, 1, @files);
    } elsif (!(-f $ARGV[$i])) {
        print STDERR "$ARGV[$i]: Not a file or directory\n";
    }
}

if (!@ARGV) {
    print STDERR "No files to scan.\n";
    print STDERR $usage;
    exit 1;
}

#open(FILLIST, ">files.txt");
#print(FILLIST join("\n", @ARGV));
#close(FILLIST);

# Main scanning loop.
#
# Each pass reads lines from the files named in @ARGV through the magic <>
# handle and tries to extract, in order:
#   1. a run of copyright lines (dates into @dates, holders into @owner);
#   2. any text between that notice and the license (@extras, unless -x);
#   3. the license text itself (@license), with holder-specific wording
#      replaced by generic wording so equal licenses compare equal.
# The license is then matched against the ones already stored in @licenses
# (via the normalised strings in @lic_diff), and the holders and their years
# are merged into @lic_owners / @lic_dates for that license.
#
# The current input file is not closed after a license has been recorded
# (the close at the bottom is commented out), so the next pass continues
# from the current read position.
SCANFILE:
while (!eof()) {
    # Reset the per-pass scratch data.
    #
    # NOTE(review): "$#holder = -1" empties @holder, an array nothing else in
    # this loop uses, and "$$holder{$ARGV} = 0" stores into the hash
    # *referenced by* the scalar $holder, not into %holder, which is what
    # the rest of the loop reads.  Consequently neither @owner nor
    # $holder{$ARGV} is reset between passes over the same file.  These look
    # like typos for @owner and $holder{$ARGV}, but correcting them changes
    # how the remainder of a file is scanned after its first license and
    # which diagnostics are printed, so they are kept as-is pending
    # confirmation.
    $#license = -1;
    $#dates = -1;
    $#holder = -1;
    $#extras = -1;
    $$holder{$ARGV} = 0;
    $#extra_lic = -1;

    # Announce each input file once.
    if ($ARGV ne $last) {
        print STDERR "Scanning file $ARGV\n";
        $last = $ARGV;
    }

    # Step 1: collect copyright lines.  Blank comment lines are skipped;
    # the first other line seen after at least one holder ends the notice
    # and is left in $_ for the steps below.
    while (<>) {
        # extract copyright dates and names
        if (/^\s*[\*\#]?\s*(Copyright\s+)?\s*\(C\)/i) {
            /(Copyright\s+)?\(C\)\s*((,? ?\-?(and)?\d+)*),?\s+([^\d\s\,\-]?.*)/i;
            $dates[$holder{$ARGV}] = $2;
            $owner[$holder{$ARGV}] = $5;
            # If no dates, check for dates listed after owner.  The date
            # group must match at least once: if it were allowed to be
            # empty, an owner with no year anywhere would be cut off at its
            # last space.
            if (!$dates[$holder{$ARGV}]) {
                if ($owner[$holder{$ARGV}] =~ /(\D*)\s((,? ?\-?(and)?\d+)+)\s*[\D\s]*/) {
                    $owner[$holder{$ARGV}] = $1;
                    $dates[$holder{$ARGV}] = $2;
                }
            }
            $dates[$holder{$ARGV}] =~ s/and/, /g;
            # Put an "and xxxx" from front of owner into dates if present
            if ($owner[$holder{$ARGV}] =~ /^and (\d+)/) {
                $dates[$holder{$ARGV}] .= ", $1";
                $owner[$holder{$ARGV}] =~ s/and (\d+)//;
            }

            # if no owner, assume owner is on next line
            if (!$owner[$holder{$ARGV}]) {
                $next = <>;
                $next =~ m=^\s*/?[\*\#]?\s+(.*)=;
                $owner[$holder{$ARGV}] = $1;
            }
            $holder{$ARGV}++;
        } elsif (/\s*[\*\#]?\s*All rights reserved./i) {
            # Add rights reserved clause to previous owner
            $tmp = $holder{$ARGV} - 1;
            if ($tmp >= 0) {
                $owner[$tmp] = $owner[$tmp] . "  All rights reserved.";
            }
        } elsif (m=^\s*/?[\*\#]?\s*\*?\s*$=) {
            # Empty line or bare comment leader.
            next;
        } elsif ($holder{$ARGV}) {
            last;
        }
    }

    # If no copyright holders were found, print an error message
    if (!$holder{$ARGV}) {
        print STDERR "$ARGV: no copyright notices found\n";
    }

#    for ($i = 0; $i < $holder{$ARGV}; $i++) {
#        print STDERR "Dates = $dates[$i]; holder = $owner[$i]\n";
#    }

    # Now extract license
    # skipping any blank lines between copyright notice and the license
    while (!eof()) {
        last unless /^\s*[\*\#]?\s*\*?\s*\n/;

        $_ = <>;
    }

    if (eof(ARGV)) {
        # No license
        next SCANFILE;
    }

    # Step 2: add extra license info.  Every line up to the first one that
    # starts with "Redistribution" or "Permission" is kept (minus its
    # comment leader) when extras are enabled.  A line containing an SCCS
    # "@(#)" tag, a comment terminator or an RCS $Id keyword stops the
    # collection.
    while (!eof(ARGV)) {
        if (!/^\s*[\*\#]?\s*(?:Redistribution|Permission)/i) {
            last if (/@\(\#\)/ or m+\*/+ or /\$Id:/);
            /^\s*[\*\#]?\s*(.*\n)/;
            if ($include_extras) {
                $extras[++$#extras] = $1;
            }
        } else {
            last;
        }
        $_ = <>;
    }

    if (eof(ARGV)) {
        next SCANFILE;
    }

    # Now skip any remaining junk until license begins
    while (!eof(ARGV)) {
        last if (m=^\s*/?[\*\#]?\s*(?:Redistribution|Permission)=i);

        $_ = <>;
    }

    if (eof(ARGV)) {
        next SCANFILE;
    }

    # Step 3: copy the license, one line per iteration, rewriting clauses
    # that name a specific holder.  $_ holds the line being examined.
    $lines = 0;
GET_LICENSE:
    for (;;) {
        # Check for advertising clause, and replace name with 
        # "copyright holder"
        if ((/includes software developed by/) && !(/^\s*[\*\#]?\s*"/)) {
            $license[$lines++] = "This product includes software developed by the copyright holder.\n";

            # Now don't add any remaining lines of this clause to
            # license.  The line that ends the clause stays in $_ and is
            # examined by the next iteration.
            while (<>) {
                last if ((/^\s*[\*\#]?\s*\d\. /) || (/THIS/));
            }
            next GET_LICENSE;
        } elsif (/(\d\.)\sThe name\(?s?\)? .* may not be used to endorse or promote products/) {
            # Change clause to read author
            $license[$lines++] = $1 . " The names of the authors may not be used to endorse or promote products\n";
        } elsif (/(\d\.)\sThe name\(?s?\)? .* may not be used to endorse or promote/) {
            # Change clause to read author
            $license[$lines++] = $1 . " The names of the authors may not be used to endorse or promote\n";
        } elsif (/^\s*[\*\#]?\s*(\S.*)? NO EVENT SHALL .* BE (.*\n)$/) {
            $license[$lines++] = $1 . 
                " NO EVENT SHALL THE COPYRIGHT HOLDER BE " . $2;
        } elsif (/^\s*[\*\#]?\s*(\S.*)? PROVIDED BY .* ``AS IS'' (.*\n)/) {
            $license[$lines++] = $1 . 
                " PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' " . $2;
        } elsif (/\s*[\*\#]?\s*([^\s\*\#][^\*]*\n)/) {
            # Ordinary line: keep it without its comment leader.
            $license[$lines++] = $1;
        } elsif (/\s*[\*\#]?\s*([^\s\*\#][^\*]*)\s+\*/) {
            # Line with a trailing '*' (boxed comment): drop the trailer.
            $license[$lines++] = $1 . "\n";
        }

        # The disclaimer's closing sentence ends the license.
        last if (/DAMAGE\./);

        $_ = <>;

        last if (/@\(\#\)/ or m+\*/+ or /^\s*$/ or /\$Id:/);
    }

    # If no license found, print error message
    if ($#license < 0) {
        print STDERR "$ARGV: no recognizable license.\n";
        close ARGV;
        next SCANFILE;
    }

    # Create version without \n's and punctuation for diffing
    # Also remove all whitespace and convert to lowercase
    @lic_temp = @license;
    chomp @lic_temp;
    $newdiff = join(' ', @lic_temp);
    $newdiff =~ s/\s+//g;
    $newdiff =~ tr/A-Z/a-z/;
    $newdiff =~ tr/.,:;`'//d;

    # Compare to any existing licenses.  On exit $i is the index of the
    # matching license, or one past the end when there is no match.
    for ($i = 0; $i <= $#lic_diff; $i++) {
        last if ($newdiff eq $lic_diff[$i]);
    }

    if ($#lic_diff < 0) {
        # No existing license files
        $licenses[0] = [@license];
        $i = 0;
        $lic_num++;
        $lic_diff[++$#lic_diff] = $newdiff;
    } elsif ($i > $#lic_diff) {
        # No matches; create new license file
        $licenses[++$#licenses] = [@license];
        $lic_diff[++$#lic_diff] = $newdiff;
        $lic_num++;
    }

    # Add copyright owner information to record for this license
    # ($i still indexes the license chosen above).
    for ($j = 0; $j < $holder{$ARGV}; $j++) {
        @add_dates = split(/, /, $dates[$j]);
        @add_dates = sort @add_dates;

        # Expand date ranges (e.g. 1994-1999) into individual dates for now
        for ($x = 0; $x <= $#add_dates; $x++) {
            if ($add_dates[$x] =~ /(\d+)-(\d+)/) {
                splice(@add_dates, $x, 1, ($1 .. $2));
            }
        }

        # Normalised owner string used only for comparison: no "All rights
        # reserved", no whitespace, no periods.
        $owner[$j] =~ s/^, //;
        $owner_diff = $owner[$j];
        $owner_diff =~ s/All rights reserved//i;
        $owner_diff =~ s/\s+//g;
        $owner_diff =~ s/\.//g;
        for ($k = 0; $lic_owners_diff[$i][$k]; $k++) {
            last if ($owner_diff eq $lic_owners_diff[$i][$k]);
        }
        if (!$lic_owners[$i][$k]) {
            # Add new owner
            $lic_owners[$i][$k] = $owner[$j];
            $lic_dates[$i][$k] = [ @add_dates ];
            $lic_owners_diff[$i][$k] = $owner_diff;
        } else {

            # Add "all rights reserved" clause if needed
            if ($owner[$j] =~ /All rights reserved/i) {
                $lic_owners[$i][$k] = $owner[$j];
            }

            # Combine new dates with old.  The glob assignment makes
            # @tmp_dates an alias for the stored array.
            *tmp_dates = $lic_dates[$i][$k];
            @tmp_dates = sort(@add_dates, @tmp_dates);

#            for $x (0 .. 3) {
#                print STDERR "lic_dates[i][k][$x] = ", $lic_dates[$i][$k][$x], "\n";
#            }

            # Drop duplicate years.  The bounds check keeps the comparison
            # from reaching past the final element, where a missing entry
            # would compare equal to any entry that is numerically zero and
            # the loop would never finish.
            for ($x = 0; $x <= $#tmp_dates; $x++) {
                while ($x < $#tmp_dates
                       && $tmp_dates[$x] == $tmp_dates[$x+1]) {
                    splice(@tmp_dates, $x+1, 1);
                }
            }
            $lic_dates[$i][$k] = [ @tmp_dates ];
        }
    }

    # Compare to any existing extras
    if ($#extras >= 0) {

        # Keep only the non-blank extra lines.
        for ($i = 0; $i <= $#extras; $i++) {
            $extra_lic[++$#extra_lic] = $extras[$i] unless 
                    ($extras[$i] =~ /^\s*\n/);
        }

        # Normalise exactly as for licenses above.
        @extra_tmp = @extra_lic;
        chomp @extra_tmp;
        $newexdiff = join(' ', @extra_tmp);
        $newexdiff =~ s/\s+//g;
        $newexdiff =~ tr/A-Z/a-z/;
        $newexdiff =~ tr/.,:;`'//d;

        for ($i = 0; $i <= $#extras_diff; $i++) {
            last if ($newexdiff eq $extras_diff[$i]);
        }

        if ($#extras_diff < 0) {
            # No existing extras files
            $extras_save[0] = [@extra_lic];
            $extras_diff[0] = $newexdiff;
            $i = 0;
        } elsif ($i > $#extras_diff) {
            # No matches; create new license file
            $extras_save[$i] = [@extra_lic];
            $extras_diff[++$#extras_diff] = $newexdiff;
        }
    }

#    close ARGV;
}

#for ($i = 0; $lic_owners[$i][0]; $i++) {
#	for ($j = 0;$lic_owners[$i][$j]; $j++) {
#		*tmp_dates = $lic_dates[$i][$j];
#		print "Copyright (c) ", join(', ', @tmp_dates), 
#			" $lic_owners[$i][$j]\n";
#	}
#}

#print $lic_owners[0][0], "\n";
#print $lic_num, "\n";

#for ($i = 0; $i < $lic_num; $i++) {
#    *powners = $lic_owners[$i];
#    print join(' ', @powners), "\n";
#    for ($j = 0; $lic_owners[$i][$j]; $j++) {
#	print "owner($i, $j): $lic_owners[$i][$j]\n";
#    }
#}

# collapse_date_ranges(@years)
#
# Takes a list of years in ascending order and returns a new list in which
# every run of three or more consecutive years is replaced by a single
# "first-last" string (e.g. 1994, 1995, 1996 becomes "1994-1996").  Runs of
# one or two years are returned unchanged.  The argument list is not
# modified.
sub collapse_date_ranges {
    my @years = @_;
    my @collapsed;

    my $start = 0;
    while ($start <= $#years) {
        # Extend $end for as long as the next year is exactly one greater.
        my $end = $start;
        while ($end < $#years && $years[$end] == $years[$end + 1] - 1) {
            $end++;
        }

        if ($end > $start + 1) {
            push(@collapsed, "$years[$start]-$years[$end]");
        } else {
            push(@collapsed, @years[$start .. $end]);
        }

        # Resume after the run just handled, so later runs are seen too.
        $start = $end + 1;
    }

    return @collapsed;
}

# Output license information.
#
# Writes, for each distinct license: one "Copyright (c)" line per recorded
# holder, a blank line, the license text and a "--" separator.  The saved
# addenda follow, each terminated by "--".
if (!defined($outfile) || $outfile eq '-') {
    # No destination chosen: write to a duplicate of standard output.
    open(LICENSE_FILE, '>&STDOUT')
        || die "Can't open standard output: $!\n";
} else {
    # Three-argument open so the name is taken literally; $! carries the
    # reason when the open fails.
    open(LICENSE_FILE, '>', $outfile) || die "Can't open $outfile: $!\n";
}
print LICENSE_FILE "This software may contain software that contains the following copyright\n";
print LICENSE_FILE "notices and associated licensing restrictions.\n";
print LICENSE_FILE "\n";

for ($i = 0; $i <= $#licenses; $i++) {
    # Write out copyright holder info.  The holder list for a license ends
    # at the first false entry.
    for ($j = 0; $lic_owners[$i][$j]; $j++) {
        next if ($lic_owners[$i][$j] =~ /^\s*\n?$/);

        # Collapse consecutive dates into date range e.g. 1994-1999
        my @years = collapse_date_ranges(@{ $lic_dates[$i][$j] || [] });

        if (@years) {
            print LICENSE_FILE "Copyright (c) ", join(', ', @years), 
                    " $lic_owners[$i][$j]\n";
        } else {
            print LICENSE_FILE "Copyright (c) ", $lic_owners[$i][$j], "\n";
        }
    }

    print LICENSE_FILE "\n";

    # Print license itself
    print LICENSE_FILE @{ $licenses[$i] };
    print LICENSE_FILE "--\n";
}

# Addenda collected between copyright notices and licenses.
for ($i = 0; $i <= $#extras_save; $i++) {
    print LICENSE_FILE @{ $extras_save[$i] || [] };
    print LICENSE_FILE "--\n";
}

# Buffered write errors only surface at close time, so check it.
close(LICENSE_FILE) || die "Can't finish writing license output: $!\n";
