#!/usr/bin/perl
# findnotinb.pl -- shell: finds lines that are NOT IN the first file but ARE
#                      : in the second file and then reverses the filenames
#                      : to find what is NOT IN the second file, but IS in
#                      : the first file.
#
#  Description: Same as findnotin.pl but reverses the files and prints
#             : what's not in each file that is in the other.
#             : This script finds any line which exists in the second file,
#             : but is NOT IN the first file. It does not consider how many
#             : times, but only if a line of text appears in the second file,
#             : but NOT in the first file. This is useful for comparing
#             : configuration files without all the clutter of diff.
#             : For example, you don't care how many extra blank lines one
#             : file has over the other, you want to find what the critical
#             : differences are. findnotin.pl and findnotinb.pl are most
#             : useful for comparing files that should contain unique lines,
#             : such as a list of unique FQDNs and IP addresses. Even if the
#             : files contain many duplicate lines, it can be useful for
#             : comparing a new version to a backup copy if the changes
#             : are not too drastic. If further analysis is needed, use
#             : the Unix diff command.
#             : All processing is done in memory and we need both files
#             : in memory along with both files' hash tables. This means
#             : we need an arbitrary size limit. I set that at ~100 megs.

# Require two filename arguments; otherwise print usage help and bail out.
if (@ARGV < 2) {
    print "Finds what's NOT in file1 but IS in file2 and vice versa\n";
    print "\nUsage : $0 file1 file2\n\n";
    print "findnotinb.pl is like running findnotin.pl twice, the second\n";
    print "time, with filename arguments reversed.\n";
    print "Optional third argument prints bare NOTIN lines only.\n";
    print "The third argument can be anything.\n";
    print "NOTIN lines are printed in the order they occur in the second\n";
    print "file. The number of times a line occurs is not considered.\n";
    print "Blank lines are ignored.\n";
    exit(1);
}

# The first two arguments are the files to compare.
($file1, $file2) = @ARGV[0, 1];

# Any third argument at all switches on "bare" output: NOTIN lines only,
# without the per-line NOTIN <file> tags.
$cleanout = defined($ARGV[2]) ? 1 : 0;

# Both files, plus a hash table for each, are held in memory, so refuse
# inputs whose combined size exceeds the ~100MB cap described above.
# (-s yields the file size in bytes; undef for a missing file, which
# behaves as 0 here just like the failed stat() did before.)
$size1 = -s $file1;
$size2 = -s $file2;

if (($size1 + $size2) > 99999999) {
    print "One or both files are too large to process\n";
    print "Combined size of both files must be less than 100MB\n";
    exit(1);
}

# Why do we need to store both files in memory, along with hash keys for each
# line that essentially doubles the overhead? We need more than twice the
# memory of both filesizes combined. I could just keep only the hashtables,
# but then I would get NOTIN line in whatever order they are stored in perl.
# if I sort the hash keys, then the output is still not in order of appearance
# in the files. The only way to output NOTIN lines in the order they are found,
# is to store both files and both sets of hashes.


# Just in case we need to print out the date or something.
# Capture a timestamp for this run, in case we need to print the date.
@wkdays = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat");
($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
$mon  += 1;       # localtime() months are 0-based
$year += 1900;    # localtime() years count from 1900
$today = sprintf("%04d%02d%02d %02d:%02d:%02d",
                 $year, $mon, $mday, $hour, $min, $sec);


# slurping is actually slower. Always read text files a line at a time,
# even if you need to store the data for later access.
# Read the first file one line at a time (slurping is actually slower),
# keeping the lines in order in @file1 and recording each distinct line
# as a key of %isinfile1 for the membership tests below.
# Three-arg open with a lexical handle avoids mode injection through a
# hostile filename and the reuse of a global bareword handle.
open(my $in1, '<', $file1) or die "Can't open $file1 for input: $!";
while ($line = <$in1>) {
    push(@file1, $line);
    $isinfile1{$line} = 1;
}
close($in1);


# The second file is stored in order too; its membership hash
# (%isinfile2) is built during the comparison pass below.
open(my $in2, '<', $file2) or die "Can't open $file2 for input: $!";
while ($line = <$in2>) {
    push(@file2, $line);
}
close($in2);



# Go through the second file and if the line is NOT already defined from the
# first file, and, its not already defined as seen in the second file, then
# store it away as seen already, and print the NOTIN message.
# This means, that if we see the exact same line again, we don't just keep
# updating it and printing the NOTIN message over and over for the same notin
# line. It also turns out that this is also needed for perl to remember the
# hash name. It's odd, but doing it any other way loses the information.
#



# Record every distinct line of the second file up front so the reverse
# pass can test membership against %isinfile2 the same way the forward
# pass tests against %isinfile1. (In the original inline loops this hash
# was populated while walking @file2, but it was always complete before
# it was first consulted, so building it here changes nothing.)
foreach $line (@file2) {
    $isinfile2{$line} = 1;
}

# print_notin(PREFIX, LINESREF, SEENREF)
# Print each line of @$linesref that is NOT a key of %$seenref, with
# PREFIX prepended (lines keep their trailing newline, so no "\n" here).
# Each distinct line is printed only once, in order of first appearance;
# repeats are suppressed by the private %printed hash, which replaces
# the old undef-and-reuse dance with the global %alreadyprinted.
sub print_notin {
    my ($prefix, $linesref, $seenref) = @_;
    my %printed;
    foreach my $candidate (@$linesref) {
        next if defined($seenref->{$candidate});
        next if $printed{$candidate}++;
        print "$prefix$candidate";
    }
}

if ($cleanout) {
    # Bare output: just the missing lines under a marker header.
    print "############-#-#-#-#-#-#------ NOTIN $file1\n";
    print_notin("", \@file2, \%isinfile1);
    print "############-#-#-#-#-#-#------ NOTIN $file2\n";
    print_notin("", \@file1, \%isinfile2);
}
else {
    # Verbose output: each line tagged with the file it is missing from.
    print "\n#-----------------------------------------------------------------\n";
    print_notin("NOTIN $file1 ", \@file2, \%isinfile1);
    print "\n#-----------------------------------------------------------------\n";
    print_notin("NOTIN $file2 ", \@file1, \%isinfile2);
    print "\n#-----------------------------------------------------------------\n\n";
}


exit(0);
