#!/usr/bin/perl
# findnotinb.pl -- shell: finds lines that are NOT IN the first file but ARE
#                      : in the second file and then reverses the filenames
#                      : to find what is NOT IN the second file, but IS in
#                      : the first file.
#
#  Description: Same as findnotin.pl but reverses the files and prints
#             : what's not in each file that is in the other.
#             : This script finds any line which exists in the second file,
#             : but is NOT IN the first file. It does not consider how many
#             : times, but only if a line of text appears in the second file,
#             : but NOT in the first file. This is useful for comparing
#             : configuration files without all the clutter of diff.
#             : For example, you don't care how many extra blank lines one
#             : file has over the other, you want to find what the critical
#             : differences are. findnotin.pl and findnotinb.pl are most
#             : useful for comparing files that should contain unique lines,
#             : such as a list of unique FQDNs and IP addresses. Even if the
#             : files contain many duplicate lines, it can be useful for
#             : comparing a new version to a backup copy if the changes
#             : are not too drastic. If further analysis is needed, use
#             : the Unix diff command.
#             : All processing is done in memory and we need both files
#             : in memory along with both files' hash tables. This means
#             : we need an arbitrary size limit. I set that at ~100 megs.

# Require two filename arguments; otherwise print usage help and bail out.
if (@ARGV < 2) {
    print "Finds what's NOT in file1 but IS in file2 and vice versa\n";
    print "\nUsage : $0 file1 file2\n\n";
    print "findnotinb.pl is like running findnotin.pl twice, the second\n";
    print "time, with filename arguments reversed.\n";
    print "Optional third argument prints bare NOTIN lines only.\n";
    print "The third argument can be anything.\n";
    print "NOTIN lines are printed in the order they occur in the second\n";
    print "file. The number of times a line occurs is not considered.\n";
    print "Blank lines are ignored.\n";
    exit(1);
}

# The first two arguments are the files to compare.
($file1, $file2) = @ARGV[0, 1];

# Any third argument at all switches on "bare" output: NOTIN lines only,
# without the per-line NOTIN <file> tags.
$cleanout = defined($ARGV[2]) ? 1 : 0;

# Both files, plus a hash table for each, are held in memory, so refuse
# inputs whose combined size exceeds the ~100MB cap described above.
# (-s yields the file size in bytes; undef for a missing file, which
# behaves as 0 here just like the failed stat() did before.)
$size1 = -s $file1;
$size2 = -s $file2;

if (($size1 + $size2) > 99999999) {
    print "One or both files are too large to process\n";
    print "Combined size of both files must be less than 100MB\n";
    exit(1);
}

# Why do we need to store both files in memory, along with hash keys for each
# line that essentially doubles the overhead? We need more than twice the
# memory of both filesizes combined. I could just keep only the hashtables,
# but then I would get NOTIN line in whatever order they are stored in perl.
# if I sort the hash keys, then the output is still not in order of appearance
# in the files. The only way to output NOTIN lines in the order they are found,
# is to store both files and both sets of hashes.


# Just in case we need to print out the date or something.
# Capture a timestamp for this run, in case we need to print the date.
@wkdays = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat");
($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
$mon  += 1;       # localtime() months are 0-based
$year += 1900;    # localtime() years count from 1900
$today = sprintf("%04d%02d%02d %02d:%02d:%02d",
                 $year, $mon, $mday, $hour, $min, $sec);


# slurping is actually slower. Always read text files a line at a time,
# even if you need to store the data for later access.
# Read the first file one line at a time (slurping is actually slower),
# keeping the lines in order in @file1 and recording each distinct line
# as a key of %isinfile1 for the membership tests below.
# Three-arg open with a lexical handle avoids mode injection through a
# hostile filename and the reuse of a global bareword handle.
open(my $in1, '<', $file1) or die "Can't open $file1 for input: $!";
while ($line = <$in1>) {
    push(@file1, $line);
    $isinfile1{$line} = 1;
}
close($in1);


# The second file is stored in order too; its membership hash
# (%isinfile2) is built during the comparison pass below.
open(my $in2, '<', $file2) or die "Can't open $file2 for input: $!";
while ($line = <$in2>) {
    push(@file2, $line);
}
close($in2);



# Go through the second file and if the line is NOT already defined from the
# first file, and, its not already defined as seen in the second file, then
# store it away as seen already, and print the NOTIN message.
# This means, that if we see the exact same line again, we don't just keep
# updating it and printing the NOTIN message over and over for the same notin
# line. It also turns out that this is also needed for perl to remember the
# hash name. It's odd, but doing it any other way loses the information.
#



# Record every distinct line of the second file up front so the reverse
# pass can test membership against %isinfile2 the same way the forward
# pass tests against %isinfile1. (In the original inline loops this hash
# was populated while walking @file2, but it was always complete before
# it was first consulted, so building it here changes nothing.)
foreach $line (@file2) {
    $isinfile2{$line} = 1;
}

# print_notin(PREFIX, LINESREF, SEENREF)
# Print each line of @$linesref that is NOT a key of %$seenref, with
# PREFIX prepended (lines keep their trailing newline, so no "\n" here).
# Each distinct line is printed only once, in order of first appearance;
# repeats are suppressed by the private %printed hash, which replaces
# the old undef-and-reuse dance with the global %alreadyprinted.
sub print_notin {
    my ($prefix, $linesref, $seenref) = @_;
    my %printed;
    foreach my $candidate (@$linesref) {
        next if defined($seenref->{$candidate});
        next if $printed{$candidate}++;
        print "$prefix$candidate";
    }
}

if ($cleanout) {
    # Bare output: just the missing lines under a marker header.
    print "############-#-#-#-#-#-#------ NOTIN $file1\n";
    print_notin("", \@file2, \%isinfile1);
    print "############-#-#-#-#-#-#------ NOTIN $file2\n";
    print_notin("", \@file1, \%isinfile2);
}
else {
    # Verbose output: each line tagged with the file it is missing from.
    print "\n#-----------------------------------------------------------------\n";
    print_notin("NOTIN $file1 ", \@file2, \%isinfile1);
    print "\n#-----------------------------------------------------------------\n";
    print_notin("NOTIN $file2 ", \@file1, \%isinfile2);
    print "\n#-----------------------------------------------------------------\n\n";
}


exit(0);
