#!/usr/bin/perl
#

#
# Convert William Stearns URI list to use fewer rules
#
# Original list can be found here:
# http://www.stearns.org/sa-blacklist/
#

# ---------------------------------------------------------------------------
# Tunables.  $SCORE and $RULENAME are filled in from the input below when
# they are left empty here.
# ---------------------------------------------------------------------------
$SCORE      = "";	# Score written on every generated rule; empty means take the first score found in the input.
$MAXPERRULE = 50;	# How many sites per rule?
$NUMCHARS   = 1;	# How many leading characters of a site name are used to group sites into one rule?
$RULENAME   = "";	# Prefix of generated rule names; empty means derive it from the first
			# "describe" line in the input and append "OPT_".
			# A sequence number is appended to the prefix for each rule.

# Site name => number of times it was seen; the keys are the unique sites.
%uri = ();

# Banner at the top of the generated rule file.
$date = localtime();
print <<'BANNER', "#\n# Generated: $date\n#\n";
#
# Converted William Stearns URI list to use less rules
#
# Original list can be found here:
# http://www.stearns.org/sa-blacklist/
#
BANNER

# Scan the original rule file (files named on the command line, or STDIN).
while (defined(my $line = <>)) {
	chomp($line);

	# Comment lines are passed straight through to the output.
	if ($line =~ m/^#/) {
		print $line . "\n";
	}

	# Every "describe" line names one blacklisted site.
	if ($line =~ m/^describe.*URI contains (.*)$/) {
		$uri{$1}++;
	}

	# First "describe" line supplies the rule name prefix: the run of
	# non-digit characters following the keyword.
	if ($RULENAME eq "" && $line =~ m/^describe\s+(\D+)/) {
		$RULENAME = $1 . "OPT_";
	}

	# First "score" line supplies the score.
	if ($SCORE eq "" && $line =~ m/^score\s+\S+\s+([0-9.]+)$/) {
		$SCORE = $1;
	}
}

# State shared between the grouping loop and ruleout().
($break, $linecount, $sitecount) = ("", 0, 0);

# Print the describe/uri/score lines for the rule accumulated so far,
# followed by a blank line.
#
# Takes no arguments and works on package globals:
#   $out        pattern built so far, still missing its closing part;
#               it is completed (or replaced) here
#   $sitecount  number of sites collected in $out
#   $lastsite   most recently added site, used when the rule holds one site
#   $break      grouping key shown in the description
#   $RULENAME, $linecount, $SCORE
#               rule name prefix, sequence number and score to print
sub ruleout {
	# Close the alternation group the caller opened.
	$out.=')\b/i';

	if ($sitecount == 1) {
		# A single site needs no group, so emit it as one flat pattern.
		# The dots have to be escaped here as well: an unescaped "."
		# matches any character, so the rule would hit unrelated hosts.
		my $escaped=$lastsite;
		$escaped =~ s/\./\\\./g;
		$out='m/\b' . $escaped . '\b/i';
	}

	print "describe $RULENAME$linecount\tURI contains sites starting with $break\n";
	print "uri      $RULENAME$linecount\t$out\n";
	print "score    $RULENAME$linecount\t$SCORE\n\n";
	return;
}

# Group the collected sites into rules.
#
# Sites are visited in sorted order, so sites sharing the same first
# $NUMCHARS characters are adjacent.  A rule is flushed through ruleout()
# and a new one started whenever the grouping key changes or the current
# rule already holds $MAXPERRULE sites.
$out="";
foreach my $site (sort keys %uri) {
	# An empty site name would produce a pattern that matches almost
	# any URI, so it is never turned into a rule.
	next if ($site eq "");

	my $key=substr($site, 0, $NUMCHARS);

	# $sitecount == 0 means no rule is open yet.  The limit is tested with
	# >= because the test runs before the current site is added; using >
	# would let $MAXPERRULE + 1 sites into a rule.
	if ($sitecount == 0 || $break ne $key || $sitecount >= $MAXPERRULE) {
		if ($sitecount > 0) {
			ruleout();
			$linecount++;
			$sitecount=0;
			$out="";
		}
		# The key becomes part of the regex, so its dots are escaped
		# the same way as the rest of the site name below.
		my $escapedkey=$key;
		$escapedkey =~ s/\./\\\./g;
		$out.='m/\b' . $escapedkey . '(?:';
		$break=$key;
	}

	# Alternatives after the first are separated by "|".
	$out.='|' if ($sitecount > 0);

	# Only the part after the shared key goes inside the group.
	my $shortsite=substr($site, $NUMCHARS);
	$shortsite =~ s/\./\\\./g;
	$out.=$shortsite;

	$sitecount++;
	$lastsite=$site;	# ruleout() reads this when the rule holds one site
}

# Flush the rule that was still open when the list ran out.
if ($sitecount != 0) {
	ruleout();
}
