#!/usr/bin/perl

use strict;
use warnings;

use File::Temp ();
use LWP::Simple;
use URI::Escape;
use Data::Dumper;

my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
my $MAKE_CACHE; $MAKE_CACHE = 1;        # turn this on by default, no harm

# we allow promotion of rules that are "ifplugin" one of these
my @def_plugins = map {
    s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
    $_;
  } <lib/Mail/SpamAssassin/Plugin/*.pm>;

my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";

# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed.  (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1, 2, 3);

# S/O threshold required
my $SO_THRESHOLD = .8;

###########################################################################

print q{
Bad performing rules, from the past 3 night's mass-checks.

(Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'.
This also applies for meta rules which use 'net' rules.)

};

my $cgi_url = "https://ruleqa.spamassassin.org/";
my @doc = ();
my $cache = 'ruleqa.cache.';
my $submitters = '';

my $url;        # tracks the last day used
my $dayoffset = 0;
foreach my $day (@DAYS_REQUIRED) {
  if (!$FROM_CACHE || !-f $cache.$day) {

with_new_offset:
    $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
    warn "HTTP get: $url\n";

    $doc[$day] = get ($url);
    if (!$doc[$day]) {
      die "HTTP get failed: $doc[$day]\n";
    }

    if ($MAKE_CACHE) {
      if (open(O, ">$cache$day")) {
        print O $doc[$day]; close O;
      }
    }
  }
  else {
    open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
  }

###########################################################################

  # the HTML looks like:
  # 
  #   <span class="daterev_masscheck_description" class="mcviewing">
  #   ...
  #   <em><span class="mcsubmitters"> ....... </span></em>
  #   ...
  #   </span>
  #
  # in other words, the machine-parseable metadata is embedded in the HTML
  # as a microformat.

  if ($doc[$day] =~ m{
          <span\s+class="daterev_masscheck_description\smcviewing"
          .{0,400}
          <span\s+class="mcsubmitters">\s*(.*?)\s*</span>
        }sx)
  {
    my $daysubs = $1;

    # ignore days when the mass-check sets contain a --net log, since
    # it's the weekly --net run.  That generally contains a much
    # smaller set of logs (since it takes longer to run mass-check --net)
    # so the results are untrustworthy.
    if ($daysubs =~ /(?:^|\s)net-/) {
      warn "day $day contains a --net mass-check! offsetting by an extra day\n";
      $dayoffset++; goto with_new_offset;
    }

    ($submitters ne '') and $submitters .= "; ";
    $submitters .= "day $day: $daysubs";
  }
  else {
    loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
  }
}

###########################################################################

# <rule><test>__HIGHBITS</test><promo>0</promo>
# <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
# <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>

my $plist;
foreach my $day (@DAYS_REQUIRED) {
  while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
    my $xml = $1;
    my $obj = { };

    while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
      $obj->{$1} = $2;
    }
    while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
      $obj->{$1} = uri_unescape($2);
    }

    my $name = $obj->{test};
    $obj->{detailhref} = $cgi_url.$obj->{detailhref};

    $plist->[$day]->{$name} = $obj;
  }

  if (!scalar keys %{$plist->[$day]}) {
    loghtml_die("no rules found? on day $day");
  }
}

###########################################################################

## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;

# use SpamAssassin classes directly, so we can lint rules
# as we go
use lib 'lib';
use Mail::SpamAssassin;

my $mailsa = Mail::SpamAssassin->new({
    rules_filename => "rules",
    site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
    local_tests_only => 1,
    dont_copy_prefs => 1,
    config_tree_recurse => 1,
    keep_config_parsing_metadata => 1,
    # debug => 1,
});

# hack hack hack!!  we don't want to load plugin files twice,
# and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
# to rules/*.pm, they would otherwise appear twice.
foreach my $fname (<rules/*.pm>) {
  my $path = File::Spec->rel2abs($fname);
  $INC{$path} = 1;
  # warn "JMD $path";
}

my %rules_with_errors = ();
my %killed_rules = ();
my %count_rules = ();
my %report_bad_subrules = ();

$mailsa->{lint_callback} = sub {
  my %opts = @_;

  # ignore non-rule-issue lint failures
  return if ($opts{msg} =~ /(?:
        score\sset\sfor\snon-existent|
        description\sexists
    )/x);

  warn "demoting $opts{rule}: $opts{msg}";
  if ($opts{iserror}) {
    $rules_with_errors{$opts{rule}}++;
  }
};

$mailsa->lint_rules();

# print "# active ruleset list, automatically generated from $cgi_url\n";
# print "# with results from: $submitters\n";

my @spcs = ($submitters =~ /\s+/g);
if (scalar @spcs < 2) {
  die "not generating results; less than 3 submitter results available!\n";
}

# base most of our decisions off day 1 (last night's mass-checks).
# note: meta rules must come before their __SUBRULES in this sort;
# default lexical sort will do this.
foreach my $plistkey (sort keys %{$plist->[1]}) {
  my $name = $plistkey;
  my $plistobj = $plist->[1]->{$plistkey};
  my $notes = '';

  # rules in sandboxes without a T_ prefix, will be renamed during the
  # ruleqa process... in other words, the output freqs line will talk
  # about rule "T_FOO".   if there's a rule "FOO" defined, assume that's
  # the one being talked about.
  my $no_t = $name;
  if ($no_t =~ s/^T_//) {
    if (defined $mailsa->{conf}->{scores}->{$no_t}) {
      $name = $no_t;
    }
  }
  
  # ignore rules that don't exist (if they have a desc or score,
  # they exist according to the Conf parser)
  next unless ($mailsa->{conf}->{descriptions}->{$name}
        || $mailsa->{conf}->{scores}->{$name});

  my $tfs = $mailsa->{conf}->{tflags}->{$name} || '';

  my $src = $mailsa->{conf}->{source_file}->{$name};
  if ( defined $src ) {
    $count_rules{$src}++;
  } else {
    $count_rules{'not_present'}++;
  }

  # skip rules of these tflags, we cannot judge them without more data
  if ($tfs =~ /\b(?:userconf|learn|net)\b/) {
    next;
  }

  # rules that fail lint
  next if $rules_with_errors{$name};

  # subrules with ok parent rules
  if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
    # print "  # ignoring subrule $name: parent rules seem fine\n";
    next;
  }

  # certain tests need to be reversed for "nice" rules
  my $is_nice = 0;
  if ($tfs =~ /\bnice\b/) { $is_nice = 1; }

  my $valid = 1;    # number of nights the rule appears in
  my $so = $plist->[1]->{$plistkey}->{so};
  if (defined $plist->[2]->{$plistkey}->{so}) {
      $so += $plist->[2]->{$plistkey}->{so}; $valid++;
  }
  if (defined $plist->[3]->{$plistkey}->{so}) {
      $so += $plist->[3]->{$plistkey}->{so}; $valid++;
  }
  $so /= $valid;      # average across all 3
  my $adj_so;
  if ($is_nice) {
    $adj_so = 1.0 - $so;            # 0.0 => 1.0
  } else {
    $adj_so = $so;
  }
  next unless ($adj_so < $SO_THRESHOLD);

  my $target = ($is_nice ? 'hpc' : 'spc');
  my $spc = $plist->[1]->{$plistkey}->{$target};
  $spc += $plist->[2]->{$plistkey}->{$target} || 0;
  $spc += $plist->[3]->{$plistkey}->{$target} || 0;
  $spc /= $valid;

  $target = ($is_nice ? 'spc' : 'hpc');
  my $hpc = $plist->[1]->{$plistkey}->{$target};
  $hpc += $plist->[2]->{$plistkey}->{$target} || 0;
  $hpc += $plist->[3]->{$plistkey}->{$target} || 0;
  $hpc /= $valid;

  if ($spc <= 0.0001) {
    if ($hpc <= 0.0001) {
      badrule($name, "no hits at all");
    } else {
      badrule($name, "no hits of target type");
    }
    next;
  }

  badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ".
                        "avg Spam%=".sprintf("%.2f",$spc)." ".
                        "avg Ham%=".sprintf("%.2f",$hpc)
              );
}

foreach my $srcfile (reverse sort keys %killed_rules) {
  my $set = $killed_rules{$srcfile};
  my $count = $count_rules{$srcfile};
  my $c_bad = scalar keys %{$set};

  print "\n$srcfile ($count rules, $c_bad bad):\n\n";

  foreach my $name (sort keys %{$set}) {
    my $reason = $set->{$name};
    print "  $name:  $reason\n";
  }
}

exit;

sub badrule {
  my ($name, $reason) = @_;
  my $src = $mailsa->{conf}->{source_file}->{$name};
  if ( defined $src ) {
    $killed_rules{$src}->{$name} = $reason;
  } else {
    $killed_rules{'not_present'}->{$name} = $reason;
  }

  # if it's a subrule in a meta rule, note this
  # TODO: this only works reliably for lexically-previous meta rules;
  # that's ok for __SUBRULES used in META_RULES, since "M" < "_".
  if ($report_bad_subrules{$name}) {
    $killed_rules{$src}->{$name} .= "\n      # used in:$report_bad_subrules{$name}";
  }

  # if it's a meta rule, note that we can complain about its subrules too
  foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') {
    $report_bad_subrules{$r} .= " ".$name;
  }
}

sub loghtml_die {
  die "$_[0]\nURL: $url\n";
}

