#!/usr/bin/perl
#
# Copyright (c) 2010-2012, The Trusted Domain Project.  All rights reserved.
#
# Script to import rate and spam counts from the stats tables into rrdtables.
# EXPERIMENTAL.

###
### Setup
###

use strict;
use warnings;

use DBI;
use File::Basename;
use Getopt::Long;
use IO::Handle;
use POSIX;

use RRDs;

require DBD::mysql;

# general
my $progname      = basename($0);
my $version       = "2.11.0";
my $verbose       = 0;
my $helponly      = 0;
my $showversion   = 0;

my @domains;

my $dbi_s;
my $dbi_h;
my $dbi_a;

# DB parameters
my $def_dbhost    = "localhost";
my $def_dbname    = "opendkim";
my $def_dbuser    = "opendkim";
my $def_dbpasswd  = "opendkim";
my $def_dbport    = "3306";
my $dbhost;
my $dbname;
my $dbuser;
my $dbpasswd;
my $dbport;
my $def_rrdroot   = "/var/db/reprrd";
my $def_interval  = 1;
my $def_levels    = 2;

my $rrdroot;
my $interval;
my $levels;
my $donull;
my $dateclause;

my $tmpin;
my $idx;

my $dbscheme     = "mysql";

# Minimum messages.id value; used to mark the start of useful data
my $minmsgid     = 0;

###
### NO user-serviceable parts beyond this point
###

sub rrd_path
{
	my $name = shift;
	my $type = shift;
	my $path;
	my $c;

	if (!defined($name))
	{
		die "$progname: rrd_path() called without a domain name\n";
	}

	$path = $rrdroot . "/" . $type;
	if (! -d $path)
	{
		mkdir $path
		or die "$progname: cannot create directory $path: $!\n";
	}

	for ($c = 0; $c < $levels; $c++)
	{
		$path = $path . "/" . substr($name, $c, 1);
		if (! -d $path)
		{
			mkdir $path
			or die "$progname: cannot create directory $path: $!\n";
		}
	}

	$path = $path . "/" . $name;

	return $path;
}

sub make_rrd
{
	my $name = shift;
	my $type = shift;
	my $timestamp = shift;
	my $path = $rrdroot;
	my $c;

	$path = rrd_path($name, $type);

	if (-e $path)
	{
		if (! -f $path)
		{
			die "$progname: $path is not a regular file\n";
		}
	}
	else
	{
		if ($verbose)
		{
			print STDOUT "$progname: creating RRD table $path\n";
		}

		$timestamp--;

		RRDs::create($path,
		             "--start",
		             "$timestamp",
		             "--step",
		             "3600",
		             "DS:$type:GAUGE:3600:0:U",
		             "RRA:AVERAGE:0.5:1:336",
		             "RRA:HWPREDICT:336:0.1:0.0035:24")
		or die "$progname: failed to create RRD table $path: " . RRDs::error . "\n";

		RRDs::tune($path,
		           "--deltaneg", "100",
		           "--failure-threshold", "1",
		           "--window-length", "2")
		or die "$progname: failed to tune RRD table $path: " . RRDs::error . "\n";
	}
}

sub update_rrd
{
	my $domain;
	my $type;
	my $timestamp;
	my $value;
	my $path;
	my $data;
	my $cur;
	my $last;

	($domain, $type, $timestamp, $value) = @_;

	$path = rrd_path($domain, $type);

	if ($verbose)
	{
		print STDOUT "$progname: updating RRD table $path for timestamp $timestamp\n";
	}

	$last = RRDs::last($path)
	or die "$progname: failed to query RRD table $path: " . RRDs::error . "\n";

	if ($last >= $timestamp)
	{
		if ($verbose > 0)
		{
			print STDOUT "$progname: skipping update to $path for timestamp $timestamp\n";
		}

		return;
	}

	my ($start, $step, $columns, $values) = RRDs::fetch($path, "AVERAGE")
	or die "$progname: failed to fetch RRD table $path: " . RRDs::error . "\n";

	for ($cur = $last + $step; $cur < $timestamp; $cur = $cur + $step)
	{
		$data = ${cur} . "\@0";

		if ($verbose > 1)
		{
			print STDOUT "\t$data (FILL)\n";
		}

		RRDs::update($path, $data)
		or die "$progname: failed to update RRD table $path: " . RRDs::error . "\n";
	}

	$data = ${timestamp} . "@" . ${value};

	if ($verbose > 1)
	{
		print STDOUT "\t$data\n";
	}

	RRDs::update($path, $data)
	or die "$progname: failed to update RRD table $path: " . RRDs::error . "\n";
}

sub usage
{
	print STDERR "$progname: usage: $progname [options]\n";
	print STDERR "\t--dbhost=host      database host [$def_dbhost]\n";
	print STDERR "\t--dbname=name      database name [$def_dbname]\n";
	print STDERR "\t--dbpasswd=passwd  database password [$def_dbpasswd]\n";
	print STDERR "\t--dbport=port      database port [$def_dbport]\n";
	print STDERR "\t--dbuser=user      database user [$def_dbuser]\n";
	print STDERR "\t--domains=list     list of domains to process\n";
	print STDERR "\t--interval=hour    pull the previous n hours of data [$def_interval]\n";
	print STDERR "\t--levels=n         hashing depth [$def_levels]\n";
	print STDERR "\t--help             print help and exit\n";
	print STDERR "\t--null             include the null sender\n";
	print STDERR "\t--output=file      output file\n";
	print STDERR "\t--rrdroot=path     root of RRD database tree [$def_rrdroot]\n";
	print STDERR "\t--verbose          verbose output\n";
	print STDERR "\t--version          print version and exit\n";
}

#
# parse command line arguments
#

my $opt_retval = &Getopt::Long::GetOptions ('dbhost=s' => \$dbhost,
                                            'dbname=s' => \$dbname,
                                            'dbpasswd=s' => \$dbpasswd,
                                            'dbport=s' => \$dbport,
                                            'dbuser=s' => \$dbuser,
                                            'domain=s@' => \@domains,
                                            'help!' => \$helponly,
                                            'interval=i' => \$interval,
                                            'levels=i' => \$levels,
                                            'null!' => \$donull,
                                            'rrdroot=s' => \$rrdroot,
                                            'verbose+' => \$verbose,
                                            'version!' => \$showversion,
                                           );

if (!$opt_retval || $helponly)
{
	usage();

	if ($helponly)
	{
		exit(0);
	}
	else
	{
		exit(1);
	}
}

if ($showversion)
{
	print STDOUT "$progname v$version\n";
	exit(0);
}

#
# apply defaults
#

if (!defined($dbhost))
{
	if (defined($ENV{'OPENDKIM_DBHOST'}))
	{
		$dbhost = $ENV{'OPENDKIM_DBHOST'};
	}
	else
	{
		$dbhost = $def_dbhost;
	}
}

if (!defined($dbname))
{
	if (defined($ENV{'OPENDKIM_DB'}))
	{
		$dbname = $ENV{'OPENDKIM_DB'};
	}
	else
	{
		$dbname = $def_dbname;
	}
}

if (!defined($dbpasswd))
{
	if (defined($ENV{'OPENDKIM_PASSWORD'}))
	{
		$dbpasswd = $ENV{'OPENDKIM_PASSWORD'};
	}
	else
	{
		$dbpasswd = $def_dbpasswd;
	}
}

if (!defined($dbport))
{
	if (defined($ENV{'OPENDKIM_PORT'}))
	{
		$dbport = $ENV{'OPENDKIM_PORT'};
	}
	else
	{
		$dbport = $def_dbport;
	}
}

if (!defined($dbuser))
{
	if (defined($ENV{'OPENDKIM_USER'}))
	{
		$dbuser = $ENV{'OPENDKIM_USER'};
	}
	else
	{
		$dbuser = $def_dbuser;
	}
}

if (defined($ENV{'OPENDKIM_MINMSGID'}))
{
	$minmsgid = $ENV{'OPENDKIM_MINMSGID'};
}

if (!defined($rrdroot))
{
	if (defined($ENV{'OPENDKIM_RRDROOT'}))
	{
		$rrdroot = $ENV{'OPENDKIM_RRDROOT'};
	}
	else
	{
		$rrdroot = $def_rrdroot;
	}
}

if (!defined($levels))
{
	$levels = $def_levels;
}

if (!defined($interval))
{
	$interval = $def_interval;
}

#
# If any of the domains started with "/" or "./", convert them into
# whatever's in the files they reference.
#

if (scalar @domains > 0)
{
	foreach $idx (0 .. $#domains)
	{
		if (substr($domains[$idx], 0, 1) eq "/" ||
		    substr($domains[$idx], 0, 2) eq "./")
		{
			if (!open($tmpin, "<", $domains[$idx]))
			{
				print STDERR "$progname: cannot open $domains[$idx]: $!\n";
				$dbi_s->finish;
				$dbi_h->disconnect;
				exit(1);
			}

			while (<$tmpin>)
			{
				# ignore comments
				s/#.*//;

				# ignore blank lines
				next if /^(\s)*$/;

				chomp;

				push @domains, lc($_);
			}

			close($tmpin);

			delete $domains[$idx];
		}
		else
		{
			$domains[$idx] = lc($domains[$idx]);
		}
	}
}

if ($verbose)
{
	print STDERR "$progname: started at " . localtime() . "\n";
}

#
# open the SQL database
#

my $dbi_dsn = "DBI:" . $dbscheme . ":database=" . $dbname .
              ";host=" . $dbhost . ";port=" . $dbport;

$dbi_h = DBI->connect($dbi_dsn, $dbuser, $dbpasswd, { PrintError => 0 });
if (!defined($dbi_h))
{
	print STDERR "$progname: unable to connect to database: $DBI::errstr\n";
	exit(1);
}

if ($verbose)
{
	print STDERR "$progname: connected to database\n";
}

#
# Pull data per domain per hour within the interval (0 = epoch)
#

if ($verbose)
{
	print STDERR "$progname: requesting hourly flow data\n";
}

if ($interval == 0)
{
	$dateclause = "";
}
else
{
	$dateclause = "AND msgtime >= DATE_SUB(CURRENT_TIMESTAMP(), INTERVAL $interval HOUR)";
}

if ($donull)
{
	$dbi_s = $dbi_h->prepare("SELECT UNIX_TIMESTAMP(TIMESTAMPADD(HOUR,
								     HOUR(msgtime),
								     DATE(msgtime))),
					 COUNT(messages.id),
					 SUM(SIGN(spam))
				  FROM messages
	                          WHERE (sigcount = 0 OR
	                                 messages.id NOT IN
	                                     (SELECT DISTINCT message
	                                      FROM signatures
	                                      WHERE pass = 1)) 
				  AND NOT spam = -1
	                          AND msgtime < TIMESTAMPADD(HOUR,
	                                                     HOUR(CURRENT_TIMESTAMP()),
	                                                     DATE(CURRENT_TIMESTAMP()))
	                          $dateclause
				  GROUP BY 1
				  ORDER BY 1");

	if (!$dbi_s->execute)
	{
		print STDERR "$progname: SELECT failed: " . $dbi_h->errstr . "\n";
		$dbi_s->finish;
		$dbi_h->disconnect;
		exit(1);
	}

	while ($dbi_a = $dbi_s->fetchrow_arrayref())
	{
		my $timestamp;
		my $count;
		my $spam;
		my $ratio;

		$timestamp = $dbi_a->[0];
		$count = $dbi_a->[1];
		$spam = $dbi_a->[2];

		if (!defined($spam))
		{
			next;
		}

		$ratio = $spam / $count;

		make_rrd("unsigned", "messages", $timestamp);
		update_rrd("unsigned", "messages", $timestamp, $count);

		make_rrd("unsigned", "spam", $timestamp);
		update_rrd("unsigned", "spam", $timestamp, $ratio);
	}

	$dbi_s->finish;
}

$dbi_s = $dbi_h->prepare("SELECT domains.name,
                                 UNIX_TIMESTAMP(TIMESTAMPADD(HOUR,
                                                             HOUR(msgtime),
                                                             DATE(msgtime))),
                                 COUNT(messages.id),
                                 SUM(SIGN(spam))
                          FROM messages
                          JOIN signatures
                               ON signatures.message = messages.id
                          JOIN domains
                               ON signatures.domain = domains.id
                          WHERE NOT spam = -1
                          $dateclause
                          AND msgtime < TIMESTAMPADD(HOUR,
                                                     HOUR(CURRENT_TIMESTAMP()),
                                                     DATE(CURRENT_TIMESTAMP()))
                          GROUP BY 1, 2
                          ORDER BY 1, 2");

if (!$dbi_s->execute)
{
	print STDERR "$progname: SELECT failed: " . $dbi_h->errstr . "\n";
	$dbi_s->finish;
	$dbi_h->disconnect;
	exit(1);
}

while ($dbi_a = $dbi_s->fetchrow_arrayref())
{
	my $domain;
	my $timestamp;
	my $count;
	my $spam;
	my $ratio;

	$domain = $dbi_a->[0];
	$timestamp = $dbi_a->[1];
	$count = $dbi_a->[2];
	$spam = $dbi_a->[3];

	if (!defined($spam))
	{
		next;
	}

	if ($domain eq "(null)")
	{
		$domain = "null";
	}

	$ratio = $spam / $count;
	$domain = lc($domain);

	if (@domains && grep({$_ eq $domain} @domains) == 0)
	{
		next;
	}

	make_rrd($domain, "messages", $timestamp);
	update_rrd($domain, "messages", $timestamp, $count);

	make_rrd($domain, "spam", $timestamp);
	update_rrd($domain, "spam", $timestamp, $ratio);
}

$dbi_s->finish;

#
# all done!
#

if ($verbose)
{
	print STDERR "$progname: terminating at " . localtime() . "\n";
}

$dbi_h->disconnect;

exit(0);
