#!/usr/bin/perl
#
# Yet Another Monitoring script v0.94
# (C) Copyright 2007-2010, Bjarni R. Einarsson
#
# This program is free software, you may use it and redistribute under
# the same terms as Perl itself.
#
# See http://yamon.klaki.net/ for news, updates and documentation.
#
use strict;
use Data::Dumper;
use Sys::Hostname;
use Time::HiRes qw( gettimeofday tv_interval );
use Digest::MD5 qw( md5_hex );
my $default_signature = "\n\n-- \nSent by Yamon 0.94 ";
# Some very stupid argument handling...
my $verbose = 0;
my $test_all = 0;
my $built_in = 0;
my $monitored = 0;
foreach my $a (@ARGV)
{
if ($a eq "-v")
{
$verbose = 1;
}
elsif ($a eq "-a")
{
$test_all = 1;
}
elsif ($a eq "--builtin")
{
$built_in = 1;
}
elsif (-e $a)
{
require $a;
$monitored++;
}
else
{
die "Invalid argument and no such file: $a\n";
}
}
unless ($monitored || $built_in)
{
eval join('', );
die "$@" if ($@);
exit(0);
}
# Our default set of monitors
exit(0) unless ($built_in);
Monitor(
status_file => '/tmp/yamon-test.stat',
status_html => '/tmp/yamon-test.html', #FIXME: Make this work.
alerts_to => '-', # Default just sends text to stdout
alert_threshold => 3,
alert_interval => 3600,
keep_history => 50,
monitors => {
# This machine's basic internet connectivity
'internet' => {
test => "check_ping('www.google.com');",
depends => [ 'local/network', 'local/dns' ],
suppress_alerts => 1,
},
'local/network' => {
test => "check_ping(default_gateway());",
suppress_alerts => 1,
},
'local/dns' => {
test => "gethostbyname('google.com') or die 'FAILED: DNS borked!';",
suppress_alerts => 1,
},
},
);
##[ Testing logic ]############################################################
my %MONITOR_ARGS = undef;
sub Monitor
{
(%MONITOR_ARGS) = @_;
my $monitors = $MONITOR_ARGS{monitors};
my $status = load_status(%MONITOR_ARGS);
my $stamp = time();
my %scheduled = scheduled_tests($monitors, $status);
while (my ($m, $mh) = each(%scheduled))
{
run_test($m, $mh, $stamp, $monitors, $status);
}
fire_alerts($monitors, $status, $stamp, %MONITOR_ARGS);
save_status($status, %MONITOR_ARGS);
return 1;
}
sub fire_alerts
{
my ($monitors, $status, $stamp, %args) = @_;
my $alerts_to = $args{alerts_to} || return undef;
my $th = $args{alert_threshold} || 0;
my $new = 0;
my %new = ( );
my %details = ( );
my $summary = "OK";
my $fail_count = 0;
MON: while (my ($m, $ms) = each(%$status))
{
my $mh = $monitors->{$m};
if (($ms->{last_checked} eq $stamp) &&
(not $mh->{suppress_alerts}) &&
($ms->{status}->[0] =~ /^(FAILED|OK)/))
{
my $s = $1 if ($ms->{status}->[0] =~ /^([^\/]+)/);
for (my $i = 1; $i < ($mh->{alert_threshold} || $th); $i++)
{
my $si = $1 if ($ms->{status}->[$i] =~ /^([^\/]+)/);
if ($s ne $si)
{
$fail_count++ if ($ms->{last_alert} =~ /^FAILED/);
next MON;
}
}
$fail_count++ if ($s =~ /^FAILED/);
my $interval = $mh->{alert_interval} || $args{alert_interval} || 900;
if ((($ms->{last_alert} || "OK") ne $s) ||
($s eq "FAILED" && ((time() - $ms->{last_alert_time}) > $interval)))
{
$summary = $s if ($s ne "OK");
$ms->{last_alert} = $s;
$ms->{last_alert_time} = time();
my $sum = sprintf("%s:%s", $m, $s);
my $det = sprintf("%s:%s", $m, $ms->{last_error} || $s);
my $to = $mh->{alerts_to} || $alerts_to;
push @{ $new{$to} }, $sum;
push @{ $details{$to} }, $det if ($det ne $sum);
$new++;
}
}
else
{
$fail_count++ if ($ms->{last_alert} =~ /^FAILED/);
}
}
return unless ($new);
foreach my $a_to (keys(%new))
{
my @new = @{ $new{$a_to} };
my @details = @{ $details{$a_to} || [ ] };
my $subject = sprintf("%s%d %s %s",
($args{alert_prefix} || ''), $fail_count,
$summary, join(" ", @new));
my $body = join("\n", @details);
my $sig = $args{signature} || $default_signature;
foreach my $to (split(/;\s*/, $a_to))
{
$to = '!logger -t yamon -p user.err "$subject / $body"'
if ($to eq "syslog");
if ($to eq "-")
{
my $msg = join("\n", join(" ", @new), @details);
printf "%s%d %s %s\n",
($args{alert_prefix} || ''), $fail_count, $summary, $msg;
}
elsif ($to =~ s/^([\|\!])\s*//)
{
my $how = $1;
my $cmd = $to;
# FIXME: Security; is this sufficient?
my $b_clean = $body;
$subject =~ s/[\$\;\|\&\<\>\"\']/_/g;
$b_clean =~ s/[\$\;\|\&\<\>\"\']/_/g;
$cmd =~ s/\$subject/$subject/g;
$cmd =~ s/\$body/$b_clean/g;
open (CMD, "|$cmd") || die "$cmd: $!";
print CMD $body, $sig if ($how eq "|");
close(CMD);
print "Ran: $cmd\n" if ($verbose);
}
elsif ($to =~ /\@/)
{
if (my $server = $args{smtp_server})
{
my $from = $args{smtp_from} || 'yamon@'.hostname();
my $host = $args{smtp_helo} || hostname();
$body =~ s/^\.$/>./gm;
eval {
check_tcp($server, 25,
'', '220',
'HELO '.$host."\r\n", '250',
"MAIL FROM: <$from>\r\n", '250',
"RCPT TO: <$to>\r\n", '250',
"DATA\r\n", '354',
"From: <$from>\r\n"
."To: <$to>\r\n"
."Subject: $subject\n\r"
."\r\n"
.$body.$sig
."\r\n.\r\n", '250',
"QUIT\r\n", '221');
print "E-mailed $to ($subject) via $server\n" if ($verbose);
};
die "Error mailing: $@\n" if ($@);
}
else
{
open (MAIL, "|mail -s '$subject' $to") || die "mail: $!";
print MAIL $body, $sig;
close(MAIL);
print "E-mailed $to ($subject) using mail\n" if ($verbose);
}
}
# FIXME: Other targets:
# ? HTTP GET/POST
}
}
}
sub save_status
{
my ($status, %args) = @_;
my $file = $args{status_file} || die "Need status_file argument";
my $html = $args{status_html};
# Truncate histories...
foreach my $mon (keys(%$status))
{
my $history = $status->{$mon}->{status} || [ ];
while (@{ $history } > ($args{keep_history} || 10))
{
pop @{ $history };
}
if (($status->{$mon}->{last_checked} || 0) < time() - 3600*24*7)
{
delete $status->{$mon};
}
}
open(STAT, ">$file") || die "$file: $!";
print STAT Data::Dumper->Dump([ $status ], ["status"]);
close(STAT);
if ($html) {
open(HTML, ">$html") || die "$html: $!";
print HTML "Test | Status | Tested |
\n";
my $count = 0;
foreach my $mon (sort keys(%$status))
{
my ($stat, $stime) = split('/', $status->{$mon}->{status}->[0]);
my $age = (time()-$status->{$mon}->{last_checked} > 1800) ? 'old' : 'new';
printf HTML "%s | %s | %s | %s |
\n",
$age, $stat, (($count % 2) ? 'e' : 'o'),
$mon, sprintf("@ %s", fmt_time($stime)), $stat,
fmt_time($status->{$mon}->{last_checked}),
$status->{$mon}->{last_error} ;
$count++;
}
print HTML "
\n";
close(HTML);
}
}
sub fmt_time
{
my ($time) = @_;
my @t = localtime($time);
return sprintf("%4.4d-%2.2d-%2.2d %2.2d:%2.2d",
1900+$t[5], 1+$t[4], $t[3], $t[2], $t[1]);
}
sub load_status
{
my (%args) = @_;
my $file = $args{status_file} || die "Need status_file argument";
my $status = { };
open(STAT, "<$file") || return $status;
my $stat_data = join('', );
close(STAT);
eval $stat_data;
return $status;
}
sub run_test
{
my ($mon, $mon_hash, $stamp, $monitors, $status) = @_;
# Basic memoization
if ($status->{$mon}->{last_checked} == $stamp)
{
return $status->{$mon}->{status}->[0];
}
# If this test doesn't want to be run too often, just return last status.
# - unless it's failing/alerting, in which case we retest.
if ((!$test_all) &&
($mon_hash->{min_interval}) &&
($status->{$mon}->{status}->[0] !~ /^FAILED/) &&
($status->{$mon}->{last_alert} !~ /^FAILED/) &&
($status->{$mon}->{last_checked} + $mon_hash->{min_interval} > time()))
{
return $status->{$mon}->{status}->[0];
}
# FIXME: If previous status was DEP_FAILED, probe the failed tests
# first. If still broken, just stay DEP_FAILED, else continue
# with the test.
$status->{$mon}->{last_checked} = $stamp;
if (my $test = $mon_hash->{test})
{
my $t0 = [gettimeofday()];
foreach my $k (keys(%$mon_hash))
{
$test =~ s/_${k}_/\$mon_hash->{$k}/g;
}
printf "%s: %s\n", $mon, $test if ($verbose);
if (-e "fail-$mon")
{
$@ = "FAILED: fail-file in place";
}
elsif (-e "succeed-$mon")
{
$@ = undef;
}
else
{
eval $test;
}
if (my $rrd_spec = $mon_hash->{rrd_time})
{
rrd_update($mon, 'time', $rrd_spec, int(1000*tv_interval($t0)));
}
if (my $err = $@)
{
chomp $err;
if ($err =~ /^FAILED/)
{
my $stat = "FAILED";
foreach my $dep (@{ $mon_hash->{depends} })
{
my $res = run_test($dep, $monitors->{$dep}, $stamp, $monitors, $status);
$stat = "DEP_FAILED" if ($res =~ /FAILED/);
}
unshift @{ $status->{$mon}->{status} }, $stat.'/'.time();
$status->{$mon}->{last_error} = $err;
print $err, "\n" if ($verbose);;
if (my $rrd_spec = $mon_hash->{rrd_errors})
{
rrd_update($mon, 'err', $rrd_spec, 1);
}
return $stat;
}
else
{
unshift @{ $status->{$mon}->{status} }, "BROKEN";
$status->{$mon}->{last_error} = $err;
print "BROKEN TEST: ", $err, "\n";
return "BROKEN";
}
}
else
{
unshift @{ $status->{$mon}->{status} }, "OK/".time();
delete $status->{$mon}->{last_error};
if (my $rrd_spec = $mon_hash->{rrd_errors})
{
rrd_update($mon, 'err', $rrd_spec, 0);
}
return "OK";
}
}
else
{
# Test dependencies and summarize.
my $stat = "OK/".time();
foreach my $dep (@{ $mon_hash->{depends} })
{
my $res = run_test($dep, $monitors->{$dep}, $stamp, $monitors, $status);
$stat = "DEP_FAILED/".time() if ($res =~ /FAILED/);
}
unshift @{ $status->{$mon}->{status} }, $stat;
return $stat;
}
}
sub rrd_update
{
my ($mon, $var, $rrd_spec, $data) = @_;
my $name = join('.', $mon, $var, substr(md5_hex(join('', @$rrd_spec)), 0, 8));
$name =~ s/\//-/g;
my $filename = $MONITOR_ARGS{rrd_prefix}.$name;
unless (-e $filename) {
my $dirname = $1 if ($filename =~ /^(.*)\//);
mkdir($dirname);
system('rrdtool', 'create', $filename, '--step', @$rrd_spec);
}
$data =~ s/\%$//;
system('rrdtool', 'update', $filename, join(':', time(), $data));
}
sub scheduled_tests
{
my ($monitors, $status) = @_;
# Figure out which monitors are depended upon. The ones with no
# dependants are "top-level" tests, ones we test (or test their
# dependencies if they don't have a test of their own).
while (my ($m, $mh) = each(%$monitors))
{
foreach my $d (@{ $mh->{depends} })
{
push @{ $monitors->{$d}->{dependants} }, $m;
}
}
my %scheduled = ( );
while (my ($m, $mh) = each(%$monitors))
{
if (($test_all) or
(not $mh->{dependants}) or
($mh->{always}) or
($status->{$m}->{status}->[0] =~ /FAILED/) or
($status->{$m}->{last_alert} =~ /FAILED/))
{
$scheduled{$m} = $mh;
}
}
return %scheduled;
}
##[ Tests ]####################################################################
use IO::Socket::INET;
sub check_tcp
{
my ($host, $port, @expect) = @_;
die "Need host and port" unless ($host and $port);
local $SIG{ALRM} = sub { die "FAILED: Timed out.\n" };
eval {
alarm(15);
die "FAILED: $@\n"
unless (my $s = IO::Socket::INET->new( PeerHost => $host,
PeerPort => $port ));
while (@expect)
{
my $send = shift @expect;
my $want = shift @expect;
if ($send)
{
$s->print($send);
chomp $send;
print ">>> $send\n" if ($verbose);
}
if ($want)
{
my $res;
if ($want =~ s/^\*//)
{
alarm(10);
$res = join('', <$s>);
printf "<<< %d bytes\n", length($res) if ($verbose);
}
else
{
alarm(10);
$res = <$s>;
print "<<< $res" if ($verbose);
}
if ($res !~ $want)
{
$send =~ s/\s+$//s;
$send = " to ".$send if ($send);
$res =~ s/[\r\n]/ /gs;
die "FAILED: Invalid reponse$send: ".substr($res, 0, 200)."\n";
}
}
}
$s->close();
};
alarm(0);
die $@ if ($@);
}
sub check_smtp
{
my ($host, $port, $from, $rcpt, $f_status, $r_status) = @_;
die "Need host" unless ($host);
$port = 25 unless ($port);
$f_status = 250 unless ($f_status);
$r_status = 250 unless ($r_status);
my @expect = ("", "^220 ",
sprintf("HELO %s\r\n", hostname()), "^250 ");
push @expect, sprintf("MAIL FROM: <%s>\r\n", $from), "^$f_status " if ($from);
if ($from && $rcpt)
{
foreach my $r (split(/[,\s]+/, $rcpt))
{
push @expect, sprintf("RCPT TO: <%s>\r\n", $r),"^$r_status ";
}
}
return check_tcp($host, $port, @expect, "QUIT\r\n", "");
}
sub check_http
{
my ($host, $port, $path, $method, $regexp, $ip) = @_;
die "Need host, port and path" unless ($host and $port and $path);
$regexp = '(?si)^HTTP\S+ 200 OK' unless ($regexp);
my $req = sprintf("%s %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",
$method || 'HEAD', $path, $host);
return check_tcp($ip || $host, $port, $req, "*".$regexp);
}
sub get_syshealth
{
my ($host, $port, $path, $ip) = @_;
die "Need host, port, path" unless ($host and $port and $path);
local $SIG{ALRM} = sub { die "FAILED: Timed out.\n" };
my $values = { };
eval {
alarm(15);
die "FAILED: Can't bind: $@\n"
unless (my $s = IO::Socket::INET->new( PeerHost => ($ip || $host),
PeerPort => $port ));
alarm(30);
$s->printf("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",
$path, $host);
while (<$s>)
{
chomp;
if (/^(\S+):\s+(.*)$/)
{
$values->{$1} = $2;
}
}
};
alarm(0);
die $@ if ($@);
return $values;
}
sub check_syshealth
{
my ($host, $port, $path, $ranges, $ip, $values) = @_;
die "Need host, port, path, ranges"
unless ($host and $port and $path and $ranges);
# Get values if they aren't already provided.
$values ||= get_syshealth($host, $port, $path, $ip);
my @problems = ( );
foreach my $k (keys(%$ranges))
{
my $v = $values->{$k};
my $r = $ranges->{$k};
print "Testing $k, value is $v\n" if ($verbose);
my $problem = '';
if (not defined($v))
{
# Negative lower-bounds mean "undefined" is OK...
$problem = "$k unknown" unless ($r->[0] < 0);
}
else
{
$v =~ s/\%$//;
if ($v < $r->[0])
{
$problem = "$k: $v lt ".$r->[0];
}
elsif ($v > $r->[1])
{
$problem = "$k: $v gt ".$r->[1];
}
}
if ($problem)
{
foreach my $i (@{ $r }[2..100])
{
$problem .= " $i: ".$values->{$i} if ($i);
}
push @problems, $problem;
}
}
die "FAILED: ".join(', ', @problems)."\n" if (@problems);
}
sub syshealth
{
my ($host, $port, $path, $ranges, $rrds, $ip) = @_;
die "Need host, port, path, ranges"
unless ($host and $port and $path and $ranges);
# Get values if they aren't already provided.
my $values = get_syshealth($host, $port, $path, $ip);
# Update RRD databases
if ($rrds) {
while (my ($var, $rrd_spec) = each(%$rrds)) {
rrd_update(join('.', $host, $port), $var, $rrd_spec, $values->{$var});
}
}
# Run the checks.
return check_syshealth($host, $port, $path, $ranges, $ip, $values);
}
sub check_dnsbl
{
my ($ip, $bl, $pattern) = @_;
my $check = join('.', (reverse split(/\./, $ip)), $bl);
my ($name,$aliases,$addrtype,$length,@addrs) = gethostbyname($check);
my $result = inet_ntoa($addrs[0]) if (@addrs);
die "FAILED: on $bl" if ($result =~ ($pattern || '^127\.0\.0'));
}
sub check_ssh
{
my ($host, $port) = @_;
return check_tcp($host, $port || 22, undef, '^SSH-');
}
sub check_cmd
{
my ($command, $expect) = @_;
open (EXE, "$command 2>&1 |") || die "exec($command): $!";
my $result = join('', );
close(EXE);
die "FAILED: $command" if ($result !~ $expect);
}
sub check_ping
{
my ($host) = @_;
my $msg = '';
my $res = '';
open (PING, "ping -nc 3 $host 2>&1 |") || die "ping: $!";
while ()
{
chomp;
$msg .= $_;
$res = $_ if (/packet loss/);
}
close(PING);
die "FAILED: $msg\n" if ($res eq '');
die "FAILED: $res\n" if ($res =~ /100\%/);
}
sub default_gateway
{
open (NETSTAT, "netstat -rn|") || die "netstat: $!";
while ()
{
if (/^(0\.0\.0\.0|default)\s+(\d+\.\d+\.\d+\.\d+)/)
{
close(NETSTAT);
return $2;
}
}
close(NETSTAT);
die "FAILED: Default gateway not found!\n";
}
# vi:ts=2 expandtab