#!/usr/bin/perl # # Yet Another Monitoring script v0.94 # (C) Copyright 2007-2010, Bjarni R. Einarsson # # This program is free software, you may use it and redistribute under # the same terms as Perl itself. # # See http://yamon.klaki.net/ for news, updates and documentation. # use strict; use Data::Dumper; use Sys::Hostname; use Time::HiRes qw( gettimeofday tv_interval ); use Digest::MD5 qw( md5_hex ); my $default_signature = "\n\n-- \nSent by Yamon 0.94 "; # Some very stupid argument handling... my $verbose = 0; my $test_all = 0; my $built_in = 0; my $monitored = 0; foreach my $a (@ARGV) { if ($a eq "-v") { $verbose = 1; } elsif ($a eq "-a") { $test_all = 1; } elsif ($a eq "--builtin") { $built_in = 1; } elsif (-e $a) { require $a; $monitored++; } else { die "Invalid argument and no such file: $a\n"; } } unless ($monitored || $built_in) { eval join('', ); die "$@" if ($@); exit(0); } # Our default set of monitors exit(0) unless ($built_in); Monitor( status_file => '/tmp/yamon-test.stat', status_html => '/tmp/yamon-test.html', #FIXME: Make this work. alerts_to => '-', # Default just sends text to stdout alert_threshold => 3, alert_interval => 3600, keep_history => 50, monitors => { # This machine's basic internet connectivity 'internet' => { test => "check_ping('www.google.com');", depends => [ 'local/network', 'local/dns' ], suppress_alerts => 1, }, 'local/network' => { test => "check_ping(default_gateway());", suppress_alerts => 1, }, 'local/dns' => { test => "gethostbyname('google.com') or die 'FAILED: DNS borked!';", suppress_alerts => 1, }, }, ); ##[ Testing logic ]############################################################ my %MONITOR_ARGS = undef; sub Monitor { (%MONITOR_ARGS) = @_; my $monitors = $MONITOR_ARGS{monitors}; my $status = load_status(%MONITOR_ARGS); my $stamp = time(); my %scheduled = scheduled_tests($monitors, $status); while (my ($m, $mh) = each(%scheduled)) { run_test($m, $mh, $stamp, $monitors, $status); } fire_alerts($monitors, $status, $stamp, %MONITOR_ARGS); save_status($status, %MONITOR_ARGS); return 1; } sub fire_alerts { my ($monitors, $status, $stamp, %args) = @_; my $alerts_to = $args{alerts_to} || return undef; my $th = $args{alert_threshold} || 0; my $new = 0; my %new = ( ); my %details = ( ); my $summary = "OK"; my $fail_count = 0; MON: while (my ($m, $ms) = each(%$status)) { my $mh = $monitors->{$m}; if (($ms->{last_checked} eq $stamp) && (not $mh->{suppress_alerts}) && ($ms->{status}->[0] =~ /^(FAILED|OK)/)) { my $s = $1 if ($ms->{status}->[0] =~ /^([^\/]+)/); for (my $i = 1; $i < ($mh->{alert_threshold} || $th); $i++) { my $si = $1 if ($ms->{status}->[$i] =~ /^([^\/]+)/); if ($s ne $si) { $fail_count++ if ($ms->{last_alert} =~ /^FAILED/); next MON; } } $fail_count++ if ($s =~ /^FAILED/); my $interval = $mh->{alert_interval} || $args{alert_interval} || 900; if ((($ms->{last_alert} || "OK") ne $s) || ($s eq "FAILED" && ((time() - $ms->{last_alert_time}) > $interval))) { $summary = $s if ($s ne "OK"); $ms->{last_alert} = $s; $ms->{last_alert_time} = time(); my $sum = sprintf("%s:%s", $m, $s); my $det = sprintf("%s:%s", $m, $ms->{last_error} || $s); my $to = $mh->{alerts_to} || $alerts_to; push @{ $new{$to} }, $sum; push @{ $details{$to} }, $det if ($det ne $sum); $new++; } } else { $fail_count++ if ($ms->{last_alert} =~ /^FAILED/); } } return unless ($new); foreach my $a_to (keys(%new)) { my @new = @{ $new{$a_to} }; my @details = @{ $details{$a_to} || [ ] }; my $subject = sprintf("%s%d %s %s", ($args{alert_prefix} || ''), $fail_count, $summary, join(" ", @new)); my $body = join("\n", @details); my $sig = $args{signature} || $default_signature; foreach my $to (split(/;\s*/, $a_to)) { $to = '!logger -t yamon -p user.err "$subject / $body"' if ($to eq "syslog"); if ($to eq "-") { my $msg = join("\n", join(" ", @new), @details); printf "%s%d %s %s\n", ($args{alert_prefix} || ''), $fail_count, $summary, $msg; } elsif ($to =~ s/^([\|\!])\s*//) { my $how = $1; my $cmd = $to; # FIXME: Security; is this sufficient? my $b_clean = $body; $subject =~ s/[\$\;\|\&\<\>\"\']/_/g; $b_clean =~ s/[\$\;\|\&\<\>\"\']/_/g; $cmd =~ s/\$subject/$subject/g; $cmd =~ s/\$body/$b_clean/g; open (CMD, "|$cmd") || die "$cmd: $!"; print CMD $body, $sig if ($how eq "|"); close(CMD); print "Ran: $cmd\n" if ($verbose); } elsif ($to =~ /\@/) { if (my $server = $args{smtp_server}) { my $from = $args{smtp_from} || 'yamon@'.hostname(); my $host = $args{smtp_helo} || hostname(); $body =~ s/^\.$/>./gm; eval { check_tcp($server, 25, '', '220', 'HELO '.$host."\r\n", '250', "MAIL FROM: <$from>\r\n", '250', "RCPT TO: <$to>\r\n", '250', "DATA\r\n", '354', "From: <$from>\r\n" ."To: <$to>\r\n" ."Subject: $subject\n\r" ."\r\n" .$body.$sig ."\r\n.\r\n", '250', "QUIT\r\n", '221'); print "E-mailed $to ($subject) via $server\n" if ($verbose); }; die "Error mailing: $@\n" if ($@); } else { open (MAIL, "|mail -s '$subject' $to") || die "mail: $!"; print MAIL $body, $sig; close(MAIL); print "E-mailed $to ($subject) using mail\n" if ($verbose); } } # FIXME: Other targets: # ? HTTP GET/POST } } } sub save_status { my ($status, %args) = @_; my $file = $args{status_file} || die "Need status_file argument"; my $html = $args{status_html}; # Truncate histories... foreach my $mon (keys(%$status)) { my $history = $status->{$mon}->{status} || [ ]; while (@{ $history } > ($args{keep_history} || 10)) { pop @{ $history }; } if (($status->{$mon}->{last_checked} || 0) < time() - 3600*24*7) { delete $status->{$mon}; } } open(STAT, ">$file") || die "$file: $!"; print STAT Data::Dumper->Dump([ $status ], ["status"]); close(STAT); if ($html) { open(HTML, ">$html") || die "$html: $!"; print HTML "\n"; my $count = 0; foreach my $mon (sort keys(%$status)) { my ($stat, $stime) = split('/', $status->{$mon}->{status}->[0]); my $age = (time()-$status->{$mon}->{last_checked} > 1800) ? 'old' : 'new'; printf HTML "\n", $age, $stat, (($count % 2) ? 'e' : 'o'), $mon, sprintf("@ %s", fmt_time($stime)), $stat, fmt_time($status->{$mon}->{last_checked}), $status->{$mon}->{last_error} ; $count++; } print HTML "
TestStatusTested
%s%s%s%s
\n"; close(HTML); } } sub fmt_time { my ($time) = @_; my @t = localtime($time); return sprintf("%4.4d-%2.2d-%2.2d %2.2d:%2.2d", 1900+$t[5], 1+$t[4], $t[3], $t[2], $t[1]); } sub load_status { my (%args) = @_; my $file = $args{status_file} || die "Need status_file argument"; my $status = { }; open(STAT, "<$file") || return $status; my $stat_data = join('', ); close(STAT); eval $stat_data; return $status; } sub run_test { my ($mon, $mon_hash, $stamp, $monitors, $status) = @_; # Basic memoization if ($status->{$mon}->{last_checked} == $stamp) { return $status->{$mon}->{status}->[0]; } # If this test doesn't want to be run too often, just return last status. # - unless it's failing/alerting, in which case we retest. if ((!$test_all) && ($mon_hash->{min_interval}) && ($status->{$mon}->{status}->[0] !~ /^FAILED/) && ($status->{$mon}->{last_alert} !~ /^FAILED/) && ($status->{$mon}->{last_checked} + $mon_hash->{min_interval} > time())) { return $status->{$mon}->{status}->[0]; } # FIXME: If previous status was DEP_FAILED, probe the failed tests # first. If still broken, just stay DEP_FAILED, else continue # with the test. $status->{$mon}->{last_checked} = $stamp; if (my $test = $mon_hash->{test}) { my $t0 = [gettimeofday()]; foreach my $k (keys(%$mon_hash)) { $test =~ s/_${k}_/\$mon_hash->{$k}/g; } printf "%s: %s\n", $mon, $test if ($verbose); if (-e "fail-$mon") { $@ = "FAILED: fail-file in place"; } elsif (-e "succeed-$mon") { $@ = undef; } else { eval $test; } if (my $rrd_spec = $mon_hash->{rrd_time}) { rrd_update($mon, 'time', $rrd_spec, int(1000*tv_interval($t0))); } if (my $err = $@) { chomp $err; if ($err =~ /^FAILED/) { my $stat = "FAILED"; foreach my $dep (@{ $mon_hash->{depends} }) { my $res = run_test($dep, $monitors->{$dep}, $stamp, $monitors, $status); $stat = "DEP_FAILED" if ($res =~ /FAILED/); } unshift @{ $status->{$mon}->{status} }, $stat.'/'.time(); $status->{$mon}->{last_error} = $err; print $err, "\n" if ($verbose);; if (my $rrd_spec = $mon_hash->{rrd_errors}) { rrd_update($mon, 'err', $rrd_spec, 1); } return $stat; } else { unshift @{ $status->{$mon}->{status} }, "BROKEN"; $status->{$mon}->{last_error} = $err; print "BROKEN TEST: ", $err, "\n"; return "BROKEN"; } } else { unshift @{ $status->{$mon}->{status} }, "OK/".time(); delete $status->{$mon}->{last_error}; if (my $rrd_spec = $mon_hash->{rrd_errors}) { rrd_update($mon, 'err', $rrd_spec, 0); } return "OK"; } } else { # Test dependencies and summarize. my $stat = "OK/".time(); foreach my $dep (@{ $mon_hash->{depends} }) { my $res = run_test($dep, $monitors->{$dep}, $stamp, $monitors, $status); $stat = "DEP_FAILED/".time() if ($res =~ /FAILED/); } unshift @{ $status->{$mon}->{status} }, $stat; return $stat; } } sub rrd_update { my ($mon, $var, $rrd_spec, $data) = @_; my $name = join('.', $mon, $var, substr(md5_hex(join('', @$rrd_spec)), 0, 8)); $name =~ s/\//-/g; my $filename = $MONITOR_ARGS{rrd_prefix}.$name; unless (-e $filename) { my $dirname = $1 if ($filename =~ /^(.*)\//); mkdir($dirname); system('rrdtool', 'create', $filename, '--step', @$rrd_spec); } $data =~ s/\%$//; system('rrdtool', 'update', $filename, join(':', time(), $data)); } sub scheduled_tests { my ($monitors, $status) = @_; # Figure out which monitors are depended upon. The ones with no # dependants are "top-level" tests, ones we test (or test their # dependencies if they don't have a test of their own). while (my ($m, $mh) = each(%$monitors)) { foreach my $d (@{ $mh->{depends} }) { push @{ $monitors->{$d}->{dependants} }, $m; } } my %scheduled = ( ); while (my ($m, $mh) = each(%$monitors)) { if (($test_all) or (not $mh->{dependants}) or ($mh->{always}) or ($status->{$m}->{status}->[0] =~ /FAILED/) or ($status->{$m}->{last_alert} =~ /FAILED/)) { $scheduled{$m} = $mh; } } return %scheduled; } ##[ Tests ]#################################################################### use IO::Socket::INET; sub check_tcp { my ($host, $port, @expect) = @_; die "Need host and port" unless ($host and $port); local $SIG{ALRM} = sub { die "FAILED: Timed out.\n" }; eval { alarm(15); die "FAILED: $@\n" unless (my $s = IO::Socket::INET->new( PeerHost => $host, PeerPort => $port )); while (@expect) { my $send = shift @expect; my $want = shift @expect; if ($send) { $s->print($send); chomp $send; print ">>> $send\n" if ($verbose); } if ($want) { my $res; if ($want =~ s/^\*//) { alarm(10); $res = join('', <$s>); printf "<<< %d bytes\n", length($res) if ($verbose); } else { alarm(10); $res = <$s>; print "<<< $res" if ($verbose); } if ($res !~ $want) { $send =~ s/\s+$//s; $send = " to ".$send if ($send); $res =~ s/[\r\n]/ /gs; die "FAILED: Invalid reponse$send: ".substr($res, 0, 200)."\n"; } } } $s->close(); }; alarm(0); die $@ if ($@); } sub check_smtp { my ($host, $port, $from, $rcpt, $f_status, $r_status) = @_; die "Need host" unless ($host); $port = 25 unless ($port); $f_status = 250 unless ($f_status); $r_status = 250 unless ($r_status); my @expect = ("", "^220 ", sprintf("HELO %s\r\n", hostname()), "^250 "); push @expect, sprintf("MAIL FROM: <%s>\r\n", $from), "^$f_status " if ($from); if ($from && $rcpt) { foreach my $r (split(/[,\s]+/, $rcpt)) { push @expect, sprintf("RCPT TO: <%s>\r\n", $r),"^$r_status "; } } return check_tcp($host, $port, @expect, "QUIT\r\n", ""); } sub check_http { my ($host, $port, $path, $method, $regexp, $ip) = @_; die "Need host, port and path" unless ($host and $port and $path); $regexp = '(?si)^HTTP\S+ 200 OK' unless ($regexp); my $req = sprintf("%s %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", $method || 'HEAD', $path, $host); return check_tcp($ip || $host, $port, $req, "*".$regexp); } sub get_syshealth { my ($host, $port, $path, $ip) = @_; die "Need host, port, path" unless ($host and $port and $path); local $SIG{ALRM} = sub { die "FAILED: Timed out.\n" }; my $values = { }; eval { alarm(15); die "FAILED: Can't bind: $@\n" unless (my $s = IO::Socket::INET->new( PeerHost => ($ip || $host), PeerPort => $port )); alarm(30); $s->printf("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", $path, $host); while (<$s>) { chomp; if (/^(\S+):\s+(.*)$/) { $values->{$1} = $2; } } }; alarm(0); die $@ if ($@); return $values; } sub check_syshealth { my ($host, $port, $path, $ranges, $ip, $values) = @_; die "Need host, port, path, ranges" unless ($host and $port and $path and $ranges); # Get values if they aren't already provided. $values ||= get_syshealth($host, $port, $path, $ip); my @problems = ( ); foreach my $k (keys(%$ranges)) { my $v = $values->{$k}; my $r = $ranges->{$k}; print "Testing $k, value is $v\n" if ($verbose); my $problem = ''; if (not defined($v)) { # Negative lower-bounds mean "undefined" is OK... $problem = "$k unknown" unless ($r->[0] < 0); } else { $v =~ s/\%$//; if ($v < $r->[0]) { $problem = "$k: $v lt ".$r->[0]; } elsif ($v > $r->[1]) { $problem = "$k: $v gt ".$r->[1]; } } if ($problem) { foreach my $i (@{ $r }[2..100]) { $problem .= " $i: ".$values->{$i} if ($i); } push @problems, $problem; } } die "FAILED: ".join(', ', @problems)."\n" if (@problems); } sub syshealth { my ($host, $port, $path, $ranges, $rrds, $ip) = @_; die "Need host, port, path, ranges" unless ($host and $port and $path and $ranges); # Get values if they aren't already provided. my $values = get_syshealth($host, $port, $path, $ip); # Update RRD databases if ($rrds) { while (my ($var, $rrd_spec) = each(%$rrds)) { rrd_update(join('.', $host, $port), $var, $rrd_spec, $values->{$var}); } } # Run the checks. return check_syshealth($host, $port, $path, $ranges, $ip, $values); } sub check_dnsbl { my ($ip, $bl, $pattern) = @_; my $check = join('.', (reverse split(/\./, $ip)), $bl); my ($name,$aliases,$addrtype,$length,@addrs) = gethostbyname($check); my $result = inet_ntoa($addrs[0]) if (@addrs); die "FAILED: on $bl" if ($result =~ ($pattern || '^127\.0\.0')); } sub check_ssh { my ($host, $port) = @_; return check_tcp($host, $port || 22, undef, '^SSH-'); } sub check_cmd { my ($command, $expect) = @_; open (EXE, "$command 2>&1 |") || die "exec($command): $!"; my $result = join('', ); close(EXE); die "FAILED: $command" if ($result !~ $expect); } sub check_ping { my ($host) = @_; my $msg = ''; my $res = ''; open (PING, "ping -nc 3 $host 2>&1 |") || die "ping: $!"; while () { chomp; $msg .= $_; $res = $_ if (/packet loss/); } close(PING); die "FAILED: $msg\n" if ($res eq ''); die "FAILED: $res\n" if ($res =~ /100\%/); } sub default_gateway { open (NETSTAT, "netstat -rn|") || die "netstat: $!"; while () { if (/^(0\.0\.0\.0|default)\s+(\d+\.\d+\.\d+\.\d+)/) { close(NETSTAT); return $2; } } close(NETSTAT); die "FAILED: Default gateway not found!\n"; } # vi:ts=2 expandtab