#!/usr/bin/perl
#
# This is a typical Yet Another Monitor definition for monitoring a machine
# named "server.com" on ip 1.2.3.4 which is running sshd, a webserver and
# a mail server.
#
# Dependencies are in place to differentiate between a failed service,
# a dead machine and various network problems. The comments give lots of
# details.
#
# Default round-robin database layout, shared by every rrd_* setting below.
# Element 0 is the rrdtool --step value; the remaining elements define one
# GAUGE data source and three archives keeping the latest, maximum and
# average values over a few time periods. Adjust to taste.
my @RRD_GAUGE_SPEC = qw(
  300
  DS:data:GAUGE:300:0:U
  RRA:LAST:0.5:1:180
  RRA:MAX:0.9:6:360
  RRA:AVERAGE:0.5:6:360
);
Monitor(
  # This tells Yamon where to store the state of the world and how much
  # history to keep for each test. Somewhere under /var would probably
  # make a good place.
  status_file  => '/tmp/yamon-SAMPLE.stat',
  status_html  => '/tmp/yamon-SAMPLE.html',  # Test status as an HTML table
  keep_history => 50,

  # File-name prefix for RRD databases
  rrd_prefix => '/tmp/yamon-rrd.',

  # These lines tell Yamon to use direct SMTP to the named server instead
  # of invoking the 'mail' utility:
  smtp_server => 'mail.isp.com',
  smtp_helo   => 'your.machine.foo',  # optional: Your machine's name
  smtp_from   => 'you@isp.com',       # optional: The from-address of sent mail

  # This is where alerts go. Multiple destinations may be listed,
  # separated by a semicolon (;), optionally followed by spaces.
  # This can be overridden on a per-test basis.
  #
  # Possible destinations:
  #   syslog
  #   email@domain.foo
  #   | /path/to/program -args "$subject"         # Alert body on stdin
  #   ! /path/to/program -args "$subject $body"   # Nothing on stdin
  #
  alerts_to => 'syslog; your@email.address.foo; other@email.bar',

  # This is how many failures are required before an alert is fired.
  # This can be overridden on a per-test basis.
  alert_threshold => 3,

  # This tells Yamon to re-send alerts once an hour. This may be excessive.
  # This can be overridden on a per-test basis.
  alert_interval => 3600,

  # This is prefixed to the subject of all sent alerts (so you can sort
  # your e-mail better and/or differentiate between different Yamons).
  alert_prefix => '/some text/',

  # These are the monitors themselves!
  monitors => {

    'httpd' => {
      # This is an HTTPD liveness test, fetch a page and check if the
      # returned page contains the expected data.
      test         => "check_http('server.com', 80, '/', 'GET', '(?s)some text');",
      depends      => [ 'alive' ],
      min_interval => 240,
    },

    'www.foo.com' => {
      # If you're running lots of virtual servers/systems, it may be a good
      # idea to test each of them, to catch configuration errors.
      #
      # This will fetch http://www.foo.com/ and look for the string
      # "Foo!" on the page. It will fail if it can't connect or
      # if the page doesn't contain that text. This allows you to test
      # not only whether the server is running, but whether it is sending
      # the expected response.
      #
      # NOTE(review): this sample pattern looks like it once carried HTML
      # markup around "Foo!" that has been lost; replace it with the exact
      # text (or regex) your own page is expected to contain.
      test => "check_http('www.foo.com', 80, '/', 'GET', '(?s)Foo!');",

      # If you are testing a page which causes database lookups or other
      # heavy lifting on the server side (a good idea now and again to make
      # sure everything is up and running), you may want to decrease the
      # frequency as done here (900s = 15 minutes).
      min_interval => 900,

      # Alert after 2 failures (the default above is 3).
      alert_threshold => 2,

      # If this test fails, we check if the httpd is running at all.
      depends => [ 'httpd' ],

      # Record the response times and errors for the HTTP server as RRD.
      rrd_time   => [ @RRD_GAUGE_SPEC ],
      rrd_errors => [ @RRD_GAUGE_SPEC ],
    },

    'smtp_basic' => {
      # This just checks if the SMTP server is up and running. You probably
      # want to delete this in favour of the test below.
      test            => "check_smtp('server.com');",
      depends         => [ 'alive' ],
      min_interval    => 3600,
      alert_threshold => 2,
    },

    'smtp' => {
      # This is fancier: it checks if it accepts mail for a list of users.
      # This may seem superfluous, but it will alert you if a configuration
      # change accidentally breaks SMTP delivery for one of your domains.
      test => "check_smtp('server.com', 25,\n"
            . "'from\@me.com',\n"
            . "'a\@server.com,b\@server.com,c\@otherdomain.com');",
      depends         => [ 'alive' ],
      min_interval    => 3600,
      alert_threshold => 2,
    },

    'smtp_antirelay' => {
      # Still fancier: Here we check to make sure we aren't an open relay.
      # Note that this test will fail if we're testing from a host that
      # is trusted by the SMTP server on server.com.
      test => "check_smtp('server.com', 25, 'dude\@spam.com',\n"
            . "'victim\@isp.com', 250, 550);",
      depends         => [ 'alive' ],
      min_interval    => 3600,
      alert_threshold => 2,
    },

    'spamcop' => {
      # This will fire an alert if our host gets blacklisted by spamcop or
      # SORBS. In practice, you'll probably want to keep an eye on some
      # others as well.
      test => "check_dnsbl('1.2.3.4', 'bl.spamcop.net');"
            . "check_dnsbl('1.2.3.4', 'dnsbl.sorbs.net');",
      min_interval    => 3600,
      alert_threshold => 1,
    },

    'dns' => {
      # Use dig to verify that our DNS server is sending the correct replies.
      # Regex backslashes are doubled so that they survive the double-quoted
      # string and reach the pattern intact.
      test => "check_cmd('dig server.com \@foo.com', 'A[\\t ]+1\\.2\\.3\\.4');",
    },

    'jabber' => {
      # This just checks if something is listening on port 5222, it doesn't
      # actually verify that it's a Jabber server.
      test         => "check_tcp('server.com', 5222);",
      depends      => [ 'alive' ],
      min_interval => 4800,
    },

    'ssh' => {
      # This checks if SSH is up and running. It expects an OpenSSH-style
      # banner response beginning with "SSH-".
      test    => "check_ssh('server.com');",
      depends => [ 'alive' ],
    },

    'pop3' => {
      # This test illustrates how to use the "expect" feature of check_tcp
      # to test a pop3 server.
      #
      # The regex backslashes are doubled (\\+) so that a literal \+ survives
      # the double-quoted string, as in the 'dns' test; a single \+ would
      # collapse to a bare + and no longer match the literal plus sign.
      # The \r\n is left single on purpose, so the QUIT argument carries a
      # real CRLF.
      test => "check_tcp('server.com', 110, '', '^\\+OK POP3',\n"
            . "'QUIT\r\n', '^\\+OK');",
      depends => [ 'alive' ],
    },

    # The following two tests are things the above tests all depend on; if
    # these fail Yamon knows not to complain about the things above.
    'alive' => {
      test    => "check_ping('server.com');",
      depends => [ 'network' ],
    },

    'network' => {
      # This test will try to differentiate between a network failure
      # and the host itself being down: for this to work, you want to
      # test a host or router "near" server.com, either on the same
      # local network or possibly server.com's router (you can find
      # a suitable monitoring target with the traceroute tool).
      test            => "check_ping('other.server.com');",
      depends         => [ 'internet' ],
      suppress_alerts => 1,
    },

    # Alert at least once a day, so you know yamon is up and running.
    # The test deliberately fails whenever the GMT hour is 17.
    'daily' => {
      test => 'die "FAILED: test failure\n" if (scalar gmtime =~ / 17:/);',
    },

    # White-box monitoring, using yamon.cgi
    'system' => {
      test => "check_syshealth('server.com', 80, '/cgi-bin/yamon.cgi', _ranges_);",

      # NOTE: This is just a sample! You almost certainly want to tune the
      # following values to match your system!
      ranges => {
        # Warn if filesystems get too full (or suspiciously empty)
        # These numbers are percentages
        "blocks_used_/"    => [ 20, 90 ],
        "inodes_used_/"    => [ 0, 50 ],
        "blocks_used_/var" => [ 20, 90 ],
        "inodes_used_/var" => [ 0, 50 ],

        # Complain if load average goes through the roof...
        "load_average_one"  => [ 0.0, 30.0 ],
        "load_average_five" => [ 0.0, 15.0 ],

        # Make sure httpd is running, but no more than 150 processes
        "running_httpd" => [ 1, 150 ],

        # This will complain if any process uses more than 256MB of RAM,
        # and will report the name of the process in the alert.
        "max_rss" => [ 0, 256000, 'max_rss_command' ],

        # Log trends monitoring
        "log_trend_/var/log/maillog"         => [ 0.5, 2.0 ],
        "log_trend_/var/log/httpd/error_log" => [ 0.5, 2.0 ],
      },

      # Record root disk-utilization to round-robin databases.
      rrds => {
        "blocks_used_/" => [ @RRD_GAUGE_SPEC ],
        "inodes_used_/" => [ @RRD_GAUGE_SPEC ],
      },

      # I'll alert immediately (on first failure), but won't repeat alerts
      # more than 3x a day.
      alert_threshold => 1,
      alert_interval  => 8*3600,
    },

    ##### You probably don't want to change anything below this line :-) ##########

    # This machine's basic internet connectivity.
    'internet' => {
      test            => "check_ping('www.google.com');",
      depends         => [ 'localnet', 'localdns' ],
      suppress_alerts => 1,
    },

    'localnet' => {
      test            => "check_ping(default_gateway());",
      suppress_alerts => 1,
    },

    'localdns' => {
      test            => "gethostbyname('google.com') or die 'FAILED: DNS borked!\n';",
      suppress_alerts => 1,
    },
  },
);
# vi:ts=2 expandtab