#!/usr/bin/perl
#
# This is a typical Yet Another Monitor definition for monitoring a machine
# named "server.com" on ip 1.2.3.4 which is running sshd, a webserver and
# a mail server.
#
# Dependencies are in place to differentiate between a failed service,
# a dead machine and various network problems.  The comments give lots of
# details.
#
# Each entry under "monitors" has a "test" string containing Perl code,
# plus optional settings (depends, min_interval, alert_threshold, ...).
#

# This is the default RRD database specification we use, recording latest,
# maximum and average values over a few time periods.  Adjust to taste.
# The first number (300) is the rrdtool --step value.
my @RRD_GAUGE_SPEC = (
  '300',
  'DS:data:GAUGE:300:0:U',
  'RRA:LAST:0.5:1:180',
  'RRA:MAX:0.9:6:360',
  'RRA:AVERAGE:0.5:6:360'
);

Monitor(

  # This tells Yamon where to store the state of the world and how much
  # history to keep for each test.  Somewhere under /var would probably
  # make a good place.
  status_file  => '/tmp/yamon-SAMPLE.stat',
  status_html  => '/tmp/yamon-SAMPLE.html',  # Test status as a HTML table
  keep_history => 50,

  # File-name prefix for RRD databases
  rrd_prefix => '/tmp/yamon-rrd.',

  # These lines tell Yamon to use direct SMTP to the named server instead
  # of invoking the 'mail' utility:
  smtp_server => 'mail.isp.com',
  smtp_helo   => 'your.machine.foo',  # optional: Your machine's name
  smtp_from   => 'you@isp.com',       # optional: The from-address of sent mail

  # This is where alerts go.  Multiple destinations may be listed,
  # separated by a semicolon (;), optionally followed by spaces.
  # This can be overridden on a per-test basis.
  #
  # Possible destinations:
  #    syslog
  #    email@domain.foo
  #    | /path/to/program -args "$subject"        # Alert body on stdin
  #    ! /path/to/program -args "$subject $body"  # Nothing on stdin
  #
  alerts_to => 'syslog; your@email.address.foo; other@email.bar',

  # This is how many failures required before an alert is fired.
  # This can be overridden on a per-test basis.
  alert_threshold => 3,

  # This tells Yamon to re-send alerts once an hour.  This may be excessive.
  # This can be overridden on a per-test basis.
  alert_interval => 3600,

  # This is prefixed to the subject of all sent alerts (so you can sort
  # your e-mail better and/or differentiate between different Yamons).
  alert_prefix => '/some text/',

  # These are the monitors themselves!
  monitors => {

    'httpd' => {
      # This is an HTTPD liveness test, fetch a page and check if the
      # returned page contains the expected data.
      test => "check_http('server.com', 80, '/', 'GET', '(?s)some text');",
      depends => [ 'alive' ],
      min_interval => 240,
    },

    'www.foo.com' => {
      # If you're running lots of virtual servers/systems, it may be a good
      # idea to test each of them, to catch configuration errors.
      #
      # This will fetch http://www.foo.com/ and look for the text "Foo!"
      # on the page.  It will fail if it can't connect or if the page
      # doesn't contain that text.  This allows you to test not only
      # whether the server is running, but whether it is sending the
      # expected response.
      #
      # NOTE(review): as found, the pattern requires two newlines on each
      # side of "Foo!".  This looks like markup that was lost from the
      # sample (presumably HTML tags around "Foo!") - confirm and restore
      # the intended pattern.
      test => "check_http('www.foo.com', 80, '/', 'GET', '(?s)\n\nFoo!\n\n');",

      # If you are testing a page which causes database lookups or other
      # heavy lifting on the server side (a good idea now and again to make
      # sure everything is up and running), you may want to decrease the
      # frequency as done here (900s = 15 minutes).
      min_interval => 900,

      # Alert after 2 failures (the default above is 3).
      alert_threshold => 2,

      # If this test fails, we check if the httpd is running at all.
      depends => [ 'httpd' ],

      # Record the response times and errors for the HTTP server as RRD.
      rrd_time   => [ @RRD_GAUGE_SPEC ],
      rrd_errors => [ @RRD_GAUGE_SPEC ],
    },

    'smtp_basic' => {
      # This just checks if the SMTP server is up and running.  You probably
      # want to delete this in favour of the test below.
      test => "check_smtp('server.com');",
      depends => [ 'alive' ],
      min_interval => 3600,
      alert_threshold => 2,
    },

    'smtp' => {
      # This is fancier: it checks if it accepts mail for a list of users.
      # This may seem superfluous, but it will alert you if a configuration
      # change accidentally breaks SMTP delivery for one of your domains.
      test => "check_smtp('server.com', 25, 'from\@me.com', 'a\@server.com,b\@server.com,c\@otherdomain.com');",
      depends => [ 'alive' ],
      min_interval => 3600,
      alert_threshold => 2,
    },

    'smtp_antirelay' => {
      # Still fancier: Here we check to make sure we aren't an open relay.
      # Note that this test will fail if we're testing from a host that
      # is trusted by the SMTP server on server.com.
      test => "check_smtp('server.com', 25, 'dude\@spam.com', 'victim\@isp.com', 250, 550);",
      depends => [ 'alive' ],
      min_interval => 3600,
      alert_threshold => 2,
    },

    'spamcop' => {
      # This will fire an alert if our host gets blacklisted by spamcop or
      # SORBS.  In practise, you'll probably want to keep an eye on some
      # others as well.
      test => "check_dnsbl('1.2.3.4', 'bl.spamcop.net');"
             ."check_dnsbl('1.2.3.4', 'dnsbl.sorbs.net');",
      min_interval => 3600,
      alert_threshold => 1,
    },

    'dns' => {
      # Use dig to verify that our DNS server is sending the correct replies.
      # Backslashes are doubled because the test is a double-quoted string.
      test => "check_cmd('dig server.com \@foo.com', 'A[\\t ]+1\\.2\\.3\\.4');",
    },

    'jabber' => {
      # This just checks if something is listening on port 5222, it doesn't
      # actually verify that it's a Jabber server.
      test => "check_tcp('server.com', 5222);",
      depends => [ 'alive' ],
      min_interval => 4800,
    },

    'ssh' => {
      # This checks if SSH is up and running.  It expects an OpenSSH-style
      # banner response beginning with "SSH-".
      test => "check_ssh('server.com');",
      depends => [ 'alive' ],
    },

    'pop3' => {
      # This test illustrates how to use the "expect" feature of check_tcp
      # to test a pop3 server.
      #
      # The "+" must be written as "\\+" here: inside a double-quoted
      # string a single "\+" is reduced to a bare "+", which would leave
      # the pattern "^+OK" and no longer match a literal plus sign.  The
      # "\r\n" is intentionally left single so that the string carries a
      # real CR LF after QUIT.
      test => "check_tcp('server.com', 110, '', '^\\+OK POP3', 'QUIT\r\n', '^\\+OK');",
      depends => [ 'alive' ],
    },

    # The following two tests are things the above tests all depend on; if
    # these fail Yamon knows not to complain about the things above.

    'alive' => {
      test => "check_ping('server.com');",
      depends => [ 'network' ],
    },

    'network' => {
      # This test will try to differentiate between a network failure
      # and the host itself being down: for this to work, you want to
      # test a host or router "near" server.com, either on the same
      # local network or possibly server.com's router (you can find
      # a suitable monitoring target with the traceroute tool).
      test => "check_ping('other.server.com');",
      depends => [ 'internet' ],
      suppress_alerts => 1,
    },

    # Alert at least once a day, so you know yamon is up and running.
    # The test fails whenever the current GMT time string contains " 17:".
    'daily' => {
      test => 'die "FAILED: test failure\n" if (scalar gmtime =~ / 17:/);',
    },

    # White-box monitoring, using yamon.cgi
    'system' => {
      test => "check_syshealth('server.com', 80, '/cgi-bin/yamon.cgi', _ranges_);",

      # NOTE: This is just a sample!  You almost certainly want to tune the
      #       following values to match your system!
      ranges => {
        # Warn if filesystems get too full (or suspiciously empty)
        # These numbers are percentages
        "blocks_used_/"    => [ 20, 90 ],
        "inodes_used_/"    => [  0, 50 ],
        "blocks_used_/var" => [ 20, 90 ],
        "inodes_used_/var" => [  0, 50 ],

        # Complain if load average goes through the roof...
        "load_average_one"  => [ 0.0, 30.0 ],
        "load_average_five" => [ 0.0, 15.0 ],

        # Make sure httpd is running, but no more than 150 processes
        "running_httpd" => [ 1, 150 ],

        # This will complain if any process uses more than 256MB of RAM,
        # and will report the name of the process in the alert.
        "max_rss" => [ 0, 256000, 'max_rss_command' ],

        # Log trends monitoring
        "log_trend_/var/log/maillog"         => [ 0.5, 2.0 ],
        "log_trend_/var/log/httpd/error_log" => [ 0.5, 2.0 ],
      },

      # Record root disk-utilization to round-robin databases.
      rrds => {
        "blocks_used_/" => [ @RRD_GAUGE_SPEC ],
        "inodes_used_/" => [ @RRD_GAUGE_SPEC ],
      },

      # I'll alert immediately (on first failure), but won't repeat alerts
      # more than 3x a day.
      alert_threshold => 1,
      alert_interval  => 8*3600,
    },

    ##### You probably don't want to change anything below this line :-) ##########

    # This machine's basic internet connectivity.
    'internet' => {
      test => "check_ping('www.google.com');",
      depends => [ 'localnet', 'localdns' ],
      suppress_alerts => 1,
    },

    'localnet' => {
      test => "check_ping(default_gateway());",
      suppress_alerts => 1,
    },

    'localdns' => {
      test => "gethostbyname('google.com') or die 'FAILED: DNS borked!\n';",
      suppress_alerts => 1,
    },
  },
);

# vi:ts=2 expandtab