Results tagged “nagios” from WHO IS 2HEI?
Scripts and executables must do two things (at a minimum) in order to function as Nagios plugins:
1.Exit with one of several possible return values
2.Return at least one line of text output to STDOUT
Plugin Return Code Service State Host State
0 OK UP
1 WARNING UP or DOWN/UNREACHABLE*
2 CRITICAL DOWN/UNREACHABLE
3 UNKNOWN DOWN/UNREACHABLE
Note: If the use_aggressive_host_checking option is enabled, return codes of 1 will result in a host
state of DOWN or UNREACHABLE. Otherwise return codes of 1 will result in a host state of UP.
Plugin Output Spec
At a minimum, plugins should return at least one line of text output. Beginning with Nagios 3, plugins can
optionally return multiple lines of output. Plugins may also return optional performance data that can
be processed by external applications. The basic format for plugin output is shown below:
TEXT OUTPUT | OPTIONAL PERFDATA
LONG TEXT LINE 1
LONG TEXT LINE 2
...
LONG TEXT LINE N | PERFDATA LINE 2
PERFDATA LINE 3
...
PERFDATA LINE N
This is my Python script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Nagios plugin that compares memcached's curr_items statistic to thresholds."""
import sys
import getopt
import memcache

# Address of the memcached server whose statistics are queried.
memcached_host='2hei.net'
memcached_port=11211
# Default thresholds, used when -w / -c are not given on the command line.
# The check reports WARNING when curr_items <= Warning_item and
# CRITICAL when curr_items <= Critical_item.
Warning_item=120
Critical_item=20
def usage():
    """Print the command-line help text and exit with status 3 (UNKNOWN).

    Raises:
        SystemExit: always, with exit code 3.
    """
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("""
Usage: check_memcached [-h|--help] [-w|--warning curr_items] [-c|--critical curr_items]
Warning curr_items defaults to 120
Critical curr_items defaults to 20
""")
    sys.exit(3)
#get curr_items from memcache stats
def get_memcache_curr_items(mc):
    """Return the ``curr_items`` statistic of the first server known to *mc*.

    Args:
        mc: client object whose ``get_stats()`` returns a sequence of
            ``(server, stats_mapping)`` pairs (presumably a
            ``memcache.Client`` -- confirm against the caller).

    Returns:
        The ``curr_items`` value exactly as stored in the stats mapping
        (not converted to ``int`` here).

    Raises:
        ValueError: if no server returned statistics, or the statistics
            do not contain ``curr_items``.
    """
    all_stats = mc.get_stats()
    # An empty result means no server answered; fail with a clear message
    # instead of an IndexError on [0].
    if not all_stats:
        raise ValueError("no statistics returned by memcached")
    stats = all_stats[0][1]
    items = stats.get('curr_items')
    # Returning None here would only crash later in int(); report it now.
    if items is None:
        raise ValueError("memcached statistics do not contain curr_items")
    return items
if __name__ == "__main__":
    # Entry point: parse thresholds, read curr_items from memcached and
    # report the result using Nagios plugin exit codes
    # (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).
    warning_item = None
    critical_item = None
    try:
        # "h" takes no argument; long options must be a list of names
        # without leading dashes, otherwise none of them is recognised.
        options, args = getopt.getopt(
            sys.argv[1:], "hw:c:", ["help", "warning=", "critical="])
    except getopt.GetoptError:
        usage()
        sys.exit(3)
    # Handle the options before touching the network so that --help works
    # even when memcached is unreachable.
    for name, value in options:
        if name in ("-h", "--help"):
            usage()
            sys.exit(3)
        elif name in ("-w", "--warning"):
            warning_item = value
        elif name in ("-c", "--critical"):
            critical_item = value
    if warning_item is None:
        warning_item = Warning_item
    if critical_item is None:
        critical_item = Critical_item
    try:
        warning_item = int(warning_item)
        critical_item = int(critical_item)
    except ValueError:
        # A traceback would exit with status 1, which Nagios reads as WARNING.
        print("MEMCACHED_ITEM UNKNOWN: warning/critical values must be integers")
        sys.exit(3)
    try:
        mc = memcache.Client([memcached_host+':'+str(memcached_port)], debug=0)
        try:
            items = get_memcache_curr_items(mc)
        finally:
            # Always release the connection, even when the stats call fails.
            mc.disconnect_all()
        # Also covers a missing (None) or non-numeric curr_items value.
        curr_items = int(items)
    except Exception as err:
        print("Cannot get memcache's curr_items. %s" % err)
        sys.exit(3)
    # Low item counts are the alarm condition: at or below a threshold alerts.
    if curr_items <= critical_item:
        print('MEMCACHED_ITEM CRITICAL: curr_items is: %s' % items)
        sys.exit(2)
    elif curr_items <= warning_item:
        print('MEMCACHED_ITEM WARNING: curr_items is: %s' % items)
        sys.exit(1)
    else:
        print('MEMCACHED_ITEM OK: curr_items is: %s' % items)
        sys.exit(0)
When you encounter errors such as:
CHECK_NRPE: No output returned from daemon.
or
CHECK_NRPE: Received 0 bytes from daemon. Check the remote server logs for error messages.
it means your plugin returned no output; a plugin must print at least one line of text to STDOUT.
1.Exit with one of several possible return values
2.Return at least one line of text output to STDOUT
Plugin Return Code Service State Host State
0 OK UP
1 WARNING UP or DOWN/UNREACHABLE*
2 CRITICAL DOWN/UNREACHABLE
3 UNKNOWN DOWN/UNREACHABLE
Note: If the use_aggressive_host_checking option is enabled, return codes of 1 will result in a host
state of DOWN or UNREACHABLE. Otherwise return codes of 1 will result in a host state of UP.
Plugin Output Spec
At a minimum, plugins should return at least one line of text output. Beginning with Nagios 3, plugins can
optionally return multiple lines of output. Plugins may also return optional performance data that can
be processed by external applications. The basic format for plugin output is shown below:
TEXT OUTPUT | OPTIONAL PERFDATA
LONG TEXT LINE 1
LONG TEXT LINE 2
...
LONG TEXT LINE N | PERFDATA LINE 2
PERFDATA LINE 3
...
PERFDATA LINE N
This is my Python script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Nagios plugin that compares memcached's curr_items statistic to thresholds."""
import sys
import getopt
import memcache

# Address of the memcached server whose statistics are queried.
memcached_host='2hei.net'
memcached_port=11211
# Default thresholds, used when -w / -c are not given on the command line.
# The check reports WARNING when curr_items <= Warning_item and
# CRITICAL when curr_items <= Critical_item.
Warning_item=120
Critical_item=20
def usage():
    """Print the command-line help text and exit with status 3 (UNKNOWN).

    Raises:
        SystemExit: always, with exit code 3.
    """
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("""
Usage: check_memcached [-h|--help] [-w|--warning curr_items] [-c|--critical curr_items]
Warning curr_items defaults to 120
Critical curr_items defaults to 20
""")
    sys.exit(3)
#get curr_items from memcache stats
def get_memcache_curr_items(mc):
    """Return the ``curr_items`` statistic of the first server known to *mc*.

    Args:
        mc: client object whose ``get_stats()`` returns a sequence of
            ``(server, stats_mapping)`` pairs (presumably a
            ``memcache.Client`` -- confirm against the caller).

    Returns:
        The ``curr_items`` value exactly as stored in the stats mapping
        (not converted to ``int`` here).

    Raises:
        ValueError: if no server returned statistics, or the statistics
            do not contain ``curr_items``.
    """
    all_stats = mc.get_stats()
    # An empty result means no server answered; fail with a clear message
    # instead of an IndexError on [0].
    if not all_stats:
        raise ValueError("no statistics returned by memcached")
    stats = all_stats[0][1]
    items = stats.get('curr_items')
    # Returning None here would only crash later in int(); report it now.
    if items is None:
        raise ValueError("memcached statistics do not contain curr_items")
    return items
if __name__ == "__main__":
    # Entry point: parse thresholds, read curr_items from memcached and
    # report the result using Nagios plugin exit codes
    # (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).
    warning_item = None
    critical_item = None
    try:
        # "h" takes no argument; long options must be a list of names
        # without leading dashes, otherwise none of them is recognised.
        options, args = getopt.getopt(
            sys.argv[1:], "hw:c:", ["help", "warning=", "critical="])
    except getopt.GetoptError:
        usage()
        sys.exit(3)
    # Handle the options before touching the network so that --help works
    # even when memcached is unreachable.
    for name, value in options:
        if name in ("-h", "--help"):
            usage()
            sys.exit(3)
        elif name in ("-w", "--warning"):
            warning_item = value
        elif name in ("-c", "--critical"):
            critical_item = value
    if warning_item is None:
        warning_item = Warning_item
    if critical_item is None:
        critical_item = Critical_item
    try:
        warning_item = int(warning_item)
        critical_item = int(critical_item)
    except ValueError:
        # A traceback would exit with status 1, which Nagios reads as WARNING.
        print("MEMCACHED_ITEM UNKNOWN: warning/critical values must be integers")
        sys.exit(3)
    try:
        mc = memcache.Client([memcached_host+':'+str(memcached_port)], debug=0)
        try:
            items = get_memcache_curr_items(mc)
        finally:
            # Always release the connection, even when the stats call fails.
            mc.disconnect_all()
        # Also covers a missing (None) or non-numeric curr_items value.
        curr_items = int(items)
    except Exception as err:
        print("Cannot get memcache's curr_items. %s" % err)
        sys.exit(3)
    # Low item counts are the alarm condition: at or below a threshold alerts.
    if curr_items <= critical_item:
        print('MEMCACHED_ITEM CRITICAL: curr_items is: %s' % items)
        sys.exit(2)
    elif curr_items <= warning_item:
        print('MEMCACHED_ITEM WARNING: curr_items is: %s' % items)
        sys.exit(1)
    else:
        print('MEMCACHED_ITEM OK: curr_items is: %s' % items)
        sys.exit(0)
When you encounter errors such as:
CHECK_NRPE: No output returned from daemon.
or
CHECK_NRPE: Received 0 bytes from daemon. Check the remote server logs for error messages.
it means your plugin returned no output; a plugin must print at least one line of text to STDOUT.
nagios check_http 处理返回码302的情况
使用nagios监控中发现这样一个问题,nagios的check_http对http返回码302的处理比较特殊,如果参数不当的话会影响监控的效果哦!
实际情况如下:
在web服务器配置的errpage页面指定了503跳转,如apache+resin的组合,如果apache找不到resin会返回503,捕获的页面会跳转到指定的页面上
1、正常情况:
/home/nagios/libexec/check_http -S -H 2hei.net -I 10.10.10.10 -u "/monitor.jsp" -P "" -s "OK" -t 10
返回
HTTP OK HTTP/1.1 200 OK - 617 bytes in 0.345 seconds |time=0.345145s;;;0.000000 size=617B;;;0
如果页面返回值不包含“OK”,则
HTTP CRITICAL - string not found|time=0.034220s;;;0.000000 size=617B;;;0
如果保留apache不动,停掉resin,则意外发生了,302跳转后状态仍然是OK,I服了U !!!
HTTP OK - HTTP/1.1 302 Found - 0.030 second response time |time=0.029908s;;;0.000000 size=517B;;;0
加入-v 参数后 看看详细结果:
https://2hei.net:443/monitor.jsp is 517 characters
STATUS: HTTP/1.1 302 Found
**** HEADER ****
Date: Wed, 16 Dec 2009 07:49:53 GMT
Server: Apache
Location: https://2hei.net/errorcode/503.html
Cache-Control: max-age=0
Expires: Wed, 16 Dec 2009 07:49:53 GMT
Vary: Accept-Encoding
Content-Length: 220
Connection: close
Content-Type: text/html; charset=iso-8859-1
**** CONTENT ****
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>302 Found</title>
</head><body>
<h1>Found</h1>
<p>The document has moved <a href="https://2hei.net/errorcode/503.html">here</a>.</p>
</body></html>
HTTP OK - HTTP/1.1 302 Found - 0.030 second response time |time=0.029908s;;;0.000000 size=517B;;;0
丫难道是nagios check_http的bug?
在仔细看看nagios check_http 用法:
Usage: check_http -H <vhost> | -I <IP-address> [-u <uri>] [-p <port>]
[-w <warn time>] [-c <critical time>] [-t <timeout>] [-L]
[-a auth] [-f <ok | warn | critcal | follow>] [-e <expect>]
[-s string] [-l] [-r <regex> | -R <case-insensitive regex>] [-P string]
[-m <min_pg_size>:<max_pg_size>] [-4|-6] [-N] [-M <age>] [-A string]
[-k string] [-S] [-C <age>] [-T <content-type>]
常用参数
-H 用于虚拟主机,并可以增加端口。例如,test.com:10000
-I IP地址
-S 基于SSL连接。默认端口为443
-u URL地址 “/”
-w 警告响应时间 响应时间超过指定时间则显示WARNING
-c 临界报警响应时间 响应时间超过设定时间则显示CRITICAL
-p 端口设定
-e 监测服务器反馈的第一行(状态)信息是否包含匹配字符
-N 不读取body部分,只读取header信息,这样响应速度比较快
-t 超时时间值
-v 显示服务器反馈的所有信息
-r 正则表达式匹配反馈页信息
-R 同上,但可以不考虑大小写问题
--invert-regex 反馈信息中包含字符则报警
-L 反馈信息包含URL link
-a 对需要基础认证的,提供用户名和密码,获取反馈值
尝试使用了-e参数,问题搞定! yeah !!!
-e 指定check_http返回的第一行status中包含的字符串
STATUS: HTTP/1.1 302 Found
/home/nagios/libexec/check_http -S -H 2hei.net -I 10.10.10.10 -u "/monitor.jsp" -P "" -s "OK" -t 10 -e "200"
这样只有返回码包含200的才会OK,其他状态码一律CRITICAL!
最后的command如下:
# check_https: run check_http over SSL (-S) and require an expected status line.
#   $ARG1$  virtual host name passed to -H
#   $ARG2$  URI passed to -u
#   $ARG3$  string passed to -P
#   $ARG4$  string passed to -s (string expected in the returned page)
#   $ARG5$  timeout passed to -t
#   $ARG6$  string that must appear in the first (status) line, passed to -e (e.g. "200")
# NOTE(review): -I takes an IP address but is given $HOSTNAME$ here -- confirm
# this is intended (i.e. that the host name is the address to connect to).
define command{
command_name check_https
command_line $USER1$/check_http -S -H $ARG1$ -I $HOSTNAME$ -u $ARG2$ -P $ARG3$ -s $ARG4$ -t $ARG5$ -e $ARG6$
}
nagios的check_http果然很牛X。
使用nagios监控中发现这样一个问题,nagios的check_http对http返回码302的处理比较特殊,如果参数不当的话会影响监控的效果哦!
实际情况如下:
在web服务器配置的errpage页面指定了503跳转,如apache+resin的组合,如果apache找不到resin会返回503,捕获的页面会跳转到指定的页面上
1、正常情况:
/home/nagios/libexec/check_http -S -H 2hei.net -I 10.10.10.10 -u "/monitor.jsp" -P "" -s "OK" -t 10
返回
HTTP OK HTTP/1.1 200 OK - 617 bytes in 0.345 seconds |time=0.345145s;;;0.000000 size=617B;;;0
如果页面返回值不包含“OK”,则
HTTP CRITICAL - string not found|time=0.034220s;;;0.000000 size=617B;;;0
如果保留apache不动,停掉resin,则意外发生了,302跳转后状态仍然是OK,I服了U !!!
HTTP OK - HTTP/1.1 302 Found - 0.030 second response time |time=0.029908s;;;0.000000 size=517B;;;0
加入-v 参数后 看看详细结果:
https://2hei.net:443/monitor.jsp is 517 characters
STATUS: HTTP/1.1 302 Found
**** HEADER ****
Date: Wed, 16 Dec 2009 07:49:53 GMT
Server: Apache
Location: https://2hei.net/errorcode/503.html
Cache-Control: max-age=0
Expires: Wed, 16 Dec 2009 07:49:53 GMT
Vary: Accept-Encoding
Content-Length: 220
Connection: close
Content-Type: text/html; charset=iso-8859-1
**** CONTENT ****
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>302 Found</title>
</head><body>
<h1>Found</h1>
<p>The document has moved <a href="https://2hei.net/errorcode/503.html">here</a>.</p>
</body></html>
HTTP OK - HTTP/1.1 302 Found - 0.030 second response time |time=0.029908s;;;0.000000 size=517B;;;0
丫难道是nagios check_http的bug?
在仔细看看nagios check_http 用法:
Usage: check_http -H <vhost> | -I <IP-address> [-u <uri>] [-p <port>]
[-w <warn time>] [-c <critical time>] [-t <timeout>] [-L]
[-a auth] [-f <ok | warn | critcal | follow>] [-e <expect>]
[-s string] [-l] [-r <regex> | -R <case-insensitive regex>] [-P string]
[-m <min_pg_size>:<max_pg_size>] [-4|-6] [-N] [-M <age>] [-A string]
[-k string] [-S] [-C <age>] [-T <content-type>]
常用参数
-H 用于虚拟主机,并可以增加端口。例如,test.com:10000
-I IP地址
-S 基于SSL连接。默认端口为443
-u URL地址 “/”
-w 警告响应时间 响应时间超过指定时间则显示WARNING
-c 临界报警响应时间 响应时间超过设定时间则显示CRITICAL
-p 端口设定
-e 监测服务器反馈的第一行(状态)信息是否包含匹配字符
-N 不读取body部分,只读取header信息,这样响应速度比较快
-t 超时时间值
-v 显示服务器反馈的所有信息
-r 正则表达式匹配反馈页信息
-R 同上,但可以不考虑大小写问题
--invert-regex 反馈信息中包含字符则报警
-L 反馈信息包含URL link
-a 对需要基础认证的,提供用户名和密码,获取反馈值
尝试使用了-e参数,问题搞定! yeah !!!
-e 指定check_http返回的第一行status中包含的字符串
STATUS: HTTP/1.1 302 Found
/home/nagios/libexec/check_http -S -H 2hei.net -I 10.10.10.10 -u "/monitor.jsp" -P "" -s "OK" -t 10 -e "200"
这样只有返回码包含200的才会OK,其他状态码一律CRITICAL!
最后的command如下:
# check_https: run check_http over SSL (-S) and require an expected status line.
#   $ARG1$  virtual host name passed to -H
#   $ARG2$  URI passed to -u
#   $ARG3$  string passed to -P
#   $ARG4$  string passed to -s (string expected in the returned page)
#   $ARG5$  timeout passed to -t
#   $ARG6$  string that must appear in the first (status) line, passed to -e (e.g. "200")
# NOTE(review): -I takes an IP address but is given $HOSTNAME$ here -- confirm
# this is intended (i.e. that the host name is the address to connect to).
define command{
command_name check_https
command_line $USER1$/check_http -S -H $ARG1$ -I $HOSTNAME$ -u $ARG2$ -P $ARG3$ -s $ARG4$ -t $ARG5$ -e $ARG6$
}
nagios的check_http果然很牛X。
nagios的一些辅助工具很有用处,如check_ping,check_tcp等等,这里介绍一下check_ping的用法:
nagios的check_ping命令:
nagios的check_ping命令:
源码可见 nagios插件: nagios-plugins-1.4.12/plugins/check_ping.c
用法:
./check_ping
Usage: check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%
[-p packets] [-t timeout] [-L] [-4|-6]
具体如下:
-H 主机地址
-w WARNING 状态: 响应时间(毫秒),丢包率 (%) 阈值
-c CRITICAL状态: 响应时间(毫秒),丢包率 (%) 阈值
-p 发送的包数 默认5个包
-t 超时时间 默认10秒
-4|-6 使用ipv4|ipv6 地址 默认ipv4
如:
1、正常:
./check_ping -H www.google.com -w 100.0,20% -c 200.0,50% -p 3 -t 2
PING OK - Packet loss = 0%, RTA = 1.49 ms
命令执行结果返回: echo $? 为 0
2、WARNING :
./check_ping -H www.google.com -w 0.1,20% -c 200.0,50% -p 3 -t 2
PING WARNING - Packet loss = 0%, RTA = 1.71 ms
命令执行结果返回: echo $? 为 1
3、CRITICAL
./check_ping -H www.google.com -w 0.1,20% -c 0.9,50% -p 3 -t 2
PING CRITICAL - Packet loss = 0%, RTA = 1.60 ms
命令执行结果返回: echo $? 为 2
返回结果为:状态 丢包率 ping响应时间
因为check_ping的返回值非常清晰,
可以在其他程序中调用check_ping命令,作为辅助的网络检测工具。
可以在其他程序中调用check_ping命令,作为辅助的网络检测工具。
#check_tcpconn.sh
# Nagios plugin: count TCP sockets listed in /proc/net/tcp and /proc/net/tcp6,
# grouped by state, and compare the total with the thresholds below.
# Exit status follows the Nagios plugin convention:
#   0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

# warning value
W=1500
# critical value
C=2190

# Collect whichever of the two /proc tables exist as positional parameters.
set --
for TCP_FILE in /proc/net/tcp6 /proc/net/tcp
do
    if [ -f "$TCP_FILE" ]
    then
        set -- "$@" "$TCP_FILE"
    fi
done

# Without any input file awk would block reading stdin; report UNKNOWN instead.
if [ "$#" -eq 0 ]
then
    echo "UNKNOWN CONN cannot find /proc/net/tcp or /proc/net/tcp6"
    exit 3
fi

# Read the tables directly (no shared temporary file in /tmp).
# Field 4 is the two-digit hex socket state: 01 ESTABLISHED, 03 SYN_RECV,
# 06 TIME_WAIT. The first line of each file is a column header and is skipped.
awk -v TOTAL_W="$W" -v TOTAL_C="$C" '
BEGIN { ESTABLISHED = TIME_WAIT = SYN_RECV = TOTAL = 0 }
FNR == 1 { next }
{
    if ($4 == "01") ESTABLISHED++
    else if ($4 == "06") TIME_WAIT++
    else if ($4 == "03") SYN_RECV++
    TOTAL++
}
END {
    if (TOTAL < TOTAL_W)      { STATUS = "OK";       CODE = 0 }
    else if (TOTAL < TOTAL_C) { STATUS = "WARNING";  CODE = 1 }
    else                      { STATUS = "CRITICAL"; CODE = 2 }
    printf "%s CONN %s ESTABLISHED %s TIME_WAIT %s SYN_RECV %s TOTAL|CONN,%s,%s,%s,%s;", STATUS, ESTABLISHED, TIME_WAIT, SYN_RECV, TOTAL, ESTABLISHED, TIME_WAIT, SYN_RECV, TOTAL
    # awk is the last command, so this becomes the exit status of the plugin.
    exit CODE
}' "$@"
This script checks /proc/net/tcp, where you can find the status of each TCP connection.
cat /proc/net/tcp
/*
 * TCP state bit flags. Each flag is 1 shifted by the state number; that
 * state number, written as two hex digits, is the value shown in the "st"
 * column of /proc/net/tcp (noted on each line).
 */
enum {
TCPF_ESTABLISHED = (1 << 1),  /* st 01 */
TCPF_SYN_SENT = (1 << 2),     /* st 02 */
TCPF_SYN_RECV = (1 << 3),     /* st 03 */
TCPF_FIN_WAIT1 = (1 << 4),    /* st 04 */
TCPF_FIN_WAIT2 = (1 << 5),    /* st 05 */
TCPF_TIME_WAIT = (1 << 6),    /* st 06 */
TCPF_CLOSE = (1 << 7),        /* st 07 */
TCPF_CLOSE_WAIT = (1 << 8),   /* st 08 */
TCPF_LAST_ACK = (1 << 9),     /* st 09 */
TCPF_LISTEN = (1 << 10),      /* st 0A */
TCPF_CLOSING = (1 << 11),     /* st 0B */
};
# Nagios plugin: count TCP sockets listed in /proc/net/tcp and /proc/net/tcp6,
# grouped by state, and compare the total with the thresholds below.
# Exit status follows the Nagios plugin convention:
#   0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

# warning value
W=1500
# critical value
C=2190

# Collect whichever of the two /proc tables exist as positional parameters.
set --
for TCP_FILE in /proc/net/tcp6 /proc/net/tcp
do
    if [ -f "$TCP_FILE" ]
    then
        set -- "$@" "$TCP_FILE"
    fi
done

# Without any input file awk would block reading stdin; report UNKNOWN instead.
if [ "$#" -eq 0 ]
then
    echo "UNKNOWN CONN cannot find /proc/net/tcp or /proc/net/tcp6"
    exit 3
fi

# Read the tables directly (no shared temporary file in /tmp).
# Field 4 is the two-digit hex socket state: 01 ESTABLISHED, 03 SYN_RECV,
# 06 TIME_WAIT. The first line of each file is a column header and is skipped.
awk -v TOTAL_W="$W" -v TOTAL_C="$C" '
BEGIN { ESTABLISHED = TIME_WAIT = SYN_RECV = TOTAL = 0 }
FNR == 1 { next }
{
    if ($4 == "01") ESTABLISHED++
    else if ($4 == "06") TIME_WAIT++
    else if ($4 == "03") SYN_RECV++
    TOTAL++
}
END {
    if (TOTAL < TOTAL_W)      { STATUS = "OK";       CODE = 0 }
    else if (TOTAL < TOTAL_C) { STATUS = "WARNING";  CODE = 1 }
    else                      { STATUS = "CRITICAL"; CODE = 2 }
    printf "%s CONN %s ESTABLISHED %s TIME_WAIT %s SYN_RECV %s TOTAL|CONN,%s,%s,%s,%s;", STATUS, ESTABLISHED, TIME_WAIT, SYN_RECV, TOTAL, ESTABLISHED, TIME_WAIT, SYN_RECV, TOTAL
    # awk is the last command, so this becomes the exit status of the plugin.
    exit CODE
}' "$@"
This script checks /proc/net/tcp, where you can find the status of each TCP connection.
cat /proc/net/tcp
/*
 * TCP state bit flags. Each flag is 1 shifted by the state number; that
 * state number, written as two hex digits, is the value shown in the "st"
 * column of /proc/net/tcp (noted on each line).
 */
enum {
TCPF_ESTABLISHED = (1 << 1),  /* st 01 */
TCPF_SYN_SENT = (1 << 2),     /* st 02 */
TCPF_SYN_RECV = (1 << 3),     /* st 03 */
TCPF_FIN_WAIT1 = (1 << 4),    /* st 04 */
TCPF_FIN_WAIT2 = (1 << 5),    /* st 05 */
TCPF_TIME_WAIT = (1 << 6),    /* st 06 */
TCPF_CLOSE = (1 << 7),        /* st 07 */
TCPF_CLOSE_WAIT = (1 << 8),   /* st 08 */
TCPF_LAST_ACK = (1 << 9),     /* st 09 */
TCPF_LISTEN = (1 << 10),      /* st 0A */
TCPF_CLOSING = (1 << 11),     /* st 0B */
};




