用shell写一个简单的告警系统

创建目录结构

# Create the directory layout of the monitoring tool.
# Everything lives under the main directory /usr/local/sbin/mon:
#   bin    - main program
#   conf   - configuration (holds mon.conf)
#   shares - sub-programs (the individual check scripts)
#   mail   - mail-sending scripts
#   log    - logs
mkdir -p /usr/local/sbin/mon/{bin,conf,shares,mail,log}

# Next section: the main entry script, /mon/bin/main.sh

#!/bin/bash
#
# main.sh - entry point of the monitoring tool.
#
# Must be started from inside the bin directory, because every path below is
# relative to it. Exports `send` and `addr` for the sub-scripts, always runs
# the load check, and runs the 502 check only when mon.conf enables it.

# Switch read by the sub-scripts: they only send mail when this is 1.
export send=1

# Address reported in alerts: taken from the inet line of interface ens33.
# Assigned and exported separately so a failing pipeline is not masked.
addr=$(/usr/sbin/ifconfig | grep -A1 'ens33' | awk '/inet/{print $2}')
export addr

# Refuse to run from anywhere but the bin directory (a trailing slash in the
# inherited working directory is tolerated).
cur_dir=${PWD%/}
if [[ "${cur_dir##*/}" != "bin" ]]; then
  echo "you should cd bin dir." >&2
  exit 1
fi
conf_file="../conf/mon.conf"

# Uncomment to send stdout/stderr to the log files instead of the caller.
#exec 1>>../log/mon.log 2>>../log/err.log

echo "$(date +'%F %T') load average"
# Run the load check as a separate script.
/bin/bash ../shares/load.sh

# 502 monitoring is opt-in: it needs an uncommented to_mon_502=1 in mon.conf.
if grep -Eq '^[[:space:]]*to_mon_502=1' "$conf_file"; then
  # Access log to scan, with any spaces stripped from the configured value.
  log=$(grep 'logfile=' "$conf_file" | awk -F '=' '{print $2}' | sed 's@ @@g')
  export log
  /bin/bash ../shares/502.sh
fi

# Next section: the main configuration file, /mon/conf/mon.conf (user-defined variables)

## Options that decide what gets monitored.
## Keep the key=value form without spaces around "=": main.sh greps this file
## for the literal text to_mon_502=1 and reads logfile= by splitting on "=".

## MySQL: server address, port, user and password.
to_mon_cdb=0
db_ip=192.168.221.10
db_port=3306
db_user=username
db_pass=passwd

## httpd
to_mon_httpd=0
to_mon_php_socket=0

## http_code_502 (main.sh runs the 502 check only when to_mon_502=1)
to_mon_502=0
logfile=/data/log/xxx.xxx.com/access.log

## request_count: log path and domain name.
to_mon_request_count=0
req_log=/data/log/www.discuz.net/access.log
domainname=www.discuz.net

## Next section: the monitoring scripts

监控系统负载的脚本/usr/local/sbin/mon/shares/load.sh

#!/bin/bash
#
# load.sh - check the 1-minute load average and raise an alert through mail.sh.
#
# Reads from the environment (exported by main.sh):
#   send - mail is only sent when this is 1
#   addr - host address used in the alert text
# Expects to run from the bin directory; all paths are relative to it.

# Integer part of the first load-average figure printed by uptime,
# e.g. " 0.52" -> "0".
load=$(/usr/bin/uptime | awk -F'average:' '{print $2}' | cut -d, -f1 | sed 's@ @@' | cut -d. -f1)

# Without a numeric value the comparison below would only produce a test error.
if [[ ! "$load" =~ ^[0-9]+$ ]]; then
  echo "$(date +%T) load is unknown" >&2
  exit 1
fi

# NOTE: "-lt 10" is a deliberate test setting so that the alert fires on an
# idle machine; a real deployment would alert on a high load (-gt) instead.
if [ "$load" -lt 10 ] && [ "${send:-0}" -eq 1 ]; then
  echo "${addr}-$(date +%T)-load-is-$load" > ../log/load.tmp
  # The first argument is a placeholder ("who to mail"); replace it with the
  # real recipient. "${addr}_load" needs the braces: "$addr_load" would expand
  # a different variable, and a backslash inside double quotes is kept as-is.
  /bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)"
fi
echo "$(date +%T) load is $load"

监控磁盘的脚本/usr/local/sbin/mon/shares/disk.sh

#!/bin/bash
#
# disk.sh - alert when any filesystem is more than 90% full.
#
# Reads `send` and `addr` from the environment and expects to run from the
# bin directory; all paths are relative to it.

readonly report=../log/disk.tmp
rm -f "$report"

# Highest usage seen among the filesystems that are over the limit.
worst=0

# df -P keeps every filesystem on a single line; column 5 is the usage
# percentage. NR > 1 skips the header whatever its wording is.
while read -r used; do
  [[ "$used" =~ ^[0-9]+$ ]] || continue
  if [ "$used" -gt 90 ] && [ "${send:-0}" -eq 1 ]; then
    echo "$addr $(date +%T) disk useage is $used" >> "$report"
    if [ "$used" -gt "$worst" ]; then
      worst=$used
    fi
  fi
done < <(df -hP | awk 'NR > 1 {sub(/%/, "", $5); print $5}')

# Report once, after every filesystem has been looked at. Doing this inside
# the loop would call mail.sh again for each filesystem that follows a full one.
if [ -f "$report" ]; then
  df -h >> "$report"
  # NOTE(review): load.sh passes a recipient as $1 and the message text as $3,
  # while this call passes an alert key and a file path - confirm what mail.py
  # expects.
  /bin/bash ../mail/mail.sh "${addr}_disk" "$worst" "$report"
  echo "$(date +%T) disk useage is Alert!!!"
else
  echo "$(date +%T) disk useage is ok"
fi

监控网站出现502的脚本/usr/local/sbin/mon/shares/502.sh

#!/bin/bash
#
# 502.sh - count the 502 entries logged during the previous minute and alert
# through mail.sh when there are more than 10.
#
# Reads from the environment (exported by main.sh):
#   log  - access log to scan
#   send - mail is only sent when this is 1
#   addr - host address used in the alert text
# Expects to run from the bin directory; all paths are relative to it.

# With an empty $log grep would sit waiting on standard input.
: "${log:?log must name the access log to scan}"

# Previous minute as HH:MM. The ":HH:MM:" pattern assumes the log writes its
# time stamps as ...:HH:MM:SS - confirm against the real log format.
d=$(date -d '-1 min' +%H:%M)

# -w: 502 has to be a whole word, so a size or id such as 15020 is not counted.
c_502=$(grep -- ":$d:" "$log" | grep -cw '502')

if [ "$c_502" -gt 10 ] && [ "${send:-0}" -eq 1 ]; then
  echo "$addr $d 502 count is $c_502" > ../log/502.tmp
  /bin/bash ../mail/mail.sh "${addr}_502" "$c_502" ../log/502.tmp
fi
echo "$(date +%T) 502 $c_502"

发邮件的脚本/usr/local/sbin/mon/mail/mail.py

mail.py 的内容请参考 https://blog.51cto.com/13480443/2084118 一文中的 /usr/lib/zabbix/alertscripts/mail.py 文件。

告警收敛脚本/usr/local/sbin/mon/mail/mail.sh

#!/bin/bash
#
# mail.sh - alert convergence: decides whether an alert really becomes a mail.
#
# Usage: mail.sh <key> <value> <content>
#   $1 - handed to mail.py as its first argument and also used to name the
#        state files /tmp/<key> and /tmp/<key>.txt
#   $2 - appended to the mail title
#   $3 - handed to mail.py as its third argument
#
# Rule: when the previous alert for this key is more than an hour old, mail
# immediately; otherwise count the calls and mail once more than 10 have
# piled up (roughly every 10 minutes when invoked once a minute).

if [ -z "$1" ]; then
  echo "usage: $0 <key> <value> <content>" >&2
  exit 1
fi

log=$1
stamp_file=/tmp/$log
count_file=/tmp/$log.txt

# Send the mail and restart the suppressed-alert counter.
# All three arguments are quoted so that text containing spaces reaches
# mail.py as a single argument.
# NOTE(review): the title says "continue 10 min" even for a fresh alert.
send_mail() {
  /usr/bin/python /usr/local/sbin/mon/mail/mail.py "$1" "trouble continue 10 min $2" "$3"
  echo "0" > "$count_file"
}

t_s=$(date +%s)

# Time of the previous alert. With no usable history pretend it was two hours
# ago, so that the first alert is mailed straight away.
t_s2=
if [ -f "$stamp_file" ]; then
  t_s2=$(tail -1 "$stamp_file" | awk '{print $1}')
fi
if [[ ! "$t_s2" =~ ^[0-9]+$ ]]; then
  t_s2=$(date -d "2 hours ago" +%s)
fi

# Only the newest time stamp is ever read back, so overwrite the file instead
# of letting it grow by one line per call.
echo "$t_s" > "$stamp_file"

v=$((t_s - t_s2))
echo "$v"

if [ "$v" -gt 3600 ]; then
  send_mail "$1" "$2" "$3"
else
  [ -f "$count_file" ] || echo "0" > "$count_file"
  nu=$(cat "$count_file")
  [[ "$nu" =~ ^[0-9]+$ ]] || nu=0
  nu2=$((nu + 1))
  echo "$nu2" > "$count_file"
  if [ "$nu2" -gt 10 ]; then
    send_mail "$1" "$2" "$3"
  fi
fi

# Next section: schedule /usr/local/sbin/mon/bin/main.sh to run every minute

# Run main.sh every minute; "&&" keeps it from starting when the cd fails.
* * * * * cd /usr/local/sbin/mon/bin/ && /usr/bin/bash /usr/local/sbin/mon/bin/main.sh

注意:这里主程序main.sh默认只调用了子程序load.sh;502.sh只有在mon.conf中设置to_mon_502=1时才会被调用,disk.sh则没有被调用。按上面的计划任务,系统会每分钟自动发邮件给root用户的邮箱(当然我这里测试是用的root用户)。

总结:

主程序文件中要对主配置文件定义的内容进行过滤,作为判断条件,再调用子程序的脚本。子程序中也要引用主配置文件中的内容作为自己的判断条件,再调用发邮件脚本。主配置文件相当于一个总的开关。发邮件的脚本mail.py,注意参数是以空格作为分隔的,如:
# Example taken from load.sh, with a space placed after ${addr} in the text.
# "${addr}_load" needs the braces: "$addr_load" would expand a variable
# named addr_load, which is never set.
echo "${addr} $(date +%T)-load-is-$load" > ../log/load.tmp
/bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)"
这样写时,邮件正文只会识别到空格之前的${addr},空格之后的$(date +%T)-load-is-$load不会被识别。另外,这里用到了环境变量(send、addr由main.sh通过export传给子脚本)。