#!/bin/bash
# author: QingFeng
# qq: 530035210
# blog: http://my.oschina.net/pwd/blog
#
# Monitors whether a URL is usable. When a probe fails, the application is
# restarted through its stop/start commands and every step is logged.

# ---------------------------------------------------------------------------
# Default configuration
# ---------------------------------------------------------------------------
logdir=/data/log/check                    # log directory
log=$logdir/log.log                       # log file
is_font=1                                 # print log lines to the terminal: 1 = yes, 0 = no
is_log=0                                  # append log lines to $log:        1 = yes, 0 = no
key="data-camp"                           # keyword used to find the process in `ps` output
exec_stop="/etc/init.d/data-camp stop"    # command that stops the application
exec_start="/etc/init.d/data-camp start"  # command that starts the application
check_path="/verdict/session/LSGJA52U7CH055974/latest/result"  # URL path probed by both checks
max_response_ms=60000                     # response time (ms) above which the app is restarted
proc_count=0                              # process count found by the last derestart call

#######################################
# Print the current timestamp as "YYYY-mm-dd HH:MM:SS".
# Outputs: the timestamp on stdout.
#######################################
datef() {
  date "+%Y-%m-%d %H:%M:%S"
}

#######################################
# Emit one timestamped log line.
# Globals:   is_log, is_font, logdir, log (read)
# Arguments: $1 - message text
# Outputs:   appends to $log when is_log is 1 (creating $logdir if needed);
#            prints to stdout when is_font is 1 (escapes interpreted by echo -e).
#######################################
print_log() {
  if [[ $is_log -eq 1 ]]; then
    [[ -d "$logdir" ]] || mkdir -p "$logdir"
    echo "[ $(datef) ] $1" >> "$log"
  fi
  if [[ $is_font -eq 1 ]]; then
    echo -e "[ $(datef) ] $1"
  fi
}

#######################################
# Run a control command, then count the processes matching a keyword.
# Globals:   proc_count (written)
# Arguments: $1 - process keyword searched for in `ps axu` output
#            $2 - command to run; may already include its arguments
#            $3 - optional extra argument(s) appended to the command
# Outputs:   logs the process count; also writes it to /tmp/restart.num.
# Exits:     with status 1 when $1 or $2 is empty.
#######################################
derestart() {
  if [[ -z "$1" ]]; then
    print_log "$FUNCNAME():应用关键字不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "$FUNCNAME():启动文件不能为空"
    exit 1
  fi

  # Word splitting is intentional: callers pass "command arg" as one string.
  # shellcheck disable=SC2086
  $2 $3

  # Count matching processes, excluding the grep helper and this script.
  local count
  count=$(ps axu | grep "$1" | grep -v grep | grep -vc -- "$0")
  proc_count=$count

  # NOTE(review): the file is still written in case something outside this
  # script reads it; the script itself now relies on proc_count instead of
  # reading a value back from a world-writable directory.
  echo "$count" > /tmp/restart.num
  print_log "$FUNCNAME(): $1的进程数为:$count"
}

#######################################
# Stop and then start the application, logging on behalf of the calling check.
# Globals:   key, exec_stop, exec_start (read); proc_count (read via derestart)
# Arguments: none
# Returns:   0 when processes are found after the start command, 1 otherwise.
#######################################
_restart_app() {
  # Log lines carry the name of the check that triggered the restart.
  local caller=${FUNCNAME[1]}

  print_log "$caller():执行命令: $exec_stop"
  derestart "$key" "$exec_stop"
  if [[ $proc_count -ne 0 ]]; then
    print_log "$caller():停应用失败."
  fi

  print_log "$caller():执行命令: $exec_start"
  sleep 3
  derestart "$key" "$exec_start"
  if [[ $proc_count -eq 0 ]]; then
    print_log "$caller():启动应用失败."
    return 1
  fi
  # Success is reported only when the start actually left processes running.
  print_log "$caller():重启应用成功."
}

#######################################
# Scenario 1: restart the application when the HTTP status code is not 200.
# Globals:   check_path (read)
# Arguments: $1 - server address
#            $2 - server port
# Returns:   non-zero when a restart was attempted and the start failed.
# Exits:     with status 1 when $1 or $2 is empty.
#######################################
check_code() {
  if [[ -z "$1" ]]; then
    print_log "$FUNCNAME():服务器地址不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "$FUNCNAME():服务器端口不能为空"
    exit 1
  fi

  print_log "$FUNCNAME():开始检测-[$1:$2]服务器的网站状态返回码."
  local code
  code=$(curl -m 8 -o /dev/null -s -w '%{http_code}' "http://$1:$2${check_path}")

  # String comparison: an empty or non-numeric result counts as a failure.
  if [[ "$code" != "200" ]]; then
    print_log "$FUNCNAME():[$1:$2]服务器的网站状态返回码不正常,开始重启应用--$code."
    _restart_app
  else
    print_log "$FUNCNAME():[$1:$2]服务器的网站状态返回码正常--$code."
  fi
}

#######################################
# Scenario 2: restart the application when the HTTP response is too slow.
# Globals:   check_path, max_response_ms (read)
# Arguments: $1 - server address
#            $2 - server port
# Returns:   non-zero when a restart was attempted and the start failed.
# Exits:     with status 1 when $1 or $2 is empty.
#######################################
check_timeout() {
  if [[ -z "$1" ]]; then
    print_log "$FUNCNAME():服务器地址不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "$FUNCNAME():服务器端口不能为空"
    exit 1
  fi

  print_log "$FUNCNAME():开始检测-[$1:$2]服务器的网站超时时间."

  # Cap the request slightly above the threshold so a hung server cannot block
  # the monitor forever; a capped request still reports a time above the
  # threshold and therefore still triggers the restart.
  local max_secs=$(( max_response_ms / 1000 + 5 ))

  # curl reports fractional seconds; convert to whole milliseconds because
  # bash arithmetic comparisons only accept integers.
  local httptime
  httptime=$(curl -m "$max_secs" -o /dev/null -s -w '%{time_total}' \
    "http://$1:$2${check_path}" | awk '{ printf "%d", $1 * 1000 }')
  httptime=${httptime:-0}
  local taketime=$(( httptime / 1000 ))

  if [[ $httptime -gt $max_response_ms ]]; then
    print_log "$FUNCNAME():[$1:$2]服务器的网站响应时间不正常,开始重启应用--$httptime ms."
    _restart_app
  else
    print_log "$FUNCNAME():[$1:$2]服务器的网站响应时间正常--$httptime ms/$taketime s."
  fi
}

check_code "localhost" "6500"
check_timeout "localhost" "6500"