shell守护进程之踩坑记录

今天在写一个进程守护 shell 脚本时,踩到了"判断字符串是否为空"的一个坑,先记录下来。
在 watch 函数中,用 if [ ! -z ${pid} ]; 来判断,结果是正常的。
但是把 watch 函数写成下面这样再运行时(第一个程序正在运行,第二个未运行),

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Demonstration version of watch(): it exposes the unquoted-variable pitfall
# of the `[ ... ]` test command when ${pid} is empty.
# Arguments:
#   $1 - command line to look for in `ps aux` and, if needed, to start
#   $2 - action: "daemon", "restart" or "kill"
#   $3 - file that receives stdout and stderr of the started process
function watch() {
# PID column of the first `ps aux` line that matches $1 (lines containing
# "grep" are filtered out); empty when nothing matches.
pid="$(ps aux|grep -v grep|grep "${1}" |awk '{print $2}'|head -n 1)"
date="[$(date '+%Y%m%d %H:%M:%S')]"
# Debug output: the pattern, the matching ps lines and the extracted PID.
echo "1:${1}"
echo "abc:$(ps aux|grep -v grep|grep "${1}")"
echo "pid:${pid}"

if [ ${2} == "daemon" ]; then
# With an empty, unquoted ${pid} this expands to `[ -n ]`: a one-argument
# test, which is true whenever that argument ("-n") is a non-empty string.
if [ -n ${pid} ]; then
echo "nnnnnnnnnnnnnnnnnnnnnnnn-n"
fi
# Likewise `[ -z ]` is a one-argument test on the non-empty string "-z",
# so this branch is ALSO taken when ${pid} is empty.
if [ -z ${pid} ]; then
echo "zzzzzzzzzzzzzzzzzzzzzzzz-z"
fi
# `[ ! -z ]` negates the one-argument test on "-z", giving false for an empty
# ${pid}; the unquoted form therefore happens to produce the intended result.
if [ ! -z ${pid} ]; then
echo "${date}${1}(pid:${pid}) is running"
else
# $1 is deliberately left unquoted so it splits into command + arguments.
nohup ${1} 1>${3} 2>&1 &
echo "${date}启动进程${1}(pid:"$!")成功"
fi
elif [ ${2} == "restart" ]; then
if [ ! -z ${pid} ]; then
kill -9 ${pid}
fi
nohup ${1} 1>${3} 2>&1 &
echo "${date}重启进程${1}(pid:"$!")成功"
elif [ ${2} == "kill" ]; then
if [ ! -z ${pid} ]; then
kill -9 ${pid}
fi
echo "${date}结束进程${1}(pid:${pid})成功"
fi
}

输出为:

1
2
3
4
5
6
7
8
9
10
11
12
1:/usr/local/bin/python3 /search/offline/PPCityMoniter/ppcity_scripts/main.py -T web
abc:mali 7537 0.0 0.2 227712 22356 pts/3 S 18:36 0:00 /usr/local/bin/python3 /search/offline/PPCityMoniter/ppcity_scripts/main.py -T web
mali 7567 0.6 0.2 305592 22468 pts/3 Sl 18:36 0:37 /usr/local/bin/python3 /search/offline/PPCityMoniter/ppcity_scripts/main.py -T web
pid:7537
nnnnnnnnnnnnnnnnnnnnnnnn-n
[20170720 20:15:51]/usr/local/bin/python3 /search/offline/PPCityMoniter/ppcity_scripts/main.py -T web(pid:7537) is running
1:sh /search/offline/PPCityMoniter/ppcity_scripts/utils/pull_logs.sh
abc:
pid:
nnnnnnnnnnnnnnnnnnnnnnnn-n
zzzzzzzzzzzzzzzzzzzzzzzz-z
[20170720 20:15:51]启动进程sh /search/offline/PPCityMoniter/ppcity_scripts/utils/pull_logs.sh(pid:13848)成功

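也就是说,当 pid 为"空"时,程序既进入了 if [ -n ${pid} ];,也进入了 if [ -z ${pid} ];。按理说 -n 表示非空字符串,-z 表示空字符串,两者不应该同时成立。原因在于 ${pid} 没有加双引号:pid 为空时,展开后的命令实际上是 [ -n ] 和 [ -z ],方括号里只剩下一个参数;而 test 在只有一个参数时,只判断这个参数本身是否为非空字符串,"-n" 和 "-z" 都是非空字符串,所以两个判断都为真。同理,[ ! -z ${pid} ] 展开后是 [ ! -z ],相当于对单参数 "-z" 的判断结果取反,结果为假,因此它只是碰巧得到了正确的结果。稳妥的写法是给变量加上双引号(如 [ -n "${pid}" ]),或者在 bash 中使用 [[ -n ${pid} ]]。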

最后可以正常执行的脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env bash
#
# Script preamble: enable strict modes, resolve the script's own location,
# switch to that directory and declare the globals used by the functions below.

set -o errexit
set -o nounset

# Every expansion is quoted (and `--` ends option parsing) so the script still
# works when its path contains whitespace or starts with a dash.
cur_file=$(basename -- "$0")
cur_dir=$(cd -- "$(dirname -- "$0")" && pwd -P) # absolute (physical) directory
cur_full_file="${cur_dir}/${cur_file}"
cd -- "${cur_dir}"
run_dir="$(pwd)"

# Process list file, relative to the script directory entered above.
readonly p_file="../conf/p.list"

# Filled by readfile(): one monitored command line per element.
p_list=()

# Load the list of monitored commands from a file into the global p_list array.
#
# Blank lines and lines whose first character is "#" are skipped. Entries are
# stored starting at index 0, overwriting whatever those slots held before.
#
# Arguments:
#   $1 - path of the list file
#   $3 - optional file the "does not exist" message is appended to; when it is
#        omitted the message is discarded (/dev/null)
# Returns:
#   0 after reading the file, 1 if $1 is not a regular file
function readfile() {
  if [[ ! -f "${1}" ]]; then
    printf '%s\n' "${1}不存在" >>"${3:-/dev/null}" 2>&1
    return 1
  fi

  local line
  local -i idx=0
  # `read` without IFS= trims leading/trailing whitespace from every entry.
  # The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline,
  # which a plain `while read` loop would silently drop.
  while read -r line || [[ -n "${line}" ]]; do
    if [[ -z "${line}" || "${line:0:1}" == "#" ]]; then
      continue
    fi
    p_list[idx]="${line}"
    idx+=1
  done <"${1}"
  return 0
}

# Inspect one monitored command and start, restart or kill it.
#
# Arguments:
#   $1 - command line to monitor (an item of p_list); it is searched for as a
#        literal string in the `ps aux` output and is also the command started
#   $2 - action: "daemon" (start only when not running), "restart" or "kill";
#        any other value does nothing
#   $3 - log file the started process's stdout and stderr are appended to
# Outputs:
#   one timestamped status line on stdout
function watch() {
  local pid stamp
  # -F matches $1 literally (no regex metacharacters) and `--` stops a command
  # line that begins with "-" from being parsed as grep options.
  pid="$(ps aux | grep -v grep | grep -F -- "${1}" | awk '{print $2}' | head -n 1)"
  stamp="[$(date '+%Y%m%d %H:%M:%S')]"

  # Tests on ${pid} are quoted: an unquoted empty value collapses `[ -n ]` /
  # `[ -z ]` into one-argument tests that are always true.
  case "${2}" in
    daemon)
      if [[ -n "${pid}" ]]; then
        echo "${stamp}${1}(pid:${pid}) is running"
      else
        # $1 is intentionally unquoted so it splits into command + arguments.
        # shellcheck disable=SC2086
        (nohup ${1} >>"${3}" 2>&1) &
        echo "${stamp}启动进程${1}(pid:$!)成功"
      fi
      ;;
    restart)
      if [[ -n "${pid}" ]]; then
        kill -9 "${pid}"
      fi
      # shellcheck disable=SC2086
      (nohup ${1} >>"${3}" 2>&1) &
      echo "${stamp}重启进程${1}(pid:$!)成功"
      ;;
    kill)
      if [[ -n "${pid}" ]]; then
        kill -9 "${pid}"
      fi
      echo "${stamp}结束进程${1}(pid:${pid})成功"
      ;;
  esac
}

# Apply one action to every command stored in the global p_list array.
#
# Arguments:
#   $1 - action passed on to watch: "daemon", "restart" or "kill"
#   $2 - log file passed on to watch for the output of started processes
#   $3 - optional file that receives (and is truncated by) the "nohup is
#        missing" message; without it the message goes to stderr
# Returns:
#   0 after every entry was handed to watch, 255 when nohup is not available
function daemon() {
  if ! command -v nohup >/dev/null 2>&1; then
    # The message used to be redirected to /dev/null by default and was lost;
    # report it on stderr unless the caller names a file.
    if [[ -n "${3:-}" ]]; then
      printf '%s\n' "缺少nohup" >"${3}"
    else
      printf '%s\n' "缺少nohup" >&2
    fi
    # 255 is what `return -1` yields on bash >= 4.3; older versions reject -1.
    return 255
  fi

  local -i idx
  for (( idx = 0; idx < ${#p_list[@]}; idx++ )); do
    watch "${p_list[idx]}" "${1}" "${2}"
  done
  return 0
}

# Entry point: load the process list, then apply the requested action to it.
# $1 selects the action (default "daemon"); $2 names the log file (default:
# this script's full path with ".log" appended).
action="${1:-daemon}"
log_file="${2:-${cur_full_file}.log}"
readfile "${p_file}"
daemon "${action}" "${log_file}"

参考

Unix - Shell Basic Operators