OpenStack
쉘 스크립트로 오픈스택 모니터링하기
naleejang
2022. 12. 9. 13:56
이 스크립트 하나만 있으면 오픈스택 배포를 위한 디렉터 노드부터 컨트롤러, 컴퓨트 노드의 하드웨어 상태를 점검하고, 서비스 상태까지 한꺼번에 점검할 수 있다. 물론 결과가 텍스트로만 출력된다는 단점은 있지만, 엔지니어가 오버클라우드 전 노드를 빠르게 살펴보는 용도로는 매우 훌륭하다고 생각한다. 게다가 로그도 남겨 주니 일석이조가 아닌가?
#!/bin/bash
# Space-separated host lists that the per-node check loops iterate over.
CON='ctrl01 ctrl02 ctrl03'            # controller nodes
COM='cn01 cn02 cn03 cn04 cn05'        # compute nodes
# Path of the current run's log file; starts empty and is set by make_logs.
LOG_FILE=
#------------------------
# Make Log File
#------------------------
# Creates a timestamped, empty log file and publishes its path in LOG_FILE.
# Globals:   LOG_FILE (written; reset to "" when creation fails)
# Arguments: $1 - optional log directory (default: /var/log/daily_chk)
# Returns:   0 on success, 1 if the directory or the file cannot be created
function make_logs()
{
  local log_dir=${1:-/var/log/daily_chk}
  local stamp

  # The command is `date` (lower case). `DATE` is not found on a
  # case-sensitive system, which left the timestamp empty.
  stamp=$(date +%Y%m%d%H%M) || return 1
  LOG_FILE="${log_dir}/chk_overcloud_${stamp}.log"

  # Create the directory first; touch fails if it does not exist yet.
  if ! sudo mkdir -p -- "$log_dir" || ! sudo touch -- "$LOG_FILE"; then
    printf 'make_logs: cannot create %s\n' "$LOG_FILE" >&2
    LOG_FILE=""
    return 1
  fi

  # The file is created via sudo but appended to by the invoking user, so it
  # must be writable by everyone; a log file does not need the execute bit.
  sudo chmod 666 -- "$LOG_FILE"
}
#------------------------
# Print message
#------------------------
# Writes a timestamped message to stdout and appends it, tagged with
# "[Daily_chk]", to the log file.
# Globals:   LOG_FILE (read) - log path; file logging is skipped when empty
# Arguments: $1 - message text (may span several lines)
# Outputs:   "<date> <message>" on stdout
function print_msg()
{
  local message=${1-}
  local stamp
  stamp=$(date "+%Y-%m-%d %H:%M")

  # Quote the path so a log location containing spaces still works, and skip
  # the redirect entirely when no log file has been set up.
  if [[ -n "$LOG_FILE" ]]; then
    printf '%s [Daily_chk] %s\n' "$stamp" "$message" >> "$LOG_FILE"
  fi
  printf '%s %s\n' "$stamp" "$message"
}
#------------------------
# Print message1
#------------------------
# Writes a message verbatim (no timestamp, no tag) to stdout and appends the
# same text to the log file. Used for pre-formatted, multi-line output.
# Globals:   LOG_FILE (read) - log path; file logging is skipped when empty
# Arguments: $1 - message text
# Outputs:   the message on stdout
function print_msg1()
{
  local message=${1-}

  # printf instead of echo: a message such as "-n" must be printed, not
  # interpreted as an echo option.
  if [[ -n "$LOG_FILE" ]]; then
    printf '%s\n' "$message" >> "$LOG_FILE"
  fi
  printf '%s\n' "$message"
}
# Create this run's log file before anything is printed.
make_logs
print_msg "#----------------------------"
print_msg "# Check IDM Ping"
print_msg "#----------------------------"
# Decide on ping's exit status (0 only when a reply was received). Counting
# "icmp_seq" lines is unreliable because error lines can contain it as well,
# and the original test reported "normal" when the count was zero.
if ping -c 1 idm-host > /dev/null 2>&1; then
  print_msg "IDM ping status is normal"
else
  print_msg "Please check IDM ping status"
fi
print_msg "#----------------------------"
print_msg "# Check Director Network"
print_msg "#----------------------------"
# Count interfaces that "ip a" reports as "state UP" and whose line also
# contains "mq"; this director node is expected to have exactly 4 of them.
net_stat=$(ip a | grep "state UP" | grep -c mq)
if [ "$net_stat" -eq 4 ]
then
  print_msg "Network status is normal"
else
  print_msg "Please check network status"
  print_msg "$(ip a)"
fi
print_msg "#----------------------------"
print_msg "# Check Director Service logs"
print_msg "#----------------------------"
# Run chk-log.sh once and reuse its output, so the decision and the text that
# gets printed always come from the same run. Any output counts as an error.
error_msg=$(sudo sh chk-log.sh)
if [ -z "$error_msg" ]
then
  print_msg "No error service logs. This system status is normal."
else
  print_msg "Please check system logs and container status."
  print_msg "$error_msg"
fi
print_msg "#----------------------------"
print_msg "# Check Overcloud Power"
print_msg "#----------------------------"
# NOTE(review): the management credentials below are hard-coded on the command
# line; consider reading them from a protected file or the environment.

# Controllers are virtual machines queried through the RHV fence agent.
# stderr is captured too, so a failed query shows up in the log.
for i in {1..3}
do
  print_msg "ctrl0$i"
  power_stat=$(fence_rhevm -o status -a 192.168.1.15 -l admin@internal -p passwd -n "ctrl0$i" --shell-timeout=30 --ssl-insecure -z --disable-http-filter 2>&1)
  print_msg "$power_stat"
done

# Compute nodes are queried through IPMI. The interface is selected with -I;
# -l is ipmitool's LUN option and does not accept "lanplus".
# NOTE(review): the BMC address does not depend on $i, so every iteration
# queries the same host, and the printed labels (cn031..cn035) do not match
# the names in $COM - confirm the intended per-node address and naming.
for i in {31..35}
do
  print_msg "cn0$i"
  power_stat=$(ipmitool -H 192.168.141.15 -I lanplus -U admin -P passwd power status 2>&1)
  print_msg "$power_stat"
done
print_msg "#----------------------------"
print_msg "# Controller"
print_msg "#----------------------------"
# "pcs status" is only queried on one node: the first entry of $CON. The
# original compared against a hostname that is not in $CON, so the pacemaker
# check never ran.
first_ctrl=${CON%% *}
for i in $CON
do
  # Same per-host banner as the compute loop, so log sections are attributable.
  print_msg ">>>>>> $i <<<<<<<<"
  print_msg "#----------------------------"
  print_msg "# Check Network"
  print_msg "#----------------------------"
  # A controller is expected to show exactly 7 "state UP" interfaces with "mq".
  net_stat=$(ssh -q "heat-admin@$i" ip a | grep "state UP" | grep -c mq)
  if [ "$net_stat" -eq 7 ]
  then
    print_msg "Network status is normal"
  else
    print_msg "Please check network status"
    print_msg "$(ssh -q "heat-admin@$i" sudo ip a)"
  fi
  if [ "$i" = "$first_ctrl" ]
  then
    print_msg "#----------------------------"
    print_msg "# Check Clustering"
    print_msg "#----------------------------"
    cluster_stat=$(ssh -q "heat-admin@$i" sudo pcs status | grep -ic 'failed')
    if [ "$cluster_stat" -eq 0 ]
    then
      print_msg "Pacemaker status is normal"
    else
      print_msg "Please check pacemaker"
      print_msg "$(ssh -q "heat-admin@$i" sudo pcs status)"
    fi
  fi
  print_msg "#----------------------------"
  print_msg "# Check CPU"
  print_msg "#----------------------------"
  # NOTE(review): field 4 of the "all" line is used as the CPU figure; which
  # column that is presumably depends on mpstat's time format - verify.
  cpu_stat=$(ssh -q "heat-admin@$i" sudo mpstat | grep all | awk '{print $4}')
  print_msg "CPU usage is $cpu_stat. If CPU usage is high, please check system CPU status"
  print_msg "#----------------------------"
  print_msg "# Check Memory"
  print_msg "#----------------------------"
  mem_stat=$(ssh -q "heat-admin@$i" sudo free -h | grep -i mem | awk '{print $4}')
  print_msg "Free memory amount is $mem_stat. If free memory amount is low, please check system memory status"
  print_msg "#----------------------------"
  print_msg "# Check Container"
  print_msg "#----------------------------"
  # The unit pattern is quoted for the remote shell as well, so neither the
  # local nor the remote side expands tripleo_* against file names.
  container_stat=$(ssh -q "heat-admin@$i" "sudo systemctl list-units 'tripleo_*'" | grep -c failed)
  if [ "$container_stat" -eq 0 ]
  then
    print_msg "Container status is normal."
  else
    print_msg "Please check container status"
    print_msg "$(ssh -q "heat-admin@$i" "sudo systemctl list-units 'tripleo_*'")"
  fi
  print_msg "#----------------------------"
  print_msg "# Check NFS - glance"
  print_msg "#----------------------------"
  # Exactly one mounted filesystem whose df line mentions "glance" is expected.
  nfs_stat=$(ssh -q "heat-admin@$i" sudo df -h | grep -c glance)
  if [ "$nfs_stat" -eq 1 ]
  then
    print_msg "NFS status is normal."
  else
    print_msg "Please check network status and nfs status"
  fi
  print_msg "#----------------------------"
  print_msg "# Check Service logs"
  print_msg "#----------------------------"
  # Run the remote log check once; any output is treated as an error report.
  error_msg=$(ssh -q "heat-admin@$i" sudo sh chk-log.sh)
  if [ -z "$error_msg" ]
  then
    print_msg "No error service logs. This system status is normal."
  else
    print_msg "Please check system logs and container status."
    print_msg "$error_msg"
  fi
done
print_msg "#----------------------------"
print_msg "# Compute"
print_msg "#----------------------------"
for i in $COM
do
  print_msg ">>>>>> $i <<<<<<<<"
  print_msg "#----------------------------"
  print_msg "# Check Network"
  print_msg "#----------------------------"
  # A compute node is expected to show exactly 10 "state UP" interfaces with "mq".
  net_stat=$(ssh -q "heat-admin@$i" ip a | grep "state UP" | grep -c mq)
  if [ "$net_stat" -eq 10 ]
  then
    print_msg "Network status is normal"
  else
    print_msg "Please check network status"
    print_msg "$(ssh -q "heat-admin@$i" sudo ip a)"
  fi
  print_msg "#----------------------------"
  print_msg "# Check CPU"
  print_msg "#----------------------------"
  # NOTE(review): field 4 of the "all" line is used as the CPU figure; which
  # column that is presumably depends on mpstat's time format - verify.
  cpu_stat=$(ssh -q "heat-admin@$i" sudo mpstat | grep all | awk '{print $4}')
  print_msg "CPU usage is $cpu_stat. If CPU usage is high, please check system CPU status"
  print_msg "#----------------------------"
  print_msg "# Check Memory"
  print_msg "#----------------------------"
  mem_stat=$(ssh -q "heat-admin@$i" sudo free -h | grep -i mem | awk '{print $4}')
  print_msg "Free memory amount is $mem_stat. If free memory amount is low, please check system memory status"
  print_msg "#----------------------------"
  print_msg "# Check Container"
  print_msg "#----------------------------"
  # The unit pattern is quoted for the remote shell as well, so neither the
  # local nor the remote side expands tripleo_* against file names.
  container_stat=$(ssh -q "heat-admin@$i" "sudo systemctl list-units 'tripleo_*'" | grep -c failed)
  if [ "$container_stat" -eq 0 ]
  then
    print_msg "Container status is normal."
  else
    print_msg "Please check container status"
    print_msg "$(ssh -q "heat-admin@$i" "sudo systemctl list-units 'tripleo_*'")"
  fi
  print_msg "#----------------------------"
  print_msg "# Check Service logs"
  print_msg "#----------------------------"
  # Run the remote log check once; any output is treated as an error report.
  error_msg=$(ssh -q "heat-admin@$i" sudo sh chk-log.sh)
  if [ -z "$error_msg" ]
  then
    print_msg "No error service logs. This system status is normal."
  else
    print_msg "Please check system logs and container status."
    print_msg "$error_msg"
  fi
done
# Load the overcloud credentials; without them every openstack call below
# would fail, so stop here with a clear message instead.
if ! source /home/stack/overcloudrc
then
  print_msg "Cannot load /home/stack/overcloudrc. Skipping overcloud service checks."
  exit 1
fi
print_msg "#----------------------------"
print_msg "# Overcloud compute service"
print_msg "#----------------------------"
print_msg1 "$(openstack compute service list -c Binary -c Host -c Zone -c Status -c 'Updated At' --sort-column Host)"
print_msg "#----------------------------"
print_msg "# Overcloud volume service"
print_msg "#----------------------------"
print_msg1 "$(openstack volume service list)"
print_msg "#----------------------------"
print_msg "# Overcloud network service"
print_msg "#----------------------------"
# Only columns that the agent listing provides are requested. The hypervisor
# columns (vCPUs, Memory MB, ...) and the 'Hypervisor Hostname' sort key had
# been pasted in from the hypervisor command and do not apply here.
print_msg1 "$(openstack network agent list -c Host -c 'Agent Type' -c Alive -c State --sort-column Host)"
print_msg "#----------------------------"
print_msg "# Overcloud hypervisor service"
print_msg "#----------------------------"
# The sort key must match the column title exactly ('Hypervisor Hostname').
print_msg1 "$(openstack hypervisor list --long -c 'Hypervisor Hostname' -c 'Host IP' -c State -c 'vCPUs Used' -c vCPUs -c 'Memory MB Used' -c 'Memory MB' --sort-column 'Hypervisor Hostname')"
print_msg "#----------------------------"
print_msg "# Instance count per hypervisor"
print_msg "#----------------------------"
# The list is sorted by Host so that uniq -c sees each host's rows together.
print_msg1 "$(openstack server list --all --long --status ACTIVE -c Host --sort-column Host -f value | uniq -c)"
내용이 다소 길기는 하지만, 다음에 시간이 되면 중복되는 기능들을 함수로 변경하는 작업을 하면 좋을 것 같다. 그래도 이렇게 한번 스크립트를 짜 놓으면 모든 노드에 매번 들어가서 정보를 확인하지 않아도 돼서 매우 편리하다.