OpenStack

쉘 스크립트로 오픈스택 모니터링하기

naleejang 2022. 12. 9. 13:56

이 스크립트 하나만 있으면 오픈스택 배포를 위한 디렉터 노드부터 컨트롤러, 컴퓨트 노드까지 하드웨어 상태와 서비스 상태를 한꺼번에 점검할 수 있다. 물론 결과가 텍스트로만 출력된다는 단점은 있지만, 엔지니어가 오버클라우드 전체 노드를 빠르게 살펴보는 용도로는 매우 훌륭하다고 생각한다. 게다가 로그도 남겨 주니 일석이조가 아닌가?

#!/bin/bash

# Controller hostnames, space-separated; the controller loop word-splits this list.
CON='ctrl01 ctrl02 ctrl03'
# Compute hostnames, space-separated; the compute loop word-splits this list.
COM='cn01 cn02 cn03 cn04 cn05'
# Path of this run's log file; empty until make_logs assigns it.
LOG_FILE=
#------------------------
# Make Log File
#
# Creates a timestamped log file and publishes its path in LOG_FILE.
# Globals:
#   DAILY_CHK_LOG_DIR (read, optional) - log directory,
#                                        defaults to /var/log/daily_chk
#   DATE     (written) - run timestamp, YYYYmmddHHMM
#   LOG_FILE (written) - full path of the created log file
# Returns:
#   0 on success, non-zero if the directory or file cannot be prepared
#------------------------
function make_logs()
{
  local log_dir="${DAILY_CHK_LOG_DIR:-/var/log/daily_chk}"

  # The command is lowercase `date`; `DATE` does not exist as a command.
  DATE=$(date +%Y%m%d%H%M)
  LOG_FILE="$log_dir/chk_overcloud_$DATE.log"

  # Create the directory first so touch does not fail on a fresh host.
  sudo mkdir -p "$log_dir" || return 1
  sudo touch "$LOG_FILE" || return 1
  # The file is created as root; open it up so the invoking user can append.
  sudo chmod 777 "$LOG_FILE"
}

#------------------------
# Print message
#
# Writes a timestamped message to stdout and appends it, tagged with
# "[Daily_chk]", to the log file.
# Globals:
#   LOG_FILE (read) - log file path; the log write is skipped when empty
# Arguments:
#   $1 - message text (may span several lines; only the first line is prefixed)
#------------------------
function print_msg()
{
  local message=$1
  local stamp
  stamp=$(date "+%Y-%m-%d %H:%M")

  # Quote the path so a log location containing spaces does not break the redirect.
  if [ -n "$LOG_FILE" ]; then
    printf '%s [Daily_chk] %s\n' "$stamp" "$message" >> "$LOG_FILE"
  fi
  printf '%s %s\n' "$stamp" "$message"
}

#------------------------
# Print message1
#
# Writes a message verbatim (no timestamp, no tag) to stdout and appends
# the same text to the log file. Used for pre-formatted multi-line output.
# Globals:
#   LOG_FILE (read) - log file path; the log write is skipped when empty
# Arguments:
#   $1 - message text
#------------------------
function print_msg1()
{
  local message=$1

  # printf instead of echo: echo would treat a message such as "-n" as an option.
  if [ -n "$LOG_FILE" ]; then
    printf '%s\n' "$message" >> "$LOG_FILE"
  fi
  printf '%s\n' "$message"
}

# Create the log file before the first print_msg call, which appends to $LOG_FILE.
make_logs

print_msg "#----------------------------"
print_msg "# Check IDM Ping"
print_msg "#----------------------------"

# Rely on ping's exit status (0 only when a reply was received) rather than
# counting "icmp_seq" lines, which also appear in "Destination Host
# Unreachable" responses.
if ping -c 1 idm-host > /dev/null 2>&1; then
  print_msg "IDM ping status is normal"
else
  print_msg "Please check IDM ping status"
fi

print_msg "#----------------------------"
print_msg "# Check Director Network"
print_msg "#----------------------------"

# Count interfaces that are UP and use the "mq" queueing discipline.
# The expected value (4) is specific to this director node's NIC layout.
net_stat=$(ip a | grep "state UP" | grep -c mq)
if [ "$net_stat" -eq 4 ]
then
  print_msg "Network status is normal"
else
  print_msg "Please check network status"
  print_msg "$(ip a)"
fi

print_msg "#----------------------------"
print_msg "# Check Director Service logs"
print_msg "#----------------------------"

# Run the log checker once and reuse its output, so the verdict and the
# printed details always come from the same run.
# chk-log.sh is resolved relative to the current working directory.
error_msg=$(sudo sh chk-log.sh)

if [ -z "$error_msg" ]
then
  print_msg "No error service logs. This system status is normal."
else
  print_msg "Please check system logs and container status."
  print_msg "$error_msg"
fi

print_msg "#----------------------------"
print_msg "# Check Overcloud Power"
print_msg "#----------------------------"

# Controllers: power state is queried through the RHV manager fence agent.
# NOTE(review): credentials are hard-coded on the command line and are
# visible in `ps` output; consider reading them from a protected file.
for i in {1..3}
do
  print_msg "ctrl0$i"
  power_stat=$(fence_rhevm -o status -a 192.168.1.15 -l admin@internal -p passwd -n "ctrl0$i" --shell-timeout=30 --ssl-insecure -z --disable-http-filter)
  print_msg "$power_stat"
done

# Compute nodes: power state is queried over IPMI.
# ipmitool selects the interface with -I (capital i); -l is the LUN option.
# NOTE(review): the loop variable is not used in the ipmitool call, so every
# iteration queries the same BMC (192.168.141.15). Presumably the address
# should vary per node - confirm the intended BMC addresses.
for i in {31..35}
do
  print_msg "cn0$i"
  power_stat=$(ipmitool -H 192.168.141.15 -I lanplus -U admin -P passwd power status)
  print_msg "$power_stat"
done

print_msg "#----------------------------"
print_msg "# Controller"
print_msg "#----------------------------"

# The pacemaker check is run from one controller only: the first host in CON.
first_controller=${CON%% *}

# CON is a space-separated host list, so it is intentionally left unquoted here.
for i in $CON
do
  print_msg ">>>>>> $i <<<<<<<<"

  print_msg "#----------------------------"
  print_msg "# Check Network"
  print_msg "#----------------------------"
  # Count remote interfaces that are UP with the "mq" qdisc.
  # The expected value (7) is specific to this site's controller NIC layout.
  net_stat=$(ssh -q "heat-admin@$i" ip a | grep "state UP" | grep -c mq)

  if [ "$net_stat" -eq 7 ]
  then
    print_msg "Network status is normal"
  else
    print_msg "Please check network status"
    print_msg "$(ssh -q "heat-admin@$i" sudo ip a)"
  fi

  if [ "$i" = "$first_controller" ]
  then
    print_msg "#----------------------------"
    print_msg "# Check Clustering"
    print_msg "#----------------------------"
    cluster_stat=$(ssh -q "heat-admin@$i" sudo pcs status | grep -ci 'failed')

    if [ "$cluster_stat" -eq 0 ]
    then
      print_msg "Pacemaker status is normal"
    else
      print_msg "Please check pacemaker"
      print_msg "$(ssh -q "heat-admin@$i" sudo pcs status)"
    fi
  fi

  print_msg "#----------------------------"
  print_msg "# Check CPU"
  print_msg "#----------------------------"
  # NOTE(review): column 4 of the mpstat "all" row is %usr only when the
  # timestamp includes an AM/PM field; with a 24-hour clock it is %nice.
  # Confirm against the nodes' locale.
  cpu_stat=$(ssh -q "heat-admin@$i" sudo mpstat | grep all | awk '{print $4}')
  print_msg "CPU usage is $cpu_stat. If CPU usage is high, please check system CPU status"

  print_msg "#----------------------------"
  print_msg "# Check Memory"
  print_msg "#----------------------------"
  # Column 4 of the "Mem:" row of `free -h` is the free amount.
  mem_stat=$(ssh -q "heat-admin@$i" sudo free -h | grep -i mem | awk '{print $4}')
  print_msg "Free memory amount is $mem_stat. If free memory amount is low, please check system memory status"

  print_msg "#----------------------------"
  print_msg "# Check Container"
  print_msg "#----------------------------"
  # The unit pattern is quoted for the remote shell so it reaches systemctl
  # unexpanded instead of being globbed against files.
  container_stat=$(ssh -q "heat-admin@$i" 'sudo systemctl list-units "tripleo_*"' | grep -c failed)

  if [ "$container_stat" -eq 0 ]
  then
    print_msg "Container status is normal."
  else
    print_msg "Please check container status"
    print_msg "$(ssh -q "heat-admin@$i" 'sudo systemctl list-units "tripleo_*"')"
  fi

  print_msg "#----------------------------"
  print_msg "# Check NFS - glance"
  print_msg "#----------------------------"
  # Exactly one mounted filesystem whose df line mentions "glance" is expected.
  nfs_stat=$(ssh -q "heat-admin@$i" sudo df -h | grep -c glance)

  if [ "$nfs_stat" -eq 1 ]
  then
    print_msg "NFS status is normal."
  else
    print_msg "Please check network status and nfs status"
  fi

  print_msg "#----------------------------"
  print_msg "# Check Service logs"
  print_msg "#----------------------------"
  # Run the remote log checker once and reuse its output for both the
  # verdict and the details.
  error_msg=$(ssh -q "heat-admin@$i" sudo sh chk-log.sh)

  if [ -z "$error_msg" ]
  then
    print_msg "No error service logs. This system status is normal."
  else
    print_msg "Please check system logs and container status."
    print_msg "$error_msg"
  fi
done

print_msg "#----------------------------"
print_msg "# Compute"
print_msg "#----------------------------"

# COM is a space-separated host list, so it is intentionally left unquoted here.
for i in $COM
do
  print_msg ">>>>>> $i <<<<<<<<"

  print_msg "#----------------------------"
  print_msg "# Check Network"
  print_msg "#----------------------------"
  # Count remote interfaces that are UP with the "mq" qdisc.
  # The expected value (10) is specific to this site's compute NIC layout.
  net_stat=$(ssh -q "heat-admin@$i" ip a | grep "state UP" | grep -c mq)

  if [ "$net_stat" -eq 10 ]
  then
    print_msg "Network status is normal"
  else
    print_msg "Please check network status"
    print_msg "$(ssh -q "heat-admin@$i" sudo ip a)"
  fi

  print_msg "#----------------------------"
  print_msg "# Check CPU"
  print_msg "#----------------------------"
  # NOTE(review): column 4 of the mpstat "all" row is %usr only when the
  # timestamp includes an AM/PM field; with a 24-hour clock it is %nice.
  # Confirm against the nodes' locale.
  cpu_stat=$(ssh -q "heat-admin@$i" sudo mpstat | grep all | awk '{print $4}')
  print_msg "CPU usage is $cpu_stat. If CPU usage is high, please check system CPU status"

  print_msg "#----------------------------"
  print_msg "# Check Memory"
  print_msg "#----------------------------"
  # Column 4 of the "Mem:" row of `free -h` is the free amount.
  mem_stat=$(ssh -q "heat-admin@$i" sudo free -h | grep -i mem | awk '{print $4}')
  print_msg "Free memory amount is $mem_stat. If free memory amount is low, please check system memory status"

  print_msg "#----------------------------"
  print_msg "# Check Container"
  print_msg "#----------------------------"
  # The unit pattern is quoted for the remote shell so it reaches systemctl
  # unexpanded instead of being globbed against files.
  container_stat=$(ssh -q "heat-admin@$i" 'sudo systemctl list-units "tripleo_*"' | grep -c failed)

  if [ "$container_stat" -eq 0 ]
  then
    print_msg "Container status is normal."
  else
    print_msg "Please check container status"
    print_msg "$(ssh -q "heat-admin@$i" 'sudo systemctl list-units "tripleo_*"')"
  fi

  print_msg "#----------------------------"
  print_msg "# Check Service logs"
  print_msg "#----------------------------"
  # Run the remote log checker once and reuse its output for both the
  # verdict and the details.
  error_msg=$(ssh -q "heat-admin@$i" sudo sh chk-log.sh)

  if [ -z "$error_msg" ]
  then
    print_msg "No error service logs. This system status is normal."
  else
    print_msg "Please check system logs and container status."
    print_msg "$error_msg"
  fi
done

# Load overcloud credentials so the openstack client talks to the overcloud API.
source /home/stack/overcloudrc

# The listings below are already tabular, so they are written with
# print_msg1 (no timestamp prefix) to keep the tables intact.

print_msg "#----------------------------"
print_msg "# Overcloud compute service"
print_msg "#----------------------------"
print_msg1 "$(openstack compute service list -c Binary -c Host -c Zone -c Status -c 'Updated At' --sort-column Host)"

print_msg "#----------------------------"
print_msg "# Overcloud volume service"
print_msg "#----------------------------"
print_msg1 "$(openstack volume service list)"

print_msg "#----------------------------"
print_msg "# Overcloud network service"
print_msg "#----------------------------"
# Only columns that the agent listing actually provides are requested;
# vCPU and memory columns belong to the hypervisor listing below.
print_msg1 "$(openstack network agent list -c Host -c 'Agent Type' -c Alive -c State --sort-column Host)"

print_msg "#----------------------------"
print_msg "# Overcloud hypervisor service"
print_msg "#----------------------------"
# The sort key must match the column title exactly ('Hypervisor Hostname').
print_msg1 "$(openstack hypervisor list --long -c 'Hypervisor Hostname' -c 'Host IP' -c State -c 'vCPUs Used' -c vCPUs -c 'Memory MB Used' -c 'Memory MB' --sort-column 'Hypervisor Hostname')"

print_msg "#----------------------------"
print_msg "# Instance count per hypervisor"
print_msg "#----------------------------"
# uniq -c only merges adjacent lines, so sort explicitly instead of
# depending on the client's own ordering.
print_msg1 "$(openstack server list --all-projects --long --status ACTIVE -c Host --sort-column Host -f value | sort | uniq -c)"

내용이 다소 길므로, 다음에 시간이 되면 중복되는 기능들을 함수로 분리하는 작업을 하면 좋을 것 같다. 그래도 이렇게 한 번 스크립트를 짜 놓으면 매번 모든 노드에 들어가서 정보를 확인하지 않아도 돼서 매우 편리하다.