#!/bin/bash
#
# Check local hardware resource usage and report it nagios-style.
#
# Resources checked, in order: average cpu usage and per-disk i/o usage
# (iostat -x), 15 minutes cpu load (uptime, /proc/cpuinfo), ram (free -m),
# network interface usage (sar -n DEV) and filesystem usage (df -P -BG).
#
# Every check prints one line starting with [OK], [WARNING] or [UNKNOWN].
#
# Exit status (https://nagios-plugins.org/doc/guidelines.html#AEN78):
#   0  every check is fine
#   1  at least one [WARNING] (also used when a required tool is missing)
#   2  critical - reserved, nothing in this script sets crit yet
#   3  at least one [UNKNOWN]
#
# Optional configuration: /etc/check_hw_resources.conf (must be readable by
# the invoking user). It is sourced once per filesystem, after
# $filesystem_name has been set, and may define
#   filesystem_special_name             mount point that gets its own trigger
#   filesystem_special_percent_trigger  usage percentage (75-99) for it
#
# The script can be executed as a regular user.
#
# Although we could switch to ksh93 to be able to use floats and calculate
# more precise percentages ourselves, it is maybe preferable to keep using
# bash for compatibility. All values are therefore truncated to integers.
#
# todo - check what happens on busybox systems
# todo - port to bsd systems? no posix output format there

PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Parse tool output in the C locale: with a decimal comma, a value such as
# "0,59" would be evaluated by (( )) as a comma expression instead of a number.
export LC_ALL=C

# Result flags. Initialised explicitly so that variables of the same name
# inherited from the environment cannot influence the exit status.
crit=0
fail=0
warn=0

# Succeeds when $1 is a non-negative decimal integer without leading zeros,
# i.e. a value that is safe to use inside (( )).
is_uint() {
  [[ $1 =~ ^(0|[1-9][0-9]*)$ ]]
}

command -v df >/dev/null || { echo install coreutils first; exit 1; }
command -v free >/dev/null || { echo install procps first; exit 1; }
command -v iostat >/dev/null || { echo install sysstat first; exit 1; }
command -v uptime >/dev/null || { echo install procps first; exit 1; }

#
# cpu & disk i/o usage
#

output=$(iostat -x)

# The avg-cpu header line plus the line of values that follows it.
avg_cpu=$(grep -A1 '^avg-cpu:' <<< "$output")

# Last column of the values line (%idle); drop the fraction to avoid floats.
idle=$(tail -1 <<< "$avg_cpu" | awk '{print $NF}')
idle=${idle%%.*}

if ! is_uint "$idle"; then
  echo "[UNKNOWN] could not define cpu usage"
  fail=1
elif (( idle <= 5 )); then
  echo "[WARNING] average cpu usage reached 95%:"
  echo "$avg_cpu"
  warn=1
else
  echo "[OK] average cpu usage is fine"
fi

# Everything below the Device header, without blank lines and loop devices.
util_disks=$(sed -n '/^Device/,$p' <<< "$output" | sed '1d; /^$/d' | grep -v '^loop')

if [[ -z $util_disks ]]; then
  echo "[UNKNOWN] could not define disk i/o usage"
  fail=1
else
  while read -r -a fields; do
    (( ${#fields[@]} )) || continue
    disk_name=${fields[0]}
    # Last column (%util), integer part only.
    util_percent=${fields[${#fields[@]}-1]}
    util_percent=${util_percent%%.*}

    if ! is_uint "$util_percent"; then
      echo "[UNKNOWN] could not define $disk_name disk i/o usage"
      fail=1
    elif (( util_percent >= 100 )); then
      echo "[WARNING] $disk_name disk reached 100% i/o usage (fine for ssd drives):"
      echo "$util_disks"
      warn=1
    else
      echo "[OK] $disk_name disk i/o usage is fine"
    fi
  done <<< "$util_disks"
fi

#
# cpu load
#

# Last field of uptime is the 15 minutes load average; integer part only.
load15m=$(uptime | awk '{print $NF}')
load15m=${load15m%%.*}
nproc=$(grep -c '^processor' /proc/cpuinfo)

load_known=1
if ! is_uint "$load15m"; then
  echo "[UNKNOWN] could not define 15 minutes cpu load"
  fail=1
  load_known=0
fi
# A count of 0 is as useless as no count at all and would divide by zero.
if ! is_uint "$nproc" || (( nproc == 0 )); then
  echo "[UNKNOWN] could not define amount of cores"
  fail=1
  load_known=0
fi

if (( load_known )); then
  # Load per core expressed as a percentage: a load equal to the number of
  # cores is 100%.
  if (( 100 * load15m / nproc >= 100 )); then
    echo "[WARNING] average 15 minutes cpu load reached 100% per core:"
    echo "$load15m"
    echo "(for $nproc cores)"
    warn=1
  else
    echo "[OK] average 15 minutes cpu load is fine"
  fi
fi

#
# ram
#

ram_mebi=$(free -m)
total_mebi=$(awk '/^Mem:/ {print $2}' <<< "$ram_mebi")
# NOTE(review): the last Mem: column is assumed to be "available"; confirm
# for the procps versions in use.
available_mebi=$(awk '/^Mem:/ {print $NF}' <<< "$ram_mebi")

ram_known=1
if ! is_uint "$total_mebi" || (( total_mebi == 0 )); then
  echo "[UNKNOWN] could not define total ram"
  fail=1
  ram_known=0
fi
if ! is_uint "$available_mebi"; then
  echo "[UNKNOWN] could not define available ram"
  fail=1
  ram_known=0
fi

if (( ram_known )); then
  if (( 100 * available_mebi / total_mebi <= 10 )); then
    echo "[WARNING] ram usage reached 90%:"
    echo "$ram_mebi"
    warn=1
  else
    echo "[OK] ram usage is fine"
  fi
fi

#
# network tx/rx
#

sar_output=$(sar -n DEV)

# Keep the per-interface Average: lines only; the Average: column header
# (second field IFACE) is not an interface.
nics=$(awk '$1 == "Average:" && $2 != "IFACE"' <<< "$sar_output")

if [[ -z $nics ]]; then
  echo "[UNKNOWN] enable sysstat sa1/collect for network interface statistics"
  echo "$sar_output"
  fail=1
else
  while read -r -a fields; do
    (( ${#fields[@]} )) || continue
    nic_name=${fields[1]}
    # Last column, integer part only.
    # NOTE(review): assumed to be %ifutil; confirm for the sysstat version.
    nic_util=${fields[${#fields[@]}-1]}
    nic_util=${nic_util%%.*}

    if ! is_uint "$nic_util"; then
      echo "[UNKNOWN] could not define $nic_name network interface usage"
      fail=1
    elif (( nic_util >= 50 )); then
      echo "[WARNING] $nic_name network interface reached 50% usage:"
      echo "$sar_output"
      warn=1
    else
      echo "[OK] $nic_name network interface usage is fine"
    fi
  done <<< "$nics"
fi

# todo
#sar_errors_output=`sar -n EDEV`

#
# filesystem usage
#

fs_exclude='^overlay|^tmpfs|^devtmpfs|^udev'

posix_output=$(df -P -BG)
filesystems=$(sed 1d <<< "$posix_output" | grep -vE "$fs_exclude")

if [[ -z $filesystems ]]; then
  echo "[UNKNOWN] could not find filesystem usage statistics"
  fail=1
else
  while read -r -a fields; do
    (( ${#fields[@]} )) || continue

    # Start every filesystem from a clean slate, whatever the previous
    # iteration or the environment left behind.
    unset filesystem_special_name filesystem_special_percent_trigger
    special=

    # Last field is the mount point (one containing blanks is truncated to
    # its last word), field 2 the size in G, field 5 the usage in percent.
    filesystem_name=${fields[${#fields[@]}-1]}
    filesystem_total=${fields[1]%G}
    filesystem_util=${fields[4]%\%}

    if ! is_uint "$filesystem_total" || ! is_uint "$filesystem_util"; then
      echo "[UNKNOWN] could not define $filesystem_name filesystem usage"
      fail=1
      continue
    fi

    # conf file needs to be readable as user
    [[ -f /etc/check_hw_resources.conf ]] && source /etc/check_hw_resources.conf

    # Literal comparison: the right-hand side is quoted so that it is not
    # treated as a pattern, and an unset special name never matches.
    if [[ -n $filesystem_special_name && $filesystem_special_name == "$filesystem_name" ]]; then
      percent_trigger_float=$filesystem_special_percent_trigger
      percent_trigger=${filesystem_special_percent_trigger%%.*}
      special="special "
    elif (( filesystem_total <= 25 )); then
      percent_trigger_float=75
      percent_trigger=75
    else
      # raise trigger according to disk size
      # https://en.wikipedia.org/wiki/Sigmoid_function
      # https://en.wikipedia.org/wiki/Logistic_function
      # Logistic curve from about 75% for small filesystems up to 98.82%
      # for very large ones, with its midpoint at 2500G.
      percent_trigger_float=$(awk -v total="$filesystem_total" \
        'BEGIN { print (25 / (1 + exp(-6 * (1 / 5000 * total - 0.5)))) - 1.18 + 75 }')
      percent_trigger=${percent_trigger_float%%.*}
    fi

    if ! is_uint "$percent_trigger" || (( percent_trigger < 75 || percent_trigger > 99 )); then
      echo "[UNKNOWN] could not define filesystem percent_trigger"
      echo "debug filesystem_total is $filesystem_total G"
      echo "debug percent_trigger is $percent_trigger %"
      fail=1
      # Without a usable trigger there is nothing to compare against.
      continue
    fi

    if (( filesystem_util >= percent_trigger )); then
      echo "[WARNING] $filesystem_name filesystem total size ${filesystem_total}G reached ${special}${filesystem_util}/${percent_trigger_float}%:"
      grep -vE "$fs_exclude" <<< "$posix_output"
      warn=1
    else
      echo "[OK] $filesystem_name filesystem total size ${filesystem_total}G usage ${special}${filesystem_util}/${percent_trigger_float}% is fine"
    fi
  done <<< "$filesystems"
fi

#
# nagios-style conclusion
# https://nagios-plugins.org/doc/guidelines.html#AEN78
#

# Most severe state wins: critical, then unknown, then warning.
if (( crit == 1 )); then
  exit 2
fi
if (( fail == 1 )); then
  exit 3
fi
if (( warn == 1 )); then
  exit 1
fi
exit 0