alerts-alertmanager | alerts-vmalert | alerts-acceptance
vi /root/alerts.yaml
groups:
  - name: alarms
    rules:
      # Fires when the 1-minute average of log_metric_gauge_cpu_p stays
      # above the threshold for the whole `for` window.
      - alert: cpu usage hits the roof
        # threshold: 20% while testing, 95% in prod
        expr: avg_over_time(log_metric_gauge_cpu_p[1m]) > 20
        # hold time: 5s while testing, 5m in prod
        for: 5s
        labels:
          # severity scale: https://betterstack.com/community/guides/incident-management/severity-levels/
          severity: sev5
        annotations:
          # vmui graph of the same expression the alert evaluates
          dashboard: 'https://vmetrics.nethence.com/vmui/#/?g0.expr=avg_over_time%28log_metric_gauge_cpu_p%5B1m%5D%29'
check that Alertmanager is up and running
# the alertmanager host name must resolve and the host must answer
ping alertmanager
# port 9093 (the one given to -notifier.url) must be open
nmap -p 9093 alertmanager
vi /etc/rc.local
echo starting vmalert
# Start vmalert in the background: rules come from /root/alerts.yaml, queries
# go to the datasource on 127.0.0.1:8428, notifications go to alertmanager:9093.
# 2>&1 sends stderr to the log file too; nohup only folds stderr into stdout
# when stderr is a terminal, which is not the case when rc.local runs at boot.
nohup vmalert-prod -rule=/root/alerts.yaml -datasource.url=http://127.0.0.1:8428 \
-notifier.showURL \
-notifier.suppressDuplicateTargetErrors \
-notifier.url=http://alertmanager:9093 \
> /var/log/vmalert.log 2>&1 &
# alternative to -notifier.url while testing (presumably evaluates rules
# without sending notifications; confirm in the vmalert docs):
# -notifier.blackhole
tail -F /var/log/vmalert.log
reload
# show the running vmalert process(es) and their command line
pgrep -a vmalert
# send SIGHUP so vmalert re-reads its rule files without a restart
kill -HUP $(pgrep vmalert)
https://docs.victoriametrics.com/vmalert.html
https://docs.victoriametrics.com/guides/guide-vmanomaly-vmalert.html