Alerts


/etc/prometheus/rules/probatiovault_alerts.yml > infrastructure
DiskSpaceCritical (0 active)
# Critical once the "/" filesystem has had under 5% of its bytes available
# for 2 minutes.
alert: DiskSpaceCritical
expr: >-
  (node_filesystem_avail_bytes{mountpoint="/"}
  / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 5
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical disk space'
  description: 'Disk space < 5% on {{ $labels.instance }}'
DiskSpaceLow (0 active)
# Warning once the "/" filesystem has had under 15% of its bytes available
# for 5 minutes.
alert: DiskSpaceLow
expr: >-
  (node_filesystem_avail_bytes{mountpoint="/"}
  / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
for: 5m
labels:
  severity: warning
annotations:
  summary: 'Low disk space'
  description: 'Disk space < 15% on {{ $labels.instance }}'
HighCPUUsage (0 active)
# Warning when the non-idle CPU share (100 minus the per-instance average of
# the 5-minute idle rate, in percent) stays above 80 for 10 minutes.
alert: HighCPUUsage
expr: >-
  100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 80
for: 10m
labels:
  severity: warning
annotations:
  summary: 'High CPU usage'
  description: 'CPU usage > 80% on {{ $labels.instance }}'
HighMemoryUsage (0 active)
# Warning when used memory, computed as 1 - MemAvailable / MemTotal, stays
# above 85% for 5 minutes.
alert: HighMemoryUsage
expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High memory usage'
  description: 'Memory usage > 85% on {{ $labels.instance }}'
NodeDown (0 active)
# Critical when a target of the "node" scrape job has reported up == 0 for
# 2 minutes.
alert: NodeDown
expr: 'up{job="node"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Node exporter down'
  description: '{{ $labels.instance }} is down'
/etc/prometheus/rules/probatiovault_alerts.yml > postgresql
PostgreSQLDown (1 active)
# Critical when pg_up has been 0 for 1 minute.
alert: PostgreSQLDown
expr: 'pg_up == 0'
for: 1m
labels:
  severity: critical
annotations:
  summary: 'PostgreSQL is down'
  description: 'PostgreSQL on {{ $labels.instance }} is down'
Labels State Active Since Value
alertname="PostgreSQLDown" instance="vps-2410e330" job="postgres" service="postgresql" severity="critical" firing 2025-12-02 12:45:27.032693203 +0000 UTC 0
PostgreSQLDeadlocks (0 active)
# Warning when the deadlock counter has increased within the last 5 minutes
# and that condition has held for 1 minute; the description names the
# affected database via the datname label.
alert: PostgreSQLDeadlocks
expr: 'increase(pg_stat_database_deadlocks[5m]) > 0'
for: 1m
labels:
  severity: warning
annotations:
  summary: 'PostgreSQL deadlocks'
  description: 'Deadlocks detected on {{ $labels.datname }}'
PostgreSQLSlowQueries (0 active)
# Warning when the 5-minute per-second rate of
# pg_stat_statements_seconds_total stays above 1 for 10 minutes.
alert: PostgreSQLSlowQueries
expr: 'rate(pg_stat_statements_seconds_total[5m]) > 1'
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Slow queries detected'
  description: 'High query time on {{ $labels.instance }}'
PostgreSQLTooManyConnections (0 active)
# Warning when the total number of PostgreSQL connections on an instance
# stays above 80 for 5 minutes.
# pg_stat_activity_count is summed first because comparing it unaggregated
# tests every individual series against 80, not the instance total.
# NOTE(review): assumes the exporter emits one series per database/state
# combination, as postgres_exporter does — confirm for this deployment.
# The job and service labels are kept in the grouping so the alert carries
# the same target labels as the other alerts for this job.
alert: PostgreSQLTooManyConnections
expr: 'sum by (instance, job, service) (pg_stat_activity_count) > 80'
for: 5m
labels:
  severity: warning
annotations:
  summary: 'PostgreSQL connections high'
  description: '{{ $value }} connections on {{ $labels.instance }} (>80)'
/etc/prometheus/rules/probatiovault_alerts.yml > services
APIDown (1 active)
# Critical when a target of the "api" scrape job has reported up == 0 for
# 2 minutes.
alert: APIDown
expr: 'up{job="api"} == 0'
for: 2m
labels:
  severity: critical
annotations:
  summary: 'API Backend down'
  description: 'ProbatioVault API is not responding'
Labels State Active Since Value
alertname="APIDown" instance="vps-2410e330" job="api" service="api-backend" severity="critical" firing 2025-12-02 12:45:36.46725848 +0000 UTC 0
GrafanaDown (0 active)
# Warning when a probe whose instance label contains "grafana" has failed
# (probe_success == 0) for 3 minutes.
alert: GrafanaDown
expr: 'probe_success{instance=~".*grafana.*"} == 0'
for: 3m
labels:
  severity: warning
annotations:
  summary: 'Grafana unreachable'
  description: 'Grafana dashboard is not responding'
PrometheusDown (0 active)
# Critical when a target of the "prometheus" scrape job has reported
# up == 0 for 1 minute.
# NOTE(review): if this rule is evaluated by the same Prometheus server it
# watches, it cannot fire while that server is down — confirm an external
# check covers that case.
alert: PrometheusDown
expr: 'up{job="prometheus"} == 0'
for: 1m
labels:
  severity: critical
annotations:
  summary: 'Prometheus down'
  description: 'Prometheus monitoring is down'
ServiceDown (0 active)
# Generic catch-all: critical when any probed target has failed
# (probe_success == 0) for 3 minutes.
# Instances containing "grafana" or "sonar" are excluded because GrafanaDown
# and SonarQubeDown already cover them at warning severity; without the
# exclusion each such outage would raise a second, critical alert.
alert: ServiceDown
expr: 'probe_success{instance!~".*(grafana|sonar).*"} == 0'
for: 3m
labels:
  severity: critical
annotations:
  summary: 'Service unreachable'
  description: '{{ $labels.instance }} is not responding'
SonarQubeDown (0 active)
# Warning when a probe whose instance label contains "sonar" has failed
# (probe_success == 0) for 5 minutes.
alert: SonarQubeDown
expr: 'probe_success{instance=~".*sonar.*"} == 0'
for: 5m
labels:
  severity: warning
annotations:
  summary: 'SonarQube unreachable'
  description: 'SonarQube is not responding'
/etc/prometheus/rules/probatiovault_alerts.yml > ssl
SSLCertExpired (0 active)
# Critical when the earliest certificate expiry seen by a probe lies in the
# past (expiry minus current time is negative) for 1 minute.
alert: SSLCertExpired
expr: 'probe_ssl_earliest_cert_expiry - time() < 0'
for: 1m
labels:
  severity: critical
annotations:
  summary: 'SSL certificate expired'
  description: 'Certificate for {{ $labels.instance }} has expired'
SSLCertExpiringSoon (0 active)
# Warning when a probed certificate will expire within the next 14 days
# (86400 seconds * 14) and that has held for 1 hour.
# The lower bound (>= 0) keeps already-expired certificates out of this
# alert: SSLCertExpired reports those, and "expires in < 14 days" would be
# inaccurate for them.
alert: SSLCertExpiringSoon
expr: >-
  (probe_ssl_earliest_cert_expiry - time() < 86400 * 14)
  and (probe_ssl_earliest_cert_expiry - time() >= 0)
for: 1h
labels:
  severity: warning
annotations:
  summary: 'SSL certificate expiring soon'
  description: 'Certificate for {{ $labels.instance }} expires in < 14 days'