torrent-gateway/configs/alert_rules.yml
enki b3204ea07a
Some checks are pending
CI Pipeline / Run Tests (push) Waiting to run
CI Pipeline / Lint Code (push) Waiting to run
CI Pipeline / Security Scan (push) Waiting to run
CI Pipeline / Build Docker Images (push) Blocked by required conditions
CI Pipeline / E2E Tests (push) Blocked by required conditions
first commit
2025-08-18 00:40:15 -07:00

100 lines
3.3 KiB
YAML

# Prometheus alerting rules for the Torrent Gateway service.
#
# Every rule follows the same shape:
#   alert       - name of the alert
#   expr        - PromQL condition; the alert is pending while it holds
#   for         - how long the condition must hold before the alert fires
#   labels      - extra labels attached to the alert (severity is used here)
#   annotations - human-readable text; {{ $value }} is the value of expr
groups:
  - name: torrent-gateway-alerts
    rules:
      # Service availability alerts

      # Fires when the scrape target for job "torrent-gateway" reports down.
      # NOTE(review): `up == 0` yields no result if the target disappears from
      # service discovery entirely - confirm whether an absent() check is
      # wanted as well.
      - alert: GatewayDown
        expr: up{job="torrent-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Torrent Gateway is down"
          description: "Torrent Gateway has been down for more than 1 minute"

      # Performance alerts

      # 95th percentile request latency above 2 seconds.
      # The buckets are not aggregated, so the quantile is evaluated separately
      # for every label combination of the histogram.
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(gateway_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency detected"
          description: "95th percentile request latency is {{ $value }}s"

      # Share of 5xx responses above 10% of all requests.
      # Both sides are wrapped in sum(): without aggregation the division
      # matches each 5xx series only with itself (identical label sets), so the
      # ratio would always be 1 and the alert would fire on any 5xx traffic.
      - alert: HighErrorRate
        expr: |
          sum(rate(gateway_requests_total{status_code=~"5.."}[5m]))
            /
          sum(rate(gateway_requests_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # Storage alerts

      # Gateway-reported storage above a fixed threshold.
      # humanize1024 is used because Prometheus has no `humanizeBytes`
      # template function; the trailing "B" completes the unit (e.g. "51.2GiB").
      - alert: HighStorageUsage
        expr: gateway_storage_used_bytes > 50 * 1024 * 1024 * 1024  # 50 GiB
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High storage usage"
          description: "Storage usage is {{ $value | humanize1024 }}B"

      # Used fraction of a filesystem above 90%.
      # NOTE(review): node_filesystem_* are presumably node_exporter metrics and
      # no fstype/mountpoint filter is applied, so every reported filesystem is
      # evaluated - confirm that is intended.
      - alert: LowDiskSpace
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Disk usage is {{ $value | humanizePercentage }}"

      # Cache alerts

      # Cache hit ratio, hits / (hits + misses), below 50%.
      - alert: LowCacheHitRate
        expr: |
          rate(gateway_cache_hits_total[5m])
            /
          (rate(gateway_cache_hits_total[5m]) + rate(gateway_cache_misses_total[5m]))
            < 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low cache hit rate"
          description: "Cache hit rate is {{ $value | humanizePercentage }}"

      # Memory alerts

      # Gateway-reported memory above a fixed threshold.
      # humanize1024 replaces the non-existent `humanizeBytes` template function.
      - alert: HighMemoryUsage
        expr: gateway_memory_usage_bytes > 2 * 1024 * 1024 * 1024  # 2 GiB
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanize1024 }}B"

      # Rate limiting alerts

      # More than 10 rate-limit blocks per second, averaged over 5 minutes.
      - alert: HighRateLimitBlocks
        expr: rate(gateway_rate_limit_blocks_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate limit blocks"
          description: "Rate limit blocks are {{ $value }}/sec"

      # Admin alerts

      # More than 5 admin actions per second, averaged over 5 minutes.
      - alert: SuspiciousAdminActivity
        expr: rate(gateway_admin_actions_total[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High admin activity detected"
          description: "Admin actions rate is {{ $value }}/sec"

      # Database alerts

      # More than 1 database error per second, averaged over 5 minutes.
      - alert: HighDatabaseErrors
        expr: rate(gateway_database_errors_total[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Database errors detected"
          description: "Database error rate is {{ $value }}/sec"