enki b3204ea07a
Some checks are pending
CI Pipeline / Run Tests (push) Waiting to run
CI Pipeline / Lint Code (push) Waiting to run
CI Pipeline / Security Scan (push) Waiting to run
CI Pipeline / Build Docker Images (push) Blocked by required conditions
CI Pipeline / E2E Tests (push) Blocked by required conditions
first commit
2025-08-18 00:40:15 -07:00

519 lines
14 KiB
Go

package metrics
import (
"fmt"
"log"
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metrics holds all Prometheus metrics for the gateway
type Metrics struct {
// Request metrics
RequestsTotal *prometheus.CounterVec
RequestDuration *prometheus.HistogramVec
ActiveConnections prometheus.Gauge
// Upload metrics
UploadsTotal *prometheus.CounterVec
UploadSize *prometheus.HistogramVec
UploadDuration *prometheus.HistogramVec
// Download metrics
DownloadsTotal *prometheus.CounterVec
DownloadSize *prometheus.HistogramVec
DownloadDuration *prometheus.HistogramVec
// Stream metrics
StreamsActive prometheus.Gauge
StreamsTotal *prometheus.CounterVec
StreamDuration *prometheus.HistogramVec
// Storage metrics
StorageUsed prometheus.Gauge
FilesStored prometheus.Gauge
ChunksStored prometheus.Gauge
BlobsStored prometheus.Gauge
// Cache metrics
CacheHits *prometheus.CounterVec
CacheMisses *prometheus.CounterVec
CacheSize *prometheus.GaugeVec
CacheMemoryUsage *prometheus.GaugeVec
// Rate limiting metrics
RateLimitHits *prometheus.CounterVec
RateLimitBlocks *prometheus.CounterVec
// Admin metrics
AdminActions *prometheus.CounterVec
BannedUsers prometheus.Gauge
ContentReports *prometheus.CounterVec
// System metrics
DatabaseQueries *prometheus.CounterVec
DatabaseErrors *prometheus.CounterVec
GoroutineCount prometheus.Gauge
MemoryUsage prometheus.Gauge
// Blossom pool metrics
BlossomPoolServers *prometheus.GaugeVec
BlossomPoolRequests *prometheus.CounterVec
BlossomPoolErrors *prometheus.CounterVec
}
// NewMetrics creates and registers all Prometheus metrics
func NewMetrics() *Metrics {
m := &Metrics{
// Request metrics
RequestsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status_code"},
),
RequestDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
),
ActiveConnections: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_active_connections",
Help: "Number of active HTTP connections",
},
),
// Upload metrics
UploadsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_uploads_total",
Help: "Total number of file uploads",
},
[]string{"storage_type", "status"},
),
UploadSize: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_upload_size_bytes",
Help: "Upload file size in bytes",
Buckets: []float64{1024, 10240, 102400, 1048576, 10485760, 104857600, 1073741824}, // 1KB to 1GB
},
[]string{"storage_type"},
),
UploadDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_upload_duration_seconds",
Help: "Upload duration in seconds",
Buckets: []float64{0.1, 0.5, 1, 5, 10, 30, 60, 300}, // 100ms to 5min
},
[]string{"storage_type"},
),
// Download metrics
DownloadsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_downloads_total",
Help: "Total number of file downloads",
},
[]string{"storage_type", "status"},
),
DownloadSize: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_download_size_bytes",
Help: "Download file size in bytes",
Buckets: []float64{1024, 10240, 102400, 1048576, 10485760, 104857600, 1073741824},
},
[]string{"storage_type"},
),
DownloadDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_download_duration_seconds",
Help: "Download duration in seconds",
Buckets: []float64{0.1, 0.5, 1, 5, 10, 30, 60, 300},
},
[]string{"storage_type"},
),
// Stream metrics
StreamsActive: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_streams_active",
Help: "Number of active streams",
},
),
StreamsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_streams_total",
Help: "Total number of streams started",
},
[]string{"file_type", "status"},
),
StreamDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "gateway_stream_duration_seconds",
Help: "Stream duration in seconds",
Buckets: []float64{1, 10, 60, 300, 1800, 3600}, // 1s to 1h
},
[]string{"file_type"},
),
// Storage metrics
StorageUsed: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_storage_used_bytes",
Help: "Total storage used in bytes",
},
),
FilesStored: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_files_stored_total",
Help: "Total number of files stored",
},
),
ChunksStored: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_chunks_stored_total",
Help: "Total number of chunks stored",
},
),
BlobsStored: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_blobs_stored_total",
Help: "Total number of blobs stored",
},
),
// Cache metrics
CacheHits: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_cache_hits_total",
Help: "Total number of cache hits",
},
[]string{"cache_type"},
),
CacheMisses: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_cache_misses_total",
Help: "Total number of cache misses",
},
[]string{"cache_type"},
),
CacheSize: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "gateway_cache_size_items",
Help: "Number of items in cache",
},
[]string{"cache_type"},
),
CacheMemoryUsage: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "gateway_cache_memory_bytes",
Help: "Memory usage of cache in bytes",
},
[]string{"cache_type"},
),
// Rate limiting metrics
RateLimitHits: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_rate_limit_hits_total",
Help: "Total number of rate limit hits",
},
[]string{"limit_type"},
),
RateLimitBlocks: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_rate_limit_blocks_total",
Help: "Total number of rate limit blocks",
},
[]string{"limit_type"},
),
// Admin metrics
AdminActions: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_admin_actions_total",
Help: "Total number of admin actions",
},
[]string{"action_type", "admin_pubkey"},
),
BannedUsers: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_banned_users_total",
Help: "Total number of banned users",
},
),
ContentReports: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_content_reports_total",
Help: "Total number of content reports",
},
[]string{"status"},
),
// System metrics
DatabaseQueries: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_database_queries_total",
Help: "Total number of database queries",
},
[]string{"operation", "table"},
),
DatabaseErrors: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_database_errors_total",
Help: "Total number of database errors",
},
[]string{"operation", "table"},
),
GoroutineCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_goroutines_active",
Help: "Number of active goroutines",
},
),
MemoryUsage: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "gateway_memory_usage_bytes",
Help: "Memory usage in bytes",
},
),
// Blossom pool metrics
BlossomPoolServers: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "gateway_blossom_pool_servers",
Help: "Number of Blossom pool servers by status",
},
[]string{"status"}, // healthy, unhealthy
),
BlossomPoolRequests: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_blossom_pool_requests_total",
Help: "Total number of Blossom pool requests",
},
[]string{"server", "status"},
),
BlossomPoolErrors: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gateway_blossom_pool_errors_total",
Help: "Total number of Blossom pool errors",
},
[]string{"server", "error_type"},
),
}
// Register all metrics
prometheus.MustRegister(
m.RequestsTotal,
m.RequestDuration,
m.ActiveConnections,
m.UploadsTotal,
m.UploadSize,
m.UploadDuration,
m.DownloadsTotal,
m.DownloadSize,
m.DownloadDuration,
m.StreamsActive,
m.StreamsTotal,
m.StreamDuration,
m.StorageUsed,
m.FilesStored,
m.ChunksStored,
m.BlobsStored,
m.CacheHits,
m.CacheMisses,
m.CacheSize,
m.CacheMemoryUsage,
m.RateLimitHits,
m.RateLimitBlocks,
m.AdminActions,
m.BannedUsers,
m.ContentReports,
m.DatabaseQueries,
m.DatabaseErrors,
m.GoroutineCount,
m.MemoryUsage,
m.BlossomPoolServers,
m.BlossomPoolRequests,
m.BlossomPoolErrors,
)
return m
}
// HTTPMiddleware wraps HTTP handlers to collect request metrics
func (m *Metrics) HTTPMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
start := time.Now()
// Increment active connections
m.ActiveConnections.Inc()
defer m.ActiveConnections.Dec()
// Wrap response writer to capture status code
ww := &wrappedWriter{ResponseWriter: w, statusCode: 200}
// Call next handler
next.ServeHTTP(ww, r)
// Record metrics
duration := time.Since(start).Seconds()
endpoint := r.URL.Path
method := r.Method
statusCode := ww.statusCode
m.RequestsTotal.WithLabelValues(method, endpoint, string(rune(statusCode))).Inc()
m.RequestDuration.WithLabelValues(method, endpoint).Observe(duration)
})
}
// wrappedWriter wraps http.ResponseWriter to capture status code
type wrappedWriter struct {
http.ResponseWriter
statusCode int
}
func (w *wrappedWriter) WriteHeader(statusCode int) {
w.statusCode = statusCode
w.ResponseWriter.WriteHeader(statusCode)
}
// RecordUpload records upload metrics
func (m *Metrics) RecordUpload(storageType string, size int64, duration time.Duration, success bool) {
status := "success"
if !success {
status = "error"
}
m.UploadsTotal.WithLabelValues(storageType, status).Inc()
m.UploadSize.WithLabelValues(storageType).Observe(float64(size))
m.UploadDuration.WithLabelValues(storageType).Observe(duration.Seconds())
}
// RecordDownload records download metrics
func (m *Metrics) RecordDownload(storageType string, size int64, duration time.Duration, success bool) {
status := "success"
if !success {
status = "error"
}
m.DownloadsTotal.WithLabelValues(storageType, status).Inc()
m.DownloadSize.WithLabelValues(storageType).Observe(float64(size))
m.DownloadDuration.WithLabelValues(storageType).Observe(duration.Seconds())
}
// RecordStream records streaming metrics
func (m *Metrics) RecordStream(fileType string, duration time.Duration, success bool) {
status := "success"
if !success {
status = "error"
}
m.StreamsTotal.WithLabelValues(fileType, status).Inc()
m.StreamDuration.WithLabelValues(fileType).Observe(duration.Seconds())
}
// UpdateStorageMetrics updates storage-related metrics
func (m *Metrics) UpdateStorageMetrics(storageUsed int64, filesCount, chunksCount, blobsCount int) {
m.StorageUsed.Set(float64(storageUsed))
m.FilesStored.Set(float64(filesCount))
m.ChunksStored.Set(float64(chunksCount))
m.BlobsStored.Set(float64(blobsCount))
}
// RecordCacheOperation records cache hit/miss
func (m *Metrics) RecordCacheOperation(cacheType string, hit bool) {
if hit {
m.CacheHits.WithLabelValues(cacheType).Inc()
} else {
m.CacheMisses.WithLabelValues(cacheType).Inc()
}
}
// UpdateCacheMetrics updates cache size and memory usage
func (m *Metrics) UpdateCacheMetrics(cacheType string, size int, memoryUsage int64) {
m.CacheSize.WithLabelValues(cacheType).Set(float64(size))
m.CacheMemoryUsage.WithLabelValues(cacheType).Set(float64(memoryUsage))
}
// RecordRateLimit records rate limiting events
func (m *Metrics) RecordRateLimit(limitType string, blocked bool) {
if blocked {
m.RateLimitBlocks.WithLabelValues(limitType).Inc()
} else {
m.RateLimitHits.WithLabelValues(limitType).Inc()
}
}
// RecordAdminAction records admin actions
func (m *Metrics) RecordAdminAction(actionType, adminPubkey string) {
m.AdminActions.WithLabelValues(actionType, adminPubkey[:16]+"...").Inc()
}
// UpdateAdminMetrics updates admin-related metrics
func (m *Metrics) UpdateAdminMetrics(bannedUsersCount int) {
m.BannedUsers.Set(float64(bannedUsersCount))
}
// RecordContentReport records content reports
func (m *Metrics) RecordContentReport(status string) {
m.ContentReports.WithLabelValues(status).Inc()
}
// RecordDatabaseOperation records database queries and errors
func (m *Metrics) RecordDatabaseOperation(operation, table string, success bool) {
m.DatabaseQueries.WithLabelValues(operation, table).Inc()
if !success {
m.DatabaseErrors.WithLabelValues(operation, table).Inc()
}
}
// UpdateSystemMetrics updates system-level metrics
func (m *Metrics) UpdateSystemMetrics(goroutineCount int, memoryUsage int64) {
m.GoroutineCount.Set(float64(goroutineCount))
m.MemoryUsage.Set(float64(memoryUsage))
}
// RecordBlossomPoolOperation records Blossom pool metrics
func (m *Metrics) RecordBlossomPoolOperation(server, status string, success bool) {
m.BlossomPoolRequests.WithLabelValues(server, status).Inc()
if !success {
m.BlossomPoolErrors.WithLabelValues(server, "request_failed").Inc()
}
}
// UpdateBlossomPoolHealth updates Blossom pool server health metrics
func (m *Metrics) UpdateBlossomPoolHealth(healthyCount, unhealthyCount int) {
m.BlossomPoolServers.WithLabelValues("healthy").Set(float64(healthyCount))
m.BlossomPoolServers.WithLabelValues("unhealthy").Set(float64(unhealthyCount))
}
// Handler returns the Prometheus metrics HTTP handler
func (m *Metrics) Handler() http.Handler {
return promhttp.Handler()
}
// StartMetricsServer starts a dedicated metrics server
func (m *Metrics) StartMetricsServer(port int) {
mux := http.NewServeMux()
mux.Handle("/metrics", m.Handler())
server := &http.Server{
Addr: fmt.Sprintf(":%d", port),
Handler: mux,
}
go func() {
if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Printf("Metrics server error: %v", err)
}
}()
}