diff --git a/cmd/server/main.go b/cmd/server/main.go index 7f8d5ed..2894950 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -130,6 +130,13 @@ type messageOut struct { FetchedAt time.Time `json:"fetched_at"` } +type componentProbe struct { + Name string `json:"name"` + Status string `json:"status"` + LatencyMs int64 `json:"latency_ms"` + Error string `json:"error,omitempty"` +} + func main() { cfg := loadConfig() logger := slog.New(slog.NewJSONHandler(os.Stdout, nil)) @@ -186,6 +193,10 @@ func (a *app) serveHTTP(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, map[string]string{"status": "ok"}) return } + if path == "/health/detail" { + a.handleHealthDetail(w, r) + return + } if path == "/" { writeJSON(w, http.StatusOK, map[string]string{"service": "monitoring-tg", "ui": "portal", "api": "go"}) return @@ -245,6 +256,156 @@ func (a *app) apiPath(path string) string { return path } +func (a *app) handleHealthDetail(w http.ResponseWriter, r *http.Request) { + ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second) + defer cancel() + + components := []componentProbe{ + a.probePostgres(ctx), + a.probeAIService(ctx), + a.probeClassificationQueue(ctx), + a.probePoller(ctx), + a.probeMediaStorage(ctx), + a.probeMediaMetadata(ctx), + } + writeJSON(w, http.StatusOK, map[string]any{"components": components}) +} + +func (a *app) probePostgres(ctx context.Context) componentProbe { + start := time.Now() + if err := a.db.Ping(ctx); err != nil { + return componentProbe{Name: "postgres", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + return componentProbe{Name: "postgres", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} +} + +func (a *app) probeAIService(ctx context.Context) componentProbe { + start := time.Now() + if !a.cfg.LLMEnabled { + return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm disabled"} + } + status, err := a.ai.ProvidersStatus(ctx) + if err != nil { + return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + for _, provider := range status.Providers { + if provider.Name != "llm" { + continue + } + if provider.Configured && provider.OK { + return componentProbe{Name: "ai_service", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} + } + errMsg := strings.TrimSpace(provider.Error) + if errMsg == "" { + errMsg = "llm provider is not ready" + } + return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: errMsg} + } + return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm provider not found"} +} + +func (a *app) probeClassificationQueue(ctx context.Context) componentProbe { + start := time.Now() + var pending, pending24h int64 + err := a.db.QueryRow(ctx, ` + SELECT + COUNT(*)::bigint, + COUNT(*) FILTER (WHERE m.fetched_at >= now() - interval '24 hours')::bigint + FROM channels c + JOIN channels src ON src.id = COALESCE(c.source_channel_id, c.id) + JOIN messages m ON m.channel_id = src.id + JOIN sections s ON s.id = c.section_id + LEFT JOIN message_classifications mc ON mc.message_id = m.id AND mc.section_id = s.id + WHERE m.text IS NOT NULL + AND mc.id IS NULL`, + ).Scan(&pending, &pending24h) + if err != nil { + return componentProbe{Name: "classification_queue", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + if pending > 0 { + return componentProbe{ + Name: "classification_queue", + Status: "down", + LatencyMs: time.Since(start).Milliseconds(), + Error: "pending=" + strconv.FormatInt(pending, 10) + " pending_24h=" + strconv.FormatInt(pending24h, 10), + } + } + return componentProbe{Name: "classification_queue", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} +} + +func (a *app) probePoller(ctx context.Context) componentProbe { + start := time.Now() + staleAfter := maxInt(a.cfg.PollIntervalSeconds*3, 900) + var active, neverPolled, stale int64 + err := a.db.QueryRow(ctx, ` + SELECT + COUNT(*)::bigint, + COUNT(*) FILTER (WHERE last_polled_at IS NULL)::bigint, + COUNT(*) FILTER (WHERE last_polled_at IS NOT NULL AND last_polled_at < now() - ($1::int * interval '1 second'))::bigint + FROM channels + WHERE is_active = true + AND source_channel_id IS NULL`, + staleAfter, + ).Scan(&active, &neverPolled, &stale) + if err != nil { + return componentProbe{Name: "poller", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + if active > 0 && (neverPolled > 0 || stale > 0) { + return componentProbe{ + Name: "poller", + Status: "down", + LatencyMs: time.Since(start).Milliseconds(), + Error: "active=" + strconv.FormatInt(active, 10) + + " never_polled=" + strconv.FormatInt(neverPolled, 10) + + " stale=" + strconv.FormatInt(stale, 10) + + " stale_after_sec=" + strconv.Itoa(staleAfter), + } + } + return componentProbe{Name: "poller", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} +} + +func (a *app) probeMediaStorage(ctx context.Context) componentProbe { + start := time.Now() + if a.minio == nil || a.cfg.MinioBucket == "" { + return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "minio not configured"} + } + exists, err := a.minio.BucketExists(ctx, a.cfg.MinioBucket) + if err != nil { + return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + if !exists { + return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "bucket not found: " + a.cfg.MinioBucket} + } + return componentProbe{Name: "media_storage", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} +} + +func (a *app) probeMediaMetadata(ctx context.Context) componentProbe { + start := time.Now() + var withMedia, missingFiles int64 + err := a.db.QueryRow(ctx, ` + SELECT + COUNT(*) FILTER (WHERE has_media = true)::bigint, + COUNT(*) FILTER ( + WHERE has_media = true + AND jsonb_array_length(COALESCE(media_files, '[]'::jsonb)) = 0 + )::bigint + FROM messages`, + ).Scan(&withMedia, &missingFiles) + if err != nil { + return componentProbe{Name: "media_metadata", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()} + } + if missingFiles > 0 { + return componentProbe{ + Name: "media_metadata", + Status: "down", + LatencyMs: time.Since(start).Milliseconds(), + Error: "messages_with_media=" + strconv.FormatInt(withMedia, 10) + + " missing_media_files=" + strconv.FormatInt(missingFiles, 10), + } + } + return componentProbe{Name: "media_metadata", Status: "ok", LatencyMs: time.Since(start).Milliseconds()} +} + func (a *app) handleAccessMe(w http.ResponseWriter, r *http.Request) { scope := readAccess(r) writeJSON(w, http.StatusOK, map[string]any{