feat: expose monitoring tg health detail

This commit is contained in:
Grendgi
2026-06-17 15:55:28 +03:00
parent 90c0a346ed
commit 7cc5109cba

View File

@@ -130,6 +130,13 @@ type messageOut struct {
FetchedAt time.Time `json:"fetched_at"`
}
type componentProbe struct {
Name string `json:"name"`
Status string `json:"status"`
LatencyMs int64 `json:"latency_ms"`
Error string `json:"error,omitempty"`
}
func main() {
cfg := loadConfig()
logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
@@ -186,6 +193,10 @@ func (a *app) serveHTTP(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
return
}
if path == "/health/detail" {
a.handleHealthDetail(w, r)
return
}
if path == "/" {
writeJSON(w, http.StatusOK, map[string]string{"service": "monitoring-tg", "ui": "portal", "api": "go"})
return
@@ -245,6 +256,156 @@ func (a *app) apiPath(path string) string {
return path
}
func (a *app) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second)
defer cancel()
components := []componentProbe{
a.probePostgres(ctx),
a.probeAIService(ctx),
a.probeClassificationQueue(ctx),
a.probePoller(ctx),
a.probeMediaStorage(ctx),
a.probeMediaMetadata(ctx),
}
writeJSON(w, http.StatusOK, map[string]any{"components": components})
}
func (a *app) probePostgres(ctx context.Context) componentProbe {
start := time.Now()
if err := a.db.Ping(ctx); err != nil {
return componentProbe{Name: "postgres", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
return componentProbe{Name: "postgres", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
func (a *app) probeAIService(ctx context.Context) componentProbe {
start := time.Now()
if !a.cfg.LLMEnabled {
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm disabled"}
}
status, err := a.ai.ProvidersStatus(ctx)
if err != nil {
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
for _, provider := range status.Providers {
if provider.Name != "llm" {
continue
}
if provider.Configured && provider.OK {
return componentProbe{Name: "ai_service", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
errMsg := strings.TrimSpace(provider.Error)
if errMsg == "" {
errMsg = "llm provider is not ready"
}
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: errMsg}
}
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm provider not found"}
}
func (a *app) probeClassificationQueue(ctx context.Context) componentProbe {
start := time.Now()
var pending, pending24h int64
err := a.db.QueryRow(ctx, `
SELECT
COUNT(*)::bigint,
COUNT(*) FILTER (WHERE m.fetched_at >= now() - interval '24 hours')::bigint
FROM channels c
JOIN channels src ON src.id = COALESCE(c.source_channel_id, c.id)
JOIN messages m ON m.channel_id = src.id
JOIN sections s ON s.id = c.section_id
LEFT JOIN message_classifications mc ON mc.message_id = m.id AND mc.section_id = s.id
WHERE m.text IS NOT NULL
AND mc.id IS NULL`,
).Scan(&pending, &pending24h)
if err != nil {
return componentProbe{Name: "classification_queue", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
if pending > 0 {
return componentProbe{
Name: "classification_queue",
Status: "down",
LatencyMs: time.Since(start).Milliseconds(),
Error: "pending=" + strconv.FormatInt(pending, 10) + " pending_24h=" + strconv.FormatInt(pending24h, 10),
}
}
return componentProbe{Name: "classification_queue", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
func (a *app) probePoller(ctx context.Context) componentProbe {
start := time.Now()
staleAfter := maxInt(a.cfg.PollIntervalSeconds*3, 900)
var active, neverPolled, stale int64
err := a.db.QueryRow(ctx, `
SELECT
COUNT(*)::bigint,
COUNT(*) FILTER (WHERE last_polled_at IS NULL)::bigint,
COUNT(*) FILTER (WHERE last_polled_at IS NOT NULL AND last_polled_at < now() - ($1::int * interval '1 second'))::bigint
FROM channels
WHERE is_active = true
AND source_channel_id IS NULL`,
staleAfter,
).Scan(&active, &neverPolled, &stale)
if err != nil {
return componentProbe{Name: "poller", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
if active > 0 && (neverPolled > 0 || stale > 0) {
return componentProbe{
Name: "poller",
Status: "down",
LatencyMs: time.Since(start).Milliseconds(),
Error: "active=" + strconv.FormatInt(active, 10) +
" never_polled=" + strconv.FormatInt(neverPolled, 10) +
" stale=" + strconv.FormatInt(stale, 10) +
" stale_after_sec=" + strconv.Itoa(staleAfter),
}
}
return componentProbe{Name: "poller", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
func (a *app) probeMediaStorage(ctx context.Context) componentProbe {
start := time.Now()
if a.minio == nil || a.cfg.MinioBucket == "" {
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "minio not configured"}
}
exists, err := a.minio.BucketExists(ctx, a.cfg.MinioBucket)
if err != nil {
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
if !exists {
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "bucket not found: " + a.cfg.MinioBucket}
}
return componentProbe{Name: "media_storage", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
func (a *app) probeMediaMetadata(ctx context.Context) componentProbe {
start := time.Now()
var withMedia, missingFiles int64
err := a.db.QueryRow(ctx, `
SELECT
COUNT(*) FILTER (WHERE has_media = true)::bigint,
COUNT(*) FILTER (
WHERE has_media = true
AND jsonb_array_length(COALESCE(media_files, '[]'::jsonb)) = 0
)::bigint
FROM messages`,
).Scan(&withMedia, &missingFiles)
if err != nil {
return componentProbe{Name: "media_metadata", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
}
if missingFiles > 0 {
return componentProbe{
Name: "media_metadata",
Status: "down",
LatencyMs: time.Since(start).Milliseconds(),
Error: "messages_with_media=" + strconv.FormatInt(withMedia, 10) +
" missing_media_files=" + strconv.FormatInt(missingFiles, 10),
}
}
return componentProbe{Name: "media_metadata", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
}
func (a *app) handleAccessMe(w http.ResponseWriter, r *http.Request) {
scope := readAccess(r)
writeJSON(w, http.StatusOK, map[string]any{