feat: expose monitoring tg health detail
This commit is contained in:
@@ -130,6 +130,13 @@ type messageOut struct {
|
|||||||
FetchedAt time.Time `json:"fetched_at"`
|
FetchedAt time.Time `json:"fetched_at"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type componentProbe struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
LatencyMs int64 `json:"latency_ms"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
cfg := loadConfig()
|
cfg := loadConfig()
|
||||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
|
logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
|
||||||
@@ -186,6 +193,10 @@ func (a *app) serveHTTP(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if path == "/health/detail" {
|
||||||
|
a.handleHealthDetail(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
if path == "/" {
|
if path == "/" {
|
||||||
writeJSON(w, http.StatusOK, map[string]string{"service": "monitoring-tg", "ui": "portal", "api": "go"})
|
writeJSON(w, http.StatusOK, map[string]string{"service": "monitoring-tg", "ui": "portal", "api": "go"})
|
||||||
return
|
return
|
||||||
@@ -245,6 +256,156 @@ func (a *app) apiPath(path string) string {
|
|||||||
return path
|
return path
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *app) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
components := []componentProbe{
|
||||||
|
a.probePostgres(ctx),
|
||||||
|
a.probeAIService(ctx),
|
||||||
|
a.probeClassificationQueue(ctx),
|
||||||
|
a.probePoller(ctx),
|
||||||
|
a.probeMediaStorage(ctx),
|
||||||
|
a.probeMediaMetadata(ctx),
|
||||||
|
}
|
||||||
|
writeJSON(w, http.StatusOK, map[string]any{"components": components})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probePostgres(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
if err := a.db.Ping(ctx); err != nil {
|
||||||
|
return componentProbe{Name: "postgres", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "postgres", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probeAIService(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
if !a.cfg.LLMEnabled {
|
||||||
|
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm disabled"}
|
||||||
|
}
|
||||||
|
status, err := a.ai.ProvidersStatus(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
for _, provider := range status.Providers {
|
||||||
|
if provider.Name != "llm" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if provider.Configured && provider.OK {
|
||||||
|
return componentProbe{Name: "ai_service", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
errMsg := strings.TrimSpace(provider.Error)
|
||||||
|
if errMsg == "" {
|
||||||
|
errMsg = "llm provider is not ready"
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: errMsg}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "ai_service", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm provider not found"}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probeClassificationQueue(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
var pending, pending24h int64
|
||||||
|
err := a.db.QueryRow(ctx, `
|
||||||
|
SELECT
|
||||||
|
COUNT(*)::bigint,
|
||||||
|
COUNT(*) FILTER (WHERE m.fetched_at >= now() - interval '24 hours')::bigint
|
||||||
|
FROM channels c
|
||||||
|
JOIN channels src ON src.id = COALESCE(c.source_channel_id, c.id)
|
||||||
|
JOIN messages m ON m.channel_id = src.id
|
||||||
|
JOIN sections s ON s.id = c.section_id
|
||||||
|
LEFT JOIN message_classifications mc ON mc.message_id = m.id AND mc.section_id = s.id
|
||||||
|
WHERE m.text IS NOT NULL
|
||||||
|
AND mc.id IS NULL`,
|
||||||
|
).Scan(&pending, &pending24h)
|
||||||
|
if err != nil {
|
||||||
|
return componentProbe{Name: "classification_queue", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
if pending > 0 {
|
||||||
|
return componentProbe{
|
||||||
|
Name: "classification_queue",
|
||||||
|
Status: "down",
|
||||||
|
LatencyMs: time.Since(start).Milliseconds(),
|
||||||
|
Error: "pending=" + strconv.FormatInt(pending, 10) + " pending_24h=" + strconv.FormatInt(pending24h, 10),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "classification_queue", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probePoller(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
staleAfter := maxInt(a.cfg.PollIntervalSeconds*3, 900)
|
||||||
|
var active, neverPolled, stale int64
|
||||||
|
err := a.db.QueryRow(ctx, `
|
||||||
|
SELECT
|
||||||
|
COUNT(*)::bigint,
|
||||||
|
COUNT(*) FILTER (WHERE last_polled_at IS NULL)::bigint,
|
||||||
|
COUNT(*) FILTER (WHERE last_polled_at IS NOT NULL AND last_polled_at < now() - ($1::int * interval '1 second'))::bigint
|
||||||
|
FROM channels
|
||||||
|
WHERE is_active = true
|
||||||
|
AND source_channel_id IS NULL`,
|
||||||
|
staleAfter,
|
||||||
|
).Scan(&active, &neverPolled, &stale)
|
||||||
|
if err != nil {
|
||||||
|
return componentProbe{Name: "poller", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
if active > 0 && (neverPolled > 0 || stale > 0) {
|
||||||
|
return componentProbe{
|
||||||
|
Name: "poller",
|
||||||
|
Status: "down",
|
||||||
|
LatencyMs: time.Since(start).Milliseconds(),
|
||||||
|
Error: "active=" + strconv.FormatInt(active, 10) +
|
||||||
|
" never_polled=" + strconv.FormatInt(neverPolled, 10) +
|
||||||
|
" stale=" + strconv.FormatInt(stale, 10) +
|
||||||
|
" stale_after_sec=" + strconv.Itoa(staleAfter),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "poller", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probeMediaStorage(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
if a.minio == nil || a.cfg.MinioBucket == "" {
|
||||||
|
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "minio not configured"}
|
||||||
|
}
|
||||||
|
exists, err := a.minio.BucketExists(ctx, a.cfg.MinioBucket)
|
||||||
|
if err != nil {
|
||||||
|
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
if !exists {
|
||||||
|
return componentProbe{Name: "media_storage", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "bucket not found: " + a.cfg.MinioBucket}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "media_storage", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *app) probeMediaMetadata(ctx context.Context) componentProbe {
|
||||||
|
start := time.Now()
|
||||||
|
var withMedia, missingFiles int64
|
||||||
|
err := a.db.QueryRow(ctx, `
|
||||||
|
SELECT
|
||||||
|
COUNT(*) FILTER (WHERE has_media = true)::bigint,
|
||||||
|
COUNT(*) FILTER (
|
||||||
|
WHERE has_media = true
|
||||||
|
AND jsonb_array_length(COALESCE(media_files, '[]'::jsonb)) = 0
|
||||||
|
)::bigint
|
||||||
|
FROM messages`,
|
||||||
|
).Scan(&withMedia, &missingFiles)
|
||||||
|
if err != nil {
|
||||||
|
return componentProbe{Name: "media_metadata", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||||
|
}
|
||||||
|
if missingFiles > 0 {
|
||||||
|
return componentProbe{
|
||||||
|
Name: "media_metadata",
|
||||||
|
Status: "down",
|
||||||
|
LatencyMs: time.Since(start).Milliseconds(),
|
||||||
|
Error: "messages_with_media=" + strconv.FormatInt(withMedia, 10) +
|
||||||
|
" missing_media_files=" + strconv.FormatInt(missingFiles, 10),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return componentProbe{Name: "media_metadata", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||||
|
}
|
||||||
|
|
||||||
func (a *app) handleAccessMe(w http.ResponseWriter, r *http.Request) {
|
func (a *app) handleAccessMe(w http.ResponseWriter, r *http.Request) {
|
||||||
scope := readAccess(r)
|
scope := readAccess(r)
|
||||||
writeJSON(w, http.StatusOK, map[string]any{
|
writeJSON(w, http.StatusOK, map[string]any{
|
||||||
|
|||||||
Reference in New Issue
Block a user