feat: expose monitoring tg ai job health
This commit is contained in:
@@ -264,6 +264,7 @@ func (a *app) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
|
||||
a.probePostgres(ctx),
|
||||
a.probeAIService(ctx),
|
||||
a.probeClassificationQueue(ctx),
|
||||
a.probeAIJobs(ctx),
|
||||
a.probePoller(ctx),
|
||||
a.probeMediaStorage(ctx),
|
||||
a.probeMediaMetadata(ctx),
|
||||
@@ -333,6 +334,64 @@ func (a *app) probeClassificationQueue(ctx context.Context) componentProbe {
|
||||
return componentProbe{Name: "classification_queue", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||
}
|
||||
|
||||
func (a *app) probeAIJobs(ctx context.Context) componentProbe {
|
||||
start := time.Now()
|
||||
if !a.cfg.LLMEnabled {
|
||||
return componentProbe{Name: "ai_jobs", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: "llm disabled"}
|
||||
}
|
||||
stats, err := a.ai.Stats(ctx)
|
||||
if err != nil {
|
||||
return componentProbe{Name: "ai_jobs", Status: "down", LatencyMs: time.Since(start).Milliseconds(), Error: err.Error()}
|
||||
}
|
||||
var pending, running, staleRunning, failed, failed24h int64
|
||||
for _, row := range stats.Backlog {
|
||||
if row.OwnerService != "monitoring-tg" || row.TaskType != "telegram_classification" {
|
||||
continue
|
||||
}
|
||||
pending += row.Pending
|
||||
running += row.Running
|
||||
staleRunning += row.StaleRunning
|
||||
}
|
||||
for _, row := range stats.Owners {
|
||||
if row.OwnerService != "monitoring-tg" || row.TaskType != "telegram_classification" {
|
||||
continue
|
||||
}
|
||||
if row.Status == "failed" {
|
||||
failed += row.Total
|
||||
}
|
||||
}
|
||||
for _, row := range stats.Errors {
|
||||
if row.OwnerService != "monitoring-tg" || row.TaskType != "telegram_classification" {
|
||||
continue
|
||||
}
|
||||
failed24h += row.Last24h
|
||||
}
|
||||
if staleRunning > 0 || failed24h > 0 {
|
||||
return componentProbe{
|
||||
Name: "ai_jobs",
|
||||
Status: "down",
|
||||
LatencyMs: time.Since(start).Milliseconds(),
|
||||
Error: "pending=" + strconv.FormatInt(pending, 10) +
|
||||
" running=" + strconv.FormatInt(running, 10) +
|
||||
" stale_running=" + strconv.FormatInt(staleRunning, 10) +
|
||||
" failed=" + strconv.FormatInt(failed, 10) +
|
||||
" failed_24h=" + strconv.FormatInt(failed24h, 10),
|
||||
}
|
||||
}
|
||||
if pending > 0 || running > 0 || failed > 0 {
|
||||
return componentProbe{
|
||||
Name: "ai_jobs",
|
||||
Status: "degraded",
|
||||
LatencyMs: time.Since(start).Milliseconds(),
|
||||
Error: "pending=" + strconv.FormatInt(pending, 10) +
|
||||
" running=" + strconv.FormatInt(running, 10) +
|
||||
" stale_running=" + strconv.FormatInt(staleRunning, 10) +
|
||||
" failed=" + strconv.FormatInt(failed, 10),
|
||||
}
|
||||
}
|
||||
return componentProbe{Name: "ai_jobs", Status: "ok", LatencyMs: time.Since(start).Milliseconds()}
|
||||
}
|
||||
|
||||
func (a *app) probePoller(ctx context.Context) componentProbe {
|
||||
start := time.Now()
|
||||
staleAfter := maxInt(a.cfg.PollIntervalSeconds*3, 900)
|
||||
|
||||
Reference in New Issue
Block a user