diff --git a/lib/services/health.ts b/lib/services/health.ts index e4a910b..e1083a1 100644 --- a/lib/services/health.ts +++ b/lib/services/health.ts @@ -1,19 +1,42 @@ /** * Health service — worker health checks and auto-fix. * - * Detects: active_no_session, zombie_session, stale_worker, inactive_with_issue. - * Used by both `status` (read-only) and `auto_pickup` (auto-fix). + * Triangulates THREE sources of truth: + * 1. projects.json — worker state (active, issueId, level, sessions) + * 2. Issue label — current GitHub/GitLab label (Doing, Testing, To Do, etc.) + * 3. Session state — whether the OpenClaw session exists via gateway status + * + * Detection matrix: + * | projects.json | Issue label | Session | Action | + * |---------------|-------------------|--------------|-------------------------------------------| + * | active | Doing/Testing ✅ | dead/missing | Deactivate worker, revert to To Do/To Test | + * | active | NOT Doing/Testing | any | Deactivate worker (moved externally) | + * | active | Doing/Testing ✅ | alive | Healthy (flag if stale >2h) | + * | inactive | Doing/Testing | any | Revert issue to To Do/To Test (label stuck)| + * | inactive | issueId set | any | Clear issueId (warning) | + * | active | issue deleted | any | Deactivate worker, clear state | */ -import type { StateLabel } from "../providers/provider.js"; +import type { StateLabel, IssueProvider, Issue } from "../providers/provider.js"; import { getSessionForLevel, getWorker, updateWorker, type Project, } from "../projects.js"; +import { runCommand } from "../run-command.js"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- export type HealthIssue = { - type: "active_no_session" | "zombie_session" | "stale_worker" | "inactive_with_issue"; + type: + | "session_dead" // Case 1: active worker but session missing/dead + | "label_mismatch" // Case 2: active worker but issue not in Doing/Testing + | "stale_worker" // Case 3: active for >2h + | "stuck_label" // Case 4: inactive but issue still has Doing/Testing + | "orphan_issue_id" // Case 5: inactive but issueId set + | "issue_gone"; // Case 6: active but issue deleted/closed severity: "critical" | "warning"; project: string; groupId: string; @@ -23,6 +46,8 @@ export type HealthIssue = { sessionKey?: string | null; hoursActive?: number; issueId?: string | null; + expectedLabel?: string; + actualLabel?: string | null; }; export type HealthFix = { @@ -32,81 +57,328 @@ export type HealthFix = { labelRevertFailed?: boolean; }; +export type GatewaySession = { + key: string; + updatedAt: number; + percentUsed: number; + abortedLastRun?: boolean; +}; + +export type SessionLookup = Map; + +// --------------------------------------------------------------------------- +// Gateway session lookup +// --------------------------------------------------------------------------- + +/** + * Query gateway status and build a lookup map of active sessions. + * Caches result for the duration of a health check pass. + */ +export async function fetchGatewaySessions(): Promise { + const lookup: SessionLookup = new Map(); + + try { + const result = await runCommand( + ["openclaw", "gateway", "call", "status", "--json"], + { timeoutMs: 15_000 }, + ); + + const data = JSON.parse(result.stdout); + const sessions: GatewaySession[] = data?.sessions?.recent ?? []; + + for (const session of sessions) { + if (session.key) { + lookup.set(session.key, session); + } + } + } catch { + // Gateway unavailable — return empty map (all sessions will be treated as missing) + } + + return lookup; +} + +/** + * Check if a session key exists in the gateway and is considered "alive". + * A session is alive if it exists. We don't consider percentUsed or abortedLastRun + * as dead indicators — those are normal states for reusable sessions. + */ +function isSessionAlive(sessionKey: string, sessions: SessionLookup): boolean { + return sessions.has(sessionKey); +} + +// --------------------------------------------------------------------------- +// Issue label lookup +// --------------------------------------------------------------------------- + +/** + * Fetch current issue state from the provider. + * Returns null if issue doesn't exist or is inaccessible. + */ +async function fetchIssue( + provider: IssueProvider, + issueId: number, +): Promise { + try { + return await provider.getIssue(issueId); + } catch { + return null; // Issue deleted, closed, or inaccessible + } +} + +// --------------------------------------------------------------------------- +// Health check logic +// --------------------------------------------------------------------------- + +/** + * Expected in-progress labels for each role. + */ +const ACTIVE_LABELS: Record<"dev" | "qa", StateLabel> = { + dev: "Doing", + qa: "Testing", +}; + +/** + * Queue labels to revert to when clearing stuck state. + */ +const QUEUE_LABELS: Record<"dev" | "qa", StateLabel> = { + dev: "To Do", + qa: "To Test", +}; + export async function checkWorkerHealth(opts: { workspaceDir: string; groupId: string; project: Project; role: "dev" | "qa"; - activeSessions: string[]; autoFix: boolean; - provider: { - transitionLabel(id: number, from: StateLabel, to: StateLabel): Promise; - }; + provider: IssueProvider; + sessions: SessionLookup; }): Promise { - const { workspaceDir, groupId, project, role, activeSessions, autoFix, provider } = opts; + const { workspaceDir, groupId, project, role, autoFix, provider, sessions } = opts; const fixes: HealthFix[] = []; const worker = getWorker(project, role); const sessionKey = worker.level ? getSessionForLevel(worker, worker.level) : null; - const revertLabel: StateLabel = role === "dev" ? "To Do" : "To Test"; - const currentLabel: StateLabel = role === "dev" ? "Doing" : "Testing"; + const expectedLabel = ACTIVE_LABELS[role]; + const queueLabel = QUEUE_LABELS[role]; - async function revertIssueLabel(fix: HealthFix) { - if (!worker.issueId) return; + // Parse issueId (may be comma-separated for batch, take first) + const issueIdNum = worker.issueId ? Number(worker.issueId.split(",")[0]) : null; + + // Fetch issue state if we have an issueId + let issue: Issue | null = null; + let currentLabel: StateLabel | null = null; + if (issueIdNum) { + issue = await fetchIssue(provider, issueIdNum); + currentLabel = issue ? provider.getCurrentStateLabel(issue) : null; + } + + // Helper to revert label + async function revertLabel(fix: HealthFix, from: StateLabel, to: StateLabel) { + if (!issueIdNum) return; try { - const id = Number(worker.issueId.split(",")[0]); - await provider.transitionLabel(id, currentLabel, revertLabel); - fix.labelReverted = `${currentLabel} → ${revertLabel}`; + await provider.transitionLabel(issueIdNum, from, to); + fix.labelReverted = `${from} → ${to}`; } catch { fix.labelRevertFailed = true; } } - // Check 1: Active but no session key for current level + // Helper to deactivate worker + async function deactivate(clearSessions = false) { + const updates: Record = { + active: false, + issueId: null, + startTime: null, + }; + if (clearSessions && worker.level) { + updates.sessions = { ...worker.sessions, [worker.level]: null }; + } + await updateWorker(workspaceDir, groupId, role, updates); + } + + // --------------------------------------------------------------------------- + // Case 6: Active but issue doesn't exist (deleted/closed externally) + // --------------------------------------------------------------------------- + if (worker.active && issueIdNum && !issue) { + const fix: HealthFix = { + issue: { + type: "issue_gone", + severity: "critical", + project: project.name, + groupId, + role, + level: worker.level, + sessionKey, + issueId: worker.issueId, + message: `${role.toUpperCase()} active but issue #${issueIdNum} no longer exists or is closed`, + }, + fixed: false, + }; + if (autoFix) { + await deactivate(true); + fix.fixed = true; + } + fixes.push(fix); + return fixes; // No point checking further + } + + // --------------------------------------------------------------------------- + // Case 2: Active but issue label is NOT the expected in-progress label + // --------------------------------------------------------------------------- + if (worker.active && issue && currentLabel !== expectedLabel) { + const fix: HealthFix = { + issue: { + type: "label_mismatch", + severity: "critical", + project: project.name, + groupId, + role, + level: worker.level, + sessionKey, + issueId: worker.issueId, + expectedLabel, + actualLabel: currentLabel, + message: `${role.toUpperCase()} active but issue #${issueIdNum} has label "${currentLabel}" (expected "${expectedLabel}")`, + }, + fixed: false, + }; + if (autoFix) { + await deactivate(true); + fix.fixed = true; + } + fixes.push(fix); + return fixes; // State is invalid, don't check session + } + + // --------------------------------------------------------------------------- + // Case 1: Active with correct label but session is dead/missing + // --------------------------------------------------------------------------- + if (worker.active && sessionKey && !isSessionAlive(sessionKey, sessions)) { + const fix: HealthFix = { + issue: { + type: "session_dead", + severity: "critical", + project: project.name, + groupId, + role, + sessionKey, + level: worker.level, + issueId: worker.issueId, + message: `${role.toUpperCase()} active but session "${sessionKey}" not found in gateway`, + }, + fixed: false, + }; + if (autoFix) { + await revertLabel(fix, expectedLabel, queueLabel); + await deactivate(true); + fix.fixed = true; + } + fixes.push(fix); + return fixes; + } + + // --------------------------------------------------------------------------- + // Case 1b: Active but no session key at all (shouldn't happen normally) + // --------------------------------------------------------------------------- if (worker.active && !sessionKey) { const fix: HealthFix = { issue: { - type: "active_no_session", severity: "critical", - project: project.name, groupId, role, + type: "session_dead", + severity: "critical", + project: project.name, + groupId, + role, level: worker.level, - message: `${role.toUpperCase()} active but no session for level "${worker.level}"`, + issueId: worker.issueId, + message: `${role.toUpperCase()} active but no session key for level "${worker.level}"`, }, fixed: false, }; if (autoFix) { - await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null }); + if (issue && currentLabel === expectedLabel) { + await revertLabel(fix, expectedLabel, queueLabel); + } + await deactivate(); fix.fixed = true; } fixes.push(fix); + return fixes; } - // Check 2: Active with session but session is dead (zombie) - if (worker.active && sessionKey && activeSessions.length > 0 && !activeSessions.includes(sessionKey)) { + // --------------------------------------------------------------------------- + // Case 3: Active with correct label and alive session — check for staleness + // --------------------------------------------------------------------------- + if (worker.active && worker.startTime && sessionKey && isSessionAlive(sessionKey, sessions)) { + const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000; + if (hours > 2) { + const fix: HealthFix = { + issue: { + type: "stale_worker", + severity: "warning", + project: project.name, + groupId, + role, + hoursActive: Math.round(hours * 10) / 10, + sessionKey, + issueId: worker.issueId, + message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`, + }, + fixed: false, + }; + // Stale workers get auto-fixed: revert label and deactivate + if (autoFix) { + await revertLabel(fix, expectedLabel, queueLabel); + await deactivate(); + fix.fixed = true; + } + fixes.push(fix); + } + // Otherwise: healthy, no issues to report + } + + // --------------------------------------------------------------------------- + // Case 4: Inactive but issue has stuck Doing/Testing label + // --------------------------------------------------------------------------- + if (!worker.active && issue && currentLabel === expectedLabel) { const fix: HealthFix = { issue: { - type: "zombie_session", severity: "critical", - project: project.name, groupId, role, - sessionKey, level: worker.level, - message: `${role.toUpperCase()} session not in active sessions list`, + type: "stuck_label", + severity: "critical", + project: project.name, + groupId, + role, + issueId: worker.issueId, + expectedLabel: queueLabel, + actualLabel: currentLabel, + message: `${role.toUpperCase()} inactive but issue #${issueIdNum} still has "${currentLabel}" label`, }, fixed: false, }; if (autoFix) { - await revertIssueLabel(fix); - const sessions = { ...worker.sessions }; - if (worker.level) sessions[worker.level] = null; - await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null, sessions }); + await revertLabel(fix, expectedLabel, queueLabel); + // Also clear the issueId if present + if (worker.issueId) { + await updateWorker(workspaceDir, groupId, role, { issueId: null }); + } fix.fixed = true; } fixes.push(fix); + return fixes; } - // Check 3: Inactive but still has issueId + // --------------------------------------------------------------------------- + // Case 5: Inactive but still has issueId set (orphan reference) + // --------------------------------------------------------------------------- if (!worker.active && worker.issueId) { const fix: HealthFix = { issue: { - type: "inactive_with_issue", severity: "warning", - project: project.name, groupId, role, + type: "orphan_issue_id", + severity: "warning", + project: project.name, + groupId, + role, issueId: worker.issueId, message: `${role.toUpperCase()} inactive but still has issueId "${worker.issueId}"`, }, @@ -119,28 +391,5 @@ export async function checkWorkerHealth(opts: { fixes.push(fix); } - // Check 4: Active for >2 hours (stale) - if (worker.active && worker.startTime && sessionKey) { - const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000; - if (hours > 2) { - const fix: HealthFix = { - issue: { - type: "stale_worker", severity: "warning", - project: project.name, groupId, role, - hoursActive: Math.round(hours * 10) / 10, - sessionKey, issueId: worker.issueId, - message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`, - }, - fixed: false, - }; - if (autoFix) { - await revertIssueLabel(fix); - await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null }); - fix.fixed = true; - } - fixes.push(fix); - } - } - return fixes; } diff --git a/lib/services/heartbeat.ts b/lib/services/heartbeat.ts index b4d66c4..b646b94 100644 --- a/lib/services/heartbeat.ts +++ b/lib/services/heartbeat.ts @@ -15,7 +15,7 @@ import fs from "node:fs"; import path from "node:path"; import { readProjects } from "../projects.js"; import { log as auditLog } from "../audit.js"; -import { checkWorkerHealth } from "./health.js"; +import { checkWorkerHealth, fetchGatewaySessions, type SessionLookup } from "./health.js"; import { projectTick } from "./tick.js"; import { createProvider } from "../providers/index.js"; import { notifyTickPickups, getNotificationConfig } from "../notify.js"; @@ -184,12 +184,16 @@ async function processAllAgents( totalSkipped: 0, }; + // Fetch gateway sessions once for all agents/projects + const sessions = await fetchGatewaySessions(); + for (const { agentId, workspace } of agents) { const agentResult = await tick({ workspaceDir: workspace, agentId, config, pluginConfig, + sessions, logger, }); @@ -221,9 +225,10 @@ export async function tick(opts: { agentId?: string; config: HeartbeatConfig; pluginConfig?: Record; + sessions: SessionLookup; logger: { info(msg: string): void; warn(msg: string): void }; }): Promise { - const { workspaceDir, agentId, config, pluginConfig } = opts; + const { workspaceDir, agentId, config, pluginConfig, sessions } = opts; const data = await readProjects(workspaceDir); const projectIds = Object.keys(data.projects); @@ -250,6 +255,7 @@ export async function tick(opts: { workspaceDir, groupId, project, + sessions, ); // Budget check: stop if we've hit the limit @@ -304,6 +310,7 @@ async function performHealthPass( workspaceDir: string, groupId: string, project: any, + sessions: SessionLookup, ): Promise { const { provider } = await createProvider({ repo: project.repo }); let fixedCount = 0; @@ -314,7 +321,7 @@ async function performHealthPass( groupId, project, role, - activeSessions: [], + sessions, autoFix: true, provider, }); @@ -332,5 +339,3 @@ async function checkProjectActive(workspaceDir: string, groupId: string): Promis if (!fresh) return false; return fresh.dev.active || fresh.qa.active; } - - diff --git a/lib/tools/health.ts b/lib/tools/health.ts index f8bf593..985cd51 100644 --- a/lib/tools/health.ts +++ b/lib/tools/health.ts @@ -1,13 +1,21 @@ /** * health — Worker health scan with optional auto-fix. * + * Triangulates projects.json, issue labels, and session state to detect: + * - session_dead: active worker but session missing in gateway + * - label_mismatch: active worker but issue not in expected label + * - stale_worker: active for >2h + * - stuck_label: inactive but issue has Doing/Testing label + * - orphan_issue_id: inactive but issueId set + * - issue_gone: active but issue deleted/closed + * * Read-only by default (surfaces issues). Pass fix=true to apply fixes. */ import { jsonResult } from "openclaw/plugin-sdk"; import type { ToolContext } from "../types.js"; import { readProjects, getProject } from "../projects.js"; import { log as auditLog } from "../audit.js"; -import { checkWorkerHealth, type HealthFix } from "../services/health.js"; +import { checkWorkerHealth, fetchGatewaySessions, type HealthFix } from "../services/health.js"; import { requireWorkspaceDir, resolveProvider } from "../tool-helpers.js"; export function createHealthTool() { @@ -20,20 +28,21 @@ export function createHealthTool() { properties: { projectGroupId: { type: "string", description: "Filter to specific project. Omit for all." }, fix: { type: "boolean", description: "Apply fixes for detected issues. Default: false (read-only)." }, - activeSessions: { type: "array", items: { type: "string" }, description: "Active session IDs for zombie detection." }, }, }, async execute(_id: string, params: Record) { const workspaceDir = requireWorkspaceDir(ctx); const fix = (params.fix as boolean) ?? false; - const activeSessions = (params.activeSessions as string[]) ?? []; const groupId = params.projectGroupId as string | undefined; const data = await readProjects(workspaceDir); const projectIds = groupId ? [groupId] : Object.keys(data.projects); + // Fetch gateway sessions once for all projects + const sessions = await fetchGatewaySessions(); + const issues: Array = []; for (const pid of projectIds) { @@ -43,8 +52,13 @@ export function createHealthTool() { for (const role of ["dev", "qa"] as const) { const fixes = await checkWorkerHealth({ - workspaceDir, groupId: pid, project, role, activeSessions, - autoFix: fix, provider, + workspaceDir, + groupId: pid, + project, + role, + sessions, + autoFix: fix, + provider, }); issues.push(...fixes.map((f) => ({ ...f, project: project.name, role }))); } @@ -55,14 +69,15 @@ export function createHealthTool() { fix, issuesFound: issues.length, issuesFixed: issues.filter((i) => i.fixed).length, + sessionsCached: sessions.size, }); return jsonResult({ success: true, fix, projectsScanned: projectIds.length, + sessionsQueried: sessions.size, issues, - note: activeSessions.length === 0 ? "No activeSessions provided — zombie detection skipped." : undefined, }); }, });