fix: dispatch timeout causing missed Telegram notifications (#153) (#154)

## Problem

`dispatchTask()` shells out to `openclaw gateway call sessions.patch` which
times out when the gateway is busy, causing:
1. Notifications never fire (they're at the end of dispatchTask)
2. Worker state may not be recorded
3. Workers run silently

## Solution (3 changes)

### 1. Make `ensureSession` fire-and-forget
Session key is deterministic, so we don't need to wait for confirmation.
Health check catches orphaned state later.

### 2. Use runtime API for notifications instead of CLI
Pass `runtime` through opts and use direct API calls:
- `runtime.channel.telegram.sendMessageTelegram()`
- `runtime.channel.whatsapp.sendMessageWhatsApp()`
- etc.

### 3. Move notification before session dispatch
Fire workerStart/workerComplete notifications early (after label transition)
before the session calls that can timeout.

## Files Changed

- lib/dispatch.ts — fire-and-forget ensureSession, early notification, accept runtime
- lib/notify.ts — use runtime API for direct channel sends
- lib/services/pipeline.ts — early notification, accept runtime
- lib/services/tick.ts — pass runtime through to dispatchTask
- lib/tool-helpers.ts — accept runtime in tickAndNotify
- lib/tools/work-start.ts — pass api.runtime to dispatchTask
- lib/tools/work-finish.ts — pass api.runtime to executeCompletion/tickAndNotify
This commit is contained in:
Lauren ten Hoor
2026-02-13 17:29:25 +08:00
committed by GitHub
parent d1106fc0bb
commit 24b35b3a3e
7 changed files with 140 additions and 106 deletions

View File

@@ -6,6 +6,7 @@
*/
import fs from "node:fs/promises";
import path from "node:path";
import type { PluginRuntime } from "openclaw/plugin-sdk";
import { log as auditLog } from "./audit.js";
import { runCommand } from "./run-command.js";
import {
@@ -43,6 +44,8 @@ export type DispatchOpts = {
channel?: string;
/** Orchestrator's session key (used as spawnedBy for subagent tracking) */
sessionKey?: string;
/** Plugin runtime for direct API access (avoids CLI subprocess timeouts) */
runtime?: PluginRuntime;
};
export type DispatchResult = {
@@ -131,8 +134,14 @@ export async function buildTaskMessage(opts: {
/**
* Dispatch a task to a worker session.
*
* Flow: resolve model → build message → transition label → spawn/send session
* → update worker state → audit → build announcement.
* Flow:
* 1. Resolve model and session key
* 2. Build task message
* 3. Transition label
* 4. Fire notification (early — before session dispatch which can timeout)
* 5. Ensure session (fire-and-forget) + send to agent
* 6. Update worker state
* 7. Audit
*
* On dispatch failure: rolls back label transition.
* On state update failure after dispatch: logs warning (session IS running).
@@ -143,7 +152,7 @@ export async function dispatchTask(
const {
workspaceDir, agentId, groupId, project, issueId, issueTitle,
issueDescription, issueUrl, role, level, fromLabel, toLabel,
transitionLabel, provider, pluginConfig,
transitionLabel, provider, pluginConfig, runtime,
} = opts;
const model = resolveModel(role, level, pluginConfig);
@@ -151,6 +160,9 @@ export async function dispatchTask(
const existingSessionKey = getSessionForLevel(worker, level);
const sessionAction = existingSessionKey ? "send" : "spawn";
// Compute session key deterministically (avoids waiting for gateway)
const sessionKey = `agent:${agentId ?? "unknown"}:subagent:${project.name}-${role}-${level}`;
// Fetch comments to include in task context
const comments = await provider.listComments(issueId);
@@ -161,55 +173,13 @@ export async function dispatchTask(
comments,
});
// Step 1: Transition label (this is the commitment point)
await transitionLabel(issueId, fromLabel, toLabel);
let dispatched = false;
let session: { key: string; action: "spawn" | "send" };
try {
session = await ensureSession(sessionAction, existingSessionKey, {
agentId, projectName: project.name, role, level, model,
});
sendToAgent(session.key, taskMessage, {
agentId, projectName: project.name, issueId, role,
orchestratorSessionKey: opts.sessionKey,
});
dispatched = true;
// Always store session key — a "send" may have fallen back to "spawn"
await recordWorkerState(workspaceDir, groupId, role, {
issueId, level, sessionKey: session.key, sessionAction: session.action,
});
} catch (err) {
if (dispatched) {
await auditLog(workspaceDir, "work_start", {
project: project.name, groupId, issue: issueId, role,
warning: "State update failed after successful dispatch",
error: (err as Error).message, sessionKey: session!.key,
});
throw new Error(
`State update failed after successful session dispatch: ${(err as Error).message}. Session is running but projects.json was not updated.`,
);
}
try { await transitionLabel(issueId, toLabel, fromLabel); } catch { /* best-effort rollback */ }
throw new Error(
`Session dispatch failed: ${(err as Error).message}. Label reverted to "${fromLabel}".`,
);
}
await auditDispatch(workspaceDir, {
project: project.name, groupId, issueId, issueTitle,
role, level, model, sessionAction: session.action, sessionKey: session.key,
fromLabel, toLabel,
});
const announcement = buildAnnouncement(level, role, session.action, issueId, issueTitle, issueUrl);
// Notify workerStart (non-fatal)
// Step 2: Send notification early (before session dispatch which can timeout)
// This ensures users see the notification even if gateway is slow
const notifyConfig = getNotificationConfig(pluginConfig);
await notify(
notify(
{
type: "workerStart",
project: project.name,
@@ -219,17 +189,51 @@ export async function dispatchTask(
issueUrl,
role,
level,
sessionAction: session.action,
sessionAction,
},
{
workspaceDir,
config: notifyConfig,
groupId,
channel: opts.channel ?? "telegram",
runtime,
},
).catch(() => { /* non-fatal */ });
return { sessionAction: session.action, sessionKey: session.key, level, model, announcement };
// Step 3: Ensure session exists (fire-and-forget — don't wait for gateway)
// Session key is deterministic, so we can proceed immediately
ensureSessionFireAndForget(sessionKey, model);
// Step 4: Send task to agent (fire-and-forget)
sendToAgent(sessionKey, taskMessage, {
agentId, projectName: project.name, issueId, role,
orchestratorSessionKey: opts.sessionKey,
});
// Step 5: Update worker state
try {
await recordWorkerState(workspaceDir, groupId, role, {
issueId, level, sessionKey, sessionAction,
});
} catch (err) {
// Session is already dispatched — log warning but don't fail
await auditLog(workspaceDir, "work_start", {
project: project.name, groupId, issue: issueId, role,
warning: "State update failed after successful dispatch",
error: (err as Error).message, sessionKey,
});
}
// Step 6: Audit
await auditDispatch(workspaceDir, {
project: project.name, groupId, issueId, issueTitle,
role, level, model, sessionAction, sessionKey,
fromLabel, toLabel,
});
const announcement = buildAnnouncement(level, role, sessionAction, issueId, issueTitle, issueUrl);
return { sessionAction, sessionKey, level, model, announcement };
}
// ---------------------------------------------------------------------------
@@ -246,32 +250,16 @@ async function loadRoleInstructions(
return "";
}
async function ensureSession(
action: "spawn" | "send",
existingKey: string | null,
opts: { agentId?: string; projectName: string; role: string; level: string; model: string },
): Promise<{ key: string; action: "spawn" | "send" }> {
const expectedKey = `agent:${opts.agentId ?? "unknown"}:subagent:${opts.projectName}-${opts.role}-${opts.level}`;
// Reuse: validate stored key matches expected format, then verify session exists
if (action === "send" && existingKey === expectedKey) {
try {
await runCommand(
["openclaw", "gateway", "call", "sessions.patch", "--params", JSON.stringify({ key: existingKey, model: opts.model })],
{ timeoutMs: 30_000 },
);
return { key: existingKey, action: "send" };
} catch {
// Session gone (deleted, cleanup, etc.) — fall through to spawn
}
}
// Spawn: create fresh session (also handles stale/mismatched keys)
await runCommand(
["openclaw", "gateway", "call", "sessions.patch", "--params", JSON.stringify({ key: expectedKey, model: opts.model })],
/**
* Fire-and-forget session creation/update.
* Session key is deterministic, so we don't need to wait for confirmation.
* If this fails, health check will catch orphaned state later.
*/
function ensureSessionFireAndForget(sessionKey: string, model: string): void {
runCommand(
["openclaw", "gateway", "call", "sessions.patch", "--params", JSON.stringify({ key: sessionKey, model })],
{ timeoutMs: 30_000 },
);
return { key: expectedKey, action: "spawn" };
).catch(() => { /* fire-and-forget */ });
}
function sendToAgent(

View File

@@ -8,7 +8,7 @@
* - workerComplete: Worker completed task (→ project group)
*/
import { log as auditLog } from "./audit.js";
import { runCommand } from "./run-command.js";
import type { PluginRuntime } from "openclaw/plugin-sdk";
/** Per-event-type toggle. All default to true — set to false to suppress. */
export type NotificationConfig = Partial<Record<NotifyEvent["type"], boolean>>;
@@ -78,19 +78,46 @@ function buildMessage(event: NotifyEvent): string {
}
/**
* Send a notification message via the native OpenClaw messaging CLI.
* Send a notification message via the plugin runtime API.
*
* Uses `openclaw message send` which handles target resolution, chunking,
* retries, and error reporting for all supported channels.
* Fails silently (logs error but doesn't throw) to avoid breaking the main flow.
* Uses the runtime's native send functions to bypass CLI → WebSocket timeouts.
* Falls back gracefully on error (notifications shouldn't break the main flow).
*/
async function sendMessage(
target: string,
message: string,
channel: string,
workspaceDir: string,
runtime?: PluginRuntime,
): Promise<boolean> {
try {
// Use runtime API when available (avoids CLI subprocess timeouts)
if (runtime) {
if (channel === "telegram") {
await runtime.channel.telegram.sendMessageTelegram(target, message, { silent: true });
return true;
}
if (channel === "whatsapp") {
await runtime.channel.whatsapp.sendMessageWhatsApp(target, message, { verbose: false });
return true;
}
if (channel === "discord") {
await runtime.channel.discord.sendMessageDiscord(target, message);
return true;
}
if (channel === "slack") {
await runtime.channel.slack.sendMessageSlack(target, message);
return true;
}
if (channel === "signal") {
await runtime.channel.signal.sendMessageSignal(target, message);
return true;
}
}
// Fallback: use CLI (for unsupported channels or when runtime isn't available)
// Import lazily to avoid circular dependency issues
const { runCommand } = await import("./run-command.js");
await runCommand(
[
"openclaw",
@@ -132,6 +159,8 @@ export async function notify(
groupId?: string;
/** Channel type for routing (e.g. "telegram", "whatsapp", "discord", "slack") */
channel?: string;
/** Plugin runtime for direct API access (avoids CLI subprocess timeouts) */
runtime?: PluginRuntime;
},
): Promise<boolean> {
if (opts.config?.[event.type] === false) return true;
@@ -155,7 +184,7 @@ export async function notify(
message,
});
return sendMessage(target, message, channel, opts.workspaceDir);
return sendMessage(target, message, channel, opts.workspaceDir, opts.runtime);
}
/**

View File

@@ -3,6 +3,7 @@
*
* Replaces 7 if-blocks with a data-driven lookup table.
*/
import type { PluginRuntime } from "openclaw/plugin-sdk";
import type { StateLabel, IssueProvider } from "../providers/provider.js";
import { deactivateWorker } from "../projects.js";
import { runCommand } from "../run-command.js";
@@ -74,8 +75,10 @@ export async function executeCompletion(opts: {
projectName: string;
channel?: string;
pluginConfig?: Record<string, unknown>;
/** Plugin runtime for direct API access (avoids CLI subprocess timeouts) */
runtime?: PluginRuntime;
}): Promise<CompletionOutput> {
const { workspaceDir, groupId, role, result, issueId, summary, provider, repoPath, projectName, channel, pluginConfig } = opts;
const { workspaceDir, groupId, role, result, issueId, summary, provider, repoPath, projectName, channel, pluginConfig, runtime } = opts;
const key = `${role}:${result}`;
const rule = COMPLETION_RULES[key];
if (!rule) throw new Error(`No completion rule for ${key}`);
@@ -94,27 +97,13 @@ export async function executeCompletion(opts: {
try { prUrl = await provider.getMergedMRUrl(issueId) ?? undefined; } catch { /* ignore */ }
}
// Deactivate worker + transition label
await deactivateWorker(workspaceDir, groupId, role);
await provider.transitionLabel(issueId, rule.from, rule.to);
// Close/reopen
if (rule.closeIssue) await provider.closeIssue(issueId);
if (rule.reopenIssue) await provider.reopenIssue(issueId);
// Build announcement
// Get issue early (for URL in notification)
const issue = await provider.getIssue(issueId);
const emoji = EMOJI[key] ?? "📋";
const label = key.replace(":", " ").toUpperCase();
let announcement = `${emoji} ${label} #${issueId}`;
if (summary) announcement += `${summary}`;
announcement += `\n📋 Issue: ${issue.web_url}`;
if (prUrl) announcement += `\n🔗 PR: ${prUrl}`;
announcement += `\n${NEXT_STATE[key]}.`;
// Notify workerComplete (non-fatal)
// Send notification early (before deactivation and label transition which can fail)
// This ensures users see the notification even if subsequent steps have issues
const notifyConfig = getNotificationConfig(pluginConfig);
await notify(
notify(
{
type: "workerComplete",
project: projectName,
@@ -131,9 +120,27 @@ export async function executeCompletion(opts: {
config: notifyConfig,
groupId,
channel: channel ?? "telegram",
runtime,
},
).catch(() => { /* non-fatal */ });
// Deactivate worker + transition label
await deactivateWorker(workspaceDir, groupId, role);
await provider.transitionLabel(issueId, rule.from, rule.to);
// Close/reopen
if (rule.closeIssue) await provider.closeIssue(issueId);
if (rule.reopenIssue) await provider.reopenIssue(issueId);
// Build announcement
const emoji = EMOJI[key] ?? "📋";
const label = key.replace(":", " ").toUpperCase();
let announcement = `${emoji} ${label} #${issueId}`;
if (summary) announcement += `${summary}`;
announcement += `\n📋 Issue: ${issue.web_url}`;
if (prUrl) announcement += `\n🔗 PR: ${prUrl}`;
announcement += `\n${NEXT_STATE[key]}.`;
return {
labelTransition: `${rule.from}${rule.to}`,
announcement,

View File

@@ -4,6 +4,7 @@
* Core function: projectTick() scans one project's queue and fills free worker slots.
* Called by: work_start (fill parallel slot), work_finish (next pipeline step), heartbeat service (sweep).
*/
import type { PluginRuntime } from "openclaw/plugin-sdk";
import type { Issue, StateLabel } from "../providers/provider.js";
import type { IssueProvider } from "../providers/provider.js";
import { createProvider } from "../providers/index.js";
@@ -118,8 +119,10 @@ export async function projectTick(opts: {
targetRole?: "dev" | "qa";
/** Optional provider override (for testing). Uses createProvider if omitted. */
provider?: Pick<IssueProvider, "listIssuesByLabel" | "transitionLabel" | "listComments">;
/** Plugin runtime for direct API access (avoids CLI subprocess timeouts) */
runtime?: PluginRuntime;
}): Promise<TickResult> {
const { workspaceDir, groupId, agentId, sessionKey, pluginConfig, dryRun, maxPickups, targetRole } = opts;
const { workspaceDir, groupId, agentId, sessionKey, pluginConfig, dryRun, maxPickups, targetRole, runtime } = opts;
const project = (await readProjects(workspaceDir)).projects[groupId];
if (!project) return { pickups: [], skipped: [{ reason: `Project not found: ${groupId}` }] };
@@ -179,6 +182,7 @@ export async function projectTick(opts: {
pluginConfig,
channel: fresh.channel,
sessionKey,
runtime,
});
pickups.push({
project: project.name, groupId, issueId: issue.iid, issueTitle: issue.title, issueUrl: issue.web_url,

View File

@@ -4,7 +4,7 @@
* Eliminates repeated boilerplate across tools: workspace validation,
* project resolution, provider creation.
*/
import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
import type { OpenClawPluginApi, PluginRuntime } from "openclaw/plugin-sdk";
import type { ToolContext } from "./types.js";
import { readProjects, getProject, type Project, type ProjectsData } from "./projects.js";
import { createProvider, type ProviderWithType } from "./providers/index.js";
@@ -60,6 +60,8 @@ export async function tickAndNotify(opts: {
pluginConfig?: Record<string, unknown>;
sessionKey?: string;
targetRole?: "dev" | "qa";
/** Plugin runtime for direct API access (avoids CLI subprocess timeouts) */
runtime?: PluginRuntime;
}): Promise<TickAction[]> {
try {
const result = await projectTick({
@@ -69,6 +71,7 @@ export async function tickAndNotify(opts: {
pluginConfig: opts.pluginConfig,
sessionKey: opts.sessionKey,
targetRole: opts.targetRole,
runtime: opts.runtime,
});
return result.pickups;
} catch {

View File

@@ -59,12 +59,13 @@ export function createWorkFinishTool(api: OpenClawPluginApi) {
const pluginConfig = getPluginConfig(api);
// Execute completion (pipeline service handles notification)
// Execute completion (pipeline service handles notification with runtime)
const completion = await executeCompletion({
workspaceDir, groupId, role, result, issueId, summary, prUrl, provider, repoPath,
projectName: project.name,
channel: project.channel,
pluginConfig,
runtime: api.runtime,
});
const output: Record<string, unknown> = {
@@ -72,9 +73,10 @@ export function createWorkFinishTool(api: OpenClawPluginApi) {
...completion,
};
// Tick: fill free slots (notifications handled by dispatchTask)
// Tick: fill free slots (notifications handled by dispatchTask with runtime)
const tickPickups = await tickAndNotify({
workspaceDir, groupId, agentId: ctx.agentId, pluginConfig, sessionKey: ctx.sessionKey,
runtime: api.runtime,
});
if (tickPickups.length) output.tickPickups = tickPickups;

View File

@@ -89,7 +89,7 @@ export function createWorkStartTool(api: OpenClawPluginApi) {
}
}
// Dispatch
// Dispatch (pass runtime for direct API access)
const pluginConfig = getPluginConfig(api);
const dr = await dispatchTask({
workspaceDir, agentId: ctx.agentId, groupId, project, issueId: issue.iid,
@@ -100,6 +100,7 @@ export function createWorkStartTool(api: OpenClawPluginApi) {
pluginConfig,
channel: project.channel,
sessionKey: ctx.sessionKey,
runtime: api.runtime,
});
// Auto-tick disabled per issue #125 - work_start should only pick up the explicitly requested issue