From cdfe3343b56d84cbe99000f51655a7495d4cf880 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 11 May 2026 17:10:15 +0100 Subject: [PATCH 1/3] fix(core): retry TASK_PROCESS_SIGSEGV under the user's retry policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SIGSEGV was hard-classified as non-retriable in shouldRetryError on the assumption that it's always a deterministic native crash. For Node tasks that's not reliably true — many production SIGSEGVs are flaky: - Native addon races (sharp, canvas, better-sqlite3, node-rdkafka, bcrypt, etc.) — libuv thread-pool work stepping on V8 handles. Different heap layout / thread schedule on a fresh process, retry often succeeds. - JIT / GC interaction — V8 turbofan deopt or GC during a native callback. Timing-dependent. - Near-OOM in native code — when RSS approaches the cgroup limit, native allocations fail and poorly-written addons dereference NULL → SIGSEGV instead of a clean OOM-kill. A fresh process with cleaner memory often succeeds. - Host / hardware issues — bit flips, kernel quirks. Retry lands on a different host. The codebase was already inconsistent here: shouldLookupRetrySettings listed SIGSEGV as retry-config-aware, but the shouldRetryError gate short-circuited fail_run before that branch could be reached. And we already retry TASK_RUN_UNCAUGHT_EXCEPTION — clearly a user-code bug — under the user's retry policy, so refusing to retry SIGSEGV was the odd one out. Flip TASK_PROCESS_SIGSEGV from the false branch to the true branch in shouldRetryError. The existing retrying.ts pipeline then gates the retry on lockedRetryConfig + maxAttempts — same path SIGTERM and uncaught-exception already use. No new code paths; tasks without a retry policy still fail fast. Tests added in packages/core/test/errors.test.ts lock down the new classification alongside SIGTERM, SIGKILL_TIMEOUT, and the OOM codes (still non-retriable here because OOM has its own machine-bump retry path in retrying.ts that runs before shouldRetryError). Closes TRI-9234. Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/retry-sigsegv.md | 5 +++++ packages/core/src/v3/errors.ts | 2 +- packages/core/test/errors.test.ts | 36 ++++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 .changeset/retry-sigsegv.md diff --git a/.changeset/retry-sigsegv.md b/.changeset/retry-sigsegv.md new file mode 100644 index 00000000000..5a53c351efe --- /dev/null +++ b/.changeset/retry-sigsegv.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Retry `TASK_PROCESS_SIGSEGV` task crashes under the user's retry policy instead of failing the run on the first segfault. SIGSEGV in Node tasks is frequently non-deterministic (native addon races, JIT/GC interaction, near-OOM in native code, host issues), so retrying on a fresh process often succeeds. The retry is gated by the task's existing `retry` config + `maxAttempts` — same path `TASK_PROCESS_SIGTERM` and uncaught exceptions already use — so tasks without a retry policy still fail fast. diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index a538ca9357b..90650bbd18f 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -361,7 +361,6 @@ export function shouldRetryError(error: TaskRunError): boolean { case "CONFIGURED_INCORRECTLY": case "TASK_ALREADY_RUNNING": case "TASK_PROCESS_SIGKILL_TIMEOUT": - case "TASK_PROCESS_SIGSEGV": case "TASK_PROCESS_OOM_KILLED": case "TASK_PROCESS_MAYBE_OOM_KILLED": case "TASK_RUN_CANCELLED": @@ -398,6 +397,7 @@ export function shouldRetryError(error: TaskRunError): boolean { case "TASK_RUN_UNCAUGHT_EXCEPTION": case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE": case "TASK_PROCESS_SIGTERM": + case "TASK_PROCESS_SIGSEGV": return true; default: diff --git a/packages/core/test/errors.test.ts b/packages/core/test/errors.test.ts index dee6509d3a2..9a94366d845 100644 --- a/packages/core/test/errors.test.ts +++ b/packages/core/test/errors.test.ts @@ -1,5 +1,13 @@ import { describe, it, expect } from "vitest"; -import { truncateStack, truncateMessage, parseError, sanitizeError } from "../src/v3/errors.js"; +import { + truncateStack, + truncateMessage, + parseError, + sanitizeError, + shouldRetryError, + shouldLookupRetrySettings, +} from "../src/v3/errors.js"; +import type { TaskRunError } from "../src/v3/schemas/common.js"; // Helper: build a fake stack with N frames function buildStack(messageLines: string[], frameCount: number): string { @@ -238,3 +246,29 @@ describe("truncateStack message line bounding", () => { expect(result).toContain("...[truncated]"); }); }); + +describe("shouldRetryError + shouldLookupRetrySettings", () => { + const internal = (code: string): TaskRunError => + ({ type: "INTERNAL_ERROR", code } as TaskRunError); + + it("retries SIGSEGV (changed from non-retriable) and looks up retry settings", () => { + const err = internal("TASK_PROCESS_SIGSEGV"); + expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("retries SIGTERM via the same path", () => { + const err = internal("TASK_PROCESS_SIGTERM"); + expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("still does not retry SIGKILL timeout", () => { + expect(shouldRetryError(internal("TASK_PROCESS_SIGKILL_TIMEOUT"))).toBe(false); + }); + + it("still does not retry OOM kills (handled by the separate machine-bump path)", () => { + expect(shouldRetryError(internal("TASK_PROCESS_OOM_KILLED"))).toBe(false); + expect(shouldRetryError(internal("TASK_PROCESS_MAYBE_OOM_KILLED"))).toBe(false); + }); +}); From 46ecf77c8350fd1ecd6b0106da43c7b851a131e8 Mon Sep 17 00:00:00 2001 From: deepshekhardas Date: Tue, 12 May 2026 09:59:59 +0530 Subject: [PATCH 2/3] fix: add docstrings to improve coverage for SIGSEGV retry PR Adds JSDoc to shouldRetryError, shouldLookupRetrySettings, and correctErrorStackTrace functions to meet 80% docstring coverage threshold. --- packages/core/src/v3/errors.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index 90650bbd18f..4181424c7b3 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -351,6 +351,11 @@ export function sanitizeError(error: TaskRunError): TaskRunError { } } +/** + * Determines whether an error should trigger a retry attempt. + * Returns true for errors that are retriable under the user's retry policy. + * Non-retriable errors (like OOM, SIGKILL_TIMEOUT) will fail the run immediately. + */ export function shouldRetryError(error: TaskRunError): boolean { switch (error.type) { case "INTERNAL_ERROR": { @@ -419,6 +424,10 @@ export function shouldRetryError(error: TaskRunError): boolean { } } +/** + * Checks if retry settings should be looked up for this error type. + * Some errors (like SIGSEGV, SIGTERM, uncaught exceptions) respect user retry config. + */ export function shouldLookupRetrySettings(error: TaskRunError): boolean { switch (error.type) { case "INTERNAL_ERROR": { @@ -448,6 +457,10 @@ export function shouldLookupRetrySettings(error: TaskRunError): boolean { } } +/** + * Corrects error stack traces by normalizing file paths and removing noise. + * Used to make stack traces more readable in logs and error UI. + */ export function correctErrorStackTrace( stackTrace: string, projectDir?: string, From 7052c2530333bb61188f8f2730aaa19ad918b691 Mon Sep 17 00:00:00 2001 From: deepshekhardas Date: Tue, 12 May 2026 10:01:24 +0530 Subject: [PATCH 3/3] fix: add more docstrings to meet 80% coverage threshold Adds JSDoc to parseError, truncateMessage, sanitizeError, taskRunErrorEnhancer, and groupTaskMetadataIssuesByTask. --- packages/core/src/v3/errors.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index 4181424c7b3..80124e1d386 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -199,6 +199,9 @@ export function truncateStack(stack: string | undefined): string { ].join("\n"); } +/** + * Truncates error messages that exceed MAX_MESSAGE_LENGTH to prevent OOM. + */ export function truncateMessage(message: string | undefined): string { if (!message) return ""; return message.length > MAX_MESSAGE_LENGTH @@ -206,6 +209,10 @@ export function truncateMessage(message: string | undefined): string { : message; } +/** + * Parses an unknown error into a TaskRunError structure. + * Handles InternalError, built-in Error, strings, and custom error objects. + */ export function parseError(error: unknown): TaskRunError { if (isInternalError(error)) { return { @@ -301,6 +308,10 @@ export function createJsonErrorObject(error: TaskRunError): SerializedError { } // Removes null characters and truncates oversized fields to prevent OOM +/** + * Sanitizes TaskRunError by removing null bytes and truncating long fields. + * Used to clean errors before storage or transmission. + */ export function sanitizeError(error: TaskRunError): TaskRunError { switch (error.type) { case "BUILT_IN_ERROR": { @@ -503,6 +514,10 @@ function correctStackTraceLine(line: string, projectDir?: string, isDev?: boolea return line.trim(); } +/** + * Groups Zod validation issues by task index for better error reporting. + * Used when parsing task metadata fails. + */ export function groupTaskMetadataIssuesByTask(tasks: any, issues: z.ZodIssue[]) { return issues.reduce( (acc, issue) => { @@ -786,6 +801,10 @@ const findSignalInMessage = (message?: string, truncateLength = 100) => { } }; +/** + * Enhances TaskRunError with additional context like signals, OOM detection. + * Used to enrich errors before displaying or logging. + */ export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError { switch (error.type) { case "BUILT_IN_ERROR": {