pub const LIBRARY_SOURCE: &str = "#!lua name=flowfabric\n\n-- source: lua/helpers.lua\n-- FlowFabric shared library-local helpers\n-- These are local functions available to all registered functions in the\n-- flowfabric library. They are NOT independently FCALL-able.\n-- Reference: RFC-010 \u{a7}4.8, RFC-004 \u{a7}Waitpoint Security (HMAC tokens)\n\n---------------------------------------------------------------------------\n-- Capability CSV bounds (RFC-009 \u{a7}7.5)\n---------------------------------------------------------------------------\n-- Shared ceiling for BOTH the worker-side CSV (ff_issue_claim_grant ARGV[9])\n-- AND the execution-side CSV (exec_core.required_capabilities). Defense in\n-- depth against runaway field sizes: a 10k-token list turns into a multi-MB\n-- HSET value and a per-candidate O(N) atomic scan that blocks the shard.\n--\n-- Inclusivity: these are MAXIMUM accepted values. `#csv == CAPS_MAX_BYTES`\n-- and `n == CAPS_MAX_TOKENS` are accepted; one more rejects. Rust-side\n-- ingress (ff-sdk::FlowFabricWorker::connect, ff-scheduler::Scheduler::\n-- claim_for_worker, ff-core::policy::RoutingRequirements deserialization\n-- via lua/execution.lua) enforces the same ceilings so the Lua check is a\n-- defense-in-depth backstop, not the primary validator.\nlocal CAPS_MAX_BYTES = 4096\nlocal CAPS_MAX_TOKENS = 256\n\n---------------------------------------------------------------------------\n-- Hex / binary helpers (for HMAC-SHA1 token derivation)\n---------------------------------------------------------------------------\n\n-- Convert a hex string to a binary (byte) string. Accepts mixed case.\n-- Returns nil on ANY malformed input: non-string, odd length, OR any\n-- non-hex char (including whitespace, unicode, control chars). Callers\n-- treat nil as invalid_secret.\n--\n-- Rust side (ServerConfig) already validates the env secret as even-length\n-- 0-9a-fA-F, but an operator writing directly to Valkey (or a torn write\n-- during rotation) could bypass that validator. We refuse the conversion\n-- here instead of silently coercing bad pairs to 0 bytes (which would\n-- produce a bogus but valid-looking MAC).\nlocal function hex_to_bytes(hex)\n if type(hex) ~= \"string\" or #hex % 2 ~= 0 then\n return nil\n end\n local out = {}\n for i = 1, #hex - 1, 2 do\n local byte = tonumber(hex:sub(i, i + 1), 16)\n if not byte then\n return nil\n end\n out[#out + 1] = string.char(byte)\n end\n return table.concat(out)\nend\n\n-- XOR two equal-length byte strings. Used for HMAC key-pad construction.\nlocal function xor_bytes(a, b)\n local out = {}\n for i = 1, #a do\n out[i] = string.char(bit.bxor(a:byte(i), b:byte(i)))\n end\n return table.concat(out)\nend\n\n-- HMAC-SHA1(key_hex, message) \u{2192} lowercase hex digest (40 chars), or nil on\n-- malformed key_hex (odd-length / non-string). Callers must treat nil as\n-- an invalid-secret error \u{2014} never pass it to HSET / concat / return.\n-- Reference: RFC 2104. SHA1 block size = 64 bytes.\nlocal function hmac_sha1_hex(key_hex, message)\n local key = hex_to_bytes(key_hex)\n if not key then\n return nil\n end\n local block_size = 64\n if #key > block_size then\n -- Reduce oversized key via SHA1 (per RFC 2104). sha1hex output is 40\n -- lowercase hex chars, so the inner hex_to_bytes cannot fail.\n key = hex_to_bytes(redis.sha1hex(key))\n end\n if #key < block_size then\n key = key .. 
string.rep(\"\\0\", block_size - #key)\n end\n local ipad = string.rep(string.char(0x36), block_size)\n local opad = string.rep(string.char(0x5c), block_size)\n local inner = redis.sha1hex(xor_bytes(key, ipad) .. message)\n return redis.sha1hex(xor_bytes(key, opad) .. hex_to_bytes(inner))\nend\n\n-- Constant-time string equality. Returns true iff strings are equal in\n-- both length and content. Uses XOR-accumulation to avoid early-exit\n-- timing leaks on byte mismatches during HMAC token validation.\n-- Reference: Remote timing attacks on authentication (e.g., CVE-2011-3389 class).\n--\n-- Safety note on the length check: a length-mismatch early return reveals\n-- whether the presented string matches the expected length, which is a\n-- timing side channel IF attacker-controlled length is used to probe the\n-- expected length. In this codebase the caller normalizes to a fixed shape\n-- BEFORE reaching here \u{2014} validate_waitpoint_token already requires\n-- #presented == 40 (SHA1 hex digest length) at the parsing boundary, so\n-- any input reaching constant_time_eq has a length already known to be 40\n-- by the attacker. The only length variation here is on `expected`, which\n-- is server-computed and constant. Hence this early return does not leak\n-- secret-dependent timing.\nlocal function constant_time_eq(a, b)\n if type(a) ~= \"string\" or type(b) ~= \"string\" then\n return false\n end\n if #a ~= #b then\n return false\n end\n local acc = 0\n for i = 1, #a do\n acc = bit.bor(acc, bit.bxor(a:byte(i), b:byte(i)))\n end\n return acc == 0\nend\n\n---------------------------------------------------------------------------\n-- Waitpoint HMAC tokens (RFC-004 \u{a7}Waitpoint Security)\n---------------------------------------------------------------------------\n--\n-- Token format: \"kid:40hex\" \u{2014} kid identifies which key signed the token,\n-- enabling zero-downtime rotation. ANY kid present in the secrets hash\n-- with a future `expires_at:<kid>` (or the current kid, which has no\n-- expiry) accepts tokens. This supports rapid rotation: rotating A\u{2192}B\u{2192}C\n-- within a grace window keeps A\'s secret validatable as long as\n-- expires_at:A is still future.\n--\n-- HMAC input binds (waitpoint_id | waitpoint_key | created_at_ms) with a\n-- pipe delimiter so field-boundary confusion cannot produce collisions\n-- across waitpoints.\n--\n-- Secret storage: per-partition replicated hash at\n-- ff:sec:{p:N}:waitpoint_hmac\n-- Fields:\n-- current_kid \u{2014} the kid minting new tokens (no expiry)\n-- secret:<kid> \u{2014} hex-encoded HMAC key for each kid ever installed\n-- expires_at:<kid> \u{2014} unix ms; accept tokens under <kid> iff exp > now_ms\n-- INVARIANT: expires_at:<current_kid> is NEVER written\n-- previous_kid \u{2014} observability/audit only: the kid immediately\n-- preceding current_kid (NOT the only acceptable one)\n-- previous_expires_at \u{2014} observability/audit only: matches\n-- expires_at:<previous_kid>\n--\n-- Replication is required for Valkey cluster mode (all FCALL KEYS must\n-- hash to the same slot); rotation fans out across partitions.\n---------------------------------------------------------------------------\n\n-- Read the hmac_secrets hash. 
Returns a table with:\n-- current_kid, current_secret \u{2014} the minting kid (nil if not initialized)\n-- kid_secrets = { [kid] = { secret = <hex>, expires_at = <ms or nil> } }\n-- includes current_kid (expires_at = nil \u{2192} no expiry)\n-- includes every secret:<kid> present in the hash\n-- previous_kid, previous_secret, previous_expires_at \u{2014} kept for back-compat\n-- (audit log / observability); validate path does NOT depend on them.\n-- Returns nil if the hash is absent.\nlocal function load_waitpoint_secrets(secrets_key)\n local raw = redis.call(\"HGETALL\", secrets_key)\n if #raw == 0 then\n return nil\n end\n local t = {}\n for i = 1, #raw, 2 do\n t[raw[i]] = raw[i + 1]\n end\n local out = {\n current_kid = t.current_kid,\n previous_kid = t.previous_kid,\n previous_expires_at = t.previous_expires_at,\n kid_secrets = {},\n }\n if out.current_kid then\n out.current_secret = t[\"secret:\" .. out.current_kid]\n end\n if out.previous_kid then\n out.previous_secret = t[\"secret:\" .. out.previous_kid]\n end\n -- Multi-kid scan: every secret:<kid> becomes a validation candidate.\n -- current_kid has no expiry entry (intentional \u{2014} it\'s always valid).\n -- Other kids are accepted iff expires_at:<kid> is set AND > now_ms; the\n -- expiry check runs in validate_waitpoint_token so we simply carry the\n -- raw expires_at string here.\n for k, v in pairs(t) do\n if k:sub(1, 7) == \"secret:\" then\n local kid = k:sub(8)\n if kid ~= \"\" then\n out.kid_secrets[kid] = {\n secret = v,\n expires_at = t[\"expires_at:\" .. kid],\n }\n end\n end\n end\n return out\nend\n\n-- Build the HMAC input string. Pipe delimiter prevents concatenation\n-- collisions across distinct (waitpoint_id, waitpoint_key) pairs.\nlocal function waitpoint_hmac_input(waitpoint_id, waitpoint_key, created_at_ms)\n return waitpoint_id .. \"|\" .. waitpoint_key .. \"|\" .. tostring(created_at_ms)\nend\n\n-- Mint a waitpoint token using the current kid.\n-- Returns (token, kid) on success or (nil, error_code) on failure.\n-- Defense-in-depth: returns a typed error for missing secrets_key / missing\n-- secrets hash so external callers that construct FCALL KEYS by hand cannot\n-- produce the \"arguments must be strings or integers\" Lua panic via nil.\nlocal function mint_waitpoint_token(secrets_key, waitpoint_id, waitpoint_key, created_at_ms)\n if type(secrets_key) ~= \"string\" or secrets_key == \"\" then\n return nil, \"invalid_keys_missing_hmac\"\n end\n local secrets = load_waitpoint_secrets(secrets_key)\n if not secrets or not secrets.current_kid or not secrets.current_secret then\n return nil, \"hmac_secret_not_initialized\"\n end\n local input = waitpoint_hmac_input(waitpoint_id, waitpoint_key, created_at_ms)\n local digest = hmac_sha1_hex(secrets.current_secret, input)\n if not digest then\n return nil, \"invalid_secret\"\n end\n return secrets.current_kid .. \":\" .. digest, secrets.current_kid\nend\n\n-- Validate a waitpoint token against the (waitpoint_id, waitpoint_key,\n-- created_at_ms) that were bound at mint time. 
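Tokens must look like \"kid:40hex\";\n-- a malformed shape is rejected as invalid_token before any HMAC runs. 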
Accepts tokens signed with\n-- current_kid, or previous_kid if previous_expires_at has not passed.\n-- Returns nil on success or an error code string on failure.\nlocal function validate_waitpoint_token(\n secrets_key, token, waitpoint_id, waitpoint_key, created_at_ms, now_ms\n)\n if type(secrets_key) ~= \"string\" or secrets_key == \"\" then\n return \"invalid_keys_missing_hmac\"\n end\n if type(token) ~= \"string\" or token == \"\" then\n return \"missing_token\"\n end\n local sep = token:find(\":\", 1, true)\n if not sep or sep < 2 or sep >= #token then\n return \"invalid_token\"\n end\n local kid = token:sub(1, sep - 1)\n local presented = token:sub(sep + 1)\n if #presented ~= 40 then\n -- SHA1 hex digest is always 40 chars.\n return \"invalid_token\"\n end\n\n local secrets = load_waitpoint_secrets(secrets_key)\n if not secrets or not secrets.current_kid then\n return \"hmac_secret_not_initialized\"\n end\n\n -- Multi-kid validation. ANY secret:<kid> present in the hash is a\n -- candidate IF:\n -- - kid == current_kid (no expiry, always valid), OR\n -- - expires_at:<kid> is a positive integer AND > now_ms.\n --\n -- Rationale: rapid rotation (A\u{2192}B\u{2192}C inside a grace window) must keep\n -- in-flight A-signed tokens valid. The previous 2-slot model\n -- (current + previous) evicted A as soon as B became previous, even\n -- though expires_at:A was still future. RFC-004 \u{a7}Waitpoint Security\n -- promises grace duration, not \"grace until next rotation\".\n --\n -- Fail-CLOSED on malformed expires_at: a corrupted/non-numeric value\n -- means \"no affirmative unexpired proof\" \u{2014} reject.\n local secret = nil\n local expiry_state = nil -- \"known_kid_expired\" | \"unknown_kid\"\n if kid == secrets.current_kid then\n secret = secrets.current_secret\n else\n local entry = secrets.kid_secrets and secrets.kid_secrets[kid]\n if entry then\n local exp = tonumber(entry.expires_at)\n if not exp or exp <= 0 or exp < now_ms then\n -- secret:<kid> is present but its grace has elapsed (or was\n -- never recorded). Distinguishable from unknown_kid so the\n -- caller can log the more-actionable \"token_expired\".\n expiry_state = \"known_kid_expired\"\n else\n secret = entry.secret\n end\n else\n expiry_state = \"unknown_kid\"\n end\n end\n\n if not secret then\n if expiry_state == \"known_kid_expired\" then\n return \"token_expired\"\n end\n return \"invalid_token\"\n end\n\n local input = waitpoint_hmac_input(waitpoint_id, waitpoint_key, created_at_ms)\n local expected = hmac_sha1_hex(secret, input)\n if not expected then\n return \"invalid_secret\"\n end\n if not constant_time_eq(expected, presented) then\n return \"invalid_token\"\n end\n return nil\nend\n\n---------------------------------------------------------------------------\n-- Time\n---------------------------------------------------------------------------\n\n-- Returns the Valkey server time as milliseconds. Always prefer this over\n-- a caller-supplied now_ms for fields that are used in retention windows,\n-- eligibility scoring, lease expiry, or any cross-execution causal\n-- comparison. 
Client-supplied timestamps are trivially skewable and\n-- produce observability drift when compared against fields written by\n-- other Lua functions (which already use redis.call(\"TIME\")).\nlocal function server_time_ms()\n local t = redis.call(\"TIME\")\n return tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\nend\n\n---------------------------------------------------------------------------\n-- Return wrappers (\u{a7}4.9)\n---------------------------------------------------------------------------\n\nlocal function ok(...)\n return {1, \"OK\", ...}\nend\n\nlocal function err(...)\n return {0, ...}\nend\n\n-- Require a numeric value from ARGV. Returns the number on success or\n-- an err() tuple on failure. Callers must check: if type(n) == \"table\"\n-- then return n end (the table IS the err tuple).\nlocal function require_number(val, name)\n local n = tonumber(val)\n if n == nil then\n return err(\"invalid_input\", name .. \" must be a number, got: \" .. tostring(val))\n end\n return n\nend\n\nlocal function ok_already_satisfied(...)\n return {1, \"ALREADY_SATISFIED\", ...}\nend\n\nlocal function ok_duplicate(...)\n return {1, \"DUPLICATE\", ...}\nend\n\n-- RFC-014 Pattern 3 \u{2014} expand {suspension_id, wp_id, wp_key, wp_tok,\n-- extras_table} into the 4 primary fields + N_extra count + N_extra \u{d7}\n-- (id, key, tok) response tail. `extras` is an array of {waitpoint_id,\n-- waitpoint_key, waitpoint_token} tables. Empty array \u{2192} N_extra=0.\nlocal function ok_extras(susp_id, wp_id, wp_key, wp_tok, extras)\n extras = extras or {}\n local out = {1, \"OK\", susp_id, wp_id, wp_key, wp_tok, tostring(#extras)}\n for _, e in ipairs(extras) do\n out[#out + 1] = e.waitpoint_id or \"\"\n out[#out + 1] = e.waitpoint_key or \"\"\n out[#out + 1] = e.waitpoint_token or \"\"\n end\n return out\nend\n\nlocal function ok_already_satisfied_extras(susp_id, wp_id, wp_key, wp_tok, extras)\n extras = extras or {}\n local out = {1, \"ALREADY_SATISFIED\", susp_id, wp_id, wp_key, wp_tok, tostring(#extras)}\n for _, e in ipairs(extras) do\n out[#out + 1] = e.waitpoint_id or \"\"\n out[#out + 1] = e.waitpoint_key or \"\"\n out[#out + 1] = e.waitpoint_token or \"\"\n end\n return out\nend\n\n---------------------------------------------------------------------------\n-- Data access\n---------------------------------------------------------------------------\n\n-- Converts HGETALL flat array {k1, v1, k2, v2, ...} to a Lua dict table.\n-- All RFC pseudocode uses core.field syntax which requires this conversion.\nlocal function hgetall_to_table(flat)\n local t = {}\n for i = 1, #flat, 2 do\n t[flat[i]] = flat[i + 1]\n end\n return t\nend\n\n-- Safe nil/empty check. Valkey hashes cannot store nil: HGET on a missing\n-- field returns false (via Lua), and cleared fields store \"\". 
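For example,\n-- is_set(false) and is_set(\"\") are both false, while is_set(\"0\") is true. 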
This helper\n-- handles both cases plus actual nil for fields absent from hgetall_to_table.\nlocal function is_set(v)\n return v ~= nil and v ~= false and v ~= \"\"\nend\n\n---------------------------------------------------------------------------\n-- Lease validation (most widely shared \u{2014} prevents copy-paste drift)\n-- RFC-010 \u{a7}4.8: 7+ functions use this: complete, fail, suspend, delay,\n-- move_to_waiting_children, append_frame, report_usage.\n---------------------------------------------------------------------------\n\n-- Validates that the caller holds a valid, non-expired, non-revoked lease.\n-- Returns an error tuple on failure, or nil on success.\n-- @param core table from hgetall_to_table(HGETALL exec_core)\n-- @param argv table with lease_id, lease_epoch, attempt_id\n-- @param now_ms current timestamp in milliseconds\nlocal function validate_lease(core, argv, now_ms)\n if core.lifecycle_phase ~= \"active\" then\n -- See validate_lease_and_mark_expired for the full detail layout.\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n if core.ownership_state == \"lease_revoked\" then\n return err(\"lease_revoked\")\n end\n if tonumber(core.lease_expires_at or \"0\") <= now_ms then\n return err(\"lease_expired\")\n end\n if core.current_lease_id ~= argv.lease_id then\n return err(\"stale_lease\")\n end\n if core.current_lease_epoch ~= argv.lease_epoch then\n return err(\"stale_lease\")\n end\n if core.current_attempt_id ~= argv.attempt_id then\n return err(\"stale_lease\")\n end\n return nil\nend\n\n-- Sets ownership_state to lease_expired_reclaimable. Idempotent.\n--\n-- Also writes `closed_at`/`closed_reason=\"lease_expired\"` on the attempt\'s\n-- `stream_meta` hash when that stream exists, so `tail_stream` consumers\n-- observe the terminal signal without having to fall back to polling\n-- `execution_state`. This matters for the permanent-failure case (worker\n-- OOM or node dead, no replacement reclaims): the reclaim path that\n-- normally writes `closed_reason=\"reclaimed\"` may never run, and without\n-- this signal the tail poll loop waits forever.\n--\n-- Write order: we only write stream_meta if its existing `closed_at` is\n-- empty. A later `ff_reclaim_execution` that overwrites `closed_reason`\n-- to \"reclaimed\" still wins because it unconditionally HSETs the field;\n-- this function intentionally does NOT overwrite a pre-existing close.\n--\n-- Key construction: the stream_meta key is derived from core_key\'s\n-- `{p:N}` hash tag + current_attempt_index. 
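For example (hypothetical ids),\n-- core_key \"ff:exec:{p:7}:E-123:core\" with current_attempt_index 2 yields\n-- \"ff:stream:{p:7}:E-123:2:meta\". 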
All three keys share the\n-- same hash tag, so this stays single-slot in cluster mode despite not\n-- being declared in KEYS upfront \u{2014} mirrors the dynamic attempt/lane key\n-- construction in `ff_create_execution`.\n--\n-- @param keys table with core_key, lease_history_key\n-- @param core table from hgetall_to_table\n-- @param now_ms current timestamp in milliseconds\n-- @param maxlen MAXLEN for lease_history stream\nlocal function mark_expired(keys, core, now_ms, maxlen)\n if core.ownership_state == \"lease_expired_reclaimable\" then\n return -- idempotent\n end\n -- ALL 7 dims (preserve lifecycle_phase=active, eligibility_state, terminal_outcome=none)\n redis.call(\"HSET\", keys.core_key,\n \"lifecycle_phase\", core.lifecycle_phase or \"active\", -- preserve\n \"ownership_state\", \"lease_expired_reclaimable\",\n \"eligibility_state\", core.eligibility_state or \"not_applicable\", -- preserve\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"lease expired, awaiting reclaim\",\n \"terminal_outcome\", core.terminal_outcome or \"none\", -- preserve\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", \"active\",\n \"lease_expired_at\", now_ms,\n \"last_mutation_at\", now_ms)\n redis.call(\"XADD\", keys.lease_history_key, \"MAXLEN\", \"~\", maxlen, \"*\",\n \"event\", \"expired\",\n \"lease_id\", core.current_lease_id or \"\",\n \"lease_epoch\", core.current_lease_epoch or \"\",\n \"attempt_index\", core.current_attempt_index or \"\",\n \"attempt_id\", core.current_attempt_id or \"\",\n \"worker_id\", core.current_worker_id or \"\",\n \"worker_instance_id\", core.current_worker_instance_id or \"\",\n \"ts\", now_ms)\n\n -- Close stream_meta (if the stream was lazily created) so tail_stream\n -- consumers receive the terminal signal. Core key format:\n -- ff:exec:{p:N}:<eid>:core\n -- Stream meta key format:\n -- ff:stream:{p:N}:<eid>:<attempt_index>:meta\n local att_idx = core.current_attempt_index\n if att_idx ~= nil and att_idx ~= \"\" then\n local tag_open = string.find(keys.core_key, \"{\", 1, true)\n local tag_close = tag_open and string.find(keys.core_key, \"}\", tag_open, true)\n if tag_open and tag_close then\n local tag = string.sub(keys.core_key, tag_open, tag_close)\n -- After `}:` comes `<eid>:core`. Walk past the `}:` delimiter.\n local after_tag = string.sub(keys.core_key, tag_close + 2)\n local eid_end = string.find(after_tag, \":core\", 1, true)\n if eid_end then\n local eid = string.sub(after_tag, 1, eid_end - 1)\n local stream_meta_key = \"ff:stream:\" .. tag .. \":\" .. eid\n .. \":\" .. tostring(att_idx) .. 
\":meta\"\n if redis.call(\"EXISTS\", stream_meta_key) == 1 then\n local existing_closed_at = redis.call(\"HGET\", stream_meta_key, \"closed_at\")\n if not is_set(existing_closed_at) then\n redis.call(\"HSET\", stream_meta_key,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"lease_expired\")\n end\n end\n end\n end\n end\nend\n\n-- Validates lease AND atomically marks expired if lease has lapsed.\n-- Use this variant for write-path callers (complete, fail, suspend, delay,\n-- move_to_waiting_children) that have the lease_history key available.\n-- For read-only callers (append_frame, report_usage) use validate_lease.\n-- @param core table from hgetall_to_table\n-- @param argv table with lease_id, lease_epoch, attempt_id\n-- @param now_ms current timestamp in milliseconds\n-- @param keys table with core_key, lease_history_key\n-- @param maxlen MAXLEN for lease_history stream\nlocal function validate_lease_and_mark_expired(core, argv, now_ms, keys, maxlen)\n if core.lifecycle_phase ~= \"active\" then\n -- Enriched error detail lets the SDK reconcile a replay of a terminal\n -- operation after a network drop: if the caller\'s (lease_epoch,\n -- attempt_id) match what\'s stored and the outcome matches what they\n -- asked for, treat the \"error\" as a successful replay. See\n -- parse_terminal_replay() on the Rust side. Detail slots:\n -- idx 2: terminal_outcome (e.g. \"success\", \"failed\", \"cancelled\", \"none\")\n -- idx 3: current_lease_epoch (persists across terminal; cleared for retry-scheduled)\n -- idx 4: lifecycle_phase (\"terminal\" vs \"runnable\" disambiguates\n -- terminal_failed from retry_scheduled replay)\n -- idx 5: current_attempt_id (preserved on terminal, cleared on retry;\n -- per-attempt replay guard)\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n if core.ownership_state == \"lease_revoked\" then\n return err(\"lease_revoked\")\n end\n if tonumber(core.lease_expires_at or \"0\") <= now_ms then\n mark_expired(keys, core, now_ms, maxlen)\n return err(\"lease_expired\")\n end\n if core.current_lease_id ~= argv.lease_id then\n return err(\"stale_lease\")\n end\n if core.current_lease_epoch ~= argv.lease_epoch then\n return err(\"stale_lease\")\n end\n if core.current_attempt_id ~= argv.attempt_id then\n return err(\"stale_lease\")\n end\n return nil\nend\n\n-- RFC #58.5 \u{2014} resolve the (lease_id, lease_epoch, attempt_id) fence triple.\n-- Returns (fence_table, must_check) on success, or (nil, err_table) on a\n-- partial triple (programming error \u{2014} caller passed some but not all three).\n--\n-- Semantics:\n-- * All three present (non-empty) \u{2192} fence triple, must_check=true.\n-- Caller is expected to run validate_lease_and_mark_expired next.\n-- * All three empty \u{2192} server-resolved from exec_core,\n-- must_check=false. 
Caller decides\n-- whether unfenced mode is allowed\n-- (terminal ops gate on `source`;\n-- renew/suspend hard-reject).\n-- * Any mix of set/empty \u{2192} err(\"partial_fence_triple\").\n--\n-- @param core hgetall_to_table(exec_core)\n-- @param argv table with .lease_id, .lease_epoch, .attempt_id (strings)\nlocal function resolve_lease_fence(core, argv)\n local has_id = is_set(argv.lease_id)\n local has_ep = is_set(argv.lease_epoch)\n local has_at = is_set(argv.attempt_id)\n if has_id or has_ep or has_at then\n if not (has_id and has_ep and has_at) then\n return nil, err(\"partial_fence_triple\")\n end\n return {\n lease_id = argv.lease_id,\n lease_epoch = argv.lease_epoch,\n attempt_id = argv.attempt_id,\n }, true\n end\n return {\n lease_id = core.current_lease_id or \"\",\n lease_epoch = core.current_lease_epoch or \"\",\n attempt_id = core.current_attempt_id or \"\",\n }, false\nend\n\n-- Consolidates the ~15-line lease release block shared by 7 functions.\n-- DEL lease_current, ZREM lease_expiry + worker_leases + active_index,\n-- clear lease fields on exec_core, XADD lease_history \"released\".\n-- @param keys table with lease_current_key, lease_expiry_key,\n-- worker_leases_key, active_index_key, lease_history_key,\n-- attempt_timeout_key, core_key\n-- @param core table from hgetall_to_table\n-- @param reason string reason for release (e.g. \"completed\", \"suspend\")\n-- @param now_ms current timestamp in milliseconds\n-- @param maxlen MAXLEN for lease_history stream\nlocal function clear_lease_and_indexes(keys, core, reason, now_ms, maxlen)\n local eid = core.execution_id or \"\"\n\n -- DEL lease record\n redis.call(\"DEL\", keys.lease_current_key)\n\n -- ZREM/SREM from scheduling indexes\n redis.call(\"ZREM\", keys.lease_expiry_key, eid)\n redis.call(\"SREM\", keys.worker_leases_key, eid)\n redis.call(\"ZREM\", keys.active_index_key, eid)\n redis.call(\"ZREM\", keys.attempt_timeout_key, eid)\n\n -- Clear lease fields on exec_core (including stale expiry/revocation markers)\n redis.call(\"HSET\", keys.core_key,\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"lease_expired_at\", \"\",\n \"lease_revoked_at\", \"\",\n \"lease_revoke_reason\", \"\",\n \"last_mutation_at\", now_ms)\n\n -- Lease history event\n redis.call(\"XADD\", keys.lease_history_key, \"MAXLEN\", \"~\", maxlen, \"*\",\n \"event\", \"released\",\n \"lease_id\", core.current_lease_id or \"\",\n \"lease_epoch\", core.current_lease_epoch or \"\",\n \"attempt_index\", core.current_attempt_index or \"\",\n \"attempt_id\", core.current_attempt_id or \"\",\n \"reason\", reason,\n \"ts\", now_ms)\nend\n\n---------------------------------------------------------------------------\n-- Defensive index cleanup\n-- RFC-010 \u{a7}4.8: ZREM execution_id from all scheduling + timeout indexes\n-- except except_key. 
~14 ZREM/SREM calls.\n---------------------------------------------------------------------------\n\n-- @param keys table with all index key names\n-- @param eid execution_id string\n-- @param except_key optional key to skip (the target index for this transition)\nlocal function defensive_zrem_all_indexes(keys, eid, except_key)\n -- Each index key and whether it uses ZREM or SREM\n local zrem_keys = {\n keys.eligible_key,\n keys.delayed_key,\n keys.active_index_key,\n keys.suspended_key,\n keys.terminal_key,\n keys.blocked_deps_key,\n keys.blocked_budget_key,\n keys.blocked_quota_key,\n keys.blocked_route_key,\n keys.blocked_operator_key,\n keys.lease_expiry_key,\n keys.suspension_timeout_key,\n keys.attempt_timeout_key,\n keys.execution_deadline_key,\n }\n for _, k in ipairs(zrem_keys) do\n if k and k ~= except_key then\n redis.call(\"ZREM\", k, eid)\n end\n end\n -- worker_leases is a SET, not ZSET\n if keys.worker_leases_key and keys.worker_leases_key ~= except_key then\n redis.call(\"SREM\", keys.worker_leases_key, eid)\n end\nend\n\n---------------------------------------------------------------------------\n-- Suspension reason \u{2192} blocking_reason mapping\n-- RFC-004 \u{a7}Suspension Reason Categories\n---------------------------------------------------------------------------\n\nlocal REASON_TO_BLOCKING = {\n waiting_for_signal = \"waiting_for_signal\",\n waiting_for_approval = \"waiting_for_approval\",\n waiting_for_callback = \"waiting_for_callback\",\n waiting_for_tool_result = \"waiting_for_tool_result\",\n waiting_for_operator_review = \"paused_by_operator\",\n paused_by_policy = \"paused_by_policy\",\n paused_by_budget = \"waiting_for_budget\",\n step_boundary = \"waiting_for_signal\",\n manual_pause = \"paused_by_operator\",\n}\n\nlocal function map_reason_to_blocking(reason_code)\n return REASON_TO_BLOCKING[reason_code] or \"waiting_for_signal\"\nend\n\n---------------------------------------------------------------------------\n-- Resume condition evaluation (shared module)\n-- RFC-004 \u{a7}Resume Condition Model, RFC-005 \u{a7}8.3\n---------------------------------------------------------------------------\n\n-- Parse resume_condition_json into a matcher table used by\n-- evaluate_signal_against_condition and is_condition_satisfied.\n-- @param json JSON string of the resume condition\n-- @return table with matchers array, match_mode, minimum_signal_count, etc.\n-- For composite conditions (RFC-014) the returned table also carries\n-- `composite = true` and a parsed `tree` spec; the matcher fields\n-- remain populated (empty) so legacy diagnostics that read\n-- `satisfied_count` / `total_matchers` on the wp_condition hash still\n-- work without crashing.\nlocal function initialize_condition(json)\n local spec = cjson.decode(json)\n -- RFC-014: composite path. The Rust-side serializer sets\n -- `composite = true` on multi-signal conditions. Legacy single/\n -- operator/timeout paths continue to use the matcher array below.\n if spec.composite then\n if spec.v and tonumber(spec.v) ~= 1 then\n -- Future-proofing: unknown version. 
RFC-014 \u{a7}7.2 rejects v>1.\n return {\n condition_type = \"composite\",\n composite = true,\n invalid = \"invalid_resume_condition\",\n matchers = {},\n total_matchers = 0,\n satisfied_count = 0,\n closed = false,\n }\n end\n return {\n condition_type = \"composite\",\n composite = true,\n tree = spec.tree,\n match_mode = \"composite\",\n minimum_signal_count = 1,\n total_matchers = 0,\n satisfied_count = 0,\n matchers = {},\n closed = false,\n }\n end\n local matchers = {}\n local names = spec.required_signal_names or {}\n if #names == 0 then\n -- Empty required_signal_names acts as wildcard: ANY signal satisfies the condition.\n -- To require explicit operator resume (no signal match), pass a sentinel name\n -- that no real signal will match, or use a different resume mechanism.\n matchers[1] = { name = \"\", satisfied = false, signal_id = \"\" }\n else\n for i, name in ipairs(names) do\n matchers[i] = { name = name, satisfied = false, signal_id = \"\" }\n end\n end\n return {\n condition_type = spec.condition_type or \"signal_set\",\n match_mode = spec.signal_match_mode or \"any\",\n minimum_signal_count = tonumber(spec.minimum_signal_count or \"1\"),\n total_matchers = #names > 0 and #names or 1,\n satisfied_count = 0,\n matchers = matchers,\n closed = false,\n }\nend\n\n-- Write condition state to a dedicated condition hash key.\n-- @param key Valkey key for the condition hash\n-- @param cond condition table from initialize_condition\n-- @param now_ms current timestamp\nlocal function write_condition_hash(key, cond, now_ms)\n -- RFC-014 composite path: write a minimal marker + the parsed tree.\n -- The composite evaluator reads `composite` + `tree_json` at signal\n -- delivery time; legacy `satisfied_count` / `total_matchers` fields\n -- stay zero so operator diagnostics that read them don\'t blow up.\n if cond.composite then\n redis.call(\"HSET\", key,\n \"condition_type\", \"composite\",\n \"composite\", \"1\",\n \"match_mode\", \"composite\",\n \"minimum_signal_count\", \"1\",\n \"total_matchers\", \"0\",\n \"satisfied_count\", \"0\",\n \"closed\", cond.closed and \"1\" or \"0\",\n \"updated_at\", tostring(now_ms),\n \"tree_json\", cond.tree and cjson.encode(cond.tree) or \"\")\n return\n end\n local fields = {\n \"condition_type\", cond.condition_type,\n \"match_mode\", cond.match_mode,\n \"minimum_signal_count\", tostring(cond.minimum_signal_count),\n \"total_matchers\", tostring(cond.total_matchers),\n \"satisfied_count\", tostring(cond.satisfied_count),\n \"closed\", cond.closed and \"1\" or \"0\",\n \"updated_at\", tostring(now_ms),\n }\n for i = 1, cond.total_matchers do\n local m = cond.matchers[i]\n local idx = i - 1 -- external field names remain 0-based for wire compat\n fields[#fields + 1] = \"matcher:\" .. idx .. \":name\"\n fields[#fields + 1] = m.name\n fields[#fields + 1] = \"matcher:\" .. idx .. \":satisfied\"\n fields[#fields + 1] = m.satisfied and \"1\" or \"0\"\n fields[#fields + 1] = \"matcher:\" .. idx .. \":signal_id\"\n fields[#fields + 1] = m.signal_id\n end\n redis.call(\"HSET\", key, unpack(fields))\nend\n\n-- Match a signal against the condition\'s matchers. 
Mutates cond in-place.\n-- @param cond condition table from initialize_condition\n-- @param signal_name signal name string\n-- @param signal_id signal ID string\n-- @return true if this signal matched a matcher, false otherwise\nlocal function evaluate_signal_against_condition(cond, signal_name, signal_id)\n for i = 1, cond.total_matchers do\n local m = cond.matchers[i]\n if not m.satisfied then\n -- Empty name = wildcard matcher (matches any signal)\n if m.name == \"\" or m.name == signal_name then\n m.satisfied = true\n m.signal_id = signal_id or \"\"\n cond.satisfied_count = cond.satisfied_count + 1\n return true\n end\n end\n end\n return false\nend\n\n-- Check if the overall condition is satisfied based on mode.\n-- @param cond condition table\n-- @return true if condition is satisfied\nlocal function is_condition_satisfied(cond)\n local mode = cond.match_mode\n local min_count = cond.minimum_signal_count\n if mode == \"any\" then\n return cond.satisfied_count >= min_count\n elseif mode == \"all\" then\n return cond.satisfied_count >= cond.total_matchers\n end\n -- count(n) mode \u{2014} same as any with minimum_signal_count = n\n return cond.satisfied_count >= min_count\nend\n\n---------------------------------------------------------------------------\n-- RFC-014 composite condition evaluator\n--\n-- Storage model (\u{a7}3.1):\n-- satisfied_set_key: SET of satisfier tokens (\"wp:<id>\" | \"sig:<id>\" |\n-- \"src:<type>:<identity>\" | \"node:<path>\").\n-- member_map_key: HASH of waitpoint_key \u{2192} node_path (write-once at\n-- suspend-time, informational only under single-\n-- waitpoint scoping \u{2014} see \u{a7}3.1).\n--\n-- Per-signal algorithm (\u{a7}3.3):\n-- 1. Walk the tree, identifying nodes this signal may contribute to.\n-- 2. For each such node, apply the local matcher (step 2.5); reject on\n-- mismatch with `signal_ignored_matcher_failed`.\n-- 3. SADD the per-kind satisfier token to satisfied_set. Dedup returns\n-- `appended_to_waitpoint_duplicate`.\n-- 4. Re-evaluate the tree: if the root is satisfied, emit\n-- `resume_condition_satisfied`.\n---------------------------------------------------------------------------\n\n-- Does a matcher accept this signal? Matches RFC-013 SignalMatcher shape.\n-- ByName compares by signal_name; Wildcard accepts any.\nlocal function matcher_accepts(matcher, signal_name)\n if matcher == nil or type(matcher) ~= \"table\" then\n return true\n end\n if matcher.kind == \"Wildcard\" or matcher.kind == nil then\n return true\n end\n if matcher.kind == \"ByName\" then\n return matcher.name == signal_name\n end\n return false\nend\n\n-- Compute the satisfier token for a Count node given CountKind + signal.\nlocal function count_satisfier_token(kind, signal, waitpoint_id)\n if kind == \"DistinctSignals\" then\n return \"sig:\" .. (signal.signal_id or \"\")\n elseif kind == \"DistinctSources\" then\n return \"src:\" .. (signal.source_type or \"\") .. \":\" .. (signal.source_identity or \"\")\n end\n -- DistinctWaitpoints (default)\n return \"wp:\" .. (waitpoint_id or \"\")\nend\n\n-- Set-membership helper: tokens table keyed-by-name (in-memory snapshot).\nlocal function set_has(tokens, token)\n return tokens[token] == true\nend\n\n-- Count how many satisfier tokens a Count node currently has in the set.\n-- For DistinctWaitpoints we count tokens of the form `wp:<id>` where <id>\n-- corresponds to any waitpoint_key in node.waitpoints. 
For\n-- DistinctSignals / DistinctSources we count tokens of the right prefix\n-- that were SADDed while delivering a signal matching this node (scoped\n-- correctness relies on the per-node matcher filter at step 2.5).\nlocal function count_node_satisfiers(node, tokens)\n local kind = node.count_kind or \"DistinctWaitpoints\"\n local count = 0\n -- For single-waitpoint composites (current scope), `tokens` contains\n -- the satisfier tokens of signals that landed on THIS node after\n -- passing the matcher filter. The tokens are already namespaced by\n -- the delivery routine so counting by prefix is sufficient.\n local prefix\n if kind == \"DistinctWaitpoints\" then\n prefix = \"wp:\"\n elseif kind == \"DistinctSignals\" then\n prefix = \"sig:\"\n else\n prefix = \"src:\"\n end\n local plen = #prefix\n for token, _ in pairs(tokens) do\n if string.sub(token, 1, plen) == prefix then\n count = count + 1\n end\n end\n return count\nend\n\n-- Evaluate a node against the current satisfied-set snapshot.\n-- @param node parsed tree node\n-- @param tokens table<string, true> snapshot of satisfied_set\n-- @return true if satisfied\nlocal function composite_evaluate_node(node, tokens)\n if node == nil then return false end\n local kind = node.kind\n if kind == \"Single\" then\n -- Leaf: the wp:<id> token is the satisfaction marker (RFC \u{a7}3.2).\n -- We look up by path instead so a same-waitpoint AllOf can\n -- disambiguate by matcher; the delivery routine writes\n -- `leaf:<path>` tokens for satisfied Single-under-AllOf leaves.\n return set_has(tokens, \"leaf:\" .. (node.path or \"\"))\n elseif kind == \"AllOf\" then\n local members = node.members or {}\n for _, child in ipairs(members) do\n if child.kind == \"Single\" then\n if not set_has(tokens, \"leaf:\" .. (child.path or \"\")) then\n return false\n end\n else\n if not set_has(tokens, \"node:\" .. (child.path or \"\")) then\n return false\n end\n end\n end\n return true\n elseif kind == \"Count\" then\n -- Timeout token short-circuits a Count (RFC \u{a7}6.1).\n if tokens[\"timeout:*\"] == true then return true end\n return count_node_satisfiers(node, tokens) >= (tonumber(node.n) or 1)\n end\n -- NeverBySignal / unknown\n return false\nend\n\n-- Load satisfied_set contents into an in-memory table<string, true>.\nlocal function load_satisfied_set(satisfied_set_key)\n local members = redis.call(\"SMEMBERS\", satisfied_set_key)\n local t = {}\n for _, m in ipairs(members) do\n t[m] = true\n end\n return t\nend\n\n-- Find nodes in the tree that this signal may contribute to. Returns a\n-- list of {node, ancestors} where ancestors is the path from the node\n-- upward to the root (exclusive of the node itself).\nlocal function composite_collect_candidate_nodes(tree, waitpoint_key, signal_name)\n -- For single-waitpoint composites, every Single/Count node whose\n -- waitpoint_key (or waitpoints list) contains `waitpoint_key` is a\n -- candidate. 
Walk the tree DFS, tracking ancestors.\n local results = {}\n local function walk(node, ancestors)\n if node == nil then return end\n local kind = node.kind\n if kind == \"Single\" then\n if node.waitpoint_key == waitpoint_key then\n results[#results + 1] = { node = node, ancestors = ancestors }\n end\n elseif kind == \"Count\" then\n local matches_wp = false\n for _, wk in ipairs(node.waitpoints or {}) do\n if wk == waitpoint_key then matches_wp = true; break end\n end\n if matches_wp then\n results[#results + 1] = { node = node, ancestors = ancestors }\n end\n elseif kind == \"AllOf\" then\n local nested = {}\n for i = 1, #ancestors do nested[i] = ancestors[i] end\n nested[#nested + 1] = node\n for _, child in ipairs(node.members or {}) do\n walk(child, nested)\n end\n end\n end\n walk(tree, {})\n return results\nend\n\n-- Emit initial member_map for operator diagnostics (RFC \u{a7}4.4).\n-- Accepts a single waitpoint_key (patterns 1 + 2) or a list of\n-- waitpoint_keys (RFC-014 Pattern 3). Every candidate node path for\n-- each key maps back to that key. Write-once at suspend-time.\nlocal function seed_composite_member_map(member_map_key, tree, waitpoint_keys)\n if type(waitpoint_keys) == \"string\" then\n waitpoint_keys = { waitpoint_keys }\n end\n local fields = {}\n for _, wk in ipairs(waitpoint_keys or {}) do\n local cands = composite_collect_candidate_nodes(tree, wk, nil)\n for _, c in ipairs(cands) do\n fields[#fields + 1] = \"wp:\" .. wk .. \":\" .. (c.node.path or \"\")\n fields[#fields + 1] = c.node.path or \"\"\n end\n end\n if #fields > 0 then\n redis.call(\"HSET\", member_map_key, unpack(fields))\n end\nend\n\n-- Deliver a signal against a composite condition. Returns a table:\n-- { effect = <string>, resume = <bool>, closer = <signal_id>,\n-- all_satisfiers_json = <string> }\n-- satisfied_set_key / member_map_key are RFC-014 \u{a7}3.1 keys.\nlocal function composite_deliver_signal(\n tree, satisfied_set_key, member_map_key, waitpoint_id, waitpoint_key,\n signal)\n local tokens = load_satisfied_set(satisfied_set_key)\n local candidates = composite_collect_candidate_nodes(tree, waitpoint_key, signal.signal_name)\n if #candidates == 0 then\n return { effect = \"signal_ignored_not_in_condition\", resume = false }\n end\n\n local added_any = false\n local matcher_failed_all = true\n -- RFC \u{a7}3.3 step 2.5: per-node matcher filter. If ANY candidate node\'s\n -- matcher accepts this signal, we proceed; if all reject, bail with\n -- `signal_ignored_matcher_failed`.\n for _, c in ipairs(candidates) do\n local node = c.node\n local matcher = node.matcher\n if matcher_accepts(matcher, signal.signal_name) then\n matcher_failed_all = false\n local token\n if node.kind == \"Single\" then\n token = \"leaf:\" .. (node.path or \"\")\n else -- Count\n token = count_satisfier_token(node.count_kind, signal, waitpoint_id)\n end\n local added = redis.call(\"SADD\", satisfied_set_key, token)\n if added == 1 then\n added_any = true\n -- Track the signal id for `all_satisfier_signals` emission.\n redis.call(\"SADD\", satisfied_set_key .. 
\":signals\", signal.signal_id)\n end\n end\n end\n\n if matcher_failed_all then\n return { effect = \"signal_ignored_matcher_failed\", resume = false }\n end\n if not added_any then\n return { effect = \"appended_to_waitpoint_duplicate\", resume = false }\n end\n\n -- Re-load + re-evaluate (cheap: depth \u{2264} 4).\n tokens = load_satisfied_set(satisfied_set_key)\n\n -- Propagate non-leaf child satisfaction upward: for each AllOf\n -- ancestor whose child is now satisfied, SADD `node:<child_path>`.\n -- Bounded by depth.\n local function propagate(node)\n if node == nil or node.kind ~= \"AllOf\" then return end\n for _, child in ipairs(node.members or {}) do\n if child.kind ~= \"Single\" then\n if composite_evaluate_node(child, tokens) then\n local added2 = redis.call(\"SADD\", satisfied_set_key, \"node:\" .. (child.path or \"\"))\n if added2 == 1 then\n tokens[\"node:\" .. (child.path or \"\")] = true\n end\n propagate(child)\n end\n end\n end\n end\n propagate(tree)\n\n local root_sat = composite_evaluate_node(tree, tokens)\n if root_sat then\n -- Gather satisfier signals (the ids we SADDed during this and prior\n -- deliveries).\n local sig_ids = redis.call(\"SMEMBERS\", satisfied_set_key .. \":signals\")\n return {\n effect = \"resume_condition_satisfied\",\n resume = true,\n closer = signal.signal_id,\n all_satisfiers_json = cjson.encode(sig_ids),\n }\n end\n return { effect = \"appended_to_waitpoint\", resume = false }\nend\n\n-- Clear composite keys on suspension termination (\u{a7}3.1.1).\nlocal function composite_cleanup(satisfied_set_key, member_map_key)\n redis.call(\"DEL\", satisfied_set_key)\n redis.call(\"DEL\", satisfied_set_key .. \":signals\")\n redis.call(\"DEL\", member_map_key)\nend\n\n-- RFC-014 Pattern 3 \u{2014} close per-extra waitpoint storage on\n-- cancel/expire/resume. The extras list is stored in\n-- `suspension_current.additional_waitpoints_json` as a JSON array of\n-- {waitpoint_id, waitpoint_key} pairs at suspend-time. Cleanup owners\n-- reconstruct the wp_hash + wp_condition keys dynamically from the\n-- suspension\'s hash tag (same trick ff_cancel_execution uses for lane\n-- keys) so no KEYS arity growth is needed.\n--\n-- @param suspension_current_key e.g. \"ff:exec:{p:12}:E-...:suspension:current\"\n-- @param additional_json value of HGET suspension_current additional_waitpoints_json\n-- @param close_state_fields table of HSET fields to apply to each extra waitpoint hash\n-- (e.g. {\"state\",\"closed\",\"close_reason\",\"cancelled\",\"closed_at\",\"<now>\"})\n-- @param close_cond_fields table of HSET fields to apply to each extra wp_condition hash\nlocal function close_additional_waitpoints(suspension_current_key, additional_json,\n close_state_fields, close_cond_fields)\n if not additional_json or additional_json == \"\" or additional_json == \"[]\" then\n return\n end\n local ok_dec, pairs_list = pcall(cjson.decode, additional_json)\n if not ok_dec or type(pairs_list) ~= \"table\" then return end\n local tag = string.match(suspension_current_key, \"(%b{})\")\n if not tag then return end\n for _, entry in ipairs(pairs_list) do\n local wp_id = entry.waitpoint_id\n if type(wp_id) == \"string\" and wp_id ~= \"\" then\n local wp_hash_key = \"ff:wp:\" .. tag .. \":\" .. wp_id\n local wp_cond_key = \"ff:wp:\" .. tag .. \":\" .. wp_id .. 
\":condition\"\n if redis.call(\"EXISTS\", wp_hash_key) == 1 and close_state_fields and #close_state_fields > 0 then\n redis.call(\"HSET\", wp_hash_key, unpack(close_state_fields))\n end\n if redis.call(\"EXISTS\", wp_cond_key) == 1 and close_cond_fields and #close_cond_fields > 0 then\n redis.call(\"HSET\", wp_cond_key, unpack(close_cond_fields))\n end\n end\n end\nend\n\n-- Extract a named field from a Valkey Stream entry\'s flat field array.\n-- Stream entries return {id, {field1, val1, field2, val2, ...}}.\n-- This operates on the inner flat array.\n-- @param fields flat array from stream entry[2]\n-- @param name field name to extract\n-- @return value string or nil\nlocal function extract_field(fields, name)\n for i = 1, #fields, 2 do\n if fields[i] == name then\n return fields[i + 1]\n end\n end\n return nil\nend\n\n---------------------------------------------------------------------------\n-- Suspension helpers (RFC-004)\n---------------------------------------------------------------------------\n\n-- Returns an empty signal summary JSON string for initial suspension record.\nlocal function initial_signal_summary_json()\n return \'{\"total_count\":0,\"matched_count\":0,\"signal_names\":[]}\'\nend\n\n-- Validates that a pending waitpoint can be activated by a suspension.\n-- Returns error tuple on failure, nil on success.\n-- @param wp_raw flat array from HGETALL on waitpoint hash\n-- @param eid expected execution_id\n-- @param att_idx expected attempt_index (string)\n-- @param now_ms current timestamp\nlocal function validate_pending_waitpoint(wp_raw, eid, att_idx, now_ms)\n if #wp_raw == 0 then\n return err(\"waitpoint_not_found\")\n end\n local wp = hgetall_to_table(wp_raw)\n if wp.state ~= \"pending\" then\n return err(\"waitpoint_not_pending\")\n end\n if wp.execution_id ~= eid then\n return err(\"invalid_waitpoint_for_execution\")\n end\n if tostring(wp.attempt_index) ~= tostring(att_idx) then\n return err(\"invalid_waitpoint_for_execution\")\n end\n -- Check if pending waitpoint has expired\n if is_set(wp.expires_at) and tonumber(wp.expires_at) <= now_ms then\n return err(\"pending_waitpoint_expired\")\n end\n return nil\nend\n\n-- Validates that a suspension record exists and is open (not closed).\n-- Returns error tuple on failure, nil on success. 
Also returns the parsed table.\n-- @param susp_raw flat array from HGETALL on suspension:current\nlocal function assert_active_suspension(susp_raw)\n if #susp_raw == 0 then\n return err(\"execution_not_suspended\")\n end\n local susp = hgetall_to_table(susp_raw)\n if not is_set(susp.suspension_id) then\n return err(\"execution_not_suspended\")\n end\n if is_set(susp.closed_at) then\n return err(\"execution_not_suspended\")\n end\n return nil, susp\nend\n\n-- Validates that a waitpoint belongs to the expected execution + suspension.\n-- Returns error tuple on failure, nil on success.\n-- @param wp_raw flat array from HGETALL on waitpoint hash\n-- @param eid expected execution_id\n-- @param sid expected suspension_id\n-- @param wid expected waitpoint_id\nlocal function assert_waitpoint_belongs(wp_raw, eid, sid, wid)\n if #wp_raw == 0 then\n return err(\"waitpoint_not_found\")\n end\n local wp = hgetall_to_table(wp_raw)\n if wp.execution_id ~= eid then\n return err(\"invalid_waitpoint_for_execution\")\n end\n if is_set(sid) and wp.suspension_id ~= sid then\n return err(\"invalid_waitpoint_for_execution\")\n end\n if is_set(wid) and wp.waitpoint_id ~= wid then\n return err(\"invalid_waitpoint_for_execution\")\n end\n return nil\nend\n\n---------------------------------------------------------------------------\n-- Policy\n---------------------------------------------------------------------------\n\n-- Decode a JSON policy string into flat key-value pairs suitable for HSET.\n-- @param json JSON string of the policy object\n-- @return flat array {k1, v1, k2, v2, ...} for use with redis.call(\"HSET\", key, unpack(...))\nlocal function unpack_policy(json)\n local policy = cjson.decode(json)\n local flat = {}\n for k, v in pairs(policy) do\n flat[#flat + 1] = k\n if type(v) == \"table\" then\n flat[#flat + 1] = cjson.encode(v)\n else\n flat[#flat + 1] = tostring(v)\n end\n end\n return flat\nend\n\n\n-- source: lua/version.lua\n-- FlowFabric library version check\n-- Returns the library version string. Used by the loader to detect\n-- whether the library is loaded and at the expected version.\n--\n-- Bump this string whenever any registered function\'s KEYS or ARGV arity\n-- changes, or a new function is added. Mismatched versions force\n-- `FUNCTION LOAD REPLACE` so old binaries cannot FCALL a library whose key\n-- signatures they expect a different shape for.\n--\n-- SINGLE SOURCE OF TRUTH: Rust\'s `LIBRARY_VERSION` is extracted from the\n-- `return \'X\'` literal below by `scripts/gen-ff-script-lua.sh`, which\n-- writes `crates/ff-script/src/flowfabric_lua_version`. Rust reads that\n-- file via `include_str!`. Do NOT maintain a separate Rust literal.\n-- Extract contract: the body MUST contain exactly one `return \'X\'` literal\n-- with single quotes (not double). CI runs the gen script and diffs; any\n-- drift fails the build.\n\nredis.register_function(\'ff_version\', function(keys, args)\n return \'27\'\nend)\n\n\n-- source: lua/lease.lua\n-- FlowFabric lease management functions\n-- Reference: RFC-003 (Lease and Fencing), RFC-010 \u{a7}4 (function inventory)\n--\n-- Depends on helpers: ok, err, ok_already_satisfied, hgetall_to_table,\n-- is_set, validate_lease, mark_expired\n\n---------------------------------------------------------------------------\n-- #6 ff_renew_lease\n--\n-- Extends a still-valid lease. 
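On success the reply is\n-- {1, \"OK\", \"<new_lease_expires_at_ms>\"}; failures return err() tuples such\n-- as stale_lease, lease_expired, or fence_required. 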
Preserves lease_id and lease_epoch.\n-- KEYS (4): exec_core, lease_current, lease_history, lease_expiry_zset\n-- ARGV (7): execution_id, attempt_index, attempt_id,\n-- lease_id, lease_epoch,\n-- lease_ttl_ms, lease_history_grace_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_renew_lease\', function(keys, args)\n -- Positional KEYS\n local K = {\n core_key = keys[1],\n lease_current = keys[2],\n lease_history = keys[3],\n lease_expiry_key = keys[4],\n }\n\n -- Positional ARGV\n local lease_ttl_n = require_number(args[6], \"lease_ttl_ms\")\n if type(lease_ttl_n) == \"table\" then return lease_ttl_n end\n local grace_n = require_number(args[7], \"lease_history_grace_ms\")\n if type(grace_n) == \"table\" then return grace_n end\n\n local A = {\n execution_id = args[1],\n attempt_index = args[2],\n attempt_id = args[3] or \"\",\n lease_id = args[4] or \"\",\n lease_epoch = args[5] or \"\",\n lease_ttl_ms = lease_ttl_n,\n lease_history_grace_ms = grace_n,\n }\n\n -- Server time\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n return err(\"execution_not_found\")\n end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} renew_lease is only meaningful when the caller holds the\n -- current lease. Hard-reject empty or partial fence triples; there is\n -- no operator-override escape hatch (owner\'s directive \u{2014} cleaner\n -- behavior than resolving-then-passing-self-match).\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then return err(\"fence_required\") end\n\n -- Validate lifecycle\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n\n -- Check revocation\n if core.ownership_state == \"lease_revoked\" or is_set(core.lease_revoked_at) then\n return err(\"lease_revoked\")\n end\n\n -- Check expiry (if expired, mark and reject)\n if tonumber(core.lease_expires_at or \"0\") <= now_ms then\n mark_expired(\n { core_key = K.core_key, lease_history_key = K.lease_history },\n core, now_ms, 1000)\n return err(\"lease_expired\")\n end\n\n -- Validate caller identity\n if tostring(core.current_attempt_index) ~= A.attempt_index then\n return err(\"stale_lease\")\n end\n if core.current_attempt_id ~= A.attempt_id then\n return err(\"stale_lease\")\n end\n if core.current_lease_id ~= A.lease_id then\n return err(\"stale_lease\")\n end\n if tostring(core.current_lease_epoch) ~= A.lease_epoch then\n return err(\"stale_lease\")\n end\n\n -- Compute new deadlines\n local new_expires_at = now_ms + A.lease_ttl_ms\n local new_renewal_deadline = now_ms + math.floor(A.lease_ttl_ms * 2 / 3)\n\n -- Update exec_core\n redis.call(\"HSET\", K.core_key,\n \"lease_last_renewed_at\", tostring(now_ms),\n \"lease_renewal_deadline\", tostring(new_renewal_deadline),\n \"lease_expires_at\", tostring(new_expires_at),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Update lease_current\n redis.call(\"HSET\", K.lease_current,\n \"last_renewed_at\", tostring(now_ms),\n \"renewal_deadline\", tostring(new_renewal_deadline),\n \"expires_at\", tostring(new_expires_at))\n redis.call(\"PEXPIREAT\", K.lease_current,\n new_expires_at + 
A.lease_history_grace_ms)\n\n -- Update lease_expiry index\n redis.call(\"ZADD\", K.lease_expiry_key, new_expires_at, A.execution_id)\n\n -- Renewal history event OFF for v1 (RFC-010 \u{a7}4.8g).\n -- The exec_core field lease_last_renewed_at provides the latest renewal\n -- timestamp without stream overhead. Enable per-lane when detailed\n -- ownership audit trails are needed.\n\n return ok(tostring(new_expires_at))\nend)\n\n---------------------------------------------------------------------------\n-- #28 ff_mark_lease_expired_if_due\n--\n-- Called by the lease expiry scanner. Re-validates that the lease is\n-- actually expired (guards against renewal since the ZRANGEBYSCORE read).\n-- Idempotent: no-op if already expired/reclaimed/revoked or not yet due.\n--\n-- KEYS (4): exec_core, lease_current, lease_expiry_zset, lease_history\n-- ARGV (1): execution_id\n---------------------------------------------------------------------------\nredis.register_function(\'ff_mark_lease_expired_if_due\', function(keys, args)\n -- Positional KEYS\n local K = {\n core_key = keys[1],\n lease_current = keys[2],\n lease_expiry_key = keys[3],\n lease_history = keys[4],\n }\n\n local execution_id = args[1]\n\n -- Server time\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n -- Execution gone \u{2014} clean up stale index entry\n redis.call(\"ZREM\", K.lease_expiry_key, execution_id)\n return ok_already_satisfied(\"execution_not_found\")\n end\n local core = hgetall_to_table(raw)\n\n -- Guard: not active \u{2192} nothing to expire\n if core.lifecycle_phase ~= \"active\" then\n -- Stale index entry: execution already left active phase.\n -- Clean up and return.\n redis.call(\"ZREM\", K.lease_expiry_key, execution_id)\n return ok_already_satisfied(\"not_active\")\n end\n\n -- Guard: already marked expired or ownership already cleared\n if core.ownership_state == \"lease_expired_reclaimable\" then\n return ok_already_satisfied(\"already_expired\")\n end\n if core.ownership_state == \"lease_revoked\" then\n return ok_already_satisfied(\"already_revoked\")\n end\n if core.ownership_state == \"unowned\" then\n -- Shouldn\'t happen for active phase, but defensive\n redis.call(\"ZREM\", K.lease_expiry_key, execution_id)\n return ok_already_satisfied(\"unowned\")\n end\n\n -- Check if lease is actually expired\n local expires_at = tonumber(core.lease_expires_at or \"0\")\n if expires_at > now_ms then\n -- Lease was renewed since the scanner read the index.\n -- The ZADD in renew_lease already updated the score.\n return ok_already_satisfied(\"not_yet_expired\")\n end\n\n -- Mark expired using shared helper\n -- Sets ownership_state=lease_expired_reclaimable, blocking_reason,\n -- blocking_detail, lease_expired_at, XADD lease_history \"expired\" event.\n mark_expired(\n { core_key = K.core_key, lease_history_key = K.lease_history },\n core, now_ms, 1000)\n\n -- NOTE: Do NOT ZREM from lease_expiry_key here.\n -- The entry stays so the scheduler (or a subsequent scanner cycle)\n -- can discover reclaimable executions from the same index.\n -- reclaim_execution or cancel_execution removes it.\n\n return ok(\"marked_expired\")\nend)\n\n---------------------------------------------------------------------------\n-- #8 ff_revoke_lease\n--\n-- Operator-initiated lease revocation. Immediate \u{2014} no grace period.\n-- Does NOT terminal-transition the execution. 
It clears the current\n-- owner and places the execution into lease_revoked ownership condition.\n-- The scheduler or operator must subsequently reclaim or cancel.\n--\n-- KEYS (5): exec_core, lease_current, lease_history,\n-- lease_expiry_zset, worker_leases\n-- ARGV (3): execution_id, expected_lease_id (or \"\" to skip check),\n-- revoke_reason\n---------------------------------------------------------------------------\nredis.register_function(\'ff_revoke_lease\', function(keys, args)\n -- Positional KEYS\n local K = {\n core_key = keys[1],\n lease_current = keys[2],\n lease_history = keys[3],\n lease_expiry_key = keys[4],\n worker_leases = keys[5],\n }\n\n -- Positional ARGV\n local A = {\n execution_id = args[1],\n expected_lease_id = args[2],\n revoke_reason = args[3],\n }\n\n -- Server time\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n return err(\"execution_not_found\")\n end\n local core = hgetall_to_table(raw)\n\n -- Must be active + leased\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n if core.ownership_state ~= \"leased\" then\n if core.ownership_state == \"lease_revoked\" then\n return ok_already_satisfied(\"already_revoked\")\n end\n if core.ownership_state == \"lease_expired_reclaimable\" then\n return ok_already_satisfied(\"already_expired\")\n end\n return err(\"no_active_lease\")\n end\n\n -- Optional lease_id check (for targeted revocation)\n if is_set(A.expected_lease_id) and core.current_lease_id ~= A.expected_lease_id then\n return err(\"stale_lease\",\n \"expected \" .. A.expected_lease_id .. \" but current is \" .. (core.current_lease_id or \"\"))\n end\n\n -- Capture identity for history before clearing\n local lease_id = core.current_lease_id or \"\"\n local lease_epoch = core.current_lease_epoch or \"\"\n local attempt_index = core.current_attempt_index or \"\"\n local attempt_id = core.current_attempt_id or \"\"\n local worker_id = core.current_worker_id or \"\"\n local worker_instance_id = core.current_worker_instance_id or \"\"\n\n -- Update exec_core: all 7 state vector dimensions + public_state\n redis.call(\"HSET\", K.core_key,\n -- 7 state vector dimensions\n \"lifecycle_phase\", \"active\",\n \"ownership_state\", \"lease_revoked\",\n \"eligibility_state\", core.eligibility_state or \"not_applicable\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"lease revoked: \" .. 
(A.revoke_reason or \"operator\"),\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n -- Derived public state (RFC-001 \u{a7}2.4 D3: active \u{2192} public_state=active)\n \"public_state\", \"active\",\n -- Revocation fields\n \"lease_revoked_at\", tostring(now_ms),\n \"lease_revoke_reason\", A.revoke_reason or \"operator\",\n \"last_mutation_at\", tostring(now_ms))\n\n -- DEL lease_current\n redis.call(\"DEL\", K.lease_current)\n\n -- ZREM from lease expiry index\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n\n -- SREM from worker leases set\n redis.call(\"SREM\", K.worker_leases, A.execution_id)\n\n -- Append revoked event to lease history\n redis.call(\"XADD\", K.lease_history, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"revoked\",\n \"lease_id\", lease_id,\n \"lease_epoch\", lease_epoch,\n \"attempt_index\", attempt_index,\n \"attempt_id\", attempt_id,\n \"worker_id\", worker_id,\n \"worker_instance_id\", worker_instance_id,\n \"reason\", A.revoke_reason or \"operator\",\n \"ts\", tostring(now_ms))\n\n return ok(\"revoked\", lease_id, lease_epoch)\nend)\n-- NOTE: ff_issue_reclaim_grant is in scheduling.lua\n-- NOTE: ff_reclaim_execution is in execution.lua\n\n\n-- source: lua/execution.lua\n-- FlowFabric execution lifecycle functions\n-- Reference: RFC-001 (Execution), RFC-010 \u{a7}4 (function inventory)\n--\n-- Depends on helpers: ok, err, hgetall_to_table, is_set,\n-- validate_lease, validate_lease_and_mark_expired,\n-- clear_lease_and_indexes, defensive_zrem_all_indexes, unpack_policy\n\n---------------------------------------------------------------------------\n-- #0 ff_create_execution\n--\n-- Creates a new execution: core hash, payload, policy, tags, indexes.\n-- Idempotent via idempotency key (SET NX with TTL).\n--\n-- KEYS (8): exec_core, payload_key, policy_key, tags_key,\n-- eligible_or_delayed_zset, idem_key,\n-- execution_deadline_zset, all_executions_set\n-- ARGV (13): execution_id, namespace, lane_id, execution_kind,\n-- priority, creator_identity, policy_json,\n-- input_payload, delay_until, dedup_ttl_ms,\n-- tags_json, execution_deadline_at, partition_id\n---------------------------------------------------------------------------\nredis.register_function(\'ff_create_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n payload_key = keys[2],\n policy_key = keys[3],\n tags_key = keys[4],\n scheduling_zset = keys[5], -- eligible or delayed\n idem_key = keys[6],\n deadline_zset = keys[7],\n all_executions_set = keys[8],\n }\n\n local priority_n = require_number(args[5], \"priority\")\n if type(priority_n) == \"table\" then return priority_n end\n\n local A = {\n execution_id = args[1],\n namespace = args[2],\n lane_id = args[3],\n execution_kind = args[4],\n priority = priority_n,\n creator_identity = args[6],\n policy_json = args[7],\n input_payload = args[8],\n delay_until = args[9], -- \"\" or ms timestamp\n dedup_ttl_ms = args[10], -- \"\" or ms\n tags_json = args[11], -- \"\" or JSON object\n execution_deadline_at = args[12], -- \"\" or ms timestamp\n partition_id = args[13],\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Clamp priority to [0, 9000]. The composite eligible-ZSET score formula\n -- -(priority * 1_000_000_000_000) + created_at uses Lua doubles (IEEE 754,\n -- 53-bit mantissa). 
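-- Illustrative arithmetic (hypothetical values, not from any fixture): with\n -- priority = 5 and created_at_ms = 1.7e12, the score is\n -- -(5 * 1e12) + 1.7e12 = -3.3e12, comfortably inside the 2^53 (~9.007e15)\n -- exact-integer range of a double.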
priority > 9007 overflows the multiplication step;\n -- priority > ~8300 overflows the combined score when created_at is added.\n -- Clamping to 9000 keeps a safe margin while supporting 4+ orders of magnitude.\n if A.priority < 0 then A.priority = 0 end\n if A.priority > 9000 then A.priority = 9000 end\n\n -- 1. Idempotency check (only when idem_key is a real key, not the noop placeholder)\n if K.idem_key ~= \"\" and not string.find(K.idem_key, \"ff:noop:\") then\n local existing = redis.call(\"GET\", K.idem_key)\n if existing then\n return ok_duplicate(existing)\n end\n end\n\n -- 2. Guard: execution already exists\n if redis.call(\"EXISTS\", K.core_key) == 1 then\n return {1, \"DUPLICATE\", A.execution_id}\n end\n\n -- 2b. Policy JSON validation \u{2014} extract required_capabilities CSV now, BEFORE\n -- any write, so an error aborts without leaving a half-written exec_core.\n -- Fail-CLOSED on any malformed/typed input rather than silently defaulting\n -- to the empty-required wildcard \u{2014} required_capabilities is security-\n -- sensitive (an empty set matches any worker).\n --\n -- * An empty / \"{}\" policy_json \u{2192} no required_capabilities field written\n -- \u{2192} wildcard match, intentional.\n -- * A non-empty policy_json that fails cjson.decode \u{2192} fail with\n -- invalid_policy_json. An operator who MEANT \"no policy\" passes \"\".\n -- * routing_requirements.required_capabilities present but not an array\n -- \u{2192} fail with invalid_capabilities:required_not_array.\n -- * Any element that is not a non-empty string \u{2192} fail with\n -- invalid_capabilities:required:non_string_token. Silent-drop on\n -- `[\"gpu\", null, 42]` would erase real requirements.\n -- * Comma in a token \u{2192} fail (reserved delimiter, see ff_issue_claim_grant).\n -- * ASCII control byte (0x00-0x1F, 0x7F) or space (0x20) \u{2192} fail\n -- (invalid_capabilities:required:control_or_whitespace). Mirrors the\n -- Rust ingress in ff-sdk::FlowFabricWorker::connect and\n -- ff-scheduler::Scheduler::claim_for_worker (R3 relaxed printable-ASCII\n -- check to allow UTF-8 printable above 0x7F while still rejecting\n -- whitespace/control). This is the \"last line of defense\" for admin\n -- direct-HSET bypass: a required cap containing \"\\n\" or \"\\0\" is\n -- impossible to type, impossible to debug, and would silently pin an\n -- execution as unclaimable forever.\n -- * Bounds: same 4096 bytes / 256 tokens ceiling as the worker CSV.\n --\n -- Note on UTF-8: the byte-range test rejects ASCII control + space but\n -- accepts every byte \u{2265} 0x21 except 0x7F (DEL). UTF-8 multibyte sequences\n -- use only bytes 0x80-0xBF (continuation) or 0xC0-0xFD (lead), all above\n -- 0x7F, so i18n caps like \"\u{4e1c}\u{4eac}-gpu\" pass through intact. 
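-- Worked example (illustrative policy_json values only, not from any\n -- fixture or test):\n -- {\"routing_requirements\":{\"required_capabilities\":[\"zone-east\",\"gpu\"]}}\n -- -> tokens sorted and joined: required_capabilities = \"gpu,zone-east\"\n -- {\"routing_requirements\":{\"required_capabilities\":[\"gpu\",42]}}\n -- -> err(\"invalid_capabilities\", \"required:non_string_token\")\n -- {\"routing_requirements\":{\"required_capabilities\":[\"gpu,fast\"]}}\n -- -> err(\"invalid_capabilities\", \"required:comma_in_token\")\n -- \"\" or \"{}\" -> no required_capabilities field written (wildcard).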
See RFC-009 \u{a7}7.5.\n local required_caps_csv = nil\n if is_set(A.policy_json) and A.policy_json ~= \"{}\" then\n local ok_decode, policy = pcall(cjson.decode, A.policy_json)\n if not ok_decode then\n return err(\"invalid_policy_json\", \"malformed\")\n end\n if type(policy) == \"table\" and policy.routing_requirements ~= nil then\n if type(policy.routing_requirements) ~= \"table\" then\n return err(\"invalid_policy_json\", \"routing_requirements:not_object\")\n end\n local caps = policy.routing_requirements.required_capabilities\n if caps ~= nil then\n if type(caps) ~= \"table\" then\n return err(\"invalid_capabilities\", \"required:not_array\")\n end\n local list = {}\n for _, cap in ipairs(caps) do\n if type(cap) ~= \"string\" then\n return err(\"invalid_capabilities\", \"required:non_string_token\")\n end\n if #cap == 0 then\n return err(\"invalid_capabilities\", \"required:empty_token\")\n end\n if string.find(cap, \",\", 1, true) then\n return err(\"invalid_capabilities\", \"required:comma_in_token\")\n end\n -- Reject ASCII control (0x00-0x1F), DEL (0x7F), and space (0x20).\n -- Iterating byte-by-byte: any byte in 0x00..=0x20 or == 0x7F fails.\n for i = 1, #cap do\n local b = cap:byte(i)\n if b <= 0x20 or b == 0x7F then\n return err(\"invalid_capabilities\", \"required:control_or_whitespace\")\n end\n end\n list[#list + 1] = cap\n end\n if #list > CAPS_MAX_TOKENS then\n return err(\"invalid_capabilities\", \"required:too_many_tokens\")\n end\n table.sort(list)\n if #list > 0 then\n local csv = table.concat(list, \",\")\n if #csv > CAPS_MAX_BYTES then\n return err(\"invalid_capabilities\", \"required:too_many_bytes\")\n end\n required_caps_csv = csv\n end\n end\n end\n end\n\n -- 3. Determine initial eligibility\n local lifecycle_phase = \"runnable\"\n local eligibility_state, blocking_reason, blocking_detail, public_state\n local is_delayed = is_set(A.delay_until) and tonumber(A.delay_until) > now_ms\n\n if is_delayed then\n eligibility_state = \"not_eligible_until_time\"\n blocking_reason = \"waiting_for_delay\"\n blocking_detail = \"delayed until \" .. A.delay_until\n public_state = \"delayed\"\n else\n eligibility_state = \"eligible_now\"\n blocking_reason = \"waiting_for_worker\"\n blocking_detail = \"\"\n public_state = \"waiting\"\n end\n\n -- 4. 
Create exec_core \u{2014} ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"execution_id\", A.execution_id,\n \"namespace\", A.namespace,\n \"lane_id\", A.lane_id,\n \"execution_kind\", A.execution_kind,\n \"partition_id\", A.partition_id,\n \"priority\", A.priority,\n \"creator_identity\", A.creator_identity,\n -- 7 state vector dimensions\n \"lifecycle_phase\", lifecycle_phase,\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", eligibility_state,\n \"blocking_reason\", blocking_reason,\n \"blocking_detail\", blocking_detail,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"pending_first_attempt\",\n \"public_state\", public_state,\n -- accounting\n \"total_attempt_count\", \"0\",\n \"current_attempt_index\", \"\",\n \"current_attempt_id\", \"\",\n \"current_lease_id\", \"\",\n \"current_lease_epoch\", \"0\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"current_lane\", A.lane_id,\n \"retry_count\", \"0\",\n \"replay_count\", \"0\",\n \"lease_reclaim_count\", \"0\",\n -- timestamps\n \"created_at\", now_ms,\n \"started_at\", \"\",\n \"completed_at\", \"\",\n \"last_transition_at\", now_ms,\n \"last_mutation_at\", now_ms,\n -- lease fields (cleared)\n \"lease_acquired_at\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"lease_expired_at\", \"\",\n \"lease_revoked_at\", \"\",\n \"lease_revoke_reason\", \"\",\n -- delay\n \"delay_until\", is_delayed and A.delay_until or \"\",\n -- suspension (cleared)\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n -- pending lineage (cleared)\n \"pending_retry_reason\", \"\",\n \"pending_replay_reason\", \"\",\n \"pending_replay_requested_by\", \"\",\n \"pending_previous_attempt_index\", \"\",\n -- progress\n \"progress_pct\", \"\",\n \"progress_message\", \"\",\n \"progress_updated_at\", \"\",\n -- flow\n \"flow_id\", \"\")\n\n -- 5. Store payload\n if is_set(A.input_payload) then\n redis.call(\"SET\", K.payload_key, A.input_payload)\n end\n\n -- 6. Store policy + required_capabilities. Validation already ran before\n -- any write (step 2b); here we only persist the pre-validated CSV. See\n -- step 2b for the fail-closed rationale on malformed / typed inputs.\n if is_set(A.policy_json) then\n redis.call(\"SET\", K.policy_key, A.policy_json)\n end\n if required_caps_csv then\n redis.call(\"HSET\", K.core_key, \"required_capabilities\", required_caps_csv)\n end\n\n -- 7. Store tags\n if is_set(A.tags_json) and A.tags_json ~= \"{}\" then\n local ok_decode, tags = pcall(cjson.decode, A.tags_json)\n if ok_decode and type(tags) == \"table\" then\n local flat = {}\n for k, v in pairs(tags) do\n flat[#flat + 1] = k\n flat[#flat + 1] = tostring(v)\n end\n if #flat > 0 then\n redis.call(\"HSET\", K.tags_key, unpack(flat))\n end\n end\n end\n\n -- 8. Add to scheduling index\n if is_delayed then\n redis.call(\"ZADD\", K.scheduling_zset, tonumber(A.delay_until), A.execution_id)\n else\n -- Composite score: -(priority * 1_000_000_000_000) + created_at_ms\n local score = 0 - (A.priority * 1000000000000) + now_ms\n redis.call(\"ZADD\", K.scheduling_zset, score, A.execution_id)\n end\n\n -- 9. SADD to all_executions partition index\n redis.call(\"SADD\", K.all_executions_set, A.execution_id)\n\n -- 10. Execution deadline index\n if is_set(A.execution_deadline_at) then\n redis.call(\"ZADD\", K.deadline_zset, tonumber(A.execution_deadline_at), A.execution_id)\n end\n\n -- 11. 
Set idempotency key with TTL\n -- Guard: PX 0 or PX <0 causes Valkey error (\"invalid expire time\"),\n -- which would abort the FCALL after exec_core was already written (step 4).\n local dedup_ms = tonumber(A.dedup_ttl_ms) or 0\n if dedup_ms > 0 and K.idem_key ~= \"\" and not string.find(K.idem_key, \"ff:noop:\") then\n redis.call(\"SET\", K.idem_key, A.execution_id,\n \"PX\", dedup_ms)\n end\n\n return ok(A.execution_id, public_state)\nend)\n\n---------------------------------------------------------------------------\n-- #1 ff_claim_execution (new attempt)\n--\n-- Consumes a claim grant, creates a new attempt + lease, transitions\n-- runnable \u{2192} active. Attempt type derived from exec_core attempt_state.\n--\n-- KEYS (14): exec_core, claim_grant, eligible_zset, lease_expiry_zset,\n-- worker_leases, attempt_hash, attempt_usage, attempt_policy,\n-- attempts_zset, lease_current, lease_history, active_index,\n-- attempt_timeout_zset, execution_deadline_zset\n-- ARGV (12): execution_id, worker_id, worker_instance_id, lane,\n-- capability_snapshot_hash, lease_id, lease_ttl_ms,\n-- renew_before_ms, attempt_id, attempt_policy_json,\n-- attempt_timeout_ms, execution_deadline_at\n--\n-- KNOWN LIMITATION (flow-cancel race): ff_claim_execution reads\n-- exec_core on {p:N} but cannot atomically read flow_core on {fp:N}\n-- (cross-slot). If ff_cancel_flow fired and its async member dispatch\n-- was dropped by a transient Valkey error, the member\'s exec_core has\n-- NOT yet been flipped to terminal \u{2014} so a worker may still claim it and\n-- run it to completion inside a cancelled flow. The flow\'s own\n-- public_flow_state is correctly terminal; only this one member escapes.\n-- Mitigations already in place:\n-- * cancel_flow(wait=true) avoids the bg dispatch entirely.\n-- * ff_apply_dependency_to_child rejects additions to terminal flows,\n-- so children cannot be added mid-cancel.\n-- * retention eventually trims the stale member.\n-- Full fix (flag on exec_core maintained by a broadcast loop) is a\n-- deliberate design change deferred past Batch A.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_claim_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n claim_grant = keys[2],\n eligible_zset = keys[3],\n lease_expiry_key = keys[4],\n worker_leases_key = keys[5],\n attempt_hash = keys[6],\n attempt_usage = keys[7],\n attempt_policy = keys[8],\n attempts_zset = keys[9],\n lease_current_key = keys[10],\n lease_history_key = keys[11],\n active_index_key = keys[12],\n attempt_timeout_key = keys[13],\n execution_deadline_key = keys[14],\n }\n\n local lease_ttl_n = require_number(args[7], \"lease_ttl_ms\")\n if type(lease_ttl_n) == \"table\" then return lease_ttl_n end\n local renew_before_n = require_number(args[8], \"renew_before_ms\")\n if type(renew_before_n) == \"table\" then return renew_before_n end\n\n local A = {\n execution_id = args[1],\n worker_id = args[2],\n worker_instance_id = args[3],\n lane = args[4],\n capability_hash = args[5],\n lease_id = args[6],\n lease_ttl_ms = lease_ttl_n,\n renew_before_ms = renew_before_n,\n attempt_id = args[9],\n attempt_policy_json = args[10],\n attempt_timeout_ms = args[11], -- \"\" or ms\n execution_deadline_at = args[12], -- \"\" or ms\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. 
Validate execution state\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"runnable\" then return err(\"execution_not_leaseable\") end\n if core.ownership_state ~= \"unowned\" then return err(\"lease_conflict\") end\n if core.eligibility_state ~= \"eligible_now\" then return err(\"execution_not_leaseable\") end\n if core.terminal_outcome ~= \"none\" then return err(\"execution_not_leaseable\") end\n\n -- Defense-in-depth: invariant A3\n if core.attempt_state == \"running_attempt\" then\n return err(\"active_attempt_exists\")\n end\n -- Dispatch: resume-from-suspension must use claim_resumed_execution\n if core.attempt_state == \"attempt_interrupted\" then\n return err(\"use_claim_resumed_execution\")\n end\n\n -- 2. Validate and consume claim grant\n local grant_raw = redis.call(\"HGETALL\", K.claim_grant)\n if #grant_raw == 0 then return err(\"invalid_claim_grant\") end\n local grant = hgetall_to_table(grant_raw)\n\n if grant.worker_id ~= A.worker_id then\n return err(\"invalid_claim_grant\")\n end\n if is_set(grant.grant_expires_at) and tonumber(grant.grant_expires_at) < now_ms then\n return err(\"claim_grant_expired\")\n end\n redis.call(\"DEL\", K.claim_grant)\n\n -- 3. Compute lease fields\n local next_epoch = tonumber(core.current_lease_epoch or \"0\") + 1\n local expires_at = now_ms + A.lease_ttl_ms\n local renewal_deadline = now_ms + A.renew_before_ms\n local next_att_idx = tonumber(core.total_attempt_count or \"0\")\n\n -- 4. Derive attempt type from exec core attempt_state\n local attempt_type = \"initial\"\n local lineage_fields = {}\n if core.attempt_state == \"pending_retry_attempt\" then\n attempt_type = \"retry\"\n lineage_fields = {\n \"retry_reason\", core.pending_retry_reason or \"\",\n \"previous_attempt_index\", core.pending_previous_attempt_index or \"\"\n }\n elseif core.attempt_state == \"pending_replay_attempt\" then\n attempt_type = \"replay\"\n lineage_fields = {\n \"replay_reason\", core.pending_replay_reason or \"\",\n \"replay_requested_by\", core.pending_replay_requested_by or \"\",\n \"replayed_from_attempt_index\", core.pending_previous_attempt_index or \"\"\n }\n end\n\n -- 5. Create attempt record\n -- Construct actual attempt key from hash tag + computed index.\n -- KEYS[6] is a placeholder (always index 0); on retry/replay the\n -- actual index is > 0, so we build the real key dynamically.\n -- All attempt keys share the same {p:N} hash tag \u{2192} same Cluster slot.\n local tag = string.match(K.core_key, \"(%b{})\")\n local att_key = \"ff:attempt:\" .. tag .. \":\" .. A.execution_id .. \":\" .. tostring(next_att_idx)\n local att_usage_key = att_key .. \":usage\"\n local att_policy_key = att_key .. \":policy\"\n\n local attempt_fields = {\n \"attempt_id\", A.attempt_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", tostring(next_att_idx),\n \"attempt_type\", attempt_type,\n \"attempt_state\", \"started\",\n \"created_at\", tostring(now_ms),\n \"started_at\", tostring(now_ms),\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(next_epoch),\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id\n }\n for _, v in ipairs(lineage_fields) do\n attempt_fields[#attempt_fields + 1] = v\n end\n redis.call(\"HSET\", att_key, unpack(attempt_fields))\n redis.call(\"ZADD\", K.attempts_zset, now_ms, tostring(next_att_idx))\n\n -- 5b. 
Initialize attempt usage counters\n redis.call(\"HSET\", att_usage_key,\n \"last_usage_report_seq\", \"0\")\n\n -- 5c. Store attempt policy snapshot\n if is_set(A.attempt_policy_json) then\n local policy_flat = unpack_policy(A.attempt_policy_json)\n if #policy_flat > 0 then\n redis.call(\"HSET\", att_policy_key, unpack(policy_flat))\n end\n end\n\n -- 6. Create lease record\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"HSET\", K.lease_current_key,\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(next_epoch),\n \"execution_id\", A.execution_id,\n \"attempt_id\", A.attempt_id,\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"acquired_at\", tostring(now_ms),\n \"expires_at\", tostring(expires_at),\n \"last_renewed_at\", tostring(now_ms),\n \"renewal_deadline\", tostring(renewal_deadline))\n\n -- 7. Update execution core \u{2014} ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"active\",\n \"ownership_state\", \"leased\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"running_attempt\",\n \"public_state\", \"active\",\n \"current_attempt_index\", tostring(next_att_idx),\n \"total_attempt_count\", tostring(next_att_idx + 1),\n \"current_attempt_id\", A.attempt_id,\n \"current_lease_id\", A.lease_id,\n \"current_lease_epoch\", tostring(next_epoch),\n \"current_worker_id\", A.worker_id,\n \"current_worker_instance_id\", A.worker_instance_id,\n \"current_lane\", A.lane,\n \"lease_acquired_at\", tostring(now_ms),\n \"lease_expires_at\", tostring(expires_at),\n \"lease_last_renewed_at\", tostring(now_ms),\n \"lease_renewal_deadline\", tostring(renewal_deadline),\n -- Preserve the first-claim timestamp across retries; only fall\n -- back to `now_ms` when the stored value is empty (initial state\n -- written at HSET, line 208, or after a reset). In Lua the empty\n -- string is truthy, so `core.started_at or now_ms` would wedge at\n -- \"\" forever on the first claim \u{2014} explicit emptiness check fixes\n -- the scenario 4 stage_latency bench (exec_core surfaced this via\n -- the new ExecutionInfo.started_at REST field).\n \"started_at\", (core.started_at ~= nil and core.started_at ~= \"\") and core.started_at or tostring(now_ms),\n -- Clear pending lineage fields (consumed above)\n \"pending_retry_reason\", \"\",\n \"pending_replay_reason\", \"\",\n \"pending_replay_requested_by\", \"\",\n \"pending_previous_attempt_index\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 8. Update indexes\n redis.call(\"ZREM\", K.eligible_zset, A.execution_id)\n redis.call(\"ZADD\", K.lease_expiry_key, expires_at, A.execution_id)\n redis.call(\"SADD\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZADD\", K.active_index_key, expires_at, A.execution_id)\n\n -- 8a. Timeout indexes\n if is_set(A.attempt_timeout_ms) and A.attempt_timeout_ms ~= \"0\" then\n redis.call(\"ZADD\", K.attempt_timeout_key,\n now_ms + tonumber(A.attempt_timeout_ms), A.execution_id)\n end\n if is_set(A.execution_deadline_at) and A.execution_deadline_at ~= \"0\" then\n redis.call(\"ZADD\", K.execution_deadline_key,\n tonumber(A.execution_deadline_at), A.execution_id)\n end\n\n -- 9. 
Lease history event\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"acquired\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(next_epoch),\n \"attempt_id\", A.attempt_id,\n \"attempt_index\", tostring(next_att_idx),\n \"worker_id\", A.worker_id,\n \"ts\", tostring(now_ms))\n\n return ok(A.lease_id, tostring(next_epoch), tostring(expires_at),\n A.attempt_id, tostring(next_att_idx), attempt_type)\nend)\n\n---------------------------------------------------------------------------\n-- #3 ff_complete_execution\n--\n-- Validate lease, end attempt, store result, transition active\u{2192}terminal.\n--\n-- KEYS (12): exec_core, attempt_hash, lease_expiry_zset, worker_leases,\n-- terminal_zset, lease_current, lease_history, active_index,\n-- stream_meta, result_key, attempt_timeout_zset,\n-- execution_deadline_zset\n-- ARGV (6): execution_id, lease_id, lease_epoch, attempt_id, result_payload,\n-- source\n-- source (args[6]) \u{2014} RFC #58.5: empty fence triple requires\n-- source == \"operator_override\" to proceed unfenced; otherwise the\n-- FCALL returns err(\"fence_required\"). Terminal op, so gate is strict.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_complete_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n lease_expiry_key = keys[3],\n worker_leases_key = keys[4],\n terminal_zset = keys[5],\n lease_current_key = keys[6],\n lease_history_key = keys[7],\n active_index_key = keys[8],\n stream_meta = keys[9],\n result_key = keys[10],\n attempt_timeout_key = keys[11],\n execution_deadline_key = keys[12],\n }\n\n local A = {\n execution_id = args[1],\n lease_id = args[2] or \"\",\n lease_epoch = args[3] or \"\",\n attempt_id = args[4] or \"\",\n result_payload = args[5] or \"\",\n source = args[6] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Read + validate lease\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} fence resolution. 
Terminal ops require either a\n -- non-empty fence triple OR source==\"operator_override\".\n -- In both branches validate_lease_and_mark_expired runs afterwards,\n -- which enforces lifecycle_phase==\"active\", ownership!=revoked, and\n -- expiry \u{2014} so operator overrides cannot bypass those preconditions.\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then\n if A.source ~= \"operator_override\" then return err(\"fence_required\") end\n -- Bind server-resolved fence so validate_lease_and_mark_expired\'s\n -- identity check trivially matches; the lifecycle/expiry gates\n -- still fire and are the real safety net for the override path.\n A.lease_id = fence.lease_id\n A.lease_epoch = fence.lease_epoch\n A.attempt_id = fence.attempt_id\n end\n local lease_err = validate_lease_and_mark_expired(\n core, A, now_ms, K, 1000)\n if lease_err then return lease_err end\n\n -- End attempt\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_success\",\n \"ended_at\", tostring(now_ms))\n\n -- Close stream if exists\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"attempt_success\")\n end\n\n -- Store result\n if A.result_payload ~= \"\" then\n redis.call(\"SET\", K.result_key, A.result_payload)\n end\n\n -- Update execution core \u{2014} ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"success\",\n \"attempt_state\", \"attempt_terminal\",\n \"public_state\", \"completed\",\n \"completed_at\", tostring(now_ms),\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"lease_expired_at\", \"\",\n \"lease_revoked_at\", \"\",\n \"lease_revoke_reason\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Update indexes\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZADD\", K.terminal_zset, now_ms, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n\n -- Clean up lease\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", core.current_attempt_index or \"\",\n \"reason\", \"completed\",\n \"ts\", tostring(now_ms))\n\n -- Push-based DAG promotion (Batch C item 6). Only publish when the\n -- execution actually belongs to a flow \u{2014} standalone executions never\n -- have downstream edges for the engine to resolve. The engine\'s\n -- CompletionListener scanner SUBSCRIBEs to ff:dag:completions and\n -- calls dispatch_dependency_resolution per message. 
The\n -- dependency_reconciler interval scan is retained as a safety net\n -- for: missed messages during listener restart, ungated state\n -- (flow_id empty on older executions), and cluster broadcast gaps.\n if is_set(core.flow_id) then\n local payload = cjson.encode({\n execution_id = A.execution_id,\n flow_id = core.flow_id,\n outcome = \"success\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n\n return ok(\"completed\")\nend)\n\n---------------------------------------------------------------------------\n-- #12a ff_cancel_execution\n--\n-- Cancel from any non-terminal state. Multi-path:\n-- active \u{2192} validate lease or operator override, end attempt, clear lease\n-- runnable \u{2192} defensive ZREM all indexes\n-- suspended \u{2192} close suspension/waitpoint, end attempt\n-- All paths: terminal(cancelled), defensive ZREM, ZADD terminal.\n--\n-- KEYS (21): exec_core, attempt_hash, stream_meta, lease_current,\n-- lease_history, lease_expiry_zset, worker_leases,\n-- suspension_current, waitpoint_hash, wp_condition,\n-- suspension_timeout_zset, terminal_zset,\n-- attempt_timeout_zset, execution_deadline_zset,\n-- eligible_zset, delayed_zset, blocked_deps_zset,\n-- blocked_budget_zset, blocked_quota_zset,\n-- blocked_route_zset, blocked_operator_zset\n-- ARGV (5): execution_id, reason, source, lease_id, lease_epoch\n---------------------------------------------------------------------------\nredis.register_function(\'ff_cancel_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n stream_meta = keys[3],\n lease_current_key = keys[4],\n lease_history_key = keys[5],\n lease_expiry_key = keys[6],\n worker_leases_key = keys[7],\n suspension_current = keys[8],\n waitpoint_hash = keys[9],\n wp_condition = keys[10],\n suspension_timeout_key = keys[11],\n terminal_key = keys[12],\n attempt_timeout_key = keys[13],\n execution_deadline_key = keys[14],\n eligible_key = keys[15],\n delayed_key = keys[16],\n blocked_deps_key = keys[17],\n blocked_budget_key = keys[18],\n blocked_quota_key = keys[19],\n blocked_route_key = keys[20],\n blocked_operator_key = keys[21],\n -- active_index_key and suspended_key are constructed dynamically below\n -- from the hash tag + lane_id (avoids changing KEYS count).\n active_index_key = nil,\n suspended_key = nil,\n }\n\n local A = {\n execution_id = args[1],\n reason = args[2],\n source = args[3] or \"\", -- \"operator_override\" or \"\"\n lease_id = args[4] or \"\",\n lease_epoch = args[5] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- Construct lane:active and lane:suspended keys dynamically from hash tag.\n -- These are not in the 21 KEYS array but defensive_zrem_all_indexes needs\n -- them to clean stale entries when cancelling active or suspended executions.\n local tag = string.match(K.core_key, \"(%b{})\")\n local lane = core.lane_id or \"default\"\n K.active_index_key = \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":active\"\n K.suspended_key = \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":suspended\"\n\n -- Already terminal. 
Enriched error detail mirrors validate_lease_and_mark_expired\n -- so SDK-side replay reconciliation works for cancel the same as for\n -- complete/fail.\n if core.lifecycle_phase == \"terminal\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n \"terminal\",\n core.current_attempt_id or \"\")\n end\n\n local cancelled_from = core.lifecycle_phase\n\n -- PATH: Active\n if core.lifecycle_phase == \"active\" then\n -- Require lease validation or operator override\n if A.source ~= \"operator_override\" then\n if core.ownership_state == \"lease_revoked\" then\n return err(\"lease_revoked\")\n end\n if is_set(A.lease_id) then\n if core.current_lease_id ~= A.lease_id then return err(\"stale_lease\") end\n if core.current_lease_epoch ~= A.lease_epoch then return err(\"stale_lease\") end\n end\n end\n\n -- End attempt\n if is_set(core.current_attempt_index) then\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_cancelled\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", \"cancelled: \" .. A.reason)\n end\n\n -- Close stream if exists\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"attempt_cancelled\")\n end\n\n -- Release lease\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", core.current_lease_id or \"\",\n \"lease_epoch\", core.current_lease_epoch or \"\",\n \"attempt_index\", core.current_attempt_index or \"\",\n \"reason\", \"cancelled\",\n \"ts\", tostring(now_ms))\n end\n\n -- PATH: Suspended\n if core.lifecycle_phase == \"suspended\" then\n -- End attempt (interrupted \u{2192} ended_cancelled)\n if is_set(core.current_attempt_index) then\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_cancelled\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", \"cancelled: \" .. A.reason)\n end\n\n -- Close stream if exists\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"attempt_cancelled\")\n end\n\n -- Close suspension + waitpoint + wp_condition\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n redis.call(\"HSET\", K.suspension_current,\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"cancelled\")\n end\n if redis.call(\"EXISTS\", K.waitpoint_hash) == 1 then\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"cancelled\")\n end\n if redis.call(\"EXISTS\", K.wp_condition) == 1 then\n redis.call(\"HSET\", K.wp_condition,\n \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"cancelled\")\n end\n -- RFC-014 \u{a7}3.1.1 composite cleanup owner: cancel path.\n composite_cleanup(\n K.suspension_current .. \":satisfied_set\",\n K.suspension_current .. \":member_map\")\n -- RFC-014 Pattern 3: close additional waitpoints co-owned by the\n -- suspension before it\'s discarded. 
Reread suspension to pick up\n -- the additional_waitpoints_json field.\n local susp_raw = redis.call(\"HGETALL\", K.suspension_current)\n if #susp_raw > 0 then\n local susp = hgetall_to_table(susp_raw)\n close_additional_waitpoints(\n K.suspension_current,\n susp.additional_waitpoints_json or \"\",\n { \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"cancelled\" },\n { \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"cancelled\" })\n end\n end\n\n -- ALL PATHS: exec_core FIRST for terminal transition (\u{a7}4.8b Rule 2)\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"cancelled\",\n \"attempt_state\", is_set(core.current_attempt_index) and \"attempt_terminal\" or (core.attempt_state or \"none\"),\n \"public_state\", \"cancelled\",\n \"cancellation_reason\", A.reason,\n \"cancelled_by\", A.source ~= \"\" and A.source or A.execution_id,\n \"completed_at\", tostring(now_ms),\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Defensive ZREM from ALL scheduling + timeout indexes\n defensive_zrem_all_indexes(K, A.execution_id, K.terminal_key)\n\n -- ZADD terminal (unconditional)\n redis.call(\"ZADD\", K.terminal_key, now_ms, A.execution_id)\n\n -- Push-based DAG promotion (Batch C item 6). Skip the publish when\n -- the cancel came from `flow_cascade` \u{2014} that source means a parent\n -- already propagated and the engine is walking the edge graph; an\n -- additional publish here would trigger redundant dispatch work.\n -- (Not a correctness issue since dispatch is idempotent, but it\n -- avoids unnecessary load under wide fan-out cancellations.)\n if is_set(core.flow_id) and A.source ~= \"flow_cascade\" then\n local payload = cjson.encode({\n execution_id = A.execution_id,\n flow_id = core.flow_id,\n outcome = \"cancelled\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n\n return ok(\"cancelled\", cancelled_from)\nend)\n\n---------------------------------------------------------------------------\n-- #30 ff_delay_execution\n--\n-- Worker delays its own active execution. Releases lease, transitions\n-- to runnable + not_eligible_until_time. Same attempt continues (paused).\n--\n-- KEYS (9): exec_core, attempt_hash, lease_current, lease_history,\n-- lease_expiry_zset, worker_leases, active_index,\n-- delayed_zset, attempt_timeout_zset\n-- ARGV (6): execution_id, lease_id, lease_epoch, attempt_id, delay_until,\n-- source\n-- source (args[6]) \u{2014} RFC #58.5: terminal-op-style fence gate. 
Empty\n-- fence triple requires source == \"operator_override\" to proceed unfenced.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_delay_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n lease_current_key = keys[3],\n lease_history_key = keys[4],\n lease_expiry_key = keys[5],\n worker_leases_key = keys[6],\n active_index_key = keys[7],\n delayed_zset = keys[8],\n attempt_timeout_key = keys[9],\n }\n\n local A = {\n execution_id = args[1],\n lease_id = args[2] or \"\",\n lease_epoch = args[3] or \"\",\n attempt_id = args[4] or \"\",\n delay_until = args[5] or \"\",\n source = args[6] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} fence resolution with operator override for terminal-style ops.\n -- In both branches validate_lease_and_mark_expired runs afterwards, which\n -- enforces lifecycle_phase==\"active\", ownership!=revoked, and expiry \u{2014}\n -- operator overrides cannot bypass those preconditions.\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then\n if A.source ~= \"operator_override\" then return err(\"fence_required\") end\n A.lease_id = fence.lease_id\n A.lease_epoch = fence.lease_epoch\n A.attempt_id = fence.attempt_id\n end\n local lease_err = validate_lease_and_mark_expired(\n core, A, now_ms, K, 1000)\n if lease_err then return lease_err end\n\n -- OOM-SAFE WRITE ORDERING: exec_core FIRST (point of no return)\n -- ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_eligible_until_time\",\n \"blocking_reason\", \"waiting_for_delay\",\n \"blocking_detail\", \"delayed until \" .. 
A.delay_until,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", \"delayed\",\n \"delay_until\", A.delay_until,\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Pause attempt: started \u{2192} suspended\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"suspended\",\n \"suspended_at\", tostring(now_ms),\n \"suspension_id\", \"worker_delay\")\n\n -- Release lease + update indexes\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n\n -- Add to delayed set\n redis.call(\"ZADD\", K.delayed_zset, tonumber(A.delay_until), A.execution_id)\n\n -- Lease history event\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", core.current_attempt_index or \"\",\n \"attempt_id\", A.attempt_id,\n \"reason\", \"worker_delay\",\n \"ts\", tostring(now_ms))\n\n return ok(A.delay_until)\nend)\n\n---------------------------------------------------------------------------\n-- #31 ff_move_to_waiting_children\n--\n-- Worker blocks on child dependencies. Releases lease, transitions to\n-- runnable + blocked_by_dependencies. Same attempt continues (paused).\n--\n-- KEYS (9): exec_core, attempt_hash, lease_current, lease_history,\n-- lease_expiry_zset, worker_leases, active_index,\n-- blocked_deps_zset, attempt_timeout_zset\n-- ARGV (5): execution_id, lease_id, lease_epoch, attempt_id, source\n-- source (args[5]) \u{2014} RFC #58.5: terminal-op-style fence gate. 
Empty\n-- fence triple requires source == \"operator_override\" to proceed unfenced.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_move_to_waiting_children\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n lease_current_key = keys[3],\n lease_history_key = keys[4],\n lease_expiry_key = keys[5],\n worker_leases_key = keys[6],\n active_index_key = keys[7],\n blocked_deps_zset = keys[8],\n attempt_timeout_key = keys[9],\n }\n\n local A = {\n execution_id = args[1],\n lease_id = args[2] or \"\",\n lease_epoch = args[3] or \"\",\n attempt_id = args[4] or \"\",\n source = args[5] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} fence resolution with operator override for terminal-style ops.\n -- In both branches validate_lease_and_mark_expired runs afterwards, which\n -- enforces lifecycle_phase==\"active\", ownership!=revoked, and expiry \u{2014}\n -- operator overrides cannot bypass those preconditions.\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then\n if A.source ~= \"operator_override\" then return err(\"fence_required\") end\n A.lease_id = fence.lease_id\n A.lease_epoch = fence.lease_epoch\n A.attempt_id = fence.attempt_id\n end\n local lease_err = validate_lease_and_mark_expired(\n core, A, now_ms, K, 1000)\n if lease_err then return lease_err end\n\n -- OOM-SAFE WRITE ORDERING: exec_core FIRST (point of no return, \u{a7}4.8b Rule 2)\n -- ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"blocked_by_dependencies\",\n \"blocking_reason\", \"waiting_for_children\",\n \"blocking_detail\", \"waiting for child executions to complete\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", \"waiting_children\",\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Pause attempt: started \u{2192} suspended (waiting children, not ended)\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"suspended\",\n \"suspended_at\", tostring(now_ms),\n \"suspension_id\", \"waiting_children\")\n\n -- Release lease + update indexes\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n\n -- Lease history event\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", core.current_attempt_index or \"\",\n \"attempt_id\", A.attempt_id,\n \"reason\", \"waiting_children\",\n \"ts\", tostring(now_ms))\n\n -- Add to blocked:dependencies set\n redis.call(\"ZADD\", K.blocked_deps_zset,\n tonumber(core.created_at or \"0\"), 
A.execution_id)\n\n return ok()\nend)\n\n---------------------------------------------------------------------------\n-- #4 ff_fail_execution\n--\n-- Fail an active execution. If retries remain: set pending_retry_attempt\n-- + lineage on exec_core (deferred creation \u{2014} claim_execution creates\n-- the attempt, per R21 fix). If max retries reached: terminal failure.\n--\n-- KEYS (12): exec_core, attempt_hash, lease_expiry_zset, worker_leases,\n-- terminal_zset, delayed_zset, lease_current, lease_history,\n-- active_index, stream_meta, attempt_timeout_zset,\n-- execution_deadline_zset\n-- ARGV (8): execution_id, lease_id, lease_epoch, attempt_id,\n-- failure_reason, failure_category, retry_policy_json, source\n-- source (args[8]) \u{2014} RFC #58.5: terminal op, so empty fence triple\n-- requires source == \"operator_override\" to proceed unfenced.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_fail_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n lease_expiry_key = keys[3],\n worker_leases_key = keys[4],\n terminal_key = keys[5],\n delayed_zset = keys[6],\n lease_current_key = keys[7],\n lease_history_key = keys[8],\n active_index_key = keys[9],\n stream_meta = keys[10],\n attempt_timeout_key = keys[11],\n execution_deadline_key = keys[12],\n }\n\n local A = {\n execution_id = args[1],\n lease_id = args[2] or \"\",\n lease_epoch = args[3] or \"\",\n attempt_id = args[4] or \"\",\n failure_reason = args[5] or \"\",\n failure_category = args[6] or \"\",\n retry_policy_json = args[7] or \"\",\n source = args[8] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read + validate lease\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} fence resolution with operator override for terminal ops.\n -- In both branches validate_lease_and_mark_expired runs afterwards, which\n -- enforces lifecycle_phase==\"active\", ownership!=revoked, and expiry \u{2014}\n -- operator overrides cannot bypass those preconditions.\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then\n if A.source ~= \"operator_override\" then return err(\"fence_required\") end\n A.lease_id = fence.lease_id\n A.lease_epoch = fence.lease_epoch\n A.attempt_id = fence.attempt_id\n end\n local lease_err = validate_lease_and_mark_expired(\n core, A, now_ms, K, 1000)\n if lease_err then return lease_err end\n\n -- 2. End current attempt\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_failure\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", A.failure_reason,\n \"failure_category\", A.failure_category)\n\n -- 3. Close stream if exists\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"attempt_failure\")\n end\n\n -- 4. 
Determine retry eligibility\n local retry_count = tonumber(core.retry_count or \"0\")\n local max_retries = 0\n local backoff_ms = 1000\n local can_retry = false\n\n if is_set(A.retry_policy_json) then\n local ok_decode, policy = pcall(cjson.decode, A.retry_policy_json)\n if ok_decode and type(policy) == \"table\" then\n max_retries = tonumber(policy.max_retries or \"0\")\n if retry_count < max_retries then\n can_retry = true\n local bt = policy.backoff or {}\n if bt.type == \"exponential\" then\n local initial = (tonumber(bt.initial_delay_ms) or 1000)\n local max_d = (tonumber(bt.max_delay_ms) or 60000)\n local mult = (tonumber(bt.multiplier) or 2)\n backoff_ms = math.min(initial * (mult ^ retry_count), max_d)\n elseif bt.type == \"fixed\" then\n backoff_ms = (tonumber(bt.delay_ms) or 1000)\n end\n end\n end\n end\n\n if can_retry then\n -- RETRY PATH: deferred attempt creation (R21 fix)\n -- Do NOT create attempt record here. claim_execution creates the\n -- attempt with correct type (retry) by reading pending_retry_attempt.\n local delay_until = now_ms + backoff_ms\n\n -- ALL 7 state vector dimensions + pending lineage\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_eligible_until_time\",\n \"blocking_reason\", \"waiting_for_retry_backoff\",\n \"blocking_detail\", \"retry backoff until \" .. tostring(delay_until) ..\n \" (attempt \" .. (core.current_attempt_index or \"0\") ..\n \" failed: \" .. A.failure_reason .. \")\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"pending_retry_attempt\",\n \"public_state\", \"delayed\",\n -- Pending lineage for claim_execution to consume\n \"pending_retry_reason\", A.failure_reason,\n \"pending_previous_attempt_index\", core.current_attempt_index or \"0\",\n \"retry_count\", tostring(retry_count + 1),\n \"current_attempt_id\", \"\",\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"delay_until\", tostring(delay_until),\n \"failure_reason\", A.failure_reason,\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Release lease + update indexes\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n\n -- ZADD to delayed index\n redis.call(\"ZADD\", K.delayed_zset, delay_until, A.execution_id)\n\n -- Lease history\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", core.current_attempt_index or \"\",\n \"reason\", \"failed_retry_scheduled\",\n \"ts\", tostring(now_ms))\n\n return ok(\"retry_scheduled\", tostring(delay_until))\n else\n -- TERMINAL PATH\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"failed\",\n \"attempt_state\", \"attempt_terminal\",\n \"public_state\", \"failed\",\n \"failure_reason\", A.failure_reason,\n \"completed_at\", tostring(now_ms),\n \"current_lease_id\", \"\",\n \"current_worker_id\", 
\"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Release lease + cleanup\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n redis.call(\"ZADD\", K.terminal_key, now_ms, A.execution_id)\n\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", core.current_attempt_index or \"\",\n \"reason\", \"failed_terminal\",\n \"ts\", tostring(now_ms))\n\n -- Push-based DAG promotion (Batch C item 6). See ff_complete_execution\n -- for rationale. A terminal-failed upstream triggers\n -- child-skip cascades via ff_resolve_dependency on the receiver side.\n if is_set(core.flow_id) then\n local payload = cjson.encode({\n execution_id = A.execution_id,\n flow_id = core.flow_id,\n outcome = \"failed\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n\n return ok(\"terminal_failed\")\n end\nend)\n\n---------------------------------------------------------------------------\n-- #26 ff_reclaim_execution\n--\n-- Atomically reclaim an expired/revoked execution: interrupt old attempt,\n-- create new attempt + new lease.\n--\n-- KEYS (14): exec_core, claim_grant, old_attempt_hash, old_stream_meta,\n-- new_attempt_hash, new_attempt_usage, attempts_zset,\n-- lease_current, lease_history, lease_expiry_zset,\n-- worker_leases, active_index, attempt_timeout_zset,\n-- execution_deadline_zset\n-- ARGV (8): execution_id, worker_id, worker_instance_id, lane,\n-- lease_id, lease_ttl_ms, attempt_id, attempt_policy_json\n---------------------------------------------------------------------------\nredis.register_function(\'ff_reclaim_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n claim_grant = keys[2],\n old_attempt_hash = keys[3],\n old_stream_meta = keys[4],\n new_attempt_hash = keys[5],\n new_attempt_usage = keys[6],\n attempts_zset = keys[7],\n lease_current_key = keys[8],\n lease_history_key = keys[9],\n lease_expiry_key = keys[10],\n worker_leases_key = keys[11],\n active_index_key = keys[12],\n attempt_timeout_key = keys[13],\n execution_deadline_key = keys[14],\n }\n\n local reclaim_ttl_n = require_number(args[6], \"lease_ttl_ms\")\n if type(reclaim_ttl_n) == \"table\" then return reclaim_ttl_n end\n\n local A = {\n execution_id = args[1],\n worker_id = args[2],\n worker_instance_id = args[3],\n lane = args[4],\n lease_id = args[5],\n lease_ttl_ms = reclaim_ttl_n,\n attempt_id = args[7],\n attempt_policy_json = args[8] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. 
Validate execution state\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- Must be active + reclaimable\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_reclaimable\")\n end\n if core.ownership_state ~= \"lease_expired_reclaimable\"\n and core.ownership_state ~= \"lease_revoked\" then\n return err(\"execution_not_reclaimable\")\n end\n\n -- Check max_reclaim_count\n local reclaim_count = tonumber(core.lease_reclaim_count or \"0\")\n local max_reclaim = 100 -- default\n -- Read from policy if available\n local policy_key = string.gsub(K.core_key, \":core$\", \":policy\")\n local policy_raw = redis.call(\"GET\", policy_key)\n if policy_raw then\n local ok_p, policy = pcall(cjson.decode, policy_raw)\n if ok_p and type(policy) == \"table\" then\n max_reclaim = tonumber(policy.max_reclaim_count or \"100\")\n end\n end\n\n if reclaim_count >= max_reclaim then\n -- Terminal: max reclaims exceeded\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"failed\",\n \"attempt_state\", \"attempt_terminal\",\n \"public_state\", \"failed\",\n \"failure_reason\", \"max_reclaims_exceeded\",\n \"completed_at\", tostring(now_ms),\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n -- Dynamic worker_leases SREM: K.worker_leases_key targets the NEW (reclaiming)\n -- worker\'s set, but the entry is in the OLD (expired) worker\'s set. Use\n -- current_worker_instance_id from exec_core to SREM from the correct set.\n local wiid = core.current_worker_instance_id or \"\"\n if wiid ~= \"\" then\n local tag_wl = string.match(K.core_key, \"(%b{})\")\n redis.call(\"SREM\", \"ff:idx:\" .. tag_wl .. \":worker:\" .. wiid .. \":leases\", A.execution_id)\n end\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n\n -- ZADD terminal (construct key from hash tag + lane)\n local tag = string.match(K.core_key, \"(%b{})\")\n local lane = core.lane_id or core.current_lane or \"default\"\n local terminal_key = \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":terminal\"\n redis.call(\"ZADD\", terminal_key, now_ms, A.execution_id)\n\n return err(\"max_retries_exhausted\")\n end\n\n -- 2. Validate and consume claim grant\n local grant_raw = redis.call(\"HGETALL\", K.claim_grant)\n if #grant_raw == 0 then return err(\"invalid_claim_grant\") end\n local grant = hgetall_to_table(grant_raw)\n if grant.worker_id ~= A.worker_id then return err(\"invalid_claim_grant\") end\n redis.call(\"DEL\", K.claim_grant)\n\n -- 3. Interrupt old attempt\n local old_att_idx = core.current_attempt_index or \"0\"\n redis.call(\"HSET\", K.old_attempt_hash,\n \"attempt_state\", \"interrupted_reclaimed\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", \"lease_\" .. 
(core.ownership_state or \"expired\"))\n\n -- Close old stream if exists\n if redis.call(\"EXISTS\", K.old_stream_meta) == 1 then\n redis.call(\"HSET\", K.old_stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"reclaimed\")\n end\n\n -- 4. Create new attempt\n -- Construct actual attempt key from hash tag + computed index\n -- (same dynamic-key pattern as ff_claim_execution).\n local next_epoch = tonumber(core.current_lease_epoch or \"0\") + 1\n local next_att_idx = tonumber(core.total_attempt_count or \"0\")\n local expires_at = now_ms + A.lease_ttl_ms\n local renewal_deadline = now_ms + math.floor(A.lease_ttl_ms * 2 / 3)\n\n local tag = string.match(K.core_key, \"(%b{})\")\n local att_key = \"ff:attempt:\" .. tag .. \":\" .. A.execution_id .. \":\" .. tostring(next_att_idx)\n local att_usage_key = att_key .. \":usage\"\n\n redis.call(\"HSET\", att_key,\n \"attempt_id\", A.attempt_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", tostring(next_att_idx),\n \"attempt_type\", \"reclaim\",\n \"attempt_state\", \"started\",\n \"created_at\", tostring(now_ms),\n \"started_at\", tostring(now_ms),\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(next_epoch),\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"reclaim_reason\", \"lease_\" .. (core.ownership_state or \"expired\"),\n \"previous_attempt_index\", old_att_idx)\n redis.call(\"ZADD\", K.attempts_zset, now_ms, tostring(next_att_idx))\n\n -- Initialize usage counters\n redis.call(\"HSET\", att_usage_key, \"last_usage_report_seq\", \"0\")\n\n -- 5. Create new lease\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"HSET\", K.lease_current_key,\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(next_epoch),\n \"execution_id\", A.execution_id,\n \"attempt_id\", A.attempt_id,\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"acquired_at\", tostring(now_ms),\n \"expires_at\", tostring(expires_at),\n \"last_renewed_at\", tostring(now_ms),\n \"renewal_deadline\", tostring(renewal_deadline))\n\n -- 6. Update exec_core \u{2014} ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"active\",\n \"ownership_state\", \"leased\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"running_attempt\",\n \"public_state\", \"active\",\n \"current_attempt_index\", tostring(next_att_idx),\n \"total_attempt_count\", tostring(next_att_idx + 1),\n \"current_attempt_id\", A.attempt_id,\n \"current_lease_id\", A.lease_id,\n \"current_lease_epoch\", tostring(next_epoch),\n \"current_worker_id\", A.worker_id,\n \"current_worker_instance_id\", A.worker_instance_id,\n \"current_lane\", A.lane,\n \"lease_acquired_at\", tostring(now_ms),\n \"lease_expires_at\", tostring(expires_at),\n \"lease_last_renewed_at\", tostring(now_ms),\n \"lease_renewal_deadline\", tostring(renewal_deadline),\n \"lease_expired_at\", \"\",\n \"lease_revoked_at\", \"\",\n \"lease_revoke_reason\", \"\",\n \"lease_reclaim_count\", tostring(reclaim_count + 1),\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 7. 
Update indexes\n redis.call(\"ZADD\", K.lease_expiry_key, expires_at, A.execution_id)\n -- SREM from the OLD worker\'s leases set before SADD to the new one.\n -- K.worker_leases_key targets the NEW (reclaiming) worker, but the\n -- execution is currently in the OLD worker\'s set via the original\n -- claim\'s SADD. Without this SREM the execution stays indexed under\n -- the old worker even though exec_core now reports a new owner,\n -- breaking operator drain visibility and worker\u{2192}execution lookup.\n -- Same dynamic-key pattern as the max-reclaim branch above and as\n -- ff_expire_execution\'s cleanup path. Skipped when the same worker\n -- reclaims its own execution (idempotent no-op anyway).\n local old_wiid = core.current_worker_instance_id or \"\"\n if old_wiid ~= \"\" and old_wiid ~= A.worker_instance_id then\n local tag_wl = string.match(K.core_key, \"(%b{})\")\n redis.call(\"SREM\",\n \"ff:idx:\" .. tag_wl .. \":worker:\" .. old_wiid .. \":leases\",\n A.execution_id)\n end\n redis.call(\"SADD\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZADD\", K.active_index_key, expires_at, A.execution_id)\n\n -- 8. Lease history events\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"reclaimed\",\n \"old_lease_epoch\", core.current_lease_epoch or \"\",\n \"new_lease_id\", A.lease_id,\n \"new_lease_epoch\", tostring(next_epoch),\n \"new_attempt_id\", A.attempt_id,\n \"new_attempt_index\", tostring(next_att_idx),\n \"worker_id\", A.worker_id,\n \"ts\", tostring(now_ms))\n\n return ok(A.lease_id, tostring(next_epoch), tostring(expires_at),\n A.attempt_id, tostring(next_att_idx), \"reclaim\")\nend)\n\n---------------------------------------------------------------------------\n-- #29 ff_expire_execution\n--\n-- Timeout-based expiration. 
Handles active, runnable, and suspended phases.\n-- Called by the attempt_timeout and execution_deadline scanners.\n--\n-- KEYS (14): exec_core, attempt_hash, stream_meta, lease_current,\n-- lease_history, lease_expiry_zset, worker_leases,\n-- active_index, terminal_zset, attempt_timeout_zset,\n-- execution_deadline_zset, suspended_zset,\n-- suspension_timeout_zset, suspension_current\n-- ARGV (2): execution_id, expire_reason\n---------------------------------------------------------------------------\nredis.register_function(\'ff_expire_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_hash = keys[2],\n stream_meta = keys[3],\n lease_current_key = keys[4],\n lease_history_key = keys[5],\n lease_expiry_key = keys[6],\n worker_leases_key = keys[7],\n active_index_key = keys[8],\n terminal_key = keys[9],\n attempt_timeout_key = keys[10],\n execution_deadline_key = keys[11],\n suspended_zset = keys[12],\n suspension_timeout_key = keys[13],\n suspension_current = keys[14],\n }\n\n local A = {\n execution_id = args[1],\n expire_reason = args[2] or \"attempt_timeout\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n return ok(\"not_found_cleaned\")\n end\n local core = hgetall_to_table(raw)\n\n -- Already terminal \u{2014} no-op\n if core.lifecycle_phase == \"terminal\" then\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n return ok(\"already_terminal\")\n end\n\n -- PATH: Active\n if core.lifecycle_phase == \"active\" then\n -- End attempt\n if is_set(core.current_attempt_index) then\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_failure\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", A.expire_reason)\n end\n -- Close stream\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"expired\")\n end\n -- Release lease\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n -- Dynamic worker_leases SREM: the scanner passes empty WorkerInstanceId\n -- in KEYS[7] (it can\'t know which worker holds the lease). Use the actual\n -- worker_instance_id from exec_core to SREM from the correct set.\n local wiid = core.current_worker_instance_id or \"\"\n if wiid ~= \"\" then\n local tag = string.match(K.core_key, \"(%b{})\")\n redis.call(\"SREM\", \"ff:idx:\" .. tag .. \":worker:\" .. wiid .. \":leases\", A.execution_id)\n end\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n -- Lease history\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"event\", \"released\",\n \"lease_id\", core.current_lease_id or \"\",\n \"lease_epoch\", core.current_lease_epoch or \"\",\n \"attempt_index\", core.current_attempt_index or \"\",\n \"reason\", \"expired:\" .. 
A.expire_reason,\n \"ts\", tostring(now_ms))\n end\n\n -- PATH: Suspended\n if core.lifecycle_phase == \"suspended\" then\n -- End attempt\n if is_set(core.current_attempt_index) then\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_failure\",\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", A.expire_reason)\n end\n -- Close suspension\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n redis.call(\"HSET\", K.suspension_current,\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"expired:\" .. A.expire_reason)\n end\n -- Close stream\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"expired\")\n end\n -- ZREM from suspended indexes\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n end\n\n -- PATH: Runnable (execution_deadline fires while execution is waiting/delayed/blocked)\n -- These indexes are not in the 14 KEYS array, so construct dynamically from\n -- hash tag + lane_id (same C2 pattern as ff_cancel_execution).\n if core.lifecycle_phase == \"runnable\" then\n local tag = string.match(K.core_key, \"(%b{})\")\n local lane = core.lane_id or core.current_lane or \"default\"\n local es = core.eligibility_state or \"\"\n\n if es == \"eligible_now\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":eligible\", A.execution_id)\n elseif es == \"not_eligible_until_time\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":delayed\", A.execution_id)\n elseif es == \"blocked_by_dependencies\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":blocked:dependencies\", A.execution_id)\n elseif es == \"blocked_by_budget\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":blocked:budget\", A.execution_id)\n elseif es == \"blocked_by_quota\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":blocked:quota\", A.execution_id)\n elseif es == \"blocked_by_route\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":blocked:route\", A.execution_id)\n elseif es == \"blocked_by_operator\" then\n redis.call(\"ZREM\", \"ff:idx:\" .. tag .. \":lane:\" .. lane .. \":blocked:operator\", A.execution_id)\n else\n -- Defensive catch-all: handles blocked_by_lane_state and any future\n -- eligibility states. ZREM from ALL runnable-state indexes \u{2014} idempotent.\n local lp = \"ff:idx:\" .. tag .. \":lane:\" .. lane\n redis.call(\"ZREM\", lp .. \":eligible\", A.execution_id)\n redis.call(\"ZREM\", lp .. \":delayed\", A.execution_id)\n redis.call(\"ZREM\", lp .. \":blocked:dependencies\", A.execution_id)\n redis.call(\"ZREM\", lp .. \":blocked:budget\", A.execution_id)\n redis.call(\"ZREM\", lp .. \":blocked:quota\", A.execution_id)\n redis.call(\"ZREM\", lp .. \":blocked:route\", A.execution_id)\n redis.call(\"ZREM\", lp .. 
\":blocked:operator\", A.execution_id)\n end\n end\n\n -- ALL PATHS: terminal transition\n local att_state = \"attempt_terminal\"\n if not is_set(core.current_attempt_index) then\n att_state = core.attempt_state or \"none\"\n end\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"expired\",\n \"attempt_state\", att_state,\n \"public_state\", \"expired\",\n \"failure_reason\", A.expire_reason,\n \"completed_at\", tostring(now_ms),\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Cleanup timeout indexes + ZADD terminal\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.execution_deadline_key, A.execution_id)\n redis.call(\"ZADD\", K.terminal_key, now_ms, A.execution_id)\n\n -- Push-based DAG promotion (Batch C item 6 follow-up, issue #44).\n -- Same shape as ff_complete_execution / ff_fail_execution /\n -- ff_cancel_execution \u{2014} emit only for flow-bound executions. Covers\n -- all three phases (active, suspended, runnable) because the terminal\n -- HSET + ZADD above is shared across them. Without this, a never-\n -- claimed flow-bound execution that hits execution_deadline relies on\n -- the dependency_reconciler safety net (15s default) to unblock\n -- children, spiking DAG latency on the timeout path.\n if is_set(core.flow_id) then\n local payload = cjson.encode({\n execution_id = A.execution_id,\n flow_id = core.flow_id,\n outcome = \"expired\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n\n return ok(\"expired\", core.lifecycle_phase)\nend)\n\n---------------------------------------------------------------------------\n-- ff_set_execution_tags (issue #58.4)\n--\n-- Write caller-supplied tag fields to the execution\'s separate tags key\n-- (`ff:exec:{fp:N}:<eid>:tags`). Tag keys MUST match the reserved\n-- namespace pattern `^[a-z][a-z0-9_]*%.` \u{2014} i.e. `<caller>.<field>` \u{2014} so\n-- callers get `$caller.*` for metadata and FF reserves dot-free\n-- snake_case for its own fields. Keys failing validation fail-closed\n-- with `invalid_tag_key` + the offending key; no writes happen.\n--\n-- Atomically HSETs all validated pairs in one call, then bumps\n-- `last_mutation_at` on `exec_core` so observers see the mutation.\n--\n-- KEYS (2): exec_core, tags_key\n-- ARGV (>=2, even): k1, v1, k2, v2, ...\n---------------------------------------------------------------------------\nredis.register_function(\'ff_set_execution_tags\', function(keys, args)\n local K = {\n core_key = keys[1],\n tags_key = keys[2],\n }\n\n local n = #args\n if n == 0 or (n % 2) ~= 0 then\n return err(\"invalid_input\", \"tags must be non-empty even-length key/value pairs\")\n end\n\n if redis.call(\"EXISTS\", K.core_key) == 0 then\n return err(\"execution_not_found\")\n end\n\n -- Require `<caller>.<field>` with at least one non-dot char after the\n -- first dot, so `cairn.` and `cairn..x` are rejected. 
The suffix may\n -- contain further dots (`app.sub.field` is legal).\n for i = 1, n, 2 do\n local k = args[i]\n if type(k) ~= \"string\" or not string.find(k, \"^[a-z][a-z0-9_]*%.[^.]\") then\n return err(\"invalid_tag_key\", tostring(k))\n end\n end\n\n redis.call(\"HSET\", K.tags_key, unpack(args))\n\n local now_ms = server_time_ms()\n redis.call(\"HSET\", K.core_key, \"last_mutation_at\", tostring(now_ms))\n\n return ok(tostring(n / 2))\nend)\n\n\n-- source: lua/scheduling.lua\n-- FlowFabric scheduling functions\n-- Reference: RFC-009 (Scheduling), RFC-010 \u{a7}4 (function inventory)\n--\n-- Depends on helpers: ok, err, hgetall_to_table, is_set, validate_lease\n\n---------------------------------------------------------------------------\n-- Capability matching helpers (local to scheduling.lua)\n-- Bounds CAPS_MAX_BYTES / CAPS_MAX_TOKENS live in helpers.lua and are\n-- enforced symmetrically on worker caps here and on required caps in\n-- ff_create_execution so neither side can smuggle in an oversized list.\n---------------------------------------------------------------------------\n\n-- Parse a capability CSV into a {token=true} set. Empty/nil \u{2192} empty set.\n-- Returns (set, nil) on success or (nil, err_tuple) on bound violation.\n--\n-- Empty tokens (from stray separators like \"a,,b\") are skipped BEFORE the\n-- count check so a legitimate list punctuated by noise isn\'t rejected.\n-- Real oversize input still fails because #csv > CAPS_MAX_BYTES catches it\n-- before this loop runs.\nlocal function parse_capability_csv(csv, kind)\n if csv == nil or csv == \"\" then\n return {}, nil\n end\n if #csv > CAPS_MAX_BYTES then\n return nil, err(\"invalid_capabilities\", kind .. \":too_many_bytes\")\n end\n local set = {}\n local n = 0\n for token in string.gmatch(csv, \"([^,]+)\") do\n if #token > 0 then\n n = n + 1\n if n > CAPS_MAX_TOKENS then\n return nil, err(\"invalid_capabilities\", kind .. \":too_many_tokens\")\n end\n set[token] = true\n end\n end\n return set, nil\nend\n\n-- Return sorted CSV of tokens present in `required` but missing from\n-- `worker_caps`. Empty result means worker satisfies all requirements.\nlocal function missing_capabilities(required, worker_caps)\n local missing = {}\n for cap, _ in pairs(required) do\n if not worker_caps[cap] then\n missing[#missing + 1] = cap\n end\n end\n table.sort(missing)\n return table.concat(missing, \",\")\nend\n\n---------------------------------------------------------------------------\n-- #25 ff_issue_claim_grant\n--\n-- Scheduler issues a claim grant for an eligible execution.\n-- Validates execution is eligible, writes grant hash with TTL,\n-- removes from eligible set.\n--\n-- KEYS (3): exec_core, claim_grant_key, eligible_zset\n-- ARGV (9): execution_id, worker_id, worker_instance_id,\n-- lane_id, capability_hash, grant_ttl_ms,\n-- route_snapshot_json, admission_summary,\n-- worker_capabilities_csv -- sorted CSV of worker caps (option a)\n--\n-- Capability matching (RFC-009):\n-- If exec_core.required_capabilities (sorted CSV on exec_core) is empty,\n-- any worker matches (backwards compat). Otherwise the worker\'s sorted\n-- CSV must be a superset.\n-- On mismatch: Lua stamps `last_capability_mismatch_at` (single scalar\n-- field, idempotent write \u{2014} no unbounded counter) and returns\n-- err(\"capability_mismatch\", missing_csv). 
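Illustration with hypothetical capability names: required_capabilities \"gpu,large_mem\" checked against a worker advertising only \"gpu\" fails with missing_csv \"large_mem\". 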
The scheduler side MUST\n-- then block the execution off the eligible ZSET (see\n-- ff_block_execution_for_admission with reason `waiting_for_capability`),\n-- otherwise ZRANGEBYSCORE keeps returning the same top-of-zset every\n-- tick and 100 workers \u{d7} 1 tick/s = hot-loop starvation. RFC-009 \u{a7}564.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_issue_claim_grant\', function(keys, args)\n local K = {\n core_key = keys[1],\n claim_grant = keys[2],\n eligible_zset = keys[3],\n }\n\n local A = {\n execution_id = args[1],\n worker_id = args[2],\n worker_instance_id = args[3],\n lane_id = args[4],\n capability_hash = args[5] or \"\",\n route_snapshot_json = args[7] or \"\",\n admission_summary = args[8] or \"\",\n worker_capabilities_csv = args[9] or \"\",\n }\n\n local grant_ttl_n = require_number(args[6], \"grant_ttl_ms\")\n if type(grant_ttl_n) == \"table\" then return grant_ttl_n end\n A.grant_ttl_ms = grant_ttl_n\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Validate execution exists and is eligible\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"runnable\" then\n return err(\"execution_not_eligible\")\n end\n if core.eligibility_state ~= \"eligible_now\" then\n return err(\"execution_not_eligible\")\n end\n\n -- 2. Check no existing grant (prevent double-grant)\n if redis.call(\"EXISTS\", K.claim_grant) == 1 then\n return err(\"grant_already_exists\")\n end\n\n -- 3. Verify execution is in eligible set (TOCTOU guard)\n local score = redis.call(\"ZSCORE\", K.eligible_zset, A.execution_id)\n if not score then\n return err(\"execution_not_in_eligible_set\")\n end\n\n -- 4. Capability matching. On miss we stamp a SINGLE bounded field \u{2014}\n -- `last_capability_mismatch_at` \u{2014} so operators can SCAN for stuck\n -- executions via `HGET last_capability_mismatch_at < now - 1h` without\n -- needing a counter. An earlier version HINCRBY\'d a counter; that was\n -- dropped because combined with the hot-loop bug (executions staying in\n -- the eligible ZSET after mismatch) the counter grew unboundedly (2.4M\n -- increments/day on one stuck exec_core under 100 workers). An HSET of\n -- a fixed field is idempotent w.r.t. size.\n --\n -- The scheduler MUST block the execution off the eligible ZSET after\n -- this err returns; otherwise the next tick picks the same top-of-zset\n -- and we wasted this validation. See Scheduler::claim_for_worker.\n local required_set, req_err = parse_capability_csv(\n core.required_capabilities or \"\", \"required\")\n if req_err then return req_err end\n local worker_set, wrk_err = parse_capability_csv(\n A.worker_capabilities_csv, \"worker\")\n if wrk_err then return wrk_err end\n if next(required_set) ~= nil then\n local missing = missing_capabilities(required_set, worker_set)\n if missing ~= \"\" then\n redis.call(\"HSET\", K.core_key,\n \"last_capability_mismatch_at\", tostring(now_ms))\n return err(\"capability_mismatch\", missing)\n end\n end\n\n -- 5. 
Write grant hash with TTL\n local grant_expires_at = now_ms + A.grant_ttl_ms\n redis.call(\"HSET\", K.claim_grant,\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"lane_id\", A.lane_id,\n \"capability_hash\", A.capability_hash,\n \"route_snapshot_json\", A.route_snapshot_json,\n \"admission_summary\", A.admission_summary,\n \"created_at\", tostring(now_ms),\n \"grant_expires_at\", tostring(grant_expires_at))\n redis.call(\"PEXPIREAT\", K.claim_grant, grant_expires_at)\n\n -- 6. Do NOT ZREM from eligible here. ff_claim_execution does the ZREM\n -- when consuming the grant. If the grant expires unconsumed, the execution\n -- remains in the eligible set and is re-discovered by the next scheduler\n -- cycle. This prevents the \"orphaned grant\" stuck state where an execution\n -- is in no scheduling index after grant expiry.\n\n return ok(A.execution_id)\nend)\n\n---------------------------------------------------------------------------\n-- #32 ff_change_priority\n--\n-- Update priority and re-score in eligible ZSET.\n-- Only works for runnable + eligible_now executions.\n--\n-- KEYS (2): exec_core, eligible_zset\n-- ARGV (2): execution_id, new_priority\n---------------------------------------------------------------------------\nredis.register_function(\'ff_change_priority\', function(keys, args)\n local K = {\n core_key = keys[1],\n eligible_zset = keys[2],\n }\n\n local new_priority_n = require_number(args[2], \"new_priority\")\n if type(new_priority_n) == \"table\" then return new_priority_n end\n\n local A = {\n execution_id = args[1],\n new_priority = new_priority_n,\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read and validate\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"runnable\" then\n return err(\"execution_not_eligible\")\n end\n if core.eligibility_state ~= \"eligible_now\" then\n return err(\"execution_not_eligible\")\n end\n\n local old_priority = tonumber(core.priority or \"0\")\n\n -- Clamp to safe range (same as ff_create_execution)\n if A.new_priority < 0 then A.new_priority = 0 end\n if A.new_priority > 9000 then A.new_priority = 9000 end\n\n -- 2. Update exec_core priority\n redis.call(\"HSET\", K.core_key,\n \"priority\", tostring(A.new_priority),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 3. Re-score in eligible ZSET\n -- Composite score: -(priority * 1_000_000_000_000) + created_at_ms\n local created_at = tonumber(core.created_at or \"0\")\n local new_score = 0 - (A.new_priority * 1000000000000) + created_at\n redis.call(\"ZADD\", K.eligible_zset, new_score, A.execution_id)\n\n return ok(tostring(old_priority), tostring(A.new_priority))\nend)\n\n---------------------------------------------------------------------------\n-- #33 ff_update_progress\n--\n-- Update progress fields on exec_core. 
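Writes progress_pct / progress_message plus progress_updated_at and last_mutation_at only; the 7-dimension state vector is untouched. 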
Validate lease (lite check:\n-- lease_id + epoch only \u{2014} attempt_id not required per \u{a7}4 Class B).\n--\n-- KEYS (1): exec_core\n-- ARGV (5): execution_id, lease_id, lease_epoch,\n-- progress_pct, progress_message\n---------------------------------------------------------------------------\nredis.register_function(\'ff_update_progress\', function(keys, args)\n local K = {\n core_key = keys[1],\n }\n\n local A = {\n execution_id = args[1],\n lease_id = args[2],\n lease_epoch = args[3],\n progress_pct = args[4] or \"\",\n progress_message = args[5] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Read and validate\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n if core.ownership_state == \"lease_revoked\" then\n return err(\"lease_revoked\")\n end\n if tonumber(core.lease_expires_at or \"0\") <= now_ms then\n return err(\"lease_expired\")\n end\n if core.current_lease_id ~= A.lease_id then\n return err(\"stale_lease\")\n end\n if core.current_lease_epoch ~= A.lease_epoch then\n return err(\"stale_lease\")\n end\n\n -- Update progress fields\n local fields = { \"last_mutation_at\", tostring(now_ms), \"progress_updated_at\", tostring(now_ms) }\n if is_set(A.progress_pct) then\n fields[#fields + 1] = \"progress_pct\"\n fields[#fields + 1] = A.progress_pct\n end\n if is_set(A.progress_message) then\n fields[#fields + 1] = \"progress_message\"\n fields[#fields + 1] = A.progress_message\n end\n redis.call(\"HSET\", K.core_key, unpack(fields))\n\n return ok()\nend)\n\n---------------------------------------------------------------------------\n-- #27 ff_promote_delayed\n--\n-- Promote a delayed execution to eligible when its delay_until has passed.\n-- Called by the delayed promoter scanner.\n-- Preserves attempt_state (may be pending_retry, pending_first, or\n-- attempt_interrupted from delay_execution).\n--\n-- KEYS (3): exec_core, delayed_zset, eligible_zset\n-- ARGV (2): execution_id, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_promote_delayed\', function(keys, args)\n local K = {\n core_key = keys[1],\n delayed_zset = keys[2],\n eligible_zset = keys[3],\n }\n\n local now_ms_n = require_number(args[2], \"now_ms\")\n if type(now_ms_n) == \"table\" then return now_ms_n end\n\n local A = {\n execution_id = args[1],\n now_ms = now_ms_n,\n }\n\n -- Read and validate\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n -- Execution gone \u{2014} clean up stale index entry\n redis.call(\"ZREM\", K.delayed_zset, A.execution_id)\n return ok(\"not_found_cleaned\")\n end\n local core = hgetall_to_table(raw)\n\n -- Must be runnable + not_eligible_until_time\n if core.lifecycle_phase ~= \"runnable\" then\n redis.call(\"ZREM\", K.delayed_zset, A.execution_id)\n return ok(\"not_runnable_cleaned\")\n end\n if core.eligibility_state ~= \"not_eligible_until_time\" then\n redis.call(\"ZREM\", K.delayed_zset, A.execution_id)\n return ok(\"not_delayed_cleaned\")\n end\n\n -- Check delay_until has actually passed\n local delay_until = tonumber(core.delay_until or \"0\")\n if delay_until > A.now_ms then\n return 
ok(\"not_yet_due\")\n end\n\n -- Promote: update 6 of 7 state vector dimensions.\n -- attempt_state is DELIBERATELY PRESERVED (not written). This is the 7th dim.\n --\n -- WHY: The caller that put this execution into the delayed set already set\n -- the attempt_state to reflect what should happen on next claim:\n -- * pending_retry_attempt \u{2014} from ff_fail_execution (retry backoff expired)\n -- * pending_replay_attempt \u{2014} from ff_replay_execution (replay delay expired)\n -- * attempt_interrupted \u{2014} from ff_delay_execution (worker self-delay)\n -- * pending_first_attempt \u{2014} from ff_create_execution (initial delay_until)\n -- Overwriting it here would lose this routing information and break\n -- claim_execution\'s attempt_type derivation (initial vs retry vs replay)\n -- and the claim dispatch routing (claim_execution vs claim_resumed_execution).\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n -- attempt_state: NOT WRITTEN \u{2014} see comment above\n \"public_state\", \"waiting\",\n \"delay_until\", \"\",\n \"last_transition_at\", tostring(A.now_ms),\n \"last_mutation_at\", tostring(A.now_ms))\n\n -- ZREM from delayed, ZADD to eligible with composite priority score\n redis.call(\"ZREM\", K.delayed_zset, A.execution_id)\n local priority = tonumber(core.priority or \"0\")\n local created_at = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n\n return ok(\"promoted\")\nend)\n\n---------------------------------------------------------------------------\n-- #26 ff_issue_reclaim_grant\n--\n-- TODO(batch-c): This function has NO production Rust caller as of Batch B.\n-- The reclaim scanner that would invoke it (to recover leases from crashed\n-- workers) is scheduled for cairn Batch C. A worker dying mid-execution today\n-- leaves its execution stuck in `lease_expired_reclaimable` until operator\n-- intervention. Test-only callers exist in crates/ff-test/tests to exercise\n-- the Lua side. When the scheduler reclaim integration lands, the caller\n-- must apply the same block-on-capability-mismatch pattern used by\n-- `ff-scheduler::Scheduler::claim_for_worker` (see the IMPORTANT note\n-- below) \u{2014} otherwise an unmatchable reclaim recycles every scanner tick.\n--\n-- Scheduler issues a reclaim grant for an expired/revoked execution.\n-- Similar to ff_issue_claim_grant but validates reclaimable state.\n--\n-- KEYS (3): exec_core, claim_grant_key, lease_expiry_zset\n-- ARGV (9): execution_id, worker_id, worker_instance_id,\n-- lane_id, capability_hash, grant_ttl_ms,\n-- route_snapshot_json, admission_summary,\n-- worker_capabilities_csv\n--\n-- Capability matching identical to ff_issue_claim_grant: reclaiming a lease\n-- must respect the execution\'s required_capabilities just like an initial\n-- claim, so a re-issuance to a non-matching worker is blocked here too.\n--\n-- IMPORTANT: on capability_mismatch this function does NOT remove the exec\n-- from the lease_expiry pool. The reclaim SCANNER (to be added in Rust) MUST\n-- detect capability_mismatch and move the execution into blocked_route with\n-- reason `waiting_for_capable_worker` (mirroring the claim-grant path). 
If\n-- the scanner instead re-attempts the same execution every tick, a reclaim\n-- hot-loop develops that is analogous to the claim-path hot-loop and\n-- identical in cost (wasted FCALLs + log volume). Lease_expiry as an index\n-- has no natural sweeping mechanism for post-mismatch promotion \u{2014} the\n-- scheduler-side block + periodic sweep owns the lifecycle.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_issue_reclaim_grant\', function(keys, args)\n local K = {\n core_key = keys[1],\n claim_grant = keys[2],\n lease_expiry = keys[3],\n }\n\n local A = {\n execution_id = args[1],\n worker_id = args[2],\n worker_instance_id = args[3],\n lane_id = args[4],\n capability_hash = args[5] or \"\",\n route_snapshot_json = args[7] or \"\",\n admission_summary = args[8] or \"\",\n worker_capabilities_csv = args[9] or \"\",\n }\n\n local grant_ttl_n = require_number(args[6], \"grant_ttl_ms\")\n if type(grant_ttl_n) == \"table\" then return grant_ttl_n end\n A.grant_ttl_ms = grant_ttl_n\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- Validate execution exists and is reclaimable\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_reclaimable\")\n end\n if core.ownership_state ~= \"lease_expired_reclaimable\"\n and core.ownership_state ~= \"lease_revoked\" then\n return err(\"execution_not_reclaimable\")\n end\n\n -- Check no existing grant\n if redis.call(\"EXISTS\", K.claim_grant) == 1 then\n return err(\"grant_already_exists\")\n end\n\n -- Capability matching \u{2014} same policy as issue_claim_grant: stamp\n -- last_capability_mismatch_at (single scalar) on miss so ops can surface\n -- stuck reclaims via SCAN. Scheduler MUST also block-out the exec from\n -- the lease_expiry reclaim pool; otherwise the reclaim scanner hits the\n -- same mismatch every cycle. 
See Scheduler::reclaim_for_worker.\n local required_set, req_err = parse_capability_csv(\n core.required_capabilities or \"\", \"required\")\n if req_err then return req_err end\n local worker_set, wrk_err = parse_capability_csv(\n A.worker_capabilities_csv, \"worker\")\n if wrk_err then return wrk_err end\n if next(required_set) ~= nil then\n local missing = missing_capabilities(required_set, worker_set)\n if missing ~= \"\" then\n redis.call(\"HSET\", K.core_key,\n \"last_capability_mismatch_at\", tostring(now_ms))\n return err(\"capability_mismatch\", missing)\n end\n end\n\n -- Write grant hash with TTL\n local grant_expires_at = now_ms + A.grant_ttl_ms\n redis.call(\"HSET\", K.claim_grant,\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"lane_id\", A.lane_id,\n \"capability_hash\", A.capability_hash,\n \"route_snapshot_json\", A.route_snapshot_json,\n \"admission_summary\", A.admission_summary,\n \"created_at\", tostring(now_ms),\n \"grant_expires_at\", tostring(grant_expires_at))\n redis.call(\"PEXPIREAT\", K.claim_grant, grant_expires_at)\n\n -- Do NOT ZREM from lease_expiry \u{2014} stays for scheduler discovery\n\n return ok(A.execution_id)\nend)\n\n\n-- source: lua/suspension.lua\n-- FlowFabric suspension and waitpoint functions\n-- Reference: RFC-004 (Suspension), RFC-005 (Signal), RFC-010 \u{a7}4\n--\n-- Depends on helpers: ok, err, ok_already_satisfied, hgetall_to_table,\n-- is_set, validate_lease_and_mark_expired, clear_lease_and_indexes,\n-- map_reason_to_blocking, initialize_condition, write_condition_hash,\n-- evaluate_signal_against_condition, is_condition_satisfied,\n-- extract_field, initial_signal_summary_json, validate_pending_waitpoint,\n-- assert_active_suspension, assert_waitpoint_belongs\n\n---------------------------------------------------------------------------\n-- #13 ff_suspend_execution\n--\n-- Validate lease, release ownership, create suspension + waitpoint(s)\n-- (or activate pending), init condition, transition active \u{2192} suspended.\n-- Mints the waitpoint HMAC token(s) (RFC-004 \u{a7}Waitpoint Security)\n-- returned alongside the waitpoint_id(s) for external signal delivery.\n--\n-- KEYS (18 + 3*N_extra): exec_core, attempt_record, lease_current,\n-- lease_history, lease_expiry_zset, worker_leases,\n-- suspension_current, waitpoint_hash, waitpoint_signals,\n-- suspension_timeout_zset, pending_wp_expiry_zset,\n-- active_index, suspended_zset, waitpoint_history,\n-- wp_condition, attempt_timeout_zset, hmac_secrets,\n-- dedup_hash; then for each RFC-014 Pattern 3 additional\n-- waitpoint (wp_hash_extra, wp_signals_extra,\n-- wp_condition_extra).\n-- ARGV (19 + 1 + 2*N_extra): execution_id, attempt_index, attempt_id,\n-- lease_id, lease_epoch, suspension_id, waitpoint_id,\n-- waitpoint_key, reason_code, requested_by, timeout_at,\n-- resume_condition_json, resume_policy_json,\n-- continuation_metadata_pointer, use_pending_waitpoint,\n-- timeout_behavior, lease_history_maxlen, idempotency_key,\n-- dedup_ttl_ms, num_extra_waitpoints, then for each extra\n-- (waitpoint_id, waitpoint_key).\n---------------------------------------------------------------------------\nredis.register_function(\'ff_suspend_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n attempt_record = keys[2],\n lease_current_key = keys[3],\n lease_history_key = keys[4],\n lease_expiry_key = keys[5],\n worker_leases_key = keys[6],\n suspension_current = keys[7],\n waitpoint_hash = keys[8],\n waitpoint_signals = keys[9],\n 
suspension_timeout_key = keys[10],\n pending_wp_expiry_key = keys[11],\n active_index_key = keys[12],\n suspended_zset = keys[13],\n waitpoint_history = keys[14],\n wp_condition = keys[15],\n attempt_timeout_key = keys[16],\n hmac_secrets = keys[17],\n dedup_hash = keys[18],\n }\n\n local A = {\n execution_id = args[1],\n attempt_index = args[2],\n attempt_id = args[3] or \"\",\n lease_id = args[4] or \"\",\n lease_epoch = args[5] or \"\",\n suspension_id = args[6],\n waitpoint_id = args[7],\n waitpoint_key = args[8],\n reason_code = args[9],\n requested_by = args[10],\n timeout_at = args[11] or \"\",\n resume_condition_json = args[12],\n resume_policy_json = args[13],\n continuation_metadata_ptr = args[14] or \"\",\n use_pending_waitpoint = args[15] or \"\",\n timeout_behavior = args[16] or \"fail\",\n lease_history_maxlen = tonumber(args[17] or \"1000\"),\n idempotency_key = args[18] or \"\",\n dedup_ttl_ms = tonumber(args[19] or \"0\"),\n num_extra_waitpoints = tonumber(args[20] or \"0\"),\n }\n\n -- RFC-014 Pattern 3: additional-waitpoint bindings. Parsed up front\n -- so dedup hashing covers the full set (RFC-013 idempotency_key +\n -- RFC-014 multi-waitpoint widening).\n local extras = {}\n for i = 1, A.num_extra_waitpoints do\n local base_arg = 20 + (i - 1) * 2\n local ex_id = args[base_arg + 1]\n local ex_key = args[base_arg + 2]\n if not ex_id or ex_id == \"\" or not ex_key or ex_key == \"\" then\n return err(\"additional_waitpoint_binding_malformed\")\n end\n local base_key = 18 + (i - 1) * 3\n extras[#extras + 1] = {\n waitpoint_id = ex_id,\n waitpoint_key = ex_key,\n wp_hash_key = keys[base_key + 1],\n wp_signals_key = keys[base_key + 2],\n wp_condition_key = keys[base_key + 3],\n }\n end\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- RFC-013 \u{a7}9.2 \u{2014} idempotency_key dedup branch. When the caller set an\n -- idempotency_key, check the dedup hash first: a hit short-circuits with\n -- the previously-serialized outcome verbatim, performing no state\n -- mutation. On miss, fall through to the canonical path and write the\n -- outcome into the dedup hash with TTL after commit.\n --\n -- RFC-014 widens the format to carry additional waitpoint bindings:\n -- \"<status>\\t<suspension_id>\\t<waitpoint_id>\\t<waitpoint_key>\\t\n -- <waitpoint_token>\\t<N_extra>[\\t<ex_id>\\t<ex_key>\\t<ex_tok>]*\".\n -- Pattern-1/2 payloads with no extras have N_extra = 0.\n local _dedup_active = (A.idempotency_key ~= \"\" and K.dedup_hash and K.dedup_hash ~= \"\")\n if _dedup_active then\n local stored = redis.call(\"HGET\", K.dedup_hash, \"outcome\")\n if stored == false then stored = nil end\n if stored then\n -- Split on single \"\\t\". `string.gmatch(\"[^\\t]+\")` would drop\n -- empty fields (legal for e.g. 
an empty extras block).\n local parts = {}\n local pos = 1\n while pos <= #stored + 1 do\n local nxt = string.find(stored, \"\\t\", pos, true)\n if nxt then\n parts[#parts + 1] = string.sub(stored, pos, nxt - 1)\n pos = nxt + 1\n else\n parts[#parts + 1] = string.sub(stored, pos)\n break\n end\n end\n if #parts >= 5 then\n local status = parts[1]\n local susp_id = parts[2]\n local wp_id_out = parts[3]\n local wp_key_out = parts[4]\n local wp_tok_out = parts[5]\n local n_extra = tonumber(parts[6] or \"0\") or 0\n local extras_out = {}\n for i = 1, n_extra do\n local o = 6 + (i - 1) * 3\n extras_out[#extras_out + 1] = {\n waitpoint_id = parts[o + 1] or \"\",\n waitpoint_key = parts[o + 2] or \"\",\n waitpoint_token = parts[o + 3] or \"\",\n }\n end\n if status == \"ALREADY_SATISFIED\" then\n return ok_already_satisfied_extras(\n susp_id, wp_id_out, wp_key_out, wp_tok_out, extras_out)\n else\n return ok_extras(\n susp_id, wp_id_out, wp_key_out, wp_tok_out, extras_out)\n end\n end\n -- Malformed entry: fall through and treat as miss.\n end\n end\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- RFC #58.5 \u{2014} suspend_execution is lease-bound. Hard-reject empty or\n -- partial fence triples; no operator-override escape hatch.\n local fence, must_check_or_err = resolve_lease_fence(core, A)\n if not fence then return must_check_or_err end\n if not must_check_or_err then return err(\"fence_required\") end\n\n -- 2. Validate lease (full check incl. expiry + revocation + identity)\n local lease_err = validate_lease_and_mark_expired(\n core, A, now_ms, K, A.lease_history_maxlen)\n if lease_err then return lease_err end\n\n -- 3. Validate attempt binding\n if tostring(core.current_attempt_index) ~= A.attempt_index then\n return err(\"invalid_lease_for_suspend\")\n end\n\n -- 4. Check for existing suspension: reject if open, archive if closed\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n local closed = redis.call(\"HGET\", K.suspension_current, \"closed_at\")\n if not is_set(closed) then\n return err(\"already_suspended\")\n end\n -- Previous suspension is closed. Archive old waitpoint_id for cleanup,\n -- then DEL the stale record before creating a new one.\n local old_wp = redis.call(\"HGET\", K.suspension_current, \"waitpoint_id\")\n if is_set(old_wp) then\n redis.call(\"SADD\", K.waitpoint_history, old_wp)\n end\n redis.call(\"DEL\", K.suspension_current)\n end\n\n -- 5. Create or activate waitpoint\n local waitpoint_id = A.waitpoint_id\n local waitpoint_key = A.waitpoint_key\n local waitpoint_token = \"\"\n\n if A.use_pending_waitpoint == \"1\" then\n -- Activate existing pending waitpoint\n local wp_raw = redis.call(\"HGETALL\", K.waitpoint_hash)\n local wp_err = validate_pending_waitpoint(wp_raw, A.execution_id, A.attempt_index, now_ms)\n if wp_err then return wp_err end\n\n -- Read waitpoint_id, waitpoint_key, and existing token from pending record.\n -- Token was minted at ff_create_pending_waitpoint time with that record\'s\n -- created_at; we MUST keep using it so signals buffered before activation\n -- validate against the same binding.\n local wp = hgetall_to_table(wp_raw)\n waitpoint_id = wp.waitpoint_id\n waitpoint_key = wp.waitpoint_key\n -- A pending waitpoint without a minted token is either a pre-HMAC-upgrade\n -- record or a corrupted write. 
Activating with an empty token would return\n -- \"\" to the SDK, and every subsequent signal delivery would reject with\n -- missing_token \u{2014} fail-closed at the security boundary but silent about\n -- the real degraded state. Surface the degradation AT the activation\n -- point so operators see it immediately.\n if not is_set(wp.waitpoint_token) then\n return err(\"waitpoint_not_token_bound\")\n end\n waitpoint_token = wp.waitpoint_token\n\n -- Activate the pending waitpoint\n redis.call(\"HSET\", K.waitpoint_hash,\n \"suspension_id\", A.suspension_id,\n \"state\", \"active\",\n \"activated_at\", tostring(now_ms),\n \"expires_at\", is_set(A.timeout_at) and A.timeout_at or \"\")\n redis.call(\"ZREM\", K.pending_wp_expiry_key, waitpoint_id)\n\n -- CRITICAL: Evaluate buffered signals that arrived while waitpoint was pending.\n -- If early signals already satisfy the resume condition, skip suspension entirely.\n local buffered = redis.call(\"XRANGE\", K.waitpoint_signals, \"-\", \"+\")\n if #buffered > 0 then\n local wp_cond = initialize_condition(A.resume_condition_json)\n for _, entry in ipairs(buffered) do\n local fields = entry[2]\n local sig_name = extract_field(fields, \"signal_name\")\n local sig_id = extract_field(fields, \"signal_id\")\n evaluate_signal_against_condition(wp_cond, sig_name, sig_id)\n end\n if is_condition_satisfied(wp_cond) then\n -- Resume condition already met by buffered signals \u{2014} skip suspension.\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\", \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms), \"close_reason\", \"resumed\")\n write_condition_hash(K.wp_condition, wp_cond, now_ms)\n -- Do NOT release lease, do NOT change execution state.\n -- RFC-013 \u{a7}9.2 \u{2014} write dedup outcome. UsePending+extras is\n -- rejected upstream, so N_extra is always 0 here.\n if _dedup_active then\n local payload = \"ALREADY_SATISFIED\\t\" .. A.suspension_id .. \"\\t\" .. waitpoint_id ..\n \"\\t\" .. waitpoint_key .. \"\\t\" .. waitpoint_token .. \"\\t0\"\n redis.call(\"HSET\", K.dedup_hash, \"outcome\", payload)\n if A.dedup_ttl_ms > 0 then\n redis.call(\"PEXPIRE\", K.dedup_hash, A.dedup_ttl_ms)\n end\n end\n return ok_already_satisfied_extras(A.suspension_id, waitpoint_id, waitpoint_key, waitpoint_token, {})\n end\n -- Condition not yet satisfied \u{2014} proceed with suspension.\n -- Write partial condition state (some matchers may be satisfied).\n write_condition_hash(K.wp_condition, wp_cond, now_ms)\n else\n -- No buffered signals \u{2014} init condition from scratch\n local wp_cond = initialize_condition(A.resume_condition_json)\n write_condition_hash(K.wp_condition, wp_cond, now_ms)\n end\n else\n -- Create new waitpoint \u{2014} mint HMAC token bound to (waitpoint_id,\n -- waitpoint_key, created_at). 
The created_at written here is what the\n -- signal-delivery path reads back for HMAC validation.\n local token, token_err = mint_waitpoint_token(\n K.hmac_secrets, waitpoint_id, waitpoint_key, now_ms)\n if not token then return err(token_err) end\n waitpoint_token = token\n\n redis.call(\"HSET\", K.waitpoint_hash,\n \"waitpoint_id\", waitpoint_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", A.attempt_index,\n \"suspension_id\", A.suspension_id,\n \"waitpoint_key\", waitpoint_key,\n \"waitpoint_token\", waitpoint_token,\n \"state\", \"active\",\n \"created_at\", tostring(now_ms),\n \"activated_at\", tostring(now_ms),\n \"expires_at\", is_set(A.timeout_at) and A.timeout_at or \"\",\n \"signal_count\", \"0\",\n \"matched_signal_count\", \"0\",\n \"last_signal_at\", \"\")\n\n -- Initialize condition hash from resume condition spec\n local wp_cond = initialize_condition(A.resume_condition_json)\n write_condition_hash(K.wp_condition, wp_cond, now_ms)\n end\n\n -- 5b. RFC-014 Pattern 3: mint extras\' HMAC tokens + waitpoint hashes\n -- + wp_condition hashes. Each additional binding gets its own\n -- storage so the composite evaluator can SADD per-waitpoint\n -- satisfier tokens and the HMAC enforcement works uniformly for\n -- external signal delivery to ANY of the N waitpoints.\n -- UsePending with extras is rejected by the Rust validator.\n local extras_out = {}\n if A.use_pending_waitpoint ~= \"1\" then\n for _, ex in ipairs(extras) do\n local ex_token, ex_err = mint_waitpoint_token(\n K.hmac_secrets, ex.waitpoint_id, ex.waitpoint_key, now_ms)\n if not ex_token then return err(ex_err) end\n\n redis.call(\"HSET\", ex.wp_hash_key,\n \"waitpoint_id\", ex.waitpoint_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", A.attempt_index,\n \"suspension_id\", A.suspension_id,\n \"waitpoint_key\", ex.waitpoint_key,\n \"waitpoint_token\", ex_token,\n \"state\", \"active\",\n \"created_at\", tostring(now_ms),\n \"activated_at\", tostring(now_ms),\n \"expires_at\", is_set(A.timeout_at) and A.timeout_at or \"\",\n \"signal_count\", \"0\",\n \"matched_signal_count\", \"0\",\n \"last_signal_at\", \"\")\n\n -- Mirror the primary wp_condition hash shape so ff_deliver_signal\'s\n -- `wp_cond.composite == \"1\"` branch routes correctly when a signal\n -- lands on this extra waitpoint.\n local ex_wp_cond = initialize_condition(A.resume_condition_json)\n write_condition_hash(ex.wp_condition_key, ex_wp_cond, now_ms)\n\n redis.call(\"SADD\", K.waitpoint_history, ex.waitpoint_id)\n\n extras_out[#extras_out + 1] = {\n waitpoint_id = ex.waitpoint_id,\n waitpoint_key = ex.waitpoint_key,\n waitpoint_token = ex_token,\n }\n end\n elseif #extras > 0 then\n -- Defensive: Rust validator already rejects UsePending+extras, but\n -- fail loudly if a client bypasses Rust entirely.\n return err(\"use_pending_with_extras_unsupported\")\n end\n\n -- 6. Record waitpoint_id in mandatory history set (required for cleanup cascade)\n redis.call(\"SADD\", K.waitpoint_history, waitpoint_id)\n\n -- OOM-SAFE WRITE ORDERING (per RFC-010 \u{a7}4.8b):\n -- exec_core HSET is the \"point of no return\" \u{2014} write it FIRST.\n\n -- 7. Transition exec_core (FIRST \u{2014} point of no return, all 7 dims)\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"suspended\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", map_reason_to_blocking(A.reason_code),\n \"blocking_detail\", \"suspended: waitpoint \" .. waitpoint_id .. \" awaiting \" .. 
A.reason_code,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", \"suspended\",\n \"current_lease_id\", \"\",\n \"current_worker_id\", \"\",\n \"current_worker_instance_id\", \"\",\n \"lease_expires_at\", \"\",\n \"lease_last_renewed_at\", \"\",\n \"lease_renewal_deadline\", \"\",\n \"current_suspension_id\", A.suspension_id,\n \"current_waitpoint_id\", waitpoint_id,\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 8. Pause the attempt: started -> suspended (RFC-002 suspend_attempt)\n redis.call(\"HSET\", K.attempt_record,\n \"attempt_state\", \"suspended\",\n \"suspended_at\", tostring(now_ms),\n \"suspension_id\", A.suspension_id)\n\n -- 9. Release lease + update indexes\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"ZREM\", K.lease_expiry_key, A.execution_id)\n redis.call(\"SREM\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZREM\", K.active_index_key, A.execution_id)\n redis.call(\"ZREM\", K.attempt_timeout_key, A.execution_id)\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", A.lease_history_maxlen, \"*\",\n \"event\", \"released\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", A.lease_epoch,\n \"attempt_index\", A.attempt_index,\n \"attempt_id\", core.current_attempt_id or \"\",\n \"reason\", \"suspend\",\n \"ts\", tostring(now_ms))\n\n -- 10. Create suspension record. RFC-014: record the full list of\n -- additional waitpoint_ids (JSON array of \"<id>|<key>\" pairs) so\n -- cleanup owners (cancel / expire / resume) can iterate all\n -- per-waitpoint storage on terminal transitions without needing\n -- the caller to re-supply them.\n local add_json = \"[]\"\n if #extras_out > 0 then\n local pairs_list = {}\n for _, e in ipairs(extras_out) do\n pairs_list[#pairs_list + 1] = {\n waitpoint_id = e.waitpoint_id,\n waitpoint_key = e.waitpoint_key,\n }\n end\n add_json = cjson.encode(pairs_list)\n end\n redis.call(\"HSET\", K.suspension_current,\n \"suspension_id\", A.suspension_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", A.attempt_index,\n \"waitpoint_id\", waitpoint_id,\n \"waitpoint_key\", waitpoint_key,\n \"additional_waitpoints_json\", add_json,\n \"reason_code\", A.reason_code,\n \"requested_by\", A.requested_by,\n \"created_at\", tostring(now_ms),\n \"timeout_at\", A.timeout_at,\n \"timeout_behavior\", A.timeout_behavior,\n \"resume_condition_json\", A.resume_condition_json,\n \"resume_policy_json\", A.resume_policy_json,\n \"continuation_metadata_pointer\", A.continuation_metadata_ptr,\n \"buffered_signal_summary_json\", initial_signal_summary_json(),\n \"last_signal_at\", \"\",\n \"satisfied_at\", \"\",\n \"closed_at\", \"\",\n \"close_reason\", \"\")\n\n -- 10b. RFC-014 \u{a7}3.1: seed composite member_map (write-once) when the\n -- resume condition carries a composite tree. No-op for single-matcher\n -- / operator / timeout conditions. Pattern 3 \u{2014} pass every binding\'s\n -- waitpoint_key so candidate-node lookup covers all N leaves.\n do\n local spec_ok, spec = pcall(cjson.decode, A.resume_condition_json)\n if spec_ok and type(spec) == \"table\" and spec.composite then\n local all_keys = { waitpoint_key }\n for _, e in ipairs(extras_out) do\n all_keys[#all_keys + 1] = e.waitpoint_key\n end\n seed_composite_member_map(\n K.suspension_current .. \":member_map\",\n spec.tree,\n all_keys)\n end\n end\n\n -- 11. 
Add to per-lane suspended index + suspension timeout\n -- Score: timeout_at if set, otherwise MAX for \"no timeout\" ordering\n redis.call(\"ZADD\", K.suspended_zset,\n is_set(A.timeout_at) and tonumber(A.timeout_at) or 9999999999999,\n A.execution_id)\n\n if is_set(A.timeout_at) then\n redis.call(\"ZADD\", K.suspension_timeout_key, tonumber(A.timeout_at), A.execution_id)\n end\n\n -- RFC-013 \u{a7}9.2 + RFC-014 \u{2014} write dedup outcome. Includes extras tail.\n if _dedup_active then\n local pieces = {\n \"OK\", A.suspension_id, waitpoint_id, waitpoint_key, waitpoint_token,\n tostring(#extras_out),\n }\n for _, e in ipairs(extras_out) do\n pieces[#pieces + 1] = e.waitpoint_id\n pieces[#pieces + 1] = e.waitpoint_key\n pieces[#pieces + 1] = e.waitpoint_token\n end\n redis.call(\"HSET\", K.dedup_hash, \"outcome\", table.concat(pieces, \"\\t\"))\n if A.dedup_ttl_ms > 0 then\n redis.call(\"PEXPIRE\", K.dedup_hash, A.dedup_ttl_ms)\n end\n end\n\n return ok_extras(A.suspension_id, waitpoint_id, waitpoint_key, waitpoint_token, extras_out)\nend)\n\n---------------------------------------------------------------------------\n-- #14 ff_resume_execution\n--\n-- Transition suspended \u{2192} runnable. Called after signal satisfies condition\n-- or by operator override. Closes suspension + waitpoint, updates indexes.\n--\n-- KEYS (8): exec_core, suspension_current, waitpoint_hash,\n-- waitpoint_signals, suspension_timeout_zset,\n-- eligible_zset, delayed_zset, suspended_zset\n-- ARGV (3): execution_id, trigger_type, resume_delay_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_resume_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n suspension_current = keys[2],\n waitpoint_hash = keys[3],\n waitpoint_signals = keys[4],\n suspension_timeout_key = keys[5],\n eligible_zset = keys[6],\n delayed_zset = keys[7],\n suspended_zset = keys[8],\n }\n\n local A = {\n execution_id = args[1],\n trigger_type = args[2] or \"signal\", -- \"signal\", \"operator\", \"auto_resume\"\n resume_delay_ms = tonumber(args[3] or \"0\"),\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read + validate execution is suspended\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"suspended\" then\n return err(\"execution_not_suspended\")\n end\n\n -- 2. Validate active suspension\n local susp_raw = redis.call(\"HGETALL\", K.suspension_current)\n local susp_err, susp = assert_active_suspension(susp_raw)\n if susp_err then return susp_err end\n\n -- 3. Compute eligibility based on resume_delay_ms\n local eligibility_state = \"eligible_now\"\n local blocking_reason = \"waiting_for_worker\"\n local blocking_detail = \"\"\n local public_state = \"waiting\"\n\n if A.resume_delay_ms > 0 then\n eligibility_state = \"not_eligible_until_time\"\n blocking_reason = \"waiting_for_resume_delay\"\n blocking_detail = \"resume delay \" .. tostring(A.resume_delay_ms) .. \"ms\"\n public_state = \"delayed\"\n end\n\n -- OOM-SAFE WRITE ORDERING: exec_core FIRST (point of no return)\n\n -- 4. 
Transition exec_core (FIRST \u{2014} all 7 dims)\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", eligibility_state,\n \"blocking_reason\", blocking_reason,\n \"blocking_detail\", blocking_detail,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", public_state,\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 5. Update scheduling indexes\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n if A.resume_delay_ms > 0 then\n redis.call(\"ZADD\", K.delayed_zset,\n now_ms + A.resume_delay_ms, A.execution_id)\n else\n -- ZADD eligible with composite priority score\n local priority = tonumber(core.priority or \"0\")\n local created_at = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n end\n\n -- 6. Close sub-objects (safe to lose on OOM \u{2014} stale but not zombie)\n -- Close waitpoint\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\")\n\n -- Close suspension\n redis.call(\"HSET\", K.suspension_current,\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\")\n\n -- Remove from suspension timeout index\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n\n -- RFC-014 \u{a7}3.1.1 composite cleanup owner: operator-driven resume path.\n composite_cleanup(\n K.suspension_current .. \":satisfied_set\",\n K.suspension_current .. \":member_map\")\n\n -- RFC-014 Pattern 3: close any additional waitpoints co-owned by\n -- this suspension so their HMAC tokens can no longer authenticate\n -- signal delivery once the suspension has resumed.\n close_additional_waitpoints(\n K.suspension_current,\n susp.additional_waitpoints_json or \"\",\n { \"state\", \"closed\",\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\" },\n { \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"resumed\" })\n\n return ok(public_state)\nend)\n\n---------------------------------------------------------------------------\n-- #15 ff_create_pending_waitpoint\n--\n-- Pre-create a waitpoint before suspension commits. 
The waitpoint is\n-- externally addressable by waitpoint_key so early signals can be buffered.\n-- Requires the caller to still hold the active lease.\n-- Mints the waitpoint HMAC token up front so early signals targeting the\n-- pending waitpoint can be authenticated via ff_buffer_signal_for_pending_waitpoint.\n--\n-- KEYS (4): exec_core, waitpoint_hash, pending_wp_expiry_zset, hmac_secrets\n-- ARGV (5): execution_id, attempt_index, waitpoint_id, waitpoint_key,\n-- expires_at\n---------------------------------------------------------------------------\nredis.register_function(\'ff_create_pending_waitpoint\', function(keys, args)\n local K = {\n core_key = keys[1],\n waitpoint_hash = keys[2],\n pending_wp_expiry_key = keys[3],\n hmac_secrets = keys[4],\n }\n\n local A = {\n execution_id = args[1],\n attempt_index = args[2],\n waitpoint_id = args[3],\n waitpoint_key = args[4],\n expires_at = args[5],\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Validate execution is active with a lease\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"active\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n core.lifecycle_phase or \"\",\n core.current_attempt_id or \"\")\n end\n if core.ownership_state ~= \"leased\" then\n return err(\"no_active_lease\")\n end\n -- Validate attempt binding\n if tostring(core.current_attempt_index) ~= A.attempt_index then\n return err(\"stale_lease\")\n end\n\n -- 2. Guard: waitpoint already exists\n if redis.call(\"EXISTS\", K.waitpoint_hash) == 1 then\n local existing_state = redis.call(\"HGET\", K.waitpoint_hash, \"state\")\n if existing_state == \"pending\" or existing_state == \"active\" then\n return err(\"waitpoint_already_exists\")\n end\n -- Old closed/expired waitpoint \u{2014} safe to overwrite\n end\n\n -- 3. Mint HMAC token bound to (waitpoint_id, waitpoint_key, now_ms).\n -- The suspension activation path will reuse this token unchanged.\n local waitpoint_token, token_err = mint_waitpoint_token(\n K.hmac_secrets, A.waitpoint_id, A.waitpoint_key, now_ms)\n if not waitpoint_token then return err(token_err) end\n\n -- 4. Create pending waitpoint\n redis.call(\"HSET\", K.waitpoint_hash,\n \"waitpoint_id\", A.waitpoint_id,\n \"execution_id\", A.execution_id,\n \"attempt_index\", A.attempt_index,\n \"suspension_id\", \"\",\n \"waitpoint_key\", A.waitpoint_key,\n \"waitpoint_token\", waitpoint_token,\n \"state\", \"pending\",\n \"created_at\", tostring(now_ms),\n \"activated_at\", \"\",\n \"satisfied_at\", \"\",\n \"closed_at\", \"\",\n \"expires_at\", A.expires_at,\n \"close_reason\", \"\",\n \"signal_count\", \"0\",\n \"matched_signal_count\", \"0\",\n \"last_signal_at\", \"\")\n\n -- 5. 
Add to pending waitpoint expiry index\n redis.call(\"ZADD\", K.pending_wp_expiry_key,\n tonumber(A.expires_at), A.waitpoint_id)\n\n return ok(A.waitpoint_id, A.waitpoint_key, waitpoint_token)\nend)\n\n---------------------------------------------------------------------------\n-- #16/#19 ff_expire_suspension (Overlap group D \u{2014} one script)\n--\n-- Apply timeout behavior when suspension timeout fires.\n-- Re-validates that execution is still suspended and timeout is due.\n-- Handles all 5 timeout behaviors:\n-- fail \u{2192} terminal(failed)\n-- cancel \u{2192} terminal(cancelled)\n-- expire \u{2192} terminal(expired)\n-- auto_resume \u{2192} close + resume to runnable\n-- escalate \u{2192} mutate suspension to operator-review\n--\n-- KEYS (12): exec_core, suspension_current, waitpoint_hash, wp_condition,\n-- attempt_hash, stream_meta, suspension_timeout_zset,\n-- suspended_zset, terminal_zset, eligible_zset, delayed_zset,\n-- lease_history\n-- ARGV (1): execution_id\n---------------------------------------------------------------------------\nredis.register_function(\'ff_expire_suspension\', function(keys, args)\n local K = {\n core_key = keys[1],\n suspension_current = keys[2],\n waitpoint_hash = keys[3],\n wp_condition = keys[4],\n attempt_hash = keys[5],\n stream_meta = keys[6],\n suspension_timeout_key = keys[7],\n suspended_zset = keys[8],\n terminal_key = keys[9],\n eligible_zset = keys[10],\n delayed_zset = keys[11],\n lease_history_key = keys[12],\n }\n\n local A = {\n execution_id = args[1],\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read + validate execution is still suspended\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n return ok(\"not_found_cleaned\")\n end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"suspended\" then\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n return ok(\"not_suspended_cleaned\")\n end\n\n -- 2. Read suspension and validate it\'s still open\n local susp_raw = redis.call(\"HGETALL\", K.suspension_current)\n local susp_err, susp = assert_active_suspension(susp_raw)\n if susp_err then\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n return ok(\"no_active_suspension_cleaned\")\n end\n\n -- 3. Check timeout is actually due\n local timeout_at = tonumber(susp.timeout_at or \"0\")\n if timeout_at == 0 or timeout_at > now_ms then\n return ok(\"not_yet_due\")\n end\n\n -- 4. Read timeout behavior\n local behavior = susp.timeout_behavior or \"fail\"\n\n -- 5. 
Apply behavior\n if behavior == \"auto_resume\" or behavior == \"auto_resume_with_timeout_signal\" then\n -- auto_resume: close suspension + resume to runnable (like ff_resume_execution)\n\n -- OOM-SAFE: exec_core FIRST\n local priority = tonumber(core.priority or \"0\")\n local created_at = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", \"waiting\",\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Update indexes\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n\n -- Close sub-objects\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"timed_out_auto_resume\")\n redis.call(\"HSET\", K.wp_condition,\n \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"timed_out_auto_resume\")\n redis.call(\"HSET\", K.suspension_current,\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"timed_out_auto_resume\")\n\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n\n -- RFC-014 \u{a7}3.1.1 composite cleanup owner: expire (auto_resume) path.\n composite_cleanup(\n K.suspension_current .. \":satisfied_set\",\n K.suspension_current .. \":member_map\")\n close_additional_waitpoints(\n K.suspension_current,\n susp.additional_waitpoints_json or \"\",\n { \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"timed_out_auto_resume\" },\n { \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"timed_out_auto_resume\" })\n\n return ok(\"auto_resume\", \"waiting\")\n\n elseif behavior == \"escalate\" then\n -- escalate: mutate suspension to operator-review, keep suspended (ALL 7 dims)\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"suspended\", -- preserve\n \"ownership_state\", \"unowned\", -- preserve\n \"eligibility_state\", \"not_applicable\", -- preserve\n \"blocking_reason\", \"paused_by_operator\",\n \"blocking_detail\", \"suspension escalated: timeout at \" .. 
tostring(timeout_at),\n \"terminal_outcome\", \"none\", -- preserve\n \"attempt_state\", core.attempt_state or \"attempt_interrupted\", -- preserve\n \"public_state\", \"suspended\", -- preserve\n \"last_mutation_at\", tostring(now_ms))\n\n redis.call(\"HSET\", K.suspension_current,\n \"reason_code\", \"waiting_for_operator_review\",\n \"timeout_at\", \"\",\n \"timeout_behavior\", \"\")\n\n -- Remove from timeout index (no longer has a timeout)\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n\n return ok(\"escalate\", \"suspended\")\n\n else\n -- Terminal paths: fail, cancel, expire\n\n local terminal_outcome\n local public_state_val\n local close_reason\n\n if behavior == \"cancel\" then\n terminal_outcome = \"cancelled\"\n public_state_val = \"cancelled\"\n close_reason = \"timed_out_cancel\"\n elseif behavior == \"expire\" then\n terminal_outcome = \"expired\"\n public_state_val = \"expired\"\n close_reason = \"timed_out_expire\"\n else\n -- Default: fail\n terminal_outcome = \"failed\"\n public_state_val = \"failed\"\n close_reason = \"timed_out_fail\"\n end\n\n -- OOM-SAFE: exec_core FIRST (\u{a7}4.8b Rule 2)\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", terminal_outcome,\n \"attempt_state\", \"attempt_terminal\",\n \"public_state\", public_state_val,\n \"failure_reason\", \"suspension_timeout:\" .. behavior,\n \"completed_at\", tostring(now_ms),\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- End attempt: suspended \u{2192} ended_failure/ended_cancelled\n if is_set(core.current_attempt_index) then\n local att_end_state = \"ended_failure\"\n if behavior == \"cancel\" then\n att_end_state = \"ended_cancelled\"\n end\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", att_end_state,\n \"ended_at\", tostring(now_ms),\n \"failure_reason\", \"suspension_timeout:\" .. behavior,\n \"suspended_at\", \"\",\n \"suspension_id\", \"\")\n end\n\n -- Close stream if exists\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"suspension_timeout\")\n end\n\n -- Close sub-objects\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", close_reason)\n redis.call(\"HSET\", K.wp_condition,\n \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", close_reason)\n redis.call(\"HSET\", K.suspension_current,\n \"closed_at\", tostring(now_ms),\n \"close_reason\", close_reason)\n\n -- RFC-014 \u{a7}3.1.1 composite cleanup owner: expire (terminal) paths.\n composite_cleanup(\n K.suspension_current .. \":satisfied_set\",\n K.suspension_current .. 
\":member_map\")\n close_additional_waitpoints(\n K.suspension_current,\n susp.additional_waitpoints_json or \"\",\n { \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", close_reason },\n { \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", close_reason })\n\n -- Remove from suspension indexes, add to terminal\n redis.call(\"ZREM\", K.suspension_timeout_key, A.execution_id)\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n redis.call(\"ZADD\", K.terminal_key, now_ms, A.execution_id)\n\n -- Push-based DAG promotion (bridge-event gap report \u{a7}1.3 analogue).\n -- Suspension-timeout terminals (fail / cancel / expire) are FF-\n -- initiated transitions that cairn cannot observe via its\n -- call-then-emit pattern. Without a PUBLISH, flow-bound children\n -- only unblock via the dependency_reconciler safety net (15s).\n -- Gated on `is_set(core.flow_id)` \u{2014} standalone executions never\n -- have downstream edges. Outcome matches terminal_outcome\n -- (\"failed\" / \"cancelled\" / \"expired\").\n if is_set(core.flow_id) then\n local payload = cjson.encode({\n execution_id = A.execution_id,\n flow_id = core.flow_id,\n outcome = terminal_outcome,\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n\n return ok(behavior, public_state_val)\n end\nend)\n\n---------------------------------------------------------------------------\n-- #36 ff_close_waitpoint\n--\n-- Proactive close of pending or active waitpoint. Used by workers that\n-- created a pending waitpoint but decided not to suspend.\n--\n-- KEYS (3): exec_core, waitpoint_hash, pending_wp_expiry_zset\n-- ARGV (2): waitpoint_id, reason\n---------------------------------------------------------------------------\nredis.register_function(\'ff_close_waitpoint\', function(keys, args)\n local K = {\n core_key = keys[1],\n waitpoint_hash = keys[2],\n pending_wp_expiry_key = keys[3],\n }\n\n local A = {\n waitpoint_id = args[1],\n reason = args[2] or \"proactive_close\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read waitpoint\n local wp_raw = redis.call(\"HGETALL\", K.waitpoint_hash)\n if #wp_raw == 0 then\n return err(\"waitpoint_not_found\")\n end\n local wp = hgetall_to_table(wp_raw)\n\n -- 2. Validate state is pending or active\n if wp.state ~= \"pending\" and wp.state ~= \"active\" then\n if wp.state == \"closed\" or wp.state == \"expired\" then\n return ok(\"already_closed\")\n end\n return err(\"waitpoint_not_open\")\n end\n\n -- 3. Close the waitpoint\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"closed_at\", tostring(now_ms),\n \"close_reason\", A.reason)\n\n -- 4. Remove from pending expiry index (no-op if not pending)\n redis.call(\"ZREM\", K.pending_wp_expiry_key, A.waitpoint_id)\n\n return ok()\nend)\n\n---------------------------------------------------------------------------\n-- ff_rotate_waitpoint_hmac_secret\n--\n-- Install a new waitpoint HMAC signing kid on a single partition. The\n-- server-side admin endpoint (ff-server POST /v1/admin/rotate-waitpoint-secret)\n-- delegates to this FCALL per partition. Direct-Valkey consumers (e.g.\n-- cairn-rs, issue #49) invoke it themselves across every partition.\n--\n-- FCALL atomicity is per-shard and per-call; previous implementations used\n-- a SETNX lock + read-modify-write from Rust. 
Here the script IS the\n-- atomicity boundary, so no lock is needed.\n--\n-- KEYS (1): hmac_secrets (ff:sec:{p:N}:waitpoint_hmac)\n-- ARGV (3): new_kid, new_secret_hex, grace_ms\n--\n-- `now_ms` is derived server-side from `redis.call(\"TIME\")` to match\n-- the rest of the library (lua/flow.lua, validate_waitpoint_token), so\n-- GC and validation agree on \"now\". `grace_ms` is a duration, not a\n-- clock value, so taking it from ARGV is safe \u{2014} operators set it via\n-- FF_WAITPOINT_HMAC_GRACE_MS (ff-server) or pass their own value.\n--\n-- Outcomes:\n-- ok(\"rotated\", previous_kid_or_empty, new_kid, gc_count)\n-- ok(\"noop\", kid) -- exact replay (same kid + secret)\n-- err(\"rotation_conflict\", kid) -- same kid, different secret\n-- err(\"invalid_kid\") -- empty or contains \':\'\n-- err(\"invalid_secret_hex\") -- empty / odd length / non-hex\n-- err(\"invalid_grace_ms\") -- not a non-negative integer\n--\n-- Authoritative implementation of waitpoint HMAC rotation: idempotent\n-- replay, torn-write repair, orphan GC across expired kids. INVARIANT:\n-- expires_at:<new_kid> is never written (current_kid has no expiry).\n-- ff-server\'s admin endpoint delegates to this FCALL \u{2014} single source\n-- of truth lives here, not in Rust.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_rotate_waitpoint_hmac_secret\', function(keys, args)\n local hmac_key = keys[1]\n local new_kid = args[1]\n local new_secret_hex = args[2]\n local grace_ms_s = args[3]\n\n -- Combined validation (feedback: single condition is more readable).\n if type(new_kid) ~= \"string\" or new_kid == \"\" or new_kid:find(\":\", 1, true) then\n return err(\"invalid_kid\")\n end\n if type(new_secret_hex) ~= \"string\"\n or new_secret_hex == \"\"\n or #new_secret_hex % 2 ~= 0\n or new_secret_hex:find(\"[^0-9a-fA-F]\") then\n return err(\"invalid_secret_hex\")\n end\n\n -- grace_ms must be a finite non-negative integer. The math.floor check\n -- rejects decimals but NOT infinities (math.floor(math.huge) == math.huge\n -- which would stamp \"inf\" into expires_at:*). Cap at 2^53-1 so the\n -- stored value stays within the IEEE-754 double-precision integer range\n -- AND within i64, keeping the Rust parser happy. 2^53-1 ms is ~285 years,\n -- far beyond any operational grace window.\n local grace_ms = tonumber(grace_ms_s)\n if not grace_ms\n or grace_ms ~= grace_ms -- NaN\n or grace_ms < 0\n or grace_ms > 9007199254740991 -- 2^53 - 1\n or grace_ms ~= math.floor(grace_ms) then\n return err(\"invalid_grace_ms\")\n end\n\n -- Server-side time via redis.call(\"TIME\"); never trust caller-supplied\n -- timestamps for expiry decisions (consistency with flow.lua + helpers.lua\n -- and with validate_waitpoint_token).\n local now_ms = server_time_ms()\n local prev_expires_at = now_ms + grace_ms\n\n local current_kid = redis.call(\"HGET\", hmac_key, \"current_kid\")\n if current_kid == false then current_kid = nil end\n\n -- Idempotency branch: same kid already installed.\n if current_kid == new_kid then\n local stored = redis.call(\"HGET\", hmac_key, \"secret:\" .. new_kid)\n if stored == false then stored = nil end\n if stored == new_secret_hex then\n return ok(\"noop\", new_kid)\n elseif stored then\n return err(\"rotation_conflict\", new_kid)\n else\n -- Torn-write repair: current_kid=new_kid but secret:<new_kid> missing.\n redis.call(\"HSET\", hmac_key, \"secret:\" .. 
new_kid, new_secret_hex)\n return ok(\"noop\", new_kid)\n end\n end\n\n -- Orphan GC: HGETALL once, collect kids whose expires_at has passed,\n -- then a single HDEL. One entry per distinct kid ever installed plus\n -- a handful of scalars; bounded in practice.\n local raw = redis.call(\"HGETALL\", hmac_key)\n local expired_fields = {}\n local gc_count = 0\n for i = 1, #raw, 2 do\n local field = raw[i]\n local value = raw[i + 1]\n if field:sub(1, 11) == \"expires_at:\" then\n local kid = field:sub(12)\n local exp = tonumber(value)\n -- GC strictness MUST match validate_waitpoint_token\'s `exp < now_ms`\n -- (lua/helpers.lua). Reaping on `exp <= now_ms` would delete a kid\n -- that the validator still considers in-grace at the boundary\n -- exp == now_ms, causing tokens that should validate to fail.\n if not exp or exp <= 0 or exp < now_ms then\n expired_fields[#expired_fields + 1] = \"expires_at:\" .. kid\n expired_fields[#expired_fields + 1] = \"secret:\" .. kid\n gc_count = gc_count + 1\n end\n end\n end\n if #expired_fields > 0 then\n redis.call(\"HDEL\", hmac_key, unpack(expired_fields))\n end\n\n -- Promote current \u{2192} previous. INVARIANT: expires_at:<new_kid> is NEVER\n -- written \u{2014} current_kid has no expiry entry.\n local prev_expires_str = tostring(prev_expires_at)\n if current_kid then\n redis.call(\"HSET\", hmac_key,\n \"previous_kid\", current_kid,\n \"previous_expires_at\", prev_expires_str,\n \"expires_at:\" .. current_kid, prev_expires_str,\n \"current_kid\", new_kid,\n \"secret:\" .. new_kid, new_secret_hex)\n else\n redis.call(\"HSET\", hmac_key,\n \"current_kid\", new_kid,\n \"secret:\" .. new_kid, new_secret_hex)\n end\n\n return ok(\"rotated\", current_kid or \"\", new_kid, tostring(gc_count))\nend)\n\n---------------------------------------------------------------------------\n-- ff_list_waitpoint_hmac_kids\n--\n-- Read-back for the waitpoint HMAC keystore. Insulates consumers (cairn\n-- admin UI, ff-server audit surface) from the hash-field naming so future\n-- layout changes don\'t break them.\n--\n-- KEYS (1): hmac_secrets (ff:sec:{p:N}:waitpoint_hmac)\n-- ARGV: none\n--\n-- Returns ok(current_kid_or_empty, n, kid1, exp1_ms, kid2, exp2_ms, ...)\n-- \"verifying\" kids = those with a FUTURE expires_at:<kid> entry. Kids\n-- whose grace has already elapsed are NOT reported here \u{2014} the contract\n-- promises kids that still validate tokens, so listing expired kids\n-- would mislead operators. Expired entries are swept by orphan GC on\n-- the next rotation. 
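--\n-- Illustrative reply (kids and expiry timestamps are hypothetical): one\n-- rotated-out kid still inside its grace window would read back as\n--   ok(\"kid-2025-01\", \"1\", \"kid-2024-12\", \"1767225600000\")\n--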
current_kid is excluded (it never has an expiry).\n-- Uninitialized \u{2192} ok(\"\", 0).\n---------------------------------------------------------------------------\nredis.register_function(\'ff_list_waitpoint_hmac_kids\', function(keys, args)\n local hmac_key = keys[1]\n local raw = redis.call(\"HGETALL\", hmac_key)\n local now_ms = server_time_ms()\n\n local current_kid = \"\"\n local pairs_out = {}\n local n = 0\n for i = 1, #raw, 2 do\n local field = raw[i]\n local value = raw[i + 1]\n if field == \"current_kid\" then\n current_kid = value\n elseif field:sub(1, 11) == \"expires_at:\" then\n local kid = field:sub(12)\n local exp = tonumber(value)\n -- Match validator\'s `exp < now_ms` rejection rule so a kid listed\n -- as verifying really does still validate tokens at call time.\n if exp and exp > 0 and exp >= now_ms then\n pairs_out[#pairs_out + 1] = kid\n pairs_out[#pairs_out + 1] = value\n n = n + 1\n end\n end\n end\n\n return ok(current_kid, tostring(n), unpack(pairs_out))\nend)\n\n\n-- source: lua/signal.lua\n-- FlowFabric signal delivery and resume-claim functions\n-- Reference: RFC-005 (Signal), RFC-001 (Execution), RFC-010 \u{a7}4.1 (#17, #18, #2)\n--\n-- Depends on helpers: ok, err, ok_duplicate, hgetall_to_table, is_set,\n-- initialize_condition, write_condition_hash, evaluate_signal_against_condition,\n-- is_condition_satisfied, extract_field\n\n---------------------------------------------------------------------------\n-- #17 ff_deliver_signal\n--\n-- Atomic signal delivery: validate target, check idempotency, record\n-- signal, evaluate resume condition, optionally close waitpoint +\n-- suspension + transition suspended -> runnable.\n--\n-- KEYS (15): exec_core, wp_condition, wp_signals_stream,\n-- exec_signals_zset, signal_hash, signal_payload,\n-- idem_key, waitpoint_hash, suspension_current,\n-- eligible_zset, suspended_zset, delayed_zset,\n-- suspension_timeout_zset, hmac_secrets,\n-- partition_signal_delivery_stream (RFC-019 Stage B / #310)\n-- ARGV (18): signal_id, execution_id, waitpoint_id, signal_name,\n-- signal_category, source_type, source_identity,\n-- payload, payload_encoding, idempotency_key,\n-- correlation_id, target_scope, created_at,\n-- dedup_ttl_ms, resume_delay_ms, signal_maxlen,\n-- max_signals_per_execution, waitpoint_token\n---------------------------------------------------------------------------\nredis.register_function(\'ff_deliver_signal\', function(keys, args)\n local K = {\n core_key = keys[1],\n wp_condition = keys[2],\n wp_signals_stream = keys[3],\n exec_signals_zset = keys[4],\n signal_hash = keys[5],\n signal_payload = keys[6],\n idem_key = keys[7],\n waitpoint_hash = keys[8],\n suspension_current = keys[9],\n eligible_zset = keys[10],\n suspended_zset = keys[11],\n delayed_zset = keys[12],\n suspension_timeout_zset = keys[13],\n hmac_secrets = keys[14],\n partition_signal_delivery_stream = keys[15],\n }\n\n local A = {\n signal_id = args[1],\n execution_id = args[2],\n waitpoint_id = args[3],\n signal_name = args[4],\n signal_category = args[5],\n source_type = args[6],\n source_identity = args[7],\n payload = args[8] or \"\",\n payload_encoding = args[9] or \"json\",\n idempotency_key = args[10] or \"\",\n correlation_id = args[11] or \"\",\n target_scope = args[12] or \"waitpoint\",\n created_at = args[13] or \"\",\n dedup_ttl_ms = tonumber(args[14] or \"86400000\"),\n resume_delay_ms = tonumber(args[15] or \"0\"),\n signal_maxlen = tonumber(args[16] or \"1000\"),\n max_signals = tonumber(args[17] or \"10000\"),\n 
waitpoint_token = args[18] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Validate execution exists\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n return err(\"execution_not_found\")\n end\n local core = hgetall_to_table(raw)\n\n -- 2. Validate HMAC token FIRST (RFC-004 \u{a7}Waitpoint Security).\n --\n -- Order matters: lifecycle / waitpoint-state checks below would otherwise\n -- form a state oracle \u{2014} an attacker presenting ANY token (including an\n -- invalid one) for an arbitrary (execution_id, waitpoint_id) pair could\n -- distinguish \"execution is terminal\" vs \"waitpoint is pending\" vs\n -- \"waitpoint is closed\" by the specific error code returned, without\n -- having to produce a valid HMAC. Auth-first closes that oracle.\n --\n -- Missing-waitpoint is collapsed into `invalid_token` for the same\n -- reason: an unauthenticated caller must not be able to probe which\n -- (execution, waitpoint) tuples exist.\n local wp_for_auth_raw = redis.call(\"HGETALL\", K.waitpoint_hash)\n if #wp_for_auth_raw == 0 then\n return err(\"invalid_token\")\n end\n local wp_for_auth = hgetall_to_table(wp_for_auth_raw)\n if not wp_for_auth.created_at then\n return err(\"invalid_token\")\n end\n local token_err = validate_waitpoint_token(\n K.hmac_secrets, A.waitpoint_token,\n A.waitpoint_id, wp_for_auth.waitpoint_key or \"\",\n tonumber(wp_for_auth.created_at) or 0, now_ms)\n if token_err then\n -- Operator-visible counter (RFC-004 \u{a7}Waitpoint Security observability).\n -- Single scalar HSET on exec_core \u{2014} bounded, amortized-free. Gives\n -- operators a \"last time this execution saw an auth failure\" field to\n -- correlate with key-rotation drift, client bugs, or attack traffic\n -- without needing to tail Lua slowlog or FCALL error logs.\n redis.call(\"HSET\", K.core_key, \"last_hmac_validation_failed_at\", tostring(now_ms))\n return err(token_err)\n end\n\n -- 3. Validate execution is in a signalable state (post-auth).\n local lp = core.lifecycle_phase\n if lp == \"terminal\" then\n return err(\"target_not_signalable\")\n end\n\n if lp == \"active\" or lp == \"runnable\" or lp == \"submitted\" then\n -- Not suspended. wp_for_auth was just loaded above; reuse it.\n if wp_for_auth.state == \"pending\" then\n return err(\"waitpoint_pending_use_buffer_script\")\n end\n if wp_for_auth.state ~= \"active\" then\n return err(\"target_not_signalable\")\n end\n -- Active waitpoint on non-suspended execution \u{2014} unusual but valid (race window)\n end\n\n -- 4. Validate waitpoint condition is open (post-auth).\n local cond_raw = redis.call(\"HGETALL\", K.wp_condition)\n if #cond_raw == 0 then\n return err(\"waitpoint_not_found\")\n end\n local wp_cond = hgetall_to_table(cond_raw)\n if wp_cond.closed == \"1\" then\n return err(\"waitpoint_closed\")\n end\n\n -- 4. Signal count limit (prevents unbounded ZSET growth from webhook storms)\n if A.max_signals > 0 then\n local current_count = redis.call(\"ZCARD\", K.exec_signals_zset)\n if current_count >= A.max_signals then\n return err(\"signal_limit_exceeded\")\n end\n end\n\n -- 5. 
Idempotency check\n -- Guard: (A.dedup_ttl_ms or 0) handles nil from tonumber(\"\") safely.\n local dedup_ms = A.dedup_ttl_ms or 0\n if A.idempotency_key ~= \"\" and dedup_ms > 0 then\n local existing = redis.call(\"GET\", K.idem_key)\n if existing then\n return ok_duplicate(existing)\n end\n redis.call(\"SET\", K.idem_key, A.signal_id,\n \"PX\", dedup_ms, \"NX\")\n end\n\n -- 6. Record signal hash\n local created_at = A.created_at ~= \"\" and A.created_at or tostring(now_ms)\n redis.call(\"HSET\", K.signal_hash,\n \"signal_id\", A.signal_id,\n \"target_execution_id\", A.execution_id,\n \"target_waitpoint_id\", A.waitpoint_id,\n \"target_scope\", A.target_scope,\n \"signal_name\", A.signal_name,\n \"signal_category\", A.signal_category,\n \"source_type\", A.source_type,\n \"source_identity\", A.source_identity,\n \"correlation_id\", A.correlation_id,\n \"idempotency_key\", A.idempotency_key,\n \"created_at\", created_at,\n \"accepted_at\", tostring(now_ms),\n \"matched_waitpoint_id\", A.waitpoint_id,\n \"payload_encoding\", A.payload_encoding)\n\n -- 6b. Store payload separately if present\n if A.payload ~= \"\" then\n redis.call(\"SET\", K.signal_payload, A.payload)\n end\n\n -- 7. Append to per-waitpoint signal stream + per-execution signal index\n redis.call(\"XADD\", K.wp_signals_stream, \"MAXLEN\", \"~\",\n tostring(A.signal_maxlen), \"*\",\n \"signal_id\", A.signal_id,\n \"signal_name\", A.signal_name,\n \"signal_category\", A.signal_category,\n \"source_type\", A.source_type,\n \"source_identity\", A.source_identity,\n \"matched\", \"0\",\n \"accepted_at\", tostring(now_ms))\n redis.call(\"ZADD\", K.exec_signals_zset, now_ms, A.signal_id)\n\n -- 8. Evaluate resume condition\n local effect = \"appended_to_waitpoint\"\n local matched = false\n\n -- RFC-014 \u{a7}3.3: composite branch. `wp_condition` stores a `composite=1`\n -- marker when suspension serialized a multi-signal tree. We short-\n -- circuit here and run the composite evaluator (depth-bounded).\n local composite_mode = (wp_cond.composite == \"1\")\n if composite_mode then\n -- Load tree + waitpoint_key for candidate lookup.\n local susp_raw = redis.call(\"HGETALL\", K.suspension_current)\n local susp = hgetall_to_table(susp_raw)\n local tree_json = wp_cond.tree_json or \"\"\n local tree = tree_json ~= \"\" and cjson.decode(tree_json) or nil\n local satisfied_set_key = K.suspension_current .. \":satisfied_set\"\n local member_map_key = K.suspension_current .. \":member_map\"\n\n local signal_for_eval = {\n signal_id = A.signal_id,\n signal_name = A.signal_name,\n source_type = A.source_type,\n source_identity = A.source_identity,\n }\n -- RFC-014 Pattern 3: use THIS waitpoint\'s own key (loaded into\n -- `wp_for_auth` above, keyed by A.waitpoint_id) so multi-waitpoint\n -- AllOf resolves each leaf against its own wp_key, not the\n -- suspension\'s primary key.\n local this_wp_key = wp_for_auth.waitpoint_key or susp.waitpoint_key or \"\"\n local outcome = composite_deliver_signal(\n tree, satisfied_set_key, member_map_key,\n A.waitpoint_id, this_wp_key, signal_for_eval)\n\n effect = outcome.effect or \"appended_to_waitpoint\"\n matched = (effect ~= \"signal_ignored_not_in_condition\"\n and effect ~= \"signal_ignored_matcher_failed\")\n\n if outcome.resume then\n -- Close suspension via the standard path (same as single-matcher\n -- resume below). 
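 -- (The re-queue below uses the library-wide composite score\n -- score = created_at - priority * 1e12. Illustrative numbers: priority 2\n -- with created_at 1700000000000 scores -300000000000; a one-step priority\n -- bump outweighs any realistic created_at spread, since 1e12 ms is about\n -- 31 years, and earlier created_at breaks ties within a priority band.)\n --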
Composite-scoped cleanup follows \u{a7}3.1.1.\n local lp2 = core.lifecycle_phase\n if lp2 == \"suspended\" then\n local es, br, bd, ps\n if A.resume_delay_ms > 0 then\n es = \"not_eligible_until_time\"\n br = \"waiting_for_resume_delay\"\n bd = \"resume delay \" .. A.resume_delay_ms .. \"ms after signal \" .. A.signal_name\n ps = \"delayed\"\n else\n es = \"eligible_now\"\n br = \"waiting_for_worker\"\n bd = \"\"\n ps = \"waiting\"\n end\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", es,\n \"blocking_reason\", br,\n \"blocking_detail\", bd,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", ps,\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n local priority = tonumber(core.priority or \"0\")\n local created_at_exec = tonumber(core.created_at or \"0\")\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n if A.resume_delay_ms > 0 then\n redis.call(\"ZADD\", K.delayed_zset,\n now_ms + A.resume_delay_ms, A.execution_id)\n else\n redis.call(\"ZADD\", K.eligible_zset,\n 0 - (priority * 1000000000000) + created_at_exec,\n A.execution_id)\n end\n end\n\n redis.call(\"HSET\", K.wp_condition,\n \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"satisfied\")\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\")\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n redis.call(\"HSET\", K.suspension_current,\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\",\n \"closer_signal_id\", outcome.closer or A.signal_id,\n \"all_satisfier_signals\", outcome.all_satisfiers_json or \"[]\")\n end\n redis.call(\"ZREM\", K.suspension_timeout_zset, A.execution_id)\n -- RFC-014 \u{a7}3.1.1 cleanup owner: deliver_signal close path.\n composite_cleanup(satisfied_set_key, member_map_key)\n -- RFC-014 Pattern 3: close any OTHER waitpoints owned by this\n -- suspension (the one the signal arrived on is already closed\n -- via K.waitpoint_hash above). Reread susp to pick up the\n -- additional_waitpoints_json that was seeded at suspend-time.\n local susp_after = redis.call(\"HGETALL\", K.suspension_current)\n if #susp_after > 0 then\n local susp2 = hgetall_to_table(susp_after)\n close_additional_waitpoints(\n K.suspension_current,\n susp2.additional_waitpoints_json or \"\",\n { \"state\", \"closed\",\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\" },\n { \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"satisfied\" })\n end\n end\n\n -- Record signal hash observed_effect + waitpoint counters + return.\n redis.call(\"HSET\", K.signal_hash, \"observed_effect\", effect)\n redis.call(\"HINCRBY\", K.waitpoint_hash, \"signal_count\", 1)\n if matched then\n redis.call(\"HINCRBY\", K.waitpoint_hash, \"matched_signal_count\", 1)\n end\n redis.call(\"HSET\", K.waitpoint_hash, \"last_signal_at\", tostring(now_ms))\n if redis.call(\"EXISTS\", K.suspension_current) == 1 and not outcome.resume then\n redis.call(\"HSET\", K.suspension_current, \"last_signal_at\", tostring(now_ms))\n end\n -- RFC-019 Stage B / #310: partition-level signal-delivery aggregate\n -- stream. 
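 -- Consumer sketch (COUNT / BLOCK values are illustrative; the stream key\n -- is whatever the caller bound to KEYS[15]):\n --   XREAD COUNT 64 BLOCK 5000 STREAMS <partition_signal_delivery_stream> $\n --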
`subscribe_signal_delivery` XREAD BLOCKs this key.\n redis.call(\"XADD\", K.partition_signal_delivery_stream, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"signal_id\", A.signal_id,\n \"execution_id\", A.execution_id,\n \"waitpoint_id\", A.waitpoint_id,\n \"source_identity\", A.source_identity,\n \"effect\", effect,\n \"delivered_at_ms\", tostring(now_ms))\n return ok(A.signal_id, effect)\n end\n\n local total = tonumber(wp_cond.total_matchers or \"0\")\n for i = 0, total - 1 do\n local sat_key = \"matcher:\" .. i .. \":satisfied\"\n local name_key = \"matcher:\" .. i .. \":name\"\n if wp_cond[sat_key] == \"0\" then\n local matcher_name = wp_cond[name_key] or \"\"\n if matcher_name == \"\" or matcher_name == A.signal_name then\n -- Mark matcher as satisfied\n redis.call(\"HSET\", K.wp_condition,\n sat_key, \"1\",\n \"matcher:\" .. i .. \":signal_id\", A.signal_id)\n matched = true\n local new_sat = tonumber(wp_cond.satisfied_count or \"0\") + 1\n redis.call(\"HSET\", K.wp_condition, \"satisfied_count\", tostring(new_sat))\n\n -- Check if overall condition is satisfied\n local mode = wp_cond.match_mode or \"any\"\n local min_count = tonumber(wp_cond.minimum_signal_count or \"1\")\n local resume = false\n if mode == \"any\" then\n resume = (new_sat >= min_count)\n elseif mode == \"all\" then\n resume = (new_sat >= total)\n else\n -- count(n) mode\n resume = (new_sat >= min_count)\n end\n\n if resume then\n effect = \"resume_condition_satisfied\"\n\n -- OOM-SAFE WRITE ORDERING (per RFC-010 \u{a7}4.8b):\n -- exec_core HSET is the \"point of no return\" \u{2014} write it FIRST.\n -- If OOM kills after exec_core but before closing sub-objects,\n -- execution is runnable (correct) with stale suspension/waitpoint\n -- records (generalized index reconciler catches this).\n\n -- 9a. Transition execution: suspended -> runnable (WRITE FIRST)\n -- Resume continues the SAME attempt (no new attempt created).\n if lp == \"suspended\" then\n local es, br, bd, ps\n if A.resume_delay_ms > 0 then\n es = \"not_eligible_until_time\"\n br = \"waiting_for_resume_delay\"\n bd = \"resume delay \" .. A.resume_delay_ms .. \"ms after signal \" .. A.signal_name\n ps = \"delayed\"\n else\n es = \"eligible_now\"\n br = \"waiting_for_worker\"\n bd = \"\"\n ps = \"waiting\"\n end\n\n -- ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", es,\n \"blocking_reason\", br,\n \"blocking_detail\", bd,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"attempt_interrupted\",\n \"public_state\", ps,\n \"current_suspension_id\", \"\",\n \"current_waitpoint_id\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 9b. Update scheduling indexes\n local priority = tonumber(core.priority or \"0\")\n local created_at_exec = tonumber(core.created_at or \"0\")\n redis.call(\"ZREM\", K.suspended_zset, A.execution_id)\n if A.resume_delay_ms > 0 then\n redis.call(\"ZADD\", K.delayed_zset,\n now_ms + A.resume_delay_ms, A.execution_id)\n else\n redis.call(\"ZADD\", K.eligible_zset,\n 0 - (priority * 1000000000000) + created_at_exec,\n A.execution_id)\n end\n end\n\n -- 9c. Close waitpoint condition (after exec_core is safe)\n redis.call(\"HSET\", K.wp_condition,\n \"closed\", \"1\",\n \"closed_at\", tostring(now_ms),\n \"closed_reason\", \"satisfied\")\n\n -- 9d. 
Close waitpoint record\n redis.call(\"HSET\", K.waitpoint_hash,\n \"state\", \"closed\",\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\")\n\n -- 9e. Close suspension record\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n redis.call(\"HSET\", K.suspension_current,\n \"satisfied_at\", tostring(now_ms),\n \"closed_at\", tostring(now_ms),\n \"close_reason\", \"resumed\")\n end\n\n -- 9f. Remove from suspension timeout index\n redis.call(\"ZREM\", K.suspension_timeout_zset, A.execution_id)\n end\n break\n end\n end\n end\n\n if not matched then\n effect = \"no_op\"\n end\n\n -- 10. Record observed effect on signal\n redis.call(\"HSET\", K.signal_hash, \"observed_effect\", effect)\n\n -- 11. Update waitpoint signal counts\n redis.call(\"HINCRBY\", K.waitpoint_hash, \"signal_count\", 1)\n if matched then\n redis.call(\"HINCRBY\", K.waitpoint_hash, \"matched_signal_count\", 1)\n end\n redis.call(\"HSET\", K.waitpoint_hash, \"last_signal_at\", tostring(now_ms))\n\n -- 12. Update suspension signal summary\n if redis.call(\"EXISTS\", K.suspension_current) == 1 then\n redis.call(\"HSET\", K.suspension_current, \"last_signal_at\", tostring(now_ms))\n end\n\n -- RFC-019 Stage B / #310: partition-level signal-delivery aggregate\n -- stream. `subscribe_signal_delivery` XREAD BLOCKs this key.\n redis.call(\"XADD\", K.partition_signal_delivery_stream, \"MAXLEN\", \"~\", \"1000\", \"*\",\n \"signal_id\", A.signal_id,\n \"execution_id\", A.execution_id,\n \"waitpoint_id\", A.waitpoint_id,\n \"source_identity\", A.source_identity,\n \"effect\", effect,\n \"delivered_at_ms\", tostring(now_ms))\n\n return ok(A.signal_id, effect)\nend)\n\n---------------------------------------------------------------------------\n-- #18 ff_buffer_signal_for_pending_waitpoint\n--\n-- Accept signal for a pending (not yet committed) waitpoint.\n-- Records the signal but does NOT evaluate resume conditions.\n-- When suspend_execution activates the waitpoint, buffered signals\n-- are replayed through the full evaluation path.\n--\n-- KEYS (9): exec_core, wp_condition, wp_signals_stream,\n-- exec_signals_zset, signal_hash, signal_payload,\n-- idem_key, waitpoint_hash, hmac_secrets\n-- ARGV (18): same as ff_deliver_signal (17 + waitpoint_token)\n---------------------------------------------------------------------------\nredis.register_function(\'ff_buffer_signal_for_pending_waitpoint\', function(keys, args)\n local K = {\n core_key = keys[1],\n wp_condition = keys[2],\n wp_signals_stream = keys[3],\n exec_signals_zset = keys[4],\n signal_hash = keys[5],\n signal_payload = keys[6],\n idem_key = keys[7],\n waitpoint_hash = keys[8],\n hmac_secrets = keys[9],\n }\n\n local A = {\n signal_id = args[1],\n execution_id = args[2],\n waitpoint_id = args[3],\n signal_name = args[4],\n signal_category = args[5],\n source_type = args[6],\n source_identity = args[7],\n payload = args[8] or \"\",\n payload_encoding = args[9] or \"json\",\n idempotency_key = args[10] or \"\",\n correlation_id = args[11] or \"\",\n target_scope = args[12] or \"waitpoint\",\n created_at = args[13] or \"\",\n dedup_ttl_ms = tonumber(args[14] or \"86400000\"),\n signal_maxlen = tonumber(args[16] or \"1000\"),\n max_signals = tonumber(args[17] or \"10000\"),\n waitpoint_token = args[18] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. 
Validate execution exists\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then\n return err(\"execution_not_found\")\n end\n\n -- 1a. Validate HMAC token against the pending waitpoint\'s mint-time binding.\n local wp_for_auth = hgetall_to_table(redis.call(\"HGETALL\", K.waitpoint_hash))\n if not wp_for_auth.created_at then\n return err(\"waitpoint_not_found\")\n end\n local token_err = validate_waitpoint_token(\n K.hmac_secrets, A.waitpoint_token,\n A.waitpoint_id, wp_for_auth.waitpoint_key or \"\",\n tonumber(wp_for_auth.created_at) or 0, now_ms)\n if token_err then\n -- Operator-visible counter mirroring ff_deliver_signal. See comment\n -- there for rationale.\n redis.call(\"HSET\", K.core_key, \"last_hmac_validation_failed_at\", tostring(now_ms))\n return err(token_err)\n end\n\n -- 1b. Gate on waitpoint state. ff_deliver_signal blocks replay-after-close\n -- via wp_condition.closed, but wp_condition is not initialized for pending\n -- waitpoints \u{2014} we must check wp.state directly. Without this, a caller\n -- holding a valid token for a pending waitpoint that has since been\n -- closed/expired can keep appending buffered signals that will replay\n -- when suspend_execution(use_pending=1) later activates the waitpoint.\n if wp_for_auth.state == \"closed\" or wp_for_auth.state == \"expired\" then\n return err(\"waitpoint_closed\")\n end\n\n -- 2. Signal count limit\n if A.max_signals > 0 then\n local current_count = redis.call(\"ZCARD\", K.exec_signals_zset)\n if current_count >= A.max_signals then\n return err(\"signal_limit_exceeded\")\n end\n end\n\n -- 3. Idempotency check\n -- Guard: (A.dedup_ttl_ms or 0) handles nil from tonumber(\"\") safely.\n local dedup_ms = A.dedup_ttl_ms or 0\n if A.idempotency_key ~= \"\" and dedup_ms > 0 then\n local existing = redis.call(\"GET\", K.idem_key)\n if existing then\n return ok_duplicate(existing)\n end\n redis.call(\"SET\", K.idem_key, A.signal_id,\n \"PX\", dedup_ms, \"NX\")\n end\n\n -- 4. Record signal hash with tentative effect\n local created_at = A.created_at ~= \"\" and A.created_at or tostring(now_ms)\n redis.call(\"HSET\", K.signal_hash,\n \"signal_id\", A.signal_id,\n \"target_execution_id\", A.execution_id,\n \"target_waitpoint_id\", A.waitpoint_id,\n \"target_scope\", A.target_scope,\n \"signal_name\", A.signal_name,\n \"signal_category\", A.signal_category,\n \"source_type\", A.source_type,\n \"source_identity\", A.source_identity,\n \"correlation_id\", A.correlation_id,\n \"idempotency_key\", A.idempotency_key,\n \"created_at\", created_at,\n \"accepted_at\", tostring(now_ms),\n \"matched_waitpoint_id\", A.waitpoint_id,\n \"payload_encoding\", A.payload_encoding,\n \"observed_effect\", \"buffered_for_pending_waitpoint\")\n\n -- 4b. Store payload separately if present\n if A.payload ~= \"\" then\n redis.call(\"SET\", K.signal_payload, A.payload)\n end\n\n -- 5. 
Append to per-waitpoint signal stream + per-execution signal index\n -- These are recorded so suspend_execution can XRANGE and replay them.\n redis.call(\"XADD\", K.wp_signals_stream, \"MAXLEN\", \"~\",\n tostring(A.signal_maxlen), \"*\",\n \"signal_id\", A.signal_id,\n \"signal_name\", A.signal_name,\n \"signal_category\", A.signal_category,\n \"source_type\", A.source_type,\n \"source_identity\", A.source_identity,\n \"matched\", \"0\",\n \"accepted_at\", tostring(now_ms))\n redis.call(\"ZADD\", K.exec_signals_zset, now_ms, A.signal_id)\n\n -- No resume condition evaluation \u{2014} waitpoint is pending, not active.\n\n return ok(A.signal_id, \"buffered_for_pending_waitpoint\")\nend)\n\n---------------------------------------------------------------------------\n-- #2 ff_claim_resumed_execution\n--\n-- Consume claim-grant, resume existing attempt (interrupted -> started),\n-- create new lease bound to SAME attempt. Does NOT create a new attempt.\n--\n-- KEYS (11): exec_core, claim_grant, eligible_zset, lease_expiry_zset,\n-- worker_leases, existing_attempt_hash, lease_current,\n-- lease_history, active_index, attempt_timeout_zset,\n-- execution_deadline_zset\n-- ARGV (8): execution_id, worker_id, worker_instance_id, lane,\n-- capability_snapshot_hash, lease_id, lease_ttl_ms,\n-- remaining_attempt_timeout_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_claim_resumed_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n claim_grant_key = keys[2],\n eligible_zset = keys[3],\n lease_expiry_key = keys[4],\n worker_leases_key = keys[5],\n attempt_hash = keys[6],\n lease_current_key = keys[7],\n lease_history_key = keys[8],\n active_index_key = keys[9],\n attempt_timeout_key = keys[10],\n execution_deadline_key = keys[11],\n }\n\n local lease_ttl_n = require_number(args[7], \"lease_ttl_ms\")\n if type(lease_ttl_n) == \"table\" then return lease_ttl_n end\n\n local A = {\n execution_id = args[1],\n worker_id = args[2],\n worker_instance_id = args[3],\n lane = args[4],\n capability_snapshot_hash = args[5] or \"\",\n lease_id = args[6],\n lease_ttl_ms = lease_ttl_n,\n remaining_attempt_timeout_ms = args[8] or \"\",\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Validate execution exists\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- 2. Must be runnable\n if core.lifecycle_phase ~= \"runnable\" then\n return err(\"execution_not_leaseable\")\n end\n\n -- 3. Must be attempt_interrupted (resumed after suspension/delay)\n if core.attempt_state ~= \"attempt_interrupted\" then\n return err(\"not_a_resumed_execution\")\n end\n\n -- 4. Validate claim grant\n local grant_raw = redis.call(\"HGETALL\", K.claim_grant_key)\n if #grant_raw == 0 then\n return err(\"invalid_claim_grant\")\n end\n local grant = hgetall_to_table(grant_raw)\n\n -- Validate grant matches (grant key is execution-scoped, so only check worker_id)\n if grant.worker_id ~= A.worker_id then\n return err(\"invalid_claim_grant\")\n end\n\n -- Check grant expiry\n if is_set(grant.grant_expires_at) and tonumber(grant.grant_expires_at) < now_ms then\n redis.call(\"DEL\", K.claim_grant_key)\n return err(\"claim_grant_expired\")\n end\n\n -- Consume grant (DEL)\n redis.call(\"DEL\", K.claim_grant_key)\n\n -- 5. 
Resume existing attempt: attempt_interrupted -> started\n -- Same attempt continues \u{2014} no new attempt_index.\n local att_idx = core.current_attempt_index\n local att_id = core.current_attempt_id\n local epoch = tonumber(core.current_lease_epoch or \"0\") + 1\n local expires_at = now_ms + A.lease_ttl_ms\n local renewal_deadline = now_ms + math.floor(A.lease_ttl_ms * 2 / 3)\n\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"started\",\n \"resumed_at\", tostring(now_ms),\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(epoch),\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"suspended_at\", \"\",\n \"suspension_id\", \"\")\n\n -- 6. Create new lease bound to same attempt\n redis.call(\"DEL\", K.lease_current_key)\n redis.call(\"HSET\", K.lease_current_key,\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(epoch),\n \"execution_id\", A.execution_id,\n \"attempt_id\", att_id,\n \"worker_id\", A.worker_id,\n \"worker_instance_id\", A.worker_instance_id,\n \"acquired_at\", tostring(now_ms),\n \"expires_at\", tostring(expires_at),\n \"last_renewed_at\", tostring(now_ms),\n \"renewal_deadline\", tostring(renewal_deadline))\n\n -- 7. Update exec_core \u{2014} ALL 7 state vector dimensions\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"active\",\n \"ownership_state\", \"leased\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"running_attempt\",\n \"public_state\", \"active\",\n \"current_lease_id\", A.lease_id,\n \"current_lease_epoch\", tostring(epoch),\n \"current_worker_id\", A.worker_id,\n \"current_worker_instance_id\", A.worker_instance_id,\n \"current_lane\", A.lane,\n \"lease_acquired_at\", tostring(now_ms),\n \"lease_expires_at\", tostring(expires_at),\n \"lease_last_renewed_at\", tostring(now_ms),\n \"lease_renewal_deadline\", tostring(renewal_deadline),\n \"lease_expired_at\", \"\",\n \"lease_revoked_at\", \"\",\n \"lease_revoke_reason\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- 8. Update indexes\n redis.call(\"ZREM\", K.eligible_zset, A.execution_id)\n redis.call(\"ZADD\", K.lease_expiry_key, expires_at, A.execution_id)\n redis.call(\"SADD\", K.worker_leases_key, A.execution_id)\n redis.call(\"ZADD\", K.active_index_key, expires_at, A.execution_id)\n\n -- 9. ZADD attempt_timeout with remaining timeout\n if is_set(A.remaining_attempt_timeout_ms) then\n local remaining = tonumber(A.remaining_attempt_timeout_ms)\n if remaining > 0 then\n redis.call(\"ZADD\", K.attempt_timeout_key,\n now_ms + remaining, A.execution_id)\n end\n end\n\n -- 10. 
Lease history event\n redis.call(\"XADD\", K.lease_history_key, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"acquired\",\n \"lease_id\", A.lease_id,\n \"lease_epoch\", tostring(epoch),\n \"attempt_index\", att_idx,\n \"attempt_id\", att_id,\n \"worker_id\", A.worker_id,\n \"reason\", \"claim_resumed\",\n \"ts\", tostring(now_ms))\n\n return ok(A.lease_id, tostring(epoch), tostring(expires_at),\n att_id, att_idx, \"resumed\")\nend)\n\n\n-- source: lua/stream.lua\n-- FlowFabric stream append function\n-- Reference: RFC-006 (Stream), RFC-010 \u{a7}4.1 (#20), RFC-015 (durability modes)\n--\n-- Depends on helpers: ok, err, is_set\n\n---------------------------------------------------------------------------\n-- RFC-015 JSON Merge Patch (RFC 7396) applier.\n--\n-- `target` and `patch` are Lua tables produced by `cjson.decode`. The\n-- applier mutates `target` in place per RFC 7396:\n-- * `patch[k] == cjson.null` \u{2192} delete `target[k]`\n-- * `patch[k]` is a table \u{2192} recurse\n-- * otherwise \u{2192} `target[k] = patch[k]`\n--\n-- `depth` is a recursion guard (RFC-015 \u{a7}3.3 step 4: bounded depth 16).\n-- A patch whose root is not a JSON object (array / scalar / null) is\n-- rejected by the caller per RFC 7396 \u{2014} only object-rooted patches are\n-- valid merge-patch documents.\n---------------------------------------------------------------------------\nlocal function apply_merge_patch(target, patch, depth)\n if depth > 16 then\n return nil, \"patch_depth_exceeded\"\n end\n for k, v in pairs(patch) do\n if v == cjson.null then\n target[k] = nil\n elseif type(v) == \"table\" and not v[1] and next(v) ~= nil then\n -- Object-valued \u{2192} recurse. `v[1]` heuristic distinguishes arrays\n -- (numeric-indexed from 1) from objects. RFC 7396 replaces\n -- arrays wholesale (no recursion into array members).\n if type(target[k]) ~= \"table\" or (target[k][1] ~= nil) then\n target[k] = {}\n end\n local _, ferr = apply_merge_patch(target[k], v, depth + 1)\n if ferr ~= nil then\n return nil, ferr\n end\n else\n -- Scalar, array, or empty-table leaf \u{2192} replace wholesale.\n target[k] = v\n end\n end\n return target, nil\nend\n\n---------------------------------------------------------------------------\n-- RFC-015 null-sentinel rewrite. Walks `doc` and replaces scalar string\n-- leaves equal to \"__ff_null__\" with cjson.null. Applied post-merge so\n-- callers can encode \"set this leaf to JSON null\" inside an RFC 7396\n-- merge patch (where `null` would otherwise mean \"delete key\").\n---------------------------------------------------------------------------\nlocal FF_NULL_SENTINEL = \"__ff_null__\"\nlocal function rewrite_null_sentinel(doc, depth)\n if depth > 32 then return end\n if type(doc) ~= \"table\" then return end\n for k, v in pairs(doc) do\n if type(v) == \"string\" and v == FF_NULL_SENTINEL then\n doc[k] = cjson.null\n elseif type(v) == \"table\" then\n rewrite_null_sentinel(v, depth + 1)\n end\n end\nend\n\n-- Reject non-object root documents / patches (RFC 7396 requires an\n-- object at the root). 
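--\n-- Worked example of the merge-patch + null-sentinel passes above (document\n-- values are hypothetical):\n--   target        {\"title\":\"draft\",\"meta\":{\"owner\":\"a\",\"tag\":\"x\"}}\n--   patch         {\"meta\":{\"tag\":null},\"title\":\"__ff_null__\"}\n--   after merge   {\"title\":\"__ff_null__\",\"meta\":{\"owner\":\"a\"}}\n--   after rewrite {\"title\":null,\"meta\":{\"owner\":\"a\"}}\n--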
A bare scalar / array / null patch means\n-- \"replace the target wholesale\" under RFC 7396, but we scope v0.6\n-- to the object-root subset per RFC-015 \u{a7}3.3 step 2.\nlocal function is_json_object(tbl)\n if type(tbl) ~= \"table\" then return false end\n -- Empty table \u{2014} treat as an object by convention (cjson would have\n -- decoded `{}` to `cjson.empty_array` in encode-array mode, but\n -- stream.lua uses decode defaults).\n if next(tbl) == nil then return true end\n -- Any numeric-indexed [1] entry \u{2192} array, reject.\n return tbl[1] == nil\nend\n\n---------------------------------------------------------------------------\n-- #20 ff_append_frame\n--\n-- Append a frame to the attempt-scoped output stream. Highest-throughput\n-- function \u{2014} called once per token during LLM streaming. Uses lite lease\n-- validation (HMGET, not HGETALL) for minimal overhead. Class B operation.\n--\n-- KEYS (4): exec_core, stream_data, stream_meta, stream_summary\n-- ARGV (19): execution_id, attempt_index, lease_id, lease_epoch,\n-- frame_type, ts, payload, encoding, correlation_id,\n-- source, retention_maxlen, attempt_id, max_payload_bytes,\n-- stream_mode, patch_kind, ttl_ms,\n-- maxlen_floor, maxlen_ceiling, ema_alpha\n--\n-- `stream_mode` (ARGV 14, RFC-015): \"\" / \"durable\" \u{2192} StreamMode::Durable\n-- (default). \"summary\" \u{2192} DurableSummary (requires `patch_kind` ARGV 15).\n-- \"best_effort\" \u{2192} BestEffortLive (uses `ttl_ms` ARGV 16 for PEXPIRE and\n-- ARGV 17-19 for dynamic MAXLEN sizing per RFC-015 \u{a7}4.2).\n---------------------------------------------------------------------------\nredis.register_function(\'ff_append_frame\', function(keys, args)\n local K = {\n core_key = keys[1],\n stream_key = keys[2],\n stream_meta = keys[3],\n stream_summary = keys[4] or \"\",\n }\n\n local A = {\n execution_id = args[1],\n attempt_index = args[2],\n lease_id = args[3],\n lease_epoch = args[4],\n frame_type = args[5],\n ts = args[6] or \"\",\n payload = args[7] or \"\",\n encoding = args[8] or \"utf8\",\n correlation_id = args[9] or \"\",\n source = args[10] or \"worker\",\n retention_maxlen = tonumber(args[11] or \"0\"),\n attempt_id = args[12] or \"\",\n max_payload_bytes = tonumber(args[13] or \"65536\"),\n stream_mode = args[14] or \"\", -- RFC-015\n patch_kind = args[15] or \"\", -- RFC-015\n ttl_ms = tonumber(args[16] or \"0\"),\n -- RFC-015 \u{a7}4.2 dynamic MAXLEN knobs. Zero / missing \u{2192} fall back to\n -- the RFC-final defaults.\n maxlen_floor = tonumber(args[17] or \"0\"),\n maxlen_ceiling = tonumber(args[18] or \"0\"),\n ema_alpha = tonumber(args[19] or \"0\"),\n }\n if A.maxlen_floor == nil or A.maxlen_floor <= 0 then A.maxlen_floor = 64 end\n if A.maxlen_ceiling == nil or A.maxlen_ceiling <= 0 then A.maxlen_ceiling = 16384 end\n if A.ema_alpha == nil or A.ema_alpha <= 0 or A.ema_alpha > 1.0 then\n A.ema_alpha = 0.2\n end\n if A.maxlen_ceiling < A.maxlen_floor then\n A.maxlen_ceiling = A.maxlen_floor\n end\n\n -- Normalise empty / unknown mode to \"durable\" for pre-RFC-015 parity.\n if A.stream_mode == \"\" then A.stream_mode = \"durable\" end\n\n -- 1. Payload size guard (v1 default: 64KB)\n if #A.payload > A.max_payload_bytes then\n return err(\"retention_limit_exceeded\")\n end\n\n -- 2. 
Lite lease validation via HMGET (Class B \u{2014} no full HGETALL)\n local core = redis.call(\"HMGET\", K.core_key,\n \"current_attempt_index\", -- [1]\n \"current_lease_id\", -- [2]\n \"current_lease_epoch\", -- [3]\n \"lease_expires_at\", -- [4]\n \"lifecycle_phase\", -- [5]\n \"ownership_state\") -- [6]\n\n if core[5] ~= \"active\" then\n return err(\"stream_closed\")\n end\n\n if core[6] == \"lease_expired_reclaimable\" or core[6] == \"lease_revoked\" then\n return err(\"stale_owner_cannot_append\")\n end\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n if tonumber(core[4] or \"0\") <= now_ms then\n return err(\"stale_owner_cannot_append\")\n end\n\n if tostring(core[1]) ~= A.attempt_index then\n return err(\"stale_owner_cannot_append\")\n end\n\n if core[2] ~= A.lease_id or tostring(core[3]) ~= A.lease_epoch then\n return err(\"stale_owner_cannot_append\")\n end\n\n -- 3. Lazy-create stream metadata on first append\n if redis.call(\"EXISTS\", K.stream_meta) == 0 then\n redis.call(\"HSET\", K.stream_meta,\n \"stream_id\", A.execution_id .. \":\" .. A.attempt_index,\n \"execution_id\", A.execution_id,\n \"attempt_id\", A.attempt_id,\n \"attempt_index\", A.attempt_index,\n \"created_at\", tostring(now_ms),\n \"closed_at\", \"\",\n \"closed_reason\", \"\",\n \"durability_mode\", \"durable_full\",\n \"retention_maxlen\", tostring(A.retention_maxlen),\n \"last_sequence\", \"\",\n \"frame_count\", \"0\",\n \"total_bytes\", \"0\",\n \"last_frame_at\", \"\",\n \"has_durable_frame\", \"0\") -- RFC-015 \u{a7}4.1 \u{2014} PEXPIRE gate\n end\n\n -- 4. Check stream not closed\n local closed = redis.call(\"HGET\", K.stream_meta, \"closed_at\")\n if is_set(closed) then\n return err(\"stream_closed\")\n end\n\n -- 5. RFC-015 \u{a7}3.3 \u{2014} DurableSummary delta apply. Done BEFORE XADD so\n -- the stream entry carries the post-merge `summary_version`.\n local summary_version = \"\"\n if A.stream_mode == \"summary\" then\n if K.stream_summary == \"\" then\n return err(\"invalid_input\", \"summary mode requires stream_summary key\")\n end\n if A.patch_kind ~= \"json-merge-patch\" and A.patch_kind ~= \"\" then\n return err(\"invalid_input\", \"unsupported patch_kind\")\n end\n\n -- Parse patch payload. RFC 7396 requires an object at the root.\n local ok_decode, patch = pcall(cjson.decode, A.payload)\n if not ok_decode then\n return err(\"invalid_input\", \"patch payload is not valid JSON\")\n end\n if not is_json_object(patch) then\n return err(\"invalid_input\", \"patch must be a JSON object\")\n end\n\n -- Load current summary document (or start from {}).\n local cur_doc_raw = redis.call(\"HGET\", K.stream_summary, \"document\")\n local doc\n if cur_doc_raw and cur_doc_raw ~= \"\" then\n local ok_cur, parsed = pcall(cjson.decode, cur_doc_raw)\n if not ok_cur then\n return err(\"corruption\", \"stored summary document not decodeable\")\n end\n doc = parsed\n else\n doc = {}\n end\n\n -- Apply merge patch, then rewrite the null sentinel.\n local _, apply_err = apply_merge_patch(doc, patch, 0)\n if apply_err ~= nil then\n return err(\"invalid_input\", apply_err)\n end\n rewrite_null_sentinel(doc, 0)\n\n -- Encode + persist. 
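 -- Worked example (informative): with a stored document of\n -- {\"status\": \"running\", \"tmp\": \"x\"} and a patch of\n -- {\"status\": \"done\", \"tmp\": null, \"note\": \"__ff_null__\"}, the merge sets\n -- status, deletes tmp, and the sentinel rewrite then persists\n -- {\"status\": \"done\", \"note\": null}.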
Use cjson.encode (not cjson_safe) \u{2014} input\n -- validation above means we should never encode unencodable.\n local encoded = cjson.encode(doc)\n local new_version = redis.call(\"HINCRBY\", K.stream_summary, \"version\", 1)\n redis.call(\"HSET\", K.stream_summary,\n \"document\", encoded,\n \"patch_kind\", \"json-merge-patch\",\n \"last_updated_ms\", tostring(now_ms))\n -- Set first_applied_ms on the very first delta only.\n if new_version == 1 then\n redis.call(\"HSET\", K.stream_summary, \"first_applied_ms\", tostring(now_ms))\n end\n summary_version = tostring(new_version)\n end\n\n -- 6. Append frame via XADD with RFC-015 mode fields.\n local ts = A.ts ~= \"\" and A.ts or tostring(now_ms)\n local xadd_args = {\n K.stream_key, \"*\",\n \"frame_type\", A.frame_type,\n \"ts\", ts,\n \"payload\", A.payload,\n \"encoding\", A.encoding,\n \"source\", A.source,\n \"mode\", A.stream_mode,\n }\n if A.correlation_id ~= \"\" then\n xadd_args[#xadd_args + 1] = \"correlation_id\"\n xadd_args[#xadd_args + 1] = A.correlation_id\n end\n if summary_version ~= \"\" then\n xadd_args[#xadd_args + 1] = \"summary_version\"\n xadd_args[#xadd_args + 1] = summary_version\n end\n if A.stream_mode == \"best_effort\" and A.ttl_ms > 0 then\n xadd_args[#xadd_args + 1] = \"ttl_ms\"\n xadd_args[#xadd_args + 1] = tostring(A.ttl_ms)\n end\n\n local entry_id = redis.call(\"XADD\", unpack(xadd_args))\n\n -- 7. Update stream metadata.\n --\n -- `frame_count` is the LIFETIME append counter \u{2014} NOT the retained\n -- entry count (XTRIM below can prune without decrementing).\n local frame_count = redis.call(\"HINCRBY\", K.stream_meta, \"frame_count\", 1)\n redis.call(\"HINCRBY\", K.stream_meta, \"total_bytes\", #A.payload)\n redis.call(\"HSET\", K.stream_meta,\n \"last_sequence\", entry_id,\n \"last_frame_at\", tostring(now_ms))\n\n -- RFC-015 \u{a7}4.1: track `has_durable_frame` for the PEXPIRE gate. Any\n -- Durable or DurableSummary append flips the flag (and PERSISTs the\n -- stream key if a prior BestEffortLive had set a TTL).\n if A.stream_mode == \"durable\" or A.stream_mode == \"summary\" then\n local prior = redis.call(\"HGET\", K.stream_meta, \"has_durable_frame\")\n if prior ~= \"1\" then\n redis.call(\"HSET\", K.stream_meta, \"has_durable_frame\", \"1\")\n -- PERSIST any previously-set best-effort TTL on the stream key.\n redis.call(\"PERSIST\", K.stream_key)\n end\n end\n\n -- 8. Apply retention / TTL policy.\n --\n -- RFC-015 \u{a7}3.5 \u{2014} DurableSummary compacts to a ~64-entry live-tail\n -- window. \u{a7}4.1/\u{a7}4.2 \u{2014} BestEffortLive trims to a derived K and may\n -- PEXPIRE the stream key.\n -- Plain Durable keeps the pre-RFC-015 default (10_000 cap w/ `~`).\n if A.stream_mode == \"summary\" then\n redis.call(\"XTRIM\", K.stream_key, \"MAXLEN\", \"~\", 64)\n elseif A.stream_mode == \"best_effort\" then\n -- RFC-015 \u{a7}4.2: dynamic MAXLEN sizing from an EMA of append rate.\n --\n -- K = clamp(ceil(ema_rate_hz * ttl_ms / 1000) * 2, floor, ceiling)\n --\n -- EMA state persists on the per-attempt stream_meta Hash so the\n -- estimator survives across Lua invocations. 
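 -- Numeric example (informative): with the defaults above (floor 64,\n -- ceiling 16384), ttl_ms = 30000 and a smoothed rate of 20 appends/s,\n -- K = ceil(20 * 30 * 2) = 1200; a 0.5/s trickle clamps up to the\n -- 64-entry floor and a 1 kHz burst clamps down to the 16384 ceiling.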
`ema_rate_hz` is Hz;\n -- `last_append_ts_ms` is the wall-clock of the prior XADD.\n local meta = redis.call(\"HMGET\", K.stream_meta,\n \"ema_rate_hz\", \"last_append_ts_ms\")\n local prev_rate_hz = tonumber(meta[1] or \"\")\n local prev_ts_ms = tonumber(meta[2] or \"\")\n\n local rate_hz\n if prev_rate_hz == nil or prev_ts_ms == nil then\n -- First best-effort append: seed rate so the initial K == floor.\n -- floor = ceil(seed * ttl_s * 2) \u{2192} seed = floor / (ttl_s * 2).\n -- Guard against ttl_ms == 0 to avoid /0.\n local ttl_s = math.max(A.ttl_ms, 1) / 1000.0\n rate_hz = A.maxlen_floor / (ttl_s * 2.0)\n else\n local dt_ms = now_ms - prev_ts_ms\n if dt_ms <= 0 then dt_ms = 1 end\n local inst_hz = 1000.0 / dt_ms\n rate_hz = A.ema_alpha * inst_hz + (1.0 - A.ema_alpha) * prev_rate_hz\n end\n\n local ttl_s_for_k = math.max(A.ttl_ms, 1) / 1000.0\n local raw_k = math.ceil(rate_hz * ttl_s_for_k * 2.0)\n if raw_k < A.maxlen_floor then raw_k = A.maxlen_floor end\n if raw_k > A.maxlen_ceiling then raw_k = A.maxlen_ceiling end\n local target_maxlen = raw_k\n\n redis.call(\"XTRIM\", K.stream_key, \"MAXLEN\", \"~\", target_maxlen)\n\n redis.call(\"HSET\", K.stream_meta,\n \"ema_rate_hz\", tostring(rate_hz),\n \"last_append_ts_ms\", tostring(now_ms),\n \"maxlen_applied_last\", tostring(target_maxlen))\n\n -- PEXPIRE gate: only set a TTL if the stream has NEVER received a\n -- durable frame. Durable content must not be destroyed by a\n -- best-effort TTL refresh.\n local has_dur = redis.call(\"HGET\", K.stream_meta, \"has_durable_frame\")\n if (has_dur ~= \"1\") and A.ttl_ms > 0 then\n redis.call(\"PEXPIRE\", K.stream_key, A.ttl_ms * 2)\n end\n else\n -- Durable (pre-RFC-015 path).\n local maxlen = A.retention_maxlen\n local trim_op\n if maxlen == 0 then\n maxlen = 10000\n trim_op = \"~\"\n else\n trim_op = \"=\"\n end\n redis.call(\"XTRIM\", K.stream_key, \"MAXLEN\", trim_op, maxlen)\n end\n\n -- Success shape: ok(entry_id, frame_count, summary_version). The\n -- third field is empty string for Durable / BestEffortLive and the\n -- post-merge version string for DurableSummary. Pre-RFC-015 callers\n -- that only look at fields [0] / [1] are unaffected.\n return ok(entry_id, tostring(frame_count), summary_version)\nend)\n\n---------------------------------------------------------------------------\n-- ff_read_attempt_stream\n--\n-- Read frames from an attempt-scoped output stream via XRANGE. Non-blocking\n-- (safe in Lua Functions). Cluster-safe: stream_key and stream_meta share\n-- the {p:N} hash tag.\n--\n-- KEYS (2): stream_data, stream_meta\n-- ARGV (4): from_id, to_id, count_limit, visibility\n--\n-- `visibility` (ARGV 4, RFC-015 \u{a7}6.1): \"\" / \"all\" \u{2192} no filter.\n-- \"exclude_best_effort\" \u{2192} drop entries whose XADD `mode` field equals\n-- \"best_effort\". 
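-- Example call shape (keys shown as placeholders; both must share the\n-- same {p:N} hash tag):\n-- FCALL ff_read_attempt_stream 2 <stream_data_key> <stream_meta_key> - + 100 exclude_best_effort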
Pre-RFC-015 entries (no `mode` field) count as\n-- `durable` per the RFC-015 \u{a7}8.1 reader fallback.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_read_attempt_stream\', function(keys, args)\n local stream_key = keys[1]\n local stream_meta = keys[2]\n\n local from_id = args[1] or \"-\"\n local to_id = args[2] or \"+\"\n local count_limit = tonumber(args[3] or \"0\")\n local visibility = args[4] or \"\"\n\n local HARD_CAP = 10000\n if count_limit == nil or count_limit < 1 then\n return err(\"invalid_input\", \"count_limit must be >= 1\")\n end\n if count_limit > HARD_CAP then\n return err(\"invalid_input\", \"count_limit_exceeds_hard_cap\")\n end\n\n local entries = redis.call(\"XRANGE\", stream_key, from_id, to_id,\n \"COUNT\", count_limit)\n\n -- RFC-015 \u{a7}6.1 server-side mode filter.\n if visibility == \"exclude_best_effort\" then\n local filtered = {}\n for i = 1, #entries do\n local entry = entries[i]\n local fields = entry[2]\n -- fields is a flat array of alternating key/value.\n local mode = \"durable\"\n for j = 1, #fields - 1, 2 do\n if fields[j] == \"mode\" then\n mode = fields[j + 1]\n break\n end\n end\n if mode ~= \"best_effort\" then\n filtered[#filtered + 1] = entry\n end\n end\n entries = filtered\n end\n\n local meta = redis.call(\"HMGET\", stream_meta, \"closed_at\", \"closed_reason\")\n local closed_at = meta[1] or \"\"\n local closed_reason = meta[2] or \"\"\n\n return ok(entries, closed_at, closed_reason)\nend)\n\n---------------------------------------------------------------------------\n-- ff_read_summary (RFC-015 \u{a7}6.3)\n--\n-- Read the rolling summary document for an attempt. Non-blocking HGETALL\n-- wrapper. Returns empty fields list when no DurableSummary frame has\n-- ever been appended (the summary Hash is absent).\n--\n-- KEYS (1): stream_summary\n-- ARGV (0)\n---------------------------------------------------------------------------\nredis.register_function(\'ff_read_summary\', function(keys, args)\n local summary_key = keys[1]\n if redis.call(\"EXISTS\", summary_key) == 0 then\n return ok(\"\", \"0\", \"\", \"0\", \"0\")\n end\n local h = redis.call(\"HMGET\", summary_key,\n \"document\", \"version\", \"patch_kind\", \"last_updated_ms\", \"first_applied_ms\")\n return ok(h[1] or \"\", h[2] or \"0\", h[3] or \"\",\n h[4] or \"0\", h[5] or \"0\")\nend)\n\n\n-- source: lua/budget.lua\n-- FlowFabric budget functions\n-- Reference: RFC-008 (Budget), RFC-010 \u{a7}4.3 (#30, #31), \u{a7}4.1 (#29a, #29b)\n--\n-- Depends on helpers: ok, err, is_set, hgetall_to_table\n\n---------------------------------------------------------------------------\n-- ff_create_budget (on {b:M})\n--\n-- Create a new budget policy with hard/soft limits on N dimensions.\n-- Idempotent: if EXISTS budget_def \u{2192} return ok_already_satisfied.\n--\n-- KEYS (5): budget_def, budget_limits, budget_usage, budget_resets_zset,\n-- budget_policies_index\n-- ARGV (variable): budget_id, scope_type, scope_id, enforcement_mode,\n-- on_hard_limit, on_soft_limit, reset_interval_ms, now_ms,\n-- dimension_count, dim_1..dim_N, hard_1..hard_N, soft_1..soft_N\n---------------------------------------------------------------------------\nredis.register_function(\'ff_create_budget\', function(keys, args)\n local K = {\n def_key = keys[1],\n limits_key = keys[2],\n usage_key = keys[3],\n resets_zset = keys[4],\n policies_index = keys[5],\n }\n\n local A = {\n budget_id = args[1],\n scope_type = args[2],\n scope_id = args[3],\n 
enforcement_mode = args[4],\n on_hard_limit = args[5],\n on_soft_limit = args[6],\n reset_interval_ms = args[7],\n now_ms = args[8],\n }\n\n -- Maintain budget_policies_index BEFORE the idempotency guard. SADD is\n -- itself idempotent (no-op on existing members), and hoisting it heals\n -- any pre-existing budget_def that was created before this index was\n -- introduced \u{2014} no migration script required.\n redis.call(\"SADD\", K.policies_index, A.budget_id)\n\n -- Idempotency: already exists \u{2192} return immediately\n if redis.call(\"EXISTS\", K.def_key) == 1 then\n return ok_already_satisfied(A.budget_id)\n end\n\n local dim_count = require_number(args[9], \"dim_count\")\n if type(dim_count) == \"table\" then return dim_count end\n\n -- HSET budget definition\n redis.call(\"HSET\", K.def_key,\n \"budget_id\", A.budget_id,\n \"scope_type\", A.scope_type,\n \"scope_id\", A.scope_id,\n \"enforcement_mode\", A.enforcement_mode,\n \"on_hard_limit\", A.on_hard_limit,\n \"on_soft_limit\", A.on_soft_limit,\n \"reset_interval_ms\", A.reset_interval_ms,\n \"breach_count\", \"0\",\n \"soft_breach_count\", \"0\",\n \"created_at\", A.now_ms,\n \"last_updated_at\", A.now_ms)\n\n -- HSET per-dimension hard and soft limits\n for i = 1, dim_count do\n local dim = args[9 + i]\n local hard = args[9 + dim_count + i]\n local soft = args[9 + 2 * dim_count + i]\n redis.call(\"HSET\", K.limits_key, \"hard:\" .. dim, hard, \"soft:\" .. dim, soft)\n end\n\n -- budget_usage left empty \u{2014} first report_usage will create fields\n\n -- Schedule periodic reset if reset_interval_ms > 0\n local interval_ms = tonumber(A.reset_interval_ms)\n if interval_ms > 0 then\n local next_reset_at = tostring(tonumber(A.now_ms) + interval_ms)\n redis.call(\"HSET\", K.def_key, \"next_reset_at\", next_reset_at)\n redis.call(\"ZADD\", K.resets_zset, tonumber(next_reset_at), A.budget_id)\n end\n\n return ok(A.budget_id)\nend)\n\n---------------------------------------------------------------------------\n-- #30 ff_report_usage_and_check (on {b:M})\n--\n-- Check-before-increment: read current usage, check hard limits. If any\n-- dimension would breach, return HARD_BREACH without incrementing. If safe,\n-- HINCRBY all dimensions, then check soft limits.\n--\n-- Atomic Lua serialization on {b:M} guarantees zero overshoot.\n--\n-- KEYS (3): budget_usage, budget_limits, budget_def\n-- ARGV (variable): dimension_count, dim_1..dim_N, delta_1..delta_N, now_ms, [dedup_key]\n---------------------------------------------------------------------------\nredis.register_function(\'ff_report_usage_and_check\', function(keys, args)\n local K = {\n usage_key = keys[1],\n limits_key = keys[2],\n def_key = keys[3],\n }\n\n local dim_count = require_number(args[1], \"dim_count\")\n if type(dim_count) == \"table\" then return dim_count end\n local now_ms = args[2 * dim_count + 2]\n local dedup_key = args[2 * dim_count + 3] or \"\"\n\n -- Idempotency: if dedup_key provided, check for prior application\n if dedup_key ~= \"\" then\n local existing = redis.call(\"GET\", dedup_key)\n if existing then\n return {1, \"ALREADY_APPLIED\"}\n end\n end\n\n -- Phase 1: CHECK all dimensions BEFORE any increment.\n -- If any hard limit would be breached, reject the entire report.\n for i = 1, dim_count do\n local dim = args[1 + i]\n local delta = tonumber(args[1 + dim_count + i])\n local current = tonumber(redis.call(\"HGET\", K.usage_key, dim) or \"0\")\n local new_total = current + delta\n\n local hard_limit = redis.call(\"HGET\", K.limits_key, \"hard:\" .. 
dim)\n if hard_limit and hard_limit ~= \"\" and hard_limit ~= false then\n local limit_val = tonumber(hard_limit)\n if limit_val > 0 and new_total > limit_val then\n -- Record breach metadata but DO NOT increment\n redis.call(\"HINCRBY\", K.def_key, \"breach_count\", 1)\n redis.call(\"HSET\", K.def_key,\n \"last_breach_at\", now_ms,\n \"last_breach_dim\", dim,\n \"last_updated_at\", now_ms)\n return {1, \"HARD_BREACH\", dim, tostring(current), tostring(hard_limit)}\n end\n end\n end\n\n -- Phase 2: No hard breach detected \u{2014} safe to increment all dimensions.\n local breached_soft = nil\n for i = 1, dim_count do\n local dim = args[1 + i]\n local delta = tonumber(args[1 + dim_count + i])\n local new_val = redis.call(\"HINCRBY\", K.usage_key, dim, delta)\n\n -- Check soft limit (advisory \u{2014} increment still happens)\n local soft_limit = redis.call(\"HGET\", K.limits_key, \"soft:\" .. dim)\n if soft_limit and soft_limit ~= \"\" and soft_limit ~= false then\n local limit_val = tonumber(soft_limit)\n if limit_val > 0 and new_val > limit_val then\n if not breached_soft then\n breached_soft = dim\n end\n end\n end\n end\n\n -- Update metadata\n redis.call(\"HSET\", K.def_key, \"last_updated_at\", now_ms)\n\n -- Mark dedup key after successful increment (24h TTL)\n if dedup_key ~= \"\" then\n redis.call(\"SET\", dedup_key, \"1\", \"PX\", 86400000)\n end\n\n if breached_soft then\n redis.call(\"HINCRBY\", K.def_key, \"soft_breach_count\", 1)\n local soft_val = tonumber(redis.call(\"HGET\", K.limits_key, \"soft:\" .. breached_soft) or \"0\")\n local cur_val = tonumber(redis.call(\"HGET\", K.usage_key, breached_soft) or \"0\")\n return {1, \"SOFT_BREACH\", breached_soft, tostring(cur_val), tostring(soft_val)}\n end\n\n return {1, \"OK\"}\nend)\n\n---------------------------------------------------------------------------\n-- #31 ff_reset_budget (on {b:M})\n--\n-- Scanner-called periodic reset. Zero all usage fields, record reset,\n-- compute next_reset_at, re-score in reset index.\n--\n-- KEYS (3): budget_def, budget_usage, budget_resets_zset\n-- ARGV (2): budget_id, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_reset_budget\', function(keys, args)\n local K = {\n def_key = keys[1],\n usage_key = keys[2],\n resets_zset = keys[3],\n }\n\n local A = {\n budget_id = args[1],\n now_ms = args[2],\n }\n\n -- 1. Read usage fields and zero them\n local usage_fields = redis.call(\"HKEYS\", K.usage_key)\n if #usage_fields > 0 then\n local zero_args = {}\n for _, field in ipairs(usage_fields) do\n zero_args[#zero_args + 1] = field\n zero_args[#zero_args + 1] = \"0\"\n end\n redis.call(\"HSET\", K.usage_key, unpack(zero_args))\n end\n\n -- 2. Update budget_def: last_reset_at, reset_count\n redis.call(\"HINCRBY\", K.def_key, \"reset_count\", 1)\n redis.call(\"HSET\", K.def_key,\n \"last_reset_at\", A.now_ms,\n \"last_updated_at\", A.now_ms,\n \"last_breach_at\", \"\",\n \"last_breach_dim\", \"\")\n\n -- 3. 
Compute next_reset_at from reset_interval_ms\n local interval_ms = tonumber(redis.call(\"HGET\", K.def_key, \"reset_interval_ms\") or \"0\")\n local next_reset_at = \"0\"\n if interval_ms > 0 then\n next_reset_at = tostring(tonumber(A.now_ms) + interval_ms)\n redis.call(\"HSET\", K.def_key, \"next_reset_at\", next_reset_at)\n redis.call(\"ZADD\", K.resets_zset, tonumber(next_reset_at), A.budget_id)\n else\n -- No recurring reset \u{2014} remove from schedule\n redis.call(\"ZREM\", K.resets_zset, A.budget_id)\n end\n\n return ok(next_reset_at)\nend)\n\n---------------------------------------------------------------------------\n-- #29b ff_unblock_execution (on {p:N})\n--\n-- Re-evaluate blocked execution, set eligible_now, ZREM blocked set,\n-- ZADD eligible. All 7 dims.\n--\n-- KEYS (3): exec_core, blocked_zset, eligible_zset\n-- ARGV (3): execution_id, now_ms, expected_blocking_reason\n---------------------------------------------------------------------------\nredis.register_function(\'ff_unblock_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n blocked_zset = keys[2],\n eligible_zset = keys[3],\n }\n\n local A = {\n execution_id = args[1],\n now_ms = args[2],\n expected_blocking_reason = args[3] or \"\",\n }\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- 2. Must be runnable\n if core.lifecycle_phase ~= \"runnable\" then\n return err(\"execution_not_eligible\")\n end\n\n -- 3. Must be blocked\n local es = core.eligibility_state\n if es ~= \"blocked_by_budget\" and es ~= \"blocked_by_quota\"\n and es ~= \"blocked_by_route\" and es ~= \"blocked_by_operator\" then\n return err(\"execution_not_eligible\")\n end\n\n -- 4. Validate expected blocking reason (prevent stale unblock)\n if A.expected_blocking_reason ~= \"\" then\n if core.blocking_reason ~= A.expected_blocking_reason then\n return err(\"execution_not_eligible\")\n end\n end\n\n -- 5. Transition: all 7 dims\n local priority = tonumber(core.priority or \"0\")\n local created_at = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", core.attempt_state or \"pending_first_attempt\",\n \"public_state\", \"waiting\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n\n -- 6. Move from blocked to eligible\n redis.call(\"ZREM\", K.blocked_zset, A.execution_id)\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n\n return ok(\"unblocked\")\nend)\n\n---------------------------------------------------------------------------\n-- #29a ff_block_execution_for_admission (on {p:N})\n--\n-- Parameterized block: set eligibility/blocking for budget/quota/route/lane\n-- denial, ZREM eligible, ZADD target blocked set. 
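-- Example (informative): a budget denial calls this with blocking_reason\n-- \"waiting_for_budget\", which maps to eligibility_state \"blocked_by_budget\";\n-- the matching ff_unblock_execution call should pass the same reason as\n-- expected_blocking_reason so a stale unblock is refused if the reason has\n-- changed in the meantime.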
All 7 dims.\n--\n-- KEYS (3): exec_core, eligible_zset, target_blocked_zset\n-- ARGV (4): execution_id, blocking_reason, blocking_detail, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_block_execution_for_admission\', function(keys, args)\n local K = {\n core_key = keys[1],\n eligible_zset = keys[2],\n blocked_zset = keys[3],\n }\n\n local A = {\n execution_id = args[1],\n blocking_reason = args[2],\n blocking_detail = args[3] or \"\",\n now_ms = args[4],\n }\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- 2. Must be runnable\n if core.lifecycle_phase ~= \"runnable\" then\n if core.lifecycle_phase == \"terminal\" then\n return err(\"execution_not_active\",\n core.terminal_outcome or \"\",\n core.current_lease_epoch or \"\",\n \"terminal\",\n core.current_attempt_id or \"\")\n end\n return err(\"execution_not_eligible\")\n end\n\n -- 3. Map blocking_reason to eligibility_state\n local REASON_TO_ELIGIBILITY = {\n waiting_for_budget = \"blocked_by_budget\",\n waiting_for_quota = \"blocked_by_quota\",\n waiting_for_capable_worker = \"blocked_by_route\",\n paused_by_operator = \"blocked_by_operator\",\n paused_by_policy = \"blocked_by_lane_state\",\n }\n local eligibility = REASON_TO_ELIGIBILITY[A.blocking_reason]\n if not eligibility then\n return err(\"invalid_blocking_reason\")\n end\n\n -- 4. Transition: all 7 dims\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", eligibility,\n \"blocking_reason\", A.blocking_reason,\n \"blocking_detail\", A.blocking_detail,\n \"terminal_outcome\", \"none\",\n \"attempt_state\", core.attempt_state or \"pending_first_attempt\",\n \"public_state\", \"rate_limited\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n\n -- 5. 
Move from eligible to blocked\n redis.call(\"ZREM\", K.eligible_zset, A.execution_id)\n redis.call(\"ZADD\", K.blocked_zset, tonumber(A.now_ms), A.execution_id)\n\n return ok(\"blocked\")\nend)\n\n\n-- source: lua/quota.lua\n-- FlowFabric quota and rate-limit functions\n-- Reference: RFC-008 (Quota), RFC-010 \u{a7}4.4 (#32)\n--\n-- Depends on helpers: ok, err, is_set\n\n---------------------------------------------------------------------------\n-- ff_create_quota_policy (on {q:K})\n--\n-- Create a new quota/rate-limit policy.\n-- Idempotent: if EXISTS quota_def \u{2192} return ok_already_satisfied.\n--\n-- KEYS (5): quota_def, quota_window_zset, quota_concurrency_counter,\n-- admitted_set, quota_policies_index\n-- ARGV (5): quota_policy_id, window_seconds, max_requests_per_window,\n-- max_concurrent, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_create_quota_policy\', function(keys, args)\n local K = {\n def_key = keys[1],\n window_zset = keys[2],\n concurrency_key = keys[3],\n admitted_set = keys[4],\n policies_index = keys[5],\n }\n\n local A = {\n quota_policy_id = args[1],\n window_seconds = args[2],\n max_requests_per_window = args[3],\n max_concurrent = args[4],\n now_ms = args[5],\n }\n\n -- Idempotency: already exists \u{2192} return immediately\n if redis.call(\"EXISTS\", K.def_key) == 1 then\n return ok_already_satisfied(A.quota_policy_id)\n end\n\n -- HSET quota definition\n redis.call(\"HSET\", K.def_key,\n \"quota_policy_id\", A.quota_policy_id,\n \"requests_per_window_seconds\", A.window_seconds,\n \"max_requests_per_window\", A.max_requests_per_window,\n \"active_concurrency_cap\", A.max_concurrent,\n \"created_at\", A.now_ms)\n\n -- Init concurrency counter to 0\n redis.call(\"SET\", K.concurrency_key, \"0\")\n\n -- Register in partition-level policy index (for cluster-safe discovery)\n redis.call(\"SADD\", K.policies_index, A.quota_policy_id)\n\n -- admitted_set + quota_window_zset left empty (populated on admission)\n\n return ok(A.quota_policy_id)\nend)\n\n---------------------------------------------------------------------------\n-- #32 ff_check_admission_and_record (on {q:K})\n--\n-- Idempotent sliding-window rate check + concurrency check.\n-- If admitted: ZADD window, SET NX guard, optional INCR concurrency,\n-- SADD to admitted_set (for cluster-safe reconciler discovery).\n--\n-- KEYS (5): window_zset, concurrency_counter, quota_def, admitted_guard_key,\n-- admitted_set\n-- ARGV (6): now_ms, window_seconds, rate_limit, concurrency_cap,\n-- execution_id, jitter_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_check_admission_and_record\', function(keys, args)\n local K = {\n window_zset = keys[1],\n concurrency_key = keys[2],\n quota_def = keys[3],\n admitted_guard_key = keys[4],\n admitted_set = keys[5],\n }\n\n local now_ms_n = require_number(args[1], \"now_ms\")\n if type(now_ms_n) == \"table\" then return now_ms_n end\n local window_seconds_n = require_number(args[2], \"window_seconds\")\n if type(window_seconds_n) == \"table\" then return window_seconds_n end\n local rate_limit_n = require_number(args[3], \"rate_limit\")\n if type(rate_limit_n) == \"table\" then return rate_limit_n end\n local concurrency_cap_n = require_number(args[4], \"concurrency_cap\")\n if type(concurrency_cap_n) == \"table\" then return concurrency_cap_n end\n\n local A = {\n now_ms = now_ms_n,\n window_seconds = window_seconds_n,\n rate_limit = 
rate_limit_n,\n concurrency_cap = concurrency_cap_n,\n execution_id = args[5],\n jitter_ms = tonumber(args[6] or \"0\"),\n }\n\n local window_ms = A.window_seconds * 1000\n\n -- 1. Idempotency guard: already admitted in this window?\n if redis.call(\"EXISTS\", K.admitted_guard_key) == 1 then\n return { \"ALREADY_ADMITTED\" }\n end\n\n -- 2. Sliding window: remove expired entries\n redis.call(\"ZREMRANGEBYSCORE\", K.window_zset, \"-inf\", A.now_ms - window_ms)\n\n -- 3. Check rate limit\n if A.rate_limit > 0 then\n local current_count = redis.call(\"ZCARD\", K.window_zset)\n if current_count >= A.rate_limit then\n -- Compute retry_after from oldest entry\n local oldest = redis.call(\"ZRANGE\", K.window_zset, 0, 0, \"WITHSCORES\")\n local retry_after_ms = 0\n if #oldest >= 2 then\n retry_after_ms = tonumber(oldest[2]) + window_ms - A.now_ms\n if retry_after_ms < 0 then retry_after_ms = 0 end\n end\n local jitter = 0\n if A.jitter_ms > 0 then\n jitter = math.random(0, A.jitter_ms)\n end\n return { \"RATE_EXCEEDED\", tostring(retry_after_ms + jitter) }\n end\n end\n\n -- 4. Check concurrency cap\n if A.concurrency_cap > 0 then\n local active = tonumber(redis.call(\"GET\", K.concurrency_key) or \"0\")\n if active >= A.concurrency_cap then\n return { \"CONCURRENCY_EXCEEDED\" }\n end\n end\n\n -- 5. Admit: record in sliding window (execution_id as member \u{2014} idempotent)\n redis.call(\"ZADD\", K.window_zset, A.now_ms, A.execution_id)\n\n -- 6. Set admitted guard key with TTL = window size\n -- Guard: PX 0 or PX <0 causes Valkey error inside Lua (after ZADD committed).\n if window_ms > 0 then\n redis.call(\"SET\", K.admitted_guard_key, \"1\", \"PX\", window_ms, \"NX\")\n end\n\n -- 7. Increment concurrency counter if cap is set\n if A.concurrency_cap > 0 then\n redis.call(\"INCR\", K.concurrency_key)\n end\n\n -- 8. Track in admitted set (for cluster-safe reconciler \u{2014} replaces SCAN)\n redis.call(\"SADD\", K.admitted_set, A.execution_id)\n\n return { \"ADMITTED\" }\nend)\n\n---------------------------------------------------------------------------\n-- ff_release_admission (on {q:K})\n--\n-- Release a previously-recorded admission slot. Called when a claim grant\n-- fails after admission was recorded, preventing leaked concurrency slots.\n--\n-- KEYS (3): admitted_guard_key, admitted_set, concurrency_counter\n-- ARGV (1): execution_id\n---------------------------------------------------------------------------\nredis.register_function(\'ff_release_admission\', function(keys, args)\n local K = {\n admitted_guard_key = keys[1],\n admitted_set = keys[2],\n concurrency_key = keys[3],\n }\n\n local execution_id = args[1]\n\n -- 1. Delete the guard key (idempotent \u{2014} DEL returns 0 if absent)\n redis.call(\"DEL\", K.admitted_guard_key)\n\n -- 2. Remove from admitted set\n redis.call(\"SREM\", K.admitted_set, execution_id)\n\n -- 3. 
Decrement concurrency counter (floor at 0)\n local current = tonumber(redis.call(\"GET\", K.concurrency_key) or \"0\")\n if current > 0 then\n redis.call(\"DECR\", K.concurrency_key)\n end\n\n return {1, \"OK\", \"released\"}\nend)\n\n\n-- source: lua/flow.lua\n-- FlowFabric flow coordination and dependency functions\n-- Reference: RFC-007 (Flow), RFC-010 \u{a7}4.1 (#22-24, #35), \u{a7}4.2 (#29)\n--\n-- Depends on helpers: ok, err, is_set, hgetall_to_table\n\n---------------------------------------------------------------------------\n-- Cycle detection helper\n---------------------------------------------------------------------------\n\n-- Max nodes to visit during cycle detection BFS.\nlocal MAX_CYCLE_CHECK_NODES = 1000\n\n-- Detect if adding an edge upstream\u{2192}downstream would create a cycle.\n-- BFS from downstream through existing outgoing edges: if upstream is\n-- reachable, the new edge closes a loop (A\u{2192}B\u{2192}C\u{2192}A deadlock).\n-- All keys share the same {fp:N} slot (flow-partition co-location).\n-- @param flow_prefix e.g. \"ff:flow:{fp:0}:<flow_id>\"\n-- @param start_eid downstream of proposed edge (BFS start)\n-- @param target_eid upstream of proposed edge (looking for this)\n-- @return true if a cycle would be created\nlocal function detect_cycle(flow_prefix, start_eid, target_eid)\n local visited = {}\n local queue = {start_eid}\n local count = 0\n\n while #queue > 0 do\n local next_queue = {}\n for _, eid in ipairs(queue) do\n if eid == target_eid then\n return true\n end\n if not visited[eid] then\n visited[eid] = true\n count = count + 1\n if count > MAX_CYCLE_CHECK_NODES then\n return true -- graph too large to verify; reject conservatively\n end\n local out_key = flow_prefix .. \":out:\" .. eid\n local edges = redis.call(\"SMEMBERS\", out_key)\n for _, edge_id in ipairs(edges) do\n local edge_key = flow_prefix .. \":edge:\" .. edge_id\n local next_eid = redis.call(\"HGET\", edge_key, \"downstream_execution_id\")\n if next_eid and next_eid ~= \"\" and not visited[next_eid] then\n next_queue[#next_queue + 1] = next_eid\n end\n end\n end\n end\n queue = next_queue\n end\n\n return false\nend\n\n---------------------------------------------------------------------------\n-- ff_create_flow (on {fp:N})\n--\n-- Create a new flow container. Idempotent: if flow_core already exists,\n-- returns ok_already_satisfied.\n--\n-- KEYS (3): flow_core, members_set, flow_index\n-- ARGV (4): flow_id, flow_kind, namespace, now_ms (IGNORED \u{2014} see note below)\n--\n-- NOTE: ARGV[4] (`now_ms`) is accepted for caller compatibility but NOT\n-- used for stored timestamps. We read server time via redis.call(\"TIME\")\n-- so created_at / last_mutation_at agree with fields written by other\n-- Lua functions (ff_complete_execution etc.) under client clock skew.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_create_flow\', function(keys, args)\n local K = {\n flow_core = keys[1],\n members_set = keys[2],\n flow_index = keys[3],\n }\n\n local A = {\n flow_id = args[1],\n flow_kind = args[2],\n namespace = args[3],\n -- args[4] is client-provided now_ms; intentionally ignored.\n }\n\n -- Server time (not client-provided) so created_at / last_mutation_at\n -- agree with timestamps written by ff_complete_execution and peers.\n local now_ms = server_time_ms()\n\n -- Maintain flow_index BEFORE the idempotency guard. 
SADD is itself\n -- idempotent (no-op on existing members), and hoisting it heals any\n -- pre-existing flow_core that was created before this index was\n -- introduced \u{2014} no migration script required.\n redis.call(\"SADD\", K.flow_index, A.flow_id)\n\n -- Idempotency: if flow already exists, return already_satisfied\n if redis.call(\"EXISTS\", K.flow_core) == 1 then\n return ok_already_satisfied(A.flow_id)\n end\n\n -- Create flow core record\n redis.call(\"HSET\", K.flow_core,\n \"flow_id\", A.flow_id,\n \"flow_kind\", A.flow_kind,\n \"namespace\", A.namespace,\n \"graph_revision\", 0,\n \"node_count\", 0,\n \"edge_count\", 0,\n \"public_flow_state\", \"open\",\n \"created_at\", now_ms,\n \"last_mutation_at\", now_ms)\n\n return ok(A.flow_id)\nend)\n\n---------------------------------------------------------------------------\n-- ff_add_execution_to_flow (on {fp:N} \u{2014} single atomic FCALL)\n--\n-- Add a member execution to a flow AND stamp the flow_id back-pointer\n-- on exec_core in one atomic commit. Per RFC-011 \u{a7}7.3, exec keys\n-- co-locate with their parent flow\'s partition under hash-tag routing,\n-- so exec_core shares the `{fp:N}` hash-tag with flow_core / members_set\n-- / flow_index. All four KEYS hash to the same slot; no CROSSSLOT.\n--\n-- KEYS (4): flow_core, members_set, flow_index, exec_core\n-- ARGV (3): flow_id, execution_id, now_ms (IGNORED \u{2014} server time used)\n--\n-- Validates-before-writing: flow_not_found / flow_already_terminal\n-- early-returns fire BEFORE any write (step 1 below). On those error\n-- paths, zero state mutates \u{2014} atomicity by construction at the Lua\n-- level (Valkey scripting contract: no redis.call() before error_reply\n-- means nothing to roll back). See RFC-011 \u{a7}7.3.1 tests for the\n-- structural pin.\n--\n-- Invariant (post-RFC-011): a successful call commits BOTH the flow-\n-- index updates AND the exec_core.flow_id stamp in one atomic unit.\n-- Readers can assume exec_core.flow_id == flow_id iff the exec is in\n-- members_set. The pre-RFC-011 two-phase contract + \u{a7}5.5 orphan-window\n-- + issue #21 reconciliation-scanner plan are all superseded.\n---------------------------------------------------------------------------\nredis.register_function(\'ff_add_execution_to_flow\', function(keys, args)\n local K = {\n flow_core = keys[1],\n members_set = keys[2],\n flow_index = keys[3],\n exec_core = keys[4],\n }\n\n local A = {\n flow_id = args[1],\n execution_id = args[2],\n -- args[3] is client-provided now_ms; intentionally ignored in favour\n -- of redis.call(\"TIME\") to keep last_mutation_at consistent with\n -- timestamps stamped by ff_complete_execution and peers.\n }\n\n local now_ms = server_time_ms()\n\n -- 1. Validate flow exists and is not terminal, and execution exists.\n -- Validates-before-writing: no redis.call() writes happen before\n -- these guards, so the error paths commit zero state (symmetric\n -- with step 2\'s cross-flow guard on AlreadyMember).\n local raw = redis.call(\"HGETALL\", K.flow_core)\n if #raw == 0 then return err(\"flow_not_found\") end\n local flow = hgetall_to_table(raw)\n local pfs = flow.public_flow_state or \"\"\n if pfs == \"cancelled\" or pfs == \"completed\" or pfs == \"failed\" then\n return err(\"flow_already_terminal\")\n end\n -- Execution must exist \u{2014} otherwise step 5\'s HSET exec_core would\n -- silently create a hash for a non-existent exec, leading to an\n -- inconsistent members_set \u{2194} exec_core state. 
Symmetric with the\n -- flow_not_found guard above.\n if redis.call(\"EXISTS\", K.exec_core) == 0 then\n return err(\"execution_not_found\")\n end\n\n -- Self-heal flow_index for LIVE flows only. The projector may have\n -- SREMd this flow after observing an all-terminal sample, yet the\n -- flow is still \"open\" per flow_core and can accept new members.\n -- Re-add idempotently so the next projector cycle picks the flow\n -- back up. Runs only after the terminal-state guard above so we do\n -- not resurrect cancelled/completed/failed flows into the active\n -- index. Same {fp:N} slot as the other KEYS, so atomic with the\n -- membership mutation below.\n redis.call(\"SADD\", K.flow_index, A.flow_id)\n\n -- 2. Idempotency: already a member of THIS flow\'s members_set.\n -- Still stamp exec_core.flow_id defensively \u{2014} an earlier call\n -- may have committed members_set but crashed before exec_core\n -- HSET under the legacy two-phase shape. Stamping here is a\n -- no-op if flow_id already matches; heals any pre-RFC-011\n -- orphans encountered on a rolling upgrade.\n --\n -- Cross-flow guard on the orphan case: if exec_core.flow_id is\n -- already set to a DIFFERENT flow (corrupted-state orphan that\n -- IS in this flow\'s members_set but stamped wrong), refuse\n -- instead of silently re-stamping. Symmetric with step 3\'s\n -- guard on the not-yet-a-member branch \u{2014} catches the same\n -- invariant violation earlier in the path. Empty existing\n -- flow_id goes through the heal path normally.\n if redis.call(\"SISMEMBER\", K.members_set, A.execution_id) == 1 then\n local existing = redis.call(\"HGET\", K.exec_core, \"flow_id\")\n if existing and existing ~= \"\" and existing ~= A.flow_id then\n return err(\"already_member_of_different_flow:\" .. existing)\n end\n redis.call(\"HSET\", K.exec_core, \"flow_id\", A.flow_id)\n local nc = redis.call(\"HGET\", K.flow_core, \"node_count\") or \"0\"\n return ok_already_satisfied(A.execution_id, nc)\n end\n\n -- 3. Cross-flow guard: if exec_core.flow_id is already set to a\n -- DIFFERENT flow, refuse \u{2014} silently re-stamping would orphan\n -- the other flow\'s accounting. An exec belongs to at most one\n -- flow at a time per RFC-007.\n local existing_flow_id = redis.call(\"HGET\", K.exec_core, \"flow_id\")\n if existing_flow_id and existing_flow_id ~= \"\" and existing_flow_id ~= A.flow_id then\n return err(\"already_member_of_different_flow:\" .. existing_flow_id)\n end\n\n -- 4. Add to membership set\n redis.call(\"SADD\", K.members_set, A.execution_id)\n\n -- 5. Stamp the flow_id back-pointer on exec_core. Co-located with\n -- the flow\'s partition under RFC-011 \u{a7}7.3 hash-tag routing; this\n -- HSET is part of the same atomic FCALL as the SADD above.\n redis.call(\"HSET\", K.exec_core, \"flow_id\", A.flow_id)\n\n -- 6. Increment node_count and graph_revision\n local new_nc = redis.call(\"HINCRBY\", K.flow_core, \"node_count\", 1)\n local new_rev = redis.call(\"HINCRBY\", K.flow_core, \"graph_revision\", 1)\n redis.call(\"HSET\", K.flow_core, \"last_mutation_at\", now_ms)\n\n return ok(A.execution_id, tostring(new_nc))\nend)\n\n---------------------------------------------------------------------------\n-- ff_cancel_flow (on {fp:N})\n--\n-- Cancel a flow. 
Returns the member list for the caller to dispatch\n-- individual cancellations cross-partition.\n--\n-- KEYS (5): flow_core, members_set, flow_index (RESERVED \u{2014} see below),\n-- pending_cancels, cancel_backlog\n-- ARGV (5): flow_id, reason, cancellation_policy, now_ms (IGNORED \u{2014}\n-- server time used so `cancelled_at` agrees with peer Lua fields),\n-- grace_ms (optional; defaults to 30000 when omitted or empty)\n--\n-- KEYS[3] (flow_index) is accepted for caller-compatibility with the\n-- shared FlowStructOpKeys wrapper, but this function does NOT mutate\n-- flow_index. The projector is the sole SREM writer (see the \"4b\" note\n-- in the body below).\n---------------------------------------------------------------------------\nredis.register_function(\'ff_cancel_flow\', function(keys, args)\n local K = {\n flow_core = keys[1],\n members_set = keys[2],\n -- keys[3] is flow_index; present in KEYS for wrapper symmetry but\n -- unused in this function (see rationale near the end of the body).\n pending_cancels = keys[4], -- SET populated only on first terminalization\n cancel_backlog = keys[5], -- per-fp ZSET tracking flows owing members\n }\n\n local A = {\n flow_id = args[1],\n reason = args[2],\n cancellation_policy = args[3],\n -- args[4] is client-provided now_ms; intentionally ignored.\n }\n\n -- grace_ms must be a finite non-negative integer. Same guard as\n -- ff_rotate_waitpoint_hmac_secret: reject NaN, \u{b1}inf, negative,\n -- non-integer, or >2^53-1. math.floor alone doesn\'t catch\n -- infinities (math.floor(math.huge) == math.huge) which would\n -- stamp \"inf\" into cancel_backlog and permanently poison the entry.\n -- Default 30000 (30s) if the arg is omitted or empty string.\n local grace_ms\n if args[5] == nil or args[5] == \"\" then\n grace_ms = 30000\n else\n local g = tonumber(args[5])\n if not g\n or g ~= g -- NaN\n or g < 0\n or g > 9007199254740991 -- 2^53 - 1\n or g ~= math.floor(g) then\n return err(\"invalid_grace_ms\")\n end\n grace_ms = g\n end\n A.grace_ms = grace_ms\n\n local now_ms = server_time_ms()\n\n -- 1. Validate flow exists\n local raw = redis.call(\"HGETALL\", K.flow_core)\n if #raw == 0 then return err(\"flow_not_found\") end\n local flow = hgetall_to_table(raw)\n\n -- 2. Check not already terminal\n local pfs = flow.public_flow_state or \"\"\n if pfs == \"cancelled\" or pfs == \"completed\" or pfs == \"failed\" then\n return err(\"flow_already_terminal\")\n end\n\n -- 3. Get all member execution IDs\n local members = redis.call(\"SMEMBERS\", K.members_set)\n\n -- 4. Update flow state\n -- cancellation_policy is persisted so an AlreadyTerminal retry can\n -- return the authoritative stored policy instead of echoing the\n -- caller\'s retry intent.\n --\n -- NOTE: this field is persisted from this library version onward.\n -- Flows cancelled before this deploy reach public_flow_state=\'cancelled\'\n -- without a cancellation_policy value. The Rust caller detects the\n -- empty field on HMGET and falls back to args.cancellation_policy, so\n -- no backfill migration is needed.\n redis.call(\"HSET\", K.flow_core,\n \"public_flow_state\", \"cancelled\",\n \"cancelled_at\", now_ms,\n \"cancel_reason\", A.reason,\n \"cancellation_policy\", A.cancellation_policy,\n \"last_mutation_at\", now_ms)\n\n -- Do NOT SREM flow_index here. Member cancellations dispatch\n -- asynchronously from ff-server; flow_projector needs to keep\n -- projecting the flow while those cancels land so the summary\n -- reflects the real progression (running/blocked \u{2192} cancelled). 
The\n -- projector owns the SREM once it observes sampled==true_total\n -- all-terminal (see crates/ff-engine/src/scanner/flow_projector.rs).\n -- A projector-owned SREM is also the right place because it is\n -- the only writer that can prove every member has actually reached\n -- terminal state. Removing the entry here would freeze the summary\n -- at whatever snapshot was current when cancel_flow fired.\n\n -- 5. Durable backlog for async member-cancel dispatch.\n --\n -- Only cancel_all policy dispatches per-member cancels; other policies\n -- mark the flow terminal and leave members to the flow_projector /\n -- retention. Skip the backlog writes outside cancel_all.\n --\n -- If a process crashes between `CancellationScheduled` returning and\n -- the in-process dispatch finishing, OR if one member\'s cancel hits a\n -- permanent error that the bounded retry can\'t recover, the member\n -- would otherwise escape cancellation. Tracking the owed members in\n -- a persistent SET + partition-level ZSET lets the cancel_reconciler\n -- scanner drain the remainder on its interval.\n --\n -- Score = now + grace_ms so the reconciler doesn\'t race the live\n -- dispatch that\'s about to start \u{2014} live dispatch SREMs as it\n -- succeeds; reconciler only picks up flows whose grace has elapsed.\n if A.cancellation_policy == \"cancel_all\" and #members > 0 then\n -- SADD chunked: Lua\'s unpack() arg limit is ~8000 on some builds\n -- and some Valkey deployments enforce max-args lower. Chunk at\n -- 1000 to stay well under both without a noticeable cost.\n local i = 1\n while i <= #members do\n local chunk_end = math.min(i + 999, #members)\n local sadd_args = {}\n for j = i, chunk_end do\n sadd_args[#sadd_args + 1] = members[j]\n end\n redis.call(\"SADD\", K.pending_cancels, unpack(sadd_args))\n i = chunk_end + 1\n end\n redis.call(\"ZADD\", K.cancel_backlog, now_ms + A.grace_ms, A.flow_id)\n end\n\n -- 6. Return: ok(cancellation_policy, member1, member2, ...)\n -- Build array manually to include variable member list.\n local result = {1, \"OK\", A.cancellation_policy}\n for _, eid in ipairs(members) do\n result[#result + 1] = eid\n end\n\n return result\nend)\n\n---------------------------------------------------------------------------\n-- ff_ack_cancel_member (on {fp:N})\n--\n-- Record that one flow member\'s cancel has been committed. Called by\n-- the live dispatch after each successful cancel_member_execution AND\n-- by the cancel_reconciler scanner after it catches up on crash-\n-- orphaned members. 
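-- Example (informative): with the default grace_ms of 30000, a cancel_all\n-- flow is scored into cancel_backlog at now + 30s; acks that drain\n-- pending_cancels before that deadline remove the flow from the backlog\n-- before the cancel_reconciler ever considers it.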
Atomically SREMs from pending_cancels and ZREMs\n-- the flow from the partition backlog when the set is empty.\n--\n-- KEYS (2): pending_cancels, cancel_backlog\n-- ARGV (2): eid, flow_id\n---------------------------------------------------------------------------\nredis.register_function(\'ff_ack_cancel_member\', function(keys, args)\n local pending = keys[1]\n local backlog = keys[2]\n local eid = args[1]\n local flow_id = args[2]\n\n redis.call(\"SREM\", pending, eid)\n if redis.call(\"EXISTS\", pending) == 0 then\n redis.call(\"ZREM\", backlog, flow_id)\n end\n\n return ok()\nend)\n\n---------------------------------------------------------------------------\n-- #29 ff_stage_dependency_edge (on {fp:N})\n--\n-- Validate membership + topology, check graph_revision, create edge,\n-- increment graph_revision.\n--\n-- KEYS (6): flow_core, members_set, edge_hash, out_adj_set, in_adj_set,\n-- grant_hash\n-- ARGV (8): flow_id, edge_id, upstream_eid, downstream_eid,\n-- dependency_kind, data_passing_ref, expected_graph_revision,\n-- now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_stage_dependency_edge\', function(keys, args)\n local K = {\n flow_core = keys[1],\n members_set = keys[2],\n edge_hash = keys[3],\n out_adj_set = keys[4],\n in_adj_set = keys[5],\n grant_hash = keys[6],\n }\n\n local A = {\n flow_id = args[1],\n edge_id = args[2],\n upstream_eid = args[3],\n downstream_eid = args[4],\n dependency_kind = args[5] or \"success_only\",\n data_passing_ref = args[6] or \"\",\n expected_graph_revision = args[7],\n now_ms = args[8],\n }\n\n -- 1. Reject self-referencing edges\n if A.upstream_eid == A.downstream_eid then\n return err(\"self_referencing_edge\")\n end\n\n -- 2. Read flow core\n local raw = redis.call(\"HGETALL\", K.flow_core)\n if #raw == 0 then return err(\"flow_not_found\") end\n local flow = hgetall_to_table(raw)\n\n -- 2b. Reject mutations on terminal flows\n local pfs = flow.public_flow_state or \"\"\n if pfs == \"cancelled\" or pfs == \"completed\" or pfs == \"failed\" then\n return err(\"flow_already_terminal\")\n end\n\n -- 3. Check graph_revision\n if tostring(flow.graph_revision or \"0\") ~= A.expected_graph_revision then\n return err(\"stale_graph_revision\")\n end\n\n -- 4. Verify both executions are members\n if redis.call(\"SISMEMBER\", K.members_set, A.upstream_eid) == 0 then\n return err(\"execution_not_in_flow\")\n end\n if redis.call(\"SISMEMBER\", K.members_set, A.downstream_eid) == 0 then\n return err(\"execution_not_in_flow\")\n end\n\n -- 4b. Transitive cycle detection: walk from downstream through outgoing\n -- edges to check if upstream is reachable (A\u{2192}B\u{2192}C\u{2192}A deadlock prevention).\n local flow_prefix = string.sub(K.flow_core, 1, -6) -- strip \":core\"\n if detect_cycle(flow_prefix, A.downstream_eid, A.upstream_eid) then\n return err(\"cycle_detected\")\n end\n\n -- 5. Check edge doesn\'t already exist\n if redis.call(\"EXISTS\", K.edge_hash) == 1 then\n return err(\"dependency_already_exists\")\n end\n\n -- 6. Create edge record\n redis.call(\"HSET\", K.edge_hash,\n \"edge_id\", A.edge_id,\n \"flow_id\", A.flow_id,\n \"upstream_execution_id\", A.upstream_eid,\n \"downstream_execution_id\", A.downstream_eid,\n \"dependency_kind\", A.dependency_kind,\n \"satisfaction_condition\", \"all_required\",\n \"data_passing_ref\", A.data_passing_ref,\n \"edge_state\", \"pending\",\n \"created_at\", A.now_ms,\n \"created_by\", \"engine\")\n\n -- 7. 
Update adjacency sets\n redis.call(\"SADD\", K.out_adj_set, A.edge_id)\n redis.call(\"SADD\", K.in_adj_set, A.edge_id)\n\n -- 8. Increment graph_revision and edge_count\n local new_rev = redis.call(\"HINCRBY\", K.flow_core, \"graph_revision\", 1)\n redis.call(\"HINCRBY\", K.flow_core, \"edge_count\", 1)\n redis.call(\"HSET\", K.flow_core, \"last_mutation_at\", A.now_ms)\n\n return ok(A.edge_id, tostring(new_rev))\nend)\n\n---------------------------------------------------------------------------\n-- RFC-016 #set_edge_group_policy (on {fp:N})\n--\n-- Declare the inbound-edge-group policy for a downstream execution.\n-- Stage B accepts `all_of`, `any_of`, and `quorum`. Must be called\n-- BEFORE the first `add_dependency` for this downstream (enforced via\n-- the `n > 0` guard below).\n--\n-- KEYS (2): flow_core, edgegroup\n-- ARGV (4): policy_variant, on_satisfied, k, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_set_edge_group_policy\', function(keys, args)\n local flow_core = keys[1]\n local edgegroup = keys[2]\n local policy_variant = args[1]\n local on_satisfied = args[2] or \"\"\n local k_raw = args[3] or \"0\"\n local now_ms = args[4]\n\n if redis.call(\"EXISTS\", flow_core) == 0 then\n return err(\"flow_not_found\")\n end\n\n if policy_variant ~= \"all_of\"\n and policy_variant ~= \"any_of\"\n and policy_variant ~= \"quorum\" then\n return err(\"invalid_input\",\n \"unsupported_policy_variant: \" .. tostring(policy_variant))\n end\n\n local k_num = tonumber(k_raw) or 0\n if policy_variant == \"quorum\" then\n if k_num < 1 then\n return err(\"invalid_input\", \"quorum k must be >= 1\")\n end\n end\n\n if (policy_variant == \"any_of\" or policy_variant == \"quorum\")\n and on_satisfied ~= \"cancel_remaining\"\n and on_satisfied ~= \"let_run\" then\n return err(\"invalid_input\",\n \"on_satisfied required for any_of/quorum: cancel_remaining | let_run\")\n end\n\n -- Ordering: reject if edges have already been staged for this\n -- downstream (the group hash is populated either by this call or by\n -- ff_apply_dependency_to_child\'s first-edge write).\n local existing_n = tonumber(redis.call(\"HGET\", edgegroup, \"n\") or \"0\")\n if existing_n > 0 then\n return err(\"invalid_input\",\n \"edge_group_policy_already_fixed: dependencies already staged\")\n end\n\n local existing_variant = redis.call(\"HGET\", edgegroup, \"policy_variant\")\n if existing_variant == policy_variant then\n -- Idempotent on identical restate (ignoring on_satisfied / k\n -- drift; Stage B trusts the caller not to flip those once set).\n return ok(\"already_set\")\n end\n\n redis.call(\"HSET\", edgegroup,\n \"policy_variant\", policy_variant,\n \"on_satisfied\", on_satisfied,\n \"k\", tostring(k_num),\n \"n\", \"0\",\n \"succeeded\", \"0\",\n \"failed\", \"0\",\n \"skipped\", \"0\",\n \"group_state\", \"pending\",\n \"created_at\", now_ms)\n\n return ok(\"set\")\nend)\n\n---------------------------------------------------------------------------\n-- #22 ff_apply_dependency_to_child (on {p:N})\n--\n-- Create dep record on child execution partition, increment unsatisfied\n-- count. 
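-- Example (informative): applying a third inbound edge to a still-runnable\n-- child whose earlier edges are unresolved bumps unsatisfied_required_count\n-- to 3 and rewrites blocking_detail as \"3 dep(s) unresolved incl <edge_id>\".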
If child is runnable: set blocked_by_dependencies.\n--\n-- KEYS (8): exec_core, deps_meta, unresolved_set, dep_hash,\n-- eligible_zset, blocked_deps_zset, deps_all_edges, edgegroup\n-- ARGV (7): flow_id, edge_id, upstream_eid, graph_revision,\n-- dependency_kind, data_passing_ref, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_apply_dependency_to_child\', function(keys, args)\n local K = {\n core_key = keys[1],\n deps_meta = keys[2],\n unresolved_set = keys[3],\n dep_hash = keys[4],\n eligible_zset = keys[5],\n blocked_deps_zset = keys[6],\n deps_all_edges = keys[7],\n edgegroup = keys[8],\n }\n\n local A = {\n flow_id = args[1],\n edge_id = args[2],\n upstream_eid = args[3],\n graph_revision = args[4],\n dependency_kind = args[5] or \"success_only\",\n data_passing_ref = args[6] or \"\",\n now_ms = args[7],\n }\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- 2. Validate flow membership (RFC-007 assert_flow_membership)\n if is_set(core.flow_id) and core.flow_id ~= A.flow_id then\n return err(\"execution_already_in_flow\")\n end\n\n -- 3. Idempotency: dep already applied\n if redis.call(\"EXISTS\", K.dep_hash) == 1 then\n return ok(\"already_applied\")\n end\n\n -- 4. Create dep record\n redis.call(\"HSET\", K.dep_hash,\n \"edge_id\", A.edge_id,\n \"flow_id\", A.flow_id,\n \"upstream_execution_id\", A.upstream_eid,\n \"downstream_execution_id\", core.execution_id or \"\",\n \"dependency_kind\", A.dependency_kind,\n \"state\", \"unsatisfied\",\n \"data_passing_ref\", A.data_passing_ref,\n \"last_resolved_at\", \"\")\n\n -- 5. Update deps:meta\n redis.call(\"SADD\", K.unresolved_set, A.edge_id)\n -- Register edge in the per-execution all-edges index (cluster-safe\n -- retention discovery; retained across resolve, purged wholesale on\n -- retention trim).\n redis.call(\"SADD\", K.deps_all_edges, A.edge_id)\n local unresolved = redis.call(\"HINCRBY\", K.deps_meta, \"unsatisfied_required_count\", 1)\n redis.call(\"HSET\", K.deps_meta,\n \"flow_id\", A.flow_id,\n \"last_flow_graph_revision\", A.graph_revision,\n \"last_dependency_update_at\", A.now_ms)\n\n -- RFC-016 Stage A: dual-write the inbound edge-group hash. Default\n -- policy is `all_of` when no explicit policy has been set (the\n -- expected Stage A case for all existing flows). The edgegroup hash\n -- becomes the source of truth for the AllOf counters in snapshots;\n -- existing flows without a hash fall back to deps_meta on read.\n if is_set(K.edgegroup) then\n local existing_policy = redis.call(\"HGET\", K.edgegroup, \"policy_variant\")\n if not existing_policy then\n redis.call(\"HSET\", K.edgegroup,\n \"policy_variant\", \"all_of\",\n \"n\", \"1\",\n \"succeeded\", \"0\",\n \"group_state\", \"pending\")\n else\n redis.call(\"HINCRBY\", K.edgegroup, \"n\", 1)\n end\n end\n\n -- 6. If runnable: block by dependencies (ALL 7 dims)\n if core.lifecycle_phase == \"runnable\" and core.terminal_outcome == \"none\" then\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", core.lifecycle_phase, -- preserve\n \"ownership_state\", core.ownership_state or \"unowned\", -- preserve\n \"eligibility_state\", \"blocked_by_dependencies\",\n \"blocking_reason\", \"waiting_for_children\",\n \"blocking_detail\", unresolved .. \" dep(s) unresolved incl \" .. 
A.edge_id,\n \"terminal_outcome\", \"none\", -- preserve\n \"attempt_state\", core.attempt_state or \"pending_first_attempt\", -- preserve\n \"public_state\", \"waiting_children\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n redis.call(\"ZREM\", K.eligible_zset, core.execution_id or \"\")\n redis.call(\"ZADD\", K.blocked_deps_zset,\n tonumber(core.created_at or \"0\"), core.execution_id or \"\")\n end\n\n return ok(tostring(unresolved))\nend)\n\n---------------------------------------------------------------------------\n-- RFC-016 Stage B quorum helper (AnyOf / Quorum edge-group evaluator).\n--\n-- Owns the downstream transition for non-AllOf edge groups. Called\n-- from `ff_resolve_dependency` when the edgegroup hash\'s\n-- `policy_variant` is `any_of` or `quorum`.\n--\n-- Four-counter model (\u{a7}3):\n-- succeeded \u{2014} upstream terminal = success\n-- failed \u{2014} upstream terminal = failed | cancelled | expired\n-- skipped \u{2014} upstream terminal = skipped\n-- n \u{2014} frozen total inbound edges in the group\n--\n-- Eligibility evaluation:\n-- AnyOf (k=1) satisfied iff succeeded >= 1\n-- Quorum(k) satisfied iff succeeded >= k\n-- impossible iff failed + skipped > n - k\n--\n-- Once-fired semantics (Invariant Q1): after `group_state != pending`,\n-- counters still update (telemetry) but the downstream is NOT\n-- retriggered and siblings are NOT re-flagged.\n--\n-- CancelRemaining: when the terminal transition happens (satisfied or\n-- short-circuit impossible), the still-running sibling list is\n-- snapshotted into `cancel_siblings_pending` and `cancel_siblings_pending_flag`\n-- is set to \"true\". The dispatcher (Stage C) drains these; Stage B\n-- only WRITES the flag / list.\n--\n-- LetRun: siblings are never flagged, regardless of terminal\n-- `group_state` branch (RFC-016 \u{a7}5, adjudication 2026-04-23). Pure.\n---------------------------------------------------------------------------\nlocal function resolve_dependency_quorum(K, A, dep, policy_variant)\n local outcome_bucket -- \"succeeded\" | \"failed\" | \"skipped\"\n if A.upstream_outcome == \"success\" then\n outcome_bucket = \"succeeded\"\n elseif A.upstream_outcome == \"skipped\" then\n outcome_bucket = \"skipped\"\n else\n outcome_bucket = \"failed\"\n end\n\n -- Idempotency on dep_hash: mark the edge resolved + maintain\n -- unresolved_set so describe_flow\'s legacy counters don\'t drift out\n -- of sync with edge-level state. 
The edgegroup hash is the source of\n -- truth for eligibility under AnyOf/Quorum; deps_meta counters are\n -- not consulted on this path.\n if outcome_bucket == \"succeeded\" then\n redis.call(\"HSET\", K.dep_hash,\n \"state\", \"satisfied\", \"last_resolved_at\", A.now_ms)\n else\n redis.call(\"HSET\", K.dep_hash,\n \"state\", \"impossible\", \"last_resolved_at\", A.now_ms)\n end\n redis.call(\"SREM\", K.unresolved_set, A.edge_id)\n\n -- Atomically bump the matching counter on the edgegroup hash.\n local new_count = redis.call(\"HINCRBY\", K.edgegroup, outcome_bucket, 1)\n\n -- Load the group snapshot for decision-making.\n local group_raw = redis.call(\"HGETALL\", K.edgegroup)\n local group = hgetall_to_table(group_raw)\n local n = tonumber(group.n or \"0\")\n local succeeded = tonumber(group.succeeded or \"0\")\n local failed = tonumber(group.failed or \"0\")\n local skipped = tonumber(group.skipped or \"0\")\n local prior_state = group.group_state or \"pending\"\n local on_satisfied = group.on_satisfied or \"\"\n local k\n if policy_variant == \"any_of\" then\n k = 1\n else\n k = tonumber(group.k or \"1\")\n if k < 1 then k = 1 end\n end\n\n -- Invariant Q2 defence: counters must not exceed n.\n if (succeeded + failed + skipped) > n then\n -- Counter overflow (edge applied twice?) \u{2014} flag corruption and\n -- abort without transitioning the downstream.\n return err(\"invariant_violation\",\n \"edgegroup counters exceed n: \" .. tostring(succeeded + failed + skipped) .. \"/\" .. tostring(n))\n end\n\n -- Once-fired (Q1): if group already terminal, only update counters.\n -- The HINCRBY above already did that. No downstream transition.\n if prior_state == \"satisfied\" or prior_state == \"impossible\"\n or prior_state == \"cancelled\" then\n return ok(\"already_fired\", \"\")\n end\n\n -- Evaluate current state.\n local new_state = \"pending\"\n if succeeded >= k then\n new_state = \"satisfied\"\n elseif (failed + skipped) > (n - k) then\n new_state = \"impossible\"\n end\n\n if new_state == \"pending\" then\n return ok(\"pending\", \"\")\n end\n\n -- Terminal transition: write group_state + satisfied_at, decide\n -- cancel_siblings_pending, and flip the downstream execution.\n redis.call(\"HSET\", K.edgegroup, \"group_state\", new_state)\n if new_state == \"satisfied\" then\n redis.call(\"HSET\", K.edgegroup, \"satisfied_at\", A.now_ms)\n end\n\n -- Sibling-cancel flagging + membership snapshot (RFC-016 Stage C).\n -- Stage B set the flag; Stage C additionally:\n -- (a) writes the list of still-running-sibling execution ids to the\n -- edgegroup hash field `cancel_siblings_pending_members`\n -- (pipe-delimited string; empty \u{21d2} no sibling needs cancellation),\n -- (b) SADDs the tuple `<flow_id>|<downstream_eid>` to the\n -- per-flow-partition index SET `ff:pending_cancel_groups:{fp:N}`\n -- so the dispatcher scanner can iterate without a full scan.\n -- All writes stay on the downstream\'s {fp:N} slot; siblings are\n -- guaranteed co-located with the flow per RFC-011 \u{a7}11.\n -- LetRun is pure: never flag, never enumerate siblings, regardless of\n -- satisfied vs impossible (RFC-016 \u{a7}5, adjudication 2026-04-23).\n if on_satisfied == \"cancel_remaining\" then\n redis.call(\"HSET\", K.edgegroup,\n \"cancel_siblings_pending_flag\", \"true\",\n \"cancel_siblings_reason\",\n (new_state == \"satisfied\") and \"sibling_quorum_satisfied\"\n or \"sibling_quorum_impossible\")\n\n -- Enumerate still-running siblings. 
The incoming_set lists all\n -- inbound edge ids for the downstream; each edge hash stores the\n -- upstream_execution_id. A sibling is \"still running\" iff its\n -- exec_core\'s lifecycle_phase is NOT \"terminal\". The just-resolved\n -- upstream\'s dep_hash is flipped above \u{2014} but at this moment its\n -- exec_core may or may not have reached terminal_outcome; include\n -- only genuinely-non-terminal executions.\n local members = {}\n if is_set(K.incoming_set) and is_set(K.flow_edge_prefix) then\n local edge_ids = redis.call(\"SMEMBERS\", K.incoming_set)\n for i = 1, #edge_ids do\n local e_key = K.flow_edge_prefix .. edge_ids[i]\n local up_eid = redis.call(\"HGET\", e_key, \"upstream_execution_id\")\n if is_set(up_eid) then\n -- Build sibling exec_core key on the same slot (all members of\n -- a flow share the {fp:N} hash-tag baked into the eid string).\n local sib_core_key = \"ff:exec:\" .. A.exec_tag .. \":\" .. up_eid .. \":core\"\n local sib_phase = redis.call(\"HGET\", sib_core_key, \"lifecycle_phase\")\n if is_set(sib_phase) and sib_phase ~= \"terminal\" then\n members[#members + 1] = up_eid\n end\n end\n end\n end\n\n local members_str = table.concat(members, \"|\")\n redis.call(\"HSET\", K.edgegroup,\n \"cancel_siblings_pending_members\", members_str)\n\n -- SADD the tuple even when members is empty \u{2014} the dispatcher SREMs\n -- the tuple atomically after it observes the empty list + clears\n -- the flag (drain-done). This keeps crash-in-flight observable\n -- via the SET rather than via a silent edgegroup-hash field.\n if is_set(K.pending_cancel_groups_set)\n and is_set(A.flow_id) and is_set(A.downstream_eid) then\n redis.call(\"SADD\", K.pending_cancel_groups_set,\n A.flow_id .. \"|\" .. A.downstream_eid)\n end\n end\n\n -- Downstream transition.\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return ok(new_state, \"\") end\n local core = hgetall_to_table(raw)\n\n if new_state == \"satisfied\" then\n -- Satisfied: flip eligible, optionally COPY data_passing_ref.\n local data_injected = \"\"\n if is_set(dep.data_passing_ref)\n and outcome_bucket == \"succeeded\"\n and core.terminal_outcome == \"none\" then\n local copied = redis.call(\n \"COPY\", K.upstream_result, K.downstream_payload, \"REPLACE\")\n if copied == 1 then\n data_injected = \"data_injected\"\n end\n end\n\n if core.lifecycle_phase == \"runnable\"\n and core.ownership_state == \"unowned\"\n and core.terminal_outcome == \"none\"\n and core.eligibility_state == \"blocked_by_dependencies\" then\n local new_attempt_state = core.attempt_state\n if not is_set(new_attempt_state) or new_attempt_state == \"none\" then\n new_attempt_state = \"pending_first_attempt\"\n end\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", core.lifecycle_phase,\n \"ownership_state\", core.ownership_state or \"unowned\",\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", new_attempt_state,\n \"public_state\", \"waiting\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n redis.call(\"ZREM\", K.blocked_deps_zset, core.execution_id or \"\")\n local priority = tonumber(core.priority or \"0\")\n local created_at_ms = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at_ms\n redis.call(\"ZADD\", K.eligible_zset, score, core.execution_id or \"\")\n end\n return ok(\"satisfied\", data_injected)\n end\n\n -- Impossible: 
short-circuit skip the downstream.\n local child_skipped = false\n if core.terminal_outcome == \"none\" then\n local skip_attempt_state = core.attempt_state or \"none\"\n if skip_attempt_state == \"running_attempt\"\n or skip_attempt_state == \"attempt_interrupted\" then\n skip_attempt_state = \"attempt_terminal\"\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_cancelled\",\n \"ended_at\", A.now_ms,\n \"failure_reason\", \"dependency_impossible\")\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", A.now_ms,\n \"closed_reason\", \"dependency_impossible\")\n end\n elseif is_set(skip_attempt_state) and skip_attempt_state ~= \"none\" then\n skip_attempt_state = \"none\"\n end\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"skipped\",\n \"attempt_state\", skip_attempt_state,\n \"public_state\", \"skipped\",\n \"completed_at\", A.now_ms,\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n redis.call(\"ZREM\", K.blocked_deps_zset, core.execution_id or \"\")\n redis.call(\"ZADD\", K.terminal_zset, tonumber(A.now_ms), core.execution_id or \"\")\n child_skipped = true\n\n if is_set(core.flow_id) and is_set(core.execution_id) then\n local payload = cjson.encode({\n execution_id = core.execution_id,\n flow_id = core.flow_id,\n outcome = \"skipped\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n end\n\n return ok(\"impossible\", child_skipped and \"child_skipped\" or \"\")\nend\n\n---------------------------------------------------------------------------\n-- #23 ff_resolve_dependency (on {p:N})\n--\n-- Resolve one dependency edge: satisfied (upstream success) or impossible\n-- (upstream failed/cancelled/expired). Updates child eligibility.\n--\n-- On satisfaction, if the edge was staged with a non-empty\n-- `data_passing_ref`, atomically COPYs the upstream\'s result key into\n-- the downstream\'s input_payload key before flipping the child to\n-- eligible. 
Upstream + downstream are guaranteed co-located on the\n-- same {fp:N} slot by flow membership (RFC-011 \u{a7}7.3).\n--\n-- KEYS (14): exec_core, deps_meta, unresolved_set, dep_hash,\n-- eligible_zset, terminal_zset, blocked_deps_zset,\n-- attempt_hash, stream_meta, downstream_payload,\n-- upstream_result, edgegroup,\n-- -- RFC-016 Stage C additions (both required for the\n-- -- AnyOf/Quorum+CancelRemaining path; may be passed as\n-- -- empty strings when the caller knows the resolution\n-- -- cannot flip to a CancelRemaining terminal state):\n-- incoming_set, pending_cancel_groups_set\n-- ARGV (5): edge_id, upstream_outcome, now_ms,\n-- -- RFC-016 Stage C additions (required when the edgegroup\n-- -- uses CancelRemaining; may be empty for AllOf):\n-- flow_id, downstream_eid\n---------------------------------------------------------------------------\nredis.register_function(\'ff_resolve_dependency\', function(keys, args)\n local K = {\n core_key = keys[1],\n deps_meta = keys[2],\n unresolved_set = keys[3],\n dep_hash = keys[4],\n eligible_zset = keys[5],\n terminal_zset = keys[6],\n blocked_deps_zset = keys[7],\n attempt_hash = keys[8],\n stream_meta = keys[9],\n downstream_payload = keys[10],\n upstream_result = keys[11],\n edgegroup = keys[12],\n -- RFC-016 Stage C: sibling enumeration + pending-cancel-groups index.\n incoming_set = keys[13] or \"\",\n pending_cancel_groups_set = keys[14] or \"\",\n }\n\n local A = {\n edge_id = args[1],\n upstream_outcome = args[2],\n now_ms = args[3],\n flow_id = args[4] or \"\",\n downstream_eid = args[5] or \"\",\n }\n\n -- Derive the flow-partition hash-tag + per-flow edge-hash prefix from\n -- the edgegroup key (format: `ff:flow:{fp:N}:<flow_id>:edgegroup:<eid>`).\n -- These are used by the Stage C sibling-enumeration loop to read edge\n -- hashes and sibling exec_cores on the same slot as the edgegroup.\n local exec_tag = \"\"\n local flow_edge_prefix = \"\"\n if is_set(K.edgegroup) then\n exec_tag = string.match(K.edgegroup, \"(%b{})\") or \"\"\n if is_set(A.flow_id) then\n flow_edge_prefix = \"ff:flow:\" .. exec_tag .. \":\" .. A.flow_id .. \":edge:\"\n end\n end\n A.exec_tag = exec_tag\n K.flow_edge_prefix = flow_edge_prefix\n\n -- 1. Read dep record\n local dep_raw = redis.call(\"HGETALL\", K.dep_hash)\n if #dep_raw == 0 then return err(\"invalid_dependency\") end\n local dep = hgetall_to_table(dep_raw)\n\n -- 2. Already resolved?\n if dep.state == \"satisfied\" or dep.state == \"impossible\" then\n return ok(\"already_resolved\")\n end\n\n -- RFC-016 Stage B: branch on edge-group policy. AnyOf / Quorum run\n -- the four-counter state machine and own the downstream transition;\n -- AllOf (and legacy flows without an edgegroup hash) keep the\n -- original path unchanged.\n local policy_variant_b = nil\n if is_set(K.edgegroup) and redis.call(\"EXISTS\", K.edgegroup) == 1 then\n policy_variant_b = redis.call(\"HGET\", K.edgegroup, \"policy_variant\")\n end\n\n if policy_variant_b == \"any_of\" or policy_variant_b == \"quorum\" then\n return resolve_dependency_quorum(K, A, dep, policy_variant_b)\n end\n\n -- 3. 
Satisfaction path (upstream completed successfully)\n if A.upstream_outcome == \"success\" then\n redis.call(\"HSET\", K.dep_hash,\n \"state\", \"satisfied\", \"last_resolved_at\", A.now_ms)\n redis.call(\"SREM\", K.unresolved_set, A.edge_id)\n local remaining = redis.call(\"HINCRBY\", K.deps_meta,\n \"unsatisfied_required_count\", -1)\n redis.call(\"HSET\", K.deps_meta, \"last_dependency_update_at\", A.now_ms)\n\n -- RFC-016 Stage A: dual-write the edgegroup hash counters. The\n -- eligibility decision below still keys off `remaining == 0` so\n -- Stage A is behaviour-identical for existing flows; the\n -- edgegroup hash provides the snapshot source of truth going\n -- forward and is the foundation the Stage B resolver extends.\n if is_set(K.edgegroup) and redis.call(\"EXISTS\", K.edgegroup) == 1 then\n redis.call(\"HINCRBY\", K.edgegroup, \"succeeded\", 1)\n if remaining == 0 then\n redis.call(\"HSET\", K.edgegroup, \"group_state\", \"satisfied\")\n end\n end\n\n -- Check if all deps now satisfied\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return ok(\"satisfied\", \"\") end\n local core = hgetall_to_table(raw)\n\n -- Server-side data_passing_ref resolution (Batch C item 3). When\n -- the edge was staged with a non-empty `data_passing_ref`, replace\n -- the downstream\'s input_payload with the upstream\'s result. COPY\n -- is a single-slot server-internal op (no round-trip to Lua\n -- memory) so large result payloads don\'t inflate the FCALL\'s\n -- working set.\n --\n -- Terminal-child guard: a late satisfaction can race with the\n -- child being cancelled or skipped. Don\'t overwrite the payload\n -- of a child that has already reached a terminal state \u{2014} it\'s at\n -- best pointless (the worker will never read it) and at worst\n -- noisy for post-mortem debugging.\n --\n -- Write-ordering note (RFC-010 \u{a7}4.8b): COPY runs BEFORE the\n -- eligibility transition below so a crash between the two leaves\n -- the child blocked (or late-satisfied on reconciler retry) with\n -- the correct payload rather than eligible with a stale one.\n --\n -- Void-completion path: if the upstream called complete(None), the\n -- result key does not exist \u{2014} COPY returns 0 and data_injected\n -- stays empty, leaving the child\'s original input_payload intact.\n local data_injected = \"\"\n if is_set(dep.data_passing_ref)\n and core.terminal_outcome == \"none\" then\n local copied = redis.call(\n \"COPY\", K.upstream_result, K.downstream_payload, \"REPLACE\")\n if copied == 1 then\n data_injected = \"data_injected\"\n end\n end\n\n if remaining == 0\n and core.lifecycle_phase == \"runnable\"\n and core.ownership_state == \"unowned\"\n and core.terminal_outcome == \"none\"\n and core.eligibility_state == \"blocked_by_dependencies\" then\n -- Preserve attempt_state\n local new_attempt_state = core.attempt_state\n if not is_set(new_attempt_state) or new_attempt_state == \"none\" then\n new_attempt_state = \"pending_first_attempt\"\n end\n -- ALL 7 dims\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", core.lifecycle_phase, -- preserve (runnable)\n \"ownership_state\", core.ownership_state or \"unowned\", -- preserve\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\", -- preserve\n \"attempt_state\", new_attempt_state,\n \"public_state\", \"waiting\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n redis.call(\"ZREM\", K.blocked_deps_zset, 
core.execution_id or \"\")\n local priority = tonumber(core.priority or \"0\")\n local created_at_ms = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at_ms\n redis.call(\"ZADD\", K.eligible_zset, score, core.execution_id or \"\")\n end\n\n return ok(\"satisfied\", data_injected)\n end\n\n -- 4. Impossible path (upstream failed/cancelled/expired/skipped)\n redis.call(\"HSET\", K.dep_hash,\n \"state\", \"impossible\", \"last_resolved_at\", A.now_ms)\n redis.call(\"SREM\", K.unresolved_set, A.edge_id)\n redis.call(\"HINCRBY\", K.deps_meta, \"unsatisfied_required_count\", -1)\n redis.call(\"HINCRBY\", K.deps_meta, \"impossible_required_count\", 1)\n redis.call(\"HSET\", K.deps_meta, \"last_dependency_update_at\", A.now_ms)\n\n -- RFC-016 Stage A: dual-write the edgegroup hash. AllOf short-\n -- circuits to impossible on the first non-success terminal\n -- (Invariant Q4). Failed/skipped bucket is lumped here as\n -- `failed_count` for Stage A \u{2014} Stage B will split them per the\n -- four-counter model (\u{a7}3).\n if is_set(K.edgegroup) and redis.call(\"EXISTS\", K.edgegroup) == 1 then\n redis.call(\"HINCRBY\", K.edgegroup, \"failed\", 1)\n redis.call(\"HSET\", K.edgegroup, \"group_state\", \"impossible\")\n end\n\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return ok(\"impossible\", \"\") end\n local core = hgetall_to_table(raw)\n\n local child_skipped = false\n\n if core.terminal_outcome == \"none\" then\n -- Determine attempt_state for skip\n local skip_attempt_state = core.attempt_state or \"none\"\n if skip_attempt_state == \"running_attempt\"\n or skip_attempt_state == \"attempt_interrupted\" then\n -- NOTE: If the child is active (worker holding lease), this FCALL runs\n -- on {p:N} (exec partition) so we CAN write exec_core and attempt_hash.\n -- However, lease_current, lease_expiry_zset, worker_leases, and\n -- active_index also live on {p:N} \u{2014} but the KEYS array for this\n -- function does not include them (none of the 14 KEYS above). Cleaning them here\n -- would require adding more KEYS slots and pre-reading worker_instance_id\n -- to construct the worker_leases key. Instead, lease cleanup is\n -- delegated to the lease_expiry scanner (1.5s default interval):\n -- 1. lease_expiry_scanner sees expired lease \u{2192} ff_mark_lease_expired_if_due\n -- 2. Worker\'s renewal sees terminal \u{2192} stops with terminal error\n -- 3. ff_expire_execution (attempt_timeout/deadline scanner) does full cleanup\n -- Race window: between this skip and scanner cleanup, exec_core is\n -- terminal(skipped) but stale entries remain in active/lease indexes.\n -- Bounded by lease_expiry_interval (default 1.5s). 
Index reconciler\n -- detects and logs any residual inconsistency at 45s intervals.\n skip_attempt_state = \"attempt_terminal\"\n -- End real attempt + close stream\n redis.call(\"HSET\", K.attempt_hash,\n \"attempt_state\", \"ended_cancelled\",\n \"ended_at\", A.now_ms,\n \"failure_reason\", \"dependency_impossible\")\n if redis.call(\"EXISTS\", K.stream_meta) == 1 then\n redis.call(\"HSET\", K.stream_meta,\n \"closed_at\", A.now_ms,\n \"closed_reason\", \"dependency_impossible\")\n end\n elseif is_set(skip_attempt_state) and skip_attempt_state ~= \"none\" then\n skip_attempt_state = \"none\"\n end\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"terminal\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"not_applicable\",\n \"blocking_reason\", \"none\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"skipped\",\n \"attempt_state\", skip_attempt_state,\n \"public_state\", \"skipped\",\n \"completed_at\", A.now_ms,\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n redis.call(\"ZREM\", K.blocked_deps_zset, core.execution_id or \"\")\n redis.call(\"ZADD\", K.terminal_zset, tonumber(A.now_ms), core.execution_id or \"\")\n child_skipped = true\n\n -- Push-based DAG promotion (bridge-event gap report \u{a7}1.3 analogue).\n -- A child skipped due to an impossible upstream is an FF-initiated\n -- terminal transition: cairn never calls anything for the skip, so\n -- without a PUBLISH the skip\'s own downstream edges only resolve\n -- via the 15s dependency_reconciler safety net. Symmetric with the\n -- other terminal sites (ff_complete_execution et al.). Gated on\n -- `is_set(core.flow_id)` \u{2014} a skip on a standalone exec would be a\n -- bug upstream (standalones have no edges), but the gate keeps the\n -- invariant consistent with the other emit sites. Also gated on\n -- `is_set(core.execution_id)`: the ff-backend-valkey subscriber\n -- fails to parse an empty execution_id and silently drops the\n -- message, reintroducing reconciler-latency for that exec.\n if is_set(core.flow_id) and is_set(core.execution_id) then\n local payload = cjson.encode({\n execution_id = core.execution_id,\n flow_id = core.flow_id,\n outcome = \"skipped\",\n })\n redis.call(\"PUBLISH\", \"ff:dag:completions\", payload)\n end\n end\n\n return ok(\"impossible\", child_skipped and \"child_skipped\" or \"\")\nend)\n\n---------------------------------------------------------------------------\n-- RFC-016 Stage C: ff_drain_sibling_cancel_group (on {fp:N})\n--\n-- Atomic drain call issued by the ff-engine sibling-cancel dispatcher\n-- after it has fired `ff_cancel_execution` against every member of\n-- `cancel_siblings_pending_members`. 
In one Lua unit:\n-- (1) SREM `<flow_id>|<downstream_eid>` from the per-partition\n-- `ff:pending_cancel_groups:{fp:N}` index SET,\n-- (2) HDEL `cancel_siblings_pending_members` +\n-- `cancel_siblings_pending_flag` from the edgegroup hash,\n-- leaving `cancel_siblings_reason` + `satisfied_at` as\n-- observability breadcrumbs.\n--\n-- Returns \"drained\" on success (tuple was present in the SET) or\n-- \"already_drained\" when the dispatcher double-fires (idempotent).\n-- If the edgegroup hash is gone (retention / manual delete) the SET\n-- entry is still removed \u{2014} nothing authoritative to clear on the\n-- hash \u{2014} and the call returns \"drained_sans_group\".\n--\n-- KEYS (2): pending_cancel_groups_set, edgegroup\n-- ARGV (2): flow_id, downstream_eid\n---------------------------------------------------------------------------\nredis.register_function(\'ff_drain_sibling_cancel_group\', function(keys, args)\n local pending_set = keys[1]\n local edgegroup = keys[2]\n local flow_id = args[1] or \"\"\n local downstream_eid = args[2] or \"\"\n\n if not is_set(flow_id) or not is_set(downstream_eid) then\n return err(\"invalid_args\", \"flow_id and downstream_eid required\")\n end\n\n local member = flow_id .. \"|\" .. downstream_eid\n local removed = redis.call(\"SREM\", pending_set, member)\n\n local group_exists = redis.call(\"EXISTS\", edgegroup) == 1\n if group_exists then\n redis.call(\"HDEL\", edgegroup,\n \"cancel_siblings_pending_flag\",\n \"cancel_siblings_pending_members\")\n end\n\n if removed == 0 then\n return ok(\"already_drained\", \"\")\n end\n if not group_exists then\n return ok(\"drained_sans_group\", \"\")\n end\n return ok(\"drained\", \"\")\nend)\n\n---------------------------------------------------------------------------\n-- RFC-016 Stage D: ff_reconcile_sibling_cancel_group (on {fp:N})\n--\n-- Crash-recovery reconciler for Invariant Q6. Stage C\'s dispatcher\n-- populates `pending_cancel_groups` under ff_resolve_dependency\'s\n-- atomic satisfied/impossible flip and drains it via\n-- ff_drain_sibling_cancel_group after per-sibling cancels land. An\n-- engine crash between the SADD + the drain leaves stale tuples in\n-- the SET. This function reconciles one tuple atomically:\n--\n-- (1) flag false / absent AND SET still carries the tuple:\n-- stale marker \u{2014} SREM + return \"sremmed_stale\".\n-- (2) flag true AND every sibling in `cancel_siblings_pending_members`\n-- is already terminal (dispatcher fired cancels but crashed\n-- before drain): HDEL flag/members + SREM + return\n-- \"completed_drain\".\n-- (3) flag true AND at least one sibling non-terminal: leave alone\n-- \u{2014} the dispatcher owns this tuple on its next tick. 
Return\n-- \"no_op\".\n-- (4) edgegroup missing (retention / manual delete) with the tuple\n-- still present: SREM + return \"sremmed_stale\" \u{2014} nothing to\n-- reconcile; drop the orphan index entry.\n--\n-- Reconciler MUST NOT fight the dispatcher: case (3) leaves state\n-- unchanged.\n--\n-- KEYS (2): pending_cancel_groups_set, edgegroup\n-- ARGV (2): flow_id, downstream_eid\n---------------------------------------------------------------------------\nredis.register_function(\'ff_reconcile_sibling_cancel_group\', function(keys, args)\n local pending_set = keys[1]\n local edgegroup = keys[2]\n local flow_id = args[1] or \"\"\n local downstream_eid = args[2] or \"\"\n\n if not is_set(flow_id) or not is_set(downstream_eid) then\n return err(\"invalid_args\", \"flow_id and downstream_eid required\")\n end\n\n local member = flow_id .. \"|\" .. downstream_eid\n local in_set = redis.call(\"SISMEMBER\", pending_set, member) == 1\n if not in_set then\n -- Raced with the dispatcher or a prior reconcile tick; nothing to do.\n return ok(\"no_op\", \"not_in_set\")\n end\n\n local group_exists = redis.call(\"EXISTS\", edgegroup) == 1\n if not group_exists then\n -- Orphan tuple \u{2014} group is gone, no sibling state to check.\n redis.call(\"SREM\", pending_set, member)\n return ok(\"sremmed_stale\", \"missing_edgegroup\")\n end\n\n local flag = redis.call(\"HGET\", edgegroup, \"cancel_siblings_pending_flag\")\n if not is_set(flag) or flag ~= \"true\" then\n -- Dispatcher HDEL\'d the flag but crashed before SREM.\n redis.call(\"SREM\", pending_set, member)\n return ok(\"sremmed_stale\", \"flag_cleared\")\n end\n\n -- Flag true: check members. Empty members_str means the dispatcher\n -- observed no live siblings at flip-time \u{2014} still an interrupted\n -- drain case since the flag is set; clear and SREM.\n local members_str = redis.call(\"HGET\", edgegroup, \"cancel_siblings_pending_members\")\n if not is_set(members_str) then\n members_str = \"\"\n end\n\n -- Derive exec_tag from edgegroup key (`ff:flow:{fp:N}:<flow_id>:edgegroup:<eid>`).\n -- All flow members share this slot per RFC-011 \u{a7}11.\n local exec_tag = string.match(edgegroup, \"(%b{})\") or \"\"\n\n local all_terminal = true\n if is_set(members_str) and is_set(exec_tag) then\n for sib_eid in string.gmatch(members_str, \"([^|]+)\") do\n local sib_core_key = \"ff:exec:\" .. exec_tag .. \":\" .. sib_eid .. \":core\"\n local phase = redis.call(\"HGET\", sib_core_key, \"lifecycle_phase\")\n if is_set(phase) and phase ~= \"terminal\" then\n all_terminal = false\n break\n end\n -- Missing sibling core (retention): treat as terminal for\n -- reconcile purposes \u{2014} nothing to cancel, drain is effectively\n -- complete.\n end\n end\n\n if not all_terminal then\n -- Dispatcher will handle on its next tick.\n return ok(\"no_op\", \"siblings_running\")\n end\n\n redis.call(\"HDEL\", edgegroup,\n \"cancel_siblings_pending_flag\",\n \"cancel_siblings_pending_members\")\n redis.call(\"SREM\", pending_set, member)\n return ok(\"completed_drain\", \"\")\nend)\n\n---------------------------------------------------------------------------\n-- #24 ff_evaluate_flow_eligibility (on {p:N})\n--\n-- Read-only check of execution + dependency state. 
Class C.\n--\n-- KEYS (2): exec_core, deps_meta\n-- ARGV (0)\n---------------------------------------------------------------------------\nredis.register_function(\'ff_evaluate_flow_eligibility\', function(keys, args)\n local raw = redis.call(\"HGETALL\", keys[1])\n if #raw == 0 then return ok(\"not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"runnable\" then\n return ok(\"not_runnable\")\n end\n if core.ownership_state ~= \"unowned\" then\n return ok(\"owned\")\n end\n if core.terminal_outcome ~= \"none\" then\n return ok(\"terminal\")\n end\n\n local deps_raw = redis.call(\"HGETALL\", keys[2])\n if #deps_raw == 0 then\n return ok(\"eligible\")\n end\n local deps = hgetall_to_table(deps_raw)\n\n local impossible = tonumber(deps.impossible_required_count or \"0\")\n if impossible > 0 then\n return ok(\"impossible\")\n end\n\n local unresolved = tonumber(deps.unsatisfied_required_count or \"0\")\n if unresolved > 0 then\n return ok(\"blocked_by_dependencies\")\n end\n\n return ok(\"eligible\")\nend)\n\n---------------------------------------------------------------------------\n-- #35 ff_promote_blocked_to_eligible (on {p:N})\n--\n-- Promote zero-dep flow member from blocked:dependencies to eligible.\n--\n-- KEYS (5): exec_core, blocked_deps_zset, eligible_zset, deps_meta,\n-- deps_unresolved\n-- ARGV (2): execution_id, now_ms\n---------------------------------------------------------------------------\nredis.register_function(\'ff_promote_blocked_to_eligible\', function(keys, args)\n local K = {\n core_key = keys[1],\n blocked_deps_zset = keys[2],\n eligible_zset = keys[3],\n deps_meta = keys[4],\n deps_unresolved = keys[5],\n }\n\n local A = {\n execution_id = args[1],\n now_ms = args[2],\n }\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n if core.lifecycle_phase ~= \"runnable\" then\n return err(\"not_runnable\")\n end\n if core.eligibility_state ~= \"blocked_by_dependencies\" then\n return err(\"not_blocked_by_deps\")\n end\n if core.terminal_outcome ~= \"none\" then\n return err(\"terminal\")\n end\n\n -- 2. Verify zero deps\n local unsatisfied = tonumber(\n redis.call(\"HGET\", K.deps_meta, \"unsatisfied_required_count\") or \"0\")\n local unresolved_count = redis.call(\"SCARD\", K.deps_unresolved)\n if unsatisfied > 0 or unresolved_count > 0 then\n return err(\"deps_not_satisfied\", tostring(unsatisfied), tostring(unresolved_count))\n end\n\n -- 3. Preserve attempt_state\n local new_attempt_state = core.attempt_state\n if not is_set(new_attempt_state) or new_attempt_state == \"none\" then\n new_attempt_state = \"pending_first_attempt\"\n end\n\n -- 4. 
Transition (ALL 7 dims)\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", core.lifecycle_phase, -- preserve (runnable)\n \"ownership_state\", core.ownership_state or \"unowned\", -- preserve\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\", -- preserve\n \"attempt_state\", new_attempt_state,\n \"public_state\", \"waiting\",\n \"last_transition_at\", A.now_ms,\n \"last_mutation_at\", A.now_ms)\n\n redis.call(\"ZREM\", K.blocked_deps_zset, A.execution_id)\n local priority = tonumber(core.priority or \"0\")\n local created_at_ms = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at_ms\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n\n return ok()\nend)\n\n---------------------------------------------------------------------------\n-- #12b ff_replay_execution (on {p:N})\n--\n-- Reset a terminal execution for replay. If skipped flow member: reset\n-- impossible deps back to unsatisfied, recompute counts, set\n-- blocked_by_dependencies instead of eligible_now.\n--\n-- KEYS (4+N): exec_core, terminal_zset, eligible_zset, lease_history,\n-- [blocked_deps_zset, deps_meta, deps_unresolved, dep_edge_0..N]\n-- ARGV (2+N): execution_id, now_ms, [edge_id_0..N]\n---------------------------------------------------------------------------\nredis.register_function(\'ff_replay_execution\', function(keys, args)\n local K = {\n core_key = keys[1],\n terminal_zset = keys[2],\n eligible_zset = keys[3],\n lease_history = keys[4],\n }\n\n local A = {\n execution_id = args[1],\n }\n\n local t = redis.call(\"TIME\")\n local now_ms = tonumber(t[1]) * 1000 + math.floor(tonumber(t[2]) / 1000)\n\n -- 1. Read execution core\n local raw = redis.call(\"HGETALL\", K.core_key)\n if #raw == 0 then return err(\"execution_not_found\") end\n local core = hgetall_to_table(raw)\n\n -- 2. Must be terminal\n if core.lifecycle_phase ~= \"terminal\" then\n return err(\"execution_not_terminal\")\n end\n\n -- 3. Check replay limit (read from policy, same pattern as ff_reclaim_execution)\n local replay_count = tonumber(core.replay_count or \"0\")\n local max_replays = 10 -- default\n local policy_key = string.gsub(K.core_key, \":core$\", \":policy\")\n local policy_raw = redis.call(\"GET\", policy_key)\n if policy_raw then\n local ok_p, pol = pcall(cjson.decode, policy_raw)\n if ok_p and type(pol) == \"table\" then\n max_replays = tonumber(pol.max_replay_count) or 10\n end\n end\n if replay_count >= max_replays then\n return err(\"max_replays_exhausted\")\n end\n\n -- 4. 
Determine replay path\n local is_skipped_flow_member = (core.terminal_outcome == \"skipped\") and is_set(core.flow_id)\n\n if is_skipped_flow_member then\n -- SKIPPED FLOW MEMBER PATH: reset impossible deps \u{2192} blocked on deps\n local blocked_deps_zset = keys[5]\n local deps_meta = keys[6]\n local deps_unresolved = keys[7]\n\n -- Reset impossible dep edges back to unsatisfied\n local num_edges = #args - 2\n local new_unsatisfied = 0\n for i = 1, num_edges do\n local edge_id = args[2 + i]\n local dep_key = keys[7 + i] -- dep_edge keys start at KEYS[8]\n\n local dep_state = redis.call(\"HGET\", dep_key, \"state\")\n if dep_state == \"impossible\" then\n redis.call(\"HSET\", dep_key,\n \"state\", \"unsatisfied\",\n \"last_resolved_at\", \"\")\n redis.call(\"SADD\", deps_unresolved, edge_id)\n new_unsatisfied = new_unsatisfied + 1\n elseif dep_state == \"unsatisfied\" then\n new_unsatisfied = new_unsatisfied + 1\n end\n -- satisfied edges remain satisfied (upstream already succeeded)\n end\n\n -- Recompute deps:meta counts\n redis.call(\"HSET\", deps_meta,\n \"unsatisfied_required_count\", tostring(new_unsatisfied),\n \"impossible_required_count\", \"0\",\n \"last_dependency_update_at\", tostring(now_ms))\n\n -- Transition: terminal \u{2192} runnable/blocked_by_dependencies\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"blocked_by_dependencies\",\n \"blocking_reason\", \"waiting_for_children\",\n \"blocking_detail\", tostring(new_unsatisfied) .. \" dep(s) unsatisfied after replay\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"pending_replay_attempt\",\n \"public_state\", \"waiting_children\",\n \"pending_replay_attempt\", \"1\",\n \"replay_count\", tostring(replay_count + 1),\n \"completed_at\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Move from terminal \u{2192} blocked:deps\n redis.call(\"ZREM\", K.terminal_zset, A.execution_id)\n redis.call(\"ZADD\", blocked_deps_zset,\n tonumber(core.created_at or \"0\"), A.execution_id)\n\n -- Lease history\n redis.call(\"XADD\", K.lease_history, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"replay_initiated\",\n \"replay_count\", tostring(replay_count + 1),\n \"replay_type\", \"skipped_flow_member\",\n \"ts\", tostring(now_ms))\n\n return ok(tostring(new_unsatisfied))\n else\n -- NORMAL REPLAY PATH: terminal \u{2192} runnable/eligible\n local priority = tonumber(core.priority or \"0\")\n local created_at = tonumber(core.created_at or \"0\")\n local score = 0 - (priority * 1000000000000) + created_at\n\n redis.call(\"HSET\", K.core_key,\n \"lifecycle_phase\", \"runnable\",\n \"ownership_state\", \"unowned\",\n \"eligibility_state\", \"eligible_now\",\n \"blocking_reason\", \"waiting_for_worker\",\n \"blocking_detail\", \"\",\n \"terminal_outcome\", \"none\",\n \"attempt_state\", \"pending_replay_attempt\",\n \"public_state\", \"waiting\",\n \"pending_replay_attempt\", \"1\",\n \"replay_count\", tostring(replay_count + 1),\n \"completed_at\", \"\",\n \"last_transition_at\", tostring(now_ms),\n \"last_mutation_at\", tostring(now_ms))\n\n -- Move from terminal \u{2192} eligible\n redis.call(\"ZREM\", K.terminal_zset, A.execution_id)\n redis.call(\"ZADD\", K.eligible_zset, score, A.execution_id)\n\n -- Lease history\n redis.call(\"XADD\", K.lease_history, \"MAXLEN\", \"~\", 1000, \"*\",\n \"event\", \"replay_initiated\",\n \"replay_count\", tostring(replay_count + 1),\n \"replay_type\", 
\"normal\",\n \"ts\", tostring(now_ms))\n\n return ok(\"0\")\n end\nend)\n\n---------------------------------------------------------------------------\n-- ff_set_flow_tags (issue #58.4)\n--\n-- Write caller-supplied tag fields to the flow\'s separate tags key\n-- (`ff:flow:{fp:N}:<flow_id>:tags`). Mirrors `ff_set_execution_tags`.\n--\n-- Lazy migration (Option 1(a)): BEFORE writing, any existing fields on\n-- `flow_core` whose name matches the reserved namespace\n-- `^[a-z][a-z0-9_]*%.` are moved to `tags_key` and HDEL\'d from\n-- `flow_core`. Heals pre-58.4 flows that stored `<caller>.<field>` tags\n-- inline on `flow_core`. Idempotent: after first call, no fields match.\n--\n-- Tag keys MUST match `^[a-z][a-z0-9_]*%.`; violations fail-closed with\n-- `invalid_tag_key` (no migration, no write).\n--\n-- KEYS (2): flow_core, tags_key\n-- ARGV (>=2, even): k1, v1, k2, v2, ...\n---------------------------------------------------------------------------\nredis.register_function(\'ff_set_flow_tags\', function(keys, args)\n local K = {\n flow_core = keys[1],\n tags_key = keys[2],\n }\n\n local n = #args\n if n == 0 or (n % 2) ~= 0 then\n return err(\"invalid_input\", \"tags must be non-empty even-length key/value pairs\")\n end\n\n if redis.call(\"EXISTS\", K.flow_core) == 0 then\n return err(\"flow_not_found\")\n end\n\n -- Require `<caller>.<field>` with at least one non-dot char after the\n -- first dot (same rule as `ff_set_execution_tags`). Suffix may contain\n -- further dots.\n for i = 1, n, 2 do\n local k = args[i]\n if type(k) ~= \"string\" or not string.find(k, \"^[a-z][a-z0-9_]*%.[^.]\") then\n return err(\"invalid_tag_key\", tostring(k))\n end\n end\n\n -- Lazy migration: only HGETALL the core hash once per flow. A sentinel\n -- `tags_migrated=1` field on `flow_core` short-circuits subsequent\n -- calls so tag writes on well-formed flows stay O(1) instead of paying\n -- an O(n) scan of every flow_core field. The sentinel itself is\n -- dot-free snake_case \u{2014} it matches FF\'s own-field rule, not the\n -- reserved caller namespace, so it can\'t be confused with a tag.\n local migrated = redis.call(\"HGET\", K.flow_core, \"tags_migrated\")\n if migrated ~= \"1\" then\n local flat = redis.call(\"HGETALL\", K.flow_core)\n local to_migrate = {}\n local to_delete = {}\n for i = 1, #flat, 2 do\n local fname = flat[i]\n if type(fname) == \"string\" and string.find(fname, \"^[a-z][a-z0-9_]*%.[^.]\") then\n to_migrate[#to_migrate + 1] = fname\n to_migrate[#to_migrate + 1] = flat[i + 1]\n to_delete[#to_delete + 1] = fname\n end\n end\n if #to_migrate > 0 then\n redis.call(\"HSET\", K.tags_key, unpack(to_migrate))\n redis.call(\"HDEL\", K.flow_core, unpack(to_delete))\n end\n redis.call(\"HSET\", K.flow_core, \"tags_migrated\", \"1\")\n end\n\n redis.call(\"HSET\", K.tags_key, unpack(args))\n\n local now_ms = server_time_ms()\n redis.call(\"HSET\", K.flow_core, \"last_mutation_at\", tostring(now_ms))\n\n return ok(tostring(n / 2))\nend)\n\n";
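// Illustrative smoke test, not part of the generated source: the module and
// test names below are invented for this sketch. It only asserts that a
// couple of the registered function names are still present in the embedded
// source; the authoritative drift guard is the CI comparison against
// scripts/gen-ff-script-lua.sh described below.
#[cfg(test)]
mod library_source_smoke {
    use super::LIBRARY_SOURCE;

    #[test]
    fn embeds_expected_function_registrations() {
        for name in ["ff_resolve_dependency", "ff_set_flow_tags"] {
            assert!(
                LIBRARY_SOURCE.contains(name),
                "missing registration for {name}"
            );
        }
    }
}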
The compiled FlowFabric Lua library source.
Generated from lua/*.lua by scripts/gen-ff-script-lua.sh and checked
into the crate as flowfabric.lua so it ships inside the published
tarball. CI (matrix.yml) fails if this file drifts from what the
script would produce.
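As a usage illustration (an assumption about the surrounding crate, not something this file defines): the library is installed on the server with FUNCTION LOAD REPLACE, and its registered functions are then invoked with FCALL. The sketch below assumes the redis crate and an already-open connection; the key names and execution id are invented for the example, and the reply is kept as a raw redis::Value because the ok()/err() reply envelope is defined elsewhere in the library.

fn load_and_probe(con: &mut redis::Connection) -> redis::RedisResult<()> {
    // Install (or replace) the flowfabric library; returns the library name.
    redis::cmd("FUNCTION")
        .arg("LOAD")
        .arg("REPLACE")
        .arg(LIBRARY_SOURCE)
        .query::<String>(con)?;

    // Read-only eligibility probe (#24 ff_evaluate_flow_eligibility):
    // 2 KEYS (exec_core, deps_meta), 0 ARGV. Key names are illustrative only.
    let verdict: redis::Value = redis::cmd("FCALL")
        .arg("ff_evaluate_flow_eligibility")
        .arg(2)
        .arg("ff:exec:{p:1}:exec-123:core")
        .arg("ff:exec:{p:1}:exec-123:deps:meta")
        .query(con)?;
    println!("eligibility: {:?}", verdict);
    Ok(())
}

In cluster mode every KEYS entry of a single FCALL must hash to the same slot, which is why the functions above lean on the {p:N} and {fp:N} hash tags when building key names.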