Skip to main content

plsql_ir/
stmt.rs

1//! IR for PL/SQL statement bodies.
2//!
3//! Adds the [`Statement`] enum and a heuristic lowering pass that
4//! turns a raw statement-body source slice into a sequence of IR
//! statements. The full AST→IR lowering will wire `lower_statement`
//! against the actual parser tree once statement-body lowering in
//! the parser lands. Until then, this module
8//! ships:
9//!
10//! 1. The complete IR enum so downstream consumers (analysis
11//!    passes, lineage, bindings) can program against a stable
12//!    surface today.
13//! 2. A line-shaped heuristic classifier used by the engine's
14//!    source-only fallback path — sufficient for the lab corpus's
15//!    common-case statements (assignment, control flow, raise,
16//!    return, exit, null, EXECUTE IMMEDIATE, simple SQL).
17//!
18//! Both surfaces honour R13 (typed UnknownReason) by emitting
19//! [`Statement::Unrecognized`] with a reason discriminant when the
20//! recognizer cannot classify a line.
21//!
22//! ## /oracle evidence
23//!
24//! * `DATABASE-REFERENCE.md` PL/SQL Language Reference — the
25//!   recognised statement shapes (`IF / ELSIF / ELSE`, `LOOP`,
26//!   `FOR i IN …`, `WHILE`, `RAISE`, `RETURN`, `EXECUTE
27//!   IMMEDIATE`, SQL statements) match the PL/SQL Language
28//!   Reference chapter on statements.
29//! * `LOW-LEVEL-CATALOGS.md` — the supplied-package bucket
30//!   anchors `DBMS_OUTPUT` / `DBMS_SCHEDULER` usage that may
31//!   appear in EXECUTE IMMEDIATE bodies.
32
33use serde::{Deserialize, Serialize};
34
/// One PL/SQL statement, in source order.
///
/// Serialized with serde as an internally tagged enum: the variant name is
/// written to a `"kind"` field in `snake_case`, alongside the variant's own
/// fields. All `*_text` fields hold raw, un-lowered source text.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum Statement {
    /// `NULL;` — the PL/SQL no-op.
    Null,
    /// `target := expr;` — captures the LHS target name and the
    /// raw RHS expression text. Sub-expression lowering happens
    /// in a later pass.
    Assignment { target: String, rhs_text: String },
    /// `IF cond THEN … [ELSIF …] [ELSE …] END IF;`. We capture the
    /// condition text per arm + the body source slice; full body
    /// lowering re-enters `lower_statement_body` on each slice once
    /// the parser wires it.
    If {
        /// The `IF` arm followed by every `ELSIF` arm, in source order.
        arms: Vec<IfArm>,
        /// Raw text of the trailing `ELSE` body, when one is present.
        else_body_text: Option<String>,
    },
    /// `LOOP … END LOOP;` (bare loop).
    BareLoop { body_text: String },
    /// `FOR <ident> IN <range> LOOP … END LOOP;` — captures the
    /// iterator name + the range text.
    ForLoop {
        /// Text between `FOR` and `IN`.
        iterator: String,
        /// Text between `IN` and `LOOP`.
        range_text: String,
        /// Text between `LOOP` and the last `END LOOP`.
        body_text: String,
    },
    /// `WHILE cond LOOP … END LOOP;`.
    WhileLoop {
        /// Text between `WHILE` and `LOOP`.
        cond_text: String,
        /// Text between `LOOP` and the last `END LOOP`.
        body_text: String,
    },
    /// `RAISE [exception_name];`.
    Raise { exception: Option<String> },
    /// `RETURN [expr];`.
    Return { value_text: Option<String> },
    /// `EXIT [WHEN cond];`.
    Exit { when_text: Option<String> },
    /// `EXECUTE IMMEDIATE 'sql' [USING binds] [INTO targets];`.
    /// The lowering captures the SQL literal verbatim plus a
    /// boolean for whether the call had bind variables.
    ExecuteImmediate {
        /// Contents of the first quoted literal after `EXECUTE IMMEDIATE`;
        /// empty when the statement carries no quoted literal.
        sql_literal: String,
        /// Whether a `USING` clause was detected.
        has_bind_variables: bool,
    },
    /// A SQL statement embedded in PL/SQL (`SELECT … INTO`,
    /// `INSERT`, `UPDATE`, `DELETE`, `MERGE`). The verb is
    /// captured plus the raw text so downstream lineage can walk
    /// the tables it touches.
    Sql { verb: SqlVerb, raw_text: String },
    /// Anonymous nested block — `[DECLARE …] BEGIN … END;` inside
    /// the surrounding body.
    NestedBlock { body_text: String },
    /// `COMMIT;` / `ROLLBACK [TO …];` / `SAVEPOINT …;` — captured
    /// as a single kind because the engine treats them uniformly
    /// for now.
    TransactionControl { verb: String },
    /// Statement the recognizer could not classify. The
    /// `unknown_reason` discriminant feeds R13 reporting so the
    /// engine never silently drops a line.
    Unrecognized {
        /// The comment-stripped source text of the statement.
        raw_text: String,
        /// Why the recognizer gave up on this statement.
        unknown_reason: UnknownStatementReason,
    },
}
100
/// One conditional arm (`IF … THEN …` or `ELSIF … THEN …`) of a
/// [`Statement::If`]. Both fields hold raw, trimmed source text.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IfArm {
    /// Condition text between the arm's introducing keyword and `THEN`.
    pub cond_text: String,
    /// Body text between `THEN` and the next `ELSIF` / `ELSE` (or the end
    /// of the statement when this is the last arm).
    pub body_text: String,
}
106
/// Leading verb of a SQL statement embedded in PL/SQL, as carried by
/// [`Statement::Sql`]. Serialized in `snake_case`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SqlVerb {
    /// `SELECT …`.
    Select,
    /// `INSERT …`.
    Insert,
    /// `UPDATE …`.
    Update,
    /// `DELETE …`.
    Delete,
    /// `MERGE …`.
    Merge,
}
116
/// Typed reason attached to [`Statement::Unrecognized`] (R13), so a
/// statement the recognizer gives up on is reported rather than dropped.
/// Serialized in `snake_case`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UnknownStatementReason {
    /// The line did not match any recognised statement shape.
    UnrecognizedKeyword,
    /// The line started a block-shaped statement (e.g. `IF`,
    /// `LOOP`) but the recognizer could not find the matching
    /// terminator before the body ended.
    UnterminatedBlock,
    /// The line is a comment or a label and was not surfaced as
    /// a statement.
    // NOTE(review): no producer of this variant is visible in this part of
    // the file — confirm where it is emitted.
    NonStatement,
}
130
131/// Lower a raw statement-body source slice (i.e. the bytes
132/// between `BEGIN` and `END` of a routine) into a vector of
133/// IR statements. The recognizer is line-shaped:
134///
135/// 1. Split on `;` keeping the terminator with each chunk.
136/// 2. Trim whitespace + comments.
137/// 3. Classify by leading keyword (case-insensitive).
138///
139/// The pass is intentionally conservative — anything it can't
140/// confidently classify lands as `Statement::Unrecognized` with
141/// `UnrecognizedKeyword` so downstream analysis sees the source
142/// text rather than silently dropping it.
143#[must_use]
144pub fn lower_statement_body(source: &str) -> Vec<Statement> {
145    let mut out: Vec<Statement> = Vec::new();
146    for chunk in split_statements(source) {
147        let stripped = strip_comments(&chunk.text).trim().to_string();
148        if stripped.is_empty() {
149            continue;
150        }
151        if chunk.unterminated {
152            // R13: the splitter reached end-of-body with an open
153            // block (`IF`/`LOOP`/`BEGIN`/`CASE` never matched its
154            // terminator). Surface it as a typed diagnostic instead
155            // of letting a downstream classifier silently mis-parse
156            // a half-block.
157            out.push(Statement::Unrecognized {
158                raw_text: stripped,
159                unknown_reason: UnknownStatementReason::UnterminatedBlock,
160            });
161            continue;
162        }
163        out.push(classify(&stripped));
164    }
165    out
166}
167
/// One chunk produced by [`split_statements`] — the raw source text
/// plus whether the chunk was emitted because the body ended while a
/// block was still open (R13: the splitter never silently truncates).
struct StatementChunk {
    /// Raw chunk text, comments and trailing `;` included (comment
    /// stripping happens later, in [`lower_statement_body`]).
    text: String,
    /// `true` when this chunk was a block opener (`IF`/`LOOP`/
    /// `BEGIN`/`CASE`) whose matching terminator was never found
    /// before end-of-body.
    unterminated: bool,
}
178
179/// Split `source` on `;` honouring nested `BEGIN … END;` blocks
180/// **and** matching `IF … END IF;` / `LOOP … END LOOP;` /
181/// `CASE … END CASE;` so an inner semicolon doesn't tear apart a
182/// control-flow body. The result preserves the trailing semicolon
183/// (or end-keyword) on each chunk so downstream classifiers can see
184/// it.
185///
186/// Depth is incremented on every block opener — `BEGIN`, `IF`,
187/// `LOOP` (the keyword that introduces a bare / `FOR` / `WHILE`
188/// loop), and `CASE` — and decremented on the matching terminator.
189/// A bare `END` (block end) decrements; `END IF` / `END LOOP` /
190/// `END CASE` also decrement (one per matching opener) — so the
191/// three opener families stay balanced. `;` only splits at depth 0.
192///
193/// If end-of-body is reached with `depth > 0` the still-open chunk
194/// is flagged `unterminated` so [`lower_statement_body`] can emit a
195/// typed [`UnknownStatementReason::UnterminatedBlock`] (R13).
196fn split_statements(source: &str) -> Vec<StatementChunk> {
197    let mut out: Vec<StatementChunk> = Vec::new();
198    let mut depth: i32 = 0;
199    let mut buffer = String::new();
200    let upper_chars: Vec<char> = source.chars().map(|c| c.to_ascii_uppercase()).collect();
201    let mut i = 0;
202    let chars: Vec<char> = source.chars().collect();
203    while i < chars.len() {
204        // Opaque spans — string literals (`'…''…'`), Oracle q-quotes
205        // (`q'X…X'`), and comments — are copied through verbatim WITHOUT
206        // scanning: a `;` inside a literal/comment is not a statement boundary,
207        // and a `BEGIN`/`END` inside one must not move the block-depth counter.
208        // (split_statements runs on the raw body before strip_comments, so it
209        // must skip comments itself.)
210        if let Some(end) = opaque_span_end(&chars, i) {
211            for &ch in &chars[i..end] {
212                buffer.push(ch);
213            }
214            i = end;
215            continue;
216        }
217        let c = chars[i];
218        // `END IF` / `END LOOP` / `END CASE` must be matched before a
219        // bare `END`, otherwise the bare-`END` arm would consume the
220        // `END` and the depth bookkeeping would double-count.
221        if let Some(consumed) = consume_end_keyword(&upper_chars, i) {
222            depth = (depth - 1).max(0);
223            for &ch in chars.iter().skip(i).take(consumed) {
224                buffer.push(ch);
225            }
226            i += consumed;
227            continue;
228        }
229        // Track block depth by matching whole opener keywords.
230        if let Some(consumed) =
231            consume_any_keyword(&upper_chars, i, &["BEGIN", "IF", "LOOP", "CASE"])
232        {
233            depth += 1;
234            for &ch in chars.iter().skip(i).take(consumed) {
235                buffer.push(ch);
236            }
237            i += consumed;
238            continue;
239        }
240        buffer.push(c);
241        if c == ';' && depth == 0 {
242            out.push(StatementChunk {
243                text: std::mem::take(&mut buffer),
244                unterminated: false,
245            });
246        }
247        i += 1;
248    }
249    if !buffer.trim().is_empty() {
250        out.push(StatementChunk {
251            text: buffer,
252            // depth > 0 ⇒ a block opener never met its terminator.
253            unterminated: depth > 0,
254        });
255    }
256    out
257}
258
/// Detect a string literal starting at `chars[i]` and return the index one
/// past its end, or `None` when no literal starts there (or `i` is out of
/// range).
///
/// Two literal forms are recognised:
///
/// * Oracle alternative quoting — `q'X…X'`, optionally prefixed with
///   `n`/`N`. The closing delimiter is the bracket partner of `X` for
///   `[`, `(`, `{`, `<`, otherwise `X` itself. It is only recognised when
///   the character before `i` is not an identifier character, so an
///   identifier that merely ends in `q`/`n` (e.g. `acquire`) cannot
///   false-trigger.
/// * Plain single-quoted literals, where a doubled `''` is an escaped
///   quote.
///
/// An unterminated literal of either form extends to end-of-input, so
/// nothing inside an open literal is ever treated as a boundary. Mirrors
/// the canonical scanner in plsql-parser-antlr::recover.
fn string_literal_end(chars: &[char], i: usize) -> Option<usize> {
    let len = chars.len();
    let first = *chars.get(i)?;

    let follows_ident = i > 0 && (chars[i - 1].is_ascii_alphanumeric() || chars[i - 1] == '_');
    // Position where the `q` must sit: after an optional leading n/N.
    let q_at = if first.eq_ignore_ascii_case(&'n') && i + 1 < len {
        i + 1
    } else {
        i
    };
    let is_q_quote = !follows_ident
        && chars[q_at].eq_ignore_ascii_case(&'q')
        && q_at + 2 < len
        && chars[q_at + 1] == '\'';

    if is_q_quote {
        let close = match chars[q_at + 2] {
            '[' => ']',
            '(' => ')',
            '{' => '}',
            '<' => '>',
            same => same,
        };
        let body_start = q_at + 3;
        // The literal ends at the first `<close>'` pair; none → runs to EOF.
        let end = chars[body_start..]
            .windows(2)
            .position(|pair| pair[0] == close && pair[1] == '\'')
            .map_or(len, |rel| body_start + rel + 2);
        return Some(end);
    }

    if first != '\'' {
        return None;
    }
    let mut j = i + 1;
    while j < len {
        if chars[j] != '\'' {
            j += 1;
        } else if chars.get(j + 1) == Some(&'\'') {
            j += 2; // doubled quote: escaped, still inside the literal
        } else {
            return Some(j + 1);
        }
    }
    Some(len) // unterminated → consume to EOF
}
318
319/// If an opaque span — a string literal (see [`string_literal_end`]) or a
320/// comment (`-- …` line / `/* … */` block) — begins at `chars[i]`, return the
321/// index one past its end. Used by [`split_statements`] to copy such spans
322/// through verbatim so their contents never affect `;`-splitting or block depth.
323fn opaque_span_end(chars: &[char], i: usize) -> Option<usize> {
324    if let Some(end) = string_literal_end(chars, i) {
325        return Some(end);
326    }
327    let len = chars.len();
328    // Line comment `-- …` (up to and including the newline).
329    if chars[i] == '-' && chars.get(i + 1) == Some(&'-') {
330        let mut j = i + 2;
331        while j < len && chars[j] != '\n' {
332            j += 1;
333        }
334        if j < len {
335            j += 1; // include the terminating newline
336        }
337        return Some(j);
338    }
339    // Block comment `/* … */`.
340    if chars[i] == '/' && chars.get(i + 1) == Some(&'*') {
341        let mut j = i + 2;
342        while j < len {
343            if chars[j] == '*' && chars.get(j + 1) == Some(&'/') {
344                return Some(j + 2);
345            }
346            j += 1;
347        }
348        return Some(len); // unterminated
349    }
350    None
351}
352
353/// Match a block terminator at `pos`: `END IF`, `END LOOP`,
354/// `END CASE`, or a bare `END`. Returns the number of chars to
355/// consume (covering the optional whitespace + sub-keyword) so the
356/// caller can copy the whole terminator into the current chunk.
357fn consume_end_keyword(chars: &[char], pos: usize) -> Option<usize> {
358    let end = consume_keyword(chars, pos, "END")?;
359    // Look past `END` and any run of whitespace for a sub-keyword.
360    let mut j = pos + end;
361    while j < chars.len() && chars[j].is_whitespace() {
362        j += 1;
363    }
364    for sub in ["IF", "LOOP", "CASE"] {
365        if let Some(sub_len) = consume_keyword(chars, j, sub) {
366            return Some(j + sub_len - pos);
367        }
368    }
369    // Bare `END` (terminates BEGIN…END).
370    Some(end)
371}
372
373/// Match the first whole keyword from `keywords` at `pos`.
374fn consume_any_keyword(chars: &[char], pos: usize, keywords: &[&str]) -> Option<usize> {
375    keywords
376        .iter()
377        .find_map(|kw| consume_keyword(chars, pos, kw))
378}
379
/// Whole-word match of `keyword` at `chars[pos..]`.
///
/// The comparison is exact (case-sensitive), so callers pass an
/// upper-cased `chars` together with an upper-case `keyword`. The match
/// only counts when the keyword is not glued to an identifier character
/// (`[A-Za-z0-9_$#]`) on either side; start and end of input count as
/// boundaries. Returns the keyword length in chars on success.
fn consume_keyword(chars: &[char], pos: usize, keyword: &str) -> Option<usize> {
    let is_ident = |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '$' | '#');

    let kw_len = keyword.chars().count();
    let end = pos + kw_len;
    // `get` yields `None` when the keyword would run past the input.
    let candidate = chars.get(pos..end)?;
    if !candidate.iter().copied().eq(keyword.chars()) {
        return None;
    }

    let glued_before = pos > 0 && is_ident(chars[pos - 1]);
    let glued_after = matches!(chars.get(end), Some(&c) if is_ident(c));
    if glued_before || glued_after {
        None
    } else {
        Some(kw_len)
    }
}
407
408fn strip_comments(s: &str) -> String {
409    let chars: Vec<char> = s.chars().collect();
410    let mut out = String::with_capacity(s.len());
411    let mut i = 0;
412    while i < chars.len() {
413        // A string literal is opaque: copy it through verbatim so a `--` or
414        // `/*` *inside* a quoted (or q-quoted) literal is never mistaken for a
415        // comment and stripped — which would corrupt the statement's SQL text.
416        if let Some(end) = string_literal_end(&chars, i) {
417            for &ch in &chars[i..end] {
418                out.push(ch);
419            }
420            i = end;
421            continue;
422        }
423        let c = chars[i];
424        // Line comment `-- …`: drop it but keep the newline (line structure).
425        if c == '-' && chars.get(i + 1) == Some(&'-') {
426            i += 2;
427            while i < chars.len() && chars[i] != '\n' {
428                i += 1;
429            }
430            if i < chars.len() {
431                out.push('\n');
432                i += 1;
433            }
434            continue;
435        }
436        // Block comment `/* … */`: replace it with a single space. A comment is
437        // a token SEPARATOR, so dropping it outright fused the tokens on either
438        // side — `EXECUTE/**/IMMEDIATE` collapsed to `EXECUTEIMMEDIATE` and
439        // evaded the keyword classifier (oracle-qo1v.3). Substituting a space
440        // keeps the two tokens distinct (and is harmless mid-token: `a/**/b`
441        // becomes `a b`, which never re-fuses).
442        if c == '/' && chars.get(i + 1) == Some(&'*') {
443            i += 2;
444            while i < chars.len() {
445                if chars[i] == '*' && chars.get(i + 1) == Some(&'/') {
446                    i += 2;
447                    break;
448                }
449                i += 1;
450            }
451            out.push(' ');
452            continue;
453        }
454        out.push(c);
455        i += 1;
456    }
457    out
458}
459
/// Whole-word prefix test: does `trimmed` begin with `keyword`, with a word
/// boundary right after it?
///
/// A boundary is end-of-string or any char outside `[A-Za-z0-9_$#]`. The
/// comparison is exact, so callers upper-case `trimmed` first.
///
/// A plain `starts_with` is not enough here: a local whose name merely
/// begins with a verb (`null_count`, `return_val`, `update_stats`, …) would
/// be claimed by that verb's classifier arm — `null_count := 5;` would turn
/// into `Statement::Null`, `update_stats(p_id);` into a phantom
/// `Statement::Sql`. Requiring the boundary makes only the real keyword
/// match. (oracle-rwjl.3)
fn starts_with_keyword(trimmed: &str, keyword: &str) -> bool {
    match trimmed
        .strip_prefix(keyword)
        .map(|rest| rest.chars().next())
    {
        // Not a prefix at all.
        None => false,
        // The keyword is the whole string.
        Some(None) => true,
        // Prefix matched: it only counts if the next char ends the word.
        Some(Some(next)) => !(next.is_ascii_alphanumeric() || matches!(next, '_' | '$' | '#')),
    }
}
481
/// Locate the end of a leading `EXECUTE IMMEDIATE` keyword pair.
///
/// After optional leading ASCII whitespace, `text` must start with
/// `EXECUTE`, then at least one ASCII whitespace byte, then `IMMEDIATE`.
/// Both keywords are matched case-insensitively and must be followed by a
/// word boundary (end of input or a byte outside `[A-Za-z0-9_$#]`).
/// Returns the byte offset in `text` just past `IMMEDIATE`, or `None`.
///
/// Oracle does not care how much whitespace separates the two keywords, so
/// `EXECUTE  IMMEDIATE`, `EXECUTE\tIMMEDIATE`, `EXECUTE\nIMMEDIATE` and —
/// once comment stripping has turned the comment into a space —
/// `EXECUTE/**/IMMEDIATE` all have to classify as dynamic SQL
/// (oracle-qo1v.3). Returning the computed offset, instead of assuming a
/// fixed 17 bytes, keeps the caller's body slice correct for every spacing.
fn execute_immediate_body_offset(text: &str) -> Option<usize> {
    // Index of the first non-whitespace byte at or after `from`.
    fn skip_ascii_ws(bytes: &[u8], from: usize) -> usize {
        let run = bytes[from..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
        from + run
    }

    // End offset of `kw` when it appears as a whole word at `from`.
    // All comparisons are byte-level, so no codepoint is ever split.
    fn keyword_end(bytes: &[u8], from: usize, kw: &[u8]) -> Option<usize> {
        let end = from + kw.len();
        let candidate = bytes.get(from..end)?;
        if !candidate.eq_ignore_ascii_case(kw) {
            return None;
        }
        match bytes.get(end) {
            Some(&b) if b.is_ascii_alphanumeric() || matches!(b, b'_' | b'$' | b'#') => None,
            _ => Some(end),
        }
    }

    let bytes = text.as_bytes();
    let exec_start = skip_ascii_ws(bytes, 0);
    let exec_end = keyword_end(bytes, exec_start, b"EXECUTE")?;
    let imm_start = skip_ascii_ws(bytes, exec_end);
    if imm_start == exec_end {
        // The two keywords must be separated by at least one whitespace.
        return None;
    }
    keyword_end(bytes, imm_start, b"IMMEDIATE")
}
522
523/// Byte offset of the first top-level `:=` in `text` — one that is NOT inside
524/// a string literal (single-quoted or q-quoted). A top-level `:=` makes the
525/// statement unambiguously an assignment, regardless of what verb its LHS
526/// local happens to start with, so `classify` checks this BEFORE the keyword
527/// classifiers. (oracle-rwjl.3)
528fn top_level_assign_pos(text: &str) -> Option<usize> {
529    let chars: Vec<char> = text.chars().collect();
530    // Map char index -> byte offset so the returned position indexes `text`.
531    let mut byte_off = 0usize;
532    let mut i = 0usize;
533    while i < chars.len() {
534        if let Some(end) = string_literal_end(&chars, i) {
535            // Skip the opaque literal span verbatim.
536            for &ch in &chars[i..end] {
537                byte_off += ch.len_utf8();
538            }
539            i = end;
540            continue;
541        }
542        if chars[i] == ':' && chars.get(i + 1) == Some(&'=') {
543            return Some(byte_off);
544        }
545        byte_off += chars[i].len_utf8();
546        i += 1;
547    }
548    None
549}
550
551fn classify(text: &str) -> Statement {
552    let upper = text.to_ascii_uppercase();
553    let trimmed = upper.trim();
554    // BODY-INTRODUCING constructs are classified FIRST: an IF / LOOP / FOR /
555    // WHILE / BEGIN / DECLARE statement legitimately contains a `:=` inside its
556    // BODY (`IF c THEN v := p; END IF;`), so the top-level `:=` test below must
557    // not run before them or it would swallow the whole construct. With the
558    // word-boundary `starts_with_keyword`, `if_count := 1` / `for_idx := 0`
559    // etc. do NOT match these — they fall through to the assignment test.
560    // (oracle-rwjl.3)
561    if let Some(body_off) = execute_immediate_body_offset(text) {
562        let after = &text[body_off..];
563        let sql_literal = extract_quoted(after).unwrap_or_default();
564        let has_bind_variables = after.to_ascii_uppercase().contains("USING ");
565        return Statement::ExecuteImmediate {
566            sql_literal,
567            has_bind_variables,
568        };
569    }
570    if starts_with_keyword(trimmed, "IF") {
571        return classify_if(text);
572    }
573    if starts_with_keyword(trimmed, "LOOP")
574        || starts_with_keyword(trimmed, "FOR")
575        || starts_with_keyword(trimmed, "WHILE")
576    {
577        return classify_loop(text);
578    }
579    if starts_with_keyword(trimmed, "BEGIN") || starts_with_keyword(trimmed, "DECLARE") {
580        return Statement::NestedBlock {
581            body_text: text.to_string(),
582        };
583    }
584    // A top-level `:=` (outside any string literal) makes the statement
585    // unambiguously an assignment — emit it BEFORE the non-body verb
586    // classifiers so a verb-prefixed LHS local (`return_val := …`,
587    // `commit_count := …`) is never mis-swallowed. Slice the ORIGINAL `text`
588    // so case is preserved. (oracle-rwjl.3)
589    if let Some(pos) = top_level_assign_pos(text) {
590        let lhs = &text[..pos];
591        let rhs = &text[pos + 2..];
592        return Statement::Assignment {
593            target: lhs.trim().to_string(),
594            rhs_text: rhs.trim().trim_end_matches(';').trim().to_string(),
595        };
596    }
597    if starts_with_keyword(trimmed, "NULL") {
598        return Statement::Null;
599    }
600    if starts_with_keyword(trimmed, "COMMIT")
601        || starts_with_keyword(trimmed, "ROLLBACK")
602        || starts_with_keyword(trimmed, "SAVEPOINT")
603    {
604        let verb = trimmed.split_whitespace().next().unwrap_or("").to_string();
605        return Statement::TransactionControl { verb };
606    }
607    if starts_with_keyword(trimmed, "RAISE") {
608        let rest = text[5..].trim().trim_end_matches(';').trim();
609        let exception = if rest.is_empty() {
610            None
611        } else {
612            Some(rest.to_string())
613        };
614        return Statement::Raise { exception };
615    }
616    if starts_with_keyword(trimmed, "RETURN") {
617        let rest = text[6..].trim().trim_end_matches(';').trim();
618        let value_text = if rest.is_empty() {
619            None
620        } else {
621            Some(rest.to_string())
622        };
623        return Statement::Return { value_text };
624    }
625    if starts_with_keyword(trimmed, "EXIT") {
626        let rest = text[4..].trim().trim_end_matches(';').trim();
627        let when_text = rest
628            .strip_prefix("WHEN")
629            .or_else(|| rest.strip_prefix("when"))
630            .map(|s| s.trim().to_string());
631        return Statement::Exit { when_text };
632    }
633    for verb in ["SELECT", "INSERT", "UPDATE", "DELETE", "MERGE"] {
634        if starts_with_keyword(trimmed, verb) {
635            let kind = match verb {
636                "SELECT" => SqlVerb::Select,
637                "INSERT" => SqlVerb::Insert,
638                "UPDATE" => SqlVerb::Update,
639                "DELETE" => SqlVerb::Delete,
640                "MERGE" => SqlVerb::Merge,
641                _ => unreachable!(),
642            };
643            return Statement::Sql {
644                verb: kind,
645                raw_text: text.to_string(),
646            };
647        }
648    }
649    Statement::Unrecognized {
650        raw_text: text.to_string(),
651        unknown_reason: UnknownStatementReason::UnrecognizedKeyword,
652    }
653}
654
655fn classify_if(text: &str) -> Statement {
656    // Very small parser: split arms by `ELSIF` / `ELSE`, ending at
657    // `END IF`. The result is structural — `body_text` retains the
658    // raw inter-arm slice so a recursive `lower_statement_body`
659    // can re-enter it later.
660    let upper = text.to_ascii_uppercase();
661    let end_pos = upper.rfind("END IF").unwrap_or(upper.len());
662    let body = &text[..end_pos];
663    // Skip the leading "IF" keyword. `classify` only routes here via
664    // `starts_with_keyword(trimmed, "IF")`, which guarantees an ASCII "IF"
665    // prefix, so byte 2 is always a valid char boundary; the old hardcoded
666    // `&body[3..]` assumed a trailing ASCII space and panicked when the
667    // boundary char was multibyte (e.g. `IFé THEN …`). Slice char-safely and
668    // trim the leading separator. (oracle-clgt.1)
669    let after_if = body.get(2..).unwrap_or("").trim_start();
670    let mut arms: Vec<IfArm> = Vec::new();
671    let mut else_body_text: Option<String> = None;
672    // `cond_start` points just past the keyword that introduces the
673    // current arm's condition: `IF` for the first arm, `ELSIF` for
674    // every subsequent one. Each loop iteration handles exactly ONE
675    // arm — capture its condition, slice its body up to the next
676    // ELSIF/ELSE, push a single IfArm — then advance.
677    let mut cond_start = 0usize;
678    while let Some(then_pos) = find_keyword(after_if, "THEN", cond_start) {
679        let cond_text = after_if[cond_start..then_pos].trim().to_string();
680        let body_start = then_pos + 4;
681        let next_arm = find_any_keyword(after_if, &["ELSIF", "ELSE"], body_start);
682        let body_end = next_arm.map_or(after_if.len(), |(p, _)| p);
683        let body_text = after_if
684            .get(body_start..body_end)
685            .unwrap_or("")
686            .trim()
687            .to_string();
688        arms.push(IfArm {
689            cond_text,
690            body_text,
691        });
692        match next_arm {
693            // `ELSIF` — start the next arm's condition just past it.
694            Some((pos, "ELSIF")) => cond_start = pos + 5,
695            // `ELSE` — the trailing arm has no condition; its body
696            // runs to the end (`END IF` was already trimmed off).
697            Some((pos, _)) => {
698                let else_text = after_if.get(pos + 4..).unwrap_or("").trim().to_string();
699                else_body_text = Some(else_text);
700                break;
701            }
702            None => break,
703        }
704    }
705    Statement::If {
706        arms,
707        else_body_text,
708    }
709}
710
711fn classify_loop(text: &str) -> Statement {
712    let upper = text.to_ascii_uppercase();
713    if upper.starts_with("FOR ") {
714        let in_pos = find_keyword(text, "IN", 4);
715        let loop_pos = find_keyword(text, "LOOP", in_pos.unwrap_or(0));
716        let end_loop = upper.rfind("END LOOP").unwrap_or(text.len());
717        if let (Some(in_p), Some(loop_p)) = (in_pos, loop_pos) {
718            // Char-safe slicing: a multibyte char abutting `FOR`/`IN` (e.g.
719            // `FORé …`) must not panic on a mid-codepoint byte index.
720            // (oracle-clgt.5)
721            let iterator = text.get(4..in_p).unwrap_or("").trim().to_string();
722            let range_text = text.get(in_p + 2..loop_p).unwrap_or("").trim().to_string();
723            let body = text
724                .get(loop_p + 4..end_loop)
725                .unwrap_or("")
726                .trim()
727                .to_string();
728            return Statement::ForLoop {
729                iterator,
730                range_text,
731                body_text: body,
732            };
733        }
734    }
735    if upper.starts_with("WHILE ") {
736        let loop_pos = find_keyword(text, "LOOP", 6);
737        let end_loop = upper.rfind("END LOOP").unwrap_or(text.len());
738        if let Some(loop_p) = loop_pos {
739            // Char-safe slice past the `WHILE` keyword. (oracle-clgt.5)
740            let cond_text = text.get(6..loop_p).unwrap_or("").trim().to_string();
741            let body = text
742                .get(loop_p + 4..end_loop)
743                .unwrap_or("")
744                .trim()
745                .to_string();
746            return Statement::WhileLoop {
747                cond_text,
748                body_text: body,
749            };
750        }
751    }
752    let upper = text.to_ascii_uppercase();
753    let body = if let Some(end_pos) = upper.rfind("END LOOP") {
754        // Bare-loop fallthrough: `FORé LOOP … END LOOP;` reaches here when the
755        // FOR arm's IN/LOOP keywords are absent. Slice char-safely so a
756        // multibyte char at byte 4 cannot trigger a mid-codepoint panic.
757        // (oracle-clgt.5)
758        text.get(4..end_pos).unwrap_or("").trim().to_string()
759    } else {
760        text.trim_start_matches("LOOP")
761            .trim_start_matches("loop")
762            .trim()
763            .to_string()
764    };
765    Statement::BareLoop { body_text: body }
766}
767
/// Returns the body of the first single-quoted literal found in `text`.
///
/// * `None` when `text` contains no `'` at all.
/// * A doubled `''` inside the literal is the Oracle escape for one literal
///   quote: it is un-doubled into a single `'` and does not end the literal
///   (e.g. `EXECUTE IMMEDIATE 'SELECT ''x''…'`). (oracle-ajm2.20)
/// * An unterminated literal yields everything collected up to the end of
///   the input rather than `None`.
fn extract_quoted(text: &str) -> Option<String> {
    // Nothing before the opening quote matters. `'` is a single ASCII byte,
    // so `open + 1` is always a valid char boundary.
    let open = text.find('\'')?;
    let mut rest = text[open + 1..].chars().peekable();
    let mut literal = String::new();
    loop {
        match rest.next() {
            // `''` => one escaped quote; swallow the second `'` and keep going.
            Some('\'') if rest.peek() == Some(&'\'') => {
                rest.next();
                literal.push('\'');
            }
            // A lone `'` closes the literal; running out of input ends it too.
            Some('\'') | None => return Some(literal),
            Some(ch) => literal.push(ch),
        }
    }
}
795
796fn find_keyword(text: &str, keyword: &str, start: usize) -> Option<usize> {
797    let upper = text.to_ascii_uppercase();
798    let kw_upper = keyword.to_ascii_uppercase();
799    // Clamp to a char boundary so the slice `upper[search_from..]` never panics.
800    let mut search_from = upper
801        .char_indices()
802        .map(|(i, _)| i)
803        .find(|&i| i >= start)
804        .unwrap_or(upper.len());
805    while search_from <= upper.len() {
806        let Some(rel) = upper[search_from..].find(&kw_upper) else {
807            break;
808        };
809        let abs = search_from + rel;
810        if is_word_boundary(&upper, abs, abs + kw_upper.len()) {
811            return Some(abs);
812        }
813        // Advance by the full char at `abs` so `search_from` always lands
814        // on a char boundary. Advancing by 1 byte would panic on the next
815        // slice if `abs` is inside a multi-byte UTF-8 code-point.
816        search_from = abs + upper[abs..].chars().next().map_or(1, char::len_utf8);
817    }
818    None
819}
820
821fn find_any_keyword(text: &str, keywords: &[&str], start: usize) -> Option<(usize, &'static str)> {
822    static ELSIF: &str = "ELSIF";
823    static ELSE: &str = "ELSE";
824    let upper = text.to_ascii_uppercase();
825    let mut best: Option<(usize, &'static str)> = None;
826    for kw in keywords {
827        let kw_upper = kw.to_ascii_uppercase();
828        // Clamp to a char boundary so the slice `upper[search_from..]` never panics.
829        let mut search_from = upper
830            .char_indices()
831            .map(|(i, _)| i)
832            .find(|&i| i >= start)
833            .unwrap_or(upper.len());
834        while search_from <= upper.len() {
835            let Some(rel) = upper[search_from..].find(&kw_upper) else {
836                break;
837            };
838            let abs = search_from + rel;
839            if is_word_boundary(&upper, abs, abs + kw_upper.len()) {
840                let tag: &'static str = match kw_upper.as_str() {
841                    "ELSIF" => ELSIF,
842                    "ELSE" => ELSE,
843                    _ => continue,
844                };
845                if best.is_none_or(|(b, _)| abs < b) {
846                    best = Some((abs, tag));
847                }
848                break;
849            }
850            // Advance by the full char at `abs` so `search_from` always lands
851            // on a char boundary. Advancing by 1 byte would panic on the next
852            // slice if `abs` is inside a multi-byte UTF-8 code-point.
853            search_from = abs + upper[abs..].chars().next().map_or(1, char::len_utf8);
854        }
855    }
856    best
857}
858
/// Reports whether the byte span `start..end` of `text` stands alone as a
/// word, i.e. is not glued to an identifier character on either side.
///
/// Identifier characters are ASCII letters, ASCII digits, `_`, `$` and `#`.
/// The start and the end of `text` count as boundaries, and so does any
/// non-ASCII byte, because the check is byte-wise.
fn is_word_boundary(text: &str, start: usize, end: usize) -> bool {
    /// True for bytes that may continue an identifier.
    fn is_ident_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || matches!(b, b'_' | b'$' | b'#')
    }

    let bytes = text.as_bytes();
    // Left edge: only inspected when there is a byte before `start`.
    if start != 0 && is_ident_byte(bytes[start - 1]) {
        return false;
    }
    // Right edge: running off the end of the text is itself a boundary.
    match bytes.get(end) {
        Some(&next) => !is_ident_byte(next),
        None => true,
    }
}
871
#[cfg(test)]
mod tests {
    use super::*;

    /// `NULL;` lowers to exactly one `Statement::Null`.
    #[test]
    fn null_statement_classified() {
        let stmts = lower_statement_body("NULL;");
        assert_eq!(stmts.len(), 1);
        assert_eq!(stmts[0], Statement::Null);
    }

    /// A plain assignment exposes its LHS name and raw RHS text.
    #[test]
    fn assignment_captures_target_and_rhs() {
        let stmts = lower_statement_body("v_x := 42;");
        let Statement::Assignment { target, rhs_text } = &stmts[0] else {
            panic!("expected Assignment, got {other:?}", other = stmts[0]);
        };
        assert_eq!(target, "v_x");
        assert_eq!(rhs_text, "42");
    }

    /// oracle-rwjl.3: a local whose name merely STARTS with a verb keyword
    /// (`return_val`, `null_count`, `update_x`, …) must lower to
    /// `Statement::Assignment` and must not be swallowed by the keyword
    /// classifier. Bare `starts_with("RETURN")`-style checks used to
    /// misclassify these, dropping the assignment from flow_intra::walk (a
    /// taint fail-open) and minting phantom DML edges.
    #[test]
    fn verb_prefixed_assignment_is_an_assignment_not_a_keyword() {
        // (source, expected target, expected rhs)
        const CASES: [(&str, &str, &str); 12] = [
            ("return_val := p_user;", "return_val", "p_user"),
            ("null_count := 5;", "null_count", "5"),
            ("update_x := p_user;", "update_x", "p_user"),
            ("delete_flag := 1;", "delete_flag", "1"),
            ("commit_count := 1;", "commit_count", "1"),
            ("exit_code := 0;", "exit_code", "0"),
            ("raise_amount := 100;", "raise_amount", "100"),
            ("select_idx := 7;", "select_idx", "7"),
            ("merge_key := p_user;", "merge_key", "p_user"),
            ("insert_seq := 3;", "insert_seq", "3"),
            ("savepoint_id := 2;", "savepoint_id", "2"),
            ("rollback_count := 9;", "rollback_count", "9"),
        ];
        for (input, want_target, want_rhs) in CASES {
            let stmts = lower_statement_body(input);
            let Statement::Assignment { target, rhs_text } = &stmts[0] else {
                panic!(
                    "expected Assignment for {input:?}, got {other:?}",
                    other = stmts[0]
                );
            };
            assert_eq!(target, want_target, "target for {input:?}");
            assert_eq!(rhs_text, want_rhs, "rhs for {input:?}");
        }
    }

    /// oracle-rwjl.3: a genuine keyword statement (verb at a true word
    /// boundary) must keep its keyword classification.
    #[test]
    fn real_keyword_statements_still_classify() {
        let null_stmts = lower_statement_body("NULL;");
        assert_eq!(null_stmts[0], Statement::Null);

        let return_stmts = lower_statement_body("RETURN 1;");
        assert!(matches!(return_stmts[0], Statement::Return { .. }));

        let delete_stmts = lower_statement_body("DELETE FROM t WHERE id = 1;");
        assert!(matches!(
            delete_stmts[0],
            Statement::Sql {
                verb: SqlVerb::Delete,
                ..
            }
        ));

        let commit_stmts = lower_statement_body("COMMIT;");
        assert!(matches!(
            commit_stmts[0],
            Statement::TransactionControl { .. }
        ));
    }

    /// `RAISE <name>;` records the exception name.
    #[test]
    fn raise_with_named_exception() {
        let stmts = lower_statement_body("RAISE no_data_found;");
        let names_exception = match &stmts[0] {
            Statement::Raise { exception } => exception.as_deref() == Some("no_data_found"),
            _ => false,
        };
        assert!(names_exception);
    }

    /// A bare `RAISE;` carries no exception name.
    #[test]
    fn bare_raise_classified() {
        let stmts = lower_statement_body("RAISE;");
        assert_eq!(stmts[0], Statement::Raise { exception: None });
    }

    /// `RETURN <expr>;` records the raw value text.
    #[test]
    fn return_with_value() {
        let stmts = lower_statement_body("RETURN v_sum;");
        let returns_sum = match &stmts[0] {
            Statement::Return { value_text } => value_text.as_deref() == Some("v_sum"),
            _ => false,
        };
        assert!(returns_sum);
    }

    /// A bare `RETURN;` carries no value text.
    #[test]
    fn return_without_value() {
        let stmts = lower_statement_body("RETURN;");
        assert_eq!(stmts[0], Statement::Return { value_text: None });
    }

    /// `EXIT WHEN <cond>;` records the raw condition text.
    #[test]
    fn exit_when_cond() {
        let stmts = lower_statement_body("EXIT WHEN i > 10;");
        let has_condition = match &stmts[0] {
            Statement::Exit { when_text } => when_text.as_deref() == Some("i > 10"),
            _ => false,
        };
        assert!(has_condition);
    }

    /// EXECUTE IMMEDIATE with a `USING` clause captures the literal and
    /// reports bind variables.
    #[test]
    fn execute_immediate_with_binds_detected() {
        let stmts = lower_statement_body("EXECUTE IMMEDIATE 'UPDATE t SET a = :1' USING v_a;");
        let Statement::ExecuteImmediate {
            sql_literal,
            has_bind_variables,
        } = &stmts[0]
        else {
            panic!("expected ExecuteImmediate, got {other:?}", other = stmts[0]);
        };
        assert_eq!(sql_literal, "UPDATE t SET a = :1");
        assert!(*has_bind_variables);
    }

    /// oracle-ajm2.20: `extract_quoted` used to stop at the first lone `'`,
    /// truncating the literal at an inner doubled-`''` escape
    /// (`'SELECT ''x'' FROM dual'` -> "SELECT "). The full SQL must be
    /// captured with each escape un-doubled to a single quote.
    #[test]
    fn execute_immediate_honors_doubled_quote_escape() {
        let stmts = lower_statement_body("EXECUTE IMMEDIATE 'SELECT ''x'' FROM dual';");
        let Statement::ExecuteImmediate { sql_literal, .. } = &stmts[0] else {
            panic!("expected ExecuteImmediate, got {other:?}", other = stmts[0]);
        };
        assert_eq!(sql_literal, "SELECT 'x' FROM dual");
    }

    /// EXECUTE IMMEDIATE without a `USING` clause reports no bind variables.
    #[test]
    fn execute_immediate_without_binds() {
        let r = lower_statement_body("EXECUTE IMMEDIATE 'ALTER SESSION SET …';");
        let Statement::ExecuteImmediate {
            has_bind_variables, ..
        } = &r[0]
        else {
            panic!("{r:?}");
        };
        assert!(!has_bind_variables);
    }

    /// oracle-qo1v.3: Oracle is whitespace-insensitive between keywords, so
    /// every inter-keyword spacing must classify as dynamic SQL just like the
    /// canonical single space, instead of falling through to Unrecognized and
    /// never becoming a dynamic-SQL sink. The block-comment form relies on
    /// strip_comments substituting a space for `/* */`.
    #[test]
    fn execute_immediate_recognised_with_non_canonical_whitespace() {
        let sources = [
            "EXECUTE  IMMEDIATE 'DROP TABLE t';",   // two spaces
            "EXECUTE\tIMMEDIATE 'DROP TABLE t';",   // tab
            "EXECUTE\nIMMEDIATE 'DROP TABLE t';",   // newline
            "EXECUTE/**/IMMEDIATE 'DROP TABLE t';", // block comment between
        ];
        for src in sources {
            let r = lower_statement_body(src);
            let is_drop_sink = match r.first() {
                Some(Statement::ExecuteImmediate { sql_literal, .. }) => {
                    sql_literal == "DROP TABLE t"
                }
                _ => false,
            };
            assert!(
                is_drop_sink,
                "non-canonical EXECUTE IMMEDIATE must classify as dynamic SQL: {src:?} -> {r:?}"
            );
        }

        // Control: a local whose name merely starts with EXECUTE is not a sink.
        let control = lower_statement_body("executable_flag := 1;");
        assert!(
            !matches!(control.first(), Some(Statement::ExecuteImmediate { .. })),
            "an identifier starting with EXECUTE must not match"
        );
    }

    /// Each DML/query verb lowers to `Statement::Sql`.
    #[test]
    fn sql_verbs_classified() {
        let cases = [
            ("SELECT", "SELECT * INTO v_row FROM t;"),
            ("INSERT", "INSERT INTO t VALUES (1);"),
            ("UPDATE", "UPDATE t SET x = 1;"),
            ("DELETE", "DELETE FROM t WHERE id = 1;"),
            (
                "MERGE",
                "MERGE INTO t USING s ON (t.id = s.id) WHEN MATCHED THEN UPDATE SET x = s.x;",
            ),
        ];
        for (verb, src) in cases {
            let r = lower_statement_body(src);
            let is_sql = matches!(&r[0], Statement::Sql { .. });
            assert!(is_sql, "{verb}: {r:?}");
        }
    }

    /// COMMIT / ROLLBACK / SAVEPOINT lower to `TransactionControl`.
    #[test]
    fn transaction_control_classified() {
        let sources = ["COMMIT;", "ROLLBACK;", "SAVEPOINT s1;"];
        for src in sources {
            let r = lower_statement_body(src);
            let is_txn = matches!(&r[0], Statement::TransactionControl { .. });
            assert!(is_txn, "{src}: {r:?}");
        }
    }

    /// Leading comment-only lines produce no statements of their own.
    #[test]
    fn comment_only_chunks_dropped() {
        let stmts = lower_statement_body("-- header\n-- still here\nNULL;");
        assert_eq!(stmts.len(), 1);
        assert_eq!(stmts[0], Statement::Null);
    }

    /// An unknown leading word surfaces as `Unrecognized` with a typed reason.
    #[test]
    fn unrecognised_line_surfaces_with_typed_reason() {
        let stmts = lower_statement_body("xyz_unknown_keyword;");
        let typed = matches!(
            &stmts[0],
            Statement::Unrecognized {
                unknown_reason: UnknownStatementReason::UnrecognizedKeyword,
                ..
            }
        );
        assert!(typed, "{other:?}", other = stmts[0]);
    }

    /// `BEGIN … END;` lowers to a nested block.
    #[test]
    fn nested_block_passes_through() {
        let stmts = lower_statement_body("BEGIN NULL; END;");
        let first = &stmts[0];
        assert!(matches!(first, Statement::NestedBlock { .. }));
    }

    /// Top-level `;` separates statements.
    #[test]
    fn multiple_statements_split_at_top_level_semicolons() {
        let count = lower_statement_body("v_x := 1; v_y := 2; NULL;").len();
        assert_eq!(count, 3);
    }

    /// A `;` inside a single-quoted literal must not split the statement.
    #[test]
    fn semicolon_inside_string_literal_is_not_a_boundary() {
        let count = lower_statement_body("v_msg := 'a; b; c'; NULL;").len();
        assert_eq!(count, 2, "the assignment (with its literal) + NULL");
    }

    /// 'BEGIN'/'END' inside a literal are data, not block keywords: the
    /// top-level `;` after the assignment must still split into two.
    #[test]
    fn block_keywords_inside_string_literal_do_not_move_depth() {
        let count = lower_statement_body("v_msg := 'BEGIN x END;'; v_y := 2;").len();
        assert_eq!(count, 2);
    }

    /// q'{ … END; … }' is one literal: neither a split nor a depth move.
    #[test]
    fn q_quote_with_embedded_end_and_semicolon_is_opaque() {
        let count = lower_statement_body("v_sql := q'{SELECT 1; END;}'; NULL;").len();
        assert_eq!(count, 2);
    }

    /// Splitting runs on the raw body before strip_comments, so it must skip
    /// comments itself: `;`s inside the trailing comment do not split.
    #[test]
    fn semicolon_inside_line_comment_is_not_a_boundary() {
        let count = lower_statement_body("v_x := 1; -- trailing; comment; here\nNULL;").len();
        assert_eq!(count, 2);
    }

    /// `;`s inside a block comment do not split either.
    #[test]
    fn semicolon_inside_block_comment_is_not_a_boundary() {
        let count = lower_statement_body("v_x := 1 /* a; b; c */ + 2; NULL;").len();
        assert_eq!(count, 2);
    }

    /// strip_comments must NOT treat `--` / `/*` inside a literal as a
    /// comment; stripping the literal's body would corrupt the SQL text.
    #[test]
    fn comment_markers_inside_string_literal_are_preserved() {
        let stmts = lower_statement_body("v_msg := 'keep -- this and /* this */ too';");
        assert_eq!(stmts.len(), 1);
        let dbg = format!("{:?}", stmts[0]);
        let keeps_line_marker = dbg.contains("keep -- this");
        let keeps_block_marker = dbg.contains("/* this */");
        assert!(
            keeps_line_marker && keeps_block_marker,
            "comment-like content inside the literal must survive strip_comments: {dbg}"
        );
    }

    /// A numeric FOR loop exposes its iterator name and range text.
    #[test]
    fn for_loop_captures_iterator_and_range() {
        let stmts = lower_statement_body("FOR i IN 1..10 LOOP NULL; END LOOP;");
        let Statement::ForLoop {
            iterator,
            range_text,
            ..
        } = &stmts[0]
        else {
            panic!("{other:?}", other = stmts[0]);
        };
        assert_eq!(iterator, "i");
        assert_eq!(range_text, "1..10");
    }

    /// oracle-hbhm: split_statements must depth-track IF…END IF so an inner
    /// `;` does not tear a multi-statement IF body into separate top-level
    /// statements. Before the fix this produced 3 statements (the leaked
    /// UPDATE + bogus `END IF;`) instead of one If.
    #[test]
    fn multi_statement_if_body_is_one_statement() {
        let src = "IF p_flag = 1 THEN \
                   INSERT INTO audit_log VALUES (1); \
                   UPDATE accounts SET bal = 0; \
                   END IF;";
        let stmts = lower_statement_body(src);
        assert_eq!(
            stmts.len(),
            1,
            "IF body must not be torn apart: {r:?}",
            r = stmts
        );
        let Statement::If { arms, .. } = &stmts[0] else {
            panic!("expected If, got {other:?}", other = stmts[0]);
        };
        assert_eq!(arms.len(), 1);
        // Both inner DML statements stay inside the arm body.
        let body = arms[0].body_text.to_ascii_uppercase();
        assert!(body.contains("INSERT"));
        assert!(body.contains("UPDATE"));
    }

    /// oracle-hbhm: split_statements must depth-track LOOP…END LOOP so an
    /// inner `;` does not tear a multi-statement loop body apart.
    #[test]
    fn multi_statement_loop_body_is_one_statement() {
        let src = "FOR r IN 1..10 LOOP \
                   INSERT INTO dst VALUES (r); \
                   DELETE FROM stale WHERE id = r; \
                   END LOOP;";
        let stmts = lower_statement_body(src);
        assert_eq!(
            stmts.len(),
            1,
            "LOOP body must not be torn apart: {r:?}",
            r = stmts
        );
        let Statement::ForLoop { body_text, .. } = &stmts[0] else {
            panic!("expected ForLoop, got {other:?}", other = stmts[0]);
        };
        let body = body_text.to_ascii_uppercase();
        assert!(body.contains("INSERT"));
        assert!(body.contains("DELETE"));
    }

    /// oracle-hbhm: a bare LOOP…END LOOP with internal `;` must also survive
    /// splitting.
    #[test]
    fn multi_statement_bare_loop_body_is_one_statement() {
        let stmts = lower_statement_body("LOOP v_x := 1; v_y := 2; EXIT WHEN v_x > 5; END LOOP;");
        assert_eq!(
            stmts.len(),
            1,
            "bare LOOP body must not be torn apart: {r:?}",
            r = stmts
        );
        let first = &stmts[0];
        assert!(matches!(first, Statement::BareLoop { .. }));
    }

    /// oracle-hbhm: nested IF inside a LOOP — both openers must be
    /// depth-tracked together.
    #[test]
    fn nested_if_inside_loop_stays_one_statement() {
        let src = "FOR i IN 1..3 LOOP \
                   IF i > 1 THEN do_a(i); ELSE do_b(i); END IF; \
                   log_iter(i); \
                   END LOOP;";
        let stmts = lower_statement_body(src);
        assert_eq!(
            stmts.len(),
            1,
            "nested IF/LOOP must not be torn apart: {r:?}",
            r = stmts
        );
        let first = &stmts[0];
        assert!(matches!(first, Statement::ForLoop { .. }));
    }

    /// oracle-hbhm: an unterminated IF (no `END IF`) must degrade with a
    /// typed diagnostic, never silently (R13).
    #[test]
    fn unterminated_if_block_degrades_with_typed_reason() {
        let stmts = lower_statement_body("IF a THEN foo(); bar();");
        assert_eq!(
            stmts.len(),
            1,
            "unterminated IF stays one chunk: {r:?}",
            r = stmts
        );
        let typed = matches!(
            &stmts[0],
            Statement::Unrecognized {
                unknown_reason: UnknownStatementReason::UnterminatedBlock,
                ..
            }
        );
        assert!(
            typed,
            "expected Unrecognized/UnterminatedBlock, got {other:?}",
            other = stmts[0]
        );
    }

    /// oracle-ina8: classify_if must emit exactly one arm per ELSIF, never
    /// phantom duplicate arms re-using the first condition.
    #[test]
    fn multi_elsif_if_has_no_phantom_arms() {
        let stmts = lower_statement_body(
            "IF a THEN NULL ELSIF b THEN NULL ELSIF c THEN NULL ELSE NULL END IF",
        );
        assert_eq!(stmts.len(), 1);
        let Statement::If {
            arms,
            else_body_text,
        } = &stmts[0]
        else {
            panic!("expected If, got {other:?}", other = stmts[0]);
        };
        let conds: Vec<&str> = arms.iter().map(|arm| arm.cond_text.as_str()).collect();
        assert_eq!(
            conds,
            vec!["a", "b", "c"],
            "expected exactly 3 arms a/b/c, got {arms:?}"
        );
        assert_eq!(else_body_text.as_deref(), Some("NULL"));
    }

    /// oracle-ina8: a multi-ELSIF IF whose arms carry real bodies must keep
    /// each body attached to the correct condition.
    #[test]
    fn multi_elsif_if_keeps_bodies_with_conditions() {
        let stmts = lower_statement_body(
            "IF a THEN s1; ELSIF b THEN s2; ELSIF c THEN s3; ELSE s4; END IF;",
        );
        assert_eq!(stmts.len(), 1);
        let Statement::If { arms, .. } = &stmts[0] else {
            panic!("expected If, got {other:?}", other = stmts[0]);
        };
        assert_eq!(arms.len(), 3);
        let expected = [("a", "s1;"), ("b", "s2;"), ("c", "s3;")];
        for (arm, (cond, body)) in arms.iter().zip(expected) {
            assert_eq!(arm.cond_text, cond);
            assert_eq!(arm.body_text, body);
        }
    }

    /// oracle-clgt.1: a multibyte UTF-8 char abutting the IF keyword
    /// (`IFé THEN …`) used to slice `&body[3..]` mid-codepoint and panic,
    /// aborting the whole analysis run. classify_if must strip the "IF"
    /// keyword char-safely and return without panicking.
    #[test]
    fn if_keyword_followed_by_multibyte_char_does_not_panic() {
        let stmts = lower_statement_body("IFé THEN x := 1; END IF;");
        assert_eq!(stmts.len(), 1, "expected a single classified statement");
        let is_if = matches!(&stmts[0], Statement::If { .. });
        assert!(is_if, "expected If, got {:?}", stmts[0]);
    }

    /// oracle-clgt.5: a multibyte UTF-8 char abutting FOR/WHILE/LOOP keywords
    /// (`FORé LOOP …`, `WHILEé …`) used to slice raw byte ranges
    /// mid-codepoint and panic. classify_loop must slice char-safely and
    /// return without panicking for every loop arm.
    #[test]
    fn loop_keywords_followed_by_multibyte_char_do_not_panic() {
        let inputs = [
            "FORé LOOP NULL; END LOOP;",
            "WHILEé LOOP NULL; END LOOP;",
            "FORé i IN 1..3 LOOP NULL; END LOOP;",
        ];
        for input in inputs {
            let count = lower_statement_body(input).len();
            assert_eq!(count, 1, "expected one statement for {input:?}");
        }
    }
}