flowscope_core/analyzer/helpers/
span.rs

1//! Utilities for finding identifier spans in SQL text.
2//!
3//! This module provides functions to locate identifiers in SQL source code
4//! for error reporting. Since sqlparser doesn't expose AST node locations,
5//! we use text search to find approximate positions.
6
7use super::{split_qualified_identifiers, unquote_identifier};
8use crate::types::Span;
9
10/// Finds the byte offset span of an identifier in SQL text.
11///
12/// Searches for the identifier as a whole word (not part of another identifier).
13/// Returns the first match found, or `None` if not found.
14///
15/// # Arguments
16///
17/// * `sql` - The SQL source text
18/// * `identifier` - The identifier to find (table name, column name, etc.)
19/// * `search_start` - Byte offset to start searching from (for multi-statement SQL)
20///
21/// # Example
22///
23/// ```ignore
24/// let sql = "SELECT * FROM users WHERE id = 1";
25/// let span = find_identifier_span(sql, "users", 0);
26/// assert_eq!(span, Some(Span { start: 14, end: 19 }));
27/// ```
28pub fn find_identifier_span(sql: &str, identifier: &str, search_start: usize) -> Option<Span> {
29    if identifier.is_empty() || search_start >= sql.len() {
30        return None;
31    }
32    if !sql.is_char_boundary(search_start) {
33        #[cfg(feature = "tracing")]
34        tracing::warn!(
35            search_start,
36            sql_len = sql.len(),
37            "find_identifier_span: search_start is not on a UTF-8 char boundary"
38        );
39        return None;
40    }
41
42    let search_text = &sql[search_start..];
43    find_qualified_name(search_text, identifier)
44        .map(|(start, end)| Span::new(search_start + start, search_start + end))
45}
46
47/// Finds every occurrence of an identifier in SQL text.
48///
49/// Returns all non-overlapping word-boundary matches (case-insensitive) within
50/// `[search_start, search_end)`. The `search_end` bound lets callers scope the
51/// scan to a single statement. Strings inside single-quoted or dollar-quoted
52/// SQL literals and inside block/line/hash comments are skipped so `-- users`,
53/// `# users`, or `'users'` do not produce false positives.
54///
55/// This is intentionally a textual scan rather than an AST walk: sqlparser
56/// does not preserve per-occurrence source positions, and a text scan handles
57/// the common case (every spelling of a table/CTE/view name) correctly. Alias
58/// shadowing on column references is out of scope here and is handled by
59/// populating `name_spans` only on table-like node types.
60pub fn find_all_identifier_spans(
61    sql: &str,
62    identifier: &str,
63    search_start: usize,
64    search_end: usize,
65) -> Vec<Span> {
66    let mut spans = Vec::new();
67    if identifier.is_empty() || search_start >= search_end || search_end > sql.len() {
68        return spans;
69    }
70    if !sql.is_char_boundary(search_start) || !sql.is_char_boundary(search_end) {
71        #[cfg(feature = "tracing")]
72        tracing::warn!(
73            search_start,
74            search_end,
75            sql_len = sql.len(),
76            "find_all_identifier_spans: search range is not on UTF-8 char boundaries"
77        );
78        return spans;
79    }
80
81    let scope = &sql[search_start..search_end];
82    let mut cursor = 0usize;
83    while let Some(occurrence) = find_identifier_occurrence(scope, identifier, cursor) {
84        spans.push(Span::new(
85            search_start + occurrence.full_start,
86            search_start + occurrence.full_end,
87        ));
88        cursor = occurrence.full_end;
89    }
90    spans
91}
92
93/// Finds the next relation occurrence and returns both its full span and the
94/// span of the final identifier component.
95///
96/// The full span includes any qualification and quoting as written in the SQL.
97/// The returned name span points at the inner content of the final identifier
98/// component so callers can highlight just the node name (for example, `orders`
99/// in `sales.orders`, or `my.table` in `"my.schema"."my.table"`). String
100/// literals and comments are skipped.
101pub fn find_relation_occurrence_spans(
102    sql: &str,
103    identifier: &str,
104    search_start: usize,
105) -> Option<(Span, Span)> {
106    if identifier.is_empty() || search_start >= sql.len() {
107        return None;
108    }
109    if !sql.is_char_boundary(search_start) {
110        #[cfg(feature = "tracing")]
111        tracing::warn!(
112            search_start,
113            sql_len = sql.len(),
114            "find_relation_occurrence_spans: search_start is not on a UTF-8 char boundary"
115        );
116        return None;
117    }
118
119    let search_text = &sql[search_start..];
120    let occurrence = find_identifier_occurrence(search_text, identifier, 0)?;
121    Some((
122        Span::new(
123            search_start + occurrence.full_start,
124            search_start + occurrence.full_end,
125        ),
126        Span::new(
127            search_start + occurrence.tail_start,
128            search_start + occurrence.tail_end,
129        ),
130    ))
131}
132
133/// Finds the span of a CTE body (the parenthesized subquery after `AS`) given
134/// the span of the CTE name.
135///
136/// Starting after `name_span.end`, skips whitespace/comments and an optional
137/// `AS` keyword, then locates the matching parenthesis pair and returns its
138/// span (including the parentheses themselves). Returns `None` if the body
139/// cannot be located — for example if the SQL has already been rewritten.
140pub fn find_cte_body_span(sql: &str, name_span: Span) -> Option<Span> {
141    if name_span.end > sql.len() || !sql.is_char_boundary(name_span.end) {
142        #[cfg(feature = "tracing")]
143        tracing::warn!(
144            end = name_span.end,
145            sql_len = sql.len(),
146            "find_cte_body_span: name_span.end is not on a UTF-8 char boundary"
147        );
148        return None;
149    }
150
151    let bytes = sql.as_bytes();
152
153    // Skip whitespace / comments after the CTE name.
154    let mut pos = skip_whitespace_and_comments(sql, name_span.end);
155
156    // Optional column list: `cte_name(col1, col2) AS (...)`.
157    if pos < bytes.len() && bytes[pos] == b'(' {
158        let list_end = find_matching_paren(bytes, pos)?;
159        pos = skip_whitespace_and_comments(sql, list_end + 1);
160    }
161
162    // Required `AS` keyword.
163    pos = consume_ascii_keyword(sql, pos, "AS")?;
164
165    // PostgreSQL materialization modifiers:
166    // `AS MATERIALIZED (...)` / `AS NOT MATERIALIZED (...)`.
167    if let Some(after_not) = consume_ascii_keyword(sql, pos, "NOT") {
168        if let Some(after_materialized) = consume_ascii_keyword(sql, after_not, "MATERIALIZED") {
169            pos = after_materialized;
170        }
171    } else if let Some(after_materialized) = consume_ascii_keyword(sql, pos, "MATERIALIZED") {
172        pos = after_materialized;
173    }
174
175    if pos >= bytes.len() || bytes[pos] != b'(' {
176        return None;
177    }
178
179    let body_end = find_matching_paren(bytes, pos)?;
180    Some(Span::new(pos, body_end + 1))
181}
182
183/// Given the byte offset of an opening `(`, finds the byte offset of its
184/// matching `)`. Respects string literals and comments so parentheses inside
185/// them do not affect depth. Operates on bytes so non-ASCII content in the
186/// SQL (identifiers, comments, string literals) does not cause panics on
187/// UTF-8 boundary slicing.
188fn find_matching_paren(bytes: &[u8], open: usize) -> Option<usize> {
189    if open >= bytes.len() || bytes[open] != b'(' {
190        return None;
191    }
192    let mut depth = 0i32;
193    let mut i = open;
194    while i < bytes.len() {
195        if let Some(skip_to) = skip_string_or_comment(bytes, i) {
196            debug_assert!(
197                skip_to > i,
198                "skip_string_or_comment must advance past the current index"
199            );
200            if skip_to <= i {
201                return None;
202            }
203            i = skip_to;
204            continue;
205        }
206        match bytes[i] {
207            b'(' => depth += 1,
208            b')' => {
209                depth -= 1;
210                if depth == 0 {
211                    return Some(i);
212                }
213            }
214            _ => {}
215        }
216        i += 1;
217    }
218    None
219}
220
221/// If `pos` is the start of a string literal or SQL comment, returns the byte
222/// offset immediately after it. Otherwise returns `None`.
223///
224/// Handles block comments (`/* */`), line comments (`--`, `#`), single-quoted
225/// strings, and PostgreSQL dollar-quoted strings (`$$...$$`, `$tag$...$tag$`).
226/// All delimiters are ASCII, so operating on raw bytes is safe and sidesteps
227/// any UTF-8 char-boundary concerns when the caller's cursor advances bytewise.
228fn skip_string_or_comment(bytes: &[u8], pos: usize) -> Option<usize> {
229    if pos >= bytes.len() {
230        return None;
231    }
232    // Block comment `/* ... */`.
233    if pos + 1 < bytes.len() && bytes[pos] == b'/' && bytes[pos + 1] == b'*' {
234        let mut i = pos + 2;
235        while i + 1 < bytes.len() {
236            if bytes[i] == b'*' && bytes[i + 1] == b'/' {
237                return Some(i + 2);
238            }
239            i += 1;
240        }
241        return Some(bytes.len());
242    }
243    // Line comment `-- ... \n`.
244    if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
245        let mut i = pos + 2;
246        while i < bytes.len() {
247            if bytes[i] == b'\n' {
248                return Some(i + 1);
249            }
250            i += 1;
251        }
252        return Some(bytes.len());
253    }
254    // MySQL/Hive hash comment `# ... \n`.
255    if bytes[pos] == b'#' {
256        let mut i = pos + 1;
257        while i < bytes.len() {
258            if bytes[i] == b'\n' {
259                return Some(i + 1);
260            }
261            i += 1;
262        }
263        return Some(bytes.len());
264    }
265    // Single-quoted string literal, with SQL `''` escape.
266    if bytes[pos] == b'\'' {
267        let mut i = pos + 1;
268        while i < bytes.len() {
269            if bytes[i] == b'\'' {
270                if i + 1 < bytes.len() && bytes[i + 1] == b'\'' {
271                    i += 2;
272                    continue;
273                }
274                return Some(i + 1);
275            }
276            i += 1;
277        }
278        return Some(bytes.len());
279    }
280    // PostgreSQL dollar-quoted string literal.
281    if bytes[pos] == b'$' {
282        return skip_dollar_quoted_string(bytes, pos);
283    }
284    None
285}
286
287fn skip_dollar_quoted_string(bytes: &[u8], pos: usize) -> Option<usize> {
288    if pos >= bytes.len() || bytes[pos] != b'$' {
289        return None;
290    }
291
292    let mut tag_end = pos + 1;
293    while tag_end < bytes.len() {
294        match bytes[tag_end] {
295            b'$' => {
296                let delimiter = &bytes[pos..=tag_end];
297                let search_start = tag_end + 1;
298                let mut i = search_start;
299                while i + delimiter.len() <= bytes.len() {
300                    if &bytes[i..i + delimiter.len()] == delimiter {
301                        return Some(i + delimiter.len());
302                    }
303                    i += 1;
304                }
305                return Some(bytes.len());
306            }
307            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' => tag_end += 1,
308            _ => return None,
309        }
310    }
311    None
312}
313
314/// Finds the span of a CTE definition name in SQL text.
315///
316/// Matches `WITH name`, `WITH RECURSIVE name`, or `, name` patterns and returns the span for `name`.
317/// Handles SQL comments between keywords and identifiers.
318/// Uses string operations instead of regex for performance.
319pub fn find_cte_definition_span(sql: &str, identifier: &str, search_start: usize) -> Option<Span> {
320    if identifier.is_empty() || search_start >= sql.len() {
321        return None;
322    }
323
324    let search_text = &sql[search_start..];
325
326    // Find CTE anchors: "WITH" keyword or comma separator
327    let mut pos = 0;
328    while pos < search_text.len() {
329        // Look for "WITH" keyword (case-insensitive, word boundary)
330        if let Some(with_pos) = find_keyword_case_insensitive(&search_text[pos..], "WITH") {
331            let after_with = pos + with_pos + 4;
332            // Skip whitespace and comments after WITH
333            let after_ws = skip_whitespace_and_comments(search_text, after_with);
334
335            // Check for optional RECURSIVE keyword
336            let after_recursive = if let Some(rec_pos) =
337                find_keyword_case_insensitive(&search_text[after_ws..], "RECURSIVE")
338            {
339                if rec_pos == 0 {
340                    // RECURSIVE found immediately after whitespace
341                    skip_whitespace_and_comments(search_text, after_ws + 9)
342                } else {
343                    after_ws
344                }
345            } else {
346                after_ws
347            };
348
349            // Try to match the identifier at this position
350            if let Some((start, end)) =
351                match_identifier_at(search_text, after_recursive, identifier)
352            {
353                return Some(Span::new(search_start + start, search_start + end));
354            }
355            pos = after_recursive.max(after_with);
356            continue;
357        }
358
359        // Look for comma separator
360        if let Some(comma_pos) = search_text[pos..].find(',') {
361            let after_comma = pos + comma_pos + 1;
362            // Skip whitespace and comments after comma
363            let after_ws = skip_whitespace_and_comments(search_text, after_comma);
364            if let Some((start, end)) = match_identifier_at(search_text, after_ws, identifier) {
365                return Some(Span::new(search_start + start, search_start + end));
366            }
367            pos = after_comma;
368            continue;
369        }
370
371        break;
372    }
373
374    None
375}
376
377/// Finds the span of a derived table alias in SQL text.
378///
379/// Matches `) alias` or `) AS alias` patterns and returns the span for `alias`.
380/// Handles SQL comments between the closing paren and the alias.
381/// Uses string operations instead of regex for performance.
382pub fn find_derived_table_alias_span(
383    sql: &str,
384    identifier: &str,
385    search_start: usize,
386) -> Option<Span> {
387    if identifier.is_empty() || search_start >= sql.len() {
388        return None;
389    }
390
391    let search_text = &sql[search_start..];
392
393    // Find closing paren anchors
394    let mut pos = 0;
395    while pos < search_text.len() {
396        if let Some(paren_pos) = search_text[pos..].find(')') {
397            let after_paren = pos + paren_pos + 1;
398            // Skip whitespace and comments
399            let ws_end = skip_whitespace_and_comments(search_text, after_paren);
400
401            if ws_end >= search_text.len() {
402                pos = after_paren;
403                continue;
404            }
405
406            // Check for optional "AS" keyword (must be followed by whitespace or comment, not "ASC")
407            let after_as = if search_text[ws_end..].to_ascii_uppercase().starts_with("AS") {
408                let potential_as_end = ws_end + 2;
409                let is_standalone_as = potential_as_end >= search_text.len()
410                    || search_text.as_bytes()[potential_as_end].is_ascii_whitespace()
411                    || search_text[potential_as_end..].starts_with("/*")
412                    || search_text[potential_as_end..].starts_with("--");
413                if is_standalone_as {
414                    skip_whitespace_and_comments(search_text, potential_as_end)
415                } else {
416                    ws_end
417                }
418            } else {
419                ws_end
420            };
421
422            if let Some((start, end)) = match_identifier_at(search_text, after_as, identifier) {
423                return Some(Span::new(search_start + start, search_start + end));
424            }
425            pos = after_paren;
426            continue;
427        }
428        break;
429    }
430
431    None
432}
433
434/// Finds a keyword case-insensitively with word boundary check.
435fn find_keyword_case_insensitive(text: &str, keyword: &str) -> Option<usize> {
436    let text_upper = text.to_ascii_uppercase();
437    let mut search_pos = 0;
438
439    while let Some(pos) = text_upper[search_pos..].find(keyword) {
440        let abs_pos = search_pos + pos;
441        // Check word boundary before
442        let before_ok = abs_pos == 0 || !is_identifier_char_before(text, abs_pos);
443        // Check word boundary after
444        let after_pos = abs_pos + keyword.len();
445        let after_ok = after_pos >= text.len() || !is_identifier_char_at(text, after_pos);
446
447        if before_ok && after_ok {
448            return Some(abs_pos);
449        }
450        search_pos = abs_pos + 1;
451    }
452    None
453}
454
455/// Skips whitespace and SQL comments (block `/* */` and line `-- \n`).
456/// Returns the position after all whitespace and comments.
457fn skip_whitespace_and_comments(text: &str, pos: usize) -> usize {
458    let mut current = pos;
459
460    loop {
461        if current >= text.len() {
462            return current;
463        }
464
465        let remaining = &text[current..];
466
467        // Skip whitespace first
468        let ws_chars: usize = remaining
469            .chars()
470            .take_while(|c| c.is_whitespace())
471            .map(|c| c.len_utf8())
472            .sum();
473        if ws_chars > 0 {
474            current += ws_chars;
475            continue;
476        }
477
478        // Check for block comment /* ... */
479        if let Some(after_open) = remaining.strip_prefix("/*") {
480            if let Some(end) = after_open.find("*/") {
481                current += 2 + end + 2; // Skip /* + content + */
482                continue;
483            } else {
484                // Unclosed comment - skip to end
485                return text.len();
486            }
487        }
488
489        // Check for line comment -- ... \n
490        if remaining.starts_with("--") {
491            if let Some(newline) = remaining.find('\n') {
492                current += newline + 1;
493                continue;
494            } else {
495                // No newline - comment goes to end
496                return text.len();
497            }
498        }
499
500        // No more whitespace or comments
501        break;
502    }
503
504    current
505}
506
507/// Matches an identifier at the given position (case-insensitive, handles quoting).
508fn match_identifier_at(text: &str, pos: usize, identifier: &str) -> Option<(usize, usize)> {
509    if pos >= text.len() {
510        return None;
511    }
512
513    let remaining = &text[pos..];
514    let ident_upper = identifier.to_ascii_uppercase();
515
516    // Check for quoted variants first
517    for (open, close) in [("\"", "\""), ("`", "`"), ("[", "]")] {
518        if remaining.starts_with(open) {
519            let after_open = open.len();
520            if remaining[after_open..]
521                .to_ascii_uppercase()
522                .starts_with(&ident_upper)
523            {
524                let ident_end = after_open + identifier.len();
525                if remaining[ident_end..].starts_with(close) {
526                    return Some((pos + after_open, pos + ident_end));
527                }
528            }
529        }
530    }
531
532    // Check for unquoted identifier with word boundary
533    if remaining.to_ascii_uppercase().starts_with(&ident_upper) {
534        let end_pos = identifier.len();
535        // Ensure word boundary after identifier.
536        let after_ok = end_pos >= remaining.len() || !is_identifier_char_at(remaining, end_pos);
537        if after_ok {
538            return Some((pos, pos + identifier.len()));
539        }
540    }
541
542    None
543}
544
545/// Finds an identifier occurrence in text, skipping comments and string literals.
546fn find_qualified_name(text: &str, qualified_name: &str) -> Option<(usize, usize)> {
547    find_identifier_occurrence(text, qualified_name, 0)
548        .map(|occurrence| (occurrence.full_start, occurrence.full_end))
549}
550
551#[derive(Debug, Clone, Copy)]
552struct IdentifierOccurrence {
553    full_start: usize,
554    full_end: usize,
555    tail_start: usize,
556    tail_end: usize,
557}
558
559#[derive(Debug, Clone, Copy)]
560struct ParsedIdentifierPart {
561    content_start: usize,
562    content_end: usize,
563    full_end: usize,
564}
565
566fn find_identifier_occurrence(
567    text: &str,
568    identifier: &str,
569    search_start: usize,
570) -> Option<IdentifierOccurrence> {
571    if identifier.is_empty() || search_start >= text.len() || !text.is_char_boundary(search_start) {
572        return None;
573    }
574
575    let target_parts = parse_identifier_target_parts(identifier)?;
576    let bytes = text.as_bytes();
577    let mut cursor = search_start;
578    while cursor < bytes.len() {
579        if let Some(skip_to) = skip_string_or_comment(bytes, cursor) {
580            debug_assert!(skip_to > cursor);
581            if skip_to <= cursor {
582                return None;
583            }
584            cursor = skip_to;
585            continue;
586        }
587
588        if let Some(occurrence) = match_identifier_occurrence_at(text, cursor, &target_parts) {
589            return Some(occurrence);
590        }
591
592        cursor = advance_scan_cursor(text, cursor)?;
593    }
594
595    None
596}
597
598fn parse_identifier_target_parts(identifier: &str) -> Option<Vec<String>> {
599    let target_parts: Vec<String> = split_qualified_identifiers(identifier)
600        .into_iter()
601        .map(|part| unquote_identifier(&part))
602        .collect();
603    (!target_parts.is_empty() && target_parts.iter().all(|part| !part.is_empty()))
604        .then_some(target_parts)
605}
606
607fn match_identifier_occurrence_at(
608    text: &str,
609    start: usize,
610    target_parts: &[String],
611) -> Option<IdentifierOccurrence> {
612    if !text.is_char_boundary(start) {
613        return None;
614    }
615
616    let before_ok = start == 0 || !is_identifier_char_before(text, start);
617    if !before_ok {
618        return None;
619    }
620
621    let (full_end, tail_start, tail_end) = match_identifier_sequence_at(text, start, target_parts)?;
622    let after_ok = full_end == text.len() || !is_identifier_char_at(text, full_end);
623    after_ok.then_some(IdentifierOccurrence {
624        full_start: start,
625        full_end,
626        tail_start,
627        tail_end,
628    })
629}
630
631fn advance_scan_cursor(text: &str, cursor: usize) -> Option<usize> {
632    if !text.is_char_boundary(cursor) {
633        return Some(cursor + 1);
634    }
635    let ch = text.get(cursor..)?.chars().next()?;
636    Some(cursor + ch.len_utf8())
637}
638
639fn match_identifier_sequence_at(
640    text: &str,
641    start: usize,
642    parts: &[String],
643) -> Option<(usize, usize, usize)> {
644    if parts.is_empty() {
645        return None;
646    }
647
648    let bytes = text.as_bytes();
649    let mut current = start;
650    let mut tail = None;
651
652    for (idx, part) in parts.iter().enumerate() {
653        let parsed = match_identifier_part_at(text, current, part)?;
654
655        if idx == parts.len() - 1 {
656            tail = Some((parsed.content_start, parsed.content_end));
657        }
658
659        current = parsed.full_end;
660        if idx < parts.len() - 1 {
661            current = skip_whitespace_and_comments(text, current);
662            if current >= bytes.len() || bytes[current] != b'.' {
663                return None;
664            }
665            current = skip_whitespace_and_comments(text, current + 1);
666        }
667    }
668
669    let (tail_start, tail_end) = tail?;
670    Some((current, tail_start, tail_end))
671}
672
673fn match_identifier_part_at(
674    text: &str,
675    start: usize,
676    target_part: &str,
677) -> Option<ParsedIdentifierPart> {
678    if start >= text.len() || !text.is_char_boundary(start) || target_part.is_empty() {
679        return None;
680    }
681
682    let bytes = text.as_bytes();
683    match bytes[start] {
684        b'"' | b'`' | b'[' | b'\'' => {
685            let (close_quote, content_start) = match bytes[start] {
686                b'"' => (b'"', start + 1),
687                b'`' => (b'`', start + 1),
688                b'[' => (b']', start + 1),
689                b'\'' => (b'\'', start + 1),
690                _ => unreachable!(),
691            };
692
693            let mut i = content_start;
694            while i < bytes.len() {
695                if bytes[i] == close_quote {
696                    if i + 1 < bytes.len() && bytes[i + 1] == close_quote {
697                        i += 2;
698                        continue;
699                    }
700                    let candidate = text.get(content_start..i)?;
701                    if !candidate.eq_ignore_ascii_case(target_part) {
702                        return None;
703                    }
704                    return Some(ParsedIdentifierPart {
705                        content_start,
706                        content_end: i,
707                        full_end: i + 1,
708                    });
709                }
710                i += 1;
711            }
712            None
713        }
714        _ if target_part.chars().all(is_identifier_char) => {
715            let mut end = start;
716            for ch in text.get(start..)?.chars() {
717                if !is_identifier_char(ch) {
718                    break;
719                }
720                end += ch.len_utf8();
721            }
722            let candidate = text.get(start..end)?;
723            candidate
724                .eq_ignore_ascii_case(target_part)
725                .then_some(ParsedIdentifierPart {
726                    content_start: start,
727                    content_end: end,
728                    full_end: end,
729                })
730        }
731        _ => {
732            let end = start + target_part.len();
733            let candidate = text.get(start..end)?;
734            candidate
735                .eq_ignore_ascii_case(target_part)
736                .then_some(ParsedIdentifierPart {
737                    content_start: start,
738                    content_end: end,
739                    full_end: end,
740                })
741        }
742    }
743}
744
745fn consume_ascii_keyword(text: &str, pos: usize, keyword: &str) -> Option<usize> {
746    let pos = skip_whitespace_and_comments(text, pos);
747    let remaining = text.get(pos..)?;
748    if find_keyword_case_insensitive(remaining, keyword) != Some(0) {
749        return None;
750    }
751    Some(skip_whitespace_and_comments(text, pos + keyword.len()))
752}
753
754fn is_identifier_char(ch: char) -> bool {
755    ch.is_alphanumeric() || ch == '_'
756}
757
758fn is_identifier_char_before(text: &str, byte_offset: usize) -> bool {
759    text.get(..byte_offset)
760        .and_then(|prefix| prefix.chars().next_back())
761        .is_some_and(is_identifier_char)
762}
763
764fn is_identifier_char_at(text: &str, byte_offset: usize) -> bool {
765    text.get(byte_offset..)
766        .and_then(|suffix| suffix.chars().next())
767        .is_some_and(is_identifier_char)
768}
769
770/// Calculates the byte offset for a given line and column in SQL text.
771///
772/// This is useful for converting line:column positions (from parse errors)
773/// to byte offsets for the Span type.
774///
775/// # Arguments
776///
777/// * `sql` - The SQL source text
778/// * `line` - Line number (1-indexed)
779/// * `column` - Column number (1-indexed)
780pub fn line_col_to_offset(sql: &str, line: usize, column: usize) -> Option<usize> {
781    if line == 0 || column == 0 {
782        return None;
783    }
784
785    let bytes = sql.as_bytes();
786    let mut current_line = 1;
787    let mut offset = 0;
788
789    // Advance `offset` to the start of the requested line.
790    while current_line < line {
791        let remaining = bytes.get(offset..)?;
792        let newline_pos = remaining.iter().position(|&b| b == b'\n')?;
793        offset += newline_pos + 1;
794        current_line += 1;
795    }
796
797    let line_start = offset;
798    let remaining = bytes.get(line_start..)?;
799    let line_len = remaining
800        .iter()
801        .position(|&b| b == b'\n')
802        .unwrap_or(remaining.len());
803    let line_end = line_start + line_len;
804    let line_slice = &sql[line_start..line_end];
805
806    // sqlparser reports columns in characters, so iterate char_indices to convert
807    // the 1-based column into a byte offset.
808    let mut current_column = 1;
809    for (rel_offset, _) in line_slice.char_indices() {
810        if current_column == column {
811            return Some(line_start + rel_offset);
812        }
813        current_column += 1;
814    }
815
816    if column == current_column {
817        return Some(line_end);
818    }
819
820    None
821}
822
823#[cfg(test)]
824mod tests {
825    use super::*;
826
827    #[test]
828    fn test_find_identifier_span_simple() {
829        let sql = "SELECT * FROM users WHERE id = 1";
830        let span = find_identifier_span(sql, "users", 0);
831        assert_eq!(span, Some(Span::new(14, 19)));
832    }
833
834    #[test]
835    fn test_find_identifier_span_case_insensitive() {
836        let sql = "SELECT * FROM Users WHERE id = 1";
837        let span = find_identifier_span(sql, "users", 0);
838        assert!(span.is_some());
839    }
840
841    #[test]
842    fn test_find_identifier_span_qualified() {
843        let sql = "SELECT * FROM public.users";
844        let span = find_identifier_span(sql, "public.users", 0);
845        assert_eq!(span, Some(Span::new(14, 26)));
846    }
847
848    #[test]
849    fn test_find_identifier_span_qualified_with_quotes() {
850        let sql = r#"SELECT * FROM "Public"."Users""#;
851        let span = find_identifier_span(sql, "public.users", 0);
852        assert_eq!(span, Some(Span::new(14, 30)));
853        let span = span.expect("quoted qualified span");
854        assert_eq!(&sql[span.start..span.end], r#""Public"."Users""#);
855    }
856
857    #[test]
858    fn test_find_identifier_span_with_offset() {
859        let sql = "SELECT 1; SELECT * FROM users";
860        let span = find_identifier_span(sql, "users", 10);
861        assert_eq!(span, Some(Span::new(24, 29)));
862    }
863
864    #[test]
865    fn test_find_identifier_span_not_found() {
866        let sql = "SELECT * FROM users";
867        let span = find_identifier_span(sql, "orders", 0);
868        assert_eq!(span, None);
869    }
870
871    #[test]
872    fn test_find_identifier_word_boundary() {
873        let sql = "SELECT users_id FROM users";
874        // Should find "users" as whole word, not "users" in "users_id"
875        let span = find_identifier_span(sql, "users", 0);
876        assert!(span.is_some());
877        let span = span.unwrap();
878        // Should match the standalone "users", not the one in "users_id"
879        assert_eq!(&sql[span.start..span.end].to_lowercase(), "users");
880    }
881
882    #[test]
883    fn test_find_cte_definition_span_single() {
884        let sql = "WITH my_cte AS (SELECT 1) SELECT * FROM my_cte";
885        let span = find_cte_definition_span(sql, "my_cte", 0);
886        assert_eq!(span, Some(Span::new(5, 11)));
887    }
888
889    #[test]
890    fn test_find_cte_definition_span_multiple() {
891        let sql = "WITH cte1 AS (SELECT 1), cte2 AS (SELECT 2) SELECT * FROM cte1, cte2";
892        let first_span = find_cte_definition_span(sql, "cte1", 0).expect("cte1 span");
893        assert_eq!(first_span, Span::new(5, 9));
894
895        let second_span = find_cte_definition_span(sql, "cte2", first_span.end).expect("cte2 span");
896        assert_eq!(second_span, Span::new(25, 29));
897    }
898
899    #[test]
900    fn test_find_derived_table_alias_span() {
901        let sql = "SELECT * FROM (SELECT 1) AS derived";
902        let span = find_derived_table_alias_span(sql, "derived", 0);
903        assert_eq!(span, Some(Span::new(28, 35)));
904        let span = span.expect("derived span");
905        assert_eq!(&sql[span.start..span.end], "derived");
906    }
907
908    #[test]
909    fn test_find_cte_definition_span_quoted() {
910        // Double-quoted identifier
911        let sql = r#"WITH "MyCte" AS (SELECT 1) SELECT * FROM "MyCte""#;
912        let span = find_cte_definition_span(sql, "MyCte", 0);
913        assert!(span.is_some(), "should find quoted CTE");
914        let span = span.unwrap();
915        assert_eq!(&sql[span.start..span.end], "MyCte");
916
917        // Backtick-quoted identifier
918        let sql = "WITH `my_cte` AS (SELECT 1) SELECT * FROM `my_cte`";
919        let span = find_cte_definition_span(sql, "my_cte", 0);
920        assert!(span.is_some(), "should find backtick-quoted CTE");
921        let span = span.unwrap();
922        assert_eq!(&sql[span.start..span.end], "my_cte");
923
924        // Bracket-quoted identifier
925        let sql = "WITH [my_cte] AS (SELECT 1) SELECT * FROM [my_cte]";
926        let span = find_cte_definition_span(sql, "my_cte", 0);
927        assert!(span.is_some(), "should find bracket-quoted CTE");
928        let span = span.unwrap();
929        assert_eq!(&sql[span.start..span.end], "my_cte");
930    }
931
932    #[test]
933    fn test_find_derived_table_alias_span_without_as() {
934        // Derived table without AS keyword
935        let sql = "SELECT * FROM (SELECT 1) derived";
936        let span = find_derived_table_alias_span(sql, "derived", 0);
937        assert!(span.is_some(), "should find derived alias without AS");
938        let span = span.unwrap();
939        assert_eq!(&sql[span.start..span.end], "derived");
940    }
941
942    #[test]
943    fn test_find_derived_table_alias_span_multiple() {
944        let sql = "SELECT * FROM (SELECT 1) AS a, (SELECT 2) AS b";
945        let first_span = find_derived_table_alias_span(sql, "a", 0).expect("first derived span");
946        assert_eq!(&sql[first_span.start..first_span.end], "a");
947
948        let second_span =
949            find_derived_table_alias_span(sql, "b", first_span.end).expect("second derived span");
950        assert_eq!(&sql[second_span.start..second_span.end], "b");
951    }
952
953    #[test]
954    fn test_find_derived_table_alias_span_quoted() {
955        let sql = r#"SELECT * FROM (SELECT 1) AS "Derived""#;
956        let span = find_derived_table_alias_span(sql, "Derived", 0);
957        assert!(span.is_some(), "should find quoted derived alias");
958        let span = span.unwrap();
959        assert_eq!(&sql[span.start..span.end], "Derived");
960    }
961
962    #[test]
963    fn test_line_col_to_offset_single_line() {
964        let sql = "SELECT * FROM users";
965        assert_eq!(line_col_to_offset(sql, 1, 1), Some(0));
966        assert_eq!(line_col_to_offset(sql, 1, 8), Some(7));
967    }
968
969    #[test]
970    fn test_line_col_to_offset_multi_line() {
971        let sql = "SELECT *\nFROM users\nWHERE id = 1";
972        assert_eq!(line_col_to_offset(sql, 1, 1), Some(0));
973        assert_eq!(line_col_to_offset(sql, 2, 1), Some(9));
974        assert_eq!(line_col_to_offset(sql, 3, 1), Some(20));
975    }
976
977    #[test]
978    fn test_line_col_to_offset_unicode_columns() {
979        let sql = "SELECT μ, FROM users";
980        // Column 11 should point at the 'F' byte even though the line includes a multi-byte char.
981        assert_eq!(line_col_to_offset(sql, 1, 11), Some("SELECT μ, ".len()));
982        // Column 12 moves one character to the right (the 'R').
983        assert_eq!(line_col_to_offset(sql, 1, 12), Some("SELECT μ, F".len()));
984    }
985
986    #[test]
987    fn test_line_col_to_offset_invalid() {
988        let sql = "SELECT * FROM users";
989        assert_eq!(line_col_to_offset(sql, 0, 1), None);
990        assert_eq!(line_col_to_offset(sql, 1, 0), None);
991        assert_eq!(line_col_to_offset(sql, 5, 1), None);
992    }
993
994    #[test]
995    fn test_find_identifier_empty() {
996        let sql = "SELECT * FROM users";
997        assert_eq!(find_identifier_span(sql, "", 0), None);
998        assert_eq!(find_identifier_span("", "users", 0), None);
999    }
1000
1001    // ============================================================================
1002    // Regression tests for prior code review findings
1003    // ============================================================================
1004
1005    // Issue 1: WITH RECURSIVE not supported
1006    #[test]
1007    fn test_find_cte_definition_span_recursive() {
1008        let sql = "WITH RECURSIVE my_cte AS (SELECT 1 UNION ALL SELECT 2) SELECT * FROM my_cte";
1009        let span = find_cte_definition_span(sql, "my_cte", 0);
1010        assert!(
1011            span.is_some(),
1012            "should find CTE name after RECURSIVE keyword"
1013        );
1014        let span = span.unwrap();
1015        assert_eq!(&sql[span.start..span.end], "my_cte");
1016    }
1017
1018    #[test]
1019    fn test_find_cte_definition_span_recursive_multiple() {
1020        let sql = "WITH RECURSIVE cte1 AS (SELECT 1), cte2 AS (SELECT 2) SELECT * FROM cte1, cte2";
1021        let first_span = find_cte_definition_span(sql, "cte1", 0);
1022        assert!(
1023            first_span.is_some(),
1024            "should find first CTE after RECURSIVE"
1025        );
1026        let first_span = first_span.unwrap();
1027        assert_eq!(&sql[first_span.start..first_span.end], "cte1");
1028
1029        let second_span = find_cte_definition_span(sql, "cte2", first_span.end);
1030        assert!(second_span.is_some(), "should find second CTE after comma");
1031        let second_span = second_span.unwrap();
1032        assert_eq!(&sql[second_span.start..second_span.end], "cte2");
1033    }
1034
1035    // Issue 2: Bounds checking - search_start at end of string
1036    #[test]
1037    fn test_find_cte_definition_span_search_start_at_end() {
1038        let sql = "WITH cte AS (SELECT 1) SELECT * FROM cte";
1039        // Search starting at the very end should return None, not panic
1040        let span = find_cte_definition_span(sql, "cte", sql.len());
1041        assert_eq!(span, None);
1042    }
1043
1044    #[test]
1045    fn test_find_derived_table_alias_search_start_at_end() {
1046        let sql = "SELECT * FROM (SELECT 1) AS derived";
1047        // Search starting at the very end should return None, not panic
1048        let span = find_derived_table_alias_span(sql, "derived", sql.len());
1049        assert_eq!(span, None);
1050    }
1051
1052    #[test]
1053    fn test_find_derived_table_alias_paren_at_end() {
1054        // Edge case: closing paren at end with no alias
1055        let sql = "SELECT * FROM (SELECT 1)";
1056        let span = find_derived_table_alias_span(sql, "anything", 0);
1057        assert_eq!(span, None);
1058    }
1059
1060    // Issue 3: Word boundary logic - underscore handling
1061    #[test]
1062    fn test_word_boundary_underscore_prefix() {
1063        let sql = "SELECT * FROM _users";
1064        // Should find "_users" as identifier, not fail to match
1065        let span = find_identifier_span(sql, "_users", 0);
1066        assert!(
1067            span.is_some(),
1068            "should find identifier starting with underscore"
1069        );
1070    }
1071
1072    #[test]
1073    fn test_word_boundary_underscore_suffix_no_match() {
1074        let sql = "SELECT * FROM users_table";
1075        // Should NOT match "users" because it's followed by underscore
1076        let span = find_identifier_span(sql, "users", 0);
1077        // This tests the bug: the current code may incorrectly match "users" within "users_table"
1078        // because of operator precedence: `!x && y != z` instead of `!(x || y == z)`
1079        assert!(
1080            span.is_none() || {
1081                let s = span.unwrap();
1082                // If it matched, verify it's the whole word not a prefix
1083                s.end == s.start + "users".len()
1084                    && (s.end >= sql.len()
1085                        || !sql.as_bytes()[s.end].is_ascii_alphanumeric()
1086                            && sql.as_bytes()[s.end] != b'_')
1087            },
1088            "should not match 'users' as part of 'users_table'"
1089        );
1090    }
1091
1092    #[test]
1093    fn test_word_boundary_unicode_suffix_no_match() {
1094        let sql = "SELECT * FROM 表名";
1095        // Should NOT match "表" because it's followed by a Unicode identifier char.
1096        let span = find_identifier_span(sql, "表", 0);
1097        assert_eq!(span, None);
1098    }
1099
1100    #[test]
1101    fn test_word_boundary_unicode_search_offset_no_partial_match() {
1102        let sql = "SELECT 表 FROM 表名";
1103        let first = find_identifier_span(sql, "表", 0).expect("first 表");
1104        let second = find_identifier_span(sql, "表", first.end);
1105        assert_eq!(second, None);
1106    }
1107
1108    #[test]
1109    fn test_cte_name_with_underscore_suffix_no_match() {
1110        // When searching for "cte" it should not match "cte_name"
1111        let sql = "WITH cte_name AS (SELECT 1) SELECT * FROM cte_name";
1112        let span = find_cte_definition_span(sql, "cte", 0);
1113        assert!(
1114            span.is_none(),
1115            "should not match 'cte' as part of 'cte_name'"
1116        );
1117    }
1118
1119    // Issue 4: Comments not handled
1120    #[test]
1121    fn test_find_cte_definition_span_with_block_comment() {
1122        let sql = "WITH /* comment */ my_cte AS (SELECT 1) SELECT * FROM my_cte";
1123        let span = find_cte_definition_span(sql, "my_cte", 0);
1124        assert!(span.is_some(), "should find CTE name after block comment");
1125        let span = span.unwrap();
1126        assert_eq!(&sql[span.start..span.end], "my_cte");
1127    }
1128
1129    #[test]
1130    fn test_find_cte_definition_span_with_line_comment() {
1131        let sql = "WITH -- comment\nmy_cte AS (SELECT 1) SELECT * FROM my_cte";
1132        let span = find_cte_definition_span(sql, "my_cte", 0);
1133        assert!(span.is_some(), "should find CTE name after line comment");
1134        let span = span.unwrap();
1135        assert_eq!(&sql[span.start..span.end], "my_cte");
1136    }
1137
1138    #[test]
1139    fn test_find_derived_table_alias_with_comment() {
1140        let sql = "SELECT * FROM (SELECT 1) /* comment */ AS derived";
1141        let span = find_derived_table_alias_span(sql, "derived", 0);
1142        assert!(span.is_some(), "should find alias after block comment");
1143        let span = span.unwrap();
1144        assert_eq!(&sql[span.start..span.end], "derived");
1145    }
1146
1147    // Issue 5: String literals may contain false matches
1148    #[test]
1149    fn test_find_cte_definition_not_in_string_literal() {
1150        // The CTE name "cte" appears in a string literal first, then as actual CTE
1151        let sql = "WITH cte AS (SELECT 'cte' AS name) SELECT * FROM cte";
1152        let span = find_cte_definition_span(sql, "cte", 0);
1153        assert!(span.is_some(), "should find CTE definition");
1154        let span = span.unwrap();
1155        // Should find the definition at position 5, not the string literal
1156        assert_eq!(
1157            span.start, 5,
1158            "should find CTE definition, not string literal"
1159        );
1160        assert_eq!(&sql[span.start..span.end], "cte");
1161    }
1162
1163    #[test]
1164    fn test_find_derived_alias_not_in_string_literal() {
1165        // The alias appears in a string literal inside the subquery
1166        let sql = "SELECT * FROM (SELECT 'derived' AS name) AS derived";
1167        let span = find_derived_table_alias_span(sql, "derived", 0);
1168        assert!(span.is_some(), "should find derived alias");
1169        let span = span.unwrap();
1170        // Should find the actual alias after the closing paren, not the string
1171        assert_eq!(&sql[span.start..span.end], "derived");
1172        // The alias position should be after the closing paren
1173        assert!(
1174            span.start > sql.find(')').unwrap(),
1175            "span should be after closing paren"
1176        );
1177    }
1178
1179    // Issue 6: Edge cases for empty/malformed inputs
1180    #[test]
1181    fn test_find_cte_definition_empty_identifier() {
1182        let sql = "WITH cte AS (SELECT 1) SELECT * FROM cte";
1183        let span = find_cte_definition_span(sql, "", 0);
1184        assert_eq!(span, None, "empty identifier should return None");
1185    }
1186
1187    #[test]
1188    fn test_find_derived_table_alias_empty_identifier() {
1189        let sql = "SELECT * FROM (SELECT 1) AS derived";
1190        let span = find_derived_table_alias_span(sql, "", 0);
1191        assert_eq!(span, None, "empty identifier should return None");
1192    }
1193
1194    #[test]
1195    fn test_find_cte_definition_empty_sql() {
1196        let span = find_cte_definition_span("", "cte", 0);
1197        assert_eq!(span, None, "empty SQL should return None");
1198    }
1199
1200    #[test]
1201    fn test_find_derived_table_alias_empty_sql() {
1202        let span = find_derived_table_alias_span("", "derived", 0);
1203        assert_eq!(span, None, "empty SQL should return None");
1204    }
1205
1206    #[test]
1207    fn test_find_cte_definition_search_start_beyond_bounds() {
1208        let sql = "WITH cte AS (SELECT 1)";
1209        let span = find_cte_definition_span(sql, "cte", sql.len() + 100);
1210        assert_eq!(span, None, "search_start beyond bounds should return None");
1211    }
1212
1213    #[test]
1214    fn test_find_derived_table_alias_search_start_beyond_bounds() {
1215        let sql = "SELECT * FROM (SELECT 1) AS derived";
1216        let span = find_derived_table_alias_span(sql, "derived", sql.len() + 100);
1217        assert_eq!(span, None, "search_start beyond bounds should return None");
1218    }
1219
1220    // Additional edge case: identifier at very end of SQL
1221    #[test]
1222    fn test_find_cte_at_end_of_sql() {
1223        let sql = "WITH x AS (SELECT 1) SELECT * FROM x";
1224        let span = find_cte_definition_span(sql, "x", 0);
1225        assert!(span.is_some());
1226        let span = span.unwrap();
1227        assert_eq!(&sql[span.start..span.end], "x");
1228    }
1229
1230    // Test for potential panic in match_identifier_at with short remaining text
1231    #[test]
1232    fn test_match_identifier_at_short_remaining() {
1233        let sql = "WITH a AS (SELECT 1) SELECT * FROM a";
1234        let span = find_cte_definition_span(sql, "a", 0);
1235        assert!(span.is_some());
1236        let span = span.unwrap();
1237        assert_eq!(&sql[span.start..span.end], "a");
1238    }
1239
1240    #[test]
1241    fn test_find_all_identifier_spans_multiple_refs() {
1242        let sql = "SELECT * FROM users u WHERE u.id IN (SELECT id FROM users)";
1243        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1244        assert_eq!(spans.len(), 2);
1245        for span in &spans {
1246            assert_eq!(&sql[span.start..span.end], "users");
1247        }
1248    }
1249
1250    #[test]
1251    fn test_find_all_identifier_spans_cte_declaration_and_refs() {
1252        let sql = "WITH a AS (SELECT 1) SELECT a.x FROM a";
1253        let spans = find_all_identifier_spans(sql, "a", 0, sql.len());
1254        // `a` appears three times: declaration, qualifier in `a.x`, and `FROM a`.
1255        assert_eq!(spans.len(), 3);
1256        assert!(spans
1257            .iter()
1258            .all(|s| &sql[s.start..s.end] == "a" && s.end > s.start));
1259    }
1260
1261    #[test]
1262    fn test_find_all_identifier_spans_ignores_string_literals_and_comments() {
1263        let sql = "SELECT * FROM users WHERE name = 'users' -- users\n/* users */";
1264        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1265        // Only the `FROM users` occurrence should match.
1266        assert_eq!(spans.len(), 1);
1267        assert_eq!(spans[0].start, 14);
1268        assert_eq!(spans[0].end, 19);
1269    }
1270
1271    #[test]
1272    fn test_find_all_identifier_spans_word_boundary() {
1273        let sql = "SELECT * FROM users_archive, users";
1274        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1275        // `users_archive` must not match.
1276        assert_eq!(spans.len(), 1);
1277        assert_eq!(&sql[spans[0].start..spans[0].end], "users");
1278    }
1279
1280    #[test]
1281    fn test_find_all_identifier_spans_case_insensitive() {
1282        let sql = "SELECT * FROM Users JOIN USERS u ON u.id = Users.id";
1283        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1284        assert_eq!(spans.len(), 3);
1285    }
1286
1287    #[test]
1288    fn test_find_all_identifier_spans_respects_search_bounds() {
1289        let sql = "users users users";
1290        let spans = find_all_identifier_spans(sql, "users", 6, 12);
1291        assert_eq!(spans.len(), 1);
1292        assert_eq!(spans[0].start, 6);
1293        assert_eq!(spans[0].end, 11);
1294    }
1295
1296    #[test]
1297    fn test_find_cte_body_span_simple() {
1298        let sql = "WITH a AS (SELECT 1) SELECT * FROM a";
1299        // The name span for `a` at offset 5.
1300        let name_span = Span::new(5, 6);
1301        let body = find_cte_body_span(sql, name_span).expect("body span");
1302        assert_eq!(&sql[body.start..body.end], "(SELECT 1)");
1303    }
1304
1305    #[test]
1306    fn test_find_cte_body_span_nested_parens() {
1307        let sql = "WITH a AS (SELECT (1 + 2) AS x) SELECT * FROM a";
1308        let name_span = Span::new(5, 6);
1309        let body = find_cte_body_span(sql, name_span).expect("body span");
1310        assert_eq!(&sql[body.start..body.end], "(SELECT (1 + 2) AS x)");
1311    }
1312
1313    #[test]
1314    fn test_find_cte_body_span_paren_in_string_literal() {
1315        let sql = "WITH a AS (SELECT ')' AS c) SELECT * FROM a";
1316        let name_span = Span::new(5, 6);
1317        let body = find_cte_body_span(sql, name_span).expect("body span");
1318        assert_eq!(&sql[body.start..body.end], "(SELECT ')' AS c)");
1319    }
1320
1321    #[test]
1322    fn test_find_cte_body_span_missing_paren_returns_none() {
1323        // No parenthesized body after the name.
1324        let sql = "WITH a AS SELECT 1";
1325        let name_span = Span::new(5, 6);
1326        assert_eq!(find_cte_body_span(sql, name_span), None);
1327    }
1328
1329    #[test]
1330    fn test_find_cte_body_span_with_whitespace_and_comment() {
1331        let sql = "WITH a  /* note */ AS  (SELECT 1) SELECT * FROM a";
1332        let name_span = Span::new(5, 6);
1333        let body = find_cte_body_span(sql, name_span).expect("body span");
1334        assert_eq!(&sql[body.start..body.end], "(SELECT 1)");
1335    }
1336
1337    #[test]
1338    fn test_find_cte_body_span_with_column_list() {
1339        let sql = "WITH a(x, y) AS (SELECT 1, 2) SELECT * FROM a";
1340        let name_span = Span::new(5, 6);
1341        let body = find_cte_body_span(sql, name_span).expect("body span");
1342        assert_eq!(&sql[body.start..body.end], "(SELECT 1, 2)");
1343    }
1344
1345    #[test]
1346    fn test_find_cte_body_span_with_materialized_modifier() {
1347        let sql = "WITH a AS MATERIALIZED (SELECT 1) SELECT * FROM a";
1348        let name_span = Span::new(5, 6);
1349        let body = find_cte_body_span(sql, name_span).expect("body span");
1350        assert_eq!(&sql[body.start..body.end], "(SELECT 1)");
1351    }
1352
1353    #[test]
1354    fn test_find_cte_body_span_with_not_materialized_modifier() {
1355        let sql = "WITH a AS NOT MATERIALIZED (SELECT 1) SELECT * FROM a";
1356        let name_span = Span::new(5, 6);
1357        let body = find_cte_body_span(sql, name_span).expect("body span");
1358        assert_eq!(&sql[body.start..body.end], "(SELECT 1)");
1359    }
1360
1361    #[test]
1362    fn test_find_identifier_span_skips_string_literal_before_match() {
1363        let sql = "SELECT 'users' AS x FROM users";
1364        let span = find_identifier_span(sql, "users", 0).expect("users span");
1365        assert_eq!(&sql[span.start..span.end], "users");
1366        assert_eq!(span, Span::new(25, 30));
1367    }
1368
1369    #[test]
1370    fn test_find_relation_occurrence_spans_quoted_identifier_with_embedded_dots() {
1371        let sql = "SELECT * FROM \"my.schema\".\"my.table\"";
1372        let (full_span, name_span) =
1373            find_relation_occurrence_spans(sql, "\"my.schema\".\"my.table\"", 0)
1374                .expect("relation span");
1375        assert_eq!(
1376            &sql[full_span.start..full_span.end],
1377            "\"my.schema\".\"my.table\""
1378        );
1379        assert_eq!(&sql[name_span.start..name_span.end], "my.table");
1380    }
1381
1382    // ============================================================================
1383    // UTF-8 safety regression tests: multi-byte characters in comments, string
1384    // literals, and around the scan region must not cause panics. Prior to the
1385    // byte-based refactor, the helpers would `sql[pos..]` with a cursor advancing
1386    // one byte at a time, which panics when `pos` lands inside a multi-byte char.
1387    // ============================================================================
1388
1389    #[test]
1390    fn test_find_all_identifier_spans_skips_non_ascii_comment() {
1391        // A block comment containing a multi-byte character (µ, 2 bytes in UTF-8)
1392        // previously caused a panic because the byte-indexed cursor slicing into
1393        // `sql[pos..]` would land inside the multi-byte sequence.
1394        let sql = "SELECT * /* µ µµµ */ FROM users WHERE id = 1";
1395        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1396        assert_eq!(spans.len(), 1);
1397        assert_eq!(&sql[spans[0].start..spans[0].end], "users");
1398    }
1399
1400    #[test]
1401    fn test_find_all_identifier_spans_skips_non_ascii_line_comment() {
1402        let sql = "SELECT * FROM users -- é comment\nJOIN users u";
1403        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1404        assert_eq!(spans.len(), 2);
1405    }
1406
1407    #[test]
1408    fn test_find_all_identifier_spans_skips_non_ascii_string_literal() {
1409        // Multi-byte char inside a string literal.
1410        let sql = "SELECT 'héllo users' FROM users";
1411        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1412        // The `users` inside the string literal must not match; only the real FROM reference.
1413        assert_eq!(spans.len(), 1);
1414        assert_eq!(&sql[spans[0].start..spans[0].end], "users");
1415    }
1416
1417    #[test]
1418    fn test_find_cte_body_span_with_non_ascii_body_contents() {
1419        let sql = "WITH a AS (SELECT 'µ' AS x, (1 + 2) AS y) SELECT * FROM a";
1420        let name_span = Span::new(5, 6);
1421        let body = find_cte_body_span(sql, name_span).expect("body span");
1422        assert_eq!(
1423            &sql[body.start..body.end],
1424            "(SELECT 'µ' AS x, (1 + 2) AS y)"
1425        );
1426    }
1427
1428    #[test]
1429    fn test_find_all_identifier_spans_non_ascii_between_occurrences() {
1430        // Multi-byte chars between identifier occurrences stress the scan cursor.
1431        let sql = "SELECT users.id -- µ\nFROM users /* ñ */ JOIN users";
1432        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1433        assert_eq!(spans.len(), 3);
1434    }
1435
1436    #[test]
1437    fn test_find_all_identifier_spans_ignores_hash_comments() {
1438        let sql = "SELECT 1 # users\nFROM users";
1439        let spans = find_all_identifier_spans(sql, "users", 0, sql.len());
1440        assert_eq!(spans.len(), 1);
1441        assert_eq!(&sql[spans[0].start..spans[0].end], "users");
1442        assert_eq!(spans[0], Span::new(22, 27));
1443    }
1444
1445    #[test]
1446    fn test_find_identifier_span_skips_dollar_quoted_string_literal() {
1447        let sql = "SELECT $$users$$ AS x FROM users";
1448        let span = find_identifier_span(sql, "users", 0).expect("users span");
1449        assert_eq!(&sql[span.start..span.end], "users");
1450        assert_eq!(span, Span::new(27, 32));
1451    }
1452
1453    #[test]
1454    fn test_find_cte_body_span_with_dollar_quoted_string() {
1455        let sql = "WITH a AS (SELECT $$)$$ AS x) SELECT * FROM a";
1456        let name_span = Span::new(5, 6);
1457        let body = find_cte_body_span(sql, name_span).expect("body span");
1458        assert_eq!(&sql[body.start..body.end], "(SELECT $$)$$ AS x)");
1459    }
1460}
flowscope_core/analyzer/helpers/span.rs

flowscope_core/analyzer/helpers/
span.rs