// File: sqrust_rules/ambiguous/regexp_function.rs

1use sqrust_core::{Diagnostic, FileContext, Rule};
2
/// Lint rule reporting dialect-specific regexp functions and the `RLIKE`
/// operator found in SQL source text.
pub struct RegexpFunction;
4
/// Names of dialect-specific regexp functions.
///
/// Each entry is reported only when it appears as a call, i.e. the name is
/// followed by an opening parenthesis. Matching is case-insensitive.
const REGEXP_FUNCTIONS: &[&str] = &[
    "REGEXP_LIKE",
    "REGEXP_CONTAINS",
    "REGEXP_EXTRACT",
    "REGEXP_MATCH",
    "REGEXP_MATCHES",
    "REGEXP_SUBSTR",
    "REGEXP_INSTR",
    "REGEXP_COUNT",
    "REGEXP_REPLACE",
    "REGEXP_SPLIT_TO_ARRAY",
];
18
/// The `RLIKE` operator keyword.
///
/// Unlike the entries in the function list it is not a call, so it is
/// matched as a standalone word without requiring a following `(`.
const RLIKE_KEYWORD: &str = "RLIKE";
21
22impl Rule for RegexpFunction {
23    fn name(&self) -> &'static str {
24        "Ambiguous/RegexpFunction"
25    }
26
27    fn check(&self, ctx: &FileContext) -> Vec<Diagnostic> {
28        find_violations(&ctx.source, self.name())
29    }
30}
31
32fn find_violations(source: &str, rule_name: &'static str) -> Vec<Diagnostic> {
33    let bytes = source.as_bytes();
34    let len = bytes.len();
35
36    if len == 0 {
37        return Vec::new();
38    }
39
40    let skip = build_skip_set(bytes, len);
41    let mut diags = Vec::new();
42
43    // Scan for each regexp function name followed by '('
44    for func_name in REGEXP_FUNCTIONS {
45        scan_for_function(source, bytes, len, &skip, func_name, rule_name, &mut diags);
46    }
47
48    // Scan for RLIKE as a standalone keyword (operator form)
49    scan_for_keyword(source, bytes, len, &skip, RLIKE_KEYWORD, rule_name, &mut diags);
50
51    // Sort diagnostics by line then col for stable output
52    diags.sort_by(|a, b| a.line.cmp(&b.line).then(a.col.cmp(&b.col)));
53
54    diags
55}
56
57/// Scan for `func_name(` (case-insensitive) with word boundaries.
58fn scan_for_function(
59    source: &str,
60    bytes: &[u8],
61    len: usize,
62    skip: &[bool],
63    func_name: &str,
64    rule_name: &'static str,
65    diags: &mut Vec<Diagnostic>,
66) {
67    let kw = func_name.as_bytes();
68    let kw_len = kw.len();
69    let mut i = 0;
70
71    while i + kw_len <= len {
72        if skip[i] {
73            i += 1;
74            continue;
75        }
76
77        // Word boundary before
78        let before_ok = i == 0 || !is_word_char(bytes[i - 1]);
79        if before_ok && bytes[i..i + kw_len].eq_ignore_ascii_case(kw) {
80            let after = i + kw_len;
81            // Word boundary after: next char must not be a word char
82            let after_ok = after >= len || !is_word_char(bytes[after]);
83            if after_ok {
84                // Check that after optional whitespace there is a '('
85                let mut j = after;
86                while j < len && (bytes[j] == b' ' || bytes[j] == b'\t') {
87                    j += 1;
88                }
89                if j < len && bytes[j] == b'(' {
90                    let (line, col) = line_col(source, i);
91                    diags.push(Diagnostic {
92                        rule: rule_name,
93                        message: format!(
94                            "{func} is dialect-specific regexp syntax — different databases use \
95                             different regexp functions with inconsistent behavior; consider \
96                             abstracting via a dbt macro",
97                            func = func_name
98                        ),
99                        line,
100                        col,
101                    });
102                    i += kw_len;
103                    continue;
104                }
105            }
106        }
107
108        i += 1;
109    }
110}
111
112/// Scan for `RLIKE` as a standalone keyword (case-insensitive) with word boundaries.
113fn scan_for_keyword(
114    source: &str,
115    bytes: &[u8],
116    len: usize,
117    skip: &[bool],
118    keyword: &str,
119    rule_name: &'static str,
120    diags: &mut Vec<Diagnostic>,
121) {
122    let kw = keyword.as_bytes();
123    let kw_len = kw.len();
124    let mut i = 0;
125
126    while i + kw_len <= len {
127        if skip[i] {
128            i += 1;
129            continue;
130        }
131
132        let before_ok = i == 0 || !is_word_char(bytes[i - 1]);
133        if before_ok && bytes[i..i + kw_len].eq_ignore_ascii_case(kw) {
134            let after = i + kw_len;
135            let after_ok = after >= len || !is_word_char(bytes[after]);
136            if after_ok {
137                let (line, col) = line_col(source, i);
138                diags.push(Diagnostic {
139                    rule: rule_name,
140                    message: format!(
141                        "{func} is dialect-specific regexp syntax — different databases use \
142                         different regexp functions with inconsistent behavior; consider \
143                         abstracting via a dbt macro",
144                        func = keyword
145                    ),
146                    line,
147                    col,
148                });
149                i += kw_len;
150                continue;
151            }
152        }
153
154        i += 1;
155    }
156}
157
/// Returns `true` when `ch` is an ASCII letter, an ASCII digit, or `_`.
#[inline]
fn is_word_char(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
162
/// Converts a byte `offset` into `source` to a 1-based `(line, column)` pair.
///
/// The line is one plus the number of `\n` bytes before `offset`. The column
/// is the 1-based byte distance from the start of that line.
///
/// `offset` is clamped to `source.len()` before any arithmetic, so an
/// out-of-range offset reports the position just past the last byte. The
/// computation runs on raw bytes, so an offset that falls inside a multi-byte
/// character does not panic.
fn line_col(source: &str, offset: usize) -> (usize, usize) {
    let offset = offset.min(source.len());
    let before = &source.as_bytes()[..offset];

    let line = before.iter().filter(|&&b| b == b'\n').count() + 1;

    // With a preceding newline at index `nl`, the column is
    // `offset - nl - 1 + 1`; on the first line it is simply `offset + 1`.
    let col = match before.iter().rposition(|&b| b == b'\n') {
        Some(nl) => offset - nl,
        None => offset + 1,
    };

    (line, col)
}
169
/// Builds a per-byte mask over the first `len` bytes of `bytes`.
///
/// `mask[i]` is `true` when byte `i` belongs to one of these regions
/// (delimiters included):
/// - a single-quoted string, where `''` is an escaped quote,
/// - a double-quoted identifier, where `""` is an escaped quote,
/// - a `/* ... */` block comment,
/// - a `--` line comment (up to, but not including, the newline).
///
/// An unterminated region extends to `len`.
fn build_skip_set(bytes: &[u8], len: usize) -> Vec<bool> {
    /// End (exclusive) of a quoted region opening at `open`; the quote byte
    /// is whatever sits at `open`, and a doubled quote does not terminate.
    fn quoted_end(bytes: &[u8], len: usize, open: usize) -> usize {
        let quote = bytes[open];
        let mut pos = open + 1;
        while pos < len {
            if bytes[pos] != quote {
                pos += 1;
            } else if pos + 1 < len && bytes[pos + 1] == quote {
                // Escaped quote: step over both bytes.
                pos += 2;
            } else {
                return pos + 1;
            }
        }
        len
    }

    /// End (exclusive) of a block comment whose `/*` starts at `open`.
    fn block_comment_end(bytes: &[u8], len: usize, open: usize) -> usize {
        let body_start = open + 2;
        bytes[body_start..len]
            .windows(2)
            .position(|pair| pair == b"*/")
            .map_or(len, |rel| body_start + rel + 2)
    }

    /// End (exclusive) of a line comment whose `--` starts at `open`.
    fn line_comment_end(bytes: &[u8], len: usize, open: usize) -> usize {
        let body_start = open + 2;
        bytes[body_start..len]
            .iter()
            .position(|&b| b == b'\n')
            .map_or(len, |rel| body_start + rel)
    }

    let mut mask = vec![false; len];
    let mut cursor = 0;

    while cursor < len {
        let here = bytes[cursor];
        let next = if cursor + 1 < len {
            Some(bytes[cursor + 1])
        } else {
            None
        };

        let region_end = match (here, next) {
            (b'\'', _) | (b'"', _) => Some(quoted_end(bytes, len, cursor)),
            (b'/', Some(b'*')) => Some(block_comment_end(bytes, len, cursor)),
            (b'-', Some(b'-')) => Some(line_comment_end(bytes, len, cursor)),
            _ => None,
        };

        match region_end {
            Some(end) => {
                mask[cursor..end].fill(true);
                cursor = end;
            }
            None => cursor += 1,
        }
    }

    mask
}