sqrust_rules/ambiguous/
regexp_function.rs1use sqrust_core::{Diagnostic, FileContext, Rule};
2
/// Lint rule reported under the name `Ambiguous/RegexpFunction`.
///
/// It flags calls to the functions listed in `REGEXP_FUNCTIONS` and uses of the
/// `RLIKE` keyword, skipping anything found inside quoted text or comments.
pub struct RegexpFunction;
4
/// Names of the regular-expression functions this rule reports.
///
/// Every entry is searched for case-insensitively as a whole word, and only
/// counts as a hit when it is used as a call, i.e. followed by `(`.
const REGEXP_FUNCTIONS: &[&str] = &[
    "REGEXP_LIKE",
    "REGEXP_CONTAINS",
    "REGEXP_EXTRACT",
    "REGEXP_MATCH",
    "REGEXP_MATCHES",
    "REGEXP_SUBSTR",
    "REGEXP_INSTR",
    "REGEXP_COUNT",
    "REGEXP_REPLACE",
    "REGEXP_SPLIT_TO_ARRAY",
];
18
/// Keyword reported wherever it appears as a whole word (case-insensitive);
/// unlike the function names, no following `(` is required.
const RLIKE_KEYWORD: &str = "RLIKE";
21
22impl Rule for RegexpFunction {
23 fn name(&self) -> &'static str {
24 "Ambiguous/RegexpFunction"
25 }
26
27 fn check(&self, ctx: &FileContext) -> Vec<Diagnostic> {
28 find_violations(&ctx.source, self.name())
29 }
30}
31
32fn find_violations(source: &str, rule_name: &'static str) -> Vec<Diagnostic> {
33 let bytes = source.as_bytes();
34 let len = bytes.len();
35
36 if len == 0 {
37 return Vec::new();
38 }
39
40 let skip = build_skip_set(bytes, len);
41 let mut diags = Vec::new();
42
43 for func_name in REGEXP_FUNCTIONS {
45 scan_for_function(source, bytes, len, &skip, func_name, rule_name, &mut diags);
46 }
47
48 scan_for_keyword(source, bytes, len, &skip, RLIKE_KEYWORD, rule_name, &mut diags);
50
51 diags.sort_by(|a, b| a.line.cmp(&b.line).then(a.col.cmp(&b.col)));
53
54 diags
55}
56
57fn scan_for_function(
59 source: &str,
60 bytes: &[u8],
61 len: usize,
62 skip: &[bool],
63 func_name: &str,
64 rule_name: &'static str,
65 diags: &mut Vec<Diagnostic>,
66) {
67 let kw = func_name.as_bytes();
68 let kw_len = kw.len();
69 let mut i = 0;
70
71 while i + kw_len <= len {
72 if skip[i] {
73 i += 1;
74 continue;
75 }
76
77 let before_ok = i == 0 || !is_word_char(bytes[i - 1]);
79 if before_ok && bytes[i..i + kw_len].eq_ignore_ascii_case(kw) {
80 let after = i + kw_len;
81 let after_ok = after >= len || !is_word_char(bytes[after]);
83 if after_ok {
84 let mut j = after;
86 while j < len && (bytes[j] == b' ' || bytes[j] == b'\t') {
87 j += 1;
88 }
89 if j < len && bytes[j] == b'(' {
90 let (line, col) = line_col(source, i);
91 diags.push(Diagnostic {
92 rule: rule_name,
93 message: format!(
94 "{func} is dialect-specific regexp syntax — different databases use \
95 different regexp functions with inconsistent behavior; consider \
96 abstracting via a dbt macro",
97 func = func_name
98 ),
99 line,
100 col,
101 });
102 i += kw_len;
103 continue;
104 }
105 }
106 }
107
108 i += 1;
109 }
110}
111
112fn scan_for_keyword(
114 source: &str,
115 bytes: &[u8],
116 len: usize,
117 skip: &[bool],
118 keyword: &str,
119 rule_name: &'static str,
120 diags: &mut Vec<Diagnostic>,
121) {
122 let kw = keyword.as_bytes();
123 let kw_len = kw.len();
124 let mut i = 0;
125
126 while i + kw_len <= len {
127 if skip[i] {
128 i += 1;
129 continue;
130 }
131
132 let before_ok = i == 0 || !is_word_char(bytes[i - 1]);
133 if before_ok && bytes[i..i + kw_len].eq_ignore_ascii_case(kw) {
134 let after = i + kw_len;
135 let after_ok = after >= len || !is_word_char(bytes[after]);
136 if after_ok {
137 let (line, col) = line_col(source, i);
138 diags.push(Diagnostic {
139 rule: rule_name,
140 message: format!(
141 "{func} is dialect-specific regexp syntax — different databases use \
142 different regexp functions with inconsistent behavior; consider \
143 abstracting via a dbt macro",
144 func = keyword
145 ),
146 line,
147 col,
148 });
149 i += kw_len;
150 continue;
151 }
152 }
153
154 i += 1;
155 }
156}
157
/// Returns `true` when `ch` is an ASCII letter, an ASCII digit or `_`,
/// i.e. a byte that can be part of an identifier-like word.
#[inline]
fn is_word_char(ch: u8) -> bool {
    matches!(ch, b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
162
/// Converts a byte `offset` into `source` to a 1-based `(line, column)` pair.
///
/// The line is one more than the number of `\n` bytes before the offset; the
/// column is the byte distance from the start of that line, plus one.
fn line_col(source: &str, offset: usize) -> (usize, usize) {
    let end = offset.min(source.len());
    let prefix = &source[..end];

    // '\n' is a single ASCII byte, so counting bytes equals counting chars here.
    let line = 1 + prefix.bytes().filter(|&b| b == b'\n').count();

    let col = match prefix.rfind('\n') {
        Some(newline_at) => offset - newline_at,
        None => offset + 1,
    };

    (line, col)
}
169
/// Builds a per-byte mask of the regions that keyword scanning must ignore.
///
/// The returned vector has `len` entries; entry `k` is `true` when byte `k`
/// belongs to one of:
/// - a single-quoted region, where `''` is an embedded quote,
/// - a double-quoted region, where `""` is an embedded quote,
/// - a `/* ... */` block comment,
/// - a `--` comment, which runs up to (not including) the next `\n`.
///
/// Delimiters themselves are masked. A region that is never closed extends to
/// the end of the input.
fn build_skip_set(bytes: &[u8], len: usize) -> Vec<bool> {
    let mut mask = vec![false; len];
    let mut pos = 0;

    while pos < len {
        let current = bytes[pos];
        let next = if pos + 1 < len {
            Some(bytes[pos + 1])
        } else {
            None
        };

        match (current, next) {
            // Quoted region: both quote kinds follow the same rules.
            (b'\'', _) | (b'"', _) => {
                let quote = current;
                mask[pos] = true;
                pos += 1;

                while pos < len {
                    mask[pos] = true;

                    if bytes[pos] != quote {
                        pos += 1;
                        continue;
                    }

                    // A doubled quote is an escaped quote, not the terminator.
                    let doubled = pos + 1 < len && bytes[pos + 1] == quote;
                    if !doubled {
                        pos += 1;
                        break;
                    }

                    mask[pos + 1] = true;
                    pos += 2;
                }
            }

            // Block comment: masked through the closing `*/`.
            (b'/', Some(b'*')) => {
                mask[pos] = true;
                mask[pos + 1] = true;
                pos += 2;

                while pos < len {
                    mask[pos] = true;

                    let closes = pos + 1 < len && bytes[pos] == b'*' && bytes[pos + 1] == b'/';
                    if closes {
                        mask[pos + 1] = true;
                        pos += 2;
                        break;
                    }

                    pos += 1;
                }
            }

            // Line comment: masked up to, but excluding, the newline.
            (b'-', Some(b'-')) => {
                mask[pos] = true;
                mask[pos + 1] = true;
                pos += 2;

                while pos < len && bytes[pos] != b'\n' {
                    mask[pos] = true;
                    pos += 1;
                }
            }

            _ => pos += 1,
        }
    }

    mask
}