// flowscope_core/linter/rules/rf_005.rs
1//! LINT_RF_005: References special chars.
2//!
3//! SQLFluff RF05 parity (current scope): flag identifiers containing disallowed
4//! special characters with SQLFluff-style identifier policy/config controls.
5
6use std::collections::HashSet;
7
8use crate::linter::config::LintConfig;
9use crate::linter::rule::{LintContext, LintRule};
10use crate::types::{issue_codes, Dialect, Issue};
11use regex::Regex;
12use sqlparser::ast::Statement;
13
14use super::identifier_candidates_helpers::{
15    collect_identifier_candidates, IdentifierCandidate, IdentifierPolicy,
16};
17
/// Rule state for LINT_RF_005 (SQLFluff RF05 parity): configuration that
/// controls which identifiers are checked and which characters they may contain.
pub struct ReferencesSpecialChars {
    /// Which quoted identifier kinds are subject to the check ("all" by default).
    quoted_policy: IdentifierPolicy,
    /// Which unquoted identifier kinds are subject to the check ("all" by default).
    unquoted_policy: IdentifierPolicy,
    /// Extra characters permitted beyond ASCII alphanumerics and `_`.
    additional_allowed_characters: HashSet<char>,
    /// When true, a literal space is also permitted inside identifiers.
    allow_space_in_identifier: bool,
    /// Identifier values to skip, stored normalized (unquoted + uppercased)
    /// so membership tests are case-insensitive.
    ignore_words: HashSet<String>,
    /// Optional case-sensitive regex; matching raw identifier values are skipped.
    ignore_words_regex: Option<Regex>,
}
26
27impl ReferencesSpecialChars {
28    pub fn from_config(config: &LintConfig) -> Self {
29        Self {
30            quoted_policy: IdentifierPolicy::from_config(
31                config,
32                issue_codes::LINT_RF_005,
33                "quoted_identifiers_policy",
34                "all",
35            ),
36            unquoted_policy: IdentifierPolicy::from_config(
37                config,
38                issue_codes::LINT_RF_005,
39                "unquoted_identifiers_policy",
40                "all",
41            ),
42            additional_allowed_characters: configured_additional_allowed_characters(config),
43            allow_space_in_identifier: config
44                .rule_option_bool(issue_codes::LINT_RF_005, "allow_space_in_identifier")
45                .unwrap_or(false),
46            ignore_words: configured_ignore_words(config)
47                .into_iter()
48                .map(|word| normalize_token(&word))
49                .collect(),
50            ignore_words_regex: config
51                .rule_option_str(issue_codes::LINT_RF_005, "ignore_words_regex")
52                .filter(|pattern| !pattern.trim().is_empty())
53                .and_then(|pattern| Regex::new(pattern).ok()),
54        }
55    }
56}
57
58impl Default for ReferencesSpecialChars {
59    fn default() -> Self {
60        Self {
61            quoted_policy: IdentifierPolicy::All,
62            unquoted_policy: IdentifierPolicy::All,
63            additional_allowed_characters: HashSet::new(),
64            allow_space_in_identifier: false,
65            ignore_words: HashSet::new(),
66            ignore_words_regex: None,
67        }
68    }
69}
70
71impl LintRule for ReferencesSpecialChars {
72    fn code(&self) -> &'static str {
73        issue_codes::LINT_RF_005
74    }
75
76    fn name(&self) -> &'static str {
77        "References special chars"
78    }
79
80    fn description(&self) -> &'static str {
81        "Do not use special characters in identifiers."
82    }
83
84    fn check(&self, statement: &Statement, ctx: &LintContext) -> Vec<Issue> {
85        let dialect = ctx.dialect();
86        let has_special_chars = collect_identifier_candidates(statement)
87            .into_iter()
88            .any(|candidate| candidate_triggers_rule(&candidate, self, dialect))
89            || show_tblproperties_property_key_triggers_rule(ctx.statement_sql(), self, dialect);
90
91        if has_special_chars {
92            vec![Issue::warning(
93                issue_codes::LINT_RF_005,
94                "Identifier contains unsupported special characters.",
95            )
96            .with_statement(ctx.statement_index)]
97        } else {
98            Vec::new()
99        }
100    }
101}
102
/// Decide whether one identifier candidate violates RF05 under the configured
/// policies and the active dialect's quoting conventions.
fn candidate_triggers_rule(
    candidate: &IdentifierCandidate,
    rule: &ReferencesSpecialChars,
    dialect: Dialect,
) -> bool {
    // Configured ignore words / ignore regex suppress the check entirely.
    if is_ignored_token(&candidate.value, rule) {
        return false;
    }

    // Quoted and unquoted identifiers carry independent policies; a policy
    // may opt whole identifier kinds out of the check.
    let policy = if candidate.quoted {
        rule.quoted_policy
    } else {
        rule.unquoted_policy
    };
    if !policy.allows(candidate.kind) {
        return false;
    }

    // Snowflake pivot references use identifiers that look like "'VALUE'" -
    // these are valid Snowflake syntax and should not be flagged.
    if candidate.quoted && candidate.value.starts_with('\'') && candidate.value.ends_with('\'') {
        return false;
    }

    // BigQuery allows hyphens, dots, and trailing wildcards in backtick identifiers
    // by default. SparkSQL/Databricks allows arbitrary file paths in backtick identifiers.
    if candidate.quote_char == Some('`') {
        match dialect {
            Dialect::Bigquery => {
                // BigQuery allows `-` and `.` throughout backtick identifiers,
                // but `*` only as a trailing wildcard suffix (e.g., `table_*`).
                let value = &candidate.value;
                // char_indices yields byte offsets; since '*' is one byte, a
                // '*' is "mid" unless it occupies the final byte of the value.
                let has_mid_star = value
                    .char_indices()
                    .any(|(i, ch)| ch == '*' && i + 1 < value.len());
                if has_mid_star {
                    return true;
                }
                return contains_disallowed_identifier_chars_with_extras(
                    value,
                    &rule.additional_allowed_characters,
                    rule.allow_space_in_identifier,
                    &['-', '.', '*'],
                );
            }
            Dialect::Databricks => {
                // SparkSQL/Databricks backtick identifiers can contain file
                // paths with any characters - do not flag them.
                return false;
            }
            _ => {}
        }
    }

    // BigQuery allows hyphens in unquoted identifiers as well.
    if matches!(dialect, Dialect::Bigquery) && !candidate.quoted {
        return contains_disallowed_identifier_chars_with_extras(
            &candidate.value,
            &rule.additional_allowed_characters,
            rule.allow_space_in_identifier,
            &['-', '.'],
        );
    }

    // Snowflake allows $ in identifiers (e.g. METADATA$FILENAME).
    if matches!(dialect, Dialect::Snowflake) && !candidate.quoted {
        return contains_disallowed_identifier_chars_with_extras(
            &candidate.value,
            &rule.additional_allowed_characters,
            rule.allow_space_in_identifier,
            &['$'],
        );
    }

    // Default: only ASCII alphanumerics and '_' (plus configured extras and,
    // optionally, space) are allowed.
    contains_disallowed_identifier_chars(
        &candidate.value,
        &rule.additional_allowed_characters,
        rule.allow_space_in_identifier,
    )
}
183
184fn show_tblproperties_property_key_triggers_rule(
185    sql: &str,
186    rule: &ReferencesSpecialChars,
187    dialect: Dialect,
188) -> bool {
189    if !matches!(dialect, Dialect::Databricks) {
190        return false;
191    }
192
193    // SparkSQL grammar uses a string literal for the optional property key in
194    // SHOW TBLPROPERTIES. SQLFluff still applies RF05 semantics to that key.
195    let lowered = sql.to_ascii_lowercase();
196    if !lowered.contains("show tblproperties") {
197        return false;
198    }
199
200    let Some(open_paren) = sql.find('(') else {
201        return false;
202    };
203    let Some(close_rel) = sql[open_paren + 1..].find(')') else {
204        return false;
205    };
206    let inside = sql[open_paren + 1..open_paren + 1 + close_rel].trim();
207    if inside.len() < 2 || !inside.starts_with('\'') || !inside.ends_with('\'') {
208        return false;
209    }
210
211    let property_key = inside.trim_matches('\'');
212    if is_ignored_token(property_key, rule) {
213        return false;
214    }
215
216    contains_disallowed_identifier_chars_with_extras(
217        property_key,
218        &rule.additional_allowed_characters,
219        rule.allow_space_in_identifier,
220        &['.'],
221    )
222}
223
/// Returns `true` when `ident` contains any character outside the allowed
/// set: ASCII alphanumerics, underscore, optionally a space, plus any
/// explicitly configured extra characters. An empty string never triggers.
fn contains_disallowed_identifier_chars(
    ident: &str,
    additional_allowed: &HashSet<char>,
    allow_space: bool,
) -> bool {
    let is_allowed = |ch: char| {
        ch.is_ascii_alphanumeric()
            || ch == '_'
            || (ch == ' ' && allow_space)
            || additional_allowed.contains(&ch)
    };
    !ident.chars().all(is_allowed)
}
236
/// Like [`contains_disallowed_identifier_chars`], but with an additional
/// per-call-site slice of `extras` permitted (used for dialect-specific
/// characters such as BigQuery's `-`/`.` or Snowflake's `$`).
fn contains_disallowed_identifier_chars_with_extras(
    ident: &str,
    additional_allowed: &HashSet<char>,
    allow_space: bool,
    extras: &[char],
) -> bool {
    let permitted = |ch: char| {
        ch.is_ascii_alphanumeric()
            || ch == '_'
            || (ch == ' ' && allow_space)
            || extras.contains(&ch)
            || additional_allowed.contains(&ch)
    };
    ident.chars().any(|ch| !permitted(ch))
}
251
252fn configured_additional_allowed_characters(config: &LintConfig) -> HashSet<char> {
253    if let Some(values) =
254        config.rule_option_string_list(issue_codes::LINT_RF_005, "additional_allowed_characters")
255    {
256        let mut chars = HashSet::new();
257        for value in values {
258            chars.extend(value.chars());
259        }
260        return chars;
261    }
262
263    config
264        .rule_option_str(issue_codes::LINT_RF_005, "additional_allowed_characters")
265        .map(|value| {
266            value
267                .split(',')
268                .flat_map(|item| item.trim().chars())
269                .collect()
270        })
271        .unwrap_or_default()
272}
273
274fn configured_ignore_words(config: &LintConfig) -> Vec<String> {
275    if let Some(words) = config.rule_option_string_list(issue_codes::LINT_RF_005, "ignore_words") {
276        return words;
277    }
278
279    config
280        .rule_option_str(issue_codes::LINT_RF_005, "ignore_words")
281        .map(|words| {
282            words
283                .split(',')
284                .map(str::trim)
285                .filter(|word| !word.is_empty())
286                .map(str::to_string)
287                .collect()
288        })
289        .unwrap_or_default()
290}
291
292fn is_ignored_token(token: &str, rule: &ReferencesSpecialChars) -> bool {
293    let normalized = normalize_token(token);
294    // ignore_words matches case-insensitively (via normalization).
295    if rule.ignore_words.contains(&normalized) {
296        return true;
297    }
298    // ignore_words_regex matches case-sensitively against the raw value,
299    // consistent with SQLFluff behavior.
300    if let Some(regex) = &rule.ignore_words_regex {
301        let raw = token
302            .trim()
303            .trim_matches(|ch| matches!(ch, '"' | '`' | '\'' | '[' | ']'));
304        if regex.is_match(raw) {
305            return true;
306        }
307    }
308    false
309}
310
/// Canonical form used for ignore-word comparison: surrounding whitespace and
/// quote characters (`"`, `` ` ``, `'`, `[`, `]`) removed, then uppercased.
fn normalize_token(token: &str) -> String {
    const QUOTE_CHARS: &[char] = &['"', '`', '\'', '[', ']'];
    token.trim().trim_matches(QUOTE_CHARS).to_ascii_uppercase()
}
317
#[cfg(test)]
mod tests {
    use super::*;
    use crate::linter::rule::with_active_dialect;
    use crate::parser::parse_sql;
    use crate::parser::parse_sql_with_dialect;
    use crate::types::Dialect;

    // Run the rule with default config over every statement in `sql`.
    fn run(sql: &str) -> Vec<Issue> {
        run_with_config(sql, LintConfig::default())
    }

    // Run the rule built from an explicit config over every statement in `sql`.
    fn run_with_config(sql: &str, config: LintConfig) -> Vec<Issue> {
        let statements = parse_sql(sql).expect("parse");
        let rule = ReferencesSpecialChars::from_config(&config);
        statements
            .iter()
            .enumerate()
            .flat_map(|(index, statement)| {
                rule.check(
                    statement,
                    &LintContext {
                        sql,
                        statement_range: 0..sql.len(),
                        statement_index: index,
                    },
                )
            })
            .collect()
    }

    // Run the default rule under a specific dialect (needed for the
    // dialect-sensitive branches of the rule).
    fn run_in_dialect(sql: &str, dialect: Dialect) -> Vec<Issue> {
        let statements = parse_sql_with_dialect(sql, dialect).expect("parse");
        let rule = ReferencesSpecialChars::default();
        let mut issues = Vec::new();
        with_active_dialect(dialect, || {
            for (index, statement) in statements.iter().enumerate() {
                issues.extend(rule.check(
                    statement,
                    &LintContext {
                        sql,
                        statement_range: 0..sql.len(),
                        statement_index: index,
                    },
                ));
            }
        });
        issues
    }

    #[test]
    fn flags_quoted_identifier_with_hyphen() {
        let issues = run("SELECT \"bad-name\" FROM t");
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].code, issue_codes::LINT_RF_005);
    }

    #[test]
    fn does_not_flag_quoted_identifier_with_underscore() {
        let issues = run("SELECT \"good_name\" FROM t");
        assert!(issues.is_empty());
    }

    // String literal contents must not be treated as identifiers.
    #[test]
    fn does_not_flag_double_quotes_inside_string_literal() {
        let issues = run("SELECT '\"bad-name\"' AS note FROM t");
        assert!(issues.is_empty());
    }

    // NOTE: config tests address the rule via two different keys
    // ("references.special_chars" and "LINT_RF_005"); both resolve to the
    // same rule options.
    #[test]
    fn additional_allowed_characters_permit_hyphen() {
        let issues = run_with_config(
            "SELECT \"bad-name\" FROM t",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "references.special_chars".to_string(),
                    serde_json::json!({"additional_allowed_characters": "-"}),
                )]),
            },
        );
        assert!(issues.is_empty());
    }

    #[test]
    fn quoted_policy_none_skips_quoted_identifier_checks() {
        let issues = run_with_config(
            "SELECT \"bad-name\" FROM t",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "LINT_RF_005".to_string(),
                    serde_json::json!({"quoted_identifiers_policy": "none"}),
                )]),
            },
        );
        assert!(issues.is_empty());
    }

    // ignore_words matches case-insensitively against the unquoted value.
    #[test]
    fn ignore_words_suppresses_configured_identifier() {
        let issues = run_with_config(
            "SELECT \"bad-name\" FROM t",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "references.special_chars".to_string(),
                    serde_json::json!({"ignore_words": ["bad-name"]}),
                )]),
            },
        );
        assert!(issues.is_empty());
    }

    #[test]
    fn ignore_words_regex_suppresses_configured_identifier() {
        let issues = run_with_config(
            "SELECT \"bad-name\" FROM t",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "LINT_RF_005".to_string(),
                    serde_json::json!({"ignore_words_regex": "^bad-"}),
                )]),
            },
        );
        assert!(issues.is_empty());
    }

    // Unlike ignore_words, the regex is applied case-sensitively to the raw
    // value, so an uppercase pattern must NOT suppress a lowercase identifier.
    #[test]
    fn ignore_words_regex_is_case_sensitive() {
        let issues = run_with_config(
            "SELECT \"bad-name\" FROM t",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "LINT_RF_005".to_string(),
                    serde_json::json!({"ignore_words_regex": "^BAD-"}),
                )]),
            },
        );
        assert_eq!(issues.len(), 1, "regex should be case-sensitive");
    }

    #[test]
    fn flags_create_table_column_with_space() {
        let issues = run("CREATE TABLE DBO.ColumnNames (\n    \"Internal Space\" INT\n)");
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].code, issue_codes::LINT_RF_005);
    }

    #[test]
    fn allow_space_in_identifier_permits_space() {
        let issues = run_with_config(
            "CREATE TABLE DBO.ColumnNames (\n    \"Internal Space\" INT\n)",
            LintConfig {
                enabled: true,
                disabled_rules: vec![],
                rule_configs: std::collections::BTreeMap::from([(
                    "references.special_chars".to_string(),
                    serde_json::json!({"allow_space_in_identifier": true}),
                )]),
            },
        );
        assert!(issues.is_empty());
    }

    // SHOW TBLPROPERTIES property keys are dotted paths; '.' is allowed.
    #[test]
    fn sparksql_show_tblproperties_allows_dot_in_property_key() {
        let issues = run_in_dialect(
            "SHOW TBLPROPERTIES customer ('created.date');",
            Dialect::Databricks,
        );
        assert!(issues.is_empty());
    }

    // ...but other special characters (e.g. '*') in the key are still flagged.
    #[test]
    fn sparksql_show_tblproperties_flags_wildcard_in_property_key() {
        let issues = run_in_dialect(
            "SHOW TBLPROPERTIES customer ('created.*');",
            Dialect::Databricks,
        );
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].code, issue_codes::LINT_RF_005);
    }
}