Skip to main content

flowscope_core/linter/rules/
am_001.rs

1//! LINT_AM_001: DISTINCT with GROUP BY.
2//!
3//! Using DISTINCT with GROUP BY is redundant because GROUP BY already
4//! collapses duplicate rows. The DISTINCT can be safely removed.
5
6use crate::linter::rule::{LintContext, LintRule};
7use crate::types::{issue_codes, Dialect, Issue, IssueAutofixApplicability, IssuePatchEdit};
8use sqlparser::ast::*;
9use sqlparser::keywords::Keyword;
10use sqlparser::tokenizer::{Token, TokenWithSpan, Tokenizer, Whitespace};
11
12pub struct DistinctWithGroupBy;
13
14impl LintRule for DistinctWithGroupBy {
15    fn code(&self) -> &'static str {
16        issue_codes::LINT_AM_001
17    }
18
19    fn name(&self) -> &'static str {
20        "DISTINCT with GROUP BY"
21    }
22
23    fn description(&self) -> &'static str {
24        "Ambiguous use of 'DISTINCT' in a 'SELECT' statement with 'GROUP BY'."
25    }
26
27    fn check(&self, stmt: &Statement, ctx: &LintContext) -> Vec<Issue> {
28        let mut issues = Vec::new();
29        check_statement(stmt, ctx, &mut issues);
30
31        let mut distinct_ranges = distinct_removal_ranges(ctx.statement_sql(), ctx.dialect());
32        if distinct_ranges.ranges.len() == issues.len() {
33            for issue in &mut issues {
34                if let Some((start, end)) = distinct_ranges.next() {
35                    let span = ctx.span_from_statement_offset(start, end);
36                    *issue = issue.clone().with_span(span).with_autofix_edits(
37                        IssueAutofixApplicability::Safe,
38                        vec![IssuePatchEdit::new(span, "")],
39                    );
40                }
41            }
42        }
43
44        issues
45    }
46}
47
48fn check_statement(stmt: &Statement, ctx: &LintContext, issues: &mut Vec<Issue>) {
49    match stmt {
50        Statement::Query(q) => check_query(q, ctx, issues),
51        Statement::Insert(ins) => {
52            if let Some(ref source) = ins.source {
53                check_query(source, ctx, issues);
54            }
55        }
56        Statement::CreateView { query, .. } => check_query(query, ctx, issues),
57        Statement::CreateTable(create) => {
58            if let Some(ref q) = create.query {
59                check_query(q, ctx, issues);
60            }
61        }
62        _ => {}
63    }
64}
65
66fn check_query(query: &Query, ctx: &LintContext, issues: &mut Vec<Issue>) {
67    if let Some(ref with) = query.with {
68        for cte in &with.cte_tables {
69            check_query(&cte.query, ctx, issues);
70        }
71    }
72    check_set_expr(&query.body, ctx, issues);
73}
74
75fn check_set_expr(body: &SetExpr, ctx: &LintContext, issues: &mut Vec<Issue>) {
76    match body {
77        SetExpr::Select(select) => {
78            let has_distinct = matches!(
79                select.distinct,
80                Some(Distinct::Distinct) | Some(Distinct::On(_))
81            );
82            let has_group_by = match &select.group_by {
83                GroupByExpr::All(_) => true,
84                GroupByExpr::Expressions(exprs, _) => !exprs.is_empty(),
85            };
86
87            if has_distinct && has_group_by {
88                issues.push(
89                    Issue::warning(
90                        issue_codes::LINT_AM_001,
91                        "DISTINCT is redundant when GROUP BY is present.",
92                    )
93                    .with_statement(ctx.statement_index),
94                );
95            }
96
97            // Recurse into derived tables (subqueries in FROM)
98            for from_item in &select.from {
99                check_table_factor(&from_item.relation, ctx, issues);
100                for join in &from_item.joins {
101                    check_table_factor(&join.relation, ctx, issues);
102                }
103            }
104        }
105        SetExpr::Query(q) => check_query(q, ctx, issues),
106        SetExpr::SetOperation { left, right, .. } => {
107            check_set_expr(left, ctx, issues);
108            check_set_expr(right, ctx, issues);
109        }
110        _ => {}
111    }
112}
113
114fn check_table_factor(relation: &TableFactor, ctx: &LintContext, issues: &mut Vec<Issue>) {
115    match relation {
116        TableFactor::Derived { subquery, .. } => check_query(subquery, ctx, issues),
117        TableFactor::NestedJoin {
118            table_with_joins, ..
119        } => {
120            check_table_factor(&table_with_joins.relation, ctx, issues);
121            for join in &table_with_joins.joins {
122                check_table_factor(&join.relation, ctx, issues);
123            }
124        }
125        _ => {}
126    }
127}
128
129struct DistinctRemovalRanges {
130    ranges: Vec<(usize, usize)>,
131    index: usize,
132}
133
134impl DistinctRemovalRanges {
135    fn next(&mut self) -> Option<(usize, usize)> {
136        let range = self.ranges.get(self.index).copied();
137        if range.is_some() {
138            self.index += 1;
139        }
140        range
141    }
142}
143
144fn distinct_removal_ranges(sql: &str, dialect: Dialect) -> DistinctRemovalRanges {
145    let Some(tokens) = tokenized(sql, dialect) else {
146        return DistinctRemovalRanges {
147            ranges: Vec::new(),
148            index: 0,
149        };
150    };
151
152    let mut ranges = Vec::new();
153    for distinct_index in 0..tokens.len() {
154        if !is_distinct_keyword(&tokens[distinct_index].token) {
155            continue;
156        }
157
158        let phrase_end_index = if let Some(on_index) =
159            next_non_trivia_index(&tokens, distinct_index + 1)
160        {
161            if is_on_keyword(&tokens[on_index].token) {
162                let Some(left_paren_index) = next_non_trivia_index(&tokens, on_index + 1) else {
163                    continue;
164                };
165                if !matches!(tokens[left_paren_index].token, Token::LParen) {
166                    continue;
167                }
168                let Some(right_paren_index) = find_matching_rparen(&tokens, left_paren_index)
169                else {
170                    continue;
171                };
172                right_paren_index
173            } else {
174                distinct_index
175            }
176        } else {
177            distinct_index
178        };
179
180        let Some((start, _)) = token_with_span_offsets(sql, &tokens[distinct_index]) else {
181            continue;
182        };
183        let end = match next_non_trivia_index(&tokens, phrase_end_index + 1) {
184            Some(next_index) => match token_with_span_offsets(sql, &tokens[next_index]) {
185                Some((next_start, _)) => next_start,
186                None => continue,
187            },
188            None => match token_with_span_offsets(sql, &tokens[phrase_end_index]) {
189                Some((_, phrase_end)) => phrase_end,
190                None => continue,
191            },
192        };
193
194        if start < end {
195            ranges.push((start, end));
196        }
197    }
198
199    DistinctRemovalRanges { ranges, index: 0 }
200}
201
202fn tokenized(sql: &str, dialect: Dialect) -> Option<Vec<TokenWithSpan>> {
203    let dialect = dialect.to_sqlparser_dialect();
204    let mut tokenizer = Tokenizer::new(dialect.as_ref(), sql);
205    tokenizer.tokenize_with_location().ok()
206}
207
208fn is_distinct_keyword(token: &Token) -> bool {
209    matches!(token, Token::Word(word) if word.keyword == Keyword::DISTINCT)
210}
211
212fn is_on_keyword(token: &Token) -> bool {
213    matches!(token, Token::Word(word) if word.keyword == Keyword::ON)
214}
215
216fn next_non_trivia_index(tokens: &[TokenWithSpan], mut index: usize) -> Option<usize> {
217    while index < tokens.len() {
218        if !is_trivia_token(&tokens[index].token) {
219            return Some(index);
220        }
221        index += 1;
222    }
223    None
224}
225
226fn find_matching_rparen(tokens: &[TokenWithSpan], left_paren_index: usize) -> Option<usize> {
227    let mut depth = 0usize;
228
229    for (index, token) in tokens.iter().enumerate().skip(left_paren_index) {
230        if is_trivia_token(&token.token) {
231            continue;
232        }
233
234        match token.token {
235            Token::LParen => {
236                depth += 1;
237            }
238            Token::RParen => {
239                if depth == 0 {
240                    return None;
241                }
242                depth -= 1;
243                if depth == 0 {
244                    return Some(index);
245                }
246            }
247            _ => {}
248        }
249    }
250
251    None
252}
253
254fn is_trivia_token(token: &Token) -> bool {
255    matches!(
256        token,
257        Token::Whitespace(
258            Whitespace::Space
259                | Whitespace::Newline
260                | Whitespace::Tab
261                | Whitespace::SingleLineComment { .. }
262                | Whitespace::MultiLineComment(_)
263        )
264    )
265}
266
267fn token_with_span_offsets(sql: &str, token: &TokenWithSpan) -> Option<(usize, usize)> {
268    let start = line_col_to_offset(
269        sql,
270        token.span.start.line as usize,
271        token.span.start.column as usize,
272    )?;
273    let end = line_col_to_offset(
274        sql,
275        token.span.end.line as usize,
276        token.span.end.column as usize,
277    )?;
278    Some((start, end))
279}
280
281fn line_col_to_offset(sql: &str, line: usize, column: usize) -> Option<usize> {
282    if line == 0 || column == 0 {
283        return None;
284    }
285
286    let mut current_line = 1usize;
287    let mut current_col = 1usize;
288
289    for (offset, ch) in sql.char_indices() {
290        if current_line == line && current_col == column {
291            return Some(offset);
292        }
293
294        if ch == '\n' {
295            current_line += 1;
296            current_col = 1;
297        } else {
298            current_col += 1;
299        }
300    }
301
302    if current_line == line && current_col == column {
303        return Some(sql.len());
304    }
305
306    None
307}
308
309#[cfg(test)]
310mod tests {
311    use super::*;
312    use crate::parser::parse_sql;
313    use crate::types::IssueAutofixApplicability;
314
315    fn check_sql(sql: &str) -> Vec<Issue> {
316        let stmts = parse_sql(sql).unwrap();
317        let rule = DistinctWithGroupBy;
318        let ctx = LintContext {
319            sql,
320            statement_range: 0..sql.len(),
321            statement_index: 0,
322        };
323        let mut issues = Vec::new();
324        for stmt in &stmts {
325            issues.extend(rule.check(stmt, &ctx));
326        }
327        issues
328    }
329
330    fn apply_issue_autofix(sql: &str, issue: &Issue) -> Option<String> {
331        let autofix = issue.autofix.as_ref()?;
332        let mut edits = autofix.edits.clone();
333        edits.sort_by(|left, right| right.span.start.cmp(&left.span.start));
334
335        let mut out = sql.to_string();
336        for edit in edits {
337            out.replace_range(edit.span.start..edit.span.end, &edit.replacement);
338        }
339        Some(out)
340    }
341
342    #[test]
343    fn test_distinct_with_group_by_detected() {
344        let issues = check_sql("SELECT DISTINCT col FROM t GROUP BY col");
345        assert_eq!(issues.len(), 1);
346        assert_eq!(issues[0].code, "LINT_AM_001");
347    }
348
349    #[test]
350    fn test_distinct_without_group_by_ok() {
351        let issues = check_sql("SELECT DISTINCT col FROM t");
352        assert!(issues.is_empty());
353    }
354
355    #[test]
356    fn test_group_by_without_distinct_ok() {
357        let issues = check_sql("SELECT col FROM t GROUP BY col");
358        assert!(issues.is_empty());
359    }
360
361    // --- Edge cases adopted from sqlfluff AM01 (ambiguous.distinct) ---
362
363    #[test]
364    fn test_distinct_group_by_in_subquery() {
365        let issues = check_sql("SELECT * FROM (SELECT DISTINCT a FROM t GROUP BY a) AS sub");
366        assert_eq!(issues.len(), 1);
367    }
368
369    #[test]
370    fn test_distinct_group_by_in_cte() {
371        let issues =
372            check_sql("WITH cte AS (SELECT DISTINCT a FROM t GROUP BY a) SELECT * FROM cte");
373        assert_eq!(issues.len(), 1);
374    }
375
376    #[test]
377    fn test_distinct_group_by_in_create_view() {
378        let issues = check_sql("CREATE VIEW v AS SELECT DISTINCT a FROM t GROUP BY a");
379        assert_eq!(issues.len(), 1);
380    }
381
382    #[test]
383    fn test_distinct_group_by_in_insert() {
384        let issues = check_sql("INSERT INTO target SELECT DISTINCT a FROM t GROUP BY a");
385        assert_eq!(issues.len(), 1);
386    }
387
388    #[test]
389    fn test_no_distinct_no_group_by_ok() {
390        let issues = check_sql("SELECT a, b FROM t");
391        assert!(issues.is_empty());
392    }
393
394    #[test]
395    fn test_distinct_group_by_in_union_branch() {
396        let issues = check_sql("SELECT a FROM t UNION ALL SELECT DISTINCT b FROM t2 GROUP BY b");
397        assert_eq!(issues.len(), 1);
398    }
399
400    #[test]
401    fn test_distinct_group_by_emits_safe_autofix_patch() {
402        let sql = "SELECT DISTINCT a FROM t GROUP BY a";
403        let issues = check_sql(sql);
404        assert_eq!(issues.len(), 1);
405
406        let autofix = issues[0].autofix.as_ref().expect("autofix metadata");
407        assert_eq!(autofix.applicability, IssueAutofixApplicability::Safe);
408        assert_eq!(autofix.edits.len(), 1);
409
410        let fixed = apply_issue_autofix(sql, &issues[0]).expect("apply autofix");
411        assert_eq!(fixed, "SELECT a FROM t GROUP BY a");
412    }
413}