// rumdl_lib/utils/table_utils.rs

// Shared table detection and processing utilities for markdown linting rules.
//
// This module provides optimized table detection and processing functionality
// that can be shared across multiple table-related rules (MD055, MD056, MD058).

/// Represents a table block in the document.
///
/// Every field is a zero-based index into the document's lines, as produced by
/// `TableUtils::find_table_blocks_with_code_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TableBlock {
    /// Index of the first line of the table (same line as the header row).
    pub start_line: usize,
    /// Index of the last line that belongs to the table (inclusive).
    pub end_line: usize,
    /// Index of the header row.
    pub header_line: usize,
    /// Index of the delimiter row, directly below the header.
    pub delimiter_line: usize,
    /// Indices of the body rows below the delimiter; empty for a header-only table.
    pub content_lines: Vec<usize>,
}
14
/// Shared table detection utilities.
///
/// This is a data-less namespace type: all helpers are associated functions
/// and are called as `TableUtils::helper(...)`.
#[derive(Debug, Clone, Copy, Default)]
pub struct TableUtils;
17
18impl TableUtils {
19    /// Check if a line looks like a potential table row
20    pub fn is_potential_table_row(line: &str) -> bool {
21        let trimmed = line.trim();
22        if trimmed.is_empty() || !trimmed.contains('|') {
23            return false;
24        }
25
26        // Skip lines that are clearly not table rows
27        if trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") {
28            return false;
29        }
30
31        // Skip lines that are clearly code or inline code
32        if trimmed.starts_with("`") || trimmed.contains("``") {
33            return false;
34        }
35
36        // Must have at least 2 parts when split by |
37        let parts: Vec<&str> = trimmed.split('|').collect();
38        if parts.len() < 2 {
39            return false;
40        }
41
42        // Check if it looks like a table row by having reasonable content between pipes
43        let mut valid_parts = 0;
44        let mut total_non_empty_parts = 0;
45
46        for part in &parts {
47            let part_trimmed = part.trim();
48            // Skip empty parts (from leading/trailing pipes)
49            if part_trimmed.is_empty() {
50                continue;
51            }
52            total_non_empty_parts += 1;
53
54            // Count parts that look like table cells (reasonable content, no newlines)
55            if !part_trimmed.contains('\n') {
56                valid_parts += 1;
57            }
58        }
59
60        // All non-empty parts must be valid (no newlines) for the row to be valid
61        if total_non_empty_parts == 0 {
62            return false;
63        }
64
65        if valid_parts != total_non_empty_parts {
66            // Some cells contain newlines, not a valid table row
67            return false;
68        }
69
70        // GFM allows single-column tables, so >= 1 valid part is enough
71        // when the line has proper table formatting (pipes)
72        if trimmed.starts_with('|') && trimmed.ends_with('|') {
73            // Properly formatted table row with pipes on both ends
74            valid_parts >= 1
75        } else {
76            // For rows without proper pipe formatting, require at least 2 cells
77            valid_parts >= 2
78        }
79    }
80
81    /// Check if a line is a table delimiter row (e.g., |---|---|)
82    pub fn is_delimiter_row(line: &str) -> bool {
83        let trimmed = line.trim();
84        if !trimmed.contains('|') || !trimmed.contains('-') {
85            return false;
86        }
87
88        // Split by pipes and check each part
89        let parts: Vec<&str> = trimmed.split('|').collect();
90        let mut valid_delimiter_parts = 0;
91
92        for part in &parts {
93            let part_trimmed = part.trim();
94            if part_trimmed.is_empty() {
95                continue; // Skip empty parts from leading/trailing pipes
96            }
97
98            // Check if this part looks like a delimiter (contains dashes and optionally colons)
99            if part_trimmed.chars().all(|c| c == '-' || c == ':' || c.is_whitespace()) && part_trimmed.contains('-') {
100                valid_delimiter_parts += 1;
101            }
102        }
103
104        valid_delimiter_parts >= 2
105    }
106
107    /// Find all table blocks in the content with optimized detection
108    /// This version accepts code_blocks and code_spans directly for use during LintContext construction
109    pub fn find_table_blocks_with_code_info(
110        content: &str,
111        code_blocks: &[(usize, usize)],
112        code_spans: &[crate::lint_context::CodeSpan],
113    ) -> Vec<TableBlock> {
114        let lines: Vec<&str> = content.lines().collect();
115        let mut tables = Vec::new();
116        let mut i = 0;
117
118        // Pre-compute line positions for efficient code block checking
119        let mut line_positions = Vec::with_capacity(lines.len());
120        let mut pos = 0;
121        for line in &lines {
122            line_positions.push(pos);
123            pos += line.len() + 1; // +1 for newline
124        }
125
126        while i < lines.len() {
127            // Skip lines in code blocks using provided code blocks
128            let line_start = line_positions[i];
129            let in_code =
130                crate::utils::code_block_utils::CodeBlockUtils::is_in_code_block_or_span(code_blocks, line_start)
131                    || code_spans
132                        .iter()
133                        .any(|span| line_start >= span.byte_offset && line_start < span.byte_end);
134            if in_code {
135                i += 1;
136                continue;
137            }
138
139            // Look for potential table start
140            if Self::is_potential_table_row(lines[i]) {
141                // Check if the next line is a delimiter row
142                if i + 1 < lines.len() && Self::is_delimiter_row(lines[i + 1]) {
143                    // Found a table! Find its end
144                    let table_start = i;
145                    let header_line = i;
146                    let delimiter_line = i + 1;
147                    let mut table_end = i + 1; // Include the delimiter row
148                    let mut content_lines = Vec::new();
149
150                    // Continue while we have table rows
151                    let mut j = i + 2;
152                    while j < lines.len() {
153                        let line = lines[j];
154                        if line.trim().is_empty() {
155                            // Empty line ends the table
156                            break;
157                        }
158                        if Self::is_potential_table_row(line) {
159                            content_lines.push(j);
160                            table_end = j;
161                            j += 1;
162                        } else {
163                            // Non-table line ends the table
164                            break;
165                        }
166                    }
167
168                    tables.push(TableBlock {
169                        start_line: table_start,
170                        end_line: table_end,
171                        header_line,
172                        delimiter_line,
173                        content_lines,
174                    });
175                    i = table_end + 1;
176                } else {
177                    i += 1;
178                }
179            } else {
180                i += 1;
181            }
182        }
183
184        tables
185    }
186
187    /// Find all table blocks in the content with optimized detection
188    /// This is a backward-compatible wrapper that accepts LintContext
189    pub fn find_table_blocks(content: &str, ctx: &crate::lint_context::LintContext) -> Vec<TableBlock> {
190        Self::find_table_blocks_with_code_info(content, &ctx.code_blocks, &ctx.code_spans())
191    }
192
193    /// Count the number of cells in a table row
194    pub fn count_cells(row: &str) -> usize {
195        let trimmed = row.trim();
196
197        // Skip non-table rows
198        if !trimmed.contains('|') {
199            return 0;
200        }
201
202        // Users shouldn't have to escape pipes in regex patterns, etc.
203        let masked_row = Self::mask_pipes_in_inline_code(trimmed);
204
205        // Handle case with leading/trailing pipes
206        let mut cell_count = 0;
207        let parts: Vec<&str> = masked_row.split('|').collect();
208
209        for (i, part) in parts.iter().enumerate() {
210            // Skip first part if it's empty and there's a leading pipe
211            if i == 0 && part.trim().is_empty() && parts.len() > 1 {
212                continue;
213            }
214
215            // Skip last part if it's empty and there's a trailing pipe
216            if i == parts.len() - 1 && part.trim().is_empty() && parts.len() > 1 {
217                continue;
218            }
219
220            cell_count += 1;
221        }
222
223        cell_count
224    }
225
226    /// Mask pipes inside inline code blocks with a placeholder character
227    fn mask_pipes_in_inline_code(text: &str) -> String {
228        let mut result = String::new();
229        let chars: Vec<char> = text.chars().collect();
230        let mut i = 0;
231
232        while i < chars.len() {
233            if chars[i] == '`' {
234                // Count consecutive backticks at start
235                let start = i;
236                let mut backtick_count = 0;
237                while i < chars.len() && chars[i] == '`' {
238                    backtick_count += 1;
239                    i += 1;
240                }
241
242                // Look for matching closing backticks
243                let mut found_closing = false;
244                let mut j = i;
245
246                while j < chars.len() {
247                    if chars[j] == '`' {
248                        // Count potential closing backticks
249                        let close_start = j;
250                        let mut close_count = 0;
251                        while j < chars.len() && chars[j] == '`' {
252                            close_count += 1;
253                            j += 1;
254                        }
255
256                        if close_count == backtick_count {
257                            // Found matching closing backticks
258                            found_closing = true;
259
260                            // Valid inline code - add with pipes masked
261                            result.extend(chars[start..i].iter());
262
263                            for &ch in chars.iter().take(close_start).skip(i) {
264                                if ch == '|' {
265                                    result.push('_'); // Mask pipe with underscore
266                                } else {
267                                    result.push(ch);
268                                }
269                            }
270
271                            result.extend(chars[close_start..j].iter());
272                            i = j;
273                            break;
274                        }
275                        // If not matching, continue searching (j is already past these backticks)
276                    } else {
277                        j += 1;
278                    }
279                }
280
281                if !found_closing {
282                    // No matching closing found, treat as regular text
283                    result.extend(chars[start..i].iter());
284                }
285            } else {
286                result.push(chars[i]);
287                i += 1;
288            }
289        }
290
291        result
292    }
293
294    /// Determine the pipe style of a table row
295    pub fn determine_pipe_style(line: &str) -> Option<&'static str> {
296        let trimmed = line.trim();
297        if !trimmed.contains('|') {
298            return None;
299        }
300
301        let has_leading = trimmed.starts_with('|');
302        let has_trailing = trimmed.ends_with('|');
303
304        match (has_leading, has_trailing) {
305            (true, true) => Some("leading_and_trailing"),
306            (true, false) => Some("leading_only"),
307            (false, true) => Some("trailing_only"),
308            (false, false) => Some("no_leading_or_trailing"),
309        }
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    #[test]
    fn test_is_potential_table_row() {
        // Basic valid table rows
        assert!(TableUtils::is_potential_table_row("| Header 1 | Header 2 |"));
        assert!(TableUtils::is_potential_table_row("| Cell 1 | Cell 2 |"));
        assert!(TableUtils::is_potential_table_row("Cell 1 | Cell 2"));
        assert!(TableUtils::is_potential_table_row("| Cell |")); // Single-column tables are valid in GFM

        // Multiple cells
        assert!(TableUtils::is_potential_table_row("| A | B | C | D | E |"));

        // With whitespace
        assert!(TableUtils::is_potential_table_row("  | Indented | Table |  "));
        assert!(TableUtils::is_potential_table_row("| Spaces | Around |"));

        // Not table rows
        assert!(!TableUtils::is_potential_table_row("- List item"));
        assert!(!TableUtils::is_potential_table_row("* Another list"));
        assert!(!TableUtils::is_potential_table_row("+ Plus list"));
        assert!(!TableUtils::is_potential_table_row("Regular text"));
        assert!(!TableUtils::is_potential_table_row(""));
        assert!(!TableUtils::is_potential_table_row("   "));

        // Code blocks
        assert!(!TableUtils::is_potential_table_row("`code with | pipe`"));
        assert!(!TableUtils::is_potential_table_row("``multiple | backticks``"));

        // Single pipe not enough
        assert!(!TableUtils::is_potential_table_row("Just one |"));
        assert!(!TableUtils::is_potential_table_row("| Just one"));

        // Very long cells are valid in tables (no length limit for cell content)
        let long_cell = "a".repeat(150);
        assert!(TableUtils::is_potential_table_row(&format!("| {long_cell} | b |")));

        // Cells with newlines
        assert!(!TableUtils::is_potential_table_row("| Cell with\nnewline | Other |"));
    }

    #[test]
    fn test_is_delimiter_row() {
        // Basic delimiter rows
        assert!(TableUtils::is_delimiter_row("|---|---|"));
        assert!(TableUtils::is_delimiter_row("| --- | --- |"));
        assert!(TableUtils::is_delimiter_row("|:---|---:|"));
        assert!(TableUtils::is_delimiter_row("|:---:|:---:|"));

        // With varying dash counts
        assert!(TableUtils::is_delimiter_row("|-|--|"));
        assert!(TableUtils::is_delimiter_row("|-------|----------|"));

        // With whitespace
        assert!(TableUtils::is_delimiter_row("|  ---  |  ---  |"));
        assert!(TableUtils::is_delimiter_row("| :--- | ---: |"));

        // Multiple columns
        assert!(TableUtils::is_delimiter_row("|---|---|---|---|"));

        // Without leading/trailing pipes
        assert!(TableUtils::is_delimiter_row("--- | ---"));
        assert!(TableUtils::is_delimiter_row(":--- | ---:"));

        // Not delimiter rows
        assert!(!TableUtils::is_delimiter_row("| Header | Header |"));
        assert!(!TableUtils::is_delimiter_row("Regular text"));
        assert!(!TableUtils::is_delimiter_row(""));
        assert!(!TableUtils::is_delimiter_row("|||"));
        assert!(!TableUtils::is_delimiter_row("| | |"));

        // Must have dashes
        assert!(!TableUtils::is_delimiter_row("| : | : |"));
        assert!(!TableUtils::is_delimiter_row("|    |    |"));

        // Mixed content
        assert!(!TableUtils::is_delimiter_row("| --- | text |"));
        assert!(!TableUtils::is_delimiter_row("| abc | --- |"));
    }

    #[test]
    fn test_count_cells() {
        // Basic counts
        assert_eq!(TableUtils::count_cells("| Cell 1 | Cell 2 | Cell 3 |"), 3);
        assert_eq!(TableUtils::count_cells("Cell 1 | Cell 2 | Cell 3"), 3);
        assert_eq!(TableUtils::count_cells("| Cell 1 | Cell 2"), 2);
        assert_eq!(TableUtils::count_cells("Cell 1 | Cell 2 |"), 2);

        // Single cell
        assert_eq!(TableUtils::count_cells("| Cell |"), 1);
        assert_eq!(TableUtils::count_cells("Cell"), 0); // No pipe

        // Empty cells
        assert_eq!(TableUtils::count_cells("|  |  |  |"), 3);
        assert_eq!(TableUtils::count_cells("| | | |"), 3);

        // Many cells
        assert_eq!(TableUtils::count_cells("| A | B | C | D | E | F |"), 6);

        // Edge cases
        assert_eq!(TableUtils::count_cells("||"), 1); // One empty cell
        assert_eq!(TableUtils::count_cells("|||"), 2); // Two empty cells

        // No table
        assert_eq!(TableUtils::count_cells("Regular text"), 0);
        assert_eq!(TableUtils::count_cells(""), 0);
        assert_eq!(TableUtils::count_cells("   "), 0);

        // Whitespace handling
        assert_eq!(TableUtils::count_cells("  | A | B |  "), 2);
        assert_eq!(TableUtils::count_cells("|   A   |   B   |"), 2);
    }

    #[test]
    fn test_count_cells_with_inline_code() {
        // Test the user's actual example from Issue #34
        assert_eq!(TableUtils::count_cells("| Challenge | Solution |"), 2);
        assert_eq!(
            TableUtils::count_cells("| Hour:minute:second formats | `^([0-1]?\\d|2[0-3]):[0-5]\\d:[0-5]\\d$` |"),
            2
        );

        // Test basic inline code with pipes
        assert_eq!(TableUtils::count_cells("| Command | `echo | grep` |"), 2);
        assert_eq!(TableUtils::count_cells("| A | `code | with | pipes` | B |"), 3);

        // Test escaped pipes (correct GFM)
        assert_eq!(TableUtils::count_cells("| Command | `echo \\| grep` |"), 2);

        // Test multiple inline code blocks
        assert_eq!(TableUtils::count_cells("| `code | one` | `code | two` |"), 2);

        // Test edge cases
        assert_eq!(TableUtils::count_cells("| Empty inline | `` | cell |"), 3);
        assert_eq!(TableUtils::count_cells("| `single|pipe` |"), 1);

        // Test that basic table structure still works
        assert_eq!(TableUtils::count_cells("| A | B | C |"), 3);
        assert_eq!(TableUtils::count_cells("| One | Two |"), 2);
    }

    #[test]
    fn test_determine_pipe_style() {
        // All pipe styles
        assert_eq!(
            TableUtils::determine_pipe_style("| Cell 1 | Cell 2 |"),
            Some("leading_and_trailing")
        );
        assert_eq!(
            TableUtils::determine_pipe_style("| Cell 1 | Cell 2"),
            Some("leading_only")
        );
        assert_eq!(
            TableUtils::determine_pipe_style("Cell 1 | Cell 2 |"),
            Some("trailing_only")
        );
        assert_eq!(
            TableUtils::determine_pipe_style("Cell 1 | Cell 2"),
            Some("no_leading_or_trailing")
        );

        // With whitespace
        assert_eq!(
            TableUtils::determine_pipe_style("  | Cell 1 | Cell 2 |  "),
            Some("leading_and_trailing")
        );
        assert_eq!(
            TableUtils::determine_pipe_style("  | Cell 1 | Cell 2  "),
            Some("leading_only")
        );

        // No pipes
        assert_eq!(TableUtils::determine_pipe_style("Regular text"), None);
        assert_eq!(TableUtils::determine_pipe_style(""), None);
        assert_eq!(TableUtils::determine_pipe_style("   "), None);

        // Single pipe cases
        assert_eq!(TableUtils::determine_pipe_style("|"), Some("leading_and_trailing"));
        assert_eq!(TableUtils::determine_pipe_style("| Cell"), Some("leading_only"));
        assert_eq!(TableUtils::determine_pipe_style("Cell |"), Some("trailing_only"));
    }

    #[test]
    fn test_find_table_blocks_simple() {
        let content = "| Header 1 | Header 2 |
|-----------|-----------|
| Cell 1    | Cell 2    |
| Cell 3    | Cell 4    |";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.start_line, 0);
        assert_eq!(table.end_line, 3);
        assert_eq!(table.header_line, 0);
        assert_eq!(table.delimiter_line, 1);
        assert_eq!(table.content_lines, vec![2, 3]);
    }

    #[test]
    fn test_find_table_blocks_multiple() {
        let content = "Some text

| Table 1 | Col A |
|----------|-------|
| Data 1   | Val 1 |

More text

| Table 2 | Col 2 |
|----------|-------|
| Data 2   | Data  |";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert_eq!(tables.len(), 2);

        // First table
        assert_eq!(tables[0].start_line, 2);
        assert_eq!(tables[0].end_line, 4);
        assert_eq!(tables[0].header_line, 2);
        assert_eq!(tables[0].delimiter_line, 3);
        assert_eq!(tables[0].content_lines, vec![4]);

        // Second table
        assert_eq!(tables[1].start_line, 8);
        assert_eq!(tables[1].end_line, 10);
        assert_eq!(tables[1].header_line, 8);
        assert_eq!(tables[1].delimiter_line, 9);
        assert_eq!(tables[1].content_lines, vec![10]);
    }

    #[test]
    fn test_find_table_blocks_no_content_rows() {
        let content = "| Header 1 | Header 2 |
|-----------|-----------|

Next paragraph";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.start_line, 0);
        assert_eq!(table.end_line, 1); // Just header and delimiter
        assert!(table.content_lines.is_empty());
    }

    #[test]
    fn test_find_table_blocks_in_code_block() {
        let content = "```
| Not | A | Table |
|-----|---|-------|
| In  | Code | Block |
```

| Real | Table |
|------|-------|
| Data | Here  |";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert_eq!(tables.len(), 1); // Only the table outside code block

        let table = &tables[0];
        assert_eq!(table.header_line, 6);
        assert_eq!(table.delimiter_line, 7);
    }

    #[test]
    fn test_find_table_blocks_no_tables() {
        let content = "Just regular text
No tables here
- List item with | pipe
* Another list item";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert!(tables.is_empty());
    }

    #[test]
    fn test_find_table_blocks_malformed() {
        let content = "| Header without delimiter |
| This looks like table |
But no delimiter row

| Proper | Table |
|---------|-------|
| Data    | Here  |";

        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let tables = TableUtils::find_table_blocks(content, &ctx);
        assert_eq!(tables.len(), 1); // Only the proper table
        assert_eq!(tables[0].header_line, 4);
    }

    #[test]
    fn test_edge_cases() {
        // Test empty content
        assert!(!TableUtils::is_potential_table_row(""));
        assert!(!TableUtils::is_delimiter_row(""));
        assert_eq!(TableUtils::count_cells(""), 0);
        assert_eq!(TableUtils::determine_pipe_style(""), None);

        // Test whitespace only
        assert!(!TableUtils::is_potential_table_row("   "));
        assert!(!TableUtils::is_delimiter_row("   "));
        assert_eq!(TableUtils::count_cells("   "), 0);
        assert_eq!(TableUtils::determine_pipe_style("   "), None);

        // Test single character
        assert!(!TableUtils::is_potential_table_row("|"));
        assert!(!TableUtils::is_delimiter_row("|"));
        assert_eq!(TableUtils::count_cells("|"), 0); // Need at least 2 parts

        // Test very long lines are valid table rows (no length limit)
        // Test both single-column and multi-column long lines
        let long_single = format!("| {} |", "a".repeat(200));
        assert!(TableUtils::is_potential_table_row(&long_single)); // Single-column table with long content

        let long_multi = format!("| {} | {} |", "a".repeat(200), "b".repeat(200));
        assert!(TableUtils::is_potential_table_row(&long_multi)); // Multi-column table with long content

        // Test unicode
        assert!(TableUtils::is_potential_table_row("| 你好 | 世界 |"));
        assert!(TableUtils::is_potential_table_row("| émoji | 🎉 |"));
        assert_eq!(TableUtils::count_cells("| 你好 | 世界 |"), 2);
    }

    #[test]
    fn test_table_block_struct() {
        let block = TableBlock {
            start_line: 0,
            end_line: 5,
            header_line: 0,
            delimiter_line: 1,
            content_lines: vec![2, 3, 4, 5],
        };

        // Test Debug trait
        let debug_str = format!("{block:?}");
        assert!(debug_str.contains("TableBlock"));
        assert!(debug_str.contains("start_line: 0"));

        // Test Clone trait
        let cloned = block.clone();
        assert_eq!(cloned.start_line, block.start_line);
        assert_eq!(cloned.end_line, block.end_line);
        assert_eq!(cloned.header_line, block.header_line);
        assert_eq!(cloned.delimiter_line, block.delimiter_line);
        assert_eq!(cloned.content_lines, block.content_lines);
    }
}