rumdl_lib/utils/
table_utils.rs

//! Shared table detection and processing utilities for markdown linting rules.
//!
//! This module provides optimized table detection and processing functionality
//! that can be shared across multiple table-related rules (MD055, MD056, MD058).
/// Represents a table block in the document.
///
/// Every field is a zero-based index into the document's lines, in the order
/// produced by `str::lines`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TableBlock {
    /// First line of the table; the detector sets this to the header row.
    pub start_line: usize,
    /// Last line of the table, inclusive. Equals `delimiter_line` when the
    /// table has no body rows.
    pub end_line: usize,
    /// Line holding the header row.
    pub header_line: usize,
    /// Line holding the delimiter row (e.g. `|---|---|`).
    pub delimiter_line: usize,
    /// Lines holding the body rows, in document order.
    pub content_lines: Vec<usize>,
}
14
/// Shared table detection utilities.
///
/// A unit struct used purely as a namespace: the helpers in its `impl` block
/// are associated functions, so no instance is ever needed.
pub struct TableUtils;
17
18impl TableUtils {
19    /// Check if a line looks like a potential table row
20    pub fn is_potential_table_row(line: &str) -> bool {
21        let trimmed = line.trim();
22        if trimmed.is_empty() || !trimmed.contains('|') {
23            return false;
24        }
25
26        // Skip lines that are clearly not table rows
27        if trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") {
28            return false;
29        }
30
31        // Skip lines that are clearly code or inline code
32        if trimmed.starts_with("`") || trimmed.contains("``") {
33            return false;
34        }
35
36        // Must have at least 2 parts when split by |
37        let parts: Vec<&str> = trimmed.split('|').collect();
38        if parts.len() < 2 {
39            return false;
40        }
41
42        // Check if it looks like a table row by having reasonable content between pipes
43        let mut valid_parts = 0;
44        let mut total_non_empty_parts = 0;
45
46        for part in &parts {
47            let part_trimmed = part.trim();
48            // Skip empty parts (from leading/trailing pipes)
49            if part_trimmed.is_empty() {
50                continue;
51            }
52            total_non_empty_parts += 1;
53
54            // Count parts that look like table cells (reasonable content, no newlines)
55            if !part_trimmed.contains('\n') {
56                valid_parts += 1;
57            }
58        }
59
60        // All non-empty parts must be valid (no newlines) for the row to be valid
61        if total_non_empty_parts == 0 {
62            return false;
63        }
64
65        if valid_parts != total_non_empty_parts {
66            // Some cells contain newlines, not a valid table row
67            return false;
68        }
69
70        // GFM allows single-column tables, so >= 1 valid part is enough
71        // when the line has proper table formatting (pipes)
72        if trimmed.starts_with('|') && trimmed.ends_with('|') {
73            // Properly formatted table row with pipes on both ends
74            valid_parts >= 1
75        } else {
76            // For rows without proper pipe formatting, require at least 2 cells
77            valid_parts >= 2
78        }
79    }
80
81    /// Check if a line is a table delimiter row (e.g., |---|---|)
82    pub fn is_delimiter_row(line: &str) -> bool {
83        let trimmed = line.trim();
84        if !trimmed.contains('|') || !trimmed.contains('-') {
85            return false;
86        }
87
88        // Split by pipes and check each part
89        let parts: Vec<&str> = trimmed.split('|').collect();
90        let mut valid_delimiter_parts = 0;
91        let mut total_non_empty_parts = 0;
92
93        for part in &parts {
94            let part_trimmed = part.trim();
95            if part_trimmed.is_empty() {
96                continue; // Skip empty parts from leading/trailing pipes
97            }
98
99            total_non_empty_parts += 1;
100
101            // Check if this part looks like a delimiter (contains dashes and optionally colons)
102            if part_trimmed.chars().all(|c| c == '-' || c == ':' || c.is_whitespace()) && part_trimmed.contains('-') {
103                valid_delimiter_parts += 1;
104            }
105        }
106
107        // All non-empty parts must be valid delimiters, and there must be at least one
108        total_non_empty_parts > 0 && valid_delimiter_parts == total_non_empty_parts
109    }
110
111    /// Find all table blocks in the content with optimized detection
112    /// This version accepts code_blocks and code_spans directly for use during LintContext construction
113    pub fn find_table_blocks_with_code_info(
114        content: &str,
115        code_blocks: &[(usize, usize)],
116        code_spans: &[crate::lint_context::CodeSpan],
117    ) -> Vec<TableBlock> {
118        let lines: Vec<&str> = content.lines().collect();
119        let mut tables = Vec::new();
120        let mut i = 0;
121
122        // Pre-compute line positions for efficient code block checking
123        let mut line_positions = Vec::with_capacity(lines.len());
124        let mut pos = 0;
125        for line in &lines {
126            line_positions.push(pos);
127            pos += line.len() + 1; // +1 for newline
128        }
129
130        while i < lines.len() {
131            // Skip lines in code blocks using provided code blocks
132            let line_start = line_positions[i];
133            let in_code =
134                crate::utils::code_block_utils::CodeBlockUtils::is_in_code_block_or_span(code_blocks, line_start)
135                    || code_spans
136                        .iter()
137                        .any(|span| line_start >= span.byte_offset && line_start < span.byte_end);
138            if in_code {
139                i += 1;
140                continue;
141            }
142
143            // Look for potential table start
144            if Self::is_potential_table_row(lines[i]) {
145                // Check if the next line is a delimiter row
146                if i + 1 < lines.len() && Self::is_delimiter_row(lines[i + 1]) {
147                    // Found a table! Find its end
148                    let table_start = i;
149                    let header_line = i;
150                    let delimiter_line = i + 1;
151                    let mut table_end = i + 1; // Include the delimiter row
152                    let mut content_lines = Vec::new();
153
154                    // Continue while we have table rows
155                    let mut j = i + 2;
156                    while j < lines.len() {
157                        let line = lines[j];
158                        if line.trim().is_empty() {
159                            // Empty line ends the table
160                            break;
161                        }
162                        if Self::is_potential_table_row(line) {
163                            content_lines.push(j);
164                            table_end = j;
165                            j += 1;
166                        } else {
167                            // Non-table line ends the table
168                            break;
169                        }
170                    }
171
172                    tables.push(TableBlock {
173                        start_line: table_start,
174                        end_line: table_end,
175                        header_line,
176                        delimiter_line,
177                        content_lines,
178                    });
179                    i = table_end + 1;
180                } else {
181                    i += 1;
182                }
183            } else {
184                i += 1;
185            }
186        }
187
188        tables
189    }
190
191    /// Find all table blocks in the content with optimized detection
192    /// This is a backward-compatible wrapper that accepts LintContext
193    pub fn find_table_blocks(content: &str, ctx: &crate::lint_context::LintContext) -> Vec<TableBlock> {
194        Self::find_table_blocks_with_code_info(content, &ctx.code_blocks, &ctx.code_spans())
195    }
196
197    /// Count the number of cells in a table row
198    pub fn count_cells(row: &str) -> usize {
199        let trimmed = row.trim();
200
201        // Skip non-table rows
202        if !trimmed.contains('|') {
203            return 0;
204        }
205
206        // Users shouldn't have to escape pipes in regex patterns, etc.
207        let masked_row = Self::mask_pipes_in_inline_code(trimmed);
208
209        // Handle case with leading/trailing pipes
210        let mut cell_count = 0;
211        let parts: Vec<&str> = masked_row.split('|').collect();
212
213        for (i, part) in parts.iter().enumerate() {
214            // Skip first part if it's empty and there's a leading pipe
215            if i == 0 && part.trim().is_empty() && parts.len() > 1 {
216                continue;
217            }
218
219            // Skip last part if it's empty and there's a trailing pipe
220            if i == parts.len() - 1 && part.trim().is_empty() && parts.len() > 1 {
221                continue;
222            }
223
224            cell_count += 1;
225        }
226
227        cell_count
228    }
229
230    /// Mask pipes inside inline code blocks with a placeholder character
231    pub fn mask_pipes_in_inline_code(text: &str) -> String {
232        let mut result = String::new();
233        let chars: Vec<char> = text.chars().collect();
234        let mut i = 0;
235
236        while i < chars.len() {
237            if chars[i] == '`' {
238                // Count consecutive backticks at start
239                let start = i;
240                let mut backtick_count = 0;
241                while i < chars.len() && chars[i] == '`' {
242                    backtick_count += 1;
243                    i += 1;
244                }
245
246                // Look for matching closing backticks
247                let mut found_closing = false;
248                let mut j = i;
249
250                while j < chars.len() {
251                    if chars[j] == '`' {
252                        // Count potential closing backticks
253                        let close_start = j;
254                        let mut close_count = 0;
255                        while j < chars.len() && chars[j] == '`' {
256                            close_count += 1;
257                            j += 1;
258                        }
259
260                        if close_count == backtick_count {
261                            // Found matching closing backticks
262                            found_closing = true;
263
264                            // Valid inline code - add with pipes masked
265                            result.extend(chars[start..i].iter());
266
267                            for &ch in chars.iter().take(close_start).skip(i) {
268                                if ch == '|' {
269                                    result.push('_'); // Mask pipe with underscore
270                                } else {
271                                    result.push(ch);
272                                }
273                            }
274
275                            result.extend(chars[close_start..j].iter());
276                            i = j;
277                            break;
278                        }
279                        // If not matching, continue searching (j is already past these backticks)
280                    } else {
281                        j += 1;
282                    }
283                }
284
285                if !found_closing {
286                    // No matching closing found, treat as regular text
287                    result.extend(chars[start..i].iter());
288                }
289            } else {
290                result.push(chars[i]);
291                i += 1;
292            }
293        }
294
295        result
296    }
297
298    /// Determine the pipe style of a table row
299    pub fn determine_pipe_style(line: &str) -> Option<&'static str> {
300        let trimmed = line.trim();
301        if !trimmed.contains('|') {
302            return None;
303        }
304
305        let has_leading = trimmed.starts_with('|');
306        let has_trailing = trimmed.ends_with('|');
307
308        match (has_leading, has_trailing) {
309            (true, true) => Some("leading_and_trailing"),
310            (true, false) => Some("leading_only"),
311            (false, true) => Some("trailing_only"),
312            (false, false) => Some("no_leading_or_trailing"),
313        }
314    }
315}
316
317#[cfg(test)]
318mod tests {
319    use super::*;
320    use crate::lint_context::LintContext;
321
322    #[test]
323    fn test_is_potential_table_row() {
324        // Basic valid table rows
325        assert!(TableUtils::is_potential_table_row("| Header 1 | Header 2 |"));
326        assert!(TableUtils::is_potential_table_row("| Cell 1 | Cell 2 |"));
327        assert!(TableUtils::is_potential_table_row("Cell 1 | Cell 2"));
328        assert!(TableUtils::is_potential_table_row("| Cell |")); // Single-column tables are valid in GFM
329
330        // Multiple cells
331        assert!(TableUtils::is_potential_table_row("| A | B | C | D | E |"));
332
333        // With whitespace
334        assert!(TableUtils::is_potential_table_row("  | Indented | Table |  "));
335        assert!(TableUtils::is_potential_table_row("| Spaces | Around |"));
336
337        // Not table rows
338        assert!(!TableUtils::is_potential_table_row("- List item"));
339        assert!(!TableUtils::is_potential_table_row("* Another list"));
340        assert!(!TableUtils::is_potential_table_row("+ Plus list"));
341        assert!(!TableUtils::is_potential_table_row("Regular text"));
342        assert!(!TableUtils::is_potential_table_row(""));
343        assert!(!TableUtils::is_potential_table_row("   "));
344
345        // Code blocks
346        assert!(!TableUtils::is_potential_table_row("`code with | pipe`"));
347        assert!(!TableUtils::is_potential_table_row("``multiple | backticks``"));
348
349        // Single pipe not enough
350        assert!(!TableUtils::is_potential_table_row("Just one |"));
351        assert!(!TableUtils::is_potential_table_row("| Just one"));
352
353        // Very long cells are valid in tables (no length limit for cell content)
354        let long_cell = "a".repeat(150);
355        assert!(TableUtils::is_potential_table_row(&format!("| {long_cell} | b |")));
356
357        // Cells with newlines
358        assert!(!TableUtils::is_potential_table_row("| Cell with\nnewline | Other |"));
359    }
360
361    #[test]
362    fn test_is_delimiter_row() {
363        // Basic delimiter rows
364        assert!(TableUtils::is_delimiter_row("|---|---|"));
365        assert!(TableUtils::is_delimiter_row("| --- | --- |"));
366        assert!(TableUtils::is_delimiter_row("|:---|---:|"));
367        assert!(TableUtils::is_delimiter_row("|:---:|:---:|"));
368
369        // With varying dash counts
370        assert!(TableUtils::is_delimiter_row("|-|--|"));
371        assert!(TableUtils::is_delimiter_row("|-------|----------|"));
372
373        // With whitespace
374        assert!(TableUtils::is_delimiter_row("|  ---  |  ---  |"));
375        assert!(TableUtils::is_delimiter_row("| :--- | ---: |"));
376
377        // Multiple columns
378        assert!(TableUtils::is_delimiter_row("|---|---|---|---|"));
379
380        // Without leading/trailing pipes
381        assert!(TableUtils::is_delimiter_row("--- | ---"));
382        assert!(TableUtils::is_delimiter_row(":--- | ---:"));
383
384        // Not delimiter rows
385        assert!(!TableUtils::is_delimiter_row("| Header | Header |"));
386        assert!(!TableUtils::is_delimiter_row("Regular text"));
387        assert!(!TableUtils::is_delimiter_row(""));
388        assert!(!TableUtils::is_delimiter_row("|||"));
389        assert!(!TableUtils::is_delimiter_row("| | |"));
390
391        // Must have dashes
392        assert!(!TableUtils::is_delimiter_row("| : | : |"));
393        assert!(!TableUtils::is_delimiter_row("|    |    |"));
394
395        // Mixed content
396        assert!(!TableUtils::is_delimiter_row("| --- | text |"));
397        assert!(!TableUtils::is_delimiter_row("| abc | --- |"));
398    }
399
400    #[test]
401    fn test_count_cells() {
402        // Basic counts
403        assert_eq!(TableUtils::count_cells("| Cell 1 | Cell 2 | Cell 3 |"), 3);
404        assert_eq!(TableUtils::count_cells("Cell 1 | Cell 2 | Cell 3"), 3);
405        assert_eq!(TableUtils::count_cells("| Cell 1 | Cell 2"), 2);
406        assert_eq!(TableUtils::count_cells("Cell 1 | Cell 2 |"), 2);
407
408        // Single cell
409        assert_eq!(TableUtils::count_cells("| Cell |"), 1);
410        assert_eq!(TableUtils::count_cells("Cell"), 0); // No pipe
411
412        // Empty cells
413        assert_eq!(TableUtils::count_cells("|  |  |  |"), 3);
414        assert_eq!(TableUtils::count_cells("| | | |"), 3);
415
416        // Many cells
417        assert_eq!(TableUtils::count_cells("| A | B | C | D | E | F |"), 6);
418
419        // Edge cases
420        assert_eq!(TableUtils::count_cells("||"), 1); // One empty cell
421        assert_eq!(TableUtils::count_cells("|||"), 2); // Two empty cells
422
423        // No table
424        assert_eq!(TableUtils::count_cells("Regular text"), 0);
425        assert_eq!(TableUtils::count_cells(""), 0);
426        assert_eq!(TableUtils::count_cells("   "), 0);
427
428        // Whitespace handling
429        assert_eq!(TableUtils::count_cells("  | A | B |  "), 2);
430        assert_eq!(TableUtils::count_cells("|   A   |   B   |"), 2);
431    }
432
433    #[test]
434    fn test_count_cells_with_inline_code() {
435        // Test the user's actual example from Issue #34
436        assert_eq!(TableUtils::count_cells("| Challenge | Solution |"), 2);
437        assert_eq!(
438            TableUtils::count_cells("| Hour:minute:second formats | `^([0-1]?\\d|2[0-3]):[0-5]\\d:[0-5]\\d$` |"),
439            2
440        );
441
442        // Test basic inline code with pipes
443        assert_eq!(TableUtils::count_cells("| Command | `echo | grep` |"), 2);
444        assert_eq!(TableUtils::count_cells("| A | `code | with | pipes` | B |"), 3);
445
446        // Test escaped pipes (correct GFM)
447        assert_eq!(TableUtils::count_cells("| Command | `echo \\| grep` |"), 2);
448
449        // Test multiple inline code blocks
450        assert_eq!(TableUtils::count_cells("| `code | one` | `code | two` |"), 2);
451
452        // Test edge cases
453        assert_eq!(TableUtils::count_cells("| Empty inline | `` | cell |"), 3);
454        assert_eq!(TableUtils::count_cells("| `single|pipe` |"), 1);
455
456        // Test that basic table structure still works
457        assert_eq!(TableUtils::count_cells("| A | B | C |"), 3);
458        assert_eq!(TableUtils::count_cells("| One | Two |"), 2);
459    }
460
461    #[test]
462    fn test_determine_pipe_style() {
463        // All pipe styles
464        assert_eq!(
465            TableUtils::determine_pipe_style("| Cell 1 | Cell 2 |"),
466            Some("leading_and_trailing")
467        );
468        assert_eq!(
469            TableUtils::determine_pipe_style("| Cell 1 | Cell 2"),
470            Some("leading_only")
471        );
472        assert_eq!(
473            TableUtils::determine_pipe_style("Cell 1 | Cell 2 |"),
474            Some("trailing_only")
475        );
476        assert_eq!(
477            TableUtils::determine_pipe_style("Cell 1 | Cell 2"),
478            Some("no_leading_or_trailing")
479        );
480
481        // With whitespace
482        assert_eq!(
483            TableUtils::determine_pipe_style("  | Cell 1 | Cell 2 |  "),
484            Some("leading_and_trailing")
485        );
486        assert_eq!(
487            TableUtils::determine_pipe_style("  | Cell 1 | Cell 2  "),
488            Some("leading_only")
489        );
490
491        // No pipes
492        assert_eq!(TableUtils::determine_pipe_style("Regular text"), None);
493        assert_eq!(TableUtils::determine_pipe_style(""), None);
494        assert_eq!(TableUtils::determine_pipe_style("   "), None);
495
496        // Single pipe cases
497        assert_eq!(TableUtils::determine_pipe_style("|"), Some("leading_and_trailing"));
498        assert_eq!(TableUtils::determine_pipe_style("| Cell"), Some("leading_only"));
499        assert_eq!(TableUtils::determine_pipe_style("Cell |"), Some("trailing_only"));
500    }
501
502    #[test]
503    fn test_find_table_blocks_simple() {
504        let content = "| Header 1 | Header 2 |
505|-----------|-----------|
506| Cell 1    | Cell 2    |
507| Cell 3    | Cell 4    |";
508
509        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
510
511        let tables = TableUtils::find_table_blocks(content, &ctx);
512        assert_eq!(tables.len(), 1);
513
514        let table = &tables[0];
515        assert_eq!(table.start_line, 0);
516        assert_eq!(table.end_line, 3);
517        assert_eq!(table.header_line, 0);
518        assert_eq!(table.delimiter_line, 1);
519        assert_eq!(table.content_lines, vec![2, 3]);
520    }
521
522    #[test]
523    fn test_find_table_blocks_multiple() {
524        let content = "Some text
525
526| Table 1 | Col A |
527|----------|-------|
528| Data 1   | Val 1 |
529
530More text
531
532| Table 2 | Col 2 |
533|----------|-------|
534| Data 2   | Data  |";
535
536        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
537
538        let tables = TableUtils::find_table_blocks(content, &ctx);
539        assert_eq!(tables.len(), 2);
540
541        // First table
542        assert_eq!(tables[0].start_line, 2);
543        assert_eq!(tables[0].end_line, 4);
544        assert_eq!(tables[0].header_line, 2);
545        assert_eq!(tables[0].delimiter_line, 3);
546        assert_eq!(tables[0].content_lines, vec![4]);
547
548        // Second table
549        assert_eq!(tables[1].start_line, 8);
550        assert_eq!(tables[1].end_line, 10);
551        assert_eq!(tables[1].header_line, 8);
552        assert_eq!(tables[1].delimiter_line, 9);
553        assert_eq!(tables[1].content_lines, vec![10]);
554    }
555
556    #[test]
557    fn test_find_table_blocks_no_content_rows() {
558        let content = "| Header 1 | Header 2 |
559|-----------|-----------|
560
561Next paragraph";
562
563        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
564
565        let tables = TableUtils::find_table_blocks(content, &ctx);
566        assert_eq!(tables.len(), 1);
567
568        let table = &tables[0];
569        assert_eq!(table.start_line, 0);
570        assert_eq!(table.end_line, 1); // Just header and delimiter
571        assert_eq!(table.content_lines.len(), 0);
572    }
573
574    #[test]
575    fn test_find_table_blocks_in_code_block() {
576        let content = "```
577| Not | A | Table |
578|-----|---|-------|
579| In  | Code | Block |
580```
581
582| Real | Table |
583|------|-------|
584| Data | Here  |";
585
586        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
587
588        let tables = TableUtils::find_table_blocks(content, &ctx);
589        assert_eq!(tables.len(), 1); // Only the table outside code block
590
591        let table = &tables[0];
592        assert_eq!(table.header_line, 6);
593        assert_eq!(table.delimiter_line, 7);
594    }
595
596    #[test]
597    fn test_find_table_blocks_no_tables() {
598        let content = "Just regular text
599No tables here
600- List item with | pipe
601* Another list item";
602
603        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
604
605        let tables = TableUtils::find_table_blocks(content, &ctx);
606        assert_eq!(tables.len(), 0);
607    }
608
609    #[test]
610    fn test_find_table_blocks_malformed() {
611        let content = "| Header without delimiter |
612| This looks like table |
613But no delimiter row
614
615| Proper | Table |
616|---------|-------|
617| Data    | Here  |";
618
619        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
620
621        let tables = TableUtils::find_table_blocks(content, &ctx);
622        assert_eq!(tables.len(), 1); // Only the proper table
623        assert_eq!(tables[0].header_line, 4);
624    }
625
626    #[test]
627    fn test_edge_cases() {
628        // Test empty content
629        assert!(!TableUtils::is_potential_table_row(""));
630        assert!(!TableUtils::is_delimiter_row(""));
631        assert_eq!(TableUtils::count_cells(""), 0);
632        assert_eq!(TableUtils::determine_pipe_style(""), None);
633
634        // Test whitespace only
635        assert!(!TableUtils::is_potential_table_row("   "));
636        assert!(!TableUtils::is_delimiter_row("   "));
637        assert_eq!(TableUtils::count_cells("   "), 0);
638        assert_eq!(TableUtils::determine_pipe_style("   "), None);
639
640        // Test single character
641        assert!(!TableUtils::is_potential_table_row("|"));
642        assert!(!TableUtils::is_delimiter_row("|"));
643        assert_eq!(TableUtils::count_cells("|"), 0); // Need at least 2 parts
644
645        // Test very long lines are valid table rows (no length limit)
646        // Test both single-column and multi-column long lines
647        let long_single = format!("| {} |", "a".repeat(200));
648        assert!(TableUtils::is_potential_table_row(&long_single)); // Single-column table with long content
649
650        let long_multi = format!("| {} | {} |", "a".repeat(200), "b".repeat(200));
651        assert!(TableUtils::is_potential_table_row(&long_multi)); // Multi-column table with long content
652
653        // Test unicode
654        assert!(TableUtils::is_potential_table_row("| 你好 | 世界 |"));
655        assert!(TableUtils::is_potential_table_row("| émoji | 🎉 |"));
656        assert_eq!(TableUtils::count_cells("| 你好 | 世界 |"), 2);
657    }
658
659    #[test]
660    fn test_table_block_struct() {
661        let block = TableBlock {
662            start_line: 0,
663            end_line: 5,
664            header_line: 0,
665            delimiter_line: 1,
666            content_lines: vec![2, 3, 4, 5],
667        };
668
669        // Test Debug trait
670        let debug_str = format!("{block:?}");
671        assert!(debug_str.contains("TableBlock"));
672        assert!(debug_str.contains("start_line: 0"));
673
674        // Test Clone trait
675        let cloned = block.clone();
676        assert_eq!(cloned.start_line, block.start_line);
677        assert_eq!(cloned.end_line, block.end_line);
678        assert_eq!(cloned.header_line, block.header_line);
679        assert_eq!(cloned.delimiter_line, block.delimiter_line);
680        assert_eq!(cloned.content_lines, block.content_lines);
681    }
682}