bashrs/formatter/
engine.rs

1//! Normalization engine for syntax transformation
2
3use crate::formatter::{dialect::*, logging::*, source_map::*, transforms::*, types::*};
4use std::borrow::Cow;
5
6/// Main normalization engine with zero-copy fast path
7#[derive(Debug, Clone)]
8pub struct NormalizationEngine {
9    /// Active whitespace context stack
10    ws_stack: Vec<WhitespaceContext>,
11
12    /// Configuration options
13    config: EngineConfig,
14}
15
/// Tunable options for [`NormalizationEngine`].
#[derive(Debug, Clone)]
pub struct EngineConfig {
    /// When `false`, `is_canonical` always answers `false`, so every input
    /// goes through full normalization.
    pub enable_fast_path: bool,

    /// Upper bound on nesting depth before the engine gives up.
    // NOTE(review): no method visible in this file reads this field yet.
    pub max_nesting_depth: usize,

    /// Keep comments in the output and record them in the metadata.
    pub preserve_comments: bool,

    /// Request proofs for the transforms that are applied.
    // NOTE(review): no method visible in this file reads this field yet.
    pub generate_proofs: bool,
}
30
31impl Default for EngineConfig {
32    fn default() -> Self {
33        Self {
34            enable_fast_path: true,
35            max_nesting_depth: 256,
36            preserve_comments: true,
37            generate_proofs: false,
38        }
39    }
40}
41
42impl NormalizationEngine {
43    pub fn new() -> Self {
44        Self {
45            ws_stack: vec![WhitespaceContext::Command],
46            config: EngineConfig::default(),
47        }
48    }
49
50    pub fn with_config(config: EngineConfig) -> Self {
51        Self {
52            ws_stack: vec![WhitespaceContext::Command],
53            config,
54        }
55    }
56
57    /// Check if input is already in canonical form (23% hit rate on coreutils)
58    pub fn is_canonical(&self, input: &[u8]) -> bool {
59        if !self.config.enable_fast_path {
60            return false;
61        }
62
63        // Simple heuristics for canonical form
64        let input_str = match std::str::from_utf8(input) {
65            Ok(s) => s,
66            Err(_) => return false,
67        };
68
69        // If we need to preserve comments and there are comments, can't use fast path
70        if self.config.preserve_comments && input_str.contains('#') {
71            return false;
72        }
73
74        // Check for obviously non-canonical patterns
75        if input_str.contains("  ") || // Multiple spaces
76           input_str.contains("\t") || // Tabs
77           input_str.contains("\r") || // Carriage returns
78           input_str.starts_with(' ') || // Leading space
79           input_str.ends_with(' ')
80        {
81            // Trailing space
82            return false;
83        }
84
85        // Check for unquoted variables in command context
86        if input_str.contains("$") && !self.has_proper_quoting(input_str) {
87            return false;
88        }
89
90        true
91    }
92
93    /// Main normalization with full tracking
94    pub fn normalize<'a>(
95        &mut self,
96        input: &'a [u8],
97        dialect: ShellDialect,
98        config: FormatConfig,
99    ) -> crate::Result<FormattedSource<'a>> {
100        let input_str = std::str::from_utf8(input)
101            .map_err(|e| crate::Error::Internal(format!("Invalid UTF-8: {e}")))?;
102
103        // Fast path: already canonical
104        if self.is_canonical(input) {
105            return Ok(FormattedSource {
106                text: Cow::Borrowed(input_str),
107                source_map: SourceMap::identity(input.len()),
108                metadata: SemanticMetadata::default(),
109                canonical_hash: blake3::hash(input).into(),
110                transforms: TransformLog::new(),
111            });
112        }
113
114        // Slow path: full normalization
115        let mut output = String::with_capacity(input.len() + input.len() / 4);
116        let mut source_map = SourceMapBuilder::new();
117        let mut transform_log = TransformLog::new();
118        let mut metadata = SemanticMetadata::default();
119
120        // Simple line-by-line processing
121        let mut line_number = 1;
122        let mut char_pos = 0;
123
124        for line in input_str.lines() {
125            let _line_start = char_pos;
126            let formatted_line = self.normalize_line(
127                line,
128                dialect.clone(),
129                &config,
130                &mut source_map,
131                &mut transform_log,
132                &mut metadata,
133                line_number,
134                char_pos,
135            )?;
136
137            output.push_str(&formatted_line);
138            if line_number < input_str.lines().count() {
139                output.push('\n');
140            }
141
142            char_pos += line.len() + 1; // +1 for newline
143            line_number += 1;
144        }
145
146        let canonical_hash = blake3::hash(output.as_bytes()).into();
147
148        Ok(FormattedSource {
149            text: Cow::Owned(output),
150            source_map: source_map.build(),
151            metadata,
152            canonical_hash,
153            transforms: transform_log,
154        })
155    }
156
157    #[allow(clippy::too_many_arguments)]
158    fn normalize_line(
159        &mut self,
160        line: &str,
161        dialect: ShellDialect,
162        config: &FormatConfig,
163        source_map: &mut SourceMapBuilder,
164        transform_log: &mut TransformLog,
165        metadata: &mut SemanticMetadata,
166        line_number: usize,
167        line_start: usize,
168    ) -> crate::Result<String> {
169        let mut output = String::with_capacity(line.len());
170        let mut chars = line.char_indices().peekable();
171
172        while let Some((pos, ch)) = chars.next() {
173            let absolute_pos = line_start + pos;
174
175            match ch {
176                // Handle whitespace
177                ' ' | '\t' => {
178                    self.normalize_whitespace(
179                        &mut chars,
180                        &mut output,
181                        source_map,
182                        transform_log,
183                        absolute_pos,
184                    )?;
185                }
186
187                // Handle comments
188                '#' => {
189                    if config.preserve_whitespace || self.config.preserve_comments {
190                        let comment = self.extract_comment(&mut chars, pos, line)?;
191                        output.push_str(&comment);
192
193                        metadata.comments.push(CommentMetadata {
194                            content: comment.clone(),
195                            start_pos: absolute_pos,
196                            end_pos: absolute_pos + comment.len(),
197                            line: line_number,
198                            column: pos,
199                        });
200                    } else {
201                        // Still need to consume the character if not preserving
202                        output.push(ch);
203                    }
204                }
205
206                // Handle variable expansion
207                '$' => {
208                    self.normalize_expansion(
209                        &mut chars,
210                        &mut output,
211                        source_map,
212                        transform_log,
213                        absolute_pos,
214                        dialect.clone(),
215                    )?;
216                }
217
218                // Handle quotes
219                '\'' | '"' => {
220                    self.normalize_quoted_string(
221                        ch,
222                        &mut chars,
223                        &mut output,
224                        source_map,
225                        absolute_pos,
226                    )?;
227                }
228
229                // Copy other characters verbatim
230                _ => {
231                    output.push(ch);
232                    source_map.add_char_mapping(
233                        CharPos(absolute_pos),
234                        CharPos(line_start + output.len() - 1),
235                    );
236                }
237            }
238        }
239
240        Ok(output)
241    }
242
243    fn normalize_whitespace(
244        &mut self,
245        chars: &mut std::iter::Peekable<std::str::CharIndices>,
246        output: &mut String,
247        source_map: &mut SourceMapBuilder,
248        transform_log: &mut TransformLog,
249        start_pos: usize,
250    ) -> crate::Result<()> {
251        let context = self
252            .ws_stack
253            .last()
254            .copied()
255            .unwrap_or(WhitespaceContext::Command);
256
257        // Consume all consecutive whitespace
258        let mut whitespace_chars = 1; // We already found one
259        while let Some((_, ch)) = chars.peek() {
260            if ch.is_whitespace() && *ch != '\n' {
261                chars.next();
262                whitespace_chars += 1;
263            } else {
264                break;
265            }
266        }
267
268        // Apply normalization based on context
269        let normalized = match context {
270            WhitespaceContext::Command => " ",   // Single space
271            WhitespaceContext::Arithmetic => "", // No whitespace
272            WhitespaceContext::QuotedString { .. } => {
273                // Preserve original whitespace in quoted strings
274                return Ok(()); // Skip normalization
275            }
276            _ => " ", // Default to single space
277        };
278
279        if whitespace_chars > 1 || (!normalized.is_empty() && whitespace_chars == 0) {
280            // Record the transformation
281            let transform = Transform::WhitespaceNormalize {
282                context,
283                preserved: IntervalSet::new(),
284            };
285
286            transform_log.add_entry(TransformEntry {
287                id: TransformId::new(),
288                transform,
289                source_span: Span::new(BytePos(start_pos), BytePos(start_pos + whitespace_chars)),
290                result_span: Span::new(
291                    BytePos(output.len()),
292                    BytePos(output.len() + normalized.len()),
293                ),
294                timestamp: std::time::Instant::now(),
295                proof: None,
296                semantic_delta: None,
297            });
298        }
299
300        output.push_str(normalized);
301
302        // Add mapping for the whitespace range
303        source_map.add_range_mapping(
304            CharPos(start_pos),
305            CharPos(start_pos + whitespace_chars),
306            CharPos(output.len() - normalized.len()),
307            CharPos(output.len()),
308        );
309
310        Ok(())
311    }
312
313    fn normalize_expansion(
314        &mut self,
315        chars: &mut std::iter::Peekable<std::str::CharIndices>,
316        output: &mut String,
317        _source_map: &mut SourceMapBuilder,
318        transform_log: &mut TransformLog,
319        start_pos: usize,
320        _dialect: ShellDialect,
321    ) -> crate::Result<()> {
322        // Check if we need to add quotes
323        let context = self
324            .ws_stack
325            .last()
326            .copied()
327            .unwrap_or(WhitespaceContext::Command);
328
329        let needs_quotes = matches!(context, WhitespaceContext::Command);
330
331        if let Some((_, '{')) = chars.peek() {
332            // ${var} form - copy as is
333            output.push('$');
334            output.push('{');
335            chars.next();
336
337            for (_, ch) in chars.by_ref() {
338                output.push(ch);
339                if ch == '}' {
340                    break;
341                }
342            }
343        } else {
344            // $var form - might need quoting
345            let var_start = output.len();
346            let mut var_name = String::new();
347
348            while let Some((_, ch)) = chars.peek() {
349                if ch.is_alphanumeric() || *ch == '_' {
350                    var_name.push(*ch);
351                    chars.next();
352                } else {
353                    break;
354                }
355            }
356
357            if needs_quotes && !var_name.is_empty() {
358                output.push('"');
359                output.push('$');
360                output.push_str(&var_name);
361                output.push('"');
362
363                // Record quote expansion transform
364                let transform = Transform::QuoteExpansion {
365                    kind: QuoteKind::Double,
366                    reason: QuoteReason::WordSplitting,
367                    proof: SexprProof::new(format!(
368                        "(= (word-split ${var_name}) (word-split \"${var_name}\"))"
369                    )),
370                };
371
372                transform_log.add_entry(TransformEntry {
373                    id: TransformId::new(),
374                    transform,
375                    source_span: Span::new(
376                        BytePos(start_pos),
377                        BytePos(start_pos + 1 + var_name.len()),
378                    ),
379                    result_span: Span::new(BytePos(var_start), BytePos(output.len())),
380                    timestamp: std::time::Instant::now(),
381                    proof: None,
382                    semantic_delta: None,
383                });
384            } else {
385                output.push('$');
386                output.push_str(&var_name);
387            }
388        }
389
390        Ok(())
391    }
392
393    fn normalize_quoted_string(
394        &mut self,
395        quote_char: char,
396        chars: &mut std::iter::Peekable<std::str::CharIndices>,
397        output: &mut String,
398        _source_map: &mut SourceMapBuilder,
399        _start_pos: usize,
400    ) -> crate::Result<()> {
401        output.push(quote_char);
402
403        // Push quoted string context
404        let quote_type = match quote_char {
405            '\'' => QuoteType::Single,
406            '"' => QuoteType::Double,
407            _ => QuoteType::Double,
408        };
409
410        self.ws_stack
411            .push(WhitespaceContext::QuotedString { quote_type });
412
413        // Copy quoted content preserving whitespace
414        while let Some((_, ch)) = chars.next() {
415            output.push(ch);
416
417            if ch == quote_char {
418                break;
419            }
420
421            // Handle escape sequences
422            if ch == '\\' {
423                if let Some((_, escaped)) = chars.next() {
424                    output.push(escaped);
425                }
426            }
427        }
428
429        // Pop quoted string context
430        self.ws_stack.pop();
431
432        Ok(())
433    }
434
435    fn extract_comment(
436        &self,
437        chars: &mut std::iter::Peekable<std::str::CharIndices>,
438        start_pos: usize,
439        line: &str,
440    ) -> crate::Result<String> {
441        // Extract comment from current position to end of line
442        let comment = line[start_pos..].to_string();
443
444        // Consume all remaining characters since they're part of the comment
445        while chars.next().is_some() {}
446
447        Ok(comment)
448    }
449
450    fn has_proper_quoting(&self, input: &str) -> bool {
451        // Simple check for proper variable quoting
452        // This is a heuristic - proper implementation would need full parsing
453        let mut in_quotes = false;
454        let mut quote_char = '\0';
455        let chars = input.chars();
456
457        for ch in chars {
458            match ch {
459                '\'' | '"' if !in_quotes => {
460                    in_quotes = true;
461                    quote_char = ch;
462                }
463                c if in_quotes && c == quote_char => {
464                    in_quotes = false;
465                    quote_char = '\0';
466                }
467                '$' if !in_quotes => {
468                    // Unquoted variable - not canonical
469                    return false;
470                }
471                _ => {}
472            }
473        }
474
475        true
476    }
477}
478
479impl Default for NormalizationEngine {
480    fn default() -> Self {
481        Self::new()
482    }
483}
484
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through a fresh default engine with the POSIX dialect and
    /// the default format config, panicking if normalization fails.
    fn format_posix(input: &[u8]) -> FormattedSource<'_> {
        let mut engine = NormalizationEngine::new();
        engine
            .normalize(input, ShellDialect::Posix, FormatConfig::default())
            .unwrap()
    }

    #[test]
    fn test_engine_creation() {
        let engine = NormalizationEngine::new();
        assert_eq!(engine.ws_stack.len(), 1);
        assert!(matches!(engine.ws_stack[0], WhitespaceContext::Command));
    }

    #[test]
    fn test_engine_with_config() {
        let engine = NormalizationEngine::with_config(EngineConfig {
            enable_fast_path: false,
            max_nesting_depth: 512,
            preserve_comments: false,
            generate_proofs: true,
        });

        assert!(!engine.config.enable_fast_path);
        assert_eq!(engine.config.max_nesting_depth, 512);
    }

    #[test]
    fn test_is_canonical_simple() {
        let engine = NormalizationEngine::new();

        assert!(engine.is_canonical(b"echo hello"));

        // Multiple spaces, leading space, trailing space, tab.
        assert!(!engine.is_canonical(b"echo  hello"));
        assert!(!engine.is_canonical(b" echo hello"));
        assert!(!engine.is_canonical(b"echo hello "));
        assert!(!engine.is_canonical(b"echo\thello"));
    }

    #[test]
    fn test_is_canonical_quoting() {
        let engine = NormalizationEngine::new();

        assert!(engine.is_canonical(b"echo \"$var\""));
        // An unquoted variable is not canonical.
        assert!(!engine.is_canonical(b"echo $var"));
    }

    #[test]
    fn test_normalize_identity() {
        let formatted = format_posix(b"echo hello");
        assert_eq!(formatted.text.as_ref(), "echo hello");
    }

    #[test]
    fn test_normalize_whitespace() {
        let formatted = format_posix(b"echo  hello   world");
        assert_eq!(formatted.text.as_ref(), "echo hello world");
        assert!(!formatted.transforms.entries.is_empty());
    }

    #[test]
    fn test_normalize_variable_quoting() {
        let formatted = format_posix(b"echo $var");
        assert_eq!(formatted.text.as_ref(), "echo \"$var\"");

        // A quote expansion transform must have been logged.
        let has_quote_transform = formatted
            .transforms
            .entries
            .iter()
            .any(|entry| matches!(entry.transform, Transform::QuoteExpansion { .. }));
        assert!(has_quote_transform);
    }

    #[test]
    fn test_normalize_quoted_strings() {
        // Whitespace inside quotes must survive untouched.
        let formatted = format_posix(b"echo 'hello  world'");
        assert_eq!(formatted.text.as_ref(), "echo 'hello  world'");
    }

    #[test]
    fn test_normalize_comments() {
        let formatted = format_posix(b"echo hello # this is a comment");
        assert_eq!(formatted.text.as_ref(), "echo hello # this is a comment");

        let comments = &formatted.metadata.comments;
        assert_eq!(comments.len(), 1);
        assert_eq!(comments[0].content, "# this is a comment");
    }

    #[test]
    fn test_normalize_multiline() {
        let formatted = format_posix(b"echo  hello\necho   world");
        assert_eq!(formatted.text.as_ref(), "echo hello\necho world");
    }

    #[test]
    fn test_has_proper_quoting() {
        let engine = NormalizationEngine::new();

        assert!(engine.has_proper_quoting("echo \"$var\""));
        assert!(engine.has_proper_quoting("echo '$var'"));
        assert!(!engine.has_proper_quoting("echo $var"));
        // No variables at all.
        assert!(engine.has_proper_quoting("echo hello"));
    }

    #[test]
    fn test_config_effects() {
        let mut engine = NormalizationEngine::with_config(EngineConfig {
            enable_fast_path: false,
            preserve_comments: false,
            ..Default::default()
        });

        // With the fast path disabled nothing is reported as canonical.
        assert!(!engine.is_canonical(b"echo hello"));

        // Comment stripping is not implemented yet; only check that the run
        // succeeds with comment preservation turned off.
        let result = engine.normalize(
            b"echo hello # comment",
            ShellDialect::Posix,
            FormatConfig::default(),
        );
        assert!(result.is_ok());
    }
}