bashrs 6.66.0 - Docs.rs

//! Normalization engine for syntax transformation

use crate::formatter::{dialect::*, logging::*, source_map::*, transforms::*, types::*};
use std::borrow::Cow;

/// Main normalization engine with zero-copy fast path
#[derive(Debug, Clone)]
pub struct NormalizationEngine {
    /// Active whitespace context stack
    ws_stack: Vec<WhitespaceContext>,

    /// Configuration options
    config: EngineConfig,
}

#[derive(Debug, Clone)]
pub struct EngineConfig {
    /// Enable fast path for canonical inputs
    pub enable_fast_path: bool,

    /// Maximum nesting depth before giving up
    pub max_nesting_depth: usize,

    /// Whether to preserve comments
    pub preserve_comments: bool,

    /// Whether to generate transform proofs
    pub generate_proofs: bool,
}

impl Default for EngineConfig {
    fn default() -> Self {
        Self {
            enable_fast_path: true,
            max_nesting_depth: 256,
            preserve_comments: true,
            generate_proofs: false,
        }
    }
}

impl NormalizationEngine {
    pub fn new() -> Self {
        Self {
            ws_stack: vec![WhitespaceContext::Command],
            config: EngineConfig::default(),
        }
    }

    pub fn with_config(config: EngineConfig) -> Self {
        Self {
            ws_stack: vec![WhitespaceContext::Command],
            config,
        }
    }

    /// Check if input is already in canonical form (23% hit rate on coreutils)
    pub fn is_canonical(&self, input: &[u8]) -> bool {
        if !self.config.enable_fast_path {
            return false;
        }

        // Simple heuristics for canonical form
        let input_str = match std::str::from_utf8(input) {
            Ok(s) => s,
            Err(_) => return false,
        };

        // If we need to preserve comments and there are comments, can't use fast path
        if self.config.preserve_comments && input_str.contains('#') {
            return false;
        }

        // Check for obviously non-canonical patterns
        if input_str.contains("  ") || // Multiple spaces
           input_str.contains('\t') || // Tabs
           input_str.contains('\r') || // Carriage returns
           input_str.starts_with(' ') || // Leading space
           input_str.ends_with(' ')
        {
            // Trailing space
            return false;
        }

        // Check for unquoted variables in command context
        if input_str.contains('$') && !self.has_proper_quoting(input_str) {
            return false;
        }

        true
    }

    /// Main normalization with full tracking
    pub fn normalize<'a>(
        &mut self,
        input: &'a [u8],
        dialect: ShellDialect,
        config: FormatConfig,
    ) -> crate::Result<FormattedSource<'a>> {
        let input_str = std::str::from_utf8(input)
            .map_err(|e| crate::Error::Internal(format!("Invalid UTF-8: {e}")))?;

        // Fast path: already canonical
        if self.is_canonical(input) {
            return Ok(FormattedSource {
                text: Cow::Borrowed(input_str),
                source_map: SourceMap::identity(input.len()),
                metadata: SemanticMetadata::default(),
                canonical_hash: blake3::hash(input).into(),
                transforms: TransformLog::new(),
            });
        }

        // Slow path: full normalization
        let mut output = String::with_capacity(input.len() + input.len() / 4);
        let mut source_map = SourceMapBuilder::new();
        let mut transform_log = TransformLog::new();
        let mut metadata = SemanticMetadata::default();

        // Simple line-by-line processing
        let mut line_number = 1;
        let mut char_pos = 0;

        for line in input_str.lines() {
            let _line_start = char_pos;
            let formatted_line = self.normalize_line(
                line,
                dialect.clone(),
                &config,
                &mut source_map,
                &mut transform_log,
                &mut metadata,
                line_number,
                char_pos,
            )?;

            output.push_str(&formatted_line);
            if line_number < input_str.lines().count() {
                output.push('\n');
            }

            char_pos += line.len() + 1; // +1 for newline
            line_number += 1;
        }

        let canonical_hash = blake3::hash(output.as_bytes()).into();

        Ok(FormattedSource {
            text: Cow::Owned(output),
            source_map: source_map.build(),
            metadata,
            canonical_hash,
            transforms: transform_log,
        })
    }

    #[allow(clippy::too_many_arguments)]
    fn normalize_line(
        &mut self,
        line: &str,
        dialect: ShellDialect,
        config: &FormatConfig,
        source_map: &mut SourceMapBuilder,
        transform_log: &mut TransformLog,
        metadata: &mut SemanticMetadata,
        line_number: usize,
        line_start: usize,
    ) -> crate::Result<String> {
        let mut output = String::with_capacity(line.len());
        let mut chars = line.char_indices().peekable();

        while let Some((pos, ch)) = chars.next() {
            let absolute_pos = line_start + pos;

            match ch {
                // Handle whitespace
                ' ' | '\t' => {
                    self.normalize_whitespace(
                        &mut chars,
                        &mut output,
                        source_map,
                        transform_log,
                        absolute_pos,
                    )?;
                }

                // Handle comments
                '#' => {
                    if config.preserve_whitespace || self.config.preserve_comments {
                        let comment = self.extract_comment(&mut chars, pos, line)?;
                        output.push_str(&comment);

                        metadata.comments.push(CommentMetadata {
                            content: comment.clone(),
                            start_pos: absolute_pos,
                            end_pos: absolute_pos + comment.len(),
                            line: line_number,
                            column: pos,
                        });
                    } else {
                        // Still need to consume the character if not preserving
                        output.push(ch);
                    }
                }

                // Handle variable expansion
                '$' => {
                    self.normalize_expansion(
                        &mut chars,
                        &mut output,
                        source_map,
                        transform_log,
                        absolute_pos,
                        dialect.clone(),
                    )?;
                }

                // Handle quotes
                '\'' | '"' => {
                    self.normalize_quoted_string(
                        ch,
                        &mut chars,
                        &mut output,
                        source_map,
                        absolute_pos,
                    )?;
                }

                // Copy other characters verbatim
                _ => {
                    output.push(ch);
                    source_map.add_char_mapping(
                        CharPos(absolute_pos),
                        CharPos(line_start + output.len() - 1),
                    );
                }
            }
        }

        Ok(output)
    }

    fn normalize_whitespace(
        &mut self,
        chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
        output: &mut String,
        source_map: &mut SourceMapBuilder,
        transform_log: &mut TransformLog,
        start_pos: usize,
    ) -> crate::Result<()> {
        let context = self
            .ws_stack
            .last()
            .copied()
            .unwrap_or(WhitespaceContext::Command);

        // Consume all consecutive whitespace
        let mut whitespace_chars = 1; // We already found one
        while let Some((_, ch)) = chars.peek() {
            if ch.is_whitespace() && *ch != '\n' {
                chars.next();
                whitespace_chars += 1;
            } else {
                break;
            }
        }

        // Apply normalization based on context
        let normalized = match context {
            WhitespaceContext::Command => " ",   // Single space
            WhitespaceContext::Arithmetic => "", // No whitespace
            WhitespaceContext::QuotedString { .. } => {
                // Preserve original whitespace in quoted strings
                return Ok(()); // Skip normalization
            }
            _ => " ", // Default to single space
        };

        if whitespace_chars > 1 || (!normalized.is_empty() && whitespace_chars == 0) {
            // Record the transformation
            let transform = Transform::WhitespaceNormalize {
                context,
                preserved: IntervalSet::new(),
            };

            transform_log.add_entry(TransformEntry {
                id: TransformId::new(),
                transform,
                source_span: Span::new(BytePos(start_pos), BytePos(start_pos + whitespace_chars)),
                result_span: Span::new(
                    BytePos(output.len()),
                    BytePos(output.len() + normalized.len()),
                ),
                timestamp: std::time::Instant::now(),
                proof: None,
                semantic_delta: None,
            });
        }

        output.push_str(normalized);

        // Add mapping for the whitespace range
        source_map.add_range_mapping(
            CharPos(start_pos),
            CharPos(start_pos + whitespace_chars),
            CharPos(output.len() - normalized.len()),
            CharPos(output.len()),
        );

        Ok(())
    }

    fn normalize_expansion(
        &mut self,
        chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
        output: &mut String,
        _source_map: &mut SourceMapBuilder,
        transform_log: &mut TransformLog,
        start_pos: usize,
        _dialect: ShellDialect,
    ) -> crate::Result<()> {
        // Check if we need to add quotes
        let context = self
            .ws_stack
            .last()
            .copied()
            .unwrap_or(WhitespaceContext::Command);

        let needs_quotes = matches!(context, WhitespaceContext::Command);

        if let Some((_, '{')) = chars.peek() {
            // ${var} form - copy as is
            output.push('$');
            output.push('{');
            chars.next();

            for (_, ch) in chars.by_ref() {
                output.push(ch);
                if ch == '}' {
                    break;
                }
            }
        } else {
            // $var form - might need quoting
            let var_start = output.len();
            let mut var_name = String::new();

            while let Some((_, ch)) = chars.peek() {
                if ch.is_alphanumeric() || *ch == '_' {
                    var_name.push(*ch);
                    chars.next();
                } else {
                    break;
                }
            }

            if needs_quotes && !var_name.is_empty() {
                output.push('"');
                output.push('$');
                output.push_str(&var_name);
                output.push('"');

                // Record quote expansion transform
                let transform = Transform::QuoteExpansion {
                    kind: QuoteKind::Double,
                    reason: QuoteReason::WordSplitting,
                    proof: SexprProof::new(format!(
                        "(= (word-split ${var_name}) (word-split \"${var_name}\"))"
                    )),
                };

                transform_log.add_entry(TransformEntry {
                    id: TransformId::new(),
                    transform,
                    source_span: Span::new(
                        BytePos(start_pos),
                        BytePos(start_pos + 1 + var_name.len()),
                    ),
                    result_span: Span::new(BytePos(var_start), BytePos(output.len())),
                    timestamp: std::time::Instant::now(),
                    proof: None,
                    semantic_delta: None,
                });
            } else {
                output.push('$');
                output.push_str(&var_name);
            }
        }

        Ok(())
    }

    fn normalize_quoted_string(
        &mut self,
        quote_char: char,
        chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
        output: &mut String,
        _source_map: &mut SourceMapBuilder,
        _start_pos: usize,
    ) -> crate::Result<()> {
        output.push(quote_char);

        // Push quoted string context
        let quote_type = match quote_char {
            '\'' => QuoteType::Single,
            '"' => QuoteType::Double,
            _ => QuoteType::Double,
        };

        self.ws_stack
            .push(WhitespaceContext::QuotedString { quote_type });

        // Copy quoted content preserving whitespace
        while let Some((_, ch)) = chars.next() {
            output.push(ch);

            if ch == quote_char {
                break;
            }

            // Handle escape sequences
            if ch == '\\' {
                if let Some((_, escaped)) = chars.next() {
                    output.push(escaped);
                }
            }
        }

        // Pop quoted string context
        self.ws_stack.pop();

        Ok(())
    }

    fn extract_comment(
        &self,
        chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
        start_pos: usize,
        line: &str,
    ) -> crate::Result<String> {
        // Extract comment from current position to end of line
        let comment = line[start_pos..].to_string();

        // Consume all remaining characters since they're part of the comment
        while chars.next().is_some() {}

        Ok(comment)
    }

    fn has_proper_quoting(&self, input: &str) -> bool {
        // Simple check for proper variable quoting
        // This is a heuristic - proper implementation would need full parsing
        let mut in_quotes = false;
        let mut quote_char = '\0';
        let chars = input.chars();

        for ch in chars {
            match ch {
                '\'' | '"' if !in_quotes => {
                    in_quotes = true;
                    quote_char = ch;
                }
                c if in_quotes && c == quote_char => {
                    in_quotes = false;
                    quote_char = '\0';
                }
                '$' if !in_quotes => {
                    // Unquoted variable - not canonical
                    return false;
                }
                _ => {}
            }
        }

        true
    }
}

impl Default for NormalizationEngine {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
#[path = "engine_tests_engine_creat.rs"]
mod tests_extracted;