Skip to main content

reovim_module_codec_utf8/
codec.rs

1//! UTF-8 content codec.
2//!
3//! Handles BOM detection/stripping, CRLF normalization, and
4//! round-trip encoding that preserves original BOM and line endings.
5
6use reovim_driver_codec::{CodecError, CodecMetadata, ContentCodec, ContentType, DecodeResult};
7
/// Byte order mark that may prefix UTF-8 encoded data (`EF BB BF`).
const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";

/// Metadata key under which the presence of a BOM is recorded.
pub const META_BOM: &str = "bom";

/// Metadata key under which the original line ending style is recorded.
pub const META_LINE_ENDING: &str = "line_ending";

/// Value stored under [`META_LINE_ENDING`] for CRLF (`"\r\n"`) line endings.
pub const LINE_ENDING_CRLF: &str = "crlf";

/// Value stored under [`META_LINE_ENDING`] for LF (`"\n"`) line endings; the default.
pub const LINE_ENDING_LF: &str = "lf";
22
/// UTF-8 content codec.
///
/// Bidirectional codec that handles:
/// - BOM detection and stripping on decode, restoration on encode
/// - CRLF normalization to LF on decode, restoration on encode
/// - Round-trip `encode(decode(bytes)) == bytes` for input whose line
///   endings are consistently LF or consistently CRLF
///
/// The codec is a stateless unit type, so copies are free and every value
/// behaves identically.
#[derive(Debug, Clone, Copy)]
pub struct Utf8Codec;
30
31impl Utf8Codec {
32    /// Create a new UTF-8 codec.
33    #[must_use]
34    pub const fn new() -> Self {
35        Self
36    }
37}
38
39#[cfg_attr(coverage_nightly, coverage(off))]
40impl Default for Utf8Codec {
41    fn default() -> Self {
42        Self::new()
43    }
44}
45
46impl ContentCodec for Utf8Codec {
47    fn decode(&self, raw: &[u8]) -> Result<DecodeResult, CodecError> {
48        // Check for BOM
49        let (has_bom, content_bytes) = if raw.starts_with(UTF8_BOM) {
50            (true, &raw[UTF8_BOM.len()..])
51        } else {
52            (false, raw)
53        };
54
55        // Validate UTF-8
56        let text = std::str::from_utf8(content_bytes).map_err(|e| CodecError::InvalidSequence {
57            offset: e.valid_up_to() + if has_bom { UTF8_BOM.len() } else { 0 },
58            detail: format!("invalid UTF-8 at byte {}", e.valid_up_to()),
59        })?;
60
61        // Detect line ending and normalize CRLF to LF
62        let has_crlf = text.contains("\r\n");
63        let content = if has_crlf {
64            text.replace("\r\n", "\n")
65        } else {
66            text.to_string()
67        };
68
69        // Build metadata
70        let mut metadata = CodecMetadata::new(ContentType::new(ContentType::UTF8));
71        if has_bom {
72            metadata.set(META_BOM, "true");
73        }
74        if has_crlf {
75            metadata.set(META_LINE_ENDING, LINE_ENDING_CRLF);
76        } else {
77            metadata.set(META_LINE_ENDING, LINE_ENDING_LF);
78        }
79
80        Ok(DecodeResult {
81            content,
82            annotations: vec![],
83            metadata,
84            lossy: false,
85            readonly: false,
86        })
87    }
88
89    fn encode(
90        &self,
91        content: &str,
92        metadata: &CodecMetadata,
93    ) -> Option<Result<Vec<u8>, CodecError>> {
94        // Restore line endings
95        let text = if metadata.get(META_LINE_ENDING) == Some(LINE_ENDING_CRLF) {
96            content.replace('\n', "\r\n")
97        } else {
98            content.to_string()
99        };
100
101        // Restore BOM
102        let mut bytes = Vec::with_capacity(text.len() + 3);
103        if metadata.get(META_BOM) == Some("true") {
104            bytes.extend_from_slice(UTF8_BOM);
105        }
106        bytes.extend_from_slice(text.as_bytes());
107
108        Some(Ok(bytes))
109    }
110}
111
// Unit tests for this module: compiled only under `cfg(test)` and loaded
// from `codec_tests.rs` via the `#[path]` attribute.
112#[cfg(test)]
113#[path = "codec_tests.rs"]
114mod tests;