langcodec 0.12.0

//! Support for Apple `.strings` localization format.
//!
//! Provides parsing, serialization, and conversion to/from the internal `Resource` model.

use std::collections::HashMap;
use std::fs::File;
// Keep imports minimal: the `Read` trait is invoked through fully qualified calls below.
use std::path::Path;

use indoc::indoc;

use crate::{
    error::Error,
    traits::Parser,
    types::{Entry, EntryStatus, Metadata, Resource, Translation},
};

/// Represents an Apple `.strings` localization file.
///
/// The format consists of a set of key-value pairs, with optional comments.
/// Pairs are stored in a `Vec`, in the order in which they were added.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Format {
    /// Language code for this resource, if known.
    ///
    /// When parsing, this is taken from a `//: Language: xx` header comment if
    /// one is found and is left empty otherwise.
    pub language: String,
    /// All key-value pairs (and optional comments) in the file.
    pub pairs: Vec<Pair>,
}

impl Parser for Format {
    /// Creates a new `Format` instance with the specified language and pairs.
    ///
    /// The `language` parameter would be empty, since the .strings format does
    /// not contain any metadata about the language.
    fn from_reader<R: std::io::BufRead>(reader: R) -> Result<Self, Error> {
        // Read entire input into a string (UTF-8 expected here; UTF-16 handled in read_from)
        let mut reader = reader;
        let mut bytes = Vec::new();
        std::io::Read::read_to_end(&mut reader, &mut bytes).map_err(Error::Io)?;
        let content = String::from_utf8(bytes)
            .map_err(|_| Error::InvalidResource("Invalid UTF-8 in .strings file".to_string()))?;

        // Parse content
        let header_language = extract_header_language(&content).unwrap_or_default();
        let (pairs, _warnings) = parse_strings_content(&content);
        Ok(Format {
            language: header_language,
            pairs,
        })
    }

    fn to_writer<W: std::io::Write>(&self, mut writer: W) -> Result<(), Error> {
        let mut content = String::new();

        let header = format!(
            indoc! {"
            // This file is automatically generated by langcodec.
            // Do not edit it manually, as your changes will be overwritten.
            // Here's the basic information about the file which could be useful
            // for translators, and langcodec would use it to generate the
            // appropriate metadata for the resource.
            //
            //: Language: {}
            //

            "},
            self.language
        );

        content.push_str(&header);

        for pair in &self.pairs {
            if let Some(comment) = &pair.comment {
                let trimmed = comment.trim_end_matches(['\n', '\r']);
                content.push_str(trimmed);
                content.push('\n');
            }

            let key = escape_strings_token(&pair.key);
            let value = escape_strings_token(&pair.value);
            content.push_str(&format!("\"{}\" = \"{}\";\n", key, value));
        }

        writer.write_all(content.as_bytes()).map_err(Error::Io)
    }

    /// Override default file reading to support BOM-aware decoding (e.g., UTF-16 Apple .strings)
    fn read_from<P: AsRef<Path>>(path: P) -> Result<Self, Error>
    where
        Self: Sized,
    {
        let file = File::open(path).map_err(Error::Io)?;
        // Auto-detect BOM, decode to UTF-8; passthrough UTF-8
        let mut decoder = encoding_rs_io::DecodeReaderBytesBuilder::new()
            .bom_override(true)
            .build(file);

        let mut decoded_bytes = Vec::new();
        std::io::Read::read_to_end(&mut decoder, &mut decoded_bytes).map_err(Error::Io)?;
        let decoded = String::from_utf8(decoded_bytes)
            .map_err(|_| Error::InvalidResource("Invalid UTF-8 in .strings file".to_string()))?;
        Self::from_str(&decoded)
    }
}

impl From<Format> for Resource {
    fn from(value: Format) -> Self {
        Resource {
            metadata: Metadata {
                language: value.language,
                domain: String::from(""),
                custom: HashMap::new(),
            },
            entries: value.pairs.into_iter().map(Pair::into_entry).collect(),
        }
    }
}

impl TryFrom<Resource> for Format {
    type Error = Error;

    /// Converts a generic resource into a `.strings` file.
    ///
    /// # Errors
    ///
    /// Fails with the first error produced while converting an entry into a
    /// [`Pair`].
    fn try_from(value: Resource) -> Result<Self, Self::Error> {
        let mut pairs = Vec::with_capacity(value.entries.len());
        for entry in value.entries {
            pairs.push(Pair::try_from(entry)?);
        }
        Ok(Format {
            language: value.metadata.language,
            pairs,
        })
    }
}

/// A single key-value pair in a `.strings` file, possibly with an associated comment.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Pair {
    /// The key for this localization entry.
    pub key: String,
    /// The value for this localization entry.
    pub value: String,
    /// Optional comment associated with the key-value pair.
    ///
    /// Only a comment that precedes a key-value pair is attached to it; if several
    /// comments precede the same pair, only the last one is kept.
    /// A trailing comment on the same line as a key-value pair
    /// (e.g., `"key" = "value"; // comment`) is never attached to that pair.
    ///
    /// Both the `// comment` and `/* comment */` forms are recognized.
    /// The comment marker is included in the comment field.
    pub comment: Option<String>,
}

impl Pair {
    /// Converts this pair into an [`Entry`] holding a singular translation.
    ///
    /// A pair with an empty value is reported as `EntryStatus::New`; any other
    /// pair is reported as `EntryStatus::Translated`.
    fn into_entry(self) -> Entry {
        let status = if self.value.is_empty() {
            EntryStatus::New
        } else {
            EntryStatus::Translated
        };

        Entry {
            id: self.key,
            value: Translation::Singular(self.value),
            comment: self.comment,
            status,
            custom: HashMap::new(),
        }
    }
}

// ----------------------
// Internal helpers
// ----------------------

/// Parses the text of a `.strings` file into its key-value pairs.
///
/// The scanner walks the input byte by byte and recognizes, in this order:
/// the generated langcodec header (only before the first pair), `//` and
/// `/* */` comments, and `"key" = "value";` statements. Anything else is
/// discarded up to the end of its line.
///
/// The most recent comment seen before a pair is attached to that pair. A
/// comment that trails a pair on the same line is dropped rather than being
/// carried over to the following pair.
///
/// Returns the parsed pairs together with a list of warnings. No warnings are
/// produced at the moment, so the second element is always empty.
fn parse_strings_content(content: &str) -> (Vec<Pair>, Vec<String>) {
    let bytes = content.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;
    let mut pairs: Vec<Pair> = Vec::new();
    let warnings: Vec<String> = Vec::new();
    let mut pending_comment: Option<String> = None;
    let mut have_seen_pair = false;

    while i < len {
        i = skip_whitespace(bytes, i).0;
        if i >= len {
            break;
        }

        // If we're at the top of the file (no pairs yet), detect and skip the auto-generated header
        if !have_seen_pair && let Some(next_i) = try_skip_langcodec_header(bytes, i) {
            i = next_i;
            pending_comment = None;
            continue;
        }

        // Comments: remember the latest one so it can be attached to the next pair.
        if starts_with(bytes, i, b"//") {
            let (next_i, comment) = parse_line_comment(bytes, i);
            pending_comment = Some(comment);
            i = next_i;
            continue;
        }
        if starts_with(bytes, i, b"/*") {
            let (next_i, comment) = parse_block_comment(bytes, i);
            pending_comment = Some(comment);
            i = next_i;
            continue;
        }

        // Key-Value pair: "key" = "value";
        if let Some((after_key, key)) = parse_quoted_utf8(content, bytes, i) {
            // Whitespace around '=' may include newlines, but the cursor is only
            // committed once the whole statement has been recognized.
            let eq_pos = skip_inline_ws(bytes, after_key).0;
            if eq_pos < len && bytes[eq_pos] == b'=' {
                let value_pos = skip_inline_ws(bytes, eq_pos + 1).0;
                if let Some((after_value, value_raw)) =
                    parse_quoted_utf8(content, bytes, value_pos)
                {
                    i = skip_statement_tail(bytes, after_value);
                    pairs.push(Pair {
                        key,
                        value: normalize_value_newlines(&value_raw),
                        comment: pending_comment.take(),
                    });
                    have_seen_pair = true;
                    continue;
                }
            }
            // Malformed statement: resume right after the key so that only the
            // remainder of the key's own line is discarded below, not the next line.
            i = after_key;
        }

        // If we reach here, consume until next newline to avoid infinite loop
        while i < len && bytes[i] != b'\n' {
            i += 1;
        }
        // newline will be skipped on next iteration
    }

    (pairs, warnings)
}

/// Consumes what follows a parsed value: the `;` terminator and an optional
/// trailing comment that starts on the same line.
///
/// The terminator search never crosses a newline, so a statement with a
/// missing `;` cannot swallow the statement on the following line. Returns the
/// index at which scanning should resume.
fn skip_statement_tail(bytes: &[u8], mut i: usize) -> usize {
    let len = bytes.len();
    // Spaces, tabs, form feeds and carriage returns, but not '\n'.
    let skip_horizontal = |mut k: usize| {
        while k < len && matches!(bytes[k], b' ' | b'\t' | 0x0C | b'\r') {
            k += 1;
        }
        k
    };

    i = skip_horizontal(i);
    if i < len && bytes[i] == b';' {
        i += 1;
    } else if !(starts_with(bytes, i, b"//") || starts_with(bytes, i, b"/*")) {
        // Tolerate junk before the terminator, but only on the same line.
        while i < len && bytes[i] != b';' && bytes[i] != b'\n' {
            i += 1;
        }
        if i < len && bytes[i] == b';' {
            i += 1;
        }
    }

    // A comment on the same line belongs to no pair: skip it so it does not
    // become the pending comment of the next pair.
    i = skip_horizontal(i);
    if starts_with(bytes, i, b"//") {
        i = parse_line_comment(bytes, i).0;
    } else if starts_with(bytes, i, b"/*") {
        i = parse_block_comment(bytes, i).0;
    }
    i
}

/// Returns `true` when `needle` occurs in `hay` at byte offset `i`.
///
/// An offset past the end of `hay` yields `false` instead of panicking.
fn starts_with(hay: &[u8], i: usize, needle: &[u8]) -> bool {
    match hay.get(i..) {
        Some(rest) => rest.starts_with(needle),
        None => false,
    }
}

/// Advances past spaces, tabs, form feeds, carriage returns and newlines
/// starting at `i`.
///
/// Returns the index of the first byte that is not whitespace (or the input
/// index unchanged if it is already past the end) and whether at least one
/// `\n` was skipped.
fn skip_whitespace(bytes: &[u8], mut i: usize) -> (usize, bool) {
    let mut saw_newline = false;
    while let Some(&byte) = bytes.get(i) {
        if byte == b'\n' {
            saw_newline = true;
        } else if !matches!(byte, b' ' | b'\t' | 0x0C | b'\r') {
            break;
        }
        i += 1;
    }
    (i, saw_newline)
}

/// Advances past whitespace between the tokens of a statement.
///
/// Despite the name, newlines are skipped as well as spaces, tabs, form feeds
/// and carriage returns. Returns the index of the first non-whitespace byte
/// and whether at least one `\n` was skipped.
fn skip_inline_ws(bytes: &[u8], i: usize) -> (usize, bool) {
    let rest = bytes.get(i..).unwrap_or(&[]);
    let run = rest
        .iter()
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0C | 0x0D | b'\n'))
        .count();
    let saw_newline = rest[..run].contains(&b'\n');
    (i + run, saw_newline)
}

/// Reads a `//` comment that starts at `i` and runs to the end of the line.
///
/// Returns the index of the terminating `\n` (or the input length when the
/// comment is the last line) and the comment text, marker included. The
/// newline itself is neither consumed nor part of the text.
fn parse_line_comment(bytes: &[u8], i: usize) -> (usize, String) {
    let end = bytes[i..]
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(bytes.len(), |offset| i + offset);
    let text = String::from_utf8_lossy(&bytes[i..end]).into_owned();
    (end, text)
}

/// Reads a `/* ... */` comment that starts at `i`.
///
/// Returns the index just past the closing `*/` and the comment text, both
/// markers included. The closing marker is searched for after the opening
/// `/*`, so `/*/` does not count as a complete comment.
///
/// An unterminated comment extends to the end of the input: the returned index
/// is `bytes.len()` and the text contains every remaining byte.
fn parse_block_comment(bytes: &[u8], i: usize) -> (usize, String) {
    let body_start = (i + 2).min(bytes.len());
    let end = bytes[body_start..]
        .windows(2)
        .position(|window| window == b"*/")
        // No terminator: take everything that is left instead of stopping one byte short.
        .map_or(bytes.len(), |offset| body_start + offset + 2);
    let comment = String::from_utf8_lossy(&bytes[i..end]).into_owned();
    (end, comment)
}

// Detect and skip the standard langcodec header block at the start of the file.
//
// The header is a run of `//` comment lines, at least one of which starts with
// `//:` (e.g. `//: Language: en`). Blank lines are tolerated until that marker
// line has been seen; after it, the first blank line ends the header. This
// keeps a `//` comment that documents the first key-value pair from being
// swallowed as part of the header, since the writer separates the two with a
// blank line.
//
// Returns Some(new_index) if a header was skipped, or None otherwise. The
// returned index points at the first byte after the header and any whitespace
// following it.
fn try_skip_langcodec_header(bytes: &[u8], mut i: usize) -> Option<usize> {
    let start = i;
    let mut saw_header_marker = false;
    while i < bytes.len() {
        // Skip whitespace, counting newlines so that blank lines can be detected.
        let mut newlines = 0usize;
        while let Some(&byte) = bytes.get(i) {
            match byte {
                b'\n' => newlines += 1,
                b' ' | b'\t' | 0x0C | b'\r' => {}
                _ => break,
            }
            i += 1;
        }
        if i >= bytes.len() {
            break;
        }
        // One newline merely ends the previous comment line; two or more mean
        // there was a blank line in between.
        if saw_header_marker && newlines >= 2 {
            break;
        }

        let rest = &bytes[i..];
        if !rest.starts_with(b"//") {
            break;
        }
        if rest.starts_with(b"//:") {
            saw_header_marker = true;
        }
        // Consume to end of line; the newline is handled by the next iteration.
        i += rest
            .iter()
            .position(|&byte| byte == b'\n')
            .unwrap_or(rest.len());
    }
    (saw_header_marker && i > start).then_some(i)
}

/// Looks for a `//: Language: xx` header line and returns the language code.
///
/// Only the first 50 lines are inspected. Both `//:` and `// :` are accepted
/// as the marker, leading whitespace is ignored, and a line whose language
/// part is empty is skipped in favor of a later one.
fn extract_header_language(content: &str) -> Option<String> {
    content.lines().take(50).find_map(|line| {
        let line = line.trim_start();
        let after_marker = line
            .strip_prefix("//:")
            .or_else(|| line.strip_prefix("// :"))?;
        let language = after_marker.trim_start().strip_prefix("Language:")?.trim();
        (!language.is_empty()).then(|| language.to_string())
    })
}

// Reads a double-quoted string whose opening quote sits at byte index `i`.
//
// Returns (byte index just past the closing quote, text between the quotes).
// The text is copied verbatim from `source`: backslashes, escape sequences and
// non-ASCII characters are left untouched. A quote preceded by an odd number
// of backslashes is escaped and does not end the string.
//
// Returns None when `i` does not point at a quote or no closing quote exists.
// `bytes` is expected to be the byte view of `source`.
fn parse_quoted_utf8(source: &str, bytes: &[u8], i: usize) -> Option<(usize, String)> {
    if bytes.get(i) != Some(&b'"') {
        return None;
    }
    let start = i + 1;
    // True while an odd number of backslashes immediately precedes the cursor.
    let mut escaped = false;
    for (offset, &byte) in bytes[start..].iter().enumerate() {
        match byte {
            b'\\' => escaped = !escaped,
            b'"' if !escaped => {
                let end = start + offset;
                return Some((end + 1, source[start..end].to_string()));
            }
            _ => escaped = false,
        }
    }
    None
}

/// Collapses a value that spans several source lines into a single line.
///
/// When `raw` contains a literal newline, every newline becomes the two
/// characters `\n` and every literal tab becomes `\t`; leading spaces on each
/// line are kept as they are. A value without a newline is returned unchanged,
/// including any literal tabs it contains.
fn normalize_value_newlines(raw: &str) -> String {
    if raw.contains('\n') {
        raw.split('\n')
            .map(|line| line.replace('\t', "\\t"))
            .collect::<Vec<_>>()
            .join("\\n")
    } else {
        raw.to_owned()
    }
}

/// Escapes a key or value so it can be written between double quotes.
///
/// The input is treated as text that may already contain `.strings` escape
/// sequences:
///
/// - a bare `"` becomes `\"` and a literal newline becomes `\n`;
/// - a run of backslashes followed by `'`, `n`, `t` or `r` is an existing
///   escape sequence and is written unchanged;
/// - a run followed by `"` is written unchanged when its length is odd (the
///   last backslash already escapes the quote); when it is even the quote is
///   unescaped, so one backslash is added to keep the quote from terminating
///   the string in the output;
/// - a run followed by anything else, or ending the input, is literal and has
///   each backslash doubled. The following character then goes through the
///   normal rules, so a literal newline after such a run is still written as
///   `\n`.
fn escape_strings_token(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\\' => {
                // Measure the whole run of consecutive backslashes.
                let mut run = 1usize;
                while chars.next_if_eq(&'\\').is_some() {
                    run += 1;
                }

                let backslashes = match chars.peek().copied() {
                    // Odd run: the quote is already escaped.
                    Some('"') if run % 2 == 1 => run,
                    // Even run: the quote itself still needs escaping.
                    Some('"') => run + 1,
                    Some('\'' | 'n' | 't' | 'r') => run,
                    // Literal backslashes: double each one.
                    Some(_) | None => run * 2,
                };
                for _ in 0..backslashes {
                    out.push('\\');
                }

                // The escaped character is emitted together with its run; any
                // other character is left for the next loop iteration.
                if let Some(escaped) =
                    chars.next_if(|c| matches!(c, '"' | '\'' | 'n' | 't' | 'r'))
                {
                    out.push(escaped);
                }
            }
            other => out.push(other),
        }
    }
    out
}

impl TryFrom<Entry> for Pair {
    type Error = Error;

    fn try_from(entry: Entry) -> Result<Self, Self::Error> {
        // Strings format only supports singular translations. Preserve the value verbatim.
        match entry.value {
            Translation::Empty => Ok(Pair {
                key: entry.id,
                value: String::new(),
                comment: entry.comment,
            }),
            Translation::Singular(value) => Ok(Pair {
                key: entry.id,
                value: crate::placeholder::to_ios_placeholders(&value),
                comment: entry.comment,
            }),
            Translation::Plural(_) => Err(Error::DataMismatch(
                "Plural translations are not supported in .strings format".to_string(),
            )),
        }
    }
}

impl From<Pair> for Entry {
    fn from(pair: Pair) -> Self {
        let is_pair_value_empty = pair.value.is_empty();
        Entry {
            id: pair.key,
            value: Translation::Singular(pair.value),
            comment: pair.comment,
            status: if is_pair_value_empty {
                EntryStatus::New
            } else {
                EntryStatus::Translated
            },
            custom: HashMap::new(),
        }
    }
}

impl Pair {
    /// Returns the comment without its comment marker, trimmed of surrounding
    /// whitespace.
    ///
    /// - `/* text */` yields `text`; both markers must be present and must not
    ///   overlap, so a degenerate comment such as `/*/` is not treated as a
    ///   block comment (and no longer causes a panic).
    /// - `// text` yields `text`.
    /// - Anything else is returned trimmed but otherwise unchanged.
    ///
    /// Returns an empty string when the pair has no comment.
    pub fn formatted_comment(&self) -> String {
        let Some(comment) = self.comment.as_deref() else {
            return String::new();
        };

        // `strip_prefix` followed by `strip_suffix` cannot produce an inverted
        // range, unlike slicing with `[2..len - 2]`.
        let body = comment
            .strip_prefix("/*")
            .and_then(|rest| rest.strip_suffix("*/"))
            .or_else(|| comment.strip_prefix("//"))
            .unwrap_or(comment);
        body.trim().to_string()
    }
}

// Unit tests for parsing, serialization and conversion of `.strings` content.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::traits::Parser;

    // A block comment directly above a pair is attached to that pair.
    #[test]
    fn test_parse_basic_strings_with_comment() {
        let content = r#"
        /* Greeting for the user */
        "hello" = "Hello, world!";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 1);
        let pair = &parsed.pairs[0];
        assert_eq!(pair.key, "hello");
        assert_eq!(pair.value, "Hello, world!");
        assert!(
            pair.comment
                .as_ref()
                .unwrap()
                .contains("Greeting for the user")
        );
    }

    // Writing a parsed file and parsing the output again yields the same keys and values.
    #[test]
    fn test_round_trip_serialization() {
        let content = r#"
        /* Farewell */
        "bye" = "Goodbye!";
        "#;
        let parsed = Format::from_str(content).unwrap();
        let mut output = Vec::new();
        parsed.to_writer(&mut output).unwrap();
        let output_str = String::from_utf8(output).unwrap();
        // Parse again and compare key-value pairs
        let reparsed = Format::from_str(&output_str).unwrap();
        assert_eq!(parsed.pairs.len(), reparsed.pairs.len());
        for (orig, new) in parsed.pairs.iter().zip(reparsed.pairs.iter()) {
            assert_eq!(orig.key, new.key);
            assert_eq!(orig.value, new.value);
        }
    }

    // Literal quotes, backslashes and newlines in keys and values are escaped on output.
    #[test]
    fn test_strings_writer_escapes_quotes_backslashes_and_newlines() {
        let format = Format {
            language: String::new(),
            pairs: vec![Pair {
                key: "greet\"key\\with\nline".to_string(),
                value: "He said: \"hi\"\\and newline\n".to_string(),
                comment: None,
            }],
        };
        let mut out = Vec::new();
        format.to_writer(&mut out).unwrap();
        let out_str = String::from_utf8(out).unwrap();
        // Ensure escapes are present
        assert!(out_str.contains("\"greet\\\"key\\\\with\\nline\""));
        assert!(out_str.contains("\"He said: \\\"hi\\\"\\\\and newline\\n\""));
    }

    // Escaped apostrophes are kept verbatim when parsing and when writing.
    #[test]
    fn test_unescape_minimal_apostrophe_and_backslash() {
        let content = r#"
        "key1" = "Can\'t accept";
        "key2" = "Can\\'t accept";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 2);
        assert_eq!(parsed.pairs[0].value, r#"Can\'t accept"#);
        assert_eq!(parsed.pairs[1].value, r#"Can\\'t accept"#);

        // Writing back should not introduce extra backslashes before apostrophes
        let mut out = Vec::new();
        parsed.to_writer(&mut out).unwrap();
        let out_str = String::from_utf8(out).unwrap();
        assert!(out_str.contains(r#""key1" = "Can\'t accept";"#));
        assert!(out_str.contains(r#""key2" = "Can\\'t accept";"#));
    }

    // Converting a `Resource` into a `Format` rewrites `%s`-style placeholders to `%@`.
    #[test]
    fn test_strings_writer_ios_placeholder_conversion() {
        // Build a Resource with Android-style placeholders and ensure writer converts to iOS style
        let resource = Resource {
            metadata: Metadata {
                language: "en".to_string(),
                domain: String::new(),
                custom: HashMap::new(),
            },
            entries: vec![Entry {
                id: "g".to_string(),
                value: Translation::Singular("Hi %1$s and %s".to_string()),
                comment: None,
                status: EntryStatus::Translated,
                custom: HashMap::new(),
            }],
        };
        let fmt = Format::try_from(resource).unwrap();
        assert_eq!(fmt.pairs.len(), 1);
        assert_eq!(fmt.pairs[0].value, "Hi %1$@ and %@");
    }

    // A value spanning several source lines is joined into one line.
    #[test]
    fn test_multiline_value_with_embedded_newlines_and_whitespace() {
        let content = r#"
        /* Multiline value */
        "multiline" = "This is line 1.
            \t\tThis is line 2.
            This is line 3.";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 1);
        let pair = &parsed.pairs[0];
        assert_eq!(pair.key, "multiline");
        // Lines should be joined with an escaped \n; leading spaces on each line are preserved
        assert_eq!(
            pair.value,
            "This is line 1.\\n            \\t\\tThis is line 2.\\n            This is line 3."
        );
    }

    // Literal tabs inside a multi-line value are rewritten as the `\t` escape sequence.
    #[test]
    fn test_multiline_value_with_tabs_and_embedded_newlines() {
        let content =
            "\"multiline\" = \"This is line 1.\n\t\tThis is line\n\t\t\t2.This is line\n3.\";";
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 1);
        let pair = &parsed.pairs[0];
        assert_eq!(pair.key, "multiline");
        assert_eq!(
            pair.value,
            r#"This is line 1.\n\t\tThis is line\n\t\t\t2.This is line\n3."#
        );
        assert!(pair.comment.is_none());
    }

    // Blank lines and lines that are not valid statements are skipped without failing.
    #[test]
    fn test_blank_lines_and_ignored_malformed_lines() {
        let content = r#"

        // Comment

        "good" = "yes";
        bad line without equals
        "another" = "ok";

        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 2);
        assert_eq!(parsed.pairs[0].key, "good");
        assert_eq!(parsed.pairs[0].value, "yes");
        assert_eq!(parsed.pairs[1].key, "another");
        assert_eq!(parsed.pairs[1].value, "ok");
    }

    // An empty value parses successfully and maps to the `New` entry status.
    #[test]
    fn test_entry_with_empty_value() {
        let content = r#"
        /* Empty value */
        "empty" = "";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 1);
        let pair = &parsed.pairs[0];
        assert_eq!(pair.key, "empty");
        assert_eq!(pair.value, "");
        // Should be marked as New status in Entry
        let entry = pair.clone().into_entry();
        assert_eq!(entry.status, EntryStatus::New);
    }

    // Trailing spaces inside the quotes are part of the value, including after non-ASCII text.
    #[test]
    fn test_preserve_trailing_spaces() {
        let content = r#"
        "key1" = "Value with trailing space ";
        "key2" = "Another value with trailing spaces   ";
        "key3" = "No trailing spaces";
        "key4" = "过去一天 ";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 4);

        let pair1 = &parsed.pairs[0];
        let pair2 = &parsed.pairs[1];
        let pair3 = &parsed.pairs[2];
        let pair4 = &parsed.pairs[3];

        assert_eq!(pair1.value, "Value with trailing space ");
        assert_eq!(pair2.value, "Another value with trailing spaces   ");
        assert_eq!(pair3.value, "No trailing spaces");
        assert_eq!(pair4.value, "过去一天 ");
    }

    // Each comment is attached to the pair that follows it, for both comment styles.
    #[test]
    fn test_comments_attached_to_correct_key_value_pairs() {
        let content = r#"
        // Comment for A
        "A" = "a";
        // Comment for B
        "B" = "b";
        /* Block comment for C */
        "C" = "c";
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 3);
        let a = &parsed.pairs[0];
        let b = &parsed.pairs[1];
        let c = &parsed.pairs[2];
        assert!(a.comment.as_ref().unwrap().contains("Comment for A"));
        assert!(b.comment.as_ref().unwrap().contains("Comment for B"));
        assert!(c.comment.as_ref().unwrap().contains("Block comment for C"));
    }

    // A pair with an empty value followed by a same-line trailing comment still parses.
    #[test]
    fn test_parse_strings_with_empty_value() {
        let content = r#"
        // String

        "PlayConsumed" = "%.2fMB traffic will be consumed if you play it";
        "Score" = "%@ reviews";
        "Wan" = "";//英文逻辑不一样，为空就好
        "#;
        let parsed = Format::from_str(content).unwrap();
        assert_eq!(parsed.pairs.len(), 3);
    }
}