Skip to main content

ferrocat_po/
text.rs

1use std::borrow::Cow;
2
3use crate::ParseError;
4use crate::scan::{find_byte, find_escapable_byte, find_quoted_bounds, has_byte};
5use crate::utf8::{input_slice_as_str, string_from_utf8};
6
7/// Escapes a PO string literal payload.
8#[must_use]
9pub fn escape_string(input: &str) -> String {
10    let bytes = input.as_bytes();
11    let Some(first_escape) = find_escapable_byte(bytes) else {
12        return input.to_owned();
13    };
14
15    let mut out = String::with_capacity(input.len() + 8);
16    out.push_str(&input[..first_escape]);
17    escape_string_from(&mut out, input, bytes, first_escape);
18
19    out
20}
21
22pub fn escape_string_into(out: &mut String, input: &str) {
23    let bytes = input.as_bytes();
24    let Some(first_escape) = find_escapable_byte(bytes) else {
25        out.push_str(input);
26        return;
27    };
28
29    escape_string_into_known(out, input, first_escape);
30}
31
32pub fn escape_string_into_with_first_escape(
33    out: &mut String,
34    input: &str,
35    first_escape: Option<usize>,
36) {
37    let Some(first_escape) = first_escape else {
38        out.push_str(input);
39        return;
40    };
41
42    escape_string_into_known(out, input, first_escape);
43}
44
45/// Unescapes a PO string literal payload.
46///
47/// # Errors
48///
49/// Returns [`ParseError`] when the escape sequence is malformed.
50pub fn unescape_string(input: &str) -> Result<String, ParseError> {
51    let bytes = input.as_bytes();
52    if !has_byte(b'\\', bytes) {
53        return Ok(input.to_owned());
54    }
55
56    let mut out = Vec::with_capacity(input.len());
57    let mut index = 0;
58
59    while index < bytes.len() {
60        let next_escape = if let Some(relative) = find_byte(b'\\', &bytes[index..]) {
61            index + relative
62        } else {
63            out.extend_from_slice(&bytes[index..]);
64            break;
65        };
66
67        out.extend_from_slice(&bytes[index..next_escape]);
68        index = next_escape + 1;
69        if index >= bytes.len() {
70            return Err(ParseError::new("unterminated escape sequence"));
71        }
72
73        let escaped = bytes[index];
74        match escaped {
75            b'a' => out.push(b'\x07'),
76            b'b' => out.push(b'\x08'),
77            b't' => out.push(b'\t'),
78            b'n' => out.push(b'\n'),
79            b'v' => out.push(b'\x0b'),
80            b'f' => out.push(b'\x0c'),
81            b'r' => out.push(b'\r'),
82            b'\'' => out.push(b'\''),
83            b'"' => out.push(b'"'),
84            b'\\' => out.push(b'\\'),
85            b'?' => out.push(b'?'),
86            b'0'..=b'7' => {
87                let mut value = u32::from(escaped - b'0');
88                let mut consumed = 1;
89                while consumed < 3 && index + consumed < bytes.len() {
90                    let next = bytes[index + consumed];
91                    if !(b'0'..=b'7').contains(&next) {
92                        break;
93                    }
94                    value = (value * 8) + u32::from(next - b'0');
95                    consumed += 1;
96                }
97                match char::from_u32(value) {
98                    Some(ch) => push_char_bytes(&mut out, ch),
99                    None => return Err(ParseError::new("invalid octal escape value")),
100                }
101                index += consumed - 1;
102            }
103            b'x' => {
104                if index + 2 >= bytes.len() {
105                    return Err(ParseError::new("incomplete hex escape"));
106                }
107                let hi = decode_hex(bytes[index + 1])?;
108                let lo = decode_hex(bytes[index + 2])?;
109                let value = u32::from((hi << 4) | lo);
110                match char::from_u32(value) {
111                    Some(ch) => push_char_bytes(&mut out, ch),
112                    None => return Err(ParseError::new("invalid hex escape value")),
113                }
114                index += 2;
115            }
116            other => out.push(other),
117        }
118
119        index += 1;
120    }
121
122    Ok(string_from_utf8(out))
123}
124
125/// Extracts and unescapes the first quoted PO string from `line`, borrowing
126/// from the input when no escapes are present.
127///
128/// # Errors
129///
130/// Returns [`ParseError`] when the quoted content is malformed.
131pub fn extract_quoted_cow(line: &str) -> Result<Cow<'_, str>, ParseError> {
132    extract_quoted_bytes_cow(line.as_bytes())
133}
134
135pub fn extract_quoted_bytes_cow(line: &[u8]) -> Result<Cow<'_, str>, ParseError> {
136    let Some((start, end)) = find_quoted_bounds(line) else {
137        return Ok(Cow::Borrowed(""));
138    };
139
140    let raw = &line[start..end];
141    validate_quoted_content(raw)?;
142    if !has_byte(b'\\', raw) {
143        return Ok(Cow::Borrowed(bytes_to_str(raw)));
144    }
145
146    Ok(Cow::Owned(unescape_string(bytes_to_str(raw))?))
147}
148
149/// Extracts and unescapes the first quoted PO string from `line`.
150///
151/// # Errors
152///
153/// Returns [`ParseError`] when the quoted content is malformed.
154pub fn extract_quoted(line: &str) -> Result<String, ParseError> {
155    Ok(extract_quoted_bytes_cow(line.as_bytes())?.into_owned())
156}
157
158pub fn split_reference_comment(input: &str) -> Vec<Cow<'_, str>> {
159    let trimmed = input.trim();
160    if trimmed.is_empty() {
161        return vec![Cow::Borrowed("")];
162    }
163
164    let mut parts = Vec::new();
165    let mut start = None;
166    let mut isolate_depth = 0usize;
167
168    for (index, ch) in trimmed.char_indices() {
169        match ch {
170            '\u{2068}' => {
171                if start.is_none() {
172                    start = Some(index);
173                }
174                isolate_depth += 1;
175            }
176            '\u{2069}' => {
177                if start.is_none() {
178                    start = Some(index);
179                }
180                isolate_depth = isolate_depth.saturating_sub(1);
181            }
182            _ if ch.is_whitespace() && isolate_depth == 0 => {
183                if let Some(segment_start) = start.take()
184                    && segment_start < index
185                {
186                    parts.push(normalize_reference_token(&trimmed[segment_start..index]));
187                }
188            }
189            _ => {
190                if start.is_none() {
191                    start = Some(index);
192                }
193            }
194        }
195    }
196
197    if let Some(segment_start) = start
198        && segment_start < trimmed.len()
199    {
200        parts.push(normalize_reference_token(&trimmed[segment_start..]));
201    }
202
203    if parts.len() == 1 {
204        return vec![normalize_reference_token(trimmed)];
205    }
206
207    if parts.iter().all(|part| part.contains(':')) {
208        return parts;
209    }
210
211    vec![Cow::Borrowed(trimmed)]
212}
213
214pub fn validate_quoted_content(raw: &[u8]) -> Result<(), ParseError> {
215    let mut trailing_backslashes = 0usize;
216
217    for &byte in raw {
218        match byte {
219            b'\\' => trailing_backslashes += 1,
220            b'"' if trailing_backslashes % 2 == 0 => {
221                return Err(ParseError::new("unescaped quote in string literal"));
222            }
223            _ => trailing_backslashes = 0,
224        }
225    }
226
227    Ok(())
228}
229
230fn escape_string_from(out: &mut String, input: &str, bytes: &[u8], first_escape: usize) {
231    let mut start = first_escape;
232
233    loop {
234        push_escape(out, bytes[start]);
235        let next_index = start + 1;
236        let Some(relative) = find_escapable_byte(&bytes[next_index..]) else {
237            out.push_str(&input[next_index..]);
238            break;
239        };
240
241        let absolute = next_index + relative;
242        out.push_str(&input[next_index..absolute]);
243        start = absolute;
244    }
245}
246
247#[inline]
248fn escape_string_into_known(out: &mut String, input: &str, first_escape: usize) {
249    let bytes = input.as_bytes();
250    out.push_str(&input[..first_escape]);
251    escape_string_from(out, input, bytes, first_escape);
252}
253
/// Appends the two-character backslash escape for `byte` to `out`.
///
/// # Panics
///
/// Panics when `byte` is not one of the nine bytes that have an escape
/// (bell, backspace, tab, newline, vertical tab, form feed, carriage return,
/// double quote, backslash).
fn push_escape(out: &mut String, byte: u8) {
    out.push('\\');
    let mnemonic = match byte {
        0x07 => 'a',
        0x08 => 'b',
        b'\t' => 't',
        b'\n' => 'n',
        0x0b => 'v',
        0x0c => 'f',
        b'\r' => 'r',
        b'"' => '"',
        b'\\' => '\\',
        _ => unreachable!("unexpected escape byte"),
    };
    out.push(mnemonic);
}
269
270fn decode_hex(byte: u8) -> Result<u8, ParseError> {
271    match byte {
272        b'0'..=b'9' => Ok(byte - b'0'),
273        b'a'..=b'f' => Ok(byte - b'a' + 10),
274        b'A'..=b'F' => Ok(byte - b'A' + 10),
275        _ => Err(ParseError::new("invalid hex escape")),
276    }
277}
278
/// Appends the UTF-8 encoding of `ch` to `out`.
fn push_char_bytes(out: &mut Vec<u8>, ch: char) {
    match ch {
        // ASCII is a single byte equal to the code point, so skip the encoder.
        '\0'..='\x7f' => out.push(ch as u8),
        _ => {
            let mut scratch = [0u8; 4];
            let encoded = ch.encode_utf8(&mut scratch);
            out.extend_from_slice(encoded.as_bytes());
        }
    }
}
288
289fn bytes_to_str(bytes: &[u8]) -> &str {
290    input_slice_as_str(bytes)
291}
292
/// Removes the isolate characters U+2068 and U+2069 from `input`.
///
/// Borrows `input` unchanged when it contains neither character; otherwise
/// returns an owned copy with both characters filtered out.
fn normalize_reference_token(input: &str) -> Cow<'_, str> {
    let is_isolate = |ch: char| matches!(ch, '\u{2068}' | '\u{2069}');

    if input.chars().any(is_isolate) {
        Cow::Owned(input.chars().filter(|&ch| !is_isolate(ch)).collect())
    } else {
        Cow::Borrowed(input)
    }
}
305
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::{
        escape_string, escape_string_into, escape_string_into_with_first_escape, extract_quoted,
        extract_quoted_bytes_cow, extract_quoted_cow, split_reference_comment, unescape_string,
        validate_quoted_content,
    };

    /// Quotes and tabs are rewritten as backslash escapes.
    #[test]
    fn escapes_special_characters() {
        let cases = [("Say \"Hi\"", "Say \\\"Hi\\\""), ("a\tb", "a\\tb")];
        for (input, expected) in cases {
            assert_eq!(escape_string(input), expected);
        }
    }

    /// Every single-character escape decodes to its control or literal character.
    #[test]
    fn unescapes_c_sequences() {
        let escaped = "\\a\\b\\t\\n\\v\\f\\r\\'\\\"\\\\\\?";
        let expected = "\u{0007}\u{0008}\t\n\u{000b}\u{000c}\r'\"\\?";
        assert_eq!(unescape_string(escaped).as_deref(), Ok(expected));
    }

    /// Escaped quotes and backslashes inside the literal are decoded.
    #[test]
    fn extracts_and_unescapes_quoted_text() {
        let line = "msgid \"The name field must not contain characters like \\\" or \\\\\"";
        let extracted = extract_quoted(line);
        assert_eq!(
            extracted.as_deref(),
            Ok("The name field must not contain characters like \" or \\")
        );
    }

    /// A literal without escapes is returned with the expected text.
    #[test]
    fn borrows_simple_quoted_text_without_escape() {
        let extracted = extract_quoted_cow("msgid \"plain text\"");
        assert_eq!(extracted, Ok(Cow::Borrowed("plain text")));
    }

    /// Escaped output is appended after the existing buffer content.
    #[test]
    fn appends_escaped_text_into_existing_buffer() {
        let mut buffer = String::from("prefix:");
        escape_string_into(&mut buffer, "Say \"Hi\"\n");
        assert_eq!(buffer, "prefix:Say \\\"Hi\\\"\\n");
    }

    /// A caller-provided first escape index gives the same output.
    #[test]
    fn appends_escaped_text_into_existing_buffer_with_known_escape() {
        let mut buffer = String::from("prefix:");
        escape_string_into_with_first_escape(&mut buffer, "Say \"Hi\"\n", Some(4));
        assert_eq!(buffer, "prefix:Say \\\"Hi\\\"\\n");
    }

    /// `None` as the escape index means the input is copied verbatim.
    #[test]
    fn appends_plain_text_when_no_escape_index_is_known() {
        let mut buffer = String::from("prefix:");
        escape_string_into_with_first_escape(&mut buffer, "plain", None);
        assert_eq!(buffer, "prefix:plain");
    }

    /// The byte-slice entry point extracts plain quoted text.
    #[test]
    fn extracts_quoted_text_from_bytes() {
        let extracted = extract_quoted_bytes_cow(br#"msgid "byte path""#);
        assert_eq!(extracted, Ok(Cow::Borrowed("byte path")));
    }

    /// Escapes are decoded on the byte path; a line without quotes yields "".
    #[test]
    fn extracts_owned_quoted_text_when_unescaping_is_required() {
        let unescaped = extract_quoted_bytes_cow(br#"msgid "line\nbreak""#);
        assert_eq!(unescaped, Ok(Cow::Owned("line\nbreak".to_owned())));

        let unquoted = extract_quoted("msgid bare");
        assert_eq!(unquoted, Ok(String::new()));
    }

    /// Whitespace-separated `file:line` tokens are split apart.
    #[test]
    fn splits_multiple_reference_tokens() {
        let tokens = split_reference_comment("src/app.js:1 src/lib.js:2");
        assert_eq!(
            tokens,
            vec![Cow::Borrowed("src/app.js:1"), Cow::Borrowed("src/lib.js:2")]
        );
    }

    /// Tokens lacking a colon cause the whole line to be kept as one entry.
    #[test]
    fn preserves_standard_input_reference_lines() {
        let line = "standard input:12 standard input:17";
        assert_eq!(split_reference_comment(line), vec![Cow::Borrowed(line)]);
    }

    /// Isolate characters protect inner whitespace and are stripped from tokens.
    #[test]
    fn strips_isolates_when_splitting_reference_tokens() {
        let tokens = split_reference_comment("\u{2068}main 1.py\u{2069}:1 other.py:2");
        let expected: Vec<Cow<'_, str>> = vec![
            Cow::Owned("main 1.py:1".to_owned()),
            Cow::Borrowed("other.py:2"),
        ];
        assert_eq!(tokens, expected);
    }

    /// Non-reference text stays whole; blank input yields one empty entry.
    #[test]
    fn keeps_non_reference_whitespace_groups_and_empty_input_stable() {
        let unsplit = split_reference_comment("foo bar");
        assert_eq!(unsplit, vec![Cow::Borrowed("foo bar")]);

        let blank = split_reference_comment("   ");
        assert_eq!(blank, vec![Cow::Borrowed("")]);
    }

    /// A bare quote inside the payload is reported as an error.
    #[test]
    fn rejects_unescaped_quote_in_string_literal() {
        let message = validate_quoted_content(br#"Some msgstr with "double\" quotes"#)
            .expect_err("expected unescaped quote error")
            .to_string();
        assert_eq!(message, "unescaped quote in string literal");
    }

    /// Octal and hex escapes decode; malformed hex escapes report errors.
    #[test]
    fn unescape_string_covers_octal_hex_and_error_paths() {
        assert_eq!(unescape_string("\\101\\x42").as_deref(), Ok("AB"));

        let incomplete = unescape_string("\\x4")
            .expect_err("incomplete hex escape")
            .to_string();
        assert_eq!(incomplete, "incomplete hex escape");

        let invalid = unescape_string("\\xZZ")
            .expect_err("invalid hex escape")
            .to_string();
        assert_eq!(invalid, "invalid hex escape");

        assert!(validate_quoted_content(br#"still safe\""#).is_ok());
    }
}
450}