1use std::borrow::Cow;
2
3use crate::ParseError;
4use crate::scan::{find_byte, find_escapable_byte, find_quoted_bounds, has_byte};
5use crate::utf8::{input_slice_as_str, string_from_utf8};
6
7#[must_use]
9pub fn escape_string(input: &str) -> String {
10 let bytes = input.as_bytes();
11 let Some(first_escape) = find_escapable_byte(bytes) else {
12 return input.to_owned();
13 };
14
15 let mut out = String::with_capacity(input.len() + 8);
16 out.push_str(&input[..first_escape]);
17 escape_string_from(&mut out, input, bytes, first_escape);
18
19 out
20}
21
22pub fn escape_string_into(out: &mut String, input: &str) {
23 let bytes = input.as_bytes();
24 let Some(first_escape) = find_escapable_byte(bytes) else {
25 out.push_str(input);
26 return;
27 };
28
29 escape_string_into_known(out, input, first_escape);
30}
31
32pub fn escape_string_into_with_first_escape(
33 out: &mut String,
34 input: &str,
35 first_escape: Option<usize>,
36) {
37 let Some(first_escape) = first_escape else {
38 out.push_str(input);
39 return;
40 };
41
42 escape_string_into_known(out, input, first_escape);
43}
44
45pub fn unescape_string(input: &str) -> Result<String, ParseError> {
51 let bytes = input.as_bytes();
52 if !has_byte(b'\\', bytes) {
53 return Ok(input.to_owned());
54 }
55
56 let mut out = Vec::with_capacity(input.len());
57 let mut index = 0;
58
59 while index < bytes.len() {
60 let next_escape = if let Some(relative) = find_byte(b'\\', &bytes[index..]) {
61 index + relative
62 } else {
63 out.extend_from_slice(&bytes[index..]);
64 break;
65 };
66
67 out.extend_from_slice(&bytes[index..next_escape]);
68 index = next_escape + 1;
69 if index >= bytes.len() {
70 return Err(ParseError::new("unterminated escape sequence"));
71 }
72
73 let escaped = bytes[index];
74 match escaped {
75 b'a' => out.push(b'\x07'),
76 b'b' => out.push(b'\x08'),
77 b't' => out.push(b'\t'),
78 b'n' => out.push(b'\n'),
79 b'v' => out.push(b'\x0b'),
80 b'f' => out.push(b'\x0c'),
81 b'r' => out.push(b'\r'),
82 b'\'' => out.push(b'\''),
83 b'"' => out.push(b'"'),
84 b'\\' => out.push(b'\\'),
85 b'?' => out.push(b'?'),
86 b'0'..=b'7' => {
87 let mut value = u32::from(escaped - b'0');
88 let mut consumed = 1;
89 while consumed < 3 && index + consumed < bytes.len() {
90 let next = bytes[index + consumed];
91 if !(b'0'..=b'7').contains(&next) {
92 break;
93 }
94 value = (value * 8) + u32::from(next - b'0');
95 consumed += 1;
96 }
97 match char::from_u32(value) {
98 Some(ch) => push_char_bytes(&mut out, ch),
99 None => return Err(ParseError::new("invalid octal escape value")),
100 }
101 index += consumed - 1;
102 }
103 b'x' => {
104 if index + 2 >= bytes.len() {
105 return Err(ParseError::new("incomplete hex escape"));
106 }
107 let hi = decode_hex(bytes[index + 1])?;
108 let lo = decode_hex(bytes[index + 2])?;
109 let value = u32::from((hi << 4) | lo);
110 match char::from_u32(value) {
111 Some(ch) => push_char_bytes(&mut out, ch),
112 None => return Err(ParseError::new("invalid hex escape value")),
113 }
114 index += 2;
115 }
116 other => out.push(other),
117 }
118
119 index += 1;
120 }
121
122 Ok(string_from_utf8(out))
123}
124
125pub fn extract_quoted_cow(line: &str) -> Result<Cow<'_, str>, ParseError> {
132 extract_quoted_bytes_cow(line.as_bytes())
133}
134
135pub fn extract_quoted_bytes_cow(line: &[u8]) -> Result<Cow<'_, str>, ParseError> {
136 let Some((start, end)) = find_quoted_bounds(line) else {
137 return Ok(Cow::Borrowed(""));
138 };
139
140 let raw = &line[start..end];
141 validate_quoted_content(raw)?;
142 if !has_byte(b'\\', raw) {
143 return Ok(Cow::Borrowed(bytes_to_str(raw)));
144 }
145
146 Ok(Cow::Owned(unescape_string(bytes_to_str(raw))?))
147}
148
149pub fn extract_quoted(line: &str) -> Result<String, ParseError> {
155 Ok(extract_quoted_bytes_cow(line.as_bytes())?.into_owned())
156}
157
158pub fn split_reference_comment(input: &str) -> Vec<Cow<'_, str>> {
159 let trimmed = input.trim();
160 if trimmed.is_empty() {
161 return vec![Cow::Borrowed("")];
162 }
163
164 let mut parts = Vec::new();
165 let mut start = None;
166 let mut isolate_depth = 0usize;
167
168 for (index, ch) in trimmed.char_indices() {
169 match ch {
170 '\u{2068}' => {
171 if start.is_none() {
172 start = Some(index);
173 }
174 isolate_depth += 1;
175 }
176 '\u{2069}' => {
177 if start.is_none() {
178 start = Some(index);
179 }
180 isolate_depth = isolate_depth.saturating_sub(1);
181 }
182 _ if ch.is_whitespace() && isolate_depth == 0 => {
183 if let Some(segment_start) = start.take()
184 && segment_start < index
185 {
186 parts.push(normalize_reference_token(&trimmed[segment_start..index]));
187 }
188 }
189 _ => {
190 if start.is_none() {
191 start = Some(index);
192 }
193 }
194 }
195 }
196
197 if let Some(segment_start) = start
198 && segment_start < trimmed.len()
199 {
200 parts.push(normalize_reference_token(&trimmed[segment_start..]));
201 }
202
203 if parts.len() == 1 {
204 return vec![normalize_reference_token(trimmed)];
205 }
206
207 if parts.iter().all(|part| part.contains(':')) {
208 return parts;
209 }
210
211 vec![Cow::Borrowed(trimmed)]
212}
213
214pub fn validate_quoted_content(raw: &[u8]) -> Result<(), ParseError> {
215 let mut trailing_backslashes = 0usize;
216
217 for &byte in raw {
218 match byte {
219 b'\\' => trailing_backslashes += 1,
220 b'"' if trailing_backslashes % 2 == 0 => {
221 return Err(ParseError::new("unescaped quote in string literal"));
222 }
223 _ => trailing_backslashes = 0,
224 }
225 }
226
227 Ok(())
228}
229
230fn escape_string_from(out: &mut String, input: &str, bytes: &[u8], first_escape: usize) {
231 let mut start = first_escape;
232
233 loop {
234 push_escape(out, bytes[start]);
235 let next_index = start + 1;
236 let Some(relative) = find_escapable_byte(&bytes[next_index..]) else {
237 out.push_str(&input[next_index..]);
238 break;
239 };
240
241 let absolute = next_index + relative;
242 out.push_str(&input[next_index..absolute]);
243 start = absolute;
244 }
245}
246
247#[inline]
248fn escape_string_into_known(out: &mut String, input: &str, first_escape: usize) {
249 let bytes = input.as_bytes();
250 out.push_str(&input[..first_escape]);
251 escape_string_from(out, input, bytes, first_escape);
252}
253
/// Appends the two-character backslash escape for `byte` to `out`.
///
/// # Panics
///
/// Panics when `byte` is not one of BEL, BS, TAB, LF, VT, FF, CR, `"` or `\`.
fn push_escape(out: &mut String, byte: u8) {
    out.push('\\');

    let mnemonic = match byte {
        0x07 => 'a',
        0x08 => 'b',
        0x09 => 't',
        0x0a => 'n',
        0x0b => 'v',
        0x0c => 'f',
        0x0d => 'r',
        b'"' => '"',
        b'\\' => '\\',
        _ => unreachable!("unexpected escape byte"),
    };
    out.push(mnemonic);
}
269
270fn decode_hex(byte: u8) -> Result<u8, ParseError> {
271 match byte {
272 b'0'..=b'9' => Ok(byte - b'0'),
273 b'a'..=b'f' => Ok(byte - b'a' + 10),
274 b'A'..=b'F' => Ok(byte - b'A' + 10),
275 _ => Err(ParseError::new("invalid hex escape")),
276 }
277}
278
/// Appends the UTF-8 encoding of `ch` to `out`.
fn push_char_bytes(out: &mut Vec<u8>, ch: char) {
    match ch.len_utf8() {
        // Single-byte encodings are exactly the ASCII range.
        1 => out.push(ch as u8),
        _ => {
            let mut scratch = [0u8; 4];
            let encoded = ch.encode_utf8(&mut scratch);
            out.extend_from_slice(encoded.as_bytes());
        }
    }
}
288
289fn bytes_to_str(bytes: &[u8]) -> &str {
290 input_slice_as_str(bytes)
291}
292
/// Removes every U+2068 and U+2069 character from `input`.
///
/// The input is borrowed unchanged when it contains neither character; an
/// owned, filtered copy is returned otherwise.
fn normalize_reference_token(input: &str) -> Cow<'_, str> {
    let is_isolate = |ch: char| matches!(ch, '\u{2068}' | '\u{2069}');

    if input.contains(is_isolate) {
        Cow::Owned(input.chars().filter(|&ch| !is_isolate(ch)).collect())
    } else {
        Cow::Borrowed(input)
    }
}
305
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::{
        escape_string, escape_string_into, escape_string_into_with_first_escape, extract_quoted,
        extract_quoted_bytes_cow, extract_quoted_cow, split_reference_comment, unescape_string,
        validate_quoted_content,
    };

    /// Quotes and tabs come back as backslash escapes.
    #[test]
    fn escapes_special_characters() {
        let cases = [("Say \"Hi\"", "Say \\\"Hi\\\""), ("a\tb", "a\\tb")];
        for (plain, expected) in cases {
            assert_eq!(escape_string(plain), expected);
        }
    }

    /// Every single-character escape decodes to its control or literal byte.
    #[test]
    fn unescapes_c_sequences() {
        let decoded = unescape_string("\\a\\b\\t\\n\\v\\f\\r\\'\\\"\\\\\\?");
        assert_eq!(
            decoded.as_deref(),
            Ok("\u{0007}\u{0008}\t\n\u{000b}\u{000c}\r'\"\\?")
        );
    }

    /// Escaped quotes and backslashes inside the quoted section are decoded.
    #[test]
    fn extracts_and_unescapes_quoted_text() {
        let line = "msgid \"The name field must not contain characters like \\\" or \\\\\"";
        let extracted = extract_quoted(line);
        assert_eq!(
            extracted.as_deref(),
            Ok("The name field must not contain characters like \" or \\")
        );
    }

    /// Content without escapes is compared against a borrowed `Cow`.
    #[test]
    fn borrows_simple_quoted_text_without_escape() {
        let extracted = extract_quoted_cow("msgid \"plain text\"");
        assert_eq!(extracted, Ok(Cow::Borrowed("plain text")));
    }

    /// The escaping writer appends after whatever the buffer already holds.
    #[test]
    fn appends_escaped_text_into_existing_buffer() {
        let mut buffer = String::from("prefix:");
        escape_string_into(&mut buffer, "Say \"Hi\"\n");
        assert_eq!(buffer, "prefix:Say \\\"Hi\\\"\\n");
    }

    /// A caller-supplied first-escape offset gives the same output as scanning.
    #[test]
    fn appends_escaped_text_into_existing_buffer_with_known_escape() {
        let mut buffer = String::from("prefix:");
        let first_quote = Some(4);
        escape_string_into_with_first_escape(&mut buffer, "Say \"Hi\"\n", first_quote);
        assert_eq!(buffer, "prefix:Say \\\"Hi\\\"\\n");
    }

    /// `None` as the first-escape offset appends the input untouched.
    #[test]
    fn appends_plain_text_when_no_escape_index_is_known() {
        let mut buffer = String::from("prefix:");
        escape_string_into_with_first_escape(&mut buffer, "plain", None);
        assert_eq!(buffer, "prefix:plain");
    }

    /// The byte-slice entry point handles plain quoted content.
    #[test]
    fn extracts_quoted_text_from_bytes() {
        let extracted = extract_quoted_bytes_cow(br#"msgid "byte path""#);
        assert_eq!(extracted, Ok(Cow::Borrowed("byte path")));
    }

    /// Escaped content is decoded; a line with no quotes yields an empty string.
    #[test]
    fn extracts_owned_quoted_text_when_unescaping_is_required() {
        let escaped = extract_quoted_bytes_cow(br#"msgid "line\nbreak""#);
        assert_eq!(escaped, Ok(Cow::Owned("line\nbreak".to_owned())));

        let unquoted = extract_quoted("msgid bare");
        assert_eq!(unquoted, Ok(String::new()));
    }

    /// Whitespace-separated `path:line` tokens are split apart.
    #[test]
    fn splits_multiple_reference_tokens() {
        let tokens = split_reference_comment("src/app.js:1 src/lib.js:2");
        let expected = vec![Cow::Borrowed("src/app.js:1"), Cow::Borrowed("src/lib.js:2")];
        assert_eq!(tokens, expected);
    }

    /// A line whose pieces do not all contain `:` stays in one piece.
    #[test]
    fn preserves_standard_input_reference_lines() {
        let tokens = split_reference_comment("standard input:12 standard input:17");
        let expected = vec![Cow::Borrowed("standard input:12 standard input:17")];
        assert_eq!(tokens, expected);
    }

    /// Isolate characters protect inner whitespace and are stripped from tokens.
    #[test]
    fn strips_isolates_when_splitting_reference_tokens() {
        let tokens = split_reference_comment("\u{2068}main 1.py\u{2069}:1 other.py:2");
        let expected = vec![
            Cow::Owned("main 1.py:1".to_owned()),
            Cow::Borrowed("other.py:2"),
        ];
        assert_eq!(tokens, expected);
    }

    /// Non-reference text is not split, and blank input yields one empty entry.
    #[test]
    fn keeps_non_reference_whitespace_groups_and_empty_input_stable() {
        let words = split_reference_comment("foo bar");
        assert_eq!(words, vec![Cow::Borrowed("foo bar")]);

        let blank = split_reference_comment(" ");
        assert_eq!(blank, vec![Cow::Borrowed("")]);
    }

    /// A bare quote in the middle of the content is reported.
    #[test]
    fn rejects_unescaped_quote_in_string_literal() {
        let error = validate_quoted_content(br#"Some msgstr with "double\" quotes"#)
            .expect_err("expected unescaped quote error");
        assert_eq!(error.to_string(), "unescaped quote in string literal");
    }

    /// Octal and hex escapes decode; malformed hex escapes report errors.
    #[test]
    fn unescape_string_covers_octal_hex_and_error_paths() {
        let decoded = unescape_string("\\101\\x42");
        assert_eq!(decoded.as_deref(), Ok("AB"));

        let truncated = unescape_string("\\x4").expect_err("incomplete hex escape");
        assert_eq!(truncated.to_string(), "incomplete hex escape");

        let not_hex = unescape_string("\\xZZ").expect_err("invalid hex escape");
        assert_eq!(not_hex.to_string(), "invalid hex escape");

        assert!(validate_quoted_content(br#"still safe\""#).is_ok());
    }
}
450}