fluent_syntax/unicode.rs
1//! A set of helper functions for unescaping Fluent unicode escape sequences.
2//!
3//! # Unicode
4//!
5//! Fluent supports UTF-8 in all FTL resources, but it also allows
6//! unicode sequences to be escaped in [`String
7//! Literals`](super::ast::InlineExpression::StringLiteral).
8//!
9//! Escapes made of four hexadecimal digits are written with `\u`, and
10//! escapes made of six hexadecimal digits with `\U`.
11//! ## Example
12//!
13//! ```
14//! use fluent_syntax::unicode::unescape_unicode_to_string;
15//!
16//! assert_eq!(
17//! unescape_unicode_to_string("Foo \\u5bd2 Bar"),
18//! "Foo 寒 Bar"
19//! );
20//!
21//! assert_eq!(
22//! unescape_unicode_to_string("Foo \\U01F68A Bar"),
23//! "Foo 🚊 Bar"
24//! );
25//! ```
26//!
27//! # Other unescapes
28//!
29//! This also allows for a char `"` to be present inside an FTL string literal,
30//! and for `\` itself to be escaped.
31//!
32//! ## Example
33//!
34//! ```
35//! use fluent_syntax::unicode::unescape_unicode_to_string;
36//!
37//! assert_eq!(
38//! unescape_unicode_to_string("Foo \\\" Bar"),
39//! "Foo \" Bar"
40//! );
41//! assert_eq!(
42//! unescape_unicode_to_string("Foo \\\\ Bar"),
43//! "Foo \\ Bar"
44//! );
45//! ```
46use std::borrow::Cow;
47use std::char;
48use std::fmt;
49
/// U+FFFD REPLACEMENT CHARACTER, substituted for any escape sequence that
/// cannot be decoded.
const UNKNOWN_CHAR: char = '\u{FFFD}';
51
52fn encode_unicode(s: Option<&str>) -> char {
53 s.and_then(|s| u32::from_str_radix(s, 16).ok().and_then(char::from_u32))
54 .unwrap_or(UNKNOWN_CHAR)
55}
56
57/// Unescapes to a writer without allocating.
58///
59/// ## Example
60///
61/// ```
62/// use fluent_syntax::unicode::unescape_unicode;
63///
64/// let mut s = String::new();
65/// unescape_unicode(&mut s, "Foo \\U01F60A Bar");
66/// assert_eq!(s, "Foo 😊 Bar");
67/// ```
68pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result
69where
70 W: fmt::Write,
71{
72 if unescape(w, input)? {
73 return Ok(());
74 }
75 w.write_str(input)
76}
77
78fn unescape<W>(w: &mut W, input: &str) -> Result<bool, std::fmt::Error>
79where
80 W: fmt::Write,
81{
82 let bytes = input.as_bytes();
83
84 let mut start = 0;
85 let mut ptr = 0;
86
87 while let Some(b) = bytes.get(ptr) {
88 if b != &b'\\' {
89 ptr += 1;
90 continue;
91 }
92 if start != ptr {
93 w.write_str(&input[start..ptr])?;
94 }
95
96 ptr += 1;
97
98 let new_char = match bytes.get(ptr) {
99 Some(b'\\') => '\\',
100 Some(b'"') => '"',
101 Some(u @ b'u') | Some(u @ b'U') => {
102 let seq_start = ptr + 1;
103 let len = if u == &b'u' { 4 } else { 6 };
104 ptr += len;
105 encode_unicode(input.get(seq_start..seq_start + len))
106 }
107 _ => UNKNOWN_CHAR,
108 };
109 ptr += 1;
110 w.write_char(new_char)?;
111 start = ptr;
112 }
113
114 if start == 0 {
115 return Ok(false);
116 }
117
118 if start != ptr {
119 w.write_str(&input[start..ptr])?;
120 }
121 Ok(true)
122}
123
124/// Unescapes to a `Cow<str>` optionally allocating.
125///
126/// ## Example
127///
128/// ```
129/// use fluent_syntax::unicode::unescape_unicode_to_string;
130///
131/// assert_eq!(
132/// unescape_unicode_to_string("Foo \\U01F60A Bar"),
133/// "Foo 😊 Bar"
134/// );
135/// ```
136pub fn unescape_unicode_to_string(input: &str) -> Cow<str> {
137 let mut result = String::new();
138 let owned = unescape(&mut result, input).expect("String write methods don't Err");
139 if owned {
140 Cow::Owned(result)
141 } else {
142 Cow::Borrowed(input)
143 }
144}