crowbook_text_processing/
escape.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with
3// this file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
//! Some functions to escape characters for display in HTML or LaTeX.
6//!
7//! The two most useful ones are `tex` and `html`.
8//!
9//! # Example
10//!
11//! ```
12//! use crowbook_text_processing::escape;
13//! let input = "<foo> & <bar>";
14//! let output = escape::html(input);
15//! assert_eq!(&output, "&lt;foo&gt; &amp; &lt;bar&gt;");
16//!
17//! let input = "#2: 20%";
18//! let output = escape::tex(input);
19//! assert_eq!(&output, r"\#2: 20\%");
20//! ```
21
22
23use std::borrow::Cow;
24
25use regex::Regex;
26use regex::Captures;
27
28use crate::common::{NB_CHAR, NB_CHAR_NARROW, NB_CHAR_EM};
29
30
31/// Escape narrow non-breaking spaces for HTML.
32///
33/// This is unfortunately sometimes necessary as some fonts/renderers don't support the
34/// narrow non breaking space character.
35///
36/// This function works by declaring a span with class "nnbsp" containing
37/// the previous and next word, and replacing narrow non breaking space with the non-breaking
38/// space character.
39///
40/// Thus, in order to display correctly, you will need to add some style to this span, e.g.:
41///
42/// ```css
43/// .nnbsp {
44///    word-spacing: -0.13em;
45///  }
46/// ```
47pub fn nb_spaces_html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
48    let input = input.into();
49    lazy_static! {
50        static ref REGEX: Regex = Regex::new(r"\S*\x{202F}[\S\x{202F}]*").unwrap();
51        static ref REGEX_LOCAL: Regex = Regex::new(r"\x{202F}").unwrap();
52    }
53    if REGEX.is_match(&input) {
54        let res = REGEX.replace_all(&input, |caps: &Captures| {
55            format!("<span class = \"nnbsp\">{}</span>",
56                    REGEX_LOCAL.replace_all(&caps[0], "&#160;"))
57        });
58        Cow::Owned(res.into_owned())
59    } else {
60        input
61    }
62}
63
64/// Old name of nb_spaces html
65#[deprecated(
66    since="1.0.0",
67    note="Renamed nb_spaces_html"
68)]
69pub fn nnbsp<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
70    nb_spaces_html(input)
71}
72
73
74
75
76/// Escape non breaking spaces for LaTeX, replacing them with the appropriate TeX code.
77/// This ensures it works correctly with some LaTeX versions (and it makes
78/// the non-breaking spaces shenanigans more visible with most editors)
79///
80/// # Achtung
81///
82/// Since this function adds some LaTeX codes that use backslashes, it will cause issues
83/// if you then try to escape those characters. So if you must escape the text for LaTeX,
84/// this function should always be called **after** `escape::tex`.
85///
86/// # Example
87///
88/// ```
89/// use crowbook_text_processing::escape;
90/// let s = escape::nb_spaces_tex("Des espaces insécables ? Ça alors !");
91/// assert_eq!(&s, "Des espaces insécables\\,? Ça alors\\,!");
92/// ```
93pub fn nb_spaces_tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
94    let input = input.into();
95    if let Some(first) = input.chars().position(|c| match c {
96        NB_CHAR | NB_CHAR_NARROW | NB_CHAR_EM => true,
97        _ => false,
98    }) {
99        let mut chars = input.chars().collect::<Vec<_>>();
100        let rest = chars.split_off(first);
101        let mut output = chars.into_iter().collect::<String>();
102        for c in rest {
103            match c {
104                NB_CHAR_NARROW => output.push_str("\\,"),
105                NB_CHAR_EM => output.push_str("\\enspace "),
106                NB_CHAR => output.push('~'),
107                _ => output.push(c),
108            }
109        }
110        Cow::Owned(output)
111    } else {
112        input.into()
113    }
114}
115
/// Remove characters that are not allowed in an XML 1.0 document.
///
/// The characters stripped are the C0 control characters other than tab,
/// line feed and carriage return (U+0000–U+0008, U+000B, U+000C,
/// U+000E–U+001F) and the non-characters U+FFFE and U+FFFF.
///
/// If `input` contains none of them it is returned as is; otherwise a new
/// string without them is returned.
pub fn remove_xml_chars<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    /// Whether `c` falls outside the XML 1.0 `Char` production.
    fn is_forbidden(c: char) -> bool {
        match c {
            '\u{0000}'..='\u{0008}'
            | '\u{000B}'
            | '\u{000C}'
            | '\u{000E}'..='\u{001F}'
            | '\u{FFFE}'
            | '\u{FFFF}' => true,
            _ => false,
        }
    }

    let s = input.into();
    if s.contains(is_forbidden) {
        Cow::Owned(s.chars().filter(|&c| !is_forbidden(c)).collect())
    } else {
        s
    }
}
128
129
130/// Escape characters for HTML output, replacing  `<`, `>`, and `&` with appropriate
131/// HTML entities.
132///
133/// Also remove the entities that cause problems in strict XHTML with XML1.
134///
135/// **Warning**: this function was written for escaping text in a markdown
136/// text processor that is designed to run on a local machine, where the content
137/// can actually be trusted. It should *not* be used for untrusted content.
138///
139/// # Example
140///
141/// ```
142/// use crowbook_text_processing::escape;
143/// let s = escape::html("<foo> & <bar>");
144/// assert_eq!(&s, "&lt;foo&gt; &amp; &lt;bar&gt;");
145/// ```
146pub fn html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
147    lazy_static! {
148        static ref REGEX: Regex = Regex::new("[<>&]").unwrap();
149    }
150    let input = remove_xml_chars(input.into());
151    let first = REGEX.find(&input)
152        .map(|mat| mat.start());
153    if let Some(first) = first {
154        let len = input.len();
155        let mut output = Vec::with_capacity(len + len / 2);
156        output.extend_from_slice(input[0..first].as_bytes());
157        let rest = input[first..].bytes();
158        for c in rest {
159            match c {
160                b'<' => output.extend_from_slice(b"&lt;"),
161                b'>' => output.extend_from_slice(b"&gt;"),
162                b'&' => output.extend_from_slice(b"&amp;"),
163                _ => output.push(c),
164            }
165        }
166        Cow::Owned(String::from_utf8(output).unwrap())
167    } else {
168        input
169    }
170}
171
/// Very naive quote "escaping".
///
/// Every double quote `"` is replaced by a single quote `'`; nothing else is
/// changed. Input without any double quote is returned as is.
pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    let text = input.into();
    if !text.contains('"') {
        return text;
    }

    let mut replaced = String::with_capacity(text.len());
    replaced.extend(text.chars().map(|ch| if ch == '"' { '\'' } else { ch }));
    Cow::Owned(replaced)
}
190
191
/// Escape characters for LaTeX.
///
/// The following replacements are made:
///
/// * `&`, `%`, `$`, `#`, `_`, `{` and `}` are prefixed with a backslash;
/// * `[` and `]` are wrapped in braces (`{[}`, `{]}`);
/// * `~`, `^`, `<`, `>` and `\` become `\textasciitilde{}`, `\textasciicircum{}`,
///   `\textless{}`, `\textgreater{}` and `\textbackslash{}`;
/// * `!` becomes `!{}`;
/// * a `-` that is immediately followed by another `-` becomes `-{}`, to avoid
///   TeX ligatures.
///
/// If `input` contains none of these characters it is returned as is.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::escape;
/// let s = escape::tex("command --foo # calls command with option foo");
/// assert_eq!(&s, r"command -{}-foo \# calls command with option foo");
/// ```
pub fn tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    // Every character that may need special treatment in LaTeX.
    const SPECIAL: &'static [char] = &[
        '!', '<', '>', '&', '%', '$', '#', '_', '~', '-', '{', '}', '[', ']', '^', '\\',
    ];

    let input = input.into();
    let first = input.find(SPECIAL);
    let first = match first {
        Some(first) => first,
        None => return input,
    };

    let len = input.len();
    // Escapes are longer than the characters they replace, so leave some room.
    let mut output = String::with_capacity(len + len / 2);
    output.push_str(&input[..first]);

    let mut chars = input[first..].chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            // Only a dash followed by another dash needs breaking up; a lone
            // (or final) dash falls through to the catch-all arm.
            '-' if chars.peek() == Some(&'-') => output.push_str(r"-{}"),
            '&' => output.push_str(r"\&"),
            '%' => output.push_str(r"\%"),
            '$' => output.push_str(r"\$"),
            '#' => output.push_str(r"\#"),
            '_' => output.push_str(r"\_"),
            '{' => output.push_str(r"\{"),
            '}' => output.push_str(r"\}"),
            '[' => output.push_str(r"{[}"),
            ']' => output.push_str(r"{]}"),
            '~' => output.push_str(r"\textasciitilde{}"),
            '^' => output.push_str(r"\textasciicircum{}"),
            '<' => output.push_str(r"\textless{}"),
            '>' => output.push_str(r"\textgreater{}"),
            '!' => output.push_str(r"!{}"),
            '\\' => output.push_str(r"\textbackslash{}"),
            _ => output.push(c),
        }
    }
    Cow::Owned(output)
}
252
253
/// `html` leaves text without `<`, `>` or `&` unchanged.
#[test]
fn html_0() {
    let plain = "Some string without any character to escape";
    assert_eq!(plain, &html(plain));
}
260
/// `tex` leaves text without any LaTeX special character unchanged.
#[test]
fn tex_0() {
    let plain = "Some string without any character to escape";
    assert_eq!(plain, &tex(plain));
}
267
/// `nb_spaces_html` leaves text without narrow non-breaking spaces unchanged.
#[test]
fn nb_spaces_0() {
    let plain = "Some string without any character to escape";
    assert_eq!(plain, &nb_spaces_html(plain));
}
274
/// `nb_spaces_tex` leaves text without non-breaking spaces unchanged.
#[test]
fn tex_nb_spaces_0() {
    let plain = "Some string without any character to escape";
    assert_eq!(plain, &nb_spaces_tex(plain));
}
281
/// `quotes` leaves text without double quotes unchanged.
#[test]
fn quotes_0() {
    let plain = "Some string without any character to escape";
    assert_eq!(plain, &quotes(plain));
}
288
/// Tags and ampersands inside a sentence are turned into entities.
#[test]
fn html_1() {
    let escaped = html("<p>Some characters need escaping & something</p>");
    assert_eq!(
        "&lt;p&gt;Some characters need escaping &amp; something&lt;/p&gt;",
        &escaped
    );
}
296
/// Same input and expected output as the module-level documentation example.
#[test]
fn html_2() {
    let escaped = html("<foo> & <bar>");
    assert_eq!(&escaped, "&lt;foo&gt; &amp; &lt;bar&gt;");
}
303
/// Backslashes and curly braces are escaped by `tex`.
#[test]
fn tex_braces() {
    let escaped = tex(r"\foo{bar}");
    assert_eq!(&escaped, r"\textbackslash{}foo\{bar\}");
}
310
/// Square brackets are wrapped in curly braces by `tex`.
#[test]
fn tex_square_braces() {
    let escaped = tex(r"foo[bar]");
    assert_eq!(&escaped, r"foo{[}bar{]}");
}
317
/// Consecutive dashes are separated with `{}`; the last dash of a run is kept as is.
#[test]
fn tex_dashes() {
    let escaped = tex("--foo, ---bar");
    assert_eq!(&escaped, r"-{}-foo, -{}-{}-bar");
}
324
/// `$`, `%` and `#` are prefixed with a backslash by `tex`.
#[test]
fn tex_numbers() {
    let escaped = tex(r"30000$ is 10% of number #1 income");
    assert_eq!(&escaped, r"30000\$ is 10\% of number \#1 income");
}
331
/// Double quotes are replaced by single quotes.
#[test]
fn quotes_escape() {
    let replaced = quotes(r#"Some text with "quotes""#);
    assert_eq!(&replaced, r#"Some text with 'quotes'"#);
}
338
339
/// A single narrow non-breaking space: the words on both sides end up in one span.
#[test]
fn nnbsp_1() {
    // `\u{202F}` is the narrow non-breaking space, written as an escape so it
    // cannot be mistaken for (or silently turned into) a regular space.
    let actual = nb_spaces_html("Test\u{202F}?");
    let expected = "<span class = \"nnbsp\">Test&#160;?</span>";
    assert_eq!(&actual, expected);
}
346
/// Several narrow non-breaking spaces chained together share a single span.
#[test]
fn nnbsp_2() {
    // `\u{202F}` (narrow non-breaking space) after `«`, before `»` and before `!`;
    // all other spaces are regular ones.
    let actual = nb_spaces_html("Ceci est un «\u{202F}Test\u{202F}»\u{202F}!");
    let expected = "Ceci est un <span class = \"nnbsp\">«&#160;Test&#160;»&#160;!</span>";
    assert_eq!(&actual, expected);
}
353
/// `html` drops the form feed character (U+000C).
#[test]
fn xml_chars() {
    let cleaned = html("Hey\u{000C}");
    assert_eq!(&cleaned, "Hey");
}