crowbook_text_processing/
escape.rs1use std::borrow::Cow;
24
25use regex::Regex;
26use regex::Captures;
27
28use crate::common::{NB_CHAR, NB_CHAR_NARROW, NB_CHAR_EM};
29
30
31pub fn nb_spaces_html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
48    let input = input.into();
49    lazy_static! {
50        static ref REGEX: Regex = Regex::new(r"\S*\x{202F}[\S\x{202F}]*").unwrap();
51        static ref REGEX_LOCAL: Regex = Regex::new(r"\x{202F}").unwrap();
52    }
53    if REGEX.is_match(&input) {
54        let res = REGEX.replace_all(&input, |caps: &Captures| {
55            format!("<span class = \"nnbsp\">{}</span>",
56                    REGEX_LOCAL.replace_all(&caps[0], " "))
57        });
58        Cow::Owned(res.into_owned())
59    } else {
60        input
61    }
62}
63
64#[deprecated(
66    since="1.0.0",
67    note="Renamed nb_spaces_html"
68)]
69pub fn nnbsp<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
70    nb_spaces_html(input)
71}
72
73
74
75
76pub fn nb_spaces_tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
94    let input = input.into();
95    if let Some(first) = input.chars().position(|c| match c {
96        NB_CHAR | NB_CHAR_NARROW | NB_CHAR_EM => true,
97        _ => false,
98    }) {
99        let mut chars = input.chars().collect::<Vec<_>>();
100        let rest = chars.split_off(first);
101        let mut output = chars.into_iter().collect::<String>();
102        for c in rest {
103            match c {
104                NB_CHAR_NARROW => output.push_str("\\,"),
105                NB_CHAR_EM => output.push_str("\\enspace "),
106                NB_CHAR => output.push('~'),
107                _ => output.push(c),
108            }
109        }
110        Cow::Owned(output)
111    } else {
112        input.into()
113    }
114}
115
/// Strips a fixed set of control characters from the input.
///
/// The characters removed are U+0000–U+0008, U+000B, U+000C and
/// U+000E–U+001F. Tab (U+0009), line feed (U+000A) and carriage return
/// (U+000D) are kept.
///
/// The input is returned as is when it contains none of these characters.
pub fn remove_xml_chars<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    /// True for the control characters this function strips.
    fn is_forbidden(c: char) -> bool {
        match c {
            '\u{0000}'..='\u{0008}' | '\u{000B}' | '\u{000C}' | '\u{000E}'..='\u{001F}' => true,
            _ => false,
        }
    }

    let s = input.into();
    if s.contains(is_forbidden) {
        Cow::Owned(s.chars().filter(|&c| !is_forbidden(c)).collect())
    } else {
        s
    }
}
128
129
130pub fn html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
147    lazy_static! {
148        static ref REGEX: Regex = Regex::new("[<>&]").unwrap();
149    }
150    let input = remove_xml_chars(input.into());
151    let first = REGEX.find(&input)
152        .map(|mat| mat.start());
153    if let Some(first) = first {
154        let len = input.len();
155        let mut output = Vec::with_capacity(len + len / 2);
156        output.extend_from_slice(input[0..first].as_bytes());
157        let rest = input[first..].bytes();
158        for c in rest {
159            match c {
160                b'<' => output.extend_from_slice(b"<"),
161                b'>' => output.extend_from_slice(b">"),
162                b'&' => output.extend_from_slice(b"&"),
163                _ => output.push(c),
164            }
165        }
166        Cow::Owned(String::from_utf8(output).unwrap())
167    } else {
168        input
169    }
170}
171
/// Turns every double quote (`"`) in the input into a single quote (`'`).
///
/// Text without any double quote is handed back as is.
pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    let text = input.into();
    if !text.contains('"') {
        return text;
    }
    let replaced: String = text
        .chars()
        .map(|ch| if ch == '"' { '\'' } else { ch })
        .collect();
    Cow::Owned(replaced)
}
190
191
/// Escapes a string for inclusion in a LaTeX document.
///
/// * `&`, `%`, `$`, `#`, `_`, `{`, `}` are prefixed with a backslash;
/// * `[` and `]` are wrapped in braces (`{[}`, `{]}`);
/// * `~`, `^`, `<`, `>`, `\` become `\textasciitilde{}`,
///   `\textasciicircum{}`, `\textless{}`, `\textgreater{}`,
///   `\textbackslash{}`;
/// * `!` becomes `!{}`;
/// * a `-` immediately followed by another `-` becomes `-{}`, so that
///   consecutive dashes are separated in the output.
///
/// The input is returned as is when it contains none of these characters.
pub fn tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    /// True for every character `tex` may have to rewrite.
    fn needs_escape(c: char) -> bool {
        match c {
            '!' | '<' | '>' | '&' | '%' | '$' | '#' | '_' | '~' | '-' | '{' | '}' | '['
            | ']' | '^' | '\\' => true,
            _ => false,
        }
    }

    let input = input.into();
    // Byte offset of the first character that may need rewriting, if any.
    let first = match input.find(needs_escape) {
        Some(index) => index,
        None => return input,
    };

    let len = input.len();
    let mut output = String::with_capacity(len + len / 2);
    // Everything before `first` is known to be clean and is copied in one go.
    output.push_str(&input[..first]);

    // Peeking gives the one character of look-ahead the dash rule needs.
    let mut chars = input[first..].chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '-' => {
                if chars.peek() == Some(&'-') {
                    output.push_str("-{}");
                } else {
                    output.push('-');
                }
            }
            '&' => output.push_str(r"\&"),
            '%' => output.push_str(r"\%"),
            '$' => output.push_str(r"\$"),
            '#' => output.push_str(r"\#"),
            '_' => output.push_str(r"\_"),
            '{' => output.push_str(r"\{"),
            '}' => output.push_str(r"\}"),
            '[' => output.push_str("{[}"),
            ']' => output.push_str("{]}"),
            '~' => output.push_str(r"\textasciitilde{}"),
            '^' => output.push_str(r"\textasciicircum{}"),
            '<' => output.push_str(r"\textless{}"),
            '>' => output.push_str(r"\textgreater{}"),
            '!' => output.push_str("!{}"),
            '\\' => output.push_str(r"\textbackslash{}"),
            _ => output.push(c),
        }
    }
    Cow::Owned(output)
}
252
253
/// `html` must return a string equal to its input when the input holds
/// nothing to escape.
#[test]
fn html_0() {
    let plain = "Some string without any character to escape";
    let escaped = html(plain);
    assert_eq!(plain, &escaped);
}
260
/// `tex` must return a string equal to its input when the input holds
/// nothing to escape.
#[test]
fn tex_0() {
    let plain = "Some string without any character to escape";
    let escaped = tex(plain);
    assert_eq!(plain, &escaped);
}
267
/// `nb_spaces_html` must return a string equal to its input when the input
/// holds no narrow non-breaking space.
#[test]
fn nb_spaces_0() {
    let plain = "Some string without any character to escape";
    let converted = nb_spaces_html(plain);
    assert_eq!(plain, &converted);
}
274
/// `nb_spaces_tex` must return a string equal to its input when the input
/// holds no non-breaking space.
#[test]
fn tex_nb_spaces_0() {
    let plain = "Some string without any character to escape";
    let converted = nb_spaces_tex(plain);
    assert_eq!(plain, &converted);
}
281
/// `quotes` must return a string equal to its input when the input holds
/// no double quote.
#[test]
fn quotes_0() {
    let plain = "Some string without any character to escape";
    let converted = quotes(plain);
    assert_eq!(plain, &converted);
}
288
/// Every `<`, `>` and `&` of the input must come back from `html` as its
/// HTML entity; the rest of the text must be untouched.
#[test]
fn html_1() {
    let s = "<p>Some characters need escaping & something</p>";
    let expected = "&lt;p&gt;Some characters need escaping &amp; something&lt;/p&gt;";
    let actual = html(s);
    assert_eq!(expected, &actual);
}
296
/// Several markup characters in a row: each `<`, `>` and `&` must be
/// replaced by `html` with its HTML entity.
#[test]
fn html_2() {
    let actual = html("<foo> & <bar>");
    let expected = "&lt;foo&gt; &amp; &lt;bar&gt;";
    assert_eq!(&actual, expected);
}
303
/// Backslash and curly braces must be escaped by `tex`.
#[test]
fn tex_braces() {
    let escaped = tex(r"\foo{bar}");
    let wanted = r"\textbackslash{}foo\{bar\}";
    assert_eq!(&escaped, wanted);
}
310
/// Square brackets must be wrapped in curly braces by `tex`.
#[test]
fn tex_square_braces() {
    let escaped = tex(r"foo[bar]");
    let wanted = r"foo{[}bar{]}";
    assert_eq!(&escaped, wanted);
}
317
/// In a run of dashes, `tex` must put `{}` after every dash that is
/// followed by another dash.
#[test]
fn tex_dashes() {
    let escaped = tex("--foo, ---bar");
    let wanted = r"-{}-foo, -{}-{}-bar";
    assert_eq!(&escaped, wanted);
}
324
/// `$`, `%` and `#` must each be prefixed with a backslash by `tex`.
#[test]
fn tex_numbers() {
    let escaped = tex(r"30000$ is 10% of number #1 income");
    let wanted = r"30000\$ is 10\% of number \#1 income";
    assert_eq!(&escaped, wanted);
}
331
/// Double quotes must be turned into single quotes by `quotes`.
#[test]
fn quotes_escape() {
    let converted = quotes(r#"Some text with "quotes""#);
    let wanted = r#"Some text with 'quotes'"#;
    assert_eq!(&converted, wanted);
}
338
339
/// A word followed by a narrow non-breaking space (U+202F) and punctuation
/// must be wrapped as a whole in the `nnbsp` span, with the U+202F replaced
/// by a non-breaking space (U+00A0).
///
/// Both space characters are written as escapes so they stay visible in the
/// source.
#[test]
fn nnbsp_1() {
    let actual = nb_spaces_html("Test\u{202F}?");
    let expected = "<span class = \"nnbsp\">Test\u{A0}?</span>";
    assert_eq!(&actual, expected);
}
346
/// Words joined by narrow non-breaking spaces (U+202F) must end up in a
/// single `nnbsp` span, with each U+202F replaced by a non-breaking space
/// (U+00A0); text separated by ordinary spaces stays outside the span.
///
/// The special spaces are written as escapes so they stay visible in the
/// source.
#[test]
fn nnbsp_2() {
    let actual = nb_spaces_html("Ceci est un «\u{202F}Test\u{202F}»\u{202F}!");
    let expected = "Ceci est un <span class = \"nnbsp\">«\u{A0}Test\u{A0}»\u{A0}!</span>";
    assert_eq!(&actual, expected);
}
353
/// A form feed (U+000C) in the input must be absent from the output of
/// `html`.
#[test]
fn xml_chars() {
    let cleaned = html("Hey\u{000C}");
    let wanted = "Hey";
    assert_eq!(&cleaned, wanted);
}