crowbook_text_processing/
escape.rs1use std::borrow::Cow;
24
25use regex::Regex;
26use regex::Captures;
27
28use crate::common::{NB_CHAR, NB_CHAR_NARROW, NB_CHAR_EM};
29
30
31pub fn nb_spaces_html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
48 let input = input.into();
49 lazy_static! {
50 static ref REGEX: Regex = Regex::new(r"\S*\x{202F}[\S\x{202F}]*").unwrap();
51 static ref REGEX_LOCAL: Regex = Regex::new(r"\x{202F}").unwrap();
52 }
53 if REGEX.is_match(&input) {
54 let res = REGEX.replace_all(&input, |caps: &Captures| {
55 format!("<span class = \"nnbsp\">{}</span>",
56 REGEX_LOCAL.replace_all(&caps[0], " "))
57 });
58 Cow::Owned(res.into_owned())
59 } else {
60 input
61 }
62}
63
64#[deprecated(
66 since="1.0.0",
67 note="Renamed nb_spaces_html"
68)]
69pub fn nnbsp<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
70 nb_spaces_html(input)
71}
72
73
74
75
76pub fn nb_spaces_tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
94 let input = input.into();
95 if let Some(first) = input.chars().position(|c| match c {
96 NB_CHAR | NB_CHAR_NARROW | NB_CHAR_EM => true,
97 _ => false,
98 }) {
99 let mut chars = input.chars().collect::<Vec<_>>();
100 let rest = chars.split_off(first);
101 let mut output = chars.into_iter().collect::<String>();
102 for c in rest {
103 match c {
104 NB_CHAR_NARROW => output.push_str("\\,"),
105 NB_CHAR_EM => output.push_str("\\enspace "),
106 NB_CHAR => output.push('~'),
107 _ => output.push(c),
108 }
109 }
110 Cow::Owned(output)
111 } else {
112 input.into()
113 }
114}
115
/// Strips the control characters U+0000–U+0008, U+000B, U+000C and
/// U+000E–U+001F from `input`.
///
/// These are the code points below U+0020 that the XML 1.0 `Char` production
/// does not allow; tab (U+0009), line feed (U+000A) and carriage return
/// (U+000D) are kept.
///
/// Returns the input as-is (no allocation) when nothing has to be removed.
pub fn remove_xml_chars<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    /// True for the characters this function removes.
    fn is_stripped(c: char) -> bool {
        matches!(
            c,
            '\u{0000}'..='\u{0008}' | '\u{000B}' | '\u{000C}' | '\u{000E}'..='\u{001F}'
        )
    }

    let s = input.into();
    if s.contains(is_stripped) {
        Cow::Owned(s.chars().filter(|&c| !is_stripped(c)).collect())
    } else {
        s
    }
}
128
129
130pub fn html<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
147 lazy_static! {
148 static ref REGEX: Regex = Regex::new("[<>&]").unwrap();
149 }
150 let input = remove_xml_chars(input.into());
151 let first = REGEX.find(&input)
152 .map(|mat| mat.start());
153 if let Some(first) = first {
154 let len = input.len();
155 let mut output = Vec::with_capacity(len + len / 2);
156 output.extend_from_slice(input[0..first].as_bytes());
157 let rest = input[first..].bytes();
158 for c in rest {
159 match c {
160 b'<' => output.extend_from_slice(b"<"),
161 b'>' => output.extend_from_slice(b">"),
162 b'&' => output.extend_from_slice(b"&"),
163 _ => output.push(c),
164 }
165 }
166 Cow::Owned(String::from_utf8(output).unwrap())
167 } else {
168 input
169 }
170}
171
/// Replaces every double quote (`"`) in the input with a single quote (`'`).
///
/// Text without any double quote is returned as-is; otherwise a new owned
/// string is built with each `"` swapped for `'`.
pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    let text = input.into();
    if !text.contains('"') {
        return text;
    }
    let swapped: String = text
        .chars()
        .map(|ch| if ch == '"' { '\'' } else { ch })
        .collect();
    Cow::Owned(swapped)
}
190
191
/// Escapes text for inclusion in a LaTeX document.
///
/// Replacements performed:
///
/// * `&`, `%`, `$`, `#`, `_`, `{`, `}` are prefixed with a backslash;
/// * `[` and `]` become `{[}` and `{]}`;
/// * `~`, `^`, `<`, `>`, `\` become `\textasciitilde{}`,
///   `\textasciicircum{}`, `\textless{}`, `\textgreater{}` and
///   `\textbackslash{}`;
/// * `!` becomes `!{}`;
/// * a `-` that is immediately followed by another `-` becomes `-{}`; a `-`
///   followed by anything else is kept as-is.
///
/// Input containing none of these characters is returned unchanged.
pub fn tex<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    let input = input.into();
    let is_special = |c: char| {
        matches!(
            c,
            '!' | '<' | '>' | '&' | '%' | '$' | '#' | '_' | '~' | '-' | '{' | '}' | '[' | ']'
                | '^' | '\\'
        )
    };

    // Byte offset of the first character that may need escaping, if any.
    let first = match input.find(is_special) {
        Some(first) => first,
        None => return input,
    };

    let len = input.len();
    let mut output = String::with_capacity(len + len / 2);
    output.push_str(&input[..first]);

    // One character of look-ahead is needed to spot consecutive dashes.
    let mut chars = input[first..].chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            // Only a dash followed by another dash is rewritten; a lone or
            // final dash falls through to the catch-all arm.
            '-' if chars.peek() == Some(&'-') => output.push_str(r"-{}"),
            '&' => output.push_str(r"\&"),
            '%' => output.push_str(r"\%"),
            '$' => output.push_str(r"\$"),
            '#' => output.push_str(r"\#"),
            '_' => output.push_str(r"\_"),
            '{' => output.push_str(r"\{"),
            '}' => output.push_str(r"\}"),
            '[' => output.push_str(r"{[}"),
            ']' => output.push_str(r"{]}"),
            '~' => output.push_str(r"\textasciitilde{}"),
            '^' => output.push_str(r"\textasciicircum{}"),
            '<' => output.push_str(r"\textless{}"),
            '>' => output.push_str(r"\textgreater{}"),
            '!' => output.push_str(r"!{}"),
            '\\' => output.push_str(r"\textbackslash{}"),
            _ => output.push(c),
        }
    }
    Cow::Owned(output)
}
252
253
// `html` must give back text that has nothing to escape unchanged.
#[test]
fn html_0() {
    let plain = "Some string without any character to escape";
    let escaped = html(plain);
    assert_eq!(plain, &escaped);
}
260
// `tex` must give back text that has nothing to escape unchanged.
#[test]
fn tex_0() {
    let plain = "Some string without any character to escape";
    let escaped = tex(plain);
    assert_eq!(plain, &escaped);
}
267
// `nb_spaces_html` must give back text without matching spaces unchanged.
#[test]
fn nb_spaces_0() {
    let plain = "Some string without any character to escape";
    let processed = nb_spaces_html(plain);
    assert_eq!(plain, &processed);
}
274
// `nb_spaces_tex` must give back text without non-breaking spaces unchanged.
#[test]
fn tex_nb_spaces_0() {
    let plain = "Some string without any character to escape";
    let processed = nb_spaces_tex(plain);
    assert_eq!(plain, &processed);
}
281
// `quotes` must give back text without double quotes unchanged.
#[test]
fn quotes_0() {
    let plain = "Some string without any character to escape";
    let processed = quotes(plain);
    assert_eq!(plain, &processed);
}
288
// Markup characters inside running text must come out as HTML entities.
#[test]
fn html_1() {
    let s = "<p>Some characters need escaping & something</p>";
    let expected = "&lt;p&gt;Some characters need escaping &amp; something&lt;/p&gt;";
    let actual = html(s);
    assert_eq!(expected, &actual);
}
296
// Several tags and an ampersand in one string are all escaped.
#[test]
fn html_2() {
    let actual = html("<foo> & <bar>");
    let expected = "&lt;foo&gt; &amp; &lt;bar&gt;";
    assert_eq!(&actual, expected);
}
303
// Backslash and curly braces are escaped by `tex`.
#[test]
fn tex_braces() {
    let escaped = tex(r"\foo{bar}");
    assert_eq!(&escaped, r"\textbackslash{}foo\{bar\}");
}
310
// Square brackets are wrapped in a group by `tex`.
#[test]
fn tex_square_braces() {
    let escaped = tex(r"foo[bar]");
    assert_eq!(&escaped, r"foo{[}bar{]}");
}
317
// Every dash that is followed by another dash gets a trailing `{}`.
#[test]
fn tex_dashes() {
    let escaped = tex("--foo, ---bar");
    assert_eq!(&escaped, r"-{}-foo, -{}-{}-bar");
}
324
// `$`, `%` and `#` are backslash-escaped by `tex`.
#[test]
fn tex_numbers() {
    let escaped = tex(r"30000$ is 10% of number #1 income");
    assert_eq!(&escaped, r"30000\$ is 10\% of number \#1 income");
}
331
// Double quotes are turned into single quotes.
#[test]
fn quotes_escape() {
    let converted = quotes("Some text with \"quotes\"");
    assert_eq!(&converted, "Some text with 'quotes'");
}
338
339
// A word followed by a narrow non-breaking space and punctuation is wrapped
// as a whole. The space is written as an escape so it stays visible in source.
#[test]
fn nnbsp_1() {
    let actual = nb_spaces_html("Test\u{202F}?");
    let expected = "<span class = \"nnbsp\">Test&#160;?</span>";
    assert_eq!(&actual, expected);
}
346
// Only the run containing narrow non-breaking spaces is wrapped; the words
// before it, separated by ordinary spaces, are left alone.
#[test]
fn nnbsp_2() {
    let actual = nb_spaces_html("Ceci est un «\u{202F}Test\u{202F}»\u{202F}!");
    let expected = "Ceci est un <span class = \"nnbsp\">«&#160;Test&#160;»&#160;!</span>";
    assert_eq!(&actual, expected);
}
353
// A form feed (U+000C) is dropped from the output of `html`.
#[test]
fn xml_chars() {
    let cleaned = html("Hey\u{000C}");
    assert_eq!(&cleaned, "Hey");
}