crowbook_text_processing/
clean.rs1use regex::Regex;
17
18use std::borrow::Cow;
19
20use crate::common::is_whitespace;
21
22
23
24pub fn whitespaces<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
34 lazy_static! {
35 static ref REGEX: Regex = Regex::new(r"[ \x{202F}\x{2002}]{2,}?").unwrap();
36 }
37 let input = input.into();
38 let first = REGEX.find(&input)
39 .map(|mat| mat.start());
40 if let Some(first) = first {
41 let mut new_s = String::with_capacity(input.len());
42 new_s.push_str(&input[0..first]);
43 let mut previous_space = false;
44 for c in input[first..].chars() {
45 if is_whitespace(c) {
46 if previous_space {
47 } else {
49 new_s.push(c);
50 previous_space = true;
51 }
52 } else {
53 previous_space = false;
54 new_s.push(c);
55 }
56 }
57 Cow::Owned(new_s)
58 } else {
59 input
60 }
61}
62
/// Coarse classification of a character, used to decide which way a quote
/// should curl.
///
/// The variants are ordered (`Whitespace < Punctuation < Alphanumeric`) and
/// callers compare them with `<` / `>`, so the declaration order matters.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd)]
enum CharClass {
    /// Lowest rank.
    Whitespace = 0,
    /// Anything that is neither whitespace nor alphanumeric.
    Punctuation = 1,
    /// Highest rank.
    Alphanumeric = 2,
}
70
71fn char_class(c: char) -> CharClass {
73 if c.is_alphanumeric() {
74 CharClass::Alphanumeric
75 } else if c.is_whitespace() {
76 CharClass::Whitespace
77 } else {
78 CharClass::Punctuation
79 }
80}
81
/// Replaces ASCII ellipses with typographic equivalents.
///
/// Two spellings are recognised, scanning left to right:
///
/// * `...` becomes the single character `…`;
/// * `. . . ` (dots separated by plain spaces, with a trailing space) keeps
///   its dots but has the inner spaces replaced by non-breaking spaces. The
///   trailing space also becomes non-breaking when the next character is
///   another `.`; otherwise it stays a plain space.
///
/// If neither spelling occurs, `input` is returned unchanged and nothing is
/// allocated. Otherwise an owned string is returned.
///
/// NOTE(review): the non-breaking spaces are written as `\u{a0}` escapes
/// because they cannot be told apart from plain spaces when written
/// literally. The constant names (`NB_ELLIPSIS`, `FULL_NB_ELLIPSIS`) are the
/// basis for using U+00A0 here; confirm this is the intended character.
pub fn ellipsis<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const DOTS: &str = "...";
    const SPACED_DOTS: &str = ". . . ";
    const UNICODE_ELLIPSIS: &str = "…";
    const NB_ELLIPSIS: &str = ".\u{a0}.\u{a0}. ";
    const FULL_NB_ELLIPSIS: &str = ".\u{a0}.\u{a0}.\u{a0}";

    let input = input.into();
    // Leftmost occurrence of either spelling; bail out early when there is none.
    let first = match (input.find(DOTS), input.find(SPACED_DOTS)) {
        (Some(a), Some(b)) => a.min(b),
        (Some(a), None) | (None, Some(a)) => a,
        (None, None) => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    let mut rest = &input[first..];
    // Both spellings start with '.', so jumping from dot to dot visits every
    // position where a replacement can begin.
    while let Some(pos) = rest.find('.') {
        output.push_str(&rest[..pos]);
        let tail = &rest[pos..];
        if tail.starts_with(DOTS) {
            output.push_str(UNICODE_ELLIPSIS);
            rest = &tail[DOTS.len()..];
        } else if tail.starts_with(SPACED_DOTS) {
            let after = &tail[SPACED_DOTS.len()..];
            // A following '.' must not be separated from the ellipsis by a
            // breakable space.
            output.push_str(if after.starts_with('.') {
                FULL_NB_ELLIPSIS
            } else {
                NB_ELLIPSIS
            });
            rest = after;
        } else {
            // A lone dot: copy it and move on.
            output.push('.');
            rest = &tail[1..];
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
130
131
132pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
149 lazy_static! {
150 static ref REGEX: Regex = Regex::new("[\"\']").unwrap();
151 }
152 let input = input.into();
153 let first = REGEX.find(&input)
154 .map(|mat| mat.start());
155 if let Some(mut first) = first {
156 let mut new_s = String::with_capacity(input.len());
157 if first > 0 {
158 first -= 1;
161 while !input.is_char_boundary(first) {
163 first -= 1;
164 }
165 }
166 new_s.push_str(&input[0..first]);
167 let mut chars = input[first..].chars().collect::<Vec<_>>();
168 let mut closing_quote = None;
169 let mut opened_doubles = 0;
170 for i in 0..chars.len() {
171 let c = chars[i];
172 let has_opened_quote = if let Some(n) = closing_quote {
173 i <= n
174 } else {
175 false
176 };
177 match c {
178 '"' => {
179 let prev = if i > 0 {
180 char_class(chars[i - 1])
181 } else {
182 CharClass::Whitespace
183 };
184 let next = if i < chars.len() - 1 {
185 char_class(chars[i + 1])
186 } else {
187 CharClass::Whitespace
188 };
189
190 if prev < next {
191 opened_doubles += 1;
192 new_s.push('“');
193 } else if opened_doubles > 0 {
194 opened_doubles -= 1;
195 new_s.push('”');
196 } else {
197 new_s.push('"');
198 }
199 }
200 '\'' => {
201 let prev = if i > 0 {
202 char_class(chars[i - 1])
203 } else {
204 CharClass::Whitespace
205 };
206 let next = if i < chars.len() - 1 {
207 char_class(chars[i + 1])
208 } else {
209 CharClass::Whitespace
210 };
211
212 let replacement = match (prev, next) {
213 (CharClass::Alphanumeric, CharClass::Alphanumeric)
215=> '’',
217
218 (x, y) if x < y
220 => {
221 let mut is_next_closing = false;
222 for j in (i + 1)..chars.len() {
223 if chars[j] == '\'' {
224 if chars[j-1].is_whitespace() {
225 continue;
226 } else if j >= chars.len() - 1
227 || char_class(chars[j+1]) != CharClass::Alphanumeric {
228 is_next_closing = true;
229 closing_quote = Some(j);
230 chars[j] = '’';
231 break;
232 }
233 }
234 }
235 if is_next_closing && !has_opened_quote {
236 '‘'
237 } else {
238 '’'
239 }
240 }
241
242 (x, y) if x > y
244 => {
245 '’'
246 },
247 _ => '\'',
248 };
249 new_s.push(replacement);
250 }
251 _ => new_s.push(c),
252 }
253 }
254 Cow::Owned(new_s)
255 } else {
256 input
257 }
258}
259
260
/// Replaces runs of ASCII hyphens with typographic dashes.
///
/// Scanning left to right, `---` becomes an em dash (`—`) and `--` becomes an
/// en dash (`–`); a single `-` is left alone. Three hyphens are consumed
/// before two, so `----` yields `—-` and `-----` yields `—–`.
///
/// If the input contains no `--` it is returned unchanged and nothing is
/// allocated; otherwise an owned string is returned.
pub fn dashes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const EN_DASH: char = '–';
    const EM_DASH: char = '—';

    let input = input.into();
    let first = match input.find("--") {
        Some(pos) => pos,
        None => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    let mut rest = &input[first..];
    while let Some(pos) = rest.find("--") {
        output.push_str(&rest[..pos]);
        rest = &rest[pos + 2..];
        // A third hyphen upgrades the en dash to an em dash.
        if rest.starts_with('-') {
            output.push(EM_DASH);
            rest = &rest[1..];
        } else {
            output.push(EN_DASH);
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
307
/// Replaces `<<` with `«` and `>>` with `»`.
///
/// The scan runs left to right and pairs are consumed as soon as they are
/// seen, so `<<<` yields `«<`. Single `<` or `>` characters are left alone.
///
/// If the input contains neither `<<` nor `>>` it is returned unchanged and
/// nothing is allocated; otherwise an owned string is returned.
pub fn guillemets<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const OPENING: &str = "<<";
    const CLOSING: &str = ">>";
    const OPENING_GUILLEMET: char = '«';
    const CLOSING_GUILLEMET: char = '»';

    let input = input.into();
    // Leftmost occurrence of either pair; bail out early when there is none.
    let first = match (input.find(OPENING), input.find(CLOSING)) {
        (Some(a), Some(b)) => a.min(b),
        (Some(a), None) | (None, Some(a)) => a,
        (None, None) => return input,
    };

    let mut output = String::with_capacity(input.len());
    output.push_str(&input[..first]);

    let mut rest = &input[first..];
    while let Some(pos) = rest.find(|c: char| c == '<' || c == '>') {
        output.push_str(&rest[..pos]);
        let tail = &rest[pos..];
        if tail.starts_with(OPENING) {
            output.push(OPENING_GUILLEMET);
            rest = &tail[OPENING.len()..];
        } else if tail.starts_with(CLOSING) {
            output.push(CLOSING_GUILLEMET);
            rest = &tail[CLOSING.len()..];
        } else {
            // A lone angle bracket (one ASCII byte): copy it through.
            output.push_str(&tail[..1]);
            rest = &tail[1..];
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
354
355
356
/// Runs of spaces are reduced to one space each; leading and trailing
/// whitespace is reduced too, but not trimmed away.
#[test]
fn whitespaces_1() {
    let s = "   Remove    supplementary   spaces    but    don't   trim     either   ";
    // Guard: the input must really contain repeated spaces, otherwise this
    // test would pass even if `whitespaces` did nothing.
    assert!(s.contains("  "));
    let res = whitespaces(s);
    assert_eq!(&res, " Remove supplementary spaces but don't trim either ");
}
363
/// Single quotes with a space on both sides are not converted.
#[test]
fn quotes_1() {
    let original = "Some string without ' typographic ' quotes";
    assert_eq!(quotes(original), original);
}
370
/// A word wrapped in double or single quotes gets matching curly quotes.
#[test]
fn quotes_2() {
    let doubles = quotes("\"foo\"");
    assert_eq!(doubles, "“foo”");

    let singles = quotes("'foo'");
    assert_eq!(singles, "‘foo’");
}
378
/// A leading quote with no closing partner is treated as an apostrophe.
#[test]
fn quotes_3() {
    assert_eq!(quotes("\'mam, how are you?"), "’mam, how are you?");
}
384
/// Single-quoted one-character items, including a punctuation character.
#[test]
fn quotes_4() {
    let converted = quotes("some char: 'c', '4', '&'");
    assert_eq!(converted, "some char: ‘c’, ‘4’, ‘&’");
}
390
/// An apostrophe inside a word and a quoted word in the same sentence.
#[test]
fn quotes_5() {
    let converted = quotes("It's a good day to say 'hi'");
    assert_eq!(converted, "It’s a good day to say ‘hi’");
}
396
/// A quote before a number with no closing partner stays an apostrophe.
#[test]
fn quotes_6() {
    let converted = quotes("The '60s were nice, weren't they?");
    assert_eq!(converted, "The ’60s were nice, weren’t they?");
}
402
/// A quote after a word and before a space becomes a closing apostrophe.
#[test]
fn quotes_7() {
    assert_eq!(quotes("Plurals' possessive"), "Plurals’ possessive");
}
408
/// Single quotes nested in double quotes, with an apostrophe inside the
/// single-quoted span.
#[test]
fn quotes_8() {
    let converted = quotes("\"I like 'That '70s show'\", she said");
    assert_eq!(converted, "“I like ‘That ’70s show’”, she said");
}
414
415
/// Single-quoted punctuation characters.
#[test]
fn quotes_9() {
    let converted = quotes("some char: '!', '?', ','");
    assert_eq!(converted, "some char: ‘!’, ‘?’, ‘,’");
}
421
/// Double quotes containing single quotes containing double quotes.
#[test]
fn quotes_10() {
    let converted = quotes("\"'Let's try \"nested\" quotes,' he said.\"");
    assert_eq!(converted, "“‘Let’s try “nested” quotes,’ he said.”");
}
427
/// An apostrophe directly after a closing double quote.
#[test]
fn quotes_11() {
    let converted = quotes("Enhanced \"quotes\"'s heuristics");
    assert_eq!(converted, "Enhanced “quotes”’s heuristics");
}
433
/// An opening double quote directly after ASCII hyphens.
#[test]
fn quotes_12() {
    let converted = quotes("A double quote--\"within\" dashes--would be nice.");
    assert_eq!(converted, "A double quote--“within” dashes--would be nice.");
}
439
/// An opening double quote directly after an en dash.
#[test]
fn quotes_13() {
    let converted = quotes("A double quote–\"within\" dashes–would be nice.");
    assert_eq!(converted, "A double quote–“within” dashes–would be nice.");
}
445
446
/// Three dots at the end of the input become one ellipsis character.
#[test]
fn ellipsis_0() {
    assert_eq!(ellipsis("Foo..."), "Foo…");
}
452
/// Three dots in the middle of the input; the text after them is kept.
#[test]
fn ellipsis_1() {
    assert_eq!(ellipsis("Foo... Bar"), "Foo… Bar");
}
458
/// Four dots: the first three form the ellipsis, the fourth stays a dot.
#[test]
fn ellipsis_2() {
    assert_eq!(ellipsis("foo...."), "foo….");
}
464
/// A spaced ellipsis keeps its dots; the spaces between them are expected to
/// become non-breaking and the trailing space to stay a plain space.
///
/// NOTE(review): the non-breaking spaces are written as `\u{a0}` so the
/// expectation is visibly different from the input — confirm U+00A0 is the
/// character `ellipsis` is meant to emit.
#[test]
fn ellipsis_3() {
    let s = ellipsis("foo. . . ");
    assert_eq!(&s, "foo.\u{a0}.\u{a0}. ");
}
470
/// A spaced ellipsis followed by another dot: every space, including the one
/// before the final dot, is expected to become non-breaking.
///
/// NOTE(review): the non-breaking spaces are written as `\u{a0}` so the
/// expectation is visibly different from the input — confirm U+00A0 is the
/// character `ellipsis` is meant to emit.
#[test]
fn ellipsis_4() {
    let s = ellipsis("foo. . . .");
    assert_eq!(&s, "foo.\u{a0}.\u{a0}.\u{a0}.");
}
476
/// Two dots are not an ellipsis and are left alone.
#[test]
fn ellipsis_5() {
    assert_eq!(ellipsis("foo.."), "foo..");
}
482
/// A single hyphen is left alone.
#[test]
fn dashes_0() {
    assert_eq!(dashes("foo - bar"), "foo - bar");
}
488
/// Two hyphens become an en dash.
#[test]
fn dashes_1() {
    assert_eq!(dashes("foo -- bar"), "foo – bar");
}
494
/// Three hyphens become an em dash.
#[test]
fn dashes_2() {
    assert_eq!(dashes("foo --- bar"), "foo — bar");
}
500
/// An em dash and a trailing en dash in the same input.
#[test]
fn dashes_3() {
    assert_eq!(dashes("foo --- bar--"), "foo — bar–");
}
506
/// Both an opening and a closing pair are converted.
#[test]
fn guillemets_1() {
    assert_eq!(guillemets("<< Foo >>"), "« Foo »");
}
512
/// An opening pair on its own is converted.
#[test]
fn guillemets_2() {
    assert_eq!(guillemets("<< Foo"), "« Foo");
}
518
/// A closing pair on its own is converted.
#[test]
fn guillemets_3() {
    assert_eq!(guillemets("Foo >>"), "Foo »");
}
524
/// A lone `<` between two converted pairs is left alone.
#[test]
fn guillemets_4() {
    assert_eq!(guillemets("<< Foo < Bar >>"), "« Foo < Bar »");
}