crowbook_text_processing/
clean.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with
3// this file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
//! This module provides functions to "clean" a text typographically.
6//!
7//! # Example
8//!
9//! ```
10//! use crowbook_text_processing::clean;
11//! let input = "Some  'text'  whose formatting  could be enhanced...";
12//! let output = clean::quotes(clean::ellipsis(clean::whitespaces(input)));
13//! assert_eq!(&output, "Some ‘text’ whose formatting could be enhanced…");
14//! ```
15
16use regex::Regex;
17
18use std::borrow::Cow;
19
20use crate::common::is_whitespace;
21
22
23
24/// Removes unnecessary whitespaces from a String.
25///
26/// # Example
27///
28/// ```
29/// use crowbook_text_processing::clean;
30/// let s = clean::whitespaces("  A  string   with   more   whitespaces  than  needed   ");
31/// assert_eq!(&s, " A string with more whitespaces than needed ");
32/// ```
33pub fn whitespaces<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
34    lazy_static! {
35        static ref REGEX: Regex = Regex::new(r"[  \x{202F}\x{2002}]{2,}?").unwrap();
36    }
37    let input = input.into();
38    let first = REGEX.find(&input)
39        .map(|mat| mat.start());
40    if let Some(first) = first {
41        let mut new_s = String::with_capacity(input.len());
42        new_s.push_str(&input[0..first]);
43        let mut previous_space = false;
44        for c in input[first..].chars() {
45            if is_whitespace(c) {
46                if previous_space {
47                    // previous char already a space, don't copy it
48                } else {
49                    new_s.push(c);
50                    previous_space = true;
51                }
52            } else {
53                previous_space = false;
54                new_s.push(c);
55            }
56        }
57        Cow::Owned(new_s)
58    } else {
59        input
60    }
61}
62
/// Class of a character
///
/// The derived `PartialOrd` orders the variants as
/// `Whitespace < Punctuation < Alphanumeric`. `quotes` compares the classes
/// of the characters around a quote with `<` and `>`, so the declaration
/// order of the variants matters.
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Copy)]
enum CharClass {
    /// Character for which `char::is_whitespace` is true
    Whitespace = 0,
    /// Any character that is neither alphanumeric nor whitespace
    Punctuation,
    /// Character for which `char::is_alphanumeric` is true
    Alphanumeric,
}
70
71/// Get class of a character
72fn char_class(c: char) -> CharClass {
73    if c.is_alphanumeric() {
74        CharClass::Alphanumeric
75    } else if c.is_whitespace() {
76        CharClass::Whitespace
77    } else {
78        CharClass::Punctuation
79    }
80}
81
/// Replace ellipsis (...) with the appropriate unicode character
///
/// Two forms are recognized:
///
/// * three consecutive dots (`...`), replaced by the unicode ellipsis (`…`);
/// * spaced dots (`. . . `, each dot followed by a regular space), whose
///   inner spaces are replaced by non-breaking spaces so the dots can't be
///   split across lines. The trailing space stays a regular space, unless it
///   is directly followed by another dot, in which case it becomes
///   non-breaking too.
///
/// If the input contains neither form, it is returned as is, without
/// building a new `String`.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::clean;
/// let s = clean::ellipsis("foo...");
/// assert_eq!(&s, "foo…");
/// let s = clean::ellipsis("foo. . . ");
/// assert_eq!(&s, "foo.\u{a0}.\u{a0}. "); // non breaking spaces
/// ```
pub fn ellipsis<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    // Patterns that are looked for (regular spaces only)
    const DOTS: &str = "...";
    const SPACED_DOTS: &str = ". . . ";
    // Replacements. Non-breaking spaces are written as escapes so they can't
    // be silently turned into regular spaces by an editor.
    const UNICODE_ELLIPSIS: &str = "…";
    const NB_ELLIPSIS: &str = ".\u{a0}.\u{a0}. ";
    const FULL_NB_ELLIPSIS: &str = ".\u{a0}.\u{a0}.\u{a0}";

    let input = input.into();
    if !input.contains(DOTS) && !input.contains(SPACED_DOTS) {
        return input;
    }

    let mut output = String::with_capacity(input.len());
    let mut rest: &str = &input;
    // Both patterns start with a dot, so jump from dot to dot and copy
    // everything in between untouched.
    while let Some(dot) = rest.find('.') {
        output.push_str(&rest[..dot]);
        rest = &rest[dot..];
        if rest.starts_with(DOTS) {
            output.push_str(UNICODE_ELLIPSIS);
            rest = &rest[DOTS.len()..];
        } else if rest.starts_with(SPACED_DOTS) {
            rest = &rest[SPACED_DOTS.len()..];
            // When yet another dot follows, keep it attached to the
            // previous ones with a non-breaking space as well
            if rest.starts_with('.') {
                output.push_str(FULL_NB_ELLIPSIS);
            } else {
                output.push_str(NB_ELLIPSIS);
            }
        } else {
            // Lone dot: copy it and move on
            output.push('.');
            rest = &rest[1..];
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
130
131
132/// Replace straight quotes with more typographic variants
133///
134/// While it should work pretty well for double quotes (`"`), the rules for single
135/// quote (`'`) are more ambiguous, as it can be a quote or an apostrophe and it's not
136/// that easy (and, in some circumstances, impossible without understanding the meaning
137/// of the text) to get right.
138///
139/// # Example
140///
141/// ```
142/// use crowbook_text_processing::clean;
143/// let s = clean::quotes("\"foo\"");
144/// assert_eq!(&s, "“foo”");
145/// let s = clean::quotes("'foo'");
146/// assert_eq!(&s, "‘foo’");
147/// ```
148pub fn quotes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
149    lazy_static! {
150        static ref REGEX: Regex = Regex::new("[\"\']").unwrap();
151    }
152    let input = input.into();
153    let first = REGEX.find(&input)
154        .map(|mat| mat.start());
155    if let Some(mut first) = first {
156        let mut new_s = String::with_capacity(input.len());
157        if first > 0 {
158            // Move one step backward since we might need to know if previous char was
159            // a letter or not
160            first -= 1;
161            // Check that it is a character boundary, else go backward
162            while !input.is_char_boundary(first) {
163                first -= 1;
164            }
165        }
166        new_s.push_str(&input[0..first]);
167        let mut chars = input[first..].chars().collect::<Vec<_>>();
168        let mut closing_quote = None;
169        let mut opened_doubles = 0;
170        for i in 0..chars.len() {
171            let c = chars[i];
172            let has_opened_quote = if let Some(n) = closing_quote {
173                i <= n
174            } else {
175                false
176            };
177            match c {
178                '"' => {
179                    let prev = if i > 0 {
180                        char_class(chars[i - 1])
181                    } else {
182                        CharClass::Whitespace
183                    };
184                    let next = if i < chars.len() - 1 {
185                        char_class(chars[i + 1])
186                    } else {
187                        CharClass::Whitespace
188                    };
189
190                    if prev < next {
191                        opened_doubles += 1;
192                        new_s.push('“');
193                    } else if opened_doubles > 0 {
194                        opened_doubles -= 1;
195                        new_s.push('”');
196                    } else {
197                        new_s.push('"');
198                    }
199                }
200                '\'' => {
201                    let prev = if i > 0 {
202                        char_class(chars[i - 1])
203                    } else {
204                        CharClass::Whitespace
205                    };
206                    let next = if i < chars.len() - 1 {
207                        char_class(chars[i + 1])
208                    } else {
209                        CharClass::Whitespace
210                    };
211
212                    let replacement = match (prev, next) {
213                        // Elision or possessive
214                        (CharClass::Alphanumeric, CharClass::Alphanumeric)
215//                            | (CharClass::Punctuation, CharClass::Alphanumeric)
216                            => '’',
217
218                        // Beginning of word, it's opening (not always though)
219                        (x, y) if x < y
220                            => {
221                                let mut is_next_closing = false;
222                                for j in (i + 1)..chars.len() {
223                                    if chars[j] == '\'' {
224                                        if chars[j-1].is_whitespace() {
225                                            continue;
226                                        } else if j >= chars.len() - 1
227                                            || char_class(chars[j+1]) != CharClass::Alphanumeric {
228                                                is_next_closing = true;
229                                                closing_quote = Some(j);
230                                                chars[j] = '’';
231                                                break;
232                                            }
233                                    }
234                                }
235                                if is_next_closing && !has_opened_quote {
236                                    '‘'
237                                } else {
238                                    '’'
239                                }
240                            }
241
242                        // Apostrophe at end of word, it's closing
243                        (x, y) if x > y
244                            => {
245                                '’'
246                            },
247                        _ => '\'',
248                    };
249                    new_s.push(replacement);
250                }
251                _ => new_s.push(c),
252            }
253        }
254        Cow::Owned(new_s)
255    } else {
256        input
257    }
258}
259
260
/// Replace double dashes (`--`) and triple dashes (`---`) with an en dash (`–`)
/// and an em dash (`—`), respectively.
///
/// The text is read from left to right and the longest form wins: `---` is
/// always an em dash, never an en dash followed by a hyphen. Single hyphens
/// are left alone.
///
/// This function can be useful when writing literary texts, but should be used with caution
/// as double and triple dashes can have special meanings.
///
/// If the input contains no double dash, it is returned as is, without
/// building a new `String`.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::clean;
/// let s = clean::dashes("--- Hi, he said -- unexpectedly");
/// assert_eq!(&s, "— Hi, he said – unexpectedly");
/// ```
pub fn dashes<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const DOUBLE_DASH: &str = "--";
    const EN_DASH: char = '–';
    const EM_DASH: char = '—';

    let input = input.into();
    if !input.contains(DOUBLE_DASH) {
        return input;
    }

    let mut output = String::with_capacity(input.len());
    let mut rest: &str = &input;
    while let Some(position) = rest.find(DOUBLE_DASH) {
        // Copy the text before the dashes untouched
        output.push_str(&rest[..position]);
        let after = &rest[position + DOUBLE_DASH.len()..];
        if after.starts_with('-') {
            // Third dash in a row: em dash
            output.push(EM_DASH);
            rest = &after[1..];
        } else {
            output.push(EN_DASH);
            rest = after;
        }
    }
    output.push_str(rest);
    Cow::Owned(output)
}
307
/// Replaces `<<` with `«` and `>>` with `»`.
///
/// This can be useful if you need those characters (e.g. for french texts) but
/// don't have an easy access to them on your computer but, as the `dashes` function,
/// it should be used with caution, as `<<` and `>>` can also be used for other things
/// (typically to mean "very inferior to" or "very superior to").
///
/// The text is read from left to right, two characters at a time: `<<<`
/// becomes `«<`. Lone `<` and `>` are left alone.
///
/// If the input contains neither `<<` nor `>>`, it is returned as is, without
/// building a new `String`.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::clean;
/// let s = clean::guillemets("<< Foo >>");
/// assert_eq!(&s, "« Foo »");
/// ```
pub fn guillemets<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
    const OPENING_GUILLEMET: char = '«';
    const CLOSING_GUILLEMET: char = '»';

    let input = input.into();
    if !input.contains("<<") && !input.contains(">>") {
        return input;
    }

    let mut output = String::with_capacity(input.len());
    let bytes = input.as_bytes();
    // Start of the text that has been read but not copied to `output` yet
    let mut pending = 0;
    let mut i = 0;
    while i + 1 < bytes.len() {
        let guillemet = match (bytes[i], bytes[i + 1]) {
            (b'<', b'<') => Some(OPENING_GUILLEMET),
            (b'>', b'>') => Some(CLOSING_GUILLEMET),
            _ => None,
        };
        match guillemet {
            Some(replacement) => {
                // `i` points to an ASCII character, so both `pending` and `i`
                // are char boundaries and slicing can't panic
                output.push_str(&input[pending..i]);
                output.push(replacement);
                i += 2;
                pending = i;
            }
            None => i += 1,
        }
    }
    output.push_str(&input[pending..]);
    Cow::Owned(output)
}
354
355
356
// Runs of spaces are squeezed, but the ends of the text are not trimmed
#[test]
fn whitespaces_1() {
    let input = "   Remove    supplementary   spaces    but    don't    trim     either   ";
    let expected = " Remove supplementary spaces but don't trim either ";
    assert_eq!(&whitespaces(input), expected);
}
363
// Single quotes surrounded by spaces are left untouched
#[test]
fn quotes_1() {
    let unchanged = "Some string without ' typographic ' quotes";
    assert_eq!(&quotes(unchanged), unchanged);
}

// Plain pair of double quotes, then plain pair of single quotes
#[test]
fn quotes_2() {
    assert_eq!(&quotes("\"foo\""), "“foo”");
    assert_eq!(&quotes("'foo'"), "‘foo’");
}

// Leading apostrophe without any matching closing quote
#[test]
fn quotes_3() {
    assert_eq!(&quotes("\'mam, how are you?"), "’mam, how are you?");
}

// Quoted single characters
#[test]
fn quotes_4() {
    assert_eq!(&quotes("some char: 'c', '4', '&'"), "some char: ‘c’, ‘4’, ‘&’");
}

// Apostrophe inside a word and quoted word in the same sentence
#[test]
fn quotes_5() {
    assert_eq!(&quotes("It's a good day to say 'hi'"), "It’s a good day to say ‘hi’");
}

// Apostrophe in front of a number
#[test]
fn quotes_6() {
    assert_eq!(
        &quotes("The '60s were nice, weren't they?"),
        "The ’60s were nice, weren’t they?"
    );
}

// Apostrophe at the end of a word
#[test]
fn quotes_7() {
    assert_eq!(&quotes("Plurals' possessive"), "Plurals’ possessive");
}

// Single quotes nested in double quotes, with an apostrophe in between
#[test]
fn quotes_8() {
    assert_eq!(
        &quotes("\"I like 'That '70s show'\", she said"),
        "“I like ‘That ’70s show’”, she said"
    );
}

// Quoted punctuation characters
#[test]
fn quotes_9() {
    assert_eq!(&quotes("some char: '!', '?', ','"), "some char: ‘!’, ‘?’, ‘,’");
}

// Double quotes nested in single quotes nested in double quotes
#[test]
fn quotes_10() {
    assert_eq!(
        &quotes("\"'Let's try \"nested\" quotes,' he said.\""),
        "“‘Let’s try “nested” quotes,’ he said.”"
    );
}

// Apostrophe right after a closing double quote
#[test]
fn quotes_11() {
    assert_eq!(
        &quotes("Enhanced \"quotes\"'s heuristics"),
        "Enhanced “quotes”’s heuristics"
    );
}

// Opening double quote right after ASCII double dashes
#[test]
fn quotes_12() {
    assert_eq!(
        &quotes("A double quote--\"within\" dashes--would be nice."),
        "A double quote--“within” dashes--would be nice."
    );
}

// Opening double quote right after a (multi-byte) en dash
#[test]
fn quotes_13() {
    assert_eq!(
        &quotes("A double quote–\"within\" dashes–would be nice."),
        "A double quote–“within” dashes–would be nice."
    );
}
445
446
// Three dots at the end of the text
#[test]
fn ellipsis_0() {
    assert_eq!(&ellipsis("Foo..."), "Foo…");
}

// Three dots in the middle of the text
#[test]
fn ellipsis_1() {
    assert_eq!(&ellipsis("Foo... Bar"), "Foo… Bar");
}

// Four dots: the first three become an ellipsis, the last one is kept
#[test]
fn ellipsis_2() {
    assert_eq!(&ellipsis("foo...."), "foo….");
}

// Spaced dots: inner spaces become non-breaking, the trailing one doesn't.
// Non-breaking spaces are written as escapes so they stay visible.
#[test]
fn ellipsis_3() {
    assert_eq!(&ellipsis("foo. . . "), "foo.\u{a0}.\u{a0}. ");
}

// Spaced dots followed by a dot: all the spaces become non-breaking
#[test]
fn ellipsis_4() {
    assert_eq!(&ellipsis("foo. . . ."), "foo.\u{a0}.\u{a0}.\u{a0}.");
}

// Two dots are not an ellipsis
#[test]
fn ellipsis_5() {
    assert_eq!(&ellipsis("foo.."), "foo..");
}
482
// A single hyphen is left alone
#[test]
fn dashes_0() {
    assert_eq!(&dashes("foo - bar"), "foo - bar");
}

// Double dash becomes an en dash
#[test]
fn dashes_1() {
    assert_eq!(&dashes("foo -- bar"), "foo – bar");
}

// Triple dash becomes an em dash
#[test]
fn dashes_2() {
    assert_eq!(&dashes("foo --- bar"), "foo — bar");
}

// Both forms in the same text, the last one ending the text
#[test]
fn dashes_3() {
    assert_eq!(&dashes("foo --- bar--"), "foo — bar–");
}
506    
// Opening and closing guillemets
#[test]
fn guillemets_1() {
    assert_eq!(&guillemets("<< Foo >>"), "« Foo »");
}

// Opening guillemet only
#[test]
fn guillemets_2() {
    assert_eq!(&guillemets("<< Foo"), "« Foo");
}

// Closing guillemet only
#[test]
fn guillemets_3() {
    assert_eq!(&guillemets("Foo >>"), "Foo »");
}

// A lone `<` between guillemets is left alone
#[test]
fn guillemets_4() {
    assert_eq!(&guillemets("<< Foo < Bar >>"), "« Foo < Bar »");
}
527    let s = guillemets("<< Foo < Bar >>");
528    assert_eq!(&s, "« Foo < Bar »");
529}