crowbook_text_processing/
french.rs

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with
// this file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5use std::borrow::Cow;
6use std::default::Default;
7
8use crate::common::{NB_CHAR, NB_CHAR_NARROW, NB_CHAR_EM};
9use crate::common::is_whitespace;
10use crate::clean;
11use crate::escape;
12
13
14
/// French typographic formatter.
///
/// The purpose of this struct is to try to make a text more typographically correct,
/// according to French typographic rules. This means:
///
/// * making spaces before `?`, `!`, `;` narrow non-breaking spaces;
/// * making spaces before `:` non-breaking spaces;
/// * making the space after `—` for dialog a demi em space;
/// * making spaces after `«` and before `»` non-breaking spaces or narrow non-breaking spaces,
///   according to the circumstances (dialog or a few quoted words);
/// * making spaces in numbers, e.g. `80 000` or `50 €`, narrow and non-breaking.
///
/// Additionally, this formatter uses functions that are "generic" (not specific to the French
/// language) in order to:
///
/// * replace straight quotes (`'` and `"`) with curly, typographic ones;
/// * replace ellipsis (`...`) with the unicode character (`…`).
///
/// As some of these features require a bit of guessing sometimes, there are some parameters that
/// can be set if you want better results.
///
/// The setters borrow the formatter mutably and return `&mut Self`; since the type is `Clone`,
/// a configured chain can be turned into an owned value with a final `.clone()`.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::FrenchFormatter;
/// let input = "Un texte à 'formater', n'est-ce pas ?";
/// let output = FrenchFormatter::new()
///              .typographic_ellipsis(false) // don't replace ellipsis
///              .format_tex(input); // format to tex (so non-breaking
///                                  // spaces are visible in assert_eq!)
/// assert_eq!(&output, "Un texte à ‘formater’, n’est-ce pas\\,?");
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FrenchFormatter {
    /// After that number of characters, assume it's not a currency
    threshold_currency: usize,
    /// After that number of characters, assume it's not a unit
    threshold_unit: usize,
    /// After that number of characters, assume it is a dialog
    threshold_quote: usize,
    /// After that number of characters, assume it isn't an abbreviation
    threshold_real_word: usize,
    /// Enable typographic quotes and apostrophes
    typographic_quotes: bool,
    /// Enable typographic ellipsis
    typographic_ellipsis: bool,
    /// Enable dashes replacement
    ligature_dashes: bool,
    /// Enable guillemets replacement
    ligature_guillemets: bool,
}
66
67impl Default for FrenchFormatter {
68    fn default() -> Self {
69        FrenchFormatter {
70            threshold_currency: 3,
71            threshold_unit: 2,
72            threshold_quote: 20,
73            threshold_real_word: 3,
74            typographic_quotes: true,
75            typographic_ellipsis: true,
76            ligature_dashes: false,
77            ligature_guillemets: false,
78        }
79    }
80}
81
82impl FrenchFormatter {
83    /// Create a new FrenchFormatter with default settings
84    pub fn new() -> Self {
85        Self::default()
86    }
87
88    /// Sets the threshold currency.
89    ///
90    /// After that number of characters, assume it's not a currency
91    ///
92    /// Default is `3`.
93    pub fn threshold_currency(&mut self, t: usize) -> &mut Self {
94        self.threshold_currency = t;
95        self
96    }
97
98    /// Sets the threshold for unit.
99    ///
100    /// After that number of characters, assume it's not an unit.
101    ///
102    /// Default is `2`.
103    pub fn threshold_unit(&mut self, t: usize) -> &mut Self {
104        self.threshold_unit = t;
105        self
106    }
107
108    /// Sets the threshold for quote.
109    ///
110    /// After that number of characters, assume it's not a quote of a single
111    /// word or a few words, but a dialog.
112    ///
113    /// Default is `20`.
114    pub fn threshold_quote(&mut self, t: usize) -> &mut Self {
115        self.threshold_quote = t;
116        self
117    }
118
119    /// Sets the threshold for real word.
120    ///
121    /// After that number of characters, assume it's not an abbreviation
122    /// but a real word (used to determine if `.` marks the end of a sentence
123    /// or just a title such as `M. Dupuis`.
124    ///
125    /// Default is `3`
126    pub fn threshold_real_word(&mut self, t: usize) -> &mut Self {
127        self.threshold_real_word = t;
128        self
129    }
130
131    /// Enables the typographic quotes replacement.
132    ///
133    /// If true, "L'" will be replaced by "L’"
134    ///
135    /// Default is true
136    pub fn typographic_quotes(&mut self, b: bool) -> &mut Self {
137        self.typographic_quotes = b;
138        self
139    }
140
141    /// Enables typographic ellipsis replacement.
142    ///
143    /// If true, "..." will be replaced by "…"
144    ///
145    /// Default is true
146    pub fn typographic_ellipsis(&mut self, b: bool) -> &mut Self {
147        self.typographic_ellipsis = b;
148        self
149    }
150
151    /// If set to true, replaces `--`to `–` and `---` to `—`.
152    ///
153    /// Default is false.
154    pub fn ligature_dashes(&mut self, b: bool) -> &mut Self {
155        self.ligature_dashes = b;
156        self
157    }
158
159    /// If set to true, replaces `<<` to `«` and `>>` to `»`.
160    ///
161    /// Default is false.
162    pub fn ligature_guillemets(&mut self, b: bool) -> &mut Self {
163        self.ligature_guillemets = b;
164        self
165    }
166
167    /// (Try to) Format a string according to french typographic rules.
168    ///
169    /// This method should be called for each paragraph, as it makes some suppositions that
170    /// the beginning of the string also means the beginning of a line.
171    ///
172    /// This method calls `remove_whitespaces` internally, as it relies on it.
173    ///
174    /// # Example
175    ///
176    /// ```
177    /// use crowbook_text_processing::FrenchFormatter;
178    /// let f = FrenchFormatter::new();
179    /// let s = f.format("« Est-ce bien formaté ? » se demandait-elle — les espaces \
180    ///                   insécables étaient tellement compliquées à gérer,
181    ///                   dans cette langue !");
182    /// println!("{}", s);
183    /// ```
184    pub fn format<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
185        let mut input = clean::whitespaces(input); // first pass to remove whitespaces
186
187        if self.ligature_dashes {
188            input = clean::dashes(input);
189        }
190
191        if self.ligature_guillemets {
192            input = clean::guillemets(input);
193        }
194
195        if self.typographic_quotes {
196            input = clean::quotes(input);
197        }
198
199        if self.typographic_ellipsis {
200            input = clean::ellipsis(input);
201        }
202
203        // Find first characters that are trouble
204        let first = input.chars().position(is_trouble);
205        let first_number = input.chars().position(|c| c.is_digit(10));
206
207        // No need to do anything, return early
208        if first.is_none() && first_number.is_none() {
209            return input;
210        }
211
212        let (nb_char, nb_char_em, nb_char_narrow) = (NB_CHAR, NB_CHAR_EM, NB_CHAR_NARROW);
213
214        let mut chars = input.chars().collect::<Vec<_>>();
215        let mut is_number_series = false;
216
217        // Handle numbers
218        if let Some(first) = first_number {
219            // Go back one step
220            let first = if first > 1 { first - 1 } else { 0 };
221            for i in first..(chars.len() - 1) {
222                // Handle numbers (that's easy)
223                let current = chars[i];
224                let next = chars[i + 1];
225
226                match current {
227                    '0'..='9' => {
228                        if i == 0 || !chars[i - 1].is_alphabetic() {
229                            is_number_series = true;
230                        }
231                    }
232                    c if c.is_whitespace() => {
233                        if is_number_series &&
234                           (next.is_digit(10) || self.char_is_symbol(&chars, i + 1)) {
235                            // Next char is a number or symbol such as $, and previous was number
236                            chars[i] = nb_char_narrow;
237                        }
238                    }
239                    _ => {
240                        is_number_series = false;
241                    }
242                }
243            }
244        }
245
246        // Handle the rest
247        if let Some(first) = first {
248            // Go back one step
249            let first = if first > 1 { first - 1 } else { 0 };
250            for i in first..(chars.len() - 1) {
251                let current = chars[i];
252                let next = chars[i + 1];
253                if is_whitespace(current) {
254                    match next {
255                        // handle narrow nb space before char
256                        '?' | '!' | ';' => chars[i] = nb_char_narrow,
257                        ':' => chars[i] = nb_char,
258                        '»' => {
259                            if current == ' ' {
260                                // Assumne that if it isn't a normal space it
261                                // was used here for good reason, don't replace it
262                                chars[i] = nb_char;
263                            }
264                        }
265                        _ => (),
266                    }
267                } else {
268                    match current {
269                        // handle nb space after char
270                        '—' | '«' | '-' | '–' => {
271                            if is_whitespace(next) {
272                                let replacing_char = match current {
273                                    '—' | '-' | '–' => {
274                                        if i <= 1 {
275                                            nb_char_em
276                                        } else if chars[i - 1] == nb_char {
277                                            // non breaking space before, so probably
278                                            // should have a breakable one after
279                                            ' '
280                                        } else {
281                                            if let Some(closing) =
282                                                   self.find_closing_dash(&chars, i + 1) {
283                                                chars[closing] = nb_char;
284                                            }
285                                            nb_char
286                                        }
287                                    }
288                                    '«' => {
289                                        let j = find_next(&chars, '»', i);
290                                        if let Some(j) = j {
291                                            if chars[j - 1].is_whitespace() {
292                                                if i <= 1 ||
293                                                    j - i > self.threshold_quote {
294                                                        // Either '«' was at the beginning
295                                                        // => assume it is a dialogue
296                                                        // or it's a quote
297                                                        // => 'large' space too
298                                                        chars[j - 1] = nb_char;
299                                                        nb_char
300                                                    } else {
301                                                        // Not long enough to be a quote,
302                                                        // use narrow nb char
303                                                        chars[j - 1] = nb_char_narrow;
304                                                        nb_char_narrow
305                                                    }
306                                            } else {
307                                                // wtf formatting?
308                                                nb_char
309                                            }
310                                        } else {
311                                            // No ending quote found, assume is a dialogue
312                                            nb_char
313                                        }
314                                    }, // TODO: better heuristic: use narrow nb_char if not at front?
315                                    _ => unreachable!(),
316                                };
317                                chars[i + 1] = replacing_char;
318                        }
319                        }
320                        _ => (),
321                    }
322                }
323            }
324        }
325        Cow::Owned(chars.into_iter().collect())
326    }
327
328    /// (Try to) Format a string according to french typographic rules, escape the characters
329    /// that need to be escaped in LaTeX (e.g. backslashes) and use TeX commands ("~", "\enspace" "and "\,")
330    /// for non-breaking spaces so it works correctly with some LaTeX versions (and it makes
331    /// the non-breaking spaces shenanigans more visible with most editors)
332    ///
333    /// # Example
334    ///
335    /// ```
336    /// use crowbook_text_processing::FrenchFormatter;
337    /// let f = FrenchFormatter::new();
338    /// let s = f.format_tex("« Est-ce bien formaté ? »");
339    /// assert_eq!(&s, "«~Est-ce bien formaté\\,?~»");
340    /// ```
341    pub fn format_tex<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
342        escape::nb_spaces_tex(escape::tex(self.format(input)))
343    }
344
345    /// (Try to) Format a string according to french typographic rules, and escape the characters
346    /// that need to be escaped in HTML (e.g. &). Also use HTML commands instead
347    /// of unicode for narrow non-breaking spaces. See `escape::nb_spaces_html`. It's a bit of a hack
348    /// to make it work in most browsers/ereaders.
349    pub fn format_html<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
350        escape::nb_spaces_html(escape::html(self.format(input)))
351    }
352
353
354    /// Return true if the character is a symbol that is used after number
355    /// and should have a nb_char before
356    fn char_is_symbol(&self, v: &[char], i: usize) -> bool {
357        let is_next_letter = if i < v.len() - 1 {
358            v[i + 1].is_alphabetic()
359        } else {
360            false
361        };
362        if is_next_letter {
363            match v[i] {
364                '°' => true,
365                c if c.is_uppercase() => {
366                    let word = get_next_word(v, i);
367                    if word.len() > self.threshold_currency {
368                        // not a currency
369                        false
370                    } else {
371                        // if all uppercase and less than THRESHOLD,
372                        // assume it's a currency or a unit
373                        word.iter().all(|c| c.is_uppercase())
374                    }
375                }
376                c if c.is_alphabetic() => {
377                    let word = get_next_word(v, i);
378                    // if two letters, assume it is a unit
379                    word.len() <= self.threshold_unit
380                }
381                _ => false,
382            }
383        } else {
384            match v[i] {
385                c if (!c.is_alphabetic() && !c.is_whitespace()) => true, // special symbol
386                c if c.is_uppercase() => true, //single uppercase letter
387                _ => false,
388            }
389        }
390    }
391
392    // Return Some(pos) if a closing dash was found before what looks
393    // like the end of a sentence, None else
394    fn find_closing_dash(&self, v: &[char], n: usize) -> Option<usize> {
395        let mut word = String::new();
396        for j in n..v.len() {
397            match v[j] {
398                '!' | '?' => {
399                    if is_next_char_uppercase(v, j + 1) {
400                        return None;
401                    }
402                }
403                '-' | '–' | '—' => {
404                    if v[j - 1].is_whitespace() {
405                        return Some(j - 1);
406                    }
407                }
408                '.' => {
409                    if !is_next_char_uppercase(v, j + 1) {
410                        continue;
411                    } else if let Some(c) = word.chars().next() {
412                        if !c.is_uppercase() || word.len() > self.threshold_real_word {
413                            return None;
414                        }
415                    }
416                }
417                c if c.is_whitespace() => word = String::new(),
418                c => word.push(c),
419            }
420        }
421        None
422    }
423}
424
/// Tells whether `c` is one of the punctuation marks whose surrounding
/// spaces may have to be adjusted: `?`, `!`, `;`, `:`, `»`, `«`, `—` or `–`.
fn is_trouble(c: char) -> bool {
    const TROUBLE: [char; 8] = ['?', '!', ';', ':', '»', '«', '—', '–'];
    TROUBLE.contains(&c)
}
431
432
433
// Return the index of the first occurrence of `c` in slice `v`, searching
// from index `n` (included); `None` if there is no such occurrence.
fn find_next(v: &[char], c: char, n: usize) -> Option<usize> {
    v.iter()
        .skip(n)
        .position(|&candidate| candidate == c)
        // `position` counts from the first non-skipped element
        .map(|offset| offset + n)
}
445
// Look at the first cased character of `v` from index `n` (included):
// return true if it is uppercase, false if it is lowercase or if there is
// none. Whitespace and other caseless characters (digits, punctuation) are
// skipped.
fn is_next_char_uppercase(v: &[char], n: usize) -> bool {
    v.iter()
        .skip(n)
        .find(|c| c.is_uppercase() || c.is_lowercase())
        .map_or(false, |c| c.is_uppercase())
}
461
462
/// Returns the next word in `v` starting from index `n`.
///
/// The word begins at the first alphabetic character found from index `n`
/// (or at `n` itself if there is none) and runs up to, but not including,
/// the next whitespace character, or to the end of `v`.
///
/// An `n` past the end of `v` yields an empty slice.
fn get_next_word(v: &[char], n: usize) -> &[char] {
    // Clamp so that the final slicing can never start past the end
    let n = n.min(v.len());

    let beginning = v.iter()
        .skip(n)
        .position(|c| c.is_alphabetic())
        .map_or(n, |offset| n + offset);

    // The slice range is exclusive, so the whitespace index itself is the end:
    // the last letter of the word must be kept.
    let end = v.iter()
        .skip(beginning)
        .position(|c| c.is_whitespace())
        .map_or(v.len(), |offset| beginning + offset);

    &v[beginning..end]
}
488
489
#[cfg(test)]
#[test]
fn french() {
    let s = "  «  Comment allez-vous ? » demanda-t-elle à son   \
             interlocutrice  qui lui répondit  \
             : « Mais très bien ma chère  !  »";
    let res = FrenchFormatter::new().format(s);
    // The expected non-breaking spaces are spelled out through the named
    // constants: written literally they are indistinguishable from regular
    // spaces in the source. Both quotes are treated as dialog (the first one
    // opens the paragraph, the second one is longer than the quote threshold).
    let expected = format!(" «{nb}Comment allez-vous{narrow}?{nb}» demanda-t-elle à son \
                            interlocutrice qui lui répondit{nb}: \
                            «{nb}Mais très bien ma chère{narrow}!{nb}»",
                           nb = NB_CHAR,
                           narrow = NB_CHAR_NARROW);
    assert_eq!(&*res, expected.as_str());
}
502
#[test]
fn french_quotes_1() {
    // Opening and closing guillemets: a `~` is expected inside both of them.
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("« Un test »");
    assert_eq!(&formatted, "«~Un test~»");
}
509
#[test]
fn french_quotes_2() {
    // Opening guillemet without a closing one.
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("« Un test");
    assert_eq!(&formatted, "«~Un test");
}
516
#[test]
fn french_quotes_3() {
    // Closing guillemet without an opening one.
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("Un test »");
    assert_eq!(&formatted, "Un test~»");
}
523
#[test]
fn french_quotes_4() {
    let s = "test « court »";
    let res = FrenchFormatter::new().format(s);
    // A short quote in the middle of the text gets narrow non-breaking
    // spaces; they are spelled out through the named constant because they
    // cannot be told apart from regular spaces when written literally.
    let expected = format!("test «{narrow}court{narrow}»", narrow = NB_CHAR_NARROW);
    assert_eq!(&*res, expected.as_str());
}
530
#[test]
fn french_quotes_5() {
    let s = "test « beaucoup, beaucoup plus long »";
    let res = FrenchFormatter::new().format(s);
    // A quote longer than the default quote threshold gets regular
    // non-breaking spaces; they are spelled out through the named constant
    // because they cannot be told apart from regular spaces when written
    // literally.
    let expected = format!("test «{nb}beaucoup, beaucoup plus long{nb}»", nb = NB_CHAR);
    assert_eq!(&*res, expected.as_str());
}
537
#[test]
fn french_dashes_1() {
    // A pair of dashes enclosing an aside: `~` is expected after the opening
    // dash and before the closing one.
    let input = "Il faudrait gérer ces tirets – sans ça certains textes rendent mal – un jour ou \
                 l'autre";
    let expected = "Il faudrait gérer ces tirets –~sans ça certains textes \
                    rendent mal~– un jour ou l’autre";
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex(input);
    assert_eq!(&formatted, expected);
}
547
#[test]
fn french_dashes_2() {
    // Two dashes separated by the end of a sentence: `~` is expected after
    // each of them and before neither.
    let input = "Il faudrait gérer ces tirets – sans ça certains textes rendent mal. Mais ce n'est \
                 pas si simple – si ?";
    let expected = "Il faudrait gérer ces tirets –~sans ça certains textes rendent mal. Mais ce \
                    n’est pas si simple –~si\\,?";
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex(input);
    assert_eq!(&formatted, expected);
}
557
#[test]
fn french_numbers() {
    // (input, expected TeX output) pairs, checked in order
    let cases = [("10 000", "10\\,000"),
                 ("10 000 €", "10\\,000\\,€"),
                 ("10 000 euros", "10\\,000 euros"),
                 ("10 000 EUR", "10\\,000\\,EUR"),
                 ("50 km", "50\\,km"),
                 ("50 %", "50\\,\\%"),
                 ("20 °C", "20\\,°C"),
                 ("20 F", "20\\,F"),
                 ("20 BALLES", "20 BALLES")];

    let formatter = FrenchFormatter::new();
    for &(input, expected) in cases.iter() {
        let formatted = formatter.format_tex(Cow::Borrowed(input));
        assert_eq!(&formatted, expected);
    }
}
597}