crowbook_text_processing/french.rs
1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with
3// this file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5use std::borrow::Cow;
6use std::default::Default;
7
8use crate::common::{NB_CHAR, NB_CHAR_NARROW, NB_CHAR_EM};
9use crate::common::is_whitespace;
10use crate::clean;
11use crate::escape;
12
13
14
/// French typographic formatter.
///
/// The purpose of this struct is to try to make a text more typographically correct,
/// according to French typographic rules. This means:
///
/// * making spaces before `?`, `!`, `;` narrow non-breaking spaces;
/// * making spaces before `:` non-breaking spaces;
/// * making the space after `—` for dialog a demi em space;
/// * making spaces after `«` and before `»` non-breaking spaces or narrow non-breaking
///   spaces, according to the circumstances (dialog or a few quoted words);
/// * making spaces in numbers, e.g. `80 000` or `50 €`, narrow and non-breaking.
///
/// Additionally, this formatter uses functions that are "generic" (not specific to the
/// French language) in order to:
///
/// * replace straight quotes (`'` and `"`) with curly, typographic ones;
/// * replace ellipsis (`...`) with the unicode character (`…`).
///
/// As some of these features require a bit of guessing sometimes, there are some parameters
/// that can be set if you want better results.
///
/// # Example
///
/// ```
/// use crowbook_text_processing::FrenchFormatter;
/// let input = "Un texte à 'formater', n'est-ce pas ?";
/// let output = FrenchFormatter::new()
///     .typographic_ellipsis(false) // don't replace ellipsis
///     .format_tex(input); // format to tex (so non-breaking
///                         // spaces are visible in assert_eq!)
/// assert_eq!(&output, "Un texte à ‘formater’, n’est-ce pas\\,?");
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FrenchFormatter {
    /// After that number of characters, assume it's not a currency
    threshold_currency: usize,
    /// After that number of characters, assume it's not a unit
    threshold_unit: usize,
    /// After that number of characters, assume it is a dialog
    threshold_quote: usize,
    /// After that number of characters, assume it isn't an abbreviation
    threshold_real_word: usize,
    /// Enable typographic quotes (and apostrophes) replacement
    typographic_quotes: bool,
    /// Enable typographic ellipsis replacement
    typographic_ellipsis: bool,
    /// Enable dashes replacement
    ligature_dashes: bool,
    /// Enable guillemets replacement
    ligature_guillemets: bool,
}
66
67impl Default for FrenchFormatter {
68 fn default() -> Self {
69 FrenchFormatter {
70 threshold_currency: 3,
71 threshold_unit: 2,
72 threshold_quote: 20,
73 threshold_real_word: 3,
74 typographic_quotes: true,
75 typographic_ellipsis: true,
76 ligature_dashes: false,
77 ligature_guillemets: false,
78 }
79 }
80}
81
82impl FrenchFormatter {
83 /// Create a new FrenchFormatter with default settings
84 pub fn new() -> Self {
85 Self::default()
86 }
87
88 /// Sets the threshold currency.
89 ///
90 /// After that number of characters, assume it's not a currency
91 ///
92 /// Default is `3`.
93 pub fn threshold_currency(&mut self, t: usize) -> &mut Self {
94 self.threshold_currency = t;
95 self
96 }
97
98 /// Sets the threshold for unit.
99 ///
100 /// After that number of characters, assume it's not an unit.
101 ///
102 /// Default is `2`.
103 pub fn threshold_unit(&mut self, t: usize) -> &mut Self {
104 self.threshold_unit = t;
105 self
106 }
107
108 /// Sets the threshold for quote.
109 ///
110 /// After that number of characters, assume it's not a quote of a single
111 /// word or a few words, but a dialog.
112 ///
113 /// Default is `20`.
114 pub fn threshold_quote(&mut self, t: usize) -> &mut Self {
115 self.threshold_quote = t;
116 self
117 }
118
119 /// Sets the threshold for real word.
120 ///
121 /// After that number of characters, assume it's not an abbreviation
122 /// but a real word (used to determine if `.` marks the end of a sentence
123 /// or just a title such as `M. Dupuis`.
124 ///
125 /// Default is `3`
126 pub fn threshold_real_word(&mut self, t: usize) -> &mut Self {
127 self.threshold_real_word = t;
128 self
129 }
130
131 /// Enables the typographic quotes replacement.
132 ///
133 /// If true, "L'" will be replaced by "L’"
134 ///
135 /// Default is true
136 pub fn typographic_quotes(&mut self, b: bool) -> &mut Self {
137 self.typographic_quotes = b;
138 self
139 }
140
141 /// Enables typographic ellipsis replacement.
142 ///
143 /// If true, "..." will be replaced by "…"
144 ///
145 /// Default is true
146 pub fn typographic_ellipsis(&mut self, b: bool) -> &mut Self {
147 self.typographic_ellipsis = b;
148 self
149 }
150
151 /// If set to true, replaces `--`to `–` and `---` to `—`.
152 ///
153 /// Default is false.
154 pub fn ligature_dashes(&mut self, b: bool) -> &mut Self {
155 self.ligature_dashes = b;
156 self
157 }
158
159 /// If set to true, replaces `<<` to `«` and `>>` to `»`.
160 ///
161 /// Default is false.
162 pub fn ligature_guillemets(&mut self, b: bool) -> &mut Self {
163 self.ligature_guillemets = b;
164 self
165 }
166
167 /// (Try to) Format a string according to french typographic rules.
168 ///
169 /// This method should be called for each paragraph, as it makes some suppositions that
170 /// the beginning of the string also means the beginning of a line.
171 ///
172 /// This method calls `remove_whitespaces` internally, as it relies on it.
173 ///
174 /// # Example
175 ///
176 /// ```
177 /// use crowbook_text_processing::FrenchFormatter;
178 /// let f = FrenchFormatter::new();
179 /// let s = f.format("« Est-ce bien formaté ? » se demandait-elle — les espaces \
180 /// insécables étaient tellement compliquées à gérer,
181 /// dans cette langue !");
182 /// println!("{}", s);
183 /// ```
184 pub fn format<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
185 let mut input = clean::whitespaces(input); // first pass to remove whitespaces
186
187 if self.ligature_dashes {
188 input = clean::dashes(input);
189 }
190
191 if self.ligature_guillemets {
192 input = clean::guillemets(input);
193 }
194
195 if self.typographic_quotes {
196 input = clean::quotes(input);
197 }
198
199 if self.typographic_ellipsis {
200 input = clean::ellipsis(input);
201 }
202
203 // Find first characters that are trouble
204 let first = input.chars().position(is_trouble);
205 let first_number = input.chars().position(|c| c.is_digit(10));
206
207 // No need to do anything, return early
208 if first.is_none() && first_number.is_none() {
209 return input;
210 }
211
212 let (nb_char, nb_char_em, nb_char_narrow) = (NB_CHAR, NB_CHAR_EM, NB_CHAR_NARROW);
213
214 let mut chars = input.chars().collect::<Vec<_>>();
215 let mut is_number_series = false;
216
217 // Handle numbers
218 if let Some(first) = first_number {
219 // Go back one step
220 let first = if first > 1 { first - 1 } else { 0 };
221 for i in first..(chars.len() - 1) {
222 // Handle numbers (that's easy)
223 let current = chars[i];
224 let next = chars[i + 1];
225
226 match current {
227 '0'..='9' => {
228 if i == 0 || !chars[i - 1].is_alphabetic() {
229 is_number_series = true;
230 }
231 }
232 c if c.is_whitespace() => {
233 if is_number_series &&
234 (next.is_digit(10) || self.char_is_symbol(&chars, i + 1)) {
235 // Next char is a number or symbol such as $, and previous was number
236 chars[i] = nb_char_narrow;
237 }
238 }
239 _ => {
240 is_number_series = false;
241 }
242 }
243 }
244 }
245
246 // Handle the rest
247 if let Some(first) = first {
248 // Go back one step
249 let first = if first > 1 { first - 1 } else { 0 };
250 for i in first..(chars.len() - 1) {
251 let current = chars[i];
252 let next = chars[i + 1];
253 if is_whitespace(current) {
254 match next {
255 // handle narrow nb space before char
256 '?' | '!' | ';' => chars[i] = nb_char_narrow,
257 ':' => chars[i] = nb_char,
258 '»' => {
259 if current == ' ' {
260 // Assumne that if it isn't a normal space it
261 // was used here for good reason, don't replace it
262 chars[i] = nb_char;
263 }
264 }
265 _ => (),
266 }
267 } else {
268 match current {
269 // handle nb space after char
270 '—' | '«' | '-' | '–' => {
271 if is_whitespace(next) {
272 let replacing_char = match current {
273 '—' | '-' | '–' => {
274 if i <= 1 {
275 nb_char_em
276 } else if chars[i - 1] == nb_char {
277 // non breaking space before, so probably
278 // should have a breakable one after
279 ' '
280 } else {
281 if let Some(closing) =
282 self.find_closing_dash(&chars, i + 1) {
283 chars[closing] = nb_char;
284 }
285 nb_char
286 }
287 }
288 '«' => {
289 let j = find_next(&chars, '»', i);
290 if let Some(j) = j {
291 if chars[j - 1].is_whitespace() {
292 if i <= 1 ||
293 j - i > self.threshold_quote {
294 // Either '«' was at the beginning
295 // => assume it is a dialogue
296 // or it's a quote
297 // => 'large' space too
298 chars[j - 1] = nb_char;
299 nb_char
300 } else {
301 // Not long enough to be a quote,
302 // use narrow nb char
303 chars[j - 1] = nb_char_narrow;
304 nb_char_narrow
305 }
306 } else {
307 // wtf formatting?
308 nb_char
309 }
310 } else {
311 // No ending quote found, assume is a dialogue
312 nb_char
313 }
314 }, // TODO: better heuristic: use narrow nb_char if not at front?
315 _ => unreachable!(),
316 };
317 chars[i + 1] = replacing_char;
318 }
319 }
320 _ => (),
321 }
322 }
323 }
324 }
325 Cow::Owned(chars.into_iter().collect())
326 }
327
328 /// (Try to) Format a string according to french typographic rules, escape the characters
329 /// that need to be escaped in LaTeX (e.g. backslashes) and use TeX commands ("~", "\enspace" "and "\,")
330 /// for non-breaking spaces so it works correctly with some LaTeX versions (and it makes
331 /// the non-breaking spaces shenanigans more visible with most editors)
332 ///
333 /// # Example
334 ///
335 /// ```
336 /// use crowbook_text_processing::FrenchFormatter;
337 /// let f = FrenchFormatter::new();
338 /// let s = f.format_tex("« Est-ce bien formaté ? »");
339 /// assert_eq!(&s, "«~Est-ce bien formaté\\,?~»");
340 /// ```
341 pub fn format_tex<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
342 escape::nb_spaces_tex(escape::tex(self.format(input)))
343 }
344
345 /// (Try to) Format a string according to french typographic rules, and escape the characters
346 /// that need to be escaped in HTML (e.g. &). Also use HTML commands instead
347 /// of unicode for narrow non-breaking spaces. See `escape::nb_spaces_html`. It's a bit of a hack
348 /// to make it work in most browsers/ereaders.
349 pub fn format_html<'a, S: Into<Cow<'a, str>>>(&self, input: S) -> Cow<'a, str> {
350 escape::nb_spaces_html(escape::html(self.format(input)))
351 }
352
353
354 /// Return true if the character is a symbol that is used after number
355 /// and should have a nb_char before
356 fn char_is_symbol(&self, v: &[char], i: usize) -> bool {
357 let is_next_letter = if i < v.len() - 1 {
358 v[i + 1].is_alphabetic()
359 } else {
360 false
361 };
362 if is_next_letter {
363 match v[i] {
364 '°' => true,
365 c if c.is_uppercase() => {
366 let word = get_next_word(v, i);
367 if word.len() > self.threshold_currency {
368 // not a currency
369 false
370 } else {
371 // if all uppercase and less than THRESHOLD,
372 // assume it's a currency or a unit
373 word.iter().all(|c| c.is_uppercase())
374 }
375 }
376 c if c.is_alphabetic() => {
377 let word = get_next_word(v, i);
378 // if two letters, assume it is a unit
379 word.len() <= self.threshold_unit
380 }
381 _ => false,
382 }
383 } else {
384 match v[i] {
385 c if (!c.is_alphabetic() && !c.is_whitespace()) => true, // special symbol
386 c if c.is_uppercase() => true, //single uppercase letter
387 _ => false,
388 }
389 }
390 }
391
392 // Return Some(pos) if a closing dash was found before what looks
393 // like the end of a sentence, None else
394 fn find_closing_dash(&self, v: &[char], n: usize) -> Option<usize> {
395 let mut word = String::new();
396 for j in n..v.len() {
397 match v[j] {
398 '!' | '?' => {
399 if is_next_char_uppercase(v, j + 1) {
400 return None;
401 }
402 }
403 '-' | '–' | '—' => {
404 if v[j - 1].is_whitespace() {
405 return Some(j - 1);
406 }
407 }
408 '.' => {
409 if !is_next_char_uppercase(v, j + 1) {
410 continue;
411 } else if let Some(c) = word.chars().next() {
412 if !c.is_uppercase() || word.len() > self.threshold_real_word {
413 return None;
414 }
415 }
416 }
417 c if c.is_whitespace() => word = String::new(),
418 c => word.push(c),
419 }
420 }
421 None
422 }
423}
424
/// Returns `true` if `c` is one of `?`, `!`, `;`, `:`, `»`, `«`, `—` or `–`.
fn is_trouble(c: char) -> bool {
    const TROUBLE: [char; 8] = ['?', '!', ';', ':', '»', '«', '—', '–'];
    TROUBLE.contains(&c)
}
431
432
433
// Find the index of the first char equal to `c` in slice `v`, searching from
// index `n` (included). Returns `None` if there is none.
fn find_next(v: &[char], c: char, n: usize) -> Option<usize> {
    v.iter()
        .skip(n)
        .position(|&candidate| candidate == c)
        .map(|offset| offset + n)
}
445
// Return true if the first cased (uppercase or lowercase) char found in `v` from
// index `n` onwards is uppercase. Whitespace and any other uncased char (digits,
// punctuation...) are skipped; if no cased char is found, return false.
fn is_next_char_uppercase(v: &[char], n: usize) -> bool {
    v.iter()
        .skip(n)
        .find(|c| c.is_uppercase() || c.is_lowercase())
        .map_or(false, |c| c.is_uppercase())
}
461
462
/// Returns the next word in `v` starting from index `n`.
///
/// The word starts at the first alphabetic char found at or after `n` (or at `n`
/// itself if there is none) and extends up to, but not including, the next
/// whitespace char (or the end of `v`).
///
/// Returns an empty slice if `n` is out of bounds or if no word can be found.
fn get_next_word(v: &[char], n: usize) -> &[char] {
    // Out-of-bounds `n` yields an empty tail instead of a panic
    let tail = v.get(n..).unwrap_or(&[]);

    // Skip everything before the first alphabetic char, if any
    let skipped = tail.iter().position(|c| c.is_alphabetic()).unwrap_or(0);
    let word = &tail[skipped..];

    // The end is exclusive: the whitespace itself is left out, but the last
    // letter of the word is kept
    let len = word
        .iter()
        .position(|c| c.is_whitespace())
        .unwrap_or(word.len());
    &word[..len]
}
488
489
#[cfg(test)]
#[test]
fn french() {
    let s = " « Comment allez-vous ? » demanda-t-elle à son \
             interlocutrice qui lui répondit \
             : « Mais très bien ma chère ! »";
    let res = FrenchFormatter::new().format(s);
    // The expected output is spelled with the named constants rather than with raw
    // non-breaking space characters, which cannot be told apart from plain spaces
    // when reading (or re-saving) this file.
    let expected = format!(
        " «{nb}Comment allez-vous{narrow}?{nb}» demanda-t-elle à son \
         interlocutrice qui lui répondit{nb}: \
         «{nb}Mais très bien ma chère{narrow}!{nb}»",
        nb = NB_CHAR,
        narrow = NB_CHAR_NARROW
    );
    assert_eq!(&*res, expected.as_str());
}
502
// A quote opening the paragraph gets regular non-breaking spaces inside the guillemets.
#[test]
fn french_quotes_1() {
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("« Un test »");
    assert_eq!(&formatted, "«~Un test~»");
}
509
// An opening guillemet without a closing one still gets a non-breaking space.
#[test]
fn french_quotes_2() {
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("« Un test");
    assert_eq!(&formatted, "«~Un test");
}
516
// A closing guillemet without an opening one still gets a non-breaking space.
#[test]
fn french_quotes_3() {
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex("Un test »");
    assert_eq!(&formatted, "Un test~»");
}
523
#[test]
fn french_quotes_4() {
    let s = "test « court »";
    let res = FrenchFormatter::new().format(s);
    // Short quote that is not at the beginning: narrow non-breaking spaces are expected.
    // They are spelled with the named constant, as the raw characters cannot be told
    // apart from plain spaces when reading this file.
    let expected = format!("test «{narrow}court{narrow}»", narrow = NB_CHAR_NARROW);
    assert_eq!(&*res, expected.as_str());
}
530
#[test]
fn french_quotes_5() {
    let s = "test « beaucoup, beaucoup plus long »";
    let res = FrenchFormatter::new().format(s);
    // Quote longer than the default quote threshold: regular non-breaking spaces are
    // expected. They are spelled with the named constant, as the raw characters cannot
    // be told apart from plain spaces when reading this file.
    let expected = format!("test «{nb}beaucoup, beaucoup plus long{nb}»", nb = NB_CHAR);
    assert_eq!(&*res, expected.as_str());
}
537
// An aside enclosed between two dashes: non-breaking space after the opening dash
// and before the closing one.
#[test]
fn french_dashes_1() {
    let input = "Il faudrait gérer ces tirets – sans ça certains textes \
                 rendent mal – un jour ou l'autre";
    let expected = "Il faudrait gérer ces tirets –~sans ça certains textes \
                    rendent mal~– un jour ou l’autre";
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex(input);
    assert_eq!(&formatted, expected);
}
547
// Dashes separated by the end of a sentence are not paired: each one only gets a
// non-breaking space after it.
#[test]
fn french_dashes_2() {
    let input = "Il faudrait gérer ces tirets – sans ça certains textes rendent mal. \
                 Mais ce n'est pas si simple – si ?";
    let expected = "Il faudrait gérer ces tirets –~sans ça certains textes rendent mal. \
                    Mais ce n’est pas si simple –~si\\,?";
    let formatter = FrenchFormatter::new();
    let formatted = formatter.format_tex(input);
    assert_eq!(&formatted, expected);
}
557
// Spaces inside numbers and before units/currencies, as rendered by `format_tex`.
#[test]
fn french_numbers() {
    let french = FrenchFormatter::new();

    // (input, expected TeX output)
    let cases = [
        ("10 000", "10\\,000"),
        ("10 000 €", "10\\,000\\,€"),
        ("10 000 euros", "10\\,000 euros"),
        ("10 000 EUR", "10\\,000\\,EUR"),
        ("50 km", "50\\,km"),
        ("50 %", "50\\,\\%"),
        ("20 °C", "20\\,°C"),
        ("20 F", "20\\,F"),
        ("20 BALLES", "20 BALLES"),
    ];

    for &(input, expected) in cases.iter() {
        let res = french.format_tex(Cow::Borrowed(input));
        assert_eq!(&res, expected);
    }
}