1use crate::cleaner::clean;
6use crate::data::MOTS_OSSE;
7use crate::parser::parse;
8use crate::phoneme::{classify, PhonClass};
9
/// One phoneme of a word together with the letters that spell it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodedPhoneme {
    /// Phoneme code (for example `"x"`, `"o"`, `"q_caduc"` or `"#"`).
    pub code: String,
    /// Letters of the word covered by this phoneme.
    pub letters: String,
}
16
/// Selects how the final syllable of a word is treated by `assemble_syllables`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SyllableMode {
    /// Syllables are returned as assembled, with no final merge.
    Written,
    /// A final syllable whose last phoneme (ignoring trailing `#` / `verb_3p`)
    /// ends with `q_caduc` is merged into the previous syllable.
    Oral,
}
25
/// Selects how `assemble_syllables` prepares the phoneme list before grouping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssembleMode {
    /// Phonemes are used as given, without splitting doubled letters.
    ///
    /// Deprecated: the note (in French) says this mode is not aligned with
    /// LireCouleur 6 and recommends `AssembleMode::Std` instead.
    #[deprecated(
        since = "0.4.0",
        note = "non aligné avec LireCouleur 6 v6 ; préférer AssembleMode::Std"
    )]
    Lc,
    /// A consonant or semi-consonant phoneme whose letters end with a doubled
    /// character is split in two before syllables are assembled.
    Std,
}
45
/// Working unit of syllable assembly: one or more phonemes handled as a group.
#[derive(Debug, Clone)]
struct SylPh {
    /// Class used to decide where syllables start and end.
    class: PhonClass,
    /// Positions, in the working phoneme list, of the phonemes in this group.
    indices: Vec<usize>,
}
52
/// Returns the positions `i <= limit` at which `codes[i]` is one of `values`.
///
/// Positions are returned in increasing order; `limit` is inclusive and may
/// exceed the length of `codes`.
fn indices_of(codes: &[String], values: &[&str], limit: usize) -> Vec<usize> {
    codes
        .iter()
        .enumerate()
        .take_while(|&(pos, _)| pos <= limit)
        .filter(|&(_, code)| values.contains(&code.as_str()))
        .map(|(pos, _)| pos)
        .collect()
}
66
67pub fn post_process_e(pp: &mut [DecodedPhoneme]) {
70 if pp.len() <= 1 {
71 return;
72 }
73 let codes: Vec<String> = pp.iter().map(|p| p.code.clone()).collect();
74 if !codes.iter().any(|c| c == "x") {
75 return;
76 }
77
78 let mut nb_ph = codes.len() - 1;
80 while nb_ph >= 1 && codes[nb_ph] == "#" {
81 nb_ph -= 1;
82 }
83
84 let i_x = indices_of(&codes, &["x"], nb_ph);
85 if i_x.is_empty() {
86 return;
87 }
88 let i_ph = *i_x.last().unwrap();
89
90 if i_ph + 2 < nb_ph {
92 return;
93 }
94
95 if i_ph == nb_ph {
96 pp[i_ph].code = "x^".to_string();
98 return;
99 }
100
101 let consonnes_eu_ferme = ["z", "z_s", "t"];
102 if consonnes_eu_ferme.contains(&codes[i_ph + 1].as_str()) && codes[nb_ph] == "q_caduc" {
103 pp[i_ph].code = "x^".to_string();
104 }
105}
106
107pub fn post_process_o(pp: &mut [DecodedPhoneme]) {
109 if pp.len() <= 1 {
110 return;
111 }
112 let codes: Vec<String> = pp.iter().map(|p| p.code.clone()).collect();
113 if !codes.iter().any(|c| c == "o") {
114 return;
115 }
116
117 let consonnes_syllabe_fermee = [
118 "p", "k", "b", "d", "g", "f", "f_ph", "s^", "l", "r", "m", "n",
119 ];
120
121 let mut nb_ph = codes.len() - 1;
122 while nb_ph > 0 && codes[nb_ph] == "#" {
123 nb_ph -= 1;
124 }
125
126 let i_o = indices_of(&codes, &["o"], nb_ph);
127
128 let mot: String = pp[..=nb_ph].iter().map(|p| p.letters.as_str()).collect();
130
131 if MOTS_OSSE.binary_search(&mot.as_str()).is_ok() {
132 if let Some(&last_o) = i_o.last() {
133 pp[last_o].code = "o_ouvert".to_string();
134 }
135 return;
136 }
137
138 let consonnes = [
139 "p", "t", "k", "b", "d", "g", "f", "f_ph", "s", "s^", "v", "z", "z^", "l", "r", "m", "n",
140 "k_qu", "z^_g", "g_u", "s_c", "s_t", "z_s", "ks", "gz",
141 ];
142
143 for &i_ph in &i_o {
144 if i_ph == nb_ph {
145 return; }
147 if pp[i_ph].letters != "ô" {
148 let next = codes.get(i_ph + 1).map(String::as_str).unwrap_or("");
149 let next2 = codes.get(i_ph + 2).map(String::as_str).unwrap_or("");
150
151 if (i_ph + 2 == nb_ph && consonnes_syllabe_fermee.contains(&next) && next2 == "q_caduc")
152 || ["r", "z^_g", "v"].contains(&next)
153 || (i_ph + 2 < nb_ph && consonnes.contains(&next) && consonnes.contains(&next2))
154 {
155 pp[i_ph].code = "o_ouvert".to_string();
156 }
157 }
158 }
159}
160
161pub fn post_process_w(pp: &mut [DecodedPhoneme]) {
163 if pp.len() <= 1 {
164 return;
165 }
166
167 let _ = pp;
171}
172
173pub fn post_process_yod(pp: &mut [DecodedPhoneme], _mode: SyllableMode) {
175 if pp.len() <= 1 {
176 return;
177 }
178 let phon_suivant = [
179 "a", "a~", "e", "e^", "e_comp", "e^_comp", "o", "o_comp", "o~", "e~", "x", "x^", "u",
180 ];
181
182 for i in 0..pp.len() - 1 {
183 if pp[i].code == "i" && phon_suivant.contains(&pp[i + 1].code.as_str()) {
184 pp[i].code = "j".to_string();
185 }
186 }
187}
188
189pub fn assemble_syllables(
193 phonemes: &[DecodedPhoneme],
194 assemble_mode: AssembleMode,
195 syl_mode: SyllableMode,
196) -> (Vec<Vec<usize>>, Vec<DecodedPhoneme>) {
197 let nb_phon = phonemes.len();
198 if nb_phon < 2 {
199 return (vec![(0..nb_phon).collect()], phonemes.to_vec());
200 }
201
202 let mut nphonemes: Vec<DecodedPhoneme> = Vec::with_capacity(nb_phon);
206 if assemble_mode == AssembleMode::Std {
207 for ph in phonemes {
208 let c = classify(&ph.code);
209 let is_semi_consonne =
210 ph.code.starts_with("j_") || ph.code.starts_with("w_") || ph.code.starts_with("y_");
211 let eligible = c == PhonClass::Consonant || is_semi_consonne;
212 if eligible && ph.letters.chars().count() > 1 {
213 let chars: Vec<char> = ph.letters.chars().collect();
214 let n = chars.len();
215 if chars[n - 1] == chars[n - 2] {
216 let prefix: String = chars[..n - 1].iter().collect();
217 let last: String = chars[n - 1..].iter().collect();
218 nphonemes.push(DecodedPhoneme {
219 code: ph.code.clone(),
220 letters: prefix,
221 });
222 nphonemes.push(DecodedPhoneme {
223 code: ph.code.clone(),
224 letters: last,
225 });
226 } else {
227 nphonemes.push(ph.clone());
228 }
229 } else {
230 nphonemes.push(ph.clone());
231 }
232 }
233 } else {
234 nphonemes = phonemes.to_vec();
235 }
236
237 let nb_phon = nphonemes.len();
238
239 let mut sylph: Vec<SylPh> = Vec::with_capacity(nb_phon);
245 for (i, ph) in nphonemes.iter().enumerate() {
246 if ph.code.is_empty() {
247 continue;
250 }
251 let class = if ph.code.starts_with("j_")
252 || ph.code.starts_with("w_")
253 || ph.code.starts_with("y_")
254 {
255 PhonClass::Vowel
256 } else {
257 classify(&ph.code)
258 };
259 sylph.push(SylPh {
260 class,
261 indices: vec![i],
262 });
263 }
264
265 let attaque_premiere = ["b", "k", "p", "t", "g", "d", "f", "v"];
267 let mut i = 0;
268 while i + 1 < sylph.len() {
269 if sylph[i].class == PhonClass::Consonant && sylph[i + 1].class == PhonClass::Consonant {
270 let phon0 = &nphonemes[sylph[i].indices[0]].code;
271 let phon1 = &nphonemes[sylph[i + 1].indices[0]].code;
272 if (phon1 == "l" || phon1 == "r") && attaque_premiere.contains(&phon0.as_str()) {
273 let indices1 = sylph[i + 1].indices.clone();
274 sylph[i].indices.extend(indices1);
275 sylph.remove(i + 1);
276 continue;
278 }
279 }
280 i += 1;
281 }
282
283 let mut i = 0;
285 while i + 1 < sylph.len() {
286 if sylph[i].class == PhonClass::Vowel && sylph[i + 1].class == PhonClass::Vowel {
287 let phon1 = nphonemes[sylph[i].indices[0]].code.clone();
288 let phon2 = nphonemes[sylph[i + 1].indices[0]].code.clone();
289 let merge = (phon1 == "y" && phon2 == "i")
290 || (phon1 == "u" && (phon2 == "i" || phon2 == "e~" || phon2 == "o~"));
291 if merge {
292 let indices1 = sylph[i + 1].indices.clone();
293 sylph[i].indices.extend(indices1);
294 sylph.remove(i + 1);
295 continue;
296 }
297 }
298 i += 1;
299 }
300
301 let mut i = 0;
303 while i + 1 < sylph.len() {
304 if sylph[i + 1].class == PhonClass::Silent {
305 let indices1 = sylph[i + 1].indices.clone();
306 sylph[i].indices.extend(indices1);
307 sylph.remove(i + 1);
308 continue;
309 }
310 i += 1;
311 }
312
313 let mut sylls: Vec<Vec<usize>> = Vec::new();
315 let nb_sylph = sylph.len();
316 let mut i = 0;
317 let mut j = 0usize;
318 while i < nb_sylph {
319 j = i;
320 while i < nb_sylph && sylph[i].class != PhonClass::Vowel {
322 i += 1;
323 }
324 if i < nb_sylph && sylph[i].class == PhonClass::Vowel {
326 i += 1;
327 let mut cur_syl: Vec<usize> = Vec::new();
328 for sp in &sylph[j..i] {
329 cur_syl.extend(sp.indices.iter().copied());
330 }
331 j = i;
332 sylls.push(cur_syl);
333 }
334
335 if i + 1 < nb_sylph {
339 let phon_i_idx = *sylph[i].indices.last().unwrap();
340 let phon_i1_idx = sylph[i + 1].indices[0];
341 let last_letter = nphonemes[phon_i_idx].letters.chars().last().unwrap_or(' ');
342 let first_letter = nphonemes[phon_i1_idx].letters.chars().next().unwrap_or(' ');
343 let consonnes = "bcdfghjklmnpqrstvwxzç";
344 if consonnes.contains(last_letter) && consonnes.contains(first_letter) {
345 if let Some(last) = sylls.last_mut() {
346 last.extend(sylph[i].indices.iter().copied());
347 }
348 i += 1;
349 j = i;
350 }
351 }
352 }
353
354 if sylls.is_empty() {
355 return (vec![(0..nb_phon).collect()], nphonemes);
356 }
357
358 for sp in &sylph[j..nb_sylph] {
362 sylls.last_mut().unwrap().extend(sp.indices.iter().copied());
363 }
364
365 if syl_mode == SyllableMode::Oral && sylls.len() > 1 {
367 let last = sylls.last().unwrap();
368 let mut k = last.len() as isize - 1;
369 while k > 0 {
370 let code = &nphonemes[last[k as usize]].code;
371 if code != "#" && code != "verb_3p" {
372 break;
373 }
374 k -= 1;
375 }
376 if k >= 0 && nphonemes[last[k as usize]].code.ends_with("q_caduc") {
377 let last_syl = sylls.pop().unwrap();
378 sylls.last_mut().unwrap().extend(last_syl);
379 }
380 }
381
382 (sylls, nphonemes)
383}
384
385pub fn extract_phonemes_word(
387 word: &str,
388 novice_reader: bool,
389 mode: SyllableMode,
390) -> Vec<DecodedPhoneme> {
391 let lower: String = word.chars().flat_map(|c| c.to_lowercase()).collect();
394 let raw_phons = parse(&lower);
395 let chars_orig: Vec<char> = word.chars().collect();
396 let chars_lower: Vec<char> = lower.chars().collect();
399 let mut cursor = 0usize;
400 let mut out: Vec<DecodedPhoneme> = Vec::with_capacity(raw_phons.len());
401 for ph in raw_phons {
402 let end = (cursor + ph.step).min(chars_lower.len());
403 let orig_end = end.min(chars_orig.len());
405 let letters: String = chars_orig[cursor.min(chars_orig.len())..orig_end]
406 .iter()
407 .collect();
408 out.push(DecodedPhoneme {
409 code: ph.code,
410 letters,
411 });
412 cursor = end;
413 }
414
415 post_process_e(&mut out);
416 if !novice_reader {
417 post_process_w(&mut out);
418 post_process_yod(&mut out, mode);
419 post_process_o(&mut out);
420 }
421 out
422}
423
/// One piece of the output of `extract_syllables`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextChunk {
    /// A word, given as the letters of each of its syllables in order.
    Word(Vec<String>),
    /// Text found between words, copied from the input unchanged.
    Raw(String),
}
434
435pub fn extract_syllables(
436 text: &str,
437 novice_reader: bool,
438 assemble_mode: AssembleMode,
439 syl_mode: SyllableMode,
440) -> Vec<TextChunk> {
441 let ultext = clean(text, ' ');
442 let words: Vec<&str> = ultext.split_whitespace().collect();
443 let mut out: Vec<TextChunk> = Vec::new();
444 let mut p_text = 0usize;
445
446 let text_chars: Vec<char> = text.chars().collect();
447 let ultext_chars: Vec<char> = ultext.chars().collect();
448
449 let mut previous_word: Option<String> = None;
451
452 for word in &words {
453 let wlen = word.chars().count();
454 let word_chars: Vec<char> = word.chars().collect();
455 let pp_text = match find_subseq(&ultext_chars, &word_chars, p_text) {
456 Some(p) => p,
457 None => continue,
458 };
459
460 if pp_text > p_text {
461 let raw: String = text_chars[p_text..pp_text].iter().collect();
462 out.push(TextChunk::Raw(raw));
463 }
464
465 let original_word: String = text_chars[pp_text..pp_text + wlen].iter().collect();
466 let lower_word = original_word.to_lowercase();
467
468 let phonemes: Vec<DecodedPhoneme> =
470 match crate::homographs::lookup(&lower_word, previous_word.as_deref()) {
471 Some(coded) => coded
472 .into_iter()
473 .map(|(code, letters)| DecodedPhoneme { code, letters })
474 .collect(),
475 None => extract_phonemes_word(&original_word, novice_reader, syl_mode),
476 };
477 let (sylls, nphons) = assemble_syllables(&phonemes, assemble_mode, syl_mode);
478
479 let sylls_strings: Vec<String> = sylls
480 .iter()
481 .map(|syl| {
482 syl.iter()
483 .map(|&i| nphons[i].letters.clone())
484 .collect::<String>()
485 })
486 .collect();
487
488 out.push(TextChunk::Word(sylls_strings));
489 p_text = pp_text + wlen;
490
491 previous_word = Some(lower_word.replace('\u{2019}', "'"));
493 }
494
495 if p_text < text_chars.len() {
496 let raw: String = text_chars[p_text..].iter().collect();
497 out.push(TextChunk::Raw(raw));
498 }
499
500 out
501}
502
/// Returns the first position `>= start` at which `needle` occurs in
/// `haystack`.
///
/// Returns `None` when `needle` is empty, when `start` is at or past the end
/// of `haystack`, or when there is no occurrence.
fn find_subseq(haystack: &[char], needle: &[char], start: usize) -> Option<usize> {
    if needle.is_empty() {
        return None;
    }
    haystack
        .get(start..)?
        .windows(needle.len())
        .position(|window| window == needle)
        .map(|offset| start + offset)
}