use std::vec::IntoIter;
use lingua::LanguageDetector;
use lingua::LanguageDetectorBuilder;
use unicode_segmentation::UnicodeSegmentation;
use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::language_detection::DetectionIterator;
use crate::language_detection::detect_top_n_languages;
use crate::language_detection::lingua_language_to_whatlang_language;
use crate::segmentation::Segmenter;
use crate::segmentation::UnicodeSentenceSplitter;
/// Segmenter that breaks a token into sentence pieces and tags them with the
/// language detected by Lingua.
pub struct LinguaLanguageBlockSentenceSplitter {
    /// Lingua detector owned by this splitter.
    language_detector: LanguageDetector,
}
impl Default for LinguaLanguageBlockSentenceSplitter {
    /// Creates a splitter backed by a detector that is built from all Lingua
    /// languages with preloaded language models.
    fn default() -> Self {
        let language_detector = LanguageDetectorBuilder::from_all_languages()
            .with_preloaded_language_models()
            .build();

        Self { language_detector }
    }
}
impl LinguaLanguageBlockSentenceSplitter {
    /// Creates a splitter with the default detector configuration.
    ///
    /// Equivalent to [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a splitter whose detector is built from the supplied,
    /// caller-configured `builder`.
    pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
        let language_detector = builder.build();

        Self { language_detector }
    }
}
impl Segmenter for LinguaLanguageBlockSentenceSplitter {
type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
fn subdivide<'a>(
&self,
token: SegmentedToken<'a>,
) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
let languages = detect_top_n_languages(&self.language_detector, 3, token.text);
if languages.is_empty() {
return UnicodeSentenceSplitter::new().subdivide(token);
}
let detection_iterator = DetectionIterator::detect(
&LanguageDetectorBuilder::from_languages(&languages).build(),
token.text,
);
let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());
let mut collection: Vec<SegmentedToken<'_>> = Vec::new();
let mut last_offset = 0;
let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());
for (mut start_index, end_index, language) in detection_iterator {
let language = language.and_then(lingua_language_to_whatlang_language);
while next_sentence_split_at > start_index && next_sentence_split_at <= end_index {
collection.push(
SegmentedToken::new_derived_from(
&token.text[start_index..next_sentence_split_at],
&token,
)
.with_detected_language(language, true, 1.0),
);
collection.push(
SegmentedToken::new_end_of_sentence(
&token.text[next_sentence_split_at..next_sentence_split_at],
)
.with_detected_language(language, true, 1.0),
);
start_index = next_sentence_split_at;
next_sentence_split_at = sentence_iterator
.next()
.map(|n| n + next_sentence_split_at)
.unwrap_or_else(|| token.text.len());
}
if start_index != end_index {
collection.push(
SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
.with_detected_language(language, true, 1.0),
);
}
last_offset = end_index;
}
if last_offset != token.text.len() {
collection.push(SegmentedToken::new_derived_from(
&token.text[last_offset..],
&token,
));
}
UseOrSubdivide::Subdivide(collection.into_iter())
}
}
#[cfg(test)]
mod test {
    use whatlang::Lang;
    use std::time::Instant;
    use super::*;
    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// A text made of a French, a German and an English sentence must come out
    /// as three language-tagged sentences, each followed by an empty
    /// end-of-sentence marker.
    #[test]
    fn test_lingua_multilanguage_detection() {
        let splitter = LinguaLanguageBlockSentenceSplitter::new();
        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. ";

        let expected: Vec<(&str, Option<Lang>, bool)> = vec![
            ("Parlez-vous français? ", Some(Lang::Fra), false),
            ("", Some(Lang::Fra), true),
            (
                "Ich spreche Französisch nur ein bisschen. ",
                Some(Lang::Deu),
                false,
            ),
            ("", Some(Lang::Deu), true),
            (
                "A little bit is better than nothing. ",
                Some(Lang::Eng),
                false,
            ),
            ("", Some(Lang::Eng), true),
        ];

        let actual: Vec<(&str, Option<Lang>, bool)> = input
            .start_segmentation_chain()
            .chain_segmenter(&splitter)
            .map(|token| (token.text, token.detected_language, token.is_end_of_sentence))
            .collect();

        assert_eq!(actual, expected);
    }

    /// A longer single-language text must be cut at every sentence boundary,
    /// with an empty end-of-sentence marker after each sentence.
    #[test]
    fn test_lingua_sentence_splitting() {
        let input = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eu tincidunt enim. Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. In hac habitasse platea dictumst. Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. Sed et sapien lobortis, sagittis lacus sed, posuere tellus. Donec tincidunt dictum tempor. Aenean nec nisl commodo, venenatis leo nec, cursus tellus. Mauris finibus facilisis ultrices. Quisque quis lobortis odio. Cras at nisi augue.";

        // The individual sentences of `input`, in order.
        let sentences = [
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
            "Curabitur eu tincidunt enim. ",
            "Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. ",
            "In hac habitasse platea dictumst. ",
            "Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. ",
            "Sed et sapien lobortis, sagittis lacus sed, posuere tellus. ",
            "Donec tincidunt dictum tempor. ",
            "Aenean nec nisl commodo, venenatis leo nec, cursus tellus. ",
            "Mauris finibus facilisis ultrices. ",
            "Quisque quis lobortis odio. ",
            "Cras at nisi augue.",
        ];
        // Every sentence token is expected to be followed by an empty marker token.
        let expected: Vec<(&str, bool)> = sentences
            .iter()
            .flat_map(|&sentence| [(sentence, false), ("", true)])
            .collect();

        let splitter = LinguaLanguageBlockSentenceSplitter::new();
        let actual: Vec<(&str, bool)> = input
            .start_segmentation_chain()
            .chain_segmenter(&splitter)
            .map(|token| (token.text, token.is_end_of_sentence))
            .collect();

        if actual == expected {
            return;
        }

        // Print both lists line by line so a mismatch is easy to locate.
        eprintln!("Sentence splitting test failed!");
        eprintln!("\nExpected output:");
        for (n, line) in expected.iter().enumerate() {
            eprintln!("* {n}: {line:?}");
        }
        eprintln!("\nGot output:");
        for (n, line) in actual.iter().enumerate() {
            let is_ok = expected.get(n) == Some(line);
            eprintln!("* {n}: {line:?} is_ok: {is_ok}");
        }
        panic!("Sentence splitting test failed: check stderr output.");
    }

    /// Compares the wall-clock time of the very first segmentation run with the
    /// average of 100 further runs, each using a freshly constructed splitter.
    #[test]
    fn test_lingua_performance() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";

        // One full segmentation pass with a brand-new splitter instance.
        let segment_with_fresh_splitter = || -> Vec<(&'static str, Option<Lang>)> {
            test_text
                .start_segmentation_chain()
                .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
                .map(|t| (t.text, t.detected_language))
                .collect()
        };

        let first_run_started = Instant::now();
        let _first_result = segment_with_fresh_splitter();
        let time_first_iteration = first_run_started.elapsed();

        let later_runs_started = Instant::now();
        for _ in 0..100 {
            let _result = segment_with_fresh_splitter();
        }
        let time_multiple_iterations = later_runs_started.elapsed();

        let average_later_iteration = time_multiple_iterations / 100;
        assert!(
            time_first_iteration > average_later_iteration,
            "Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
            average_later_iteration
        );
    }
}