use std::vec::IntoIter;
use lingua::LanguageDetector;
use lingua::LanguageDetectorBuilder;
use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::language_detection::DetectionIterator;
use crate::language_detection::detect_top_n_languages;
use crate::language_detection::lingua_language_to_whatlang_language;
use crate::segmentation::Segmenter;
/// A [`Segmenter`] that subdivides a token into consecutive blocks according
/// to the languages detected in its text by a lingua [`LanguageDetector`].
pub struct LinguaLanguageBlockSplitter {
// Detector used to determine the candidate languages of each incoming token.
language_detector: LanguageDetector,
}
impl Default for LinguaLanguageBlockSplitter {
fn default() -> Self {
Self {
language_detector: LanguageDetectorBuilder::from_all_languages()
.with_preloaded_language_models()
.build(),
}
}
}
impl LinguaLanguageBlockSplitter {
    /// Creates a splitter with the default detector configuration
    /// (equivalent to [`Default::default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a splitter whose detector is built from the supplied,
    /// caller-configured `builder`.
    pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
        let language_detector = builder.build();
        Self { language_detector }
    }
}
impl Segmenter for LinguaLanguageBlockSplitter {
    type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

    /// Splits `token` into consecutive sub-tokens, one per detected language span.
    ///
    /// The configured detector is first asked (via `detect_top_n_languages`
    /// with `n = 3`) for candidate languages of the whole text. If it yields
    /// none, the token is returned unchanged as [`UseOrSubdivide::Use`].
    /// Otherwise a detector restricted to those candidates is built and its
    /// detections are turned into sub-tokens:
    ///
    /// * every detected span becomes a derived token with `detected_language`
    ///   set and the reliability flag raised;
    /// * any text between two detections, or after the last one, is emitted
    ///   as a plain derived token so that no part of the input is lost.
    ///
    /// # Panics
    ///
    /// Slicing `token.text` panics if the reported offsets are not ascending
    /// byte offsets on `char` boundaries within the text — presumably
    /// `DetectionIterator` guarantees this; verify against its implementation.
    fn subdivide<'a>(
        &self,
        token: SegmentedToken<'a>,
    ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
        let languages = detect_top_n_languages(&self.language_detector, 3, token.text);
        if languages.is_empty() {
            return UseOrSubdivide::Use(token);
        }

        // Only the restricted detector is used for the span detection.
        let detection_iterator = DetectionIterator::detect(
            &LanguageDetectorBuilder::from_languages(&languages).build(),
            token.text,
        );

        // The number of resulting tokens is not known up front. A previous
        // version ran a second, full multi-language detection with the complete
        // detector solely to derive a capacity hint; that cost far more than
        // letting the vector grow on demand.
        let mut collection: Vec<SegmentedToken<'_>> = Vec::new();
        let mut last_offset = 0;
        for (start_index, end_index, language) in detection_iterator {
            // Preserve text that lies between two detected spans.
            if last_offset != start_index {
                collection.push(SegmentedToken::new_derived_from(
                    &token.text[last_offset..start_index],
                    &token,
                ));
            }
            let mut new_token =
                SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token);
            new_token.detected_language = language.and_then(lingua_language_to_whatlang_language);
            // NOTE(review): the flag is raised even when no language could be
            // mapped (`detected_language` is `None`) — confirm this is intended.
            new_token.is_detected_language_relible = true;
            collection.push(new_token);
            last_offset = end_index;
        }

        // Preserve trailing text after the last detected span.
        if last_offset != token.text.len() {
            collection.push(SegmentedToken::new_derived_from(
                &token.text[last_offset..],
                &token,
            ));
        }
        UseOrSubdivide::Subdivide(collection.into_iter())
    }
}
#[cfg(test)]
mod test {
    use std::time::Instant;

    use super::*;
    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;
    use whatlang::Lang;

    /// A text made of a French, a German and an English sentence must be split
    /// into exactly those three blocks, each tagged with its language, and the
    /// result must be stable across repeated runs with the same splitter.
    #[test]
    fn test_lingua_multilanguage_detection() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. ";
        let expected: Vec<(&str, Option<Lang>)> = vec![
            ("Parlez-vous français? ", Some(Lang::Fra)),
            (
                "Ich spreche Französisch nur ein bisschen. ",
                Some(Lang::Deu),
            ),
            ("A little bit is better than nothing. ", Some(Lang::Eng)),
        ];
        let splitter = LinguaLanguageBlockSplitter::new();
        for _ in 0..100 {
            let actual: Vec<(&str, Option<Lang>)> = test_text
                .start_segmentation_chain()
                .chain_segmenter(&splitter)
                .map(|token| (token.text, token.detected_language))
                .collect();
            assert_eq!(actual, expected);
        }
    }

    /// The very first segmentation run must take longer than the average of
    /// 100 later runs, even though every run constructs a fresh splitter.
    #[test]
    fn test_lingua_performance() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";

        // Time a single, initial run.
        let first_start = Instant::now();
        let _first_run: Vec<(&str, Option<Lang>)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&LinguaLanguageBlockSplitter::new())
            .map(|token| (token.text, token.detected_language))
            .collect();
        let time_first_iteration = first_start.elapsed();

        // Time 100 further runs, each with a newly constructed splitter.
        let batch_start = Instant::now();
        for _ in 0..100 {
            let _run: Vec<(&str, Option<Lang>)> = test_text
                .start_segmentation_chain()
                .chain_segmenter(&LinguaLanguageBlockSplitter::new())
                .map(|token| (token.text, token.detected_language))
                .collect();
        }
        let average_later_iteration = batch_start.elapsed() / 100;

        assert!(
            time_first_iteration > average_later_iteration,
            "Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
            average_later_iteration
        );
    }
}