unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use std::collections::HashMap;

use lingua::{Language, LanguageDetector};
use unicode_segmentation::UnicodeSegmentation;
use whatlang::Lang;

/// Detects the language of `text` and returns it together with a confidence value.
///
/// The language is determined by the given lingua `detector`. If lingua reports
/// Japanese or Chinese, the decision is handed over to [`detect_chinese_or_japanese`]
/// and its result (language and confidence) is returned when it produces one.
/// In every other case the confidence is the one lingua computes for the
/// language it detected.
///
/// Returns `None` if lingua does not detect a language for `text`.
pub fn detect_language_with_confidence(
	detector: &LanguageDetector,
	text: &str,
) -> Option<(Language, f64)> {
	let language = detector.detect_language_of(text)?;

	// Give the dedicated Chinese/Japanese check the first say; fall through to
	// lingua's own result if it cannot decide.
	if matches!(language, Language::Japanese | Language::Chinese) {
		if let Some(refined) = detect_chinese_or_japanese(text) {
			return Some(refined);
		}
	}

	Some((language, detector.compute_language_confidence(text, language)))
}

/// Decides between Chinese and Japanese for `text` using whatlang.
///
/// The whatlang detector is restricted to an allowlist of Mandarin (`Lang::Cmn`)
/// and Japanese (`Lang::Jpn`). The result is mapped to the corresponding lingua
/// [`Language`] and returned together with the confidence reported by whatlang.
///
/// Returns `None` if whatlang detects nothing or reports any other language.
pub fn detect_chinese_or_japanese(text: &str) -> Option<(Language, f64)> {
	let cjk_detector = whatlang::Detector::with_allowlist(vec![Lang::Cmn, Lang::Jpn]);
	let detection = cjk_detector.detect(text)?;

	let language = match detection.lang() {
		Lang::Cmn => Language::Chinese,
		Lang::Jpn => Language::Japanese,
		_ => return None,
	};

	Some((language, detection.confidence()))
}

/// Returns the detected languages in a text in a way that it can be used for multilanguage text segmentation.
///
/// This will try to detect up to `max_number_of_languages` languages that are used in the given text. Your code must be able to handle any array size up to the specified maximum, including the **resulting array being empty**, as passing an empty list of languages on to lingua may cause some of its functions to panic.
///
/// The text is split at sentence boundaries and every sentence is run through
/// [`detect_language_with_confidence`]. Confidences above `0.3` are summed up
/// per language. Languages are then returned ordered by descending score:
///
/// * The best scoring language is always included (if there is one and
///   `max_number_of_languages` is not `0`).
/// * Further languages are only included while their score is above the
///   "fluke border" of `0.25 + 0.05 * <number of sentences with a detected language>`.
///
/// Languages with exactly equal scores are ordered by the ordering of
/// [`Language`], so that the result is the same for the same input.
pub fn detect_top_n_languages(
	detector: &LanguageDetector,
	max_number_of_languages: usize,
	text: &str,
) -> Vec<Language> {
	let mut language_highscores: HashMap<Language, f64> = HashMap::new();

	// Counts every sentence a language was detected for, including the ones
	// that are too low in confidence to contribute to a score.
	let mut detected_sentence_count: usize = 0;

	for sentence in text.split_sentence_bounds() {
		if let Some((language, confidence)) = detect_language_with_confidence(detector, sentence) {
			detected_sentence_count += 1;
			if confidence > 0.3 {
				*language_highscores.entry(language).or_insert(0.0) += confidence;
			}
		}
	}

	let mut scores: Vec<(Language, f64)> = language_highscores.into_iter().collect();
	// Highest score first. The tie-break on the language is needed because the
	// iteration order of the HashMap is not stable between runs.
	scores.sort_by(|(language_a, score_a), (language_b, score_b)| {
		score_b
			.total_cmp(score_a)
			.then_with(|| language_a.cmp(language_b))
	});

	// The more sentences were detected, the more score a language needs to
	// not be considered a fluke.
	let fluke_border = 0.25 + (detected_sentence_count as f64) * 0.05;

	scores
		.into_iter()
		.take(max_number_of_languages)
		.enumerate()
		// The top language is exempt from the fluke border; selection stops at
		// the first language that does not clear it.
		.take_while(|(rank, (_, score))| *rank == 0 || *score > fluke_border)
		.map(|(_, (language, _))| language)
		.collect()
}