unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use lingua::Language;
use whatlang::Lang;

/// Convert a lingua [`Language`] into the matching whatlang [`Lang`].
///
/// Returns `Some(lang)` when whatlang has a variant for the same language and
/// `None` when it does not, so some languages are dropped here because they
/// aren't in [`whatlang::Lang`] (unfortunately).
///
/// The match is intentionally exhaustive without a catch-all arm: when lingua
/// gains a new language, this function stops compiling until a decision has
/// been made for it.
///
/// Long term plan is to use a generic language representation that can fully
/// represent all languages that both whatlang and lingua support.
///
/// With apologies to everyone who feels represented by a language that gets dropped here.
pub fn lingua_language_to_whatlang_language(lang: Language) -> Option<Lang> {
	match lang {
		Language::Afrikaans => Some(Lang::Afr),
		Language::Albanian => None,
		Language::Arabic => Some(Lang::Ara),
		Language::Armenian => Some(Lang::Hye),
		Language::Azerbaijani => Some(Lang::Aze),
		Language::Basque => None,
		Language::Belarusian => Some(Lang::Bel),
		Language::Bengali => Some(Lang::Ben),
		Language::Bokmal => Some(Lang::Nob),
		Language::Bosnian => None,
		Language::Bulgarian => Some(Lang::Bul),
		Language::Catalan => Some(Lang::Cat),
		Language::Chinese => Some(Lang::Cmn),
		Language::Croatian => Some(Lang::Hrv),
		Language::Czech => Some(Lang::Ces),
		Language::Danish => Some(Lang::Dan),
		Language::Dutch => Some(Lang::Nld),
		Language::English => Some(Lang::Eng),
		Language::Esperanto => Some(Lang::Epo),
		Language::Estonian => Some(Lang::Est),
		Language::Finnish => Some(Lang::Fin),
		Language::French => Some(Lang::Fra),
		Language::Ganda => None,
		Language::Georgian => Some(Lang::Kat),
		Language::German => Some(Lang::Deu),
		Language::Greek => Some(Lang::Ell),
		Language::Gujarati => Some(Lang::Guj),
		Language::Hebrew => Some(Lang::Heb),
		Language::Hindi => Some(Lang::Hin),
		Language::Hungarian => Some(Lang::Hun),
		Language::Icelandic => None,
		Language::Indonesian => Some(Lang::Ind),
		Language::Irish => None,
		Language::Italian => Some(Lang::Ita),
		Language::Japanese => Some(Lang::Jpn),
		Language::Kazakh => None,
		Language::Korean => Some(Lang::Kor),
		Language::Latin => Some(Lang::Lat),
		Language::Latvian => Some(Lang::Lav),
		Language::Lithuanian => Some(Lang::Lit),
		Language::Macedonian => Some(Lang::Mkd),
		// `Lang::Mal` is Malayalam (ISO 639-3 `mal`), NOT Malay (`msa`).
		// whatlang has no Malay variant, so Malay has to be dropped.
		Language::Malay => None,
		Language::Maori => None,
		Language::Marathi => Some(Lang::Mar),
		Language::Mongolian => None,
		Language::Nynorsk => None,
		Language::Persian => Some(Lang::Pes),
		Language::Polish => Some(Lang::Pol),
		Language::Portuguese => Some(Lang::Por),
		Language::Punjabi => Some(Lang::Pan),
		Language::Romanian => Some(Lang::Ron),
		Language::Russian => Some(Lang::Rus),
		Language::Serbian => Some(Lang::Srp),
		Language::Shona => Some(Lang::Sna),
		Language::Slovak => Some(Lang::Slk),
		Language::Slovene => Some(Lang::Slv),
		Language::Somali => None,
		Language::Sotho => None,
		Language::Spanish => Some(Lang::Spa),
		Language::Swahili => None,
		Language::Swedish => Some(Lang::Swe),
		Language::Tagalog => Some(Lang::Tgl),
		Language::Tamil => Some(Lang::Tam),
		Language::Telugu => Some(Lang::Tel),
		Language::Thai => Some(Lang::Tha),
		Language::Tsonga => None,
		Language::Tswana => None,
		Language::Turkish => Some(Lang::Tur),
		Language::Ukrainian => Some(Lang::Ukr),
		Language::Urdu => Some(Lang::Urd),
		Language::Vietnamese => Some(Lang::Vie),
		Language::Welsh => Some(Lang::Cym),
		Language::Xhosa => None,
		Language::Yoruba => None,
		Language::Zulu => Some(Lang::Zul),
	}
}