unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use aho_corasick::BuildError;
use aho_corasick::MatchKind;

use std::vec::IntoIter;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::segmentation::Segmenter;

/// Decompose compound words into their parts found in a given dictionary. Useful for small or on-the-fly generated dictionaries.
///
/// This implementation is based on the [aho_corasick] crate.
///
/// A token is only decomposed if its text can be fully resolved using the dictionary; any text containing unknown "words" is passed through as-is.
///
/// This is an adaptation of the [`SplitCompoundWords` filter from tantivy](https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.SplitCompoundWords.html) for this crate.
#[derive(Debug, Clone)]
pub struct DecompositionAhoCorasick {
	/// Automaton built from (or supplied in place of) the dictionary; it is
	/// searched for consecutive matches when a token is subdivided.
	dictionary: AhoCorasick,
}

impl DecompositionAhoCorasick {
	/// Create a filter from a given dictionary.
	///
	/// The dictionary will be used to construct an [`AhoCorasick`] automaton
	/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
	/// more control over its construction is required.
	pub fn from_dictionary<I, P>(dict: I) -> Result<Self, BuildError>
	where
		I: IntoIterator<Item = P>,
		P: AsRef<[u8]>,
	{
		let dict = AhoCorasickBuilder::new()
			.match_kind(MatchKind::LeftmostLongest)
			.build(dict)?;

		Ok(Self::from_automaton(dict))
	}

	/// Create a filter from a given automaton.
	///
	/// The automaton should use one of the leftmost-first match kinds
	/// and it should not be anchored.
	pub fn from_automaton(dictionary: AhoCorasick) -> Self {
		Self { dictionary }
	}
}

impl Segmenter for DecompositionAhoCorasick {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Split `token` into dictionary words if the dictionary covers its whole text.
	///
	/// The token is returned unchanged (`UseOrSubdivide::Use`) when:
	///
	/// * it is already marked as a known word,
	/// * its text is empty (there is nothing to decompose),
	/// * the dictionary matches leave a gap or don't reach the end of the text,
	/// * a match ends inside a multi-byte character, which can happen with
	///   byte-oriented dictionaries and would make the text impossible to cut.
	///
	/// Otherwise every matched part is emitted as a token derived from `token`
	/// and marked as a known word.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, Self::SubdivisionIter<'a>> {
		if token.is_known_word {
			return UseOrSubdivide::Use(token);
		}

		let full_text = token.text;

		// Byte lengths of the consecutive matches, in order of appearance.
		let mut cuts: Vec<usize> = Vec::new();
		// Byte offset up to which the text is covered by matches so far.
		let mut pos: usize = 0;

		for match_ in self.dictionary.find_iter(full_text) {
			// Abort if the matches have some space between them.
			if pos != match_.start() {
				break;
			}
			// The automaton works on bytes, so a dictionary entry that is not
			// valid UTF-8 on its own may end in the middle of a character.
			// Cutting there would make `split_at` panic, treat it as unknown.
			if !full_text.is_char_boundary(match_.end()) {
				break;
			}
			cuts.push(match_.len());
			pos = match_.end();
		}

		// Without any match (this includes the empty text) or without reaching
		// the end, the token couldn't be split into parts of the wordlist
		// -> return the token whole and unchanged.
		if cuts.is_empty() || pos != full_text.len() {
			return UseOrSubdivide::Use(token);
		}

		let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len());
		let mut rest = full_text;
		for len in cuts {
			// Every time we split here `rest` gets shorter, which is
			// why the pushed lengths can be used as positions.
			let (word, tail) = rest.split_at(len);
			rest = tail;
			subsegments
				.push(SegmentedToken::new_derived_from(word, &token).with_is_kown_word(true));
		}

		UseOrSubdivide::Subdivide(subsegments.into_iter())
	}
}

#[cfg(test)]
mod test {

	use super::*;
	use crate::SubdivisionMap;
	use crate::initial_paragraph_splitter::InitialParagraphSplitter;

	/// Tokens made up entirely of dictionary words are split into those words,
	/// every other token ("fooquux", the spaces) comes out unchanged.
	#[test]
	fn test_decomposition_aho_corasick() {
		let dictionary = vec!["foo", "bar", "baz"];
		let decomposer = DecompositionAhoCorasick::from_dictionary(dictionary).unwrap();

		let initial_tokens = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");

		let texts: Vec<&str> =
			SubdivisionMap::new(initial_tokens, |token| decomposer.subdivide(token))
				.map(|token| token.text)
				.collect();

		assert_eq!(
			texts,
			vec![
				"foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
			]
		);
	}
}