use uuid::Uuid;
use crate::chunk_by_word::{WordType, chunk_by_word};
use crate::cut_type::CutType;
use crate::token_counter::TokenCounter;
/// One sentence-level chunk produced by [`chunk_by_sentence`].
///
/// The chunk does not own its text: `text` borrows from the input string for
/// the lifetime `'a`.
#[derive(Debug, Clone)]
pub struct SentenceChunk<'a> {
/// Paragraph UUID that was current when `chunk_by_sentence` emitted this
/// chunk. The chunker draws a fresh v4 UUID each time it closes a chunk on a
/// `ParagraphEnd` word, and does so before pushing that chunk.
pub paragraph_id: Uuid,
/// Slice of the input covering every word accumulated into this chunk.
pub text: &'a str,
/// Sum of the token counter's `count_tokens` results for the words in `text`.
pub size: usize,
/// How the chunk was terminated, as recorded by `chunk_by_sentence`.
pub cut_type: CutType,
}
/// Translates a word-level classification into the cut type of the same name.
///
/// The match deliberately has no wildcard arm, so adding a `WordType` variant
/// stops this function from compiling until a mapping is chosen for it.
fn word_type_to_cut_type(wt: WordType) -> CutType {
    match wt {
        WordType::Word => CutType::Word,
        WordType::SentenceEnd => CutType::SentenceEnd,
        WordType::ParagraphEnd => CutType::ParagraphEnd,
    }
}
/// Returns the byte offset at which `slice` begins inside `base`.
///
/// The result is only meaningful when `slice` is a subslice of `base`: it is
/// computed purely from the two start addresses, and nothing here verifies
/// that `slice` actually lies within `base`.
fn offset_in(base: &str, slice: &str) -> usize {
    let base_addr = base.as_ptr() as usize;
    let slice_addr = slice.as_ptr() as usize;
    slice_addr - base_addr
}
/// Splits `data` into sentence-level chunks, optionally capped at
/// `maximum_size` tokens.
///
/// Words are obtained from [`chunk_by_word`] and accumulated into a running
/// sentence. Each returned chunk borrows its `text` from `data`, and its `size`
/// is the sum of `counter.count_tokens` over the words it contains.
///
/// A chunk is emitted when:
///
/// * a `SentenceEnd` or `ParagraphEnd` word is appended to the running
///   sentence;
/// * `maximum_size` is `Some(max)`, a sentence is open, and appending the next
///   word would push the running size above `max` — the running sentence is
///   flushed *without* that word, which then starts the next one;
/// * the input ends while a sentence is still open. That trailing chunk is
///   tagged [`CutType::SentenceCut`] when the tracked word type is `Word`.
///
/// For every other chunk `cut_type` mirrors the tracked word type: the type of
/// the most recent boundary word, or of the most recent `Word` that contains an
/// alphabetic character. The tracked type is updated *before* the size check,
/// so a chunk flushed for size is tagged using the word that triggered the
/// flush.
///
/// Words are never split: a single word whose token count exceeds `max` is
/// returned as an oversized chunk.
///
/// `paragraph_id` starts as a fresh v4 UUID and is replaced each time a chunk
/// is closed on a `ParagraphEnd` word, before that chunk is pushed.
///
/// Byte ranges are derived from the addresses of the word slices, so this
/// assumes every `text` yielded by `chunk_by_word` is a subslice of `data`.
pub fn chunk_by_sentence<'a, C: TokenCounter>(
    data: &'a str,
    maximum_size: Option<usize>,
    counter: &C,
) -> Vec<SentenceChunk<'a>> {
    let words = chunk_by_word(data);
    let mut result = Vec::new();
    let mut paragraph_id = Uuid::new_v4();
    let mut sentence_size: usize = 0;
    let mut word_type_state = WordType::Word;
    // Byte range of the running sentence; `None` means no sentence is open.
    let mut sentence_start: Option<usize> = None;
    let mut sentence_end: usize = 0;

    for word_chunk in &words {
        let word = word_chunk.text;
        let word_type = word_chunk.word_type;
        let word_size = counter.count_tokens(word);
        let word_start_byte = offset_in(data, word);
        let word_end_byte = word_start_byte + word.len();

        // Boundary words always update the tracked type; plain words only do
        // so when they contain a letter, so letter-free tokens following a
        // boundary leave the boundary type in place.
        match word_type {
            WordType::ParagraphEnd | WordType::SentenceEnd => {
                word_type_state = word_type;
            }
            WordType::Word => {
                if word.chars().any(char::is_alphabetic) {
                    word_type_state = word_type;
                }
            }
        }

        // Size overflow: flush what has been accumulated and restart the
        // running sentence at the current word. Binding `start` in the
        // condition makes "a sentence is open" part of the guard itself.
        if let Some(max) = maximum_size
            && sentence_size + word_size > max
            && let Some(start) = sentence_start
        {
            result.push(SentenceChunk {
                paragraph_id,
                text: &data[start..sentence_end],
                size: sentence_size,
                cut_type: word_type_to_cut_type(word_type_state),
            });
            sentence_start = Some(word_start_byte);
            sentence_end = word_end_byte;
            sentence_size = word_size;
            // NOTE(review): a boundary word that triggers the flush is carried
            // over as ordinary text — it neither closes a sentence nor rotates
            // `paragraph_id`. Confirm this is intended.
            continue;
        }

        // Append the word, opening a sentence at it if none is open.
        let start = *sentence_start.get_or_insert(word_start_byte);
        sentence_end = word_end_byte;
        sentence_size += word_size;

        match word_type {
            WordType::Word => {}
            WordType::ParagraphEnd | WordType::SentenceEnd => {
                // NOTE(review): the id is rotated before the push, so the
                // chunk that closes a paragraph carries the new id. Confirm
                // against consumers of `paragraph_id`.
                if word_type == WordType::ParagraphEnd {
                    paragraph_id = Uuid::new_v4();
                }
                result.push(SentenceChunk {
                    paragraph_id,
                    text: &data[start..sentence_end],
                    size: sentence_size,
                    cut_type: word_type_to_cut_type(word_type_state),
                });
                sentence_start = None;
                sentence_size = 0;
            }
        }
    }

    // Input ended with a sentence still open: emit the remainder.
    if let Some(start) = sentence_start {
        let cut_type = if word_type_state == WordType::Word {
            CutType::SentenceCut
        } else {
            word_type_to_cut_type(word_type_state)
        };
        result.push(SentenceChunk {
            paragraph_id,
            text: &data[start..sentence_end],
            size: sentence_size,
            cut_type,
        });
    }

    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token_counter::WordCounter;

    /// Runs the chunker with the word-based counter used by every test here.
    fn run(text: &str, limit: Option<usize>) -> Vec<SentenceChunk<'_>> {
        chunk_by_sentence(text, limit, &WordCounter)
    }

    /// Named fixture texts shared by the corpus-wide property tests.
    fn corpus() -> [(&'static str, &'static str); 4] {
        use crate::test_inputs::{EMPTY, ENGLISH_LISTS, ENGLISH_TEXT, PYTHON_CODE};
        [
            ("english_text", ENGLISH_TEXT),
            ("english_lists", ENGLISH_LISTS),
            ("python_code", PYTHON_CODE),
            ("empty", EMPTY),
        ]
    }

    #[test]
    fn empty_input() {
        assert!(run("", None).is_empty());
    }

    #[test]
    fn single_sentence() {
        let out = run("Hello world.", None);
        assert_eq!(out.len(), 1);
        let only = &out[0];
        assert_eq!(only.text, "Hello world.");
        assert_eq!(only.size, 2);
        assert_eq!(only.cut_type, CutType::SentenceEnd);
    }

    #[test]
    fn two_sentences_same_paragraph() {
        let out = run("Hello world. Foo bar.", None);
        assert_eq!(out.len(), 2);
        assert_eq!(out[0].paragraph_id, out[1].paragraph_id);
    }

    #[test]
    fn paragraph_boundary_new_id() {
        let out = run("First paragraph.\nSecond paragraph.\nThird.", None);
        assert_eq!(out.len(), 3);
        assert_ne!(out[0].paragraph_id, out[1].paragraph_id);
    }

    #[test]
    fn sentence_cut_no_punctuation() {
        let out = run("Hello world", None);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].cut_type, CutType::SentenceCut);
    }

    #[test]
    fn maximum_size_overflow() {
        let out = run("one two three four", Some(2));
        assert_eq!(out.len(), 2);

        let (head, tail) = (&out[0], &out[1]);
        assert_eq!(head.text, "one two ");
        assert_eq!(head.size, 2);
        assert_eq!(tail.text, "three four");
        assert_eq!(tail.size, 2);
    }

    #[test]
    fn token_counting_matches_word_count() {
        let out = run("This is a test sentence.", None);
        assert_eq!(out[0].size, 5);
    }

    /// Concatenating the chunk texts must reproduce the input exactly.
    #[test]
    fn isomorphism_parametrized() {
        let limits: [Option<usize>; 3] = [None, Some(16), Some(64)];
        for (name, text) in corpus() {
            for max in limits {
                let reconstructed = run(text, max)
                    .iter()
                    .map(|chunk| chunk.text)
                    .collect::<String>();
                assert_eq!(
                    reconstructed, text,
                    "isomorphism failed for ('{name}', max={max:?})"
                );
            }
        }
    }

    /// No chunk produced from the fixture texts may exceed the requested limit.
    #[test]
    fn token_count_within_max_length() {
        for (name, text) in corpus() {
            for max in [16_usize, 64] {
                for (i, chunk) in run(text, Some(max)).into_iter().enumerate() {
                    assert!(
                        chunk.size <= max,
                        "chunk {i} in ('{name}', max={max}) has size {} > {max}",
                        chunk.size
                    );
                }
            }
        }
    }

    #[test]
    fn chinese_text_no_panic() {
        use crate::test_inputs::CHINESE_TEXT;
        let out = run(CHINESE_TEXT, Some(16));
        assert!(
            !out.is_empty(),
            "Chinese text should produce at least one chunk"
        );
    }
}