use unicode_segmentation::UnicodeSegmentation;
/// Granularity at which `segment` splits its input.
///
/// The enum is fieldless, so it derives `Eq` and `Hash` in addition to `PartialEq`; this lets
/// callers use it as a map/set key and in `match` guards that require total equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Unit {
    /// Split into sentences (handled by `segment_sentences`).
    Sentence,
    /// Split into paragraphs (handled by `segment_paragraphs`).
    Paragraph,
    /// Split into lines (handled by `segment_lines`).
    Line,
}
/// Closing quotes and brackets that `is_closer` recognises.
///
/// The full-width closing parenthesis is written as an escape because it is visually almost
/// identical to ASCII `)`; listing the ASCII form twice would silently drop the full-width one.
const CLOSERS: &[char] = &[
    '」',
    '』',
    '\u{FF09}', // FULLWIDTH RIGHT PARENTHESIS
    ')',
    '"',
    '\'',
    '”',
    '’',
];
pub fn segment(text: &str, unit: Unit) -> Vec<String> {
let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
match unit {
Unit::Sentence => segment_sentences(&normalized),
Unit::Paragraph => segment_paragraphs(&normalized),
Unit::Line => segment_lines(&normalized),
}
}
fn segment_sentences(text: &str) -> Vec<String> {
let graphemes: Vec<&str> = text.graphemes(true).collect();
let mut out = Vec::new();
let mut cur = String::new();
let mut i = 0;
while i < graphemes.len() {
let g = graphemes[i];
cur.push_str(g);
let is_ja_end = g == "。" || g == "!" || g == "?";
let is_ascii_end = g == "." || g == "!" || g == "?";
let mut boundary = false;
if is_ja_end {
while i + 1 < graphemes.len() && is_closer(graphemes[i + 1]) {
i += 1;
cur.push_str(graphemes[i]);
}
boundary = true;
} else if is_ascii_end {
let next = graphemes.get(i + 1).copied();
let decimal_dot = g == "."
&& i > 0
&& is_ascii_digit(graphemes[i - 1])
&& next.is_some_and(is_ascii_digit);
let mut after = i + 1;
while after < graphemes.len() && is_closer(graphemes[after]) {
after += 1;
}
let followed_by_break = match graphemes.get(after) {
None => true,
Some(n) => n.chars().all(char::is_whitespace),
};
if !decimal_dot && followed_by_break {
while i + 1 < after {
i += 1;
cur.push_str(graphemes[i]);
}
boundary = true;
}
}
if boundary {
push_trimmed(&mut out, &cur);
cur.clear();
}
i += 1;
}
push_trimmed(&mut out, &cur);
out
}
/// Splits `text` into paragraphs separated by blank lines.
///
/// A separator is any run of two or more consecutive `\n`; the whole run is discarded. A single
/// `\n` is ordinary content and stays inside its paragraph. Each piece goes through
/// `push_trimmed`, so empty or whitespace-only pieces are dropped and kept pieces are returned
/// exactly as they appear in `text`.
fn segment_paragraphs(text: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut rest = text;
    // The leftmost "\n\n" always starts a newline run: any newline before it is a lone one.
    while let Some(gap) = rest.find("\n\n") {
        let (paragraph, tail) = rest.split_at(gap);
        push_trimmed(&mut paragraphs, paragraph);
        // Skip the entire run, however long, so extra blank lines collapse.
        rest = tail.trim_start_matches('\n');
    }
    push_trimmed(&mut paragraphs, rest);
    paragraphs
}
/// Splits `text` on `\n` and collects the pieces via `push_trimmed`, which drops empty and
/// whitespace-only lines and keeps the rest unmodified.
fn segment_lines(text: &str) -> Vec<String> {
    text.split('\n').fold(Vec::new(), |mut kept, line| {
        push_trimmed(&mut kept, line);
        kept
    })
}
/// Appends `seg` to `out` unless it is empty or consists only of whitespace.
///
/// Despite the name, the stored string is **not** trimmed: trimming is used only to decide
/// whether the segment is blank, and a kept segment is copied verbatim.
fn push_trimmed(out: &mut Vec<String>, seg: &str) {
    let is_blank = seg.chars().all(char::is_whitespace);
    if is_blank {
        return;
    }
    out.push(String::from(seg));
}
/// Returns `true` when `g` is exactly one `char` and that `char` is listed in `CLOSERS`.
///
/// Empty strings and multi-`char` strings are never closers.
fn is_closer(g: &str) -> bool {
    let mut rest = g.chars();
    let Some(first) = rest.next() else {
        return false;
    };
    rest.next().is_none() && CLOSERS.contains(&first)
}
/// Returns `true` when `g` is a single byte in `0`–`9`.
///
/// Anything else — the empty string, longer strings, non-ASCII digits — yields `false`.
fn is_ascii_digit(g: &str) -> bool {
    matches!(g.as_bytes(), [byte] if byte.is_ascii_digit())
}
/// Builds the prompt string `[ index/total ] Enter ▸`, wrapped in terminal escape sequences.
///
/// The text is preceded by `ESC[2m` and followed by `ESC[0m`; `index` and `total` are
/// formatted as given, with no range checking.
pub fn reader_prompt(index: usize, total: usize) -> String {
    const STYLE_ON: &str = "\x1b[2m";
    const STYLE_OFF: &str = "\x1b[0m";
    const ARROW: char = '\u{25b8}';
    format!("{STYLE_ON}[ {index}/{total} ] Enter {ARROW}{STYLE_OFF}")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that segmenting `input` by `unit` yields exactly `expected`.
    #[track_caller]
    fn expect_segments(unit: Unit, input: &str, expected: &[&str]) {
        assert_eq!(segment(input, unit), expected);
    }

    /// Sentence-mode shorthand for [`expect_segments`].
    #[track_caller]
    fn expect_sentences(input: &str, expected: &[&str]) {
        expect_segments(Unit::Sentence, input, expected);
    }

    /// Paragraph-mode shorthand for [`expect_segments`].
    #[track_caller]
    fn expect_paragraphs(input: &str, expected: &[&str]) {
        expect_segments(Unit::Paragraph, input, expected);
    }

    /// Line-mode shorthand for [`expect_segments`].
    #[track_caller]
    fn expect_lines(input: &str, expected: &[&str]) {
        expect_segments(Unit::Line, input, expected);
    }

    // ---- sentences -------------------------------------------------------

    #[test]
    fn sentence_japanese_basic() {
        expect_sentences("一文目。二文目!三文目?", &["一文目。", "二文目!", "三文目?"]);
    }

    #[test]
    fn sentence_japanese_absorbs_closing_bracket() {
        expect_sentences("「こんにちは。」次へ。", &["「こんにちは。」", "次へ。"]);
    }

    #[test]
    fn sentence_english_splits_on_space_after_period() {
        expect_sentences("Hello there. How are you?", &["Hello there.", " How are you?"]);
    }

    #[test]
    fn sentence_does_not_split_decimal() {
        expect_sentences("Pi is 3.14 today.", &["Pi is 3.14 today."]);
    }

    #[test]
    fn sentence_trailing_fragment_without_terminator() {
        expect_sentences("First. Loose end", &["First.", " Loose end"]);
    }

    #[test]
    fn sentence_empty_and_whitespace_only() {
        expect_sentences("", &[]);
        expect_sentences(" \n ", &[]);
    }

    #[test]
    fn sentence_absorbs_ascii_quote() {
        expect_sentences("He said \"hi.\" Then left.", &["He said \"hi.\"", " Then left."]);
    }

    // ---- paragraphs and lines (basic) ------------------------------------

    #[test]
    fn paragraph_splits_on_blank_line() {
        expect_paragraphs(
            "Para one.\nStill one.\n\nPara two.",
            &["Para one.\nStill one.", "Para two."],
        );
    }

    #[test]
    fn paragraph_collapses_multiple_blank_lines() {
        expect_paragraphs("A\n\n\n\nB", &["A", "B"]);
    }

    #[test]
    fn line_splits_on_newline_drops_trailing_blanks() {
        expect_lines("one\ntwo\n\n", &["one", "two"]);
    }

    #[test]
    fn line_drops_empty_interior_lines() {
        expect_lines("a\n\nb", &["a", "b"]);
    }

    // ---- sentences (edge cases) ------------------------------------------

    #[test]
    fn sentence_mr_period_splits_on_space() {
        expect_sentences("Mr. Smith went home.", &["Mr.", " Smith went home."]);
    }

    #[test]
    fn sentence_ellipsis_midword() {
        expect_sentences("Wait... really.", &["Wait...", " really."]);
    }

    #[test]
    fn sentence_ellipsis_trailing() {
        expect_sentences("Wait...", &["Wait..."]);
    }

    #[test]
    fn sentence_consecutive_japanese_terminators() {
        expect_sentences("本当。。終わり。", &["本当。", "。", "終わり。"]);
    }

    #[test]
    fn sentence_mixed_bang_question() {
        expect_sentences("Really?! Yes.", &["Really?!", " Yes."]);
    }

    #[test]
    fn sentence_leading_dot_decimal_like() {
        expect_sentences(".5 cents.", &[".5 cents."]);
    }

    #[test]
    fn sentence_digit_then_terminal_dot() {
        expect_sentences("100.", &["100."]);
    }

    #[test]
    fn sentence_emoji_before_terminator() {
        expect_sentences("Run🎉. Next.", &["Run🎉.", " Next."]);
    }

    #[test]
    fn sentence_crlf_normalized_no_cr_residue() {
        expect_sentences("Line one.\r\nLine two.", &["Line one.", "\nLine two."]);
    }

    #[test]
    fn sentence_domain_dot_not_split() {
        expect_sentences("a.b", &["a.b"]);
        expect_sentences("U.S.A. is here.", &["U.S.A.", " is here."]);
    }

    #[test]
    fn sentence_closer_at_eof() {
        expect_sentences("He said \"no.\"", &["He said \"no.\""]);
        expect_sentences("end.)", &["end.)"]);
    }

    #[test]
    fn sentence_closer_then_space() {
        expect_sentences("Hi.) Bye.", &["Hi.)", " Bye."]);
    }

    #[test]
    fn sentence_japanese_terminator_then_newline() {
        expect_sentences("a。\nb", &["a。", "\nb"]);
    }

    #[test]
    fn sentence_combining_grapheme_preserved() {
        expect_sentences("e\u{0301}nd.", &["e\u{0301}nd."]);
    }

    #[test]
    fn sentence_single_terminator_only() {
        expect_sentences("。", &["。"]);
    }

    #[test]
    fn sentence_unclosed_opener() {
        expect_sentences("「終わり。", &["「終わり。"]);
    }

    // ---- paragraphs (edge cases) -----------------------------------------

    #[test]
    fn paragraph_preserves_interior_single_newline() {
        expect_paragraphs("A\nB\n\nC\nD", &["A\nB", "C\nD"]);
    }

    #[test]
    fn paragraph_leading_blank_lines_dropped() {
        expect_paragraphs("\n\nA", &["A"]);
    }

    #[test]
    fn paragraph_whitespace_only_line_between() {
        expect_paragraphs("A\n \nB", &["A\n \nB"]);
    }

    #[test]
    fn paragraph_crlf_blank_line_splits() {
        expect_paragraphs("a\r\n\r\nb", &["a", "b"]);
    }

    // ---- lines (edge cases) ----------------------------------------------

    #[test]
    fn line_last_line_without_newline() {
        expect_lines("a\nb", &["a", "b"]);
    }

    #[test]
    fn line_crlf_normalized() {
        expect_lines("a\r\nb\r\n\r\n", &["a", "b"]);
    }

    #[test]
    fn line_all_blank_inputs() {
        expect_lines("", &[]);
        expect_lines("\n", &[]);
    }

    // ---- reader prompt ---------------------------------------------------

    #[test]
    fn reader_prompt_index_equals_total_and_arrow() {
        let prompt = reader_prompt(1, 1);
        assert!(prompt.contains("1/1"), "shows index/total when equal");
        assert!(prompt.contains('\u{25b8}'), "contains the advance arrow");
    }

    #[test]
    fn reader_prompt_is_dim_and_has_counts() {
        let prompt = reader_prompt(2, 5);
        assert!(prompt.starts_with("\x1b[2m"), "starts dim");
        assert!(prompt.ends_with("\x1b[0m"), "ends with reset");
        assert!(prompt.contains("2/5"), "shows index/total");
    }
}