use regex::Regex;
use unicode_width::UnicodeWidthStr;
/// A single non-tag character together with the tag text that directly follows it.
///
/// The struct is only a `char` plus a borrowed slice, so it is cheap to copy and
/// compare; `Debug` makes it printable in assertion failures.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct PlainChar<'a> {
    /// The character itself.
    ch: char,
    /// Tag text located immediately after `ch` in the source string
    /// (empty when no tag follows).
    trailing: &'a str,
}
/// Outcome of splitting an input string into tag text and non-tag characters.
struct Tokens<'a> {
    /// Portion of the input located before the first non-tag character.
    leading: &'a str,
    /// The non-tag characters in input order, each carrying the tag text that
    /// directly follows it.
    chars: Vec<PlainChar<'a>>,
}
/// Splits `input` into its non-tag characters and the tag text surrounding them.
///
/// Every match of `tag_regex` is treated as a tag. Tag text found before the
/// first non-tag character becomes `Tokens::leading`; each later run of
/// back-to-back tags is attached to the character it follows as
/// `PlainChar::trailing`. All returned slices borrow from `input`.
///
/// When `input` holds no non-tag character at all, `chars` is empty and
/// `leading` is the whole of `input`.
fn tokenize_plain_chars<'a>(input: &'a str, tag_regex: &Regex) -> Tokens<'a> {
    // Matches are produced in ascending order; peeking lets us consume one only
    // when it begins exactly at the offset we are looking at.
    let mut tags = tag_regex
        .find_iter(input)
        .map(|m| (m.start(), m.end()))
        .peekable();
    let mut chars = Vec::new();
    // Stays equal to the whole input unless at least one plain character shows up.
    let mut leading = input;
    let mut cursor = 0;

    while let Some(ch) = input[cursor..].chars().next() {
        // A tag starting right here is skipped as a unit.
        if let Some((_, tag_end)) = tags.next_if(|&(start, _)| start == cursor) {
            cursor = tag_end;
            continue;
        }

        // The first plain character marks where the leading section stops.
        if chars.is_empty() {
            leading = &input[..cursor];
        }

        // Absorb every tag that sits directly behind this character.
        let char_end = cursor + ch.len_utf8();
        let mut trailing_end = char_end;
        while let Some((_, tag_end)) = tags.next_if(|&(start, _)| start == trailing_end) {
            trailing_end = tag_end;
        }

        chars.push(PlainChar {
            ch,
            trailing: &input[char_end..trailing_end],
        });
        cursor = trailing_end;
    }

    Tokens { leading, chars }
}
/// Inserts line-break markers into `input` so that lines respect the given widths.
///
/// Text matched by `tag_regex` is excluded from both word segmentation and width
/// measurement, and is copied to the output unchanged. The remaining characters
/// are segmented into words with `budoux::parse`, and each word is measured with
/// `UnicodeWidthStr::width_cjk`. Whenever appending a word would push a
/// non-empty line past its limit, the two-character marker `\n` (a backslash
/// followed by `n`) is emitted in front of that word.
///
/// # Parameters
/// - `input`: text to process, possibly containing tags.
/// - `widths`: width limit per line; line `i` uses `widths[i]`, and lines beyond
///   the end of the slice reuse the last entry.
/// - `tag_regex`: pattern whose matches are treated as tags.
/// - `model`: segmentation model handed to `budoux::parse`.
///
/// # Returns
/// The text with markers inserted. `input` is returned unchanged when it is
/// empty, when `widths` is empty, or when it contains no non-tag character.
///
/// Words are never split: a single word wider than the limit stays on one line,
/// and no marker is ever placed before the first word.
///
/// NOTE(review): tags already present in `input` never reset the running line
/// width, even if one of them renders as a line break — confirm this is intended.
pub fn break_lines_impl(
    input: &str,
    widths: &[usize],
    tag_regex: &Regex,
    model: &budoux::Model,
) -> String {
    // Without text or without any width there is nothing to lay out. The last
    // width doubles as the limit for every line past the end of `widths`.
    let last_width = match widths.last() {
        Some(&width) if !input.is_empty() => width,
        _ => return input.to_string(),
    };

    let tokens = tokenize_plain_chars(input, tag_regex);
    if tokens.chars.is_empty() {
        return input.to_string();
    }

    // The segmenter only ever sees the tag-free text.
    let plaintext: String = tokens.chars.iter().map(|pc| pc.ch).collect();
    let words = budoux::parse(model, &plaintext);

    // Character indices (into `tokens.chars`) that get a marker in front of them.
    let mut break_before = Vec::new();
    let mut used = 0usize; // width consumed on the current line
    let mut line_no = 0usize;
    let mut offset = 0usize; // index of the current word's first character
    for word in &words {
        let word_width = UnicodeWidthStr::width_cjk(word.as_str());
        let limit = widths.get(line_no).copied().unwrap_or(last_width);

        // Break only when the line already holds something; otherwise the word
        // is kept where it is even if it is too wide on its own.
        if offset > 0 && used > 0 && used + word_width > limit {
            break_before.push(offset);
            line_no += 1;
            used = word_width;
        } else {
            used += word_width;
        }
        offset += word.chars().count();
    }

    // Each marker adds two bytes on top of the original text.
    let mut out = String::with_capacity(input.len() + break_before.len() * 2);
    out.push_str(tokens.leading);
    let mut pending = break_before.iter().copied().peekable();
    for (idx, pc) in tokens.chars.iter().enumerate() {
        if pending.next_if_eq(&idx).is_some() {
            out.push_str("\\n");
        }
        out.push(pc.ch);
        out.push_str(pc.trailing);
    }
    out
}
#[cfg(test)]
mod tests {
    use super::super::tokenizer::Tokenizer;
    use super::*;

    /// The marker `break_lines_impl` inserts: a backslash followed by `n`.
    const BREAK: &str = "\\n";

    /// Compiles the tag pattern used throughout these tests.
    fn tag_regex() -> Regex {
        Regex::new(Tokenizer::SAKURA_TAG_PATTERN).unwrap()
    }

    /// Returns the default Japanese segmentation model.
    fn model() -> &'static budoux::Model {
        budoux::models::default_japanese_model()
    }

    /// Removes everything `re` matches, leaving only the non-tag text.
    fn strip_tags(re: &Regex, text: &str) -> String {
        re.replace_all(text, "").into_owned()
    }

    // ---- break_lines_impl -------------------------------------------------

    #[test]
    fn test_plain_japanese_text_breaks_at_word_boundary() {
        let re = tag_regex();
        let input = "今日はいい天気ですね";
        let result = break_lines_impl(input, &[6], &re, model());
        assert!(result.contains(BREAK), "Expected line break in: {}", result);
        assert_eq!(
            strip_tags(&re, &result),
            input,
            "Plain text should be preserved"
        );
    }

    #[test]
    fn test_plain_text_no_break_when_fits() {
        let re = tag_regex();
        let result = break_lines_impl("短い", &[20], &re, model());
        assert_eq!(result, "短い");
    }

    #[test]
    fn test_tags_excluded_from_width_and_preserved() {
        let re = tag_regex();
        let input = r"こ\_w[50]れ\_w[50]は\_w[50]テ\_w[50]ス\_w[50]ト";
        let result = break_lines_impl(input, &[6], &re, model());
        assert!(result.contains(BREAK), "Expected line break in: {}", result);
        let wait_tags = result.matches(r"\_w[50]").count();
        assert_eq!(wait_tags, 5, "All 5 wait tags must be preserved");
    }

    #[test]
    fn test_leading_tags_preserved() {
        let re = tag_regex();
        let input = r"\h\s[0]こんにちは世界";
        let result = break_lines_impl(input, &[6], &re, model());
        assert!(
            result.starts_with(r"\h\s[0]"),
            "Leading tags should be preserved: {}",
            result
        );
    }

    #[test]
    fn test_multiple_width_thresholds() {
        let re = tag_regex();
        let input = "あいうえおかきくけこさしすせそたちつてと";
        let result = break_lines_impl(input, &[4, 6], &re, model());
        assert!(result.contains(BREAK), "Expected line breaks: {}", result);
        assert_eq!(strip_tags(&re, &result), input);
    }

    #[test]
    fn test_empty_input_returns_empty() {
        let re = tag_regex();
        let result = break_lines_impl("", &[10], &re, model());
        assert_eq!(result, "");
    }

    #[test]
    fn test_empty_widths_returns_input_unchanged() {
        let re = tag_regex();
        let input = "テスト文字列";
        let result = break_lines_impl(input, &[], &re, model());
        assert_eq!(result, input);
    }

    #[test]
    fn test_single_oversized_word_no_forced_break() {
        let re = tag_regex();
        let input = "超長い一語";
        let result = break_lines_impl(input, &[2], &re, model());
        assert_eq!(strip_tags(&re, &result), input);
    }

    #[test]
    fn test_oversized_word_then_normal() {
        let re = tag_regex();
        let input = "あいうえおかきくけこさしすせそ短い文";
        let result = break_lines_impl(input, &[4], &re, model());
        assert_eq!(
            strip_tags(&re, &result),
            input,
            "Plain text must be preserved"
        );
    }

    #[test]
    fn test_existing_newline_tag_preserved() {
        let re = tag_regex();
        let input = r"あいう\nえおか";
        let result = break_lines_impl(input, &[20], &re, model());
        assert!(
            result.contains(BREAK),
            "Existing \\n should be preserved: {}",
            result
        );
        assert_eq!(strip_tags(&re, &result), "あいうえおか");
    }

    #[test]
    fn test_last_width_repeats_for_subsequent_lines() {
        let re = tag_regex();
        let input = "あいうえおかきくけこさしすせそたちつてと";
        // Repeating the final width explicitly must not change the outcome.
        let result_len2 = break_lines_impl(input, &[4, 4], &re, model());
        let result_len3 = break_lines_impl(input, &[4, 4, 4], &re, model());
        assert_eq!(
            result_len2, result_len3,
            "[4,4] と [4,4,4] は等価であること(最後の値繰り返し):\n{}\nvs\n{}",
            result_len2, result_len3
        );
    }

    #[test]
    fn test_wider_last_width_produces_fewer_breaks() {
        let re = tag_regex();
        let input = "あいうえおかきくけこさしすせそたちつてと";
        let result_narrow = break_lines_impl(input, &[4, 4], &re, model());
        let result_wide = break_lines_impl(input, &[4, 20], &re, model());

        // A wider limit for later lines can only reduce the number of markers.
        let breaks_narrow = result_narrow.matches(BREAK).count();
        let breaks_wide = result_wide.matches(BREAK).count();
        assert!(
            breaks_narrow >= breaks_wide,
            "最後の値が広い方が改行数 ≤ であること: [4,4]={} [4,20]={}",
            breaks_narrow, breaks_wide
        );

        assert_eq!(strip_tags(&re, &result_narrow), input);
        assert_eq!(strip_tags(&re, &result_wide), input);
    }

    // ---- tokenize_plain_chars ---------------------------------------------

    #[test]
    fn test_tokenize_plain_only() {
        let re = tag_regex();
        let tokens = tokenize_plain_chars("abc", &re);
        assert_eq!(tokens.leading, "");
        assert_eq!(tokens.chars.len(), 3);
        let first = &tokens.chars[0];
        assert_eq!(first.ch, 'a');
        assert_eq!(first.trailing, "");
    }

    #[test]
    fn test_tokenize_with_tags() {
        let re = tag_regex();
        let tokens = tokenize_plain_chars(r"\hこ\_w[50]ん", &re);
        assert_eq!(tokens.leading, r"\h");
        assert_eq!(tokens.chars.len(), 2);
        let (first, second) = (&tokens.chars[0], &tokens.chars[1]);
        assert_eq!(first.ch, 'こ');
        assert_eq!(first.trailing, r"\_w[50]");
        assert_eq!(second.ch, 'ん');
        assert_eq!(second.trailing, "");
    }

    #[test]
    fn test_tokenize_tags_only() {
        let re = tag_regex();
        let tokens = tokenize_plain_chars(r"\h\s[0]", &re);
        assert_eq!(tokens.chars.len(), 0);
    }
}