/// Constraints that `LineBreaker::segment` honours when wrapping subtitle text.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum SegmentationRule {
/// Upper bound on the length of each line; `segment` falls back to 42 when no such rule is given.
MaxCharsPerLine(u32),
/// Cap on the number of lines returned; lines beyond the cap are dropped.
MaxLines(u32),
/// Start a new line before a word listed in `CONJUNCTIONS` once the current line is longer
/// than a third of the line-length limit.
BreakAtNaturalPoints,
/// Fold a trailing line made of a single word of at most four characters into the line before it.
NoOrphanWords,
}
/// Lowercase words that `is_break_word` recognises as candidate points for starting a new line.
/// Besides true conjunctions the list also holds relative pronouns and adverbs
/// ("which", "who", "when", ...).
static CONJUNCTIONS: &[&str] = &[
"and", "or", "but", "because", "that", "which", "who", "when", "where", "while",
];
/// Reports whether `word` is one of the entries in `CONJUNCTIONS`, ignoring ASCII case.
///
/// The comparison is against the whole word, so attached punctuation ("and,") prevents a match.
fn is_break_word(word: &str) -> bool {
    // Every table entry is lowercase ASCII, so a case-insensitive comparison is equivalent to
    // lowercasing `word` first, without building a temporary `String`.
    CONJUNCTIONS
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(word))
}
/// One subtitle cue, stored as its individual display lines.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)]
pub struct SubtitleBlock {
    /// Display lines in top-to-bottom order.
    pub lines: Vec<String>,
}

impl SubtitleBlock {
    /// Wraps an already-split list of lines in a block.
    #[allow(dead_code)]
    pub fn new(lines: Vec<String>) -> Self {
        SubtitleBlock { lines }
    }

    /// Total number of characters (Unicode scalar values) across all lines.
    ///
    /// Line separators are not counted.
    #[allow(dead_code)]
    pub fn char_count(&self) -> usize {
        self.lines.iter().flat_map(|line| line.chars()).count()
    }

    /// Number of lines held by the block.
    #[allow(dead_code)]
    pub fn line_count(&self) -> usize {
        self.lines.len()
    }

    /// Flattens the block into a single string, putting one space between consecutive lines.
    #[allow(dead_code)]
    pub fn to_plain_text(&self) -> String {
        const LINE_SEPARATOR: &str = " ";
        self.lines.join(LINE_SEPARATOR)
    }
}
/// Stateless word-wrapper that turns running text into subtitle lines.
pub struct LineBreaker;

impl LineBreaker {
    /// Line-length limit used when `rules` contains no `MaxCharsPerLine` entry.
    const DEFAULT_MAX_CHARS: usize = 42;
    /// Longest trailing single word, in characters, that `NoOrphanWords` still merges upward.
    const ORPHAN_MAX_CHARS: usize = 4;

    /// Greedily wraps `text` into lines according to `rules`.
    ///
    /// `text` is split on whitespace and the words are re-joined with single spaces. All widths
    /// are measured in characters (Unicode scalar values), not UTF-8 bytes, so non-ASCII text is
    /// wrapped at the same visual width as ASCII text.
    ///
    /// Rule handling:
    /// * `MaxCharsPerLine(n)` - a word that would push the current line past `n` characters
    ///   starts a new line. The first such rule wins; without one the limit is 42. A single
    ///   word longer than the limit is kept whole on its own line.
    /// * `BreakAtNaturalPoints` - a new line is started before a break word (see
    ///   `is_break_word`) once the current line is longer than a third of the limit.
    /// * `MaxLines(n)` - the result is truncated to the first `n` lines; the remaining text is
    ///   dropped. The first such rule wins.
    /// * `NoOrphanWords` - applied after truncation: if the last line is a single word of at
    ///   most four characters it is appended to the previous line, which may then exceed the
    ///   line-length limit.
    ///
    /// Returns an empty vector when `text` contains no words.
    #[allow(dead_code)]
    pub fn segment(text: &str, rules: &[SegmentationRule]) -> Vec<String> {
        let max_chars = rules
            .iter()
            .find_map(|rule| {
                if let SegmentationRule::MaxCharsPerLine(n) = rule {
                    Some(*n as usize)
                } else {
                    None
                }
            })
            .unwrap_or(Self::DEFAULT_MAX_CHARS);
        let max_lines = rules.iter().find_map(|rule| {
            if let SegmentationRule::MaxLines(n) = rule {
                Some(*n as usize)
            } else {
                None
            }
        });
        let natural_breaks = rules.contains(&SegmentationRule::BreakAtNaturalPoints);
        let no_orphans = rules.contains(&SegmentationRule::NoOrphanWords);

        let mut lines: Vec<String> = Vec::new();
        let mut current = String::new();
        // Width of `current` in characters. Tracked separately because `String::len` reports
        // UTF-8 bytes, which over-counts every non-ASCII character.
        let mut current_chars = 0_usize;

        for word in text.split_whitespace() {
            let word_chars = word.chars().count();

            // `current_chars > max_chars / 3` also guarantees the line is non-empty, so a break
            // word at the very start of a line never produces an empty line.
            if natural_breaks && current_chars > max_chars / 3 && is_break_word(word) {
                lines.push(std::mem::take(&mut current));
                current_chars = 0;
            }

            if current.is_empty() {
                // First word of a line is always accepted, even when it alone exceeds the limit.
                current.push_str(word);
                current_chars = word_chars;
            } else if current_chars + 1 + word_chars > max_chars {
                lines.push(std::mem::replace(&mut current, word.to_string()));
                current_chars = word_chars;
            } else {
                current.push(' ');
                current.push_str(word);
                current_chars += 1 + word_chars;
            }
        }
        if !current.is_empty() {
            lines.push(current);
        }

        if let Some(limit) = max_lines {
            lines.truncate(limit);
        }

        if no_orphans && lines.len() > 1 {
            let last_is_orphan = matches!(
                lines.last(),
                Some(last) if last.split_whitespace().count() == 1
                    && last.chars().count() <= Self::ORPHAN_MAX_CHARS
            );
            if last_is_orphan {
                if let Some(orphan) = lines.pop() {
                    if let Some(prev) = lines.last_mut() {
                        prev.push(' ');
                        prev.push_str(&orphan);
                    }
                }
            }
        }
        lines
    }
}
/// Re-wraps existing subtitle blocks to a new line-length / line-count budget.
pub struct BlockOptimizer;

impl BlockOptimizer {
    /// Reflows every block in `blocks` independently.
    ///
    /// Each block is flattened with `SubtitleBlock::to_plain_text` and passed to
    /// `LineBreaker::segment` with the rules `MaxCharsPerLine(max_chars)`,
    /// `MaxLines(max_lines)` and `BreakAtNaturalPoints`, in that order. The output has one
    /// block per input block, in the same order.
    #[allow(dead_code)]
    pub fn reflow(blocks: &[SubtitleBlock], max_chars: u32, max_lines: u32) -> Vec<SubtitleBlock> {
        let rules = [
            SegmentationRule::MaxCharsPerLine(max_chars),
            SegmentationRule::MaxLines(max_lines),
            SegmentationRule::BreakAtNaturalPoints,
        ];

        let mut reflowed = Vec::with_capacity(blocks.len());
        for block in blocks {
            let flattened = block.to_plain_text();
            let wrapped = LineBreaker::segment(&flattened, &rules);
            reflowed.push(SubtitleBlock::new(wrapped));
        }
        reflowed
    }
}
/// Heuristic readability metric for a subtitle block.
pub struct ReadabilityScore;

impl ReadabilityScore {
    /// Scores `subtitle` as the mean of two components.
    ///
    /// * Word-length score: 1.0 when the average word length is 4.5 characters, decreasing
    ///   linearly to 0.0 once the average is 10 or more characters away from that.
    /// * Balance score: 1.0 minus the ratio of the population standard deviation of the line
    ///   lengths to their mean, with the ratio capped at 1.0.
    ///
    /// Returns 0.0 for a block that has no lines or contains no words.
    #[allow(dead_code)]
    pub fn compute(subtitle: &SubtitleBlock) -> f32 {
        // A block without lines flattens to an empty string, so the word check below covers
        // both the "no lines" and the "only whitespace" cases.
        let flattened = subtitle.to_plain_text();
        let word_total = flattened.split_whitespace().count();
        if word_total == 0 {
            return 0.0;
        }

        let char_sum: f32 = flattened
            .split_whitespace()
            .map(|word| word.chars().count() as f32)
            .sum();
        let mean_word_len = char_sum / word_total as f32;
        let word_len_score = 1.0 - ((mean_word_len - 4.5).abs() / 10.0).min(1.0);

        // Re-creatable iterator over the per-line widths; avoids an intermediate Vec.
        let widths = || subtitle.lines.iter().map(|line| line.chars().count() as f32);
        let line_total = subtitle.lines.len() as f32;
        let mean_width = widths().sum::<f32>() / line_total;
        let variance = widths()
            .map(|width| (width - mean_width).powi(2))
            .sum::<f32>()
            / line_total;
        let spread = variance.sqrt();

        let balance_score = if mean_width > 0.0 {
            1.0 - (spread / mean_width).min(1.0)
        } else {
            0.0
        };

        (word_len_score + balance_score) / 2.0
    }
}
/// Unit tests for line segmentation, block helpers, reflow and readability scoring.
#[cfg(test)]
mod tests {
    use super::*;

    /// Text that fits within the limit comes back as a single unchanged line.
    #[test]
    fn test_line_breaker_simple() {
        let lines = LineBreaker::segment("Hello world", &[SegmentationRule::MaxCharsPerLine(42)]);
        assert_eq!(lines, vec!["Hello world"]);
    }

    /// Text longer than the limit is split and no line exceeds the limit.
    #[test]
    fn test_line_breaker_wraps_at_max() {
        let text = "The quick brown fox jumps over the lazy dog";
        let lines = LineBreaker::segment(text, &[SegmentationRule::MaxCharsPerLine(20)]);
        assert!(lines.len() > 1);
        for line in &lines {
            assert!(line.chars().count() <= 20, "Line too long: {line}");
        }
    }

    /// `MaxLines` keeps only the leading lines of the wrapped text.
    #[test]
    fn test_line_breaker_max_lines_truncates() {
        let text = "one two three four five six seven eight nine ten";
        let lines = LineBreaker::segment(
            text,
            &[
                SegmentationRule::MaxCharsPerLine(10),
                SegmentationRule::MaxLines(2),
            ],
        );
        assert_eq!(lines, vec!["one two", "three four"]);
    }

    /// A conjunction starts a new line once the current line is longer than a third of the limit.
    #[test]
    fn test_line_breaker_natural_breaks_at_conjunction() {
        let text = "I went to the store and I bought some milk";
        // With a limit of 42 the break threshold is 14; "I went to the store" is 19 characters
        // long, so the break before "and" is actually taken. The whole sentence is exactly 42
        // characters, so without the natural break it would have stayed on one line.
        let lines = LineBreaker::segment(
            text,
            &[
                SegmentationRule::MaxCharsPerLine(42),
                SegmentationRule::BreakAtNaturalPoints,
            ],
        );
        assert_eq!(lines, vec!["I went to the store", "and I bought some milk"]);
    }

    /// Empty input produces no lines.
    #[test]
    fn test_line_breaker_empty_text() {
        let lines = LineBreaker::segment("", &[SegmentationRule::MaxCharsPerLine(42)]);
        assert!(lines.is_empty());
    }

    #[test]
    fn test_subtitle_block_char_count() {
        let block = SubtitleBlock::new(vec!["Hello".to_string(), "World".to_string()]);
        assert_eq!(block.char_count(), 10);
    }

    #[test]
    fn test_subtitle_block_line_count() {
        let block = SubtitleBlock::new(vec!["A".to_string(), "B".to_string(), "C".to_string()]);
        assert_eq!(block.line_count(), 3);
    }

    #[test]
    fn test_subtitle_block_to_plain_text() {
        let block = SubtitleBlock::new(vec!["Hello".to_string(), "World".to_string()]);
        assert_eq!(block.to_plain_text(), "Hello World");
    }

    /// Reflowed blocks respect the requested line-length limit.
    #[test]
    fn test_block_optimizer_reflow() {
        let blocks = vec![SubtitleBlock::new(vec![
            "This is a very long subtitle line that should be reflowed properly".to_string(),
        ])];
        let reflowed = BlockOptimizer::reflow(&blocks, 30, 3);
        assert!(!reflowed.is_empty());
        for block in &reflowed {
            for line in &block.lines {
                assert!(line.chars().count() <= 30, "Reflowed line too long: {line}");
            }
        }
    }

    #[test]
    fn test_readability_score_range() {
        let block = SubtitleBlock::new(vec![
            "Hello world this is a test".to_string(),
            "of the readability scorer".to_string(),
        ]);
        let score = ReadabilityScore::compute(&block);
        assert!((0.0..=1.0).contains(&score), "Score out of range: {score}");
    }

    #[test]
    fn test_readability_score_empty_block() {
        let block = SubtitleBlock::new(vec![]);
        assert_eq!(ReadabilityScore::compute(&block), 0.0);
    }

    /// A trailing single word of at most four characters is merged into the previous line.
    #[test]
    fn test_no_orphan_rule() {
        // Plain wrapping at 18 yields ["one two three four", "five"]; "five" is an orphan and
        // must be pulled up, even though the merged line then exceeds the limit.
        let lines = LineBreaker::segment(
            "one two three four five",
            &[
                SegmentationRule::MaxCharsPerLine(18),
                SegmentationRule::NoOrphanWords,
            ],
        );
        assert_eq!(lines, vec!["one two three four five"]);
    }

    /// A trailing single word longer than four characters is left on its own line.
    #[test]
    fn test_no_orphan_rule_keeps_long_last_word() {
        let lines = LineBreaker::segment(
            "one two three four seven",
            &[
                SegmentationRule::MaxCharsPerLine(18),
                SegmentationRule::NoOrphanWords,
            ],
        );
        assert_eq!(lines, vec!["one two three four", "seven"]);
    }
}