ass_core/analysis/events/unicode_wrap.rs
1//! Unicode-aware soft-wrap opportunities (UAX #14).
2//!
3//! Feature-gated behind `unicode-wrap`. Provides the analysis-side equivalent
4//! of libass 0.17.4's `ASS_FEATURE_WRAP_UNICODE`: it identifies valid line
5//! break positions in plain text, including breaks between CJK/Kana/Hangul
6//! characters that are not separated by spaces.
7//!
8//! The input is expected to be *plain* text — that is, with override tag blocks
9//! removed and explicit `\N`/`\n` breaks already resolved (see
10//! [`TextWithLineBreaks`](super::line_breaks::TextWithLineBreaks)). Each
11//! returned offset is the byte index of the character that would start the new
12//! line if a wrap is taken there.
13//!
14//! # Examples
15//!
16//! ```rust
17//! use ass_core::analysis::events::unicode_wrap::soft_wrap_offsets;
18//!
19//! // A space provides a soft-wrap opportunity before the next word.
20//! let offsets = soft_wrap_offsets("Hello world");
21//! assert!(offsets.contains(&6));
22//! ```
23
24use alloc::vec::Vec;
25
/// One candidate line-wrap point inside a piece of plain text.
///
/// Values are produced by [`wrap_opportunities`]; each one pairs a byte
/// position with a flag telling whether the break is forced or optional.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct WrapOpportunity {
    /// Byte index of the character that would begin the new line if the text
    /// is wrapped at this point.
    pub offset: usize,
    /// `true` when the break is mandatory (e.g. a hard line feed or the end
    /// of the text), `false` when it is merely permitted.
    pub mandatory: bool,
}
34
35/// Compute all Unicode line-break opportunities for `text` per UAX #14.
36///
37/// Includes the mandatory break at the end of the text. Use
38/// [`soft_wrap_offsets`] for only the optional intra-text break points.
39#[must_use]
40pub fn wrap_opportunities(text: &str) -> Vec<WrapOpportunity> {
41 unicode_linebreak::linebreaks(text)
42 .map(|(offset, opportunity)| WrapOpportunity {
43 offset,
44 mandatory: matches!(opportunity, unicode_linebreak::BreakOpportunity::Mandatory),
45 })
46 .collect()
47}
48
49/// Return only the byte offsets at which an optional (soft) wrap may occur.
50///
51/// Mandatory breaks — including the implicit one at the end of the string — are
52/// excluded, leaving the positions a renderer may choose to wrap at when a line
53/// exceeds the available width.
54#[must_use]
55pub fn soft_wrap_offsets(text: &str) -> Vec<usize> {
56 wrap_opportunities(text)
57 .into_iter()
58 .filter(|opportunity| !opportunity.mandatory)
59 .map(|opportunity| opportunity.offset)
60 .collect()
61}
62
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn breaks_after_space() {
        // The only soft wrap sits right before 'w', the first byte of "world".
        assert_eq!(soft_wrap_offsets("Hello world"), alloc::vec![6]);
    }

    #[test]
    fn multiple_words() {
        // One soft wrap before each word that follows a space.
        assert_eq!(soft_wrap_offsets("one two three"), alloc::vec![4, 8]);
    }

    #[test]
    fn breaks_between_cjk_without_spaces() {
        // Japanese has no inter-word spaces, yet wrapping between characters
        // is still permitted.
        let subtitle = "日本語字幕";
        let breaks = soft_wrap_offsets(subtitle);
        // There has to be at least one opportunity even without any spaces.
        assert!(!breaks.is_empty());
        // No offset may land in the middle of a multi-byte UTF-8 sequence.
        assert!(breaks
            .iter()
            .all(|&position| subtitle.is_char_boundary(position)));
    }

    #[test]
    fn no_soft_break_in_single_token() {
        // A lone word without separators has no interior soft-wrap point.
        let breaks = soft_wrap_offsets("indivisible");
        assert!(breaks.is_empty());
    }

    #[test]
    fn final_break_is_mandatory() {
        let text = "hi there";
        let opportunities = wrap_opportunities(text);
        let tail = opportunities.last().expect("at least one opportunity");
        // The last reported opportunity is the end of the text, and it is forced.
        assert_eq!(tail.offset, text.len());
        assert!(tail.mandatory);
    }
}
104}