batch_mode_tts/chunk.rs
1crate::ix!();
2
3impl BatchModeTtsJob {
4 /// Character‑safe splitter (≤ `max_len` chars) with a soft preference for newline
5 /// boundaries. Long single lines are split at exact character boundaries.
6 pub fn chunk_text(text: &str, max_len: usize) -> Vec<String> {
7 let limit = max_len.min(4096);
8 if limit == 0 {
9 warn!("chunk_text called with max_len=0; returning empty chunk list");
10 return Vec::new();
11 }
12
13 let mut res = Vec::<String>::new();
14 let mut buf = String::new();
15 let mut buf_chars: usize = 0;
16
17 // Helper: flush current buffer if non‑empty
18 let flush = |res: &mut Vec<String>, buf: &mut String, buf_chars: &mut usize| {
19 if !buf.is_empty() {
20 trace!("Flushing chunk of {} chars", *buf_chars);
21 res.push(std::mem::take(buf));
22 *buf_chars = 0;
23 }
24 };
25
26 for line in text.lines() {
27 let mut remaining = line;
28 loop {
29 let rem_chars = remaining.chars().count();
30 // Will we need to insert a newline before appending `remaining`?
31 let sep = if buf.is_empty() { 0 } else { 1 };
32
33 if buf_chars + sep + rem_chars <= limit {
34 if sep == 1 {
35 buf.push('\n');
36 buf_chars += 1;
37 }
38 buf.push_str(remaining);
39 buf_chars += rem_chars;
40 break; // Done with this line
41 }
42
43 // Not enough room to add whole `remaining`
44 // 1) If the buffer has something, flush it first so we start fresh
45 if buf_chars > 0 {
46 flush(&mut res, &mut buf, &mut buf_chars);
47 // Continue the loop — we will re‑evaluate with an empty buffer
48 continue;
49 }
50
51 // 2) Buffer is empty but `remaining` itself is too large;
52 // split `remaining` into `limit`‑sized char chunk
53 let take_n = limit; // fill exactly to limit
54 // Take first `take_n` chars of `remaining`
55 let taken: String = remaining.chars().take(take_n).collect();
56 let taken_count = taken.chars().count();
57 debug!(
58 "Splitting an over‑long line into a full chunk of {} chars (limit {})",
59 taken_count, limit
60 );
61 res.push(taken);
62
63 // Advance `remaining`
64 let mut it = remaining.chars();
65 for _ in 0..take_n {
66 it.next();
67 }
68 let rest: String = it.collect();
69
70 if rest.is_empty() {
71 break; // finished this line
72 } else {
73 remaining = &rest; // loop continues on rest
74 // We must store `rest` somewhere stable; allocate new String and use it
75 // To avoid lifetime issue, rebind `remaining` to a new owned string and iterate again
76 let owned = rest; // already owned
77 // Reassign `remaining` to a &'_ str from owned for next iteration
78 // But we cannot keep `owned` alive across iterations without storage.
79 // Workaround: move ownership into `remaining_owned` and shadow `remaining`.
80 // Implement via block scope below.
81 let mut cursor = owned;
82 loop {
83 // inner splitting loop replicates the top logic with `cursor`
84 // 1) Determine chars left
85 let c_rem = cursor.chars().count();
86 let sep2 = if buf.is_empty() { 0 } else { 1 };
87 if buf_chars + sep2 + c_rem <= limit {
88 if sep2 == 1 {
89 buf.push('\n');
90 buf_chars += 1;
91 }
92 buf.push_str(&cursor);
93 buf_chars += c_rem;
94 break;
95 }
96 if buf_chars > 0 {
97 flush(&mut res, &mut buf, &mut buf_chars);
98 continue;
99 }
100 // Take another full chunk from cursor
101 let take_full: String = cursor.chars().take(limit).collect();
102 let taken_full_count = take_full.chars().count();
103 debug!(
104 "Splitting continuation into full chunk of {} chars (limit {})",
105 taken_full_count, limit
106 );
107 res.push(take_full);
108 let mut it2 = cursor.chars();
109 for _ in 0..limit {
110 it2.next();
111 }
112 let tmp: String = it2.collect();
113 if tmp.is_empty() {
114 break;
115 }
116 cursor = tmp;
117 // continue inner loop
118 }
119 break; // done with original `line`
120 }
121 }
122 }
123
124 if !buf.is_empty() {
125 trace!("Flushing final chunk of {} chars", buf_chars);
126 res.push(buf);
127 }
128
129 res
130 }
131}