use std::collections::HashMap;

use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};

use crate::{
    BookforgeError, Result,
    config::SegmentationConfig,
    ir::{Block, BlockId, BlockKind, Book, Section, SectionId},
};
9
/// Version of the cache-key layout.
///
/// It is the first value hashed by [`compute_cache_namespace`], so bumping it
/// changes every namespace that function produces.
pub const CACHE_KEY_SCHEMA_VERSION: u32 = 1;

/// Version of the segment schema; hashed into the namespace by
/// [`compute_cache_namespace`].
pub const SEGMENT_SCHEMA_VERSION: u32 = 1;

/// Version of the inline-marker schema; hashed into the namespace by
/// [`compute_cache_namespace`].
pub const INLINE_MARKER_SCHEMA_VERSION: u32 = 1;
16
17pub fn compute_cache_namespace(
21 max_segment_tokens: usize,
22 context_tokens: usize,
23 profile: &str,
24 batch_enabled: bool,
25 prompt_version: &str,
26) -> String {
27 let mut hasher = Sha256::new();
28 hasher.update(CACHE_KEY_SCHEMA_VERSION.to_le_bytes());
29 hasher.update(SEGMENT_SCHEMA_VERSION.to_le_bytes());
30 hasher.update(INLINE_MARKER_SCHEMA_VERSION.to_le_bytes());
31 hasher.update((max_segment_tokens as u64).to_le_bytes());
32 hasher.update((context_tokens as u64).to_le_bytes());
33 hasher.update(profile.as_bytes());
34 hasher.update([batch_enabled as u8]);
35 hasher.update(prompt_version.as_bytes());
36 let digest = hasher.finalize();
37 let mut hex = String::with_capacity(digest.len() * 2);
38 for byte in digest {
39 hex.push_str(&format!("{byte:02x}"));
40 }
41 hex
42}
43
44#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
45pub struct BlockTranslation {
46 pub block_id: BlockId,
47 pub text: String,
48}
49
50#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
51pub struct SegmentId(pub String);
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct Segment {
55 pub id: SegmentId,
56 pub section_id: SectionId,
57 pub ordinal: usize,
58 pub block_ids: Vec<BlockId>,
59 pub source: SegmentSource,
60 pub context: SegmentContext,
61 pub metadata: SegmentMetadata,
62 pub constraints: SegmentConstraints,
63 pub checksum: String,
64}
65
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct SegmentSource {
68 pub text: String,
69 pub blocks: Vec<SegmentBlock>,
70 pub token_estimate: usize,
71}
72
73#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
74pub struct SegmentBlock {
75 pub block_id: BlockId,
76 pub kind: String,
77 pub text: String,
78 pub text_runs: Vec<SegmentTextRun>,
79 pub protected_spans: Vec<String>,
80}
81
82#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
83pub struct SegmentTextRun {
84 pub id: String,
85 pub text: String,
86}
87
88#[derive(Debug, Clone, Default, Serialize, Deserialize)]
89pub struct SegmentContext {
90 pub before: Option<String>,
91 pub after: Option<String>,
92}
93
94#[derive(Debug, Clone, Default, Serialize, Deserialize)]
95pub struct SegmentMetadata {
96 pub book_title: Option<String>,
97 pub section_title: Option<String>,
98 pub section_index: usize,
99 pub segment_index_in_section: usize,
100 pub total_segments_in_section: usize,
101}
102
103#[derive(Debug, Clone, Default, Serialize, Deserialize)]
104pub struct SegmentConstraints {
105 pub preserve_markers: Vec<String>,
106 pub preserve_spans: Vec<String>,
107 pub max_tokens: usize,
108}
109
110pub fn block_kind_label(kind: BlockKind) -> &'static str {
111 match kind {
112 BlockKind::Heading(_) => "heading",
113 BlockKind::Paragraph => "paragraph",
114 BlockKind::ListItem => "list_item",
115 BlockKind::Quote => "quote",
116 BlockKind::TableCell => "table_cell",
117 BlockKind::TableRow => "table_row",
118 BlockKind::Footnote => "footnote",
119 BlockKind::Caption => "caption",
120 BlockKind::Code => "code",
121 BlockKind::Unknown => "unknown",
122 }
123}
124
125#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
126pub enum SegmentStatus {
127 Queued,
128 Succeeded,
129 Failed,
130 RetryPending,
131 NeedsReview,
132 SkippedCached,
133}
134
135pub fn build_segments(book: &Book, config: &SegmentationConfig) -> Result<Vec<Segment>> {
136 if config.max_segment_tokens == 0 {
137 return Err(BookforgeError::InvalidInput(
138 "max_segment_tokens must be greater than zero".to_string(),
139 ));
140 }
141
142 let mut segments = Vec::new();
143
144 for (section_index, section) in book.sections.iter().enumerate() {
145 let section_blocks = section
146 .block_ids
147 .iter()
148 .map(|block_id| {
149 book.blocks
150 .iter()
151 .find(|block| &block.id == block_id)
152 .ok_or_else(|| {
153 BookforgeError::InvalidInput(format!(
154 "section '{}' references missing block '{}'",
155 section.id.0, block_id.0
156 ))
157 })
158 })
159 .collect::<Result<Vec<_>>>()?;
160
161 let mut current = Vec::<&Block>::new();
162 let mut current_tokens = 0usize;
163 let section_segments_start = segments.len();
164
165 for block in section_blocks {
166 let block_tokens = block.token_estimate.max(1);
167 let should_flush = !current.is_empty()
168 && current_tokens + block_tokens > config.max_segment_tokens
169 && !should_keep_with_previous(¤t, block);
170
171 if should_flush {
172 push_segment(
173 &mut segments,
174 book,
175 section,
176 section_index,
177 ¤t,
178 config,
179 );
180 current.clear();
181 current_tokens = 0;
182 }
183
184 current.push(block);
185 current_tokens += block_tokens;
186 }
187
188 if !current.is_empty() {
189 push_segment(
190 &mut segments,
191 book,
192 section,
193 section_index,
194 ¤t,
195 config,
196 );
197 }
198
199 let total_in_section = segments.len() - section_segments_start;
200 for (offset, segment) in segments[section_segments_start..].iter_mut().enumerate() {
201 segment.metadata.segment_index_in_section = offset;
202 segment.metadata.total_segments_in_section = total_in_section;
203 }
204 }
205
206 apply_context(&mut segments, config.context_tokens);
207
208 Ok(segments)
209}
210
211fn push_segment(
212 segments: &mut Vec<Segment>,
213 book: &Book,
214 section: &Section,
215 section_index: usize,
216 blocks: &[&Block],
217 config: &SegmentationConfig,
218) {
219 let segment_blocks = blocks
220 .iter()
221 .map(|block| {
222 let mut spans = block
223 .protected_spans
224 .iter()
225 .map(|span| span.text.clone())
226 .collect::<Vec<_>>();
227 spans.sort();
228 spans.dedup();
229 SegmentBlock {
230 block_id: block.id.clone(),
231 kind: block_kind_label(block.kind).to_string(),
232 text: block_text(block),
233 text_runs: block
234 .text_runs
235 .iter()
236 .map(|run| SegmentTextRun {
237 id: run.id.clone(),
238 text: run.text.clone(),
239 })
240 .collect(),
241 protected_spans: spans,
242 }
243 })
244 .collect::<Vec<_>>();
245 let source_text = segment_blocks
246 .iter()
247 .map(|block| block.text.as_str())
248 .collect::<Vec<_>>()
249 .join("\n\n");
250 let checksum = stable_hash(&source_text);
251 let ordinal = segments.len();
252 let first_block = blocks
253 .first()
254 .map(|block| block.id.0.as_str())
255 .unwrap_or("empty");
256 let id = SegmentId(format!(
257 "seg_{}_{}_{}",
258 section.id.0,
259 first_block,
260 &checksum[..12]
261 ));
262
263 let mut preserve_spans = blocks
264 .iter()
265 .flat_map(|block| block.protected_spans.iter().map(|span| span.text.clone()))
266 .collect::<Vec<_>>();
267 preserve_spans.sort();
268 preserve_spans.dedup();
269
270 let mut preserve_markers = blocks
271 .iter()
272 .flat_map(|block| block.inline_marks.iter().map(|mark| mark.id.clone()))
273 .collect::<Vec<_>>();
274 preserve_markers.sort();
275 preserve_markers.dedup();
276
277 let token_estimate = blocks
278 .iter()
279 .map(|block| block.token_estimate.max(1))
280 .sum::<usize>();
281
282 let metadata = SegmentMetadata {
283 book_title: book.metadata.title.clone(),
284 section_title: section.title.clone(),
285 section_index,
286 segment_index_in_section: 0,
287 total_segments_in_section: 0,
288 };
289
290 segments.push(Segment {
291 id,
292 section_id: section.id.clone(),
293 ordinal,
294 block_ids: blocks.iter().map(|block| block.id.clone()).collect(),
295 source: SegmentSource {
296 text: source_text,
297 blocks: segment_blocks,
298 token_estimate,
299 },
300 context: SegmentContext::default(),
301 metadata,
302 constraints: SegmentConstraints {
303 preserve_markers,
304 preserve_spans,
305 max_tokens: config.max_segment_tokens,
306 },
307 checksum,
308 });
309}
310
311fn apply_context(segments: &mut [Segment], context_tokens: usize) {
312 if context_tokens == 0 {
313 return;
314 }
315
316 let sources = segments
317 .iter()
318 .map(|segment| segment.source.text.clone())
319 .collect::<Vec<_>>();
320
321 for (index, segment) in segments.iter_mut().enumerate() {
322 segment.context.before = index
323 .checked_sub(1)
324 .and_then(|previous| sources.get(previous))
325 .map(|text| tail_words(text, context_tokens));
326 segment.context.after = sources
327 .get(index + 1)
328 .map(|text| head_words(text, context_tokens));
329 }
330}
331
332fn should_keep_with_previous(current: &[&Block], next: &Block) -> bool {
333 let Some(previous) = current.last() else {
334 return false;
335 };
336
337 matches!(previous.kind, crate::ir::BlockKind::Heading(_)) && next.token_estimate <= 80
338}
339
340fn block_text(block: &Block) -> String {
341 block
342 .text_runs
343 .iter()
344 .map(|run| run.text.as_str())
345 .collect::<Vec<_>>()
346 .join("")
347}
348
349fn stable_hash(value: &str) -> String {
350 let digest = Sha256::digest(value.as_bytes());
351 let mut output = String::with_capacity(digest.len() * 2);
352 for byte in digest {
353 use std::fmt::Write as _;
354 write!(&mut output, "{byte:02x}").expect("writing to string should not fail");
355 }
356 output
357}
358
/// Returns the first `max_words` whitespace-separated words of `text`, joined
/// by single spaces.
///
/// Runs of whitespace (including newlines) collapse to one space. The result
/// is empty when `text` has no words or `max_words` is zero.
fn head_words(text: &str, max_words: usize) -> String {
    let mut excerpt = String::new();
    for (position, word) in text.split_whitespace().take(max_words).enumerate() {
        if position > 0 {
            excerpt.push(' ');
        }
        excerpt.push_str(word);
    }
    excerpt
}
365
/// Returns the last `max_words` whitespace-separated words of `text`, in
/// their original order, joined by single spaces.
///
/// Runs of whitespace (including newlines) collapse to one space. The result
/// is empty when `text` has no words or `max_words` is zero.
fn tail_words(text: &str, max_words: usize) -> String {
    // Walk from the end so only the words that are kept get collected.
    let mut kept = text
        .split_whitespace()
        .rev()
        .take(max_words)
        .collect::<Vec<_>>();
    kept.reverse();
    kept.join(" ")
}
371
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{
        BlockKind, BookFormat, BookId, DomPath, Metadata, Resource, Section, SpineItem, TextRun,
    };

    /// Two runs over the same book give the same ids/checksums, and the
    /// segment boundaries respect both the token budget and section borders.
    #[test]
    fn builds_stable_segments_without_crossing_sections() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let first = build_segments(&book, &config).expect("segments should build");
        let second = build_segments(&book, &config).expect("segments should be stable");

        assert_eq!(first.len(), 3);
        assert_eq!(first[0].id, second[0].id);
        assert_eq!(first[1].checksum, second[1].checksum);
        assert_eq!(first[0].section_id.0, "sec_000000");
        assert_eq!(first[1].section_id.0, "sec_000000");
        assert_eq!(first[2].section_id.0, "sec_000001");
        assert_eq!(first[2].block_ids, vec![BlockId("b_000003".to_string())]);
    }

    /// Neighbour context is taken from the adjacent segments' source text and
    /// is absent at both ends of the list.
    #[test]
    fn attaches_neighbour_context() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let segments = build_segments(&book, &config).expect("segments should build");

        assert!(segments[0].context.before.is_none());
        assert_eq!(
            segments[0].context.after.as_deref(),
            Some("this paragraph forces a")
        );
        assert_eq!(
            segments[1].context.before.as_deref(),
            Some("One short lead")
        );
        assert_eq!(
            segments[1].context.after.as_deref(),
            Some("new section must stay")
        );
        assert_eq!(
            segments[2].context.before.as_deref(),
            Some("forces a second segment")
        );
        assert!(segments[2].context.after.is_none());
    }

    /// A zero token budget is rejected instead of producing segments.
    #[test]
    fn rejects_zero_token_limit() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 0,
            context_tokens: 0,
        };

        assert!(build_segments(&book, &config).is_err());
    }

    /// Each varied input changes the namespace; identical inputs do not.
    #[test]
    fn cache_namespace_changes_when_segmentation_settings_change() {
        let a = compute_cache_namespace(1200, 160, "Balanced", false, "v1");
        let b = compute_cache_namespace(1201, 160, "Balanced", false, "v1");
        let c = compute_cache_namespace(1200, 160, "Balanced", true, "v1");
        let d = compute_cache_namespace(1200, 160, "Balanced", false, "batch_v1");
        let e = compute_cache_namespace(1200, 160, "Balanced", false, "v1");

        assert_ne!(a, b, "max_segment_tokens must affect namespace");
        assert_ne!(a, c, "batch_enabled must affect namespace");
        assert_ne!(a, d, "prompt_version must affect namespace");
        assert_eq!(a, e, "namespace is deterministic for identical inputs");
    }

    /// Fixture: section `sec_000000` holds a heading and two paragraphs,
    /// section `sec_000001` holds a single paragraph.
    fn book_with_two_sections() -> Book {
        let section_a = SectionId("sec_000000".to_string());
        let section_b = SectionId("sec_000001".to_string());

        Book {
            source_path: None,
            id: BookId("test".to_string()),
            format: BookFormat::Epub,
            metadata: Metadata::default(),
            manifest: vec![Resource {
                id: "chapter".to_string(),
                href: "chapter.xhtml".to_string(),
                media_type: "application/xhtml+xml".to_string(),
                properties: Vec::new(),
            }],
            spine: vec![SpineItem {
                idref: "chapter".to_string(),
                href: Some("chapter.xhtml".to_string()),
                linear: true,
            }],
            sections: vec![
                Section {
                    id: section_a.clone(),
                    href: "chapter.xhtml".to_string(),
                    spine_index: 0,
                    title: Some("One".to_string()),
                    heading_level: Some(1),
                    block_ids: vec![
                        BlockId("b_000000".to_string()),
                        BlockId("b_000001".to_string()),
                        BlockId("b_000002".to_string()),
                    ],
                    prev: None,
                    next: Some(section_b.clone()),
                },
                Section {
                    id: section_b.clone(),
                    href: "chapter2.xhtml".to_string(),
                    spine_index: 1,
                    title: None,
                    heading_level: None,
                    block_ids: vec![BlockId("b_000003".to_string())],
                    prev: Some(section_a.clone()),
                    next: None,
                },
            ],
            blocks: vec![
                block("b_000000", &section_a, BlockKind::Heading(1), "One", 2),
                block(
                    "b_000001",
                    &section_a,
                    BlockKind::Paragraph,
                    "short lead",
                    3,
                ),
                block(
                    "b_000002",
                    &section_a,
                    BlockKind::Paragraph,
                    "this paragraph forces a second segment",
                    10,
                ),
                block(
                    "b_000003",
                    &section_b,
                    BlockKind::Paragraph,
                    "new section must stay separate",
                    4,
                ),
            ],
        }
    }

    /// Fixture: a block with a single text run and no marks or protected
    /// spans.
    fn block(
        id: &str,
        section_id: &SectionId,
        kind: BlockKind,
        text: &str,
        token_estimate: usize,
    ) -> Block {
        Block {
            id: BlockId(id.to_string()),
            section_id: section_id.clone(),
            kind,
            dom_path: DomPath(vec![0]),
            text_runs: vec![TextRun {
                id: "r0".to_string(),
                text: text.to_string(),
            }],
            inline_marks: Vec::new(),
            protected_spans: Vec::new(),
            token_estimate,
        }
    }
}