1use serde::{Deserialize, Serialize};
2use sha2::{Digest, Sha256};
3
4use crate::{
5 BookforgeError, Result,
6 config::SegmentationConfig,
7 ir::{Block, BlockId, BlockKind, Book, Section, SectionId},
8};
9
/// Cache-key schema version hashed into namespaces produced by
/// `compute_cache_namespace`; the legacy `compute_cache_namespace_v1` hashes
/// the literal `1` in its place.
pub const CACHE_KEY_SCHEMA_VERSION: u32 = 2;
/// Segment schema version; hashed into every cache namespace.
pub const SEGMENT_SCHEMA_VERSION: u32 = 1;
/// Inline-marker schema version; hashed into every cache namespace.
pub const INLINE_MARKER_SCHEMA_VERSION: u32 = 3;
21
22pub fn compute_cache_namespace(
33 max_segment_tokens: usize,
34 context_tokens: usize,
35 profile: &str,
36 batch_enabled: bool,
37 prompt_version: &str,
38 glossary_fingerprint: &str,
39 style_fingerprint: &str,
40 entities_fingerprint: &str,
41) -> String {
42 compute_cache_namespace_inner(
43 CACHE_KEY_SCHEMA_VERSION,
44 max_segment_tokens,
45 context_tokens,
46 profile,
47 batch_enabled,
48 prompt_version,
49 Some(glossary_fingerprint),
50 if style_fingerprint.is_empty() {
51 None
52 } else {
53 Some(style_fingerprint)
54 },
55 if entities_fingerprint.is_empty() {
56 None
57 } else {
58 Some(entities_fingerprint)
59 },
60 )
61}
62
63pub fn compute_cache_namespace_v1(
64 max_segment_tokens: usize,
65 context_tokens: usize,
66 profile: &str,
67 batch_enabled: bool,
68 prompt_version: &str,
69) -> String {
70 compute_cache_namespace_inner(
71 1,
72 max_segment_tokens,
73 context_tokens,
74 profile,
75 batch_enabled,
76 prompt_version,
77 None,
78 None,
79 None,
80 )
81}
82
83fn compute_cache_namespace_inner(
84 cache_key_schema_version: u32,
85 max_segment_tokens: usize,
86 context_tokens: usize,
87 profile: &str,
88 batch_enabled: bool,
89 prompt_version: &str,
90 glossary_fingerprint: Option<&str>,
91 style_fingerprint: Option<&str>,
92 entities_fingerprint: Option<&str>,
93) -> String {
94 let mut hasher = Sha256::new();
95 hasher.update(cache_key_schema_version.to_le_bytes());
96 hasher.update(SEGMENT_SCHEMA_VERSION.to_le_bytes());
97 hasher.update(INLINE_MARKER_SCHEMA_VERSION.to_le_bytes());
98 hasher.update((max_segment_tokens as u64).to_le_bytes());
99 hasher.update((context_tokens as u64).to_le_bytes());
100 hasher.update(profile.as_bytes());
101 hasher.update([batch_enabled as u8]);
102 hasher.update(prompt_version.as_bytes());
103 if let Some(glossary_fingerprint) = glossary_fingerprint {
104 hasher.update(glossary_fingerprint.as_bytes());
105 }
106 if let Some(style_fingerprint) = style_fingerprint {
107 hasher.update(b"|style|");
108 hasher.update(style_fingerprint.as_bytes());
109 }
110 if let Some(entities_fingerprint) = entities_fingerprint {
111 hasher.update(b"|entities|");
112 hasher.update(entities_fingerprint.as_bytes());
113 }
114 let digest = hasher.finalize();
115 let mut hex = String::with_capacity(digest.len() * 2);
116 for byte in digest {
117 hex.push_str(&format!("{byte:02x}"));
118 }
119 hex
120}
121
/// Pairs a block id with a text payload.
///
/// NOTE(review): presumably the translated text for that block; nothing in
/// this file constructs or reads it, so confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct BlockTranslation {
    /// Id of the block the text belongs to.
    pub block_id: BlockId,
    /// Text associated with the block.
    pub text: String,
}
127
/// Identifier of a [`Segment`].
///
/// `push_segment` builds it as
/// `seg_{section id}_{first block id}_{first 12 hex chars of the checksum}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(pub String);
130
/// A run of consecutive blocks from a single section, produced by
/// `build_segments`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Segment {
    /// Identifier derived from the section id, first block id and checksum.
    pub id: SegmentId,
    /// Section that all of the segment's blocks belong to.
    pub section_id: SectionId,
    /// Position of the segment in the overall list returned by `build_segments`.
    pub ordinal: usize,
    /// Ids of the blocks in the segment, in section order.
    pub block_ids: Vec<BlockId>,
    /// Source text and per-block payloads.
    pub source: SegmentSource,
    /// Text taken from the neighbouring segments; left at its default when
    /// `context_tokens` is zero.
    pub context: SegmentContext,
    /// Book and section information about where the segment sits.
    pub metadata: SegmentMetadata,
    /// Markers, spans and token limit attached to the segment.
    pub constraints: SegmentConstraints,
    /// Lowercase hex SHA-256 of `source.text`.
    pub checksum: String,
}
143
/// Source-side content of a [`Segment`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentSource {
    /// Text of all blocks, joined with a blank line (`"\n\n"`).
    pub text: String,
    /// Per-block payloads, in the same order as the segment's block ids.
    pub blocks: Vec<SegmentBlock>,
    /// Sum of the blocks' token estimates, counting each block as at least 1.
    pub token_estimate: usize,
}
150
/// One block's contribution to a segment.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentBlock {
    /// Id of the originating block.
    pub block_id: BlockId,
    /// Label returned by `block_kind_label` for the block's kind.
    pub kind: String,
    /// Concatenation of the block's text runs, with no separator.
    pub text: String,
    /// The block's text runs, in order.
    pub text_runs: Vec<SegmentTextRun>,
    /// Texts of the block's protected spans, sorted and de-duplicated.
    pub protected_spans: Vec<String>,
}
159
/// Copy of one text run of a block: its id and its text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentTextRun {
    /// Id copied from the source text run.
    pub id: String,
    /// Text copied from the source text run.
    pub text: String,
}
165
/// Text surrounding a segment, filled in by `apply_context`.
///
/// Both fields stay `None` when the configured `context_tokens` is zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentContext {
    /// Trailing words of the previous segment's source text; `None` for the
    /// first segment.
    pub before: Option<String>,
    /// Leading words of the next segment's source text; `None` for the last
    /// segment.
    pub after: Option<String>,
}
171
/// Book and section information attached to a segment.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentMetadata {
    /// Title from the book metadata, if any.
    pub book_title: Option<String>,
    /// Title of the containing section, if any.
    pub section_title: Option<String>,
    /// Zero-based index of the section within the book's section list.
    pub section_index: usize,
    /// Zero-based index of the segment among its section's segments.
    pub segment_index_in_section: usize,
    /// Number of segments produced for the section.
    pub total_segments_in_section: usize,
}
180
/// Markers, spans and token limit attached to a segment.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentConstraints {
    /// Ids of the inline marks of the segment's blocks, sorted and
    /// de-duplicated.
    pub preserve_markers: Vec<String>,
    /// Texts of the protected spans of the segment's blocks, sorted and
    /// de-duplicated.
    pub preserve_spans: Vec<String>,
    /// The configured `max_segment_tokens` (not the segment's own estimate).
    pub max_tokens: usize,
}
187
188pub fn block_kind_label(kind: BlockKind) -> &'static str {
189 match kind {
190 BlockKind::Heading(_) => "heading",
191 BlockKind::Paragraph => "paragraph",
192 BlockKind::ListItem => "list_item",
193 BlockKind::Quote => "quote",
194 BlockKind::TableCell => "table_cell",
195 BlockKind::TableRow => "table_row",
196 BlockKind::Footnote => "footnote",
197 BlockKind::Caption => "caption",
198 BlockKind::Code => "code",
199 BlockKind::Unknown => "unknown",
200 }
201}
202
/// Processing status of a segment.
///
/// NOTE(review): nothing in this file reads or writes these variants; their
/// exact meaning is defined by the code that drives segment processing —
/// confirm there before relying on the names alone.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SegmentStatus {
    Queued,
    Succeeded,
    Failed,
    RetryPending,
    NeedsReview,
    SkippedCached,
}
212
213pub fn build_segments(book: &Book, config: &SegmentationConfig) -> Result<Vec<Segment>> {
214 if config.max_segment_tokens == 0 {
215 return Err(BookforgeError::InvalidInput(
216 "max_segment_tokens must be greater than zero".to_string(),
217 ));
218 }
219
220 let mut segments = Vec::new();
221
222 for (section_index, section) in book.sections.iter().enumerate() {
223 let section_blocks = section
224 .block_ids
225 .iter()
226 .map(|block_id| {
227 book.blocks
228 .iter()
229 .find(|block| &block.id == block_id)
230 .ok_or_else(|| {
231 BookforgeError::InvalidInput(format!(
232 "section '{}' references missing block '{}'",
233 section.id.0, block_id.0
234 ))
235 })
236 })
237 .collect::<Result<Vec<_>>>()?;
238
239 let mut current = Vec::<&Block>::new();
240 let mut current_tokens = 0usize;
241 let section_segments_start = segments.len();
242
243 for block in section_blocks {
244 if matches!(block.kind, BlockKind::Code) {
249 continue;
250 }
251 let block_tokens = block.token_estimate.max(1);
252 let should_flush = !current.is_empty()
253 && current_tokens + block_tokens > config.max_segment_tokens
254 && !should_keep_with_previous(¤t, block);
255
256 if should_flush {
257 push_segment(
258 &mut segments,
259 book,
260 section,
261 section_index,
262 ¤t,
263 config,
264 );
265 current.clear();
266 current_tokens = 0;
267 }
268
269 current.push(block);
270 current_tokens += block_tokens;
271 }
272
273 if !current.is_empty() {
274 push_segment(
275 &mut segments,
276 book,
277 section,
278 section_index,
279 ¤t,
280 config,
281 );
282 }
283
284 let total_in_section = segments.len() - section_segments_start;
285 for (offset, segment) in segments[section_segments_start..].iter_mut().enumerate() {
286 segment.metadata.segment_index_in_section = offset;
287 segment.metadata.total_segments_in_section = total_in_section;
288 }
289 }
290
291 apply_context(&mut segments, config.context_tokens);
292
293 Ok(segments)
294}
295
296fn push_segment(
297 segments: &mut Vec<Segment>,
298 book: &Book,
299 section: &Section,
300 section_index: usize,
301 blocks: &[&Block],
302 config: &SegmentationConfig,
303) {
304 let segment_blocks = blocks
305 .iter()
306 .map(|block| {
307 let mut spans = block
308 .protected_spans
309 .iter()
310 .map(|span| span.text.clone())
311 .collect::<Vec<_>>();
312 spans.sort();
313 spans.dedup();
314 SegmentBlock {
315 block_id: block.id.clone(),
316 kind: block_kind_label(block.kind).to_string(),
317 text: block_text(block),
318 text_runs: block
319 .text_runs
320 .iter()
321 .map(|run| SegmentTextRun {
322 id: run.id.clone(),
323 text: run.text.clone(),
324 })
325 .collect(),
326 protected_spans: spans,
327 }
328 })
329 .collect::<Vec<_>>();
330 let source_text = segment_blocks
331 .iter()
332 .map(|block| block.text.as_str())
333 .collect::<Vec<_>>()
334 .join("\n\n");
335 let checksum = stable_hash(&source_text);
336 let ordinal = segments.len();
337 let first_block = blocks
338 .first()
339 .map(|block| block.id.0.as_str())
340 .unwrap_or("empty");
341 let id = SegmentId(format!(
342 "seg_{}_{}_{}",
343 section.id.0,
344 first_block,
345 &checksum[..12]
346 ));
347
348 let mut preserve_spans = blocks
349 .iter()
350 .flat_map(|block| block.protected_spans.iter().map(|span| span.text.clone()))
351 .collect::<Vec<_>>();
352 preserve_spans.sort();
353 preserve_spans.dedup();
354
355 let mut preserve_markers = blocks
356 .iter()
357 .flat_map(|block| block.inline_marks.iter().map(|mark| mark.id.clone()))
358 .collect::<Vec<_>>();
359 preserve_markers.sort();
360 preserve_markers.dedup();
361
362 let token_estimate = blocks
363 .iter()
364 .map(|block| block.token_estimate.max(1))
365 .sum::<usize>();
366
367 let metadata = SegmentMetadata {
368 book_title: book.metadata.title.clone(),
369 section_title: section.title.clone(),
370 section_index,
371 segment_index_in_section: 0,
372 total_segments_in_section: 0,
373 };
374
375 segments.push(Segment {
376 id,
377 section_id: section.id.clone(),
378 ordinal,
379 block_ids: blocks.iter().map(|block| block.id.clone()).collect(),
380 source: SegmentSource {
381 text: source_text,
382 blocks: segment_blocks,
383 token_estimate,
384 },
385 context: SegmentContext::default(),
386 metadata,
387 constraints: SegmentConstraints {
388 preserve_markers,
389 preserve_spans,
390 max_tokens: config.max_segment_tokens,
391 },
392 checksum,
393 });
394}
395
396fn apply_context(segments: &mut [Segment], context_tokens: usize) {
397 if context_tokens == 0 {
398 return;
399 }
400
401 let sources = segments
402 .iter()
403 .map(|segment| segment.source.text.clone())
404 .collect::<Vec<_>>();
405
406 for (index, segment) in segments.iter_mut().enumerate() {
407 segment.context.before = index
408 .checked_sub(1)
409 .and_then(|previous| sources.get(previous))
410 .map(|text| tail_words(text, context_tokens));
411 segment.context.after = sources
412 .get(index + 1)
413 .map(|text| head_words(text, context_tokens));
414 }
415}
416
417fn should_keep_with_previous(current: &[&Block], next: &Block) -> bool {
418 let Some(previous) = current.last() else {
419 return false;
420 };
421
422 matches!(previous.kind, crate::ir::BlockKind::Heading(_)) && next.token_estimate <= 80
423}
424
425fn block_text(block: &Block) -> String {
426 block
427 .text_runs
428 .iter()
429 .map(|run| run.text.as_str())
430 .collect::<Vec<_>>()
431 .join("")
432}
433
434fn stable_hash(value: &str) -> String {
435 let digest = Sha256::digest(value.as_bytes());
436 let mut output = String::with_capacity(digest.len() * 2);
437 for byte in digest {
438 use std::fmt::Write as _;
439 write!(&mut output, "{byte:02x}").expect("writing to string should not fail");
440 }
441 output
442}
443
/// Returns the first `max_words` whitespace-separated words of `text`,
/// joined by single spaces.
///
/// Returns an empty string when `text` has no words or `max_words` is zero.
fn head_words(text: &str, max_words: usize) -> String {
    let mut joined = String::new();
    for (position, word) in text.split_whitespace().take(max_words).enumerate() {
        if position > 0 {
            joined.push(' ');
        }
        joined.push_str(word);
    }
    joined
}
450
/// Returns the last `max_words` whitespace-separated words of `text`, in
/// their original order and joined by single spaces.
///
/// Returns an empty string when `text` has no words or `max_words` is zero.
fn tail_words(text: &str, max_words: usize) -> String {
    // Walk from the end, keep at most `max_words`, then restore the order.
    let mut kept: Vec<&str> = text.split_whitespace().rev().take(max_words).collect();
    kept.reverse();
    kept.join(" ")
}
456
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{
        BlockKind, BookFormat, BookId, DomPath, Metadata, Resource, Section, SpineItem, TextRun,
    };

    // Segmentation is deterministic and never merges blocks across sections.
    #[test]
    fn builds_stable_segments_without_crossing_sections() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let first = build_segments(&book, &config).expect("segments should build");
        let second = build_segments(&book, &config).expect("segments should be stable");

        assert_eq!(first.len(), 3);
        assert_eq!(first[0].id, second[0].id);
        assert_eq!(first[1].checksum, second[1].checksum);
        assert_eq!(first[0].section_id.0, "sec_000000");
        assert_eq!(first[1].section_id.0, "sec_000000");
        assert_eq!(first[2].section_id.0, "sec_000001");
        assert_eq!(first[2].block_ids, vec![BlockId("b_000003".to_string())]);
    }

    // Context comes from the neighbouring segments' source text.
    #[test]
    fn context_is_drawn_from_neighbouring_segments() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let segments = build_segments(&book, &config).expect("segments should build");

        assert_eq!(segments[0].context.before, None);
        assert_eq!(
            segments[0].context.after.as_deref(),
            Some("this paragraph forces a")
        );
        assert_eq!(
            segments[1].context.before.as_deref(),
            Some("One short lead")
        );
    }

    // Code blocks are dropped while their neighbours are still segmented.
    #[test]
    fn code_blocks_are_excluded_from_segments() {
        let mut book = book_with_two_sections();
        book.blocks[1].kind = BlockKind::Code;
        let config = SegmentationConfig {
            max_segment_tokens: 100,
            context_tokens: 0,
        };

        let segments = build_segments(&book, &config).expect("segments should build");
        let segmented_blocks: Vec<&str> = segments
            .iter()
            .flat_map(|segment| segment.block_ids.iter().map(|id| id.0.as_str()))
            .collect();

        assert!(
            !segmented_blocks.contains(&"b_000001"),
            "code block must not be segmented for translation"
        );
        assert!(segmented_blocks.contains(&"b_000000"));
        assert!(segmented_blocks.contains(&"b_000002"));
    }

    // A zero token limit is rejected up front.
    #[test]
    fn rejects_zero_token_limit() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 0,
            context_tokens: 0,
        };

        assert!(build_segments(&book, &config).is_err());
    }

    // The word helpers honour the limit and normalise whitespace.
    #[test]
    fn head_and_tail_words_respect_the_word_limit() {
        assert_eq!(head_words("one  two\nthree", 2), "one two");
        assert_eq!(tail_words("one  two\nthree", 2), "two three");
        assert_eq!(head_words("one two", 5), "one two");
        assert_eq!(tail_words("one two", 5), "one two");
        assert_eq!(head_words("one two", 0), "");
        assert_eq!(tail_words("one two", 0), "");
    }

    #[test]
    fn cache_namespace_changes_when_segmentation_settings_change() {
        let a = compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let b = compute_cache_namespace(1201, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let c = compute_cache_namespace(1200, 160, "Balanced", true, "v1", "glossary:a", "", "");
        let d = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "batch_v1",
            "glossary:a",
            "",
            "",
        );
        let e = compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let f = compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:b", "", "");

        assert_ne!(a, b, "max_segment_tokens must affect namespace");
        assert_ne!(a, c, "batch_enabled must affect namespace");
        assert_ne!(a, d, "prompt_version must affect namespace");
        assert_ne!(a, f, "glossary fingerprint must affect namespace");
        assert_eq!(a, e, "namespace is deterministic for identical inputs");
    }

    #[test]
    fn legacy_cache_namespace_v1_ignores_glossary_fingerprint() {
        let current_without_terms =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "", "", "");
        let current_with_terms =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let legacy = compute_cache_namespace_v1(1200, 160, "Balanced", false, "v1");

        assert_ne!(legacy, current_without_terms);
        assert_ne!(legacy, current_with_terms);
        assert_eq!(
            legacy,
            compute_cache_namespace_v1(1200, 160, "Balanced", false, "v1")
        );
    }

    #[test]
    fn cache_namespace_is_stable_when_style_fingerprint_is_empty() {
        let without_style =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let still_without_style =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        assert_eq!(without_style, still_without_style);

        // Comparing two identical calls only proves determinism. Also pin
        // that empty style/entities fingerprints hash exactly like absent
        // ones, i.e. they add no domain-separator bytes to the digest.
        let fingerprints_absent = compute_cache_namespace_inner(
            CACHE_KEY_SCHEMA_VERSION,
            1200,
            160,
            "Balanced",
            false,
            "v1",
            Some("glossary:a"),
            None,
            None,
        );
        assert_eq!(
            without_style, fingerprints_absent,
            "empty style/entities fingerprints must hash as absent"
        );
    }

    #[test]
    fn cache_namespace_changes_when_style_fingerprint_changes() {
        let baseline =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let with_style = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "v1",
            "glossary:a",
            "style:a",
            "",
        );
        let with_other_style = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "v1",
            "glossary:a",
            "style:b",
            "",
        );

        assert_ne!(
            baseline, with_style,
            "switching on a style sheet must invalidate cache"
        );
        assert_ne!(
            with_style, with_other_style,
            "different style fingerprints must yield different namespaces"
        );
    }

    #[test]
    fn cache_namespace_changes_when_entities_fingerprint_changes() {
        let baseline =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let with_entities = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "v1",
            "glossary:a",
            "",
            "entities:a",
        );
        let with_other_entities = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "v1",
            "glossary:a",
            "",
            "entities:b",
        );
        assert_ne!(
            baseline, with_entities,
            "switching on entities must invalidate cache"
        );
        assert_ne!(
            with_entities, with_other_entities,
            "different entity fingerprints must yield different namespaces"
        );
    }

    // The same fingerprint value must hash differently depending on whether
    // it is supplied as the style or as the entities fingerprint.
    #[test]
    fn style_and_entities_fingerprints_use_distinct_domain_separators() {
        let as_style =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "ab", "");
        let as_entities =
            compute_cache_namespace(1200, 160, "Balanced", false, "v1", "glossary:a", "", "ab");
        assert_ne!(as_style, as_entities);
    }

    /// Builds a two-section book: section `sec_000000` holds a heading and two
    /// paragraphs (2, 3 and 10 tokens), section `sec_000001` one paragraph.
    fn book_with_two_sections() -> Book {
        let section_a = SectionId("sec_000000".to_string());
        let section_b = SectionId("sec_000001".to_string());

        Book {
            source_path: None,
            id: BookId("test".to_string()),
            format: BookFormat::Epub,
            metadata: Metadata::default(),
            manifest: vec![Resource {
                id: "chapter".to_string(),
                href: "chapter.xhtml".to_string(),
                media_type: "application/xhtml+xml".to_string(),
                properties: Vec::new(),
            }],
            spine: vec![SpineItem {
                idref: "chapter".to_string(),
                href: Some("chapter.xhtml".to_string()),
                linear: true,
            }],
            sections: vec![
                Section {
                    id: section_a.clone(),
                    href: "chapter.xhtml".to_string(),
                    spine_index: 0,
                    title: Some("One".to_string()),
                    heading_level: Some(1),
                    block_ids: vec![
                        BlockId("b_000000".to_string()),
                        BlockId("b_000001".to_string()),
                        BlockId("b_000002".to_string()),
                    ],
                    prev: None,
                    next: Some(section_b.clone()),
                },
                Section {
                    id: section_b.clone(),
                    href: "chapter2.xhtml".to_string(),
                    spine_index: 1,
                    title: None,
                    heading_level: None,
                    block_ids: vec![BlockId("b_000003".to_string())],
                    prev: Some(section_a.clone()),
                    next: None,
                },
            ],
            blocks: vec![
                block("b_000000", &section_a, BlockKind::Heading(1), "One", 2),
                block(
                    "b_000001",
                    &section_a,
                    BlockKind::Paragraph,
                    "short lead",
                    3,
                ),
                block(
                    "b_000002",
                    &section_a,
                    BlockKind::Paragraph,
                    "this paragraph forces a second segment",
                    10,
                ),
                block(
                    "b_000003",
                    &section_b,
                    BlockKind::Paragraph,
                    "new section must stay separate",
                    4,
                ),
            ],
        }
    }

    /// Builds a block with a single text run `r0` and no marks or spans.
    fn block(
        id: &str,
        section_id: &SectionId,
        kind: BlockKind,
        text: &str,
        token_estimate: usize,
    ) -> Block {
        Block {
            id: BlockId(id.to_string()),
            section_id: section_id.clone(),
            kind,
            dom_path: DomPath(vec![0]),
            text_runs: vec![TextRun {
                id: "r0".to_string(),
                text: text.to_string(),
            }],
            inline_marks: Vec::new(),
            protected_spans: Vec::new(),
            token_estimate,
        }
    }
}