Skip to main content

bookforge_core/
segment.rs

1use serde::{Deserialize, Serialize};
2use sha2::{Digest, Sha256};
3
4use crate::{
5    BookforgeError, Result,
6    config::SegmentationConfig,
7    ir::{Block, BlockId, BlockKind, Book, Section, SectionId},
8};
9
/// Version of the cache key derivation, hashed into every namespace
/// produced by `compute_cache_namespace`.
/// Bumped when the cache key derivation changes incompatibly.
pub const CACHE_KEY_SCHEMA_VERSION: u32 = 2;
/// Version of the segment layout, hashed into every cache namespace
/// (including the legacy v1 namespace).
/// Bumped when Segment / SegmentBlock layout changes incompatibly.
pub const SEGMENT_SCHEMA_VERSION: u32 = 1;
/// Version of the inline marker extraction, hashed into every cache
/// namespace (including the legacy v1 namespace).
/// Bumped when inline marker extraction (m/keep/ref) changes incompatibly.
/// v2: depth-anchored block closing, lazily anchored text blocks for
/// non-whitelist elements, addressable stray text nodes — block ordinals
/// and marker assignments differ from v1 on affected books.
/// v3: short per-block inline marker tags (`<m1>...</m1>`, `<r1/>`)
/// replace verbose global ids (`<m id="m000000_000">...</m>`).
pub const INLINE_MARKER_SCHEMA_VERSION: u32 = 3;
21
22/// Compute a cache namespace that scopes lookups to a single set of
23/// schema and segmentation parameters. Cached rows from a different
24/// namespace are not eligible for reuse.
25///
26/// `style_fingerprint` and `entities_fingerprint` are opt-in mixins:
27/// pass an empty string to preserve cache compatibility with runs that
28/// didn't use the feature; pass a non-empty fingerprint when the
29/// rendered prompt actually changes. The two slots use distinct domain
30/// separators so a style fingerprint can never collide with an entity
31/// fingerprint of the same content.
32pub fn compute_cache_namespace(
33    max_segment_tokens: usize,
34    context_tokens: usize,
35    profile: &str,
36    batch_enabled: bool,
37    prompt_version: &str,
38    glossary_fingerprint: &str,
39    style_fingerprint: &str,
40    entities_fingerprint: &str,
41) -> String {
42    compute_cache_namespace_inner(
43        CACHE_KEY_SCHEMA_VERSION,
44        max_segment_tokens,
45        context_tokens,
46        profile,
47        batch_enabled,
48        prompt_version,
49        Some(glossary_fingerprint),
50        if style_fingerprint.is_empty() {
51            None
52        } else {
53            Some(style_fingerprint)
54        },
55        if entities_fingerprint.is_empty() {
56            None
57        } else {
58            Some(entities_fingerprint)
59        },
60    )
61}
62
63pub fn compute_cache_namespace_v1(
64    max_segment_tokens: usize,
65    context_tokens: usize,
66    profile: &str,
67    batch_enabled: bool,
68    prompt_version: &str,
69) -> String {
70    compute_cache_namespace_inner(
71        1,
72        max_segment_tokens,
73        context_tokens,
74        profile,
75        batch_enabled,
76        prompt_version,
77        None,
78        None,
79        None,
80    )
81}
82
83fn compute_cache_namespace_inner(
84    cache_key_schema_version: u32,
85    max_segment_tokens: usize,
86    context_tokens: usize,
87    profile: &str,
88    batch_enabled: bool,
89    prompt_version: &str,
90    glossary_fingerprint: Option<&str>,
91    style_fingerprint: Option<&str>,
92    entities_fingerprint: Option<&str>,
93) -> String {
94    let mut hasher = Sha256::new();
95    hasher.update(cache_key_schema_version.to_le_bytes());
96    hasher.update(SEGMENT_SCHEMA_VERSION.to_le_bytes());
97    hasher.update(INLINE_MARKER_SCHEMA_VERSION.to_le_bytes());
98    hasher.update((max_segment_tokens as u64).to_le_bytes());
99    hasher.update((context_tokens as u64).to_le_bytes());
100    hasher.update(profile.as_bytes());
101    hasher.update([batch_enabled as u8]);
102    hasher.update(prompt_version.as_bytes());
103    if let Some(glossary_fingerprint) = glossary_fingerprint {
104        hasher.update(glossary_fingerprint.as_bytes());
105    }
106    if let Some(style_fingerprint) = style_fingerprint {
107        hasher.update(b"|style|");
108        hasher.update(style_fingerprint.as_bytes());
109    }
110    if let Some(entities_fingerprint) = entities_fingerprint {
111        hasher.update(b"|entities|");
112        hasher.update(entities_fingerprint.as_bytes());
113    }
114    let digest = hasher.finalize();
115    let mut hex = String::with_capacity(digest.len() * 2);
116    for byte in digest {
117        hex.push_str(&format!("{byte:02x}"));
118    }
119    hex
120}
121
/// Text associated with a single block, keyed by the block's id.
///
/// NOTE(review): nothing in this file constructs this type; presumably
/// `text` carries the translated text for `block_id` — confirm at the
/// call sites.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct BlockTranslation {
    /// Id of the block this text belongs to.
    pub block_id: BlockId,
    /// Text for the block.
    pub text: String,
}
127
/// Identifier of a [`Segment`].
///
/// `build_segments` produces ids of the form
/// `seg_<section id>_<first block id>_<first 12 hex chars of the checksum>`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(pub String);
130
/// A run of consecutive blocks from one section, as produced by
/// `build_segments`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Segment {
    /// Id derived from the section id, the first block id and the checksum.
    pub id: SegmentId,
    /// Section all of this segment's blocks were taken from.
    pub section_id: SectionId,
    /// Zero-based position of the segment in the list built for the whole
    /// book (not restarted per section).
    pub ordinal: usize,
    /// Ids of the blocks in the segment, in section order.
    pub block_ids: Vec<BlockId>,
    /// Source text of the segment and its per-block breakdown.
    pub source: SegmentSource,
    /// Excerpts of the neighbouring segments; empty when context is disabled.
    pub context: SegmentContext,
    /// Book and section information plus the segment's position in its section.
    pub metadata: SegmentMetadata,
    /// Markers and spans collected from the blocks, plus the token limit.
    pub constraints: SegmentConstraints,
    /// Lowercase hex SHA-256 of `source.text`.
    pub checksum: String,
}
143
/// Source-side content of a [`Segment`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentSource {
    /// Block texts joined with a blank line (`"\n\n"`) between blocks.
    pub text: String,
    /// Per-block view of the same content, in the same order.
    pub blocks: Vec<SegmentBlock>,
    /// Sum of the blocks' token estimates, each counted as at least 1.
    pub token_estimate: usize,
}
150
/// One block inside a [`Segment`], copied from the book's block data.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentBlock {
    /// Id of the originating block.
    pub block_id: BlockId,
    /// Block kind as the label returned by `block_kind_label`
    /// (e.g. `"paragraph"`).
    pub kind: String,
    /// Concatenation of the block's text runs, with no separator.
    pub text: String,
    /// The block's text runs, in order.
    pub text_runs: Vec<SegmentTextRun>,
    /// Texts of the block's protected spans, sorted and de-duplicated.
    pub protected_spans: Vec<String>,
}
159
/// A single text run of a [`SegmentBlock`], copied from the block's run.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentTextRun {
    /// Id of the originating run.
    pub id: String,
    /// Text of the run.
    pub text: String,
}
165
/// Excerpts of the neighbouring segments attached to a [`Segment`].
///
/// Both fields stay `None` when `build_segments` runs with
/// `context_tokens == 0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentContext {
    /// Trailing whitespace-separated words of the previous segment's source
    /// text; `None` for the first segment.
    pub before: Option<String>,
    /// Leading whitespace-separated words of the next segment's source
    /// text; `None` for the last segment.
    pub after: Option<String>,
}
171
/// Descriptive information attached to a [`Segment`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentMetadata {
    /// Title from the book metadata, when present.
    pub book_title: Option<String>,
    /// Title of the segment's section, when present.
    pub section_title: Option<String>,
    /// Zero-based index of the section in `book.sections`.
    pub section_index: usize,
    /// Zero-based index of the segment among its section's segments.
    pub segment_index_in_section: usize,
    /// Number of segments built for the section.
    pub total_segments_in_section: usize,
}
180
/// Items collected from a segment's blocks together with the configured
/// token limit.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentConstraints {
    /// Ids of the inline marks of all blocks, sorted and de-duplicated.
    pub preserve_markers: Vec<String>,
    /// Texts of the protected spans of all blocks, sorted and de-duplicated.
    pub preserve_spans: Vec<String>,
    /// The `max_segment_tokens` setting the segment was built with.
    pub max_tokens: usize,
}
187
188pub fn block_kind_label(kind: BlockKind) -> &'static str {
189    match kind {
190        BlockKind::Heading(_) => "heading",
191        BlockKind::Paragraph => "paragraph",
192        BlockKind::ListItem => "list_item",
193        BlockKind::Quote => "quote",
194        BlockKind::TableCell => "table_cell",
195        BlockKind::TableRow => "table_row",
196        BlockKind::Footnote => "footnote",
197        BlockKind::Caption => "caption",
198        BlockKind::Code => "code",
199        BlockKind::Unknown => "unknown",
200    }
201}
202
/// Processing state of a segment.
///
/// NOTE(review): no code in this file produces or inspects these variants;
/// their exact meaning is defined by the code that drives segment
/// processing — confirm there before relying on the names alone.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SegmentStatus {
    Queued,
    Succeeded,
    Failed,
    RetryPending,
    NeedsReview,
    SkippedCached,
}
212
213pub fn build_segments(book: &Book, config: &SegmentationConfig) -> Result<Vec<Segment>> {
214    if config.max_segment_tokens == 0 {
215        return Err(BookforgeError::InvalidInput(
216            "max_segment_tokens must be greater than zero".to_string(),
217        ));
218    }
219
220    let mut segments = Vec::new();
221
222    for (section_index, section) in book.sections.iter().enumerate() {
223        let section_blocks = section
224            .block_ids
225            .iter()
226            .map(|block_id| {
227                book.blocks
228                    .iter()
229                    .find(|block| &block.id == block_id)
230                    .ok_or_else(|| {
231                        BookforgeError::InvalidInput(format!(
232                            "section '{}' references missing block '{}'",
233                            section.id.0, block_id.0
234                        ))
235                    })
236            })
237            .collect::<Result<Vec<_>>>()?;
238
239        let mut current = Vec::<&Block>::new();
240        let mut current_tokens = 0usize;
241        let section_segments_start = segments.len();
242
243        for block in section_blocks {
244            // pre/code content is layout and syntax, not prose: sending it
245            // to the model both mistranslates it and destroys intentional
246            // whitespace. Excluded blocks are never patched, so the
247            // original markup survives rebuild byte-for-byte.
248            if matches!(block.kind, BlockKind::Code) {
249                continue;
250            }
251            let block_tokens = block.token_estimate.max(1);
252            let should_flush = !current.is_empty()
253                && current_tokens + block_tokens > config.max_segment_tokens
254                && !should_keep_with_previous(&current, block);
255
256            if should_flush {
257                push_segment(
258                    &mut segments,
259                    book,
260                    section,
261                    section_index,
262                    &current,
263                    config,
264                );
265                current.clear();
266                current_tokens = 0;
267            }
268
269            current.push(block);
270            current_tokens += block_tokens;
271        }
272
273        if !current.is_empty() {
274            push_segment(
275                &mut segments,
276                book,
277                section,
278                section_index,
279                &current,
280                config,
281            );
282        }
283
284        let total_in_section = segments.len() - section_segments_start;
285        for (offset, segment) in segments[section_segments_start..].iter_mut().enumerate() {
286            segment.metadata.segment_index_in_section = offset;
287            segment.metadata.total_segments_in_section = total_in_section;
288        }
289    }
290
291    apply_context(&mut segments, config.context_tokens);
292
293    Ok(segments)
294}
295
296fn push_segment(
297    segments: &mut Vec<Segment>,
298    book: &Book,
299    section: &Section,
300    section_index: usize,
301    blocks: &[&Block],
302    config: &SegmentationConfig,
303) {
304    let segment_blocks = blocks
305        .iter()
306        .map(|block| {
307            let mut spans = block
308                .protected_spans
309                .iter()
310                .map(|span| span.text.clone())
311                .collect::<Vec<_>>();
312            spans.sort();
313            spans.dedup();
314            SegmentBlock {
315                block_id: block.id.clone(),
316                kind: block_kind_label(block.kind).to_string(),
317                text: block_text(block),
318                text_runs: block
319                    .text_runs
320                    .iter()
321                    .map(|run| SegmentTextRun {
322                        id: run.id.clone(),
323                        text: run.text.clone(),
324                    })
325                    .collect(),
326                protected_spans: spans,
327            }
328        })
329        .collect::<Vec<_>>();
330    let source_text = segment_blocks
331        .iter()
332        .map(|block| block.text.as_str())
333        .collect::<Vec<_>>()
334        .join("\n\n");
335    let checksum = stable_hash(&source_text);
336    let ordinal = segments.len();
337    let first_block = blocks
338        .first()
339        .map(|block| block.id.0.as_str())
340        .unwrap_or("empty");
341    let id = SegmentId(format!(
342        "seg_{}_{}_{}",
343        section.id.0,
344        first_block,
345        &checksum[..12]
346    ));
347
348    let mut preserve_spans = blocks
349        .iter()
350        .flat_map(|block| block.protected_spans.iter().map(|span| span.text.clone()))
351        .collect::<Vec<_>>();
352    preserve_spans.sort();
353    preserve_spans.dedup();
354
355    let mut preserve_markers = blocks
356        .iter()
357        .flat_map(|block| block.inline_marks.iter().map(|mark| mark.id.clone()))
358        .collect::<Vec<_>>();
359    preserve_markers.sort();
360    preserve_markers.dedup();
361
362    let token_estimate = blocks
363        .iter()
364        .map(|block| block.token_estimate.max(1))
365        .sum::<usize>();
366
367    let metadata = SegmentMetadata {
368        book_title: book.metadata.title.clone(),
369        section_title: section.title.clone(),
370        section_index,
371        segment_index_in_section: 0,
372        total_segments_in_section: 0,
373    };
374
375    segments.push(Segment {
376        id,
377        section_id: section.id.clone(),
378        ordinal,
379        block_ids: blocks.iter().map(|block| block.id.clone()).collect(),
380        source: SegmentSource {
381            text: source_text,
382            blocks: segment_blocks,
383            token_estimate,
384        },
385        context: SegmentContext::default(),
386        metadata,
387        constraints: SegmentConstraints {
388            preserve_markers,
389            preserve_spans,
390            max_tokens: config.max_segment_tokens,
391        },
392        checksum,
393    });
394}
395
396fn apply_context(segments: &mut [Segment], context_tokens: usize) {
397    if context_tokens == 0 {
398        return;
399    }
400
401    let sources = segments
402        .iter()
403        .map(|segment| segment.source.text.clone())
404        .collect::<Vec<_>>();
405
406    for (index, segment) in segments.iter_mut().enumerate() {
407        segment.context.before = index
408            .checked_sub(1)
409            .and_then(|previous| sources.get(previous))
410            .map(|text| tail_words(text, context_tokens));
411        segment.context.after = sources
412            .get(index + 1)
413            .map(|text| head_words(text, context_tokens));
414    }
415}
416
417fn should_keep_with_previous(current: &[&Block], next: &Block) -> bool {
418    let Some(previous) = current.last() else {
419        return false;
420    };
421
422    matches!(previous.kind, crate::ir::BlockKind::Heading(_)) && next.token_estimate <= 80
423}
424
425fn block_text(block: &Block) -> String {
426    block
427        .text_runs
428        .iter()
429        .map(|run| run.text.as_str())
430        .collect::<Vec<_>>()
431        .join("")
432}
433
434fn stable_hash(value: &str) -> String {
435    let digest = Sha256::digest(value.as_bytes());
436    let mut output = String::with_capacity(digest.len() * 2);
437    for byte in digest {
438        use std::fmt::Write as _;
439        write!(&mut output, "{byte:02x}").expect("writing to string should not fail");
440    }
441    output
442}
443
/// Return the first `max_words` whitespace-separated words of `text`,
/// joined by single spaces.
///
/// Runs of whitespace (including newlines) collapse to one space. The
/// result is empty when `text` has no words or `max_words` is zero.
fn head_words(text: &str, max_words: usize) -> String {
    let mut excerpt = String::new();
    for word in text.split_whitespace().take(max_words) {
        // Words are never empty, so a non-empty buffer means at least one
        // word has been written and a separator is due.
        if !excerpt.is_empty() {
            excerpt.push(' ');
        }
        excerpt.push_str(word);
    }
    excerpt
}
450
/// Return the last `max_words` whitespace-separated words of `text`, in
/// their original order, joined by single spaces.
///
/// Runs of whitespace (including newlines) collapse to one space. The
/// result is empty when `text` has no words or `max_words` is zero.
fn tail_words(text: &str, max_words: usize) -> String {
    // Walk from the end so only the words that are kept get collected,
    // then restore reading order.
    let mut kept: Vec<&str> = text.split_whitespace().rev().take(max_words).collect();
    kept.reverse();
    kept.join(" ")
}
456
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{
        BlockKind, BookFormat, BookId, DomPath, Metadata, Resource, Section, SpineItem, TextRun,
    };

    /// Namespace of the reference run (1200 / 160 / "Balanced" / no batch /
    /// prompt "v1") with only the three fingerprint slots varied.
    fn namespace_with(glossary: &str, style: &str, entities: &str) -> String {
        compute_cache_namespace(
            1200, 160, "Balanced", false, "v1", glossary, style, entities,
        )
    }

    /// Segmentation settings with the given limits.
    fn segmentation(max_segment_tokens: usize, context_tokens: usize) -> SegmentationConfig {
        SegmentationConfig {
            max_segment_tokens,
            context_tokens,
        }
    }

    #[test]
    fn builds_stable_segments_without_crossing_sections() {
        let book = book_with_two_sections();
        let config = segmentation(10, 4);

        let first = build_segments(&book, &config).expect("segments should build");
        let second = build_segments(&book, &config).expect("segments should be stable");

        assert_eq!(first.len(), 3);
        assert_eq!(first[0].id, second[0].id);
        assert_eq!(first[1].checksum, second[1].checksum);
        assert_eq!(first[0].section_id.0, "sec_000000");
        assert_eq!(first[1].section_id.0, "sec_000000");
        assert_eq!(first[2].section_id.0, "sec_000001");
        assert_eq!(first[2].block_ids, vec![BlockId("b_000003".to_string())]);
    }

    #[test]
    fn code_blocks_are_excluded_from_segments() {
        let mut book = book_with_two_sections();
        book.blocks[1].kind = BlockKind::Code;
        let config = segmentation(100, 0);

        let segments = build_segments(&book, &config).expect("segments should build");
        let mut segmented_blocks: Vec<&str> = Vec::new();
        for segment in &segments {
            segmented_blocks.extend(segment.block_ids.iter().map(|id| id.0.as_str()));
        }

        assert!(
            !segmented_blocks.contains(&"b_000001"),
            "code block must not be segmented for translation"
        );
        assert!(segmented_blocks.contains(&"b_000000"));
        assert!(segmented_blocks.contains(&"b_000002"));
    }

    #[test]
    fn rejects_zero_token_limit() {
        let book = book_with_two_sections();
        let config = segmentation(0, 0);

        assert!(build_segments(&book, &config).is_err());
    }

    #[test]
    fn cache_namespace_changes_when_segmentation_settings_change() {
        let a = namespace_with("glossary:a", "", "");
        let b = compute_cache_namespace(1201, 160, "Balanced", false, "v1", "glossary:a", "", "");
        let c = compute_cache_namespace(1200, 160, "Balanced", true, "v1", "glossary:a", "", "");
        let d = compute_cache_namespace(
            1200,
            160,
            "Balanced",
            false,
            "batch_v1",
            "glossary:a",
            "",
            "",
        );
        let e = namespace_with("glossary:a", "", "");
        let f = namespace_with("glossary:b", "", "");

        assert_ne!(a, b, "max_segment_tokens must affect namespace");
        assert_ne!(a, c, "batch_enabled must affect namespace");
        assert_ne!(a, d, "prompt_version must affect namespace");
        assert_ne!(a, f, "glossary fingerprint must affect namespace");
        assert_eq!(a, e, "namespace is deterministic for identical inputs");
    }

    #[test]
    fn legacy_cache_namespace_v1_ignores_glossary_fingerprint() {
        let current_without_terms = namespace_with("", "", "");
        let current_with_terms = namespace_with("glossary:a", "", "");
        let legacy = compute_cache_namespace_v1(1200, 160, "Balanced", false, "v1");

        assert_ne!(legacy, current_without_terms);
        assert_ne!(legacy, current_with_terms);
        assert_eq!(
            legacy,
            compute_cache_namespace_v1(1200, 160, "Balanced", false, "v1")
        );
    }

    #[test]
    fn cache_namespace_is_stable_when_style_fingerprint_is_empty() {
        // Users who don't use --style must see no cache invalidation when
        // they upgrade to a build that supports style sheets.
        let without_style = namespace_with("glossary:a", "", "");
        let still_without_style = namespace_with("glossary:a", "", "");
        assert_eq!(without_style, still_without_style);
    }

    #[test]
    fn cache_namespace_changes_when_style_fingerprint_changes() {
        let baseline = namespace_with("glossary:a", "", "");
        let with_style = namespace_with("glossary:a", "style:a", "");
        let with_other_style = namespace_with("glossary:a", "style:b", "");

        assert_ne!(
            baseline, with_style,
            "switching on a style sheet must invalidate cache"
        );
        assert_ne!(
            with_style, with_other_style,
            "different style fingerprints must yield different namespaces"
        );
    }

    #[test]
    fn cache_namespace_changes_when_entities_fingerprint_changes() {
        let baseline = namespace_with("glossary:a", "", "");
        let with_entities = namespace_with("glossary:a", "", "entities:a");
        let with_other_entities = namespace_with("glossary:a", "", "entities:b");

        assert_ne!(
            baseline, with_entities,
            "switching on entities must invalidate cache"
        );
        assert_ne!(
            with_entities, with_other_entities,
            "different entity fingerprints must yield different namespaces"
        );
    }

    #[test]
    fn style_and_entities_fingerprints_use_distinct_domain_separators() {
        // The same hex string used as style vs. entities must not produce
        // the same namespace — domain separators prevent the collision.
        let as_style = namespace_with("glossary:a", "ab", "");
        let as_entities = namespace_with("glossary:a", "", "ab");
        assert_ne!(as_style, as_entities);
    }

    /// Fixture: section `sec_000000` holds a heading and two paragraphs,
    /// section `sec_000001` holds a single paragraph.
    fn book_with_two_sections() -> Book {
        let section_a = SectionId("sec_000000".to_string());
        let section_b = SectionId("sec_000001".to_string());

        let blocks = vec![
            block("b_000000", &section_a, BlockKind::Heading(1), "One", 2),
            block(
                "b_000001",
                &section_a,
                BlockKind::Paragraph,
                "short lead",
                3,
            ),
            block(
                "b_000002",
                &section_a,
                BlockKind::Paragraph,
                "this paragraph forces a second segment",
                10,
            ),
            block(
                "b_000003",
                &section_b,
                BlockKind::Paragraph,
                "new section must stay separate",
                4,
            ),
        ];

        let first_section = Section {
            id: section_a.clone(),
            href: "chapter.xhtml".to_string(),
            spine_index: 0,
            title: Some("One".to_string()),
            heading_level: Some(1),
            block_ids: vec![
                BlockId("b_000000".to_string()),
                BlockId("b_000001".to_string()),
                BlockId("b_000002".to_string()),
            ],
            prev: None,
            next: Some(section_b.clone()),
        };
        let second_section = Section {
            id: section_b.clone(),
            href: "chapter2.xhtml".to_string(),
            spine_index: 1,
            title: None,
            heading_level: None,
            block_ids: vec![BlockId("b_000003".to_string())],
            prev: Some(section_a.clone()),
            next: None,
        };

        Book {
            source_path: None,
            id: BookId("test".to_string()),
            format: BookFormat::Epub,
            metadata: Metadata::default(),
            manifest: vec![Resource {
                id: "chapter".to_string(),
                href: "chapter.xhtml".to_string(),
                media_type: "application/xhtml+xml".to_string(),
                properties: Vec::new(),
            }],
            spine: vec![SpineItem {
                idref: "chapter".to_string(),
                href: Some("chapter.xhtml".to_string()),
                linear: true,
            }],
            sections: vec![first_section, second_section],
            blocks,
        }
    }

    /// Fixture: a block with a single text run `r0` and no marks or spans.
    fn block(
        id: &str,
        section_id: &SectionId,
        kind: BlockKind,
        text: &str,
        token_estimate: usize,
    ) -> Block {
        let only_run = TextRun {
            id: "r0".to_string(),
            text: text.to_string(),
        };

        Block {
            id: BlockId(id.to_string()),
            section_id: section_id.clone(),
            kind,
            dom_path: DomPath(vec![0]),
            text_runs: vec![only_run],
            inline_marks: Vec::new(),
            protected_spans: Vec::new(),
            token_estimate,
        }
    }
}