// bookforge_core/segment.rs

1use serde::{Deserialize, Serialize};
2use sha2::{Digest, Sha256};
3
4use crate::{
5    BookforgeError, Result,
6    config::SegmentationConfig,
7    ir::{Block, BlockId, BlockKind, Book, Section, SectionId},
8};
9
/// Version of the cache key derivation; bumped when it changes incompatibly.
/// Mixed into the digest produced by `compute_cache_namespace`.
pub const CACHE_KEY_SCHEMA_VERSION: u32 = 1;
/// Version of the `Segment` / `SegmentBlock` layout; bumped when that layout
/// changes incompatibly. Mixed into the digest produced by
/// `compute_cache_namespace`.
pub const SEGMENT_SCHEMA_VERSION: u32 = 1;
/// Version of inline marker extraction (m/keep/ref); bumped when it changes
/// incompatibly. Mixed into the digest produced by `compute_cache_namespace`.
pub const INLINE_MARKER_SCHEMA_VERSION: u32 = 1;
16
17/// Compute a cache namespace that scopes lookups to a single set of
18/// schema and segmentation parameters. Cached rows from a different
19/// namespace are not eligible for reuse.
20pub fn compute_cache_namespace(
21    max_segment_tokens: usize,
22    context_tokens: usize,
23    profile: &str,
24    batch_enabled: bool,
25    prompt_version: &str,
26) -> String {
27    let mut hasher = Sha256::new();
28    hasher.update(CACHE_KEY_SCHEMA_VERSION.to_le_bytes());
29    hasher.update(SEGMENT_SCHEMA_VERSION.to_le_bytes());
30    hasher.update(INLINE_MARKER_SCHEMA_VERSION.to_le_bytes());
31    hasher.update((max_segment_tokens as u64).to_le_bytes());
32    hasher.update((context_tokens as u64).to_le_bytes());
33    hasher.update(profile.as_bytes());
34    hasher.update([batch_enabled as u8]);
35    hasher.update(prompt_version.as_bytes());
36    let digest = hasher.finalize();
37    let mut hex = String::with_capacity(digest.len() * 2);
38    for byte in digest {
39        hex.push_str(&format!("{byte:02x}"));
40    }
41    hex
42}
43
/// Text paired with the id of the block it belongs to.
///
/// NOTE(review): nothing in this file constructs or reads this type;
/// presumably `text` is the translated output for `block_id` — confirm
/// against the callers.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct BlockTranslation {
    /// Id of the block the text belongs to.
    pub block_id: BlockId,
    /// Text for that block.
    pub text: String,
}
49
/// Identifier of a segment.
///
/// Ids created by `push_segment` have the form
/// `seg_{section id}_{first block id}_{first 12 hex chars of the checksum}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(pub String);
52
/// A run of consecutive blocks from one section, produced by `build_segments`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Segment {
    /// Identifier derived from the section id, the first block id and the checksum.
    pub id: SegmentId,
    /// Section all blocks of this segment belong to.
    pub section_id: SectionId,
    /// Zero-based position of the segment in the book-wide segment list.
    pub ordinal: usize,
    /// Ids of the contained blocks, in order.
    pub block_ids: Vec<BlockId>,
    /// Source text and per-block detail.
    pub source: SegmentSource,
    /// Words borrowed from the neighbouring segments; empty until
    /// `apply_context` has run with a non-zero budget.
    pub context: SegmentContext,
    /// Book / section titles and positional information.
    pub metadata: SegmentMetadata,
    /// Markers and spans to preserve, plus the token limit in effect.
    pub constraints: SegmentConstraints,
    /// Lowercase hex SHA-256 of `source.text`.
    pub checksum: String,
}
65
/// Source-side content of a segment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentSource {
    /// Texts of all blocks joined with a blank line (`"\n\n"`).
    pub text: String,
    /// Per-block breakdown, in the same order as the joined text.
    pub blocks: Vec<SegmentBlock>,
    /// Sum of the block token estimates, each counted as at least 1.
    pub token_estimate: usize,
}
72
/// One block as it appears inside a segment.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentBlock {
    /// Id of the originating block.
    pub block_id: BlockId,
    /// Kind label as returned by `block_kind_label` (e.g. `"paragraph"`).
    pub kind: String,
    /// Concatenation of the text of all runs, without separators.
    pub text: String,
    /// The individual text runs, in order.
    pub text_runs: Vec<SegmentTextRun>,
    /// Texts of the block's protected spans, sorted and de-duplicated.
    pub protected_spans: Vec<String>,
}
81
/// A single text run copied from a block.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SegmentTextRun {
    /// Run id, copied from the source run.
    pub id: String,
    /// Run text, copied from the source run.
    pub text: String,
}
87
/// Surrounding text attached to a segment by `apply_context`.
///
/// Both fields stay `None` when the context budget is zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentContext {
    /// Last words of the preceding segment; `None` for the first segment.
    pub before: Option<String>,
    /// First words of the following segment; `None` for the last segment.
    pub after: Option<String>,
}
93
/// Descriptive and positional information about a segment.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentMetadata {
    /// Title taken from the book metadata, if any.
    pub book_title: Option<String>,
    /// Title of the owning section, if any.
    pub section_title: Option<String>,
    /// Zero-based index of the owning section within the book's sections.
    pub section_index: usize,
    /// Zero-based index of this segment among the segments of its section.
    pub segment_index_in_section: usize,
    /// Number of segments built for the owning section.
    pub total_segments_in_section: usize,
}
102
/// Constraints that accompany a segment.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentConstraints {
    /// Ids of the inline marks of all blocks, sorted and de-duplicated.
    pub preserve_markers: Vec<String>,
    /// Texts of the protected spans of all blocks, sorted and de-duplicated.
    pub preserve_spans: Vec<String>,
    /// The `max_segment_tokens` setting the segment was built with.
    pub max_tokens: usize,
}
109
110pub fn block_kind_label(kind: BlockKind) -> &'static str {
111    match kind {
112        BlockKind::Heading(_) => "heading",
113        BlockKind::Paragraph => "paragraph",
114        BlockKind::ListItem => "list_item",
115        BlockKind::Quote => "quote",
116        BlockKind::TableCell => "table_cell",
117        BlockKind::TableRow => "table_row",
118        BlockKind::Footnote => "footnote",
119        BlockKind::Caption => "caption",
120        BlockKind::Code => "code",
121        BlockKind::Unknown => "unknown",
122    }
123}
124
/// Processing states of a segment.
///
/// NOTE(review): no code in this file reads or assigns these variants; the
/// transitions between them are presumably driven by the translation
/// pipeline — confirm against the callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SegmentStatus {
    Queued,
    Succeeded,
    Failed,
    RetryPending,
    NeedsReview,
    SkippedCached,
}
134
135pub fn build_segments(book: &Book, config: &SegmentationConfig) -> Result<Vec<Segment>> {
136    if config.max_segment_tokens == 0 {
137        return Err(BookforgeError::InvalidInput(
138            "max_segment_tokens must be greater than zero".to_string(),
139        ));
140    }
141
142    let mut segments = Vec::new();
143
144    for (section_index, section) in book.sections.iter().enumerate() {
145        let section_blocks = section
146            .block_ids
147            .iter()
148            .map(|block_id| {
149                book.blocks
150                    .iter()
151                    .find(|block| &block.id == block_id)
152                    .ok_or_else(|| {
153                        BookforgeError::InvalidInput(format!(
154                            "section '{}' references missing block '{}'",
155                            section.id.0, block_id.0
156                        ))
157                    })
158            })
159            .collect::<Result<Vec<_>>>()?;
160
161        let mut current = Vec::<&Block>::new();
162        let mut current_tokens = 0usize;
163        let section_segments_start = segments.len();
164
165        for block in section_blocks {
166            let block_tokens = block.token_estimate.max(1);
167            let should_flush = !current.is_empty()
168                && current_tokens + block_tokens > config.max_segment_tokens
169                && !should_keep_with_previous(&current, block);
170
171            if should_flush {
172                push_segment(
173                    &mut segments,
174                    book,
175                    section,
176                    section_index,
177                    &current,
178                    config,
179                );
180                current.clear();
181                current_tokens = 0;
182            }
183
184            current.push(block);
185            current_tokens += block_tokens;
186        }
187
188        if !current.is_empty() {
189            push_segment(
190                &mut segments,
191                book,
192                section,
193                section_index,
194                &current,
195                config,
196            );
197        }
198
199        let total_in_section = segments.len() - section_segments_start;
200        for (offset, segment) in segments[section_segments_start..].iter_mut().enumerate() {
201            segment.metadata.segment_index_in_section = offset;
202            segment.metadata.total_segments_in_section = total_in_section;
203        }
204    }
205
206    apply_context(&mut segments, config.context_tokens);
207
208    Ok(segments)
209}
210
211fn push_segment(
212    segments: &mut Vec<Segment>,
213    book: &Book,
214    section: &Section,
215    section_index: usize,
216    blocks: &[&Block],
217    config: &SegmentationConfig,
218) {
219    let segment_blocks = blocks
220        .iter()
221        .map(|block| {
222            let mut spans = block
223                .protected_spans
224                .iter()
225                .map(|span| span.text.clone())
226                .collect::<Vec<_>>();
227            spans.sort();
228            spans.dedup();
229            SegmentBlock {
230                block_id: block.id.clone(),
231                kind: block_kind_label(block.kind).to_string(),
232                text: block_text(block),
233                text_runs: block
234                    .text_runs
235                    .iter()
236                    .map(|run| SegmentTextRun {
237                        id: run.id.clone(),
238                        text: run.text.clone(),
239                    })
240                    .collect(),
241                protected_spans: spans,
242            }
243        })
244        .collect::<Vec<_>>();
245    let source_text = segment_blocks
246        .iter()
247        .map(|block| block.text.as_str())
248        .collect::<Vec<_>>()
249        .join("\n\n");
250    let checksum = stable_hash(&source_text);
251    let ordinal = segments.len();
252    let first_block = blocks
253        .first()
254        .map(|block| block.id.0.as_str())
255        .unwrap_or("empty");
256    let id = SegmentId(format!(
257        "seg_{}_{}_{}",
258        section.id.0,
259        first_block,
260        &checksum[..12]
261    ));
262
263    let mut preserve_spans = blocks
264        .iter()
265        .flat_map(|block| block.protected_spans.iter().map(|span| span.text.clone()))
266        .collect::<Vec<_>>();
267    preserve_spans.sort();
268    preserve_spans.dedup();
269
270    let mut preserve_markers = blocks
271        .iter()
272        .flat_map(|block| block.inline_marks.iter().map(|mark| mark.id.clone()))
273        .collect::<Vec<_>>();
274    preserve_markers.sort();
275    preserve_markers.dedup();
276
277    let token_estimate = blocks
278        .iter()
279        .map(|block| block.token_estimate.max(1))
280        .sum::<usize>();
281
282    let metadata = SegmentMetadata {
283        book_title: book.metadata.title.clone(),
284        section_title: section.title.clone(),
285        section_index,
286        segment_index_in_section: 0,
287        total_segments_in_section: 0,
288    };
289
290    segments.push(Segment {
291        id,
292        section_id: section.id.clone(),
293        ordinal,
294        block_ids: blocks.iter().map(|block| block.id.clone()).collect(),
295        source: SegmentSource {
296            text: source_text,
297            blocks: segment_blocks,
298            token_estimate,
299        },
300        context: SegmentContext::default(),
301        metadata,
302        constraints: SegmentConstraints {
303            preserve_markers,
304            preserve_spans,
305            max_tokens: config.max_segment_tokens,
306        },
307        checksum,
308    });
309}
310
311fn apply_context(segments: &mut [Segment], context_tokens: usize) {
312    if context_tokens == 0 {
313        return;
314    }
315
316    let sources = segments
317        .iter()
318        .map(|segment| segment.source.text.clone())
319        .collect::<Vec<_>>();
320
321    for (index, segment) in segments.iter_mut().enumerate() {
322        segment.context.before = index
323            .checked_sub(1)
324            .and_then(|previous| sources.get(previous))
325            .map(|text| tail_words(text, context_tokens));
326        segment.context.after = sources
327            .get(index + 1)
328            .map(|text| head_words(text, context_tokens));
329    }
330}
331
332fn should_keep_with_previous(current: &[&Block], next: &Block) -> bool {
333    let Some(previous) = current.last() else {
334        return false;
335    };
336
337    matches!(previous.kind, crate::ir::BlockKind::Heading(_)) && next.token_estimate <= 80
338}
339
340fn block_text(block: &Block) -> String {
341    block
342        .text_runs
343        .iter()
344        .map(|run| run.text.as_str())
345        .collect::<Vec<_>>()
346        .join("")
347}
348
349fn stable_hash(value: &str) -> String {
350    let digest = Sha256::digest(value.as_bytes());
351    let mut output = String::with_capacity(digest.len() * 2);
352    for byte in digest {
353        use std::fmt::Write as _;
354        write!(&mut output, "{byte:02x}").expect("writing to string should not fail");
355    }
356    output
357}
358
/// Returns the first `max_words` whitespace-separated words of `text`, joined
/// by single spaces. Original spacing and line breaks are not preserved.
fn head_words(text: &str, max_words: usize) -> String {
    let mut head = String::new();
    for word in text.split_whitespace().take(max_words) {
        // Words are never empty, so an empty buffer means "first word".
        if !head.is_empty() {
            head.push(' ');
        }
        head.push_str(word);
    }
    head
}
365
/// Returns the last `max_words` whitespace-separated words of `text`, in
/// their original order and joined by single spaces. Original spacing and
/// line breaks are not preserved.
fn tail_words(text: &str, max_words: usize) -> String {
    // Walk backwards from the end so only the words that are kept get
    // collected, rather than every word of a potentially long text.
    let mut words: Vec<&str> = text.split_whitespace().rev().take(max_words).collect();
    words.reverse();
    words.join(" ")
}
371
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{
        BlockKind, BookFormat, BookId, DomPath, Metadata, Resource, Section, SpineItem, TextRun,
    };

    /// Segmentation is deterministic, respects the token limit and never
    /// merges blocks from different sections.
    #[test]
    fn builds_stable_segments_without_crossing_sections() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let first = build_segments(&book, &config).expect("segments should build");
        let second = build_segments(&book, &config).expect("segments should be stable");

        assert_eq!(first.len(), 3);
        assert_eq!(first[0].id, second[0].id);
        assert_eq!(first[1].checksum, second[1].checksum);
        assert_eq!(first[0].section_id.0, "sec_000000");
        assert_eq!(first[1].section_id.0, "sec_000000");
        assert_eq!(first[2].section_id.0, "sec_000001");
        assert_eq!(first[2].block_ids, vec![BlockId("b_000003".to_string())]);
    }

    /// Ordinals run across the whole book while the in-section index and
    /// total restart for every section.
    #[test]
    fn records_ordinals_and_section_positions() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 0,
        };

        let segments = build_segments(&book, &config).expect("segments should build");

        let ordinals = segments.iter().map(|s| s.ordinal).collect::<Vec<_>>();
        assert_eq!(ordinals, vec![0, 1, 2]);

        let positions = segments
            .iter()
            .map(|s| {
                (
                    s.metadata.section_index,
                    s.metadata.segment_index_in_section,
                    s.metadata.total_segments_in_section,
                )
            })
            .collect::<Vec<_>>();
        assert_eq!(positions, vec![(0, 0, 2), (0, 1, 2), (1, 0, 1)]);

        let estimates = segments
            .iter()
            .map(|s| s.source.token_estimate)
            .collect::<Vec<_>>();
        assert_eq!(estimates, vec![5, 10, 4]);
        assert_eq!(segments[0].source.text, "One\n\nshort lead");
    }

    /// Context is taken from the neighbouring segments and limited to the
    /// configured number of words.
    #[test]
    fn attaches_context_from_neighbouring_segments() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 4,
        };

        let segments = build_segments(&book, &config).expect("segments should build");

        assert_eq!(segments[0].context.before, None);
        assert_eq!(
            segments[0].context.after.as_deref(),
            Some("this paragraph forces a")
        );
        assert_eq!(
            segments[1].context.before.as_deref(),
            Some("One short lead")
        );
        assert_eq!(
            segments[1].context.after.as_deref(),
            Some("new section must stay")
        );
        assert_eq!(
            segments[2].context.before.as_deref(),
            Some("forces a second segment")
        );
        assert_eq!(segments[2].context.after, None);
    }

    /// A zero context budget leaves every segment without context.
    #[test]
    fn skips_context_when_budget_is_zero() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 0,
        };

        let segments = build_segments(&book, &config).expect("segments should build");

        assert!(
            segments
                .iter()
                .all(|s| s.context.before.is_none() && s.context.after.is_none())
        );
    }

    #[test]
    fn rejects_zero_token_limit() {
        let book = book_with_two_sections();
        let config = SegmentationConfig {
            max_segment_tokens: 0,
            context_tokens: 0,
        };

        assert!(build_segments(&book, &config).is_err());
    }

    /// A section that lists an id without a matching block is invalid input.
    #[test]
    fn rejects_section_referencing_missing_block() {
        let mut book = book_with_two_sections();
        book.sections[1]
            .block_ids
            .push(BlockId("b_missing".to_string()));
        let config = SegmentationConfig {
            max_segment_tokens: 10,
            context_tokens: 0,
        };

        let error = build_segments(&book, &config).expect_err("missing block must be rejected");

        assert!(matches!(error, BookforgeError::InvalidInput(_)));
    }

    #[test]
    fn cache_namespace_changes_when_segmentation_settings_change() {
        let a = compute_cache_namespace(1200, 160, "Balanced", false, "v1");
        let b = compute_cache_namespace(1201, 160, "Balanced", false, "v1");
        let c = compute_cache_namespace(1200, 160, "Balanced", true, "v1");
        let d = compute_cache_namespace(1200, 160, "Balanced", false, "batch_v1");
        let e = compute_cache_namespace(1200, 160, "Balanced", false, "v1");
        let f = compute_cache_namespace(1200, 161, "Balanced", false, "v1");
        let g = compute_cache_namespace(1200, 160, "Fast", false, "v1");

        assert_ne!(a, b, "max_segment_tokens must affect namespace");
        assert_ne!(a, c, "batch_enabled must affect namespace");
        assert_ne!(a, d, "prompt_version must affect namespace");
        assert_eq!(a, e, "namespace is deterministic for identical inputs");
        assert_ne!(a, f, "context_tokens must affect namespace");
        assert_ne!(a, g, "profile must affect namespace");
        assert_eq!(a.len(), 64, "namespace is a hex-encoded SHA-256 digest");
        assert!(
            a.chars()
                .all(|c| c.is_ascii_digit() || ('a'..='f').contains(&c)),
            "namespace is lowercase hex"
        );
    }

    /// Two sections: the first holds a heading and two paragraphs (2, 3 and
    /// 10 estimated tokens), the second a single 4-token paragraph.
    fn book_with_two_sections() -> Book {
        let section_a = SectionId("sec_000000".to_string());
        let section_b = SectionId("sec_000001".to_string());

        Book {
            source_path: None,
            id: BookId("test".to_string()),
            format: BookFormat::Epub,
            metadata: Metadata::default(),
            manifest: vec![Resource {
                id: "chapter".to_string(),
                href: "chapter.xhtml".to_string(),
                media_type: "application/xhtml+xml".to_string(),
                properties: Vec::new(),
            }],
            spine: vec![SpineItem {
                idref: "chapter".to_string(),
                href: Some("chapter.xhtml".to_string()),
                linear: true,
            }],
            sections: vec![
                Section {
                    id: section_a.clone(),
                    href: "chapter.xhtml".to_string(),
                    spine_index: 0,
                    title: Some("One".to_string()),
                    heading_level: Some(1),
                    block_ids: vec![
                        BlockId("b_000000".to_string()),
                        BlockId("b_000001".to_string()),
                        BlockId("b_000002".to_string()),
                    ],
                    prev: None,
                    next: Some(section_b.clone()),
                },
                Section {
                    id: section_b.clone(),
                    href: "chapter2.xhtml".to_string(),
                    spine_index: 1,
                    title: None,
                    heading_level: None,
                    block_ids: vec![BlockId("b_000003".to_string())],
                    prev: Some(section_a.clone()),
                    next: None,
                },
            ],
            blocks: vec![
                block("b_000000", &section_a, BlockKind::Heading(1), "One", 2),
                block(
                    "b_000001",
                    &section_a,
                    BlockKind::Paragraph,
                    "short lead",
                    3,
                ),
                block(
                    "b_000002",
                    &section_a,
                    BlockKind::Paragraph,
                    "this paragraph forces a second segment",
                    10,
                ),
                block(
                    "b_000003",
                    &section_b,
                    BlockKind::Paragraph,
                    "new section must stay separate",
                    4,
                ),
            ],
        }
    }

    /// Builds a block with a single text run and no marks or protected spans.
    fn block(
        id: &str,
        section_id: &SectionId,
        kind: BlockKind,
        text: &str,
        token_estimate: usize,
    ) -> Block {
        Block {
            id: BlockId(id.to_string()),
            section_id: section_id.clone(),
            kind,
            dom_path: DomPath(vec![0]),
            text_runs: vec![TextRun {
                id: "r0".to_string(),
                text: text.to_string(),
            }],
            inline_marks: Vec::new(),
            protected_spans: Vec::new(),
            token_estimate,
        }
    }
}