1#[cfg(feature = "semantic")]
8use serde::{Deserialize, Serialize};
9#[cfg(feature = "semantic")]
10use std::collections::BTreeMap;
11
12use crate::pipeline::element::{Element, ElementBBox};
13use crate::pipeline::hybrid_chunking::split_into_sentences;
14
/// Typography and confidence summary computed over the elements of one chunk.
///
/// Produced by [`Aggregates::from_elements`]; every weight is measured in
/// `char`s of element text.
pub(crate) struct Aggregates {
    /// Font name that accounts for the most characters, or `None` when no
    /// element reports a font name.
    pub dominant_font: Option<String>,
    /// Font size that accounts for the most characters (sizes closer than 0.1
    /// to an already-seen size are pooled with it), or `None` when no element
    /// reports a size.
    pub dominant_font_size: Option<f64>,
    /// `true` when strictly more than half of all characters belong to bold elements.
    pub is_bold: bool,
    /// `true` when strictly more than half of all characters belong to italic elements.
    pub is_italic: bool,
    /// Lowest element confidence, capped at `1.0`; `0.0` when there are no elements.
    pub min_confidence: f32,
}
23
24impl Aggregates {
25 pub(crate) fn from_elements(elements: &[Element]) -> Self {
26 let mut font_weight: Vec<(String, usize)> = Vec::new();
27 let mut size_weight: Vec<(f64, usize)> = Vec::new();
28 let mut bold_chars = 0usize;
29 let mut italic_chars = 0usize;
30 let mut total_chars = 0usize;
31 let mut min_conf = 1.0f32;
32
33 for e in elements {
34 let w = e.text().chars().count();
35 total_chars += w;
36 let meta = e.metadata();
37 if let Some(f) = &meta.font_name {
38 match font_weight.iter_mut().find(|(name, _)| name == f) {
39 Some((_, c)) => *c += w,
40 None => font_weight.push((f.clone(), w)),
41 }
42 }
43 if let Some(s) = meta.font_size {
44 match size_weight.iter_mut().find(|(sz, _)| (*sz - s).abs() < 0.1) {
45 Some((_, c)) => *c += w,
46 None => size_weight.push((s, w)),
47 }
48 }
49 if meta.is_bold {
50 bold_chars += w;
51 }
52 if meta.is_italic {
53 italic_chars += w;
54 }
55 min_conf = min_conf.min(meta.confidence as f32);
56 }
57
58 let dominant_font = font_weight
59 .into_iter()
60 .max_by_key(|(_, c)| *c)
61 .map(|(name, _)| name);
62 let dominant_font_size = size_weight
63 .into_iter()
64 .max_by_key(|(_, c)| *c)
65 .map(|(sz, _)| sz);
66
67 Self {
68 dominant_font,
69 dominant_font_size,
70 is_bold: total_chars > 0 && bold_chars * 2 > total_chars,
71 is_italic: total_chars > 0 && italic_chars * 2 > total_chars,
72 min_confidence: if elements.is_empty() { 0.0 } else { min_conf },
73 }
74 }
75}
76
/// Flags describing which kinds of content a chunk contains.
///
/// All flags default to `false`; see `content_type_flags` for how they are
/// derived from a chunk's elements.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ContentTypeFlags {
    /// At least one element is an `Element::Table`.
    pub has_table: bool,
    /// At least one element is an `Element::ListItem`.
    pub has_list: bool,
    /// At least one element is an `Element::CodeBlock`.
    pub has_code: bool,
    /// The chunk is non-empty and every element is an `Element::Title`.
    pub heading_only: bool,
}
90
/// Document-level provenance that can be attached to a chunk.
///
/// Every field is optional and defaults to `None`. The struct is
/// `#[non_exhaustive]`, so code outside this crate cannot build it with a
/// struct literal; [`DocumentSource::with_file`] and `Default` are the
/// constructors visible here.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct DocumentSource {
    /// Document title, if known.
    pub title: Option<String>,
    /// Document author, if known.
    pub author: Option<String>,
    /// Creation date as a string; the format is not fixed by this type.
    pub creation_date: Option<String>,
    /// Name of the file the document came from, if known.
    pub filename: Option<String>,
    /// Hash identifying the document.
    // NOTE(review): presumably the same value passed as `doc_hash` when chunk
    // ids are built — confirm against the caller.
    pub doc_hash: Option<String>,
    /// Total number of pages in the document, if known.
    pub total_pages: Option<u32>,
}
109
110impl DocumentSource {
111 pub fn with_file(filename: Option<String>, doc_hash: Option<String>) -> Self {
117 Self {
118 filename,
119 doc_hash,
120 ..Default::default()
121 }
122 }
123}
124
/// The area a chunk occupies on a single page.
///
/// `page_anchor` produces one region per page touched by a chunk.
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct PageRegion {
    /// Page number, as reported by the element metadata.
    pub page: u32,
    /// Union of the bounding boxes of the chunk's elements on this page.
    pub bbox: ElementBBox,
}
137
/// Metadata attached to one chunk of extracted document content.
///
/// `ChunkMetadata::from_elements` fills every field except the neighbour links
/// (set by `link_chunks`) and `source` (left as `None` there). `Default` yields
/// an all-empty record with `min_confidence == 0.0` and an empty `chunk_id`.
#[derive(Debug, Clone, Default, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct ChunkMetadata {
    /// Heading path copied from the chunk's first element; empty when the chunk
    /// has no elements.
    pub heading_path: Vec<String>,
    /// Font name that accounts for the most characters in the chunk.
    pub dominant_font: Option<String>,
    /// Font size that accounts for the most characters in the chunk.
    pub dominant_font_size: Option<f64>,
    /// `true` when more than half of the chunk's characters are bold.
    pub is_bold: bool,
    /// `true` when more than half of the chunk's characters are italic.
    pub is_italic: bool,
    /// Lowest confidence among the chunk's elements (`0.0` when there are none).
    pub min_confidence: f32,
    /// Which kinds of content (table, list, code, headings only) the chunk contains.
    pub content_types: ContentTypeFlags,
    /// Number of `char`s in the chunk text.
    pub char_count: usize,
    /// Number of whitespace-separated words in the chunk text.
    pub word_count: usize,
    /// Number of sentences reported by `split_into_sentences`; `0` for blank text.
    pub sentence_count: usize,
    /// Detected language code (e.g. `"eng"`). Always `None` when the
    /// `language-detection` feature is off or detection yields nothing.
    pub language: Option<String>,
    /// Confidence reported by the language detector, when a language was detected.
    pub language_confidence: Option<f32>,
    /// Whether the detector considers the detection reliable, when a language
    /// was detected.
    pub language_reliable: Option<bool>,
    /// Identifier of the form `"{doc_id}:{index}"`; see `content_chunk_id`.
    pub chunk_id: String,
    /// `chunk_id` of the preceding chunk; `None` for the first chunk or before
    /// `link_chunks` has run.
    pub prev_chunk_id: Option<String>,
    /// `chunk_id` of the following chunk; `None` for the last chunk or before
    /// `link_chunks` has run.
    pub next_chunk_id: Option<String>,
    /// Document-level provenance. `from_elements` leaves this as `None`.
    // NOTE(review): presumably attached later by the caller — confirm.
    pub source: Option<DocumentSource>,
    /// `(lowest page, highest page)` touched by the chunk's elements; `None`
    /// when the chunk has no elements.
    pub page_span: Option<(u32, u32)>,
    /// One region per touched page, ordered by ascending page number.
    pub page_regions: Vec<PageRegion>,
    /// Row count of the table with the most rows in the chunk; `None` when the
    /// chunk has no table.
    pub table_rows: Option<usize>,
    /// Width of the widest row of that same table; `None` when the chunk has no
    /// table.
    pub table_cols: Option<usize>,
    /// Free-form extension map from string keys to JSON values. Only present
    /// with the `semantic` feature; omitted from serialized output when empty
    /// and defaulted to empty when missing on input.
    #[cfg(feature = "semantic")]
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub extra: BTreeMap<String, serde_json::Value>,
}
200
201use sha2::{Digest, Sha256};
202
203impl ChunkMetadata {
204 pub(crate) fn from_elements(
209 elements: &[Element],
210 text: &str,
211 full_text: &str,
212 chunk_index: usize,
213 doc_hash: Option<&str>,
214 ) -> Self {
215 let agg = Aggregates::from_elements(elements);
216 let heading_path = elements
217 .first()
218 .map(|e| e.metadata().heading_path.clone())
219 .unwrap_or_default();
220 let (page_span, page_regions) = page_anchor(elements);
221 let (table_rows, table_cols) = table_dims(elements);
222 #[cfg(feature = "language-detection")]
224 let (language, language_confidence, language_reliable) = match detect_language_full(text) {
225 Some((code, conf, reliable)) => (Some(code), Some(conf), Some(reliable)),
226 None => (None, None, None),
227 };
228 #[cfg(not(feature = "language-detection"))]
229 let (language, language_confidence, language_reliable): (
230 Option<String>,
231 Option<f32>,
232 Option<bool>,
233 ) = (None, None, None);
234 ChunkMetadata {
235 heading_path,
236 dominant_font: agg.dominant_font,
237 dominant_font_size: agg.dominant_font_size,
238 is_bold: agg.is_bold,
239 is_italic: agg.is_italic,
240 min_confidence: agg.min_confidence,
241 content_types: content_type_flags(elements),
242 char_count: char_count(text),
243 word_count: word_count(text),
244 sentence_count: sentence_count(text),
245 language,
246 language_confidence,
247 language_reliable,
248 chunk_id: content_chunk_id(doc_hash, chunk_index, full_text),
249 prev_chunk_id: None,
250 next_chunk_id: None,
251 source: None,
252 page_span,
253 page_regions,
254 table_rows,
255 table_cols,
256 #[cfg(feature = "semantic")]
257 extra: BTreeMap::new(),
258 }
259 }
260}
261
262fn table_dims(elements: &[Element]) -> (Option<usize>, Option<usize>) {
265 elements
266 .iter()
267 .filter_map(|e| match e {
268 Element::Table(t) => Some(&t.rows),
269 _ => None,
270 })
271 .max_by_key(|rows| rows.len())
272 .map(|rows| {
273 let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0);
274 (Some(rows.len()), Some(cols))
275 })
276 .unwrap_or((None, None))
277}
278
279fn union_bbox(a: ElementBBox, b: ElementBBox) -> ElementBBox {
281 let x = a.x.min(b.x);
282 let y = a.y.min(b.y);
283 let right = a.right().max(b.right());
284 let top = a.top().max(b.top());
285 ElementBBox::new(x, y, right - x, top - y)
286}
287
288fn page_anchor(elements: &[Element]) -> (Option<(u32, u32)>, Vec<PageRegion>) {
292 let mut by_page: Vec<(u32, ElementBBox)> = Vec::new();
293 for e in elements {
294 let page = e.metadata().page;
295 let bbox = *e.bbox();
296 match by_page.iter_mut().find(|(p, _)| *p == page) {
297 Some(slot) => slot.1 = union_bbox(slot.1, bbox),
298 None => by_page.push((page, bbox)),
299 }
300 }
301 if by_page.is_empty() {
302 return (None, Vec::new());
303 }
304 by_page.sort_by_key(|(p, _)| *p);
305 let span = (by_page.first().unwrap().0, by_page.last().unwrap().0);
306 let regions = by_page
307 .into_iter()
308 .map(|(page, bbox)| PageRegion { page, bbox })
309 .collect();
310 (Some(span), regions)
311}
312
313pub(crate) fn link_chunks(chunks: &mut [crate::pipeline::RagChunk]) {
315 let ids: Vec<String> = chunks.iter().map(|c| c.metadata.chunk_id.clone()).collect();
316 for (i, c) in chunks.iter_mut().enumerate() {
317 c.metadata.prev_chunk_id = if i > 0 {
318 Some(ids[i - 1].clone())
319 } else {
320 None
321 };
322 c.metadata.next_chunk_id = ids.get(i + 1).cloned();
323 }
324}
325
/// Detects the language of `text` and returns its code only.
///
/// Returns `None` under the same conditions as `detect_language_full`
/// (blank text, or no detection result). Only available with the
/// `language-detection` feature.
#[cfg(feature = "language-detection")]
pub fn detect_language(text: &str) -> Option<String> {
    let (code, _confidence, _reliable) = detect_language_full(text)?;
    Some(code)
}
336
/// Detects the language of `text` with `whatlang`.
///
/// Returns `(language code, confidence, is_reliable)`, or `None` when `text`
/// is empty or whitespace-only or when `whatlang` produces no result. The
/// confidence is narrowed to `f32`.
#[cfg(feature = "language-detection")]
pub(crate) fn detect_language_full(text: &str) -> Option<(String, f32, bool)> {
    if text.trim().is_empty() {
        return None;
    }
    let info = whatlang::detect(text)?;
    let code = info.lang().code().to_string();
    let confidence = info.confidence() as f32;
    Some((code, confidence, info.is_reliable()))
}
354
355pub(crate) fn content_chunk_id(doc_hash: Option<&str>, index: usize, full_text: &str) -> String {
358 let doc_id = match doc_hash {
359 Some(h) => h.to_string(),
360 None => {
361 let mut hasher = Sha256::new();
362 hasher.update(full_text.as_bytes());
363 let digest = hasher.finalize();
364 digest[..8]
365 .iter()
366 .map(|b| format!("{b:02x}"))
367 .collect::<String>()
368 }
369 };
370 format!("{doc_id}:{index}")
371}
372
373pub(crate) fn content_type_flags(elements: &[Element]) -> ContentTypeFlags {
374 let mut flags = ContentTypeFlags::default();
375 let mut all_titles = !elements.is_empty();
376 for e in elements {
377 match e {
378 Element::Table(_) => flags.has_table = true,
379 Element::ListItem(_) => flags.has_list = true,
380 Element::CodeBlock(_) => flags.has_code = true,
381 _ => {}
382 }
383 if !matches!(e, Element::Title(_)) {
384 all_titles = false;
385 }
386 }
387 flags.heading_only = all_titles;
388 flags
389}
390
/// Counts the Unicode scalar values (`char`s) in `text`, not its bytes.
pub(crate) fn char_count(text: &str) -> usize {
    let scalars = text.chars();
    scalars.count()
}
394
/// Counts the words in `text`, where a word is a maximal run of
/// non-whitespace characters.
pub(crate) fn word_count(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}
398
399pub(crate) fn sentence_count(text: &str) -> usize {
400 if text.trim().is_empty() {
401 return 0;
402 }
403 split_into_sentences(text).len()
404}
405
// Unit tests for the chunk-metadata helpers defined in this module.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pipeline::element::{Element, ElementData, ElementMetadata};

    // Builds a table element with no rows and default metadata.
    fn table_el() -> Element {
        Element::Table(crate::pipeline::element::TableElementData {
            rows: vec![],
            metadata: crate::pipeline::element::ElementMetadata::default(),
        })
    }

    // A paragraph plus a table: only the table flag is set, and the text
    // counters report the expected character/word/sentence totals.
    #[test]
    fn content_types_and_counts() {
        let els = vec![
            para("Hello world. Second sentence!", "F", 10.0, false, 1.0),
            table_el(),
        ];
        let flags = content_type_flags(&els);
        assert!(flags.has_table);
        assert!(!flags.has_list);
        assert!(!flags.heading_only);

        let text = "Hello world. Second sentence!";
        assert_eq!(char_count(text), text.chars().count());
        assert_eq!(word_count(text), 4);
        assert_eq!(sentence_count(text), 2);
    }

    // A chunk made solely of title elements is flagged as heading-only.
    #[test]
    fn heading_only_when_all_titles() {
        let d = crate::pipeline::element::ElementData {
            text: "Title".to_string(),
            metadata: crate::pipeline::element::ElementMetadata::default(),
        };
        let els = vec![Element::Title(d)];
        assert!(content_type_flags(&els).heading_only);
    }

    // Builds a paragraph element with the given text, font name, font size,
    // bold flag and confidence; all other metadata stays at its default.
    fn para(text: &str, font: &str, size: f64, bold: bool, conf: f64) -> Element {
        let metadata = ElementMetadata {
            font_name: Some(font.to_string()),
            font_size: Some(size),
            is_bold: bold,
            confidence: conf,
            ..ElementMetadata::default()
        };
        Element::Paragraph(ElementData {
            text: text.to_string(),
            metadata,
        })
    }

    // Font, size and boldness are weighted by character count; the confidence
    // is the minimum across elements.
    #[test]
    fn aggregate_picks_char_weighted_dominant_font_and_min_confidence() {
        let els = vec![
            // 4 chars of bold Helvetica 12pt outweigh 2 chars of regular Times 10pt.
            para("aaaa", "Helvetica", 12.0, true, 0.9),
            para("bb", "Times", 10.0, false, 0.5),
        ];
        let agg = Aggregates::from_elements(&els);
        assert_eq!(agg.dominant_font.as_deref(), Some("Helvetica"));
        assert_eq!(agg.dominant_font_size, Some(12.0));
        assert!(agg.is_bold, "4 bold chars vs 2 non-bold → bold majority");
        assert!((agg.min_confidence - 0.5).abs() < 1e-6);
    }

    // Chunk ids are stable for identical input, end in ":{index}", use a
    // 16-hex-character prefix when no document hash is supplied, and use the
    // document hash verbatim when one is.
    #[test]
    fn chunk_id_is_deterministic_and_prefixed() {
        let a = content_chunk_id(None, 0, "the quick brown fox");
        let b = content_chunk_id(None, 0, "the quick brown fox");
        assert_eq!(a, b, "same text + index → same id");
        assert!(a.ends_with(":0"));
        assert_eq!(
            a.split(':').next().unwrap().len(),
            16,
            "hashless chunk_id prefix must be 16 hex chars (8 bytes)"
        );

        let with_hash = content_chunk_id(Some("dochash123"), 7, "ignored when hash present");
        assert_eq!(with_hash, "dochash123:7");

        let other = content_chunk_id(None, 0, "different text");
        assert_ne!(a, other);
    }

    // `Default` yields an all-empty record.
    #[test]
    fn chunk_metadata_default_is_empty() {
        let m = ChunkMetadata::default();
        assert!(m.heading_path.is_empty());
        assert_eq!(m.dominant_font, None);
        assert!(!m.is_bold);
        assert_eq!(m.min_confidence, 0.0);
        assert!(!m.content_types.has_table);
        assert_eq!(m.char_count, 0);
        assert_eq!(m.language, None);
        assert_eq!(m.language_confidence, None);
        assert_eq!(m.language_reliable, None);
        assert_eq!(m.chunk_id, "");
        assert!(m.source.is_none());
        assert_eq!(m.page_span, None);
        assert!(m.page_regions.is_empty());
        assert_eq!(m.table_rows, None);
        assert_eq!(m.table_cols, None);
    }

    // `with_file` sets only the file name and hash; everything else stays `None`.
    #[test]
    fn document_source_with_file_sets_only_supplied_fields() {
        let s = DocumentSource::with_file(Some("doc.pdf".to_string()), Some("h7".to_string()));
        assert_eq!(s.filename.as_deref(), Some("doc.pdf"));
        assert_eq!(s.doc_hash.as_deref(), Some("h7"));
        assert_eq!(s.title, None);
        assert_eq!(s.author, None);
        assert_eq!(s.creation_date, None);
        assert_eq!(s.total_pages, None);

        let empty = DocumentSource::with_file(None, None);
        assert_eq!(empty, DocumentSource::default());
    }

    // End-to-end construction: aggregates, counts and chunk id are all wired
    // through `ChunkMetadata::from_elements`, and `source` stays unset.
    #[test]
    fn build_metadata_from_chunk_elements() {
        let els = vec![
            para("aaaa", "Helvetica", 12.0, true, 0.8),
            para("bb. cc.", "Helvetica", 12.0, false, 0.6),
        ];
        let text = "aaaa\nbb. cc.";
        let m = ChunkMetadata::from_elements(&els, text, text, 3, None);
        assert_eq!(m.dominant_font.as_deref(), Some("Helvetica"));
        assert!((m.min_confidence - 0.6).abs() < 1e-6);
        assert_eq!(m.char_count, text.chars().count());
        assert_eq!(m.chunk_id, content_chunk_id(None, 3, text));
        assert!(m.source.is_none());
        // Without the language-detection feature no language is ever reported.
        #[cfg(not(feature = "language-detection"))]
        assert_eq!(m.language, None);
    }

    // Builds a paragraph placed on `page` with a bounding box at (x, y) of
    // size w × h.
    fn el_at(text: &str, page: u32, x: f64, y: f64, w: f64, h: f64) -> Element {
        Element::Paragraph(ElementData {
            text: text.to_string(),
            metadata: ElementMetadata {
                page,
                bbox: crate::pipeline::element::ElementBBox::new(x, y, w, h),
                ..ElementMetadata::default()
            },
        })
    }

    // Elements on two pages produce a (1, 2) span and one region per page whose
    // box is the union of that page's element boxes.
    #[test]
    fn citation_anchor_page_span_and_per_page_union_bbox() {
        let els = vec![
            el_at("a", 1, 10.0, 700.0, 100.0, 20.0), // page 1: x 10..110, y 700..720
            el_at("b", 1, 50.0, 600.0, 200.0, 10.0), // page 1: x 50..250, y 600..610
            el_at("c", 2, 30.0, 500.0, 40.0, 40.0), // page 2: x 30..70, y 500..540
        ];
        let text = "a\nb\nc";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);

        assert_eq!(m.page_span, Some((1, 2)));
        assert_eq!(m.page_regions.len(), 2);
        assert_eq!(m.page_regions[0].page, 1);
        assert_eq!(m.page_regions[1].page, 2);

        // Page 1: union of elements "a" and "b".
        let p1 = &m.page_regions[0].bbox;
        assert_eq!(p1.x, 10.0);
        assert_eq!(p1.y, 600.0);
        assert_eq!(p1.right(), 250.0);
        assert_eq!(p1.top(), 720.0);

        // Page 2: just element "c".
        let p2 = &m.page_regions[1].bbox;
        assert_eq!(p2.x, 30.0);
        assert_eq!(p2.right(), 70.0);
        assert_eq!(p2.top(), 540.0);
    }

    // No elements means no page span and no regions.
    #[test]
    fn citation_anchor_empty_for_no_elements() {
        let m = ChunkMetadata::from_elements(&[], "", "", 0, None);
        assert_eq!(m.page_span, None);
        assert!(m.page_regions.is_empty());
    }

    // With language detection enabled, a full English sentence yields a code,
    // a confidence in (0, 1] and a reliable flag.
    #[cfg(feature = "language-detection")]
    #[test]
    fn language_reliability_populated_alongside_code() {
        let els = vec![para("x", "F", 10.0, false, 1.0)];
        let text =
            "The annual report summarizes the financial performance of the company over the year.";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);
        assert_eq!(m.language.as_deref(), Some("eng"));
        let conf = m
            .language_confidence
            .expect("confidence present when a language is detected");
        assert!(
            conf > 0.0 && conf <= 1.0,
            "confidence must be in (0, 1], got {conf}"
        );
        assert_eq!(
            m.language_reliable,
            Some(true),
            "a full English sentence must be a reliable detection"
        );
    }

    // Empty text produces no language information at all.
    #[cfg(feature = "language-detection")]
    #[test]
    fn language_reliability_none_for_empty_text() {
        let m = ChunkMetadata::from_elements(&[], "", "", 0, None);
        assert_eq!(m.language, None);
        assert_eq!(m.language_confidence, None);
        assert_eq!(m.language_reliable, None);
    }

    // Builds a table element from rows of string cells, with default metadata.
    fn table_with(rows: Vec<Vec<&str>>) -> Element {
        Element::Table(crate::pipeline::element::TableElementData {
            rows: rows
                .into_iter()
                .map(|r| r.into_iter().map(String::from).collect())
                .collect(),
            metadata: ElementMetadata::default(),
        })
    }

    // With several tables in a chunk, the dimensions come from the one with the
    // most rows.
    #[test]
    fn table_dims_from_largest_table() {
        let small = table_with(vec![vec!["a", "b"]]); // 1 row × 2 cols
        let big = table_with(vec![vec!["a"], vec!["b"], vec!["c"]]); // 3 rows × 1 col
        let els = vec![para("x", "F", 10.0, false, 1.0), small, big];
        let text = "x";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);
        assert_eq!(m.table_rows, Some(3));
        assert_eq!(m.table_cols, Some(1));
    }

    // For a ragged table the column count is the width of the widest row.
    #[test]
    fn table_cols_uses_widest_row() {
        let ragged = table_with(vec![vec!["a", "b"], vec!["c", "d", "e", "f"]]);
        let m = ChunkMetadata::from_elements(&[ragged], "t", "t", 0, None);
        assert_eq!(m.table_rows, Some(2));
        assert_eq!(m.table_cols, Some(4));
    }

    // Chunks without a table report no table dimensions.
    #[test]
    fn table_dims_none_without_table() {
        let els = vec![para("just prose", "F", 10.0, false, 1.0)];
        let m = ChunkMetadata::from_elements(&els, "just prose", "just prose", 0, None);
        assert_eq!(m.table_rows, None);
        assert_eq!(m.table_cols, None);
    }

    // The `extra` map is empty by default, is skipped in JSON while empty, and
    // round-trips through serde once populated.
    #[cfg(feature = "semantic")]
    #[test]
    fn extra_bag_defaults_empty_and_roundtrips() {
        let mut m = ChunkMetadata::default();
        assert!(m.extra.is_empty(), "extra defaults to empty");

        let json_empty = serde_json::to_string(&m).unwrap();
        assert!(
            !json_empty.contains("\"extra\""),
            "empty extra must be skipped in JSON"
        );

        m.extra
            .insert("legal.clause_number".to_string(), serde_json::json!("3.2"));
        m.extra.insert(
            "legal.defined_terms".to_string(),
            serde_json::json!(["Party", "Agreement"]),
        );
        let json = serde_json::to_string(&m).unwrap();
        assert!(json.contains("\"extra\""));
        let back: ChunkMetadata = serde_json::from_str(&json).unwrap();
        assert_eq!(back.extra, m.extra, "extra survives round-trip");
        assert_eq!(
            back.extra.get("legal.clause_number").unwrap(),
            &serde_json::json!("3.2")
        );
    }
}