Skip to main content

oxidize_pdf/pipeline/
chunk_metadata.rs

//! Chunk-level metadata for RAG output.
//!
//! Surfaces data already computed by the partitioner (heading hierarchy, font,
//! style, confidence) plus new retrieval signals (content-type flags, counts,
//! stable IDs, language) and optional source-document metadata.
6
7#[cfg(feature = "semantic")]
8use serde::{Deserialize, Serialize};
9#[cfg(feature = "semantic")]
10use std::collections::BTreeMap;
11
12use crate::pipeline::element::{Element, ElementBBox};
13use crate::pipeline::hybrid_chunking::split_into_sentences;
14
15/// Char-weighted aggregates over a chunk's elements.
16pub(crate) struct Aggregates {
17    pub dominant_font: Option<String>,
18    pub dominant_font_size: Option<f64>,
19    pub is_bold: bool,
20    pub is_italic: bool,
21    pub min_confidence: f32,
22}
23
24impl Aggregates {
25    pub(crate) fn from_elements(elements: &[Element]) -> Self {
26        let mut font_weight: Vec<(String, usize)> = Vec::new();
27        let mut size_weight: Vec<(f64, usize)> = Vec::new();
28        let mut bold_chars = 0usize;
29        let mut italic_chars = 0usize;
30        let mut total_chars = 0usize;
31        let mut min_conf = 1.0f32;
32
33        for e in elements {
34            let w = e.text().chars().count();
35            total_chars += w;
36            let meta = e.metadata();
37            if let Some(f) = &meta.font_name {
38                match font_weight.iter_mut().find(|(name, _)| name == f) {
39                    Some((_, c)) => *c += w,
40                    None => font_weight.push((f.clone(), w)),
41                }
42            }
43            if let Some(s) = meta.font_size {
44                match size_weight.iter_mut().find(|(sz, _)| (*sz - s).abs() < 0.1) {
45                    Some((_, c)) => *c += w,
46                    None => size_weight.push((s, w)),
47                }
48            }
49            if meta.is_bold {
50                bold_chars += w;
51            }
52            if meta.is_italic {
53                italic_chars += w;
54            }
55            min_conf = min_conf.min(meta.confidence as f32);
56        }
57
58        let dominant_font = font_weight
59            .into_iter()
60            .max_by_key(|(_, c)| *c)
61            .map(|(name, _)| name);
62        let dominant_font_size = size_weight
63            .into_iter()
64            .max_by_key(|(_, c)| *c)
65            .map(|(sz, _)| sz);
66
67        Self {
68            dominant_font,
69            dominant_font_size,
70            is_bold: total_chars > 0 && bold_chars * 2 > total_chars,
71            is_italic: total_chars > 0 && italic_chars * 2 > total_chars,
72            min_confidence: if elements.is_empty() { 0.0 } else { min_conf },
73        }
74    }
75}
76
/// Set of yes/no markers recording which kinds of content a chunk holds.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ContentTypeFlags {
    /// Set when one or more table elements are part of the chunk.
    pub has_table: bool,
    /// Set when one or more list items are part of the chunk.
    pub has_list: bool,
    /// Set when one or more code blocks are part of the chunk.
    pub has_code: bool,
    /// Set when every element of the chunk is a heading (title) element.
    pub heading_only: bool,
}
90
/// Describes the document a chunk was extracted from.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct DocumentSource {
    /// Title recorded in the document's info dictionary, when there is one.
    pub title: Option<String>,
    /// Author recorded in the document's info dictionary, when there is one.
    pub author: Option<String>,
    /// Creation-date string recorded in the info dictionary, when there is one.
    pub creation_date: Option<String>,
    /// Name of the originating file. Supplied by the caller, because the
    /// pipeline itself does not know it.
    pub filename: Option<String>,
    /// Stable hash of the document. Supplied by the caller and used as the
    /// prefix of each chunk_id.
    pub doc_hash: Option<String>,
    /// Number of pages in the source document.
    pub total_pages: Option<u32>,
}

impl DocumentSource {
    /// Build a source carrying only the caller-supplied `filename` and
    /// `doc_hash`.
    ///
    /// `title`, `author`, `creation_date` and `total_pages` start out as
    /// `None` so that
    /// [`rag_chunks_with_source`](crate::parser::PdfDocument::rag_chunks_with_source)
    /// can auto-fill them from the info dictionary. This constructor exists
    /// because the struct is `#[non_exhaustive]`, which prevents external
    /// callers from writing a struct literal.
    pub fn with_file(filename: Option<String>, doc_hash: Option<String>) -> Self {
        Self {
            title: None,
            author: None,
            creation_date: None,
            filename,
            doc_hash,
            total_pages: None,
        }
    }
}
124
125/// Citation anchor for a chunk on a single page: the axis-aligned union of all
126/// the chunk's element bounding boxes that fall on that page. Lets a RAG
127/// consumer cite back to an exact region of the source PDF.
128#[derive(Debug, Clone, Copy, PartialEq)]
129#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
130#[non_exhaustive]
131pub struct PageRegion {
132    /// Page the region is on (as stored on the elements).
133    pub page: u32,
134    /// Union bounding box of the chunk's elements on this page.
135    pub bbox: ElementBBox,
136}
137
138/// Per-chunk metadata attached to every [`RagChunk`](crate::pipeline::RagChunk).
139#[derive(Debug, Clone, Default, PartialEq)]
140#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
141#[non_exhaustive]
142pub struct ChunkMetadata {
143    /// Full section breadcrumb, root→leaf (e.g. `["1 Intro", "1.2 Scope"]`).
144    pub heading_path: Vec<String>,
145    /// Dominant font (char-weighted majority across the chunk's elements).
146    pub dominant_font: Option<String>,
147    /// Dominant font size (char-weighted majority).
148    pub dominant_font_size: Option<f64>,
149    /// True if the majority of characters are bold.
150    pub is_bold: bool,
151    /// True if the majority of characters are italic.
152    pub is_italic: bool,
153    /// Lowest classification confidence among the chunk's elements.
154    pub min_confidence: f32,
155    /// Content-type flags derived from element types.
156    pub content_types: ContentTypeFlags,
157    /// Character count of the chunk text.
158    pub char_count: usize,
159    /// Whitespace-separated word count.
160    pub word_count: usize,
161    /// Sentence count (uses the chunker's sentence splitter).
162    pub sentence_count: usize,
163    /// Detected language code (ISO 639-3, via `whatlang`); `None` if the
164    /// `language-detection` feature is off or detection is inconclusive.
165    pub language: Option<String>,
166    /// Detection confidence in `(0, 1]` for [`language`](Self::language);
167    /// `None` when no language was detected.
168    pub language_confidence: Option<f32>,
169    /// Whether `whatlang` considered the [`language`](Self::language)
170    /// detection reliable. Consumers should gate language-based routing on
171    /// this; `None` when no language was detected.
172    pub language_reliable: Option<bool>,
173    /// Deterministic, stable identifier for this chunk.
174    pub chunk_id: String,
175    /// Identifier of the previous chunk in the document, if any.
176    pub prev_chunk_id: Option<String>,
177    /// Identifier of the next chunk in the document, if any.
178    pub next_chunk_id: Option<String>,
179    /// Source-document metadata, if available.
180    pub source: Option<DocumentSource>,
181    /// First and last page the chunk's elements touch (inclusive), or `None`
182    /// when the chunk has no positioned elements.
183    pub page_span: Option<(u32, u32)>,
184    /// Per-page citation regions (union bbox of the chunk's elements on each
185    /// page), sorted ascending by page. Empty when the chunk has no elements.
186    pub page_regions: Vec<PageRegion>,
187    /// Row count of the chunk's largest table (by row count), or `None` if the
188    /// chunk has no table. Lets a consumer filter/route table-bearing chunks.
189    pub table_rows: Option<usize>,
190    /// Column count (widest row) of the same table reported by
191    /// [`table_rows`](Self::table_rows); `None` when the chunk has no table.
192    pub table_cols: Option<usize>,
193    /// Open extension bag for provider-supplied fields (e.g. a closed analyzer
194    /// stamping `legal.clause_number`). Namespacing keys by provider avoids
195    /// collisions. Serializes nested under `"extra"`; omitted when empty.
196    #[cfg(feature = "semantic")]
197    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
198    pub extra: BTreeMap<String, serde_json::Value>,
199}
200
201use sha2::{Digest, Sha256};
202
203impl ChunkMetadata {
204    /// Build chunk metadata from the chunk's elements and text. `full_text` is
205    /// used for the content-hash id; `doc_hash` (when `Some`) overrides it.
206    /// Language is detected here when the `language-detection` feature is on;
207    /// prev/next links are filled by a later pass ([`link_chunks`]).
208    pub(crate) fn from_elements(
209        elements: &[Element],
210        text: &str,
211        full_text: &str,
212        chunk_index: usize,
213        doc_hash: Option<&str>,
214    ) -> Self {
215        let agg = Aggregates::from_elements(elements);
216        let heading_path = elements
217            .first()
218            .map(|e| e.metadata().heading_path.clone())
219            .unwrap_or_default();
220        let (page_span, page_regions) = page_anchor(elements);
221        let (table_rows, table_cols) = table_dims(elements);
222        // Detect once; fill code + confidence + reliability together.
223        #[cfg(feature = "language-detection")]
224        let (language, language_confidence, language_reliable) = match detect_language_full(text) {
225            Some((code, conf, reliable)) => (Some(code), Some(conf), Some(reliable)),
226            None => (None, None, None),
227        };
228        #[cfg(not(feature = "language-detection"))]
229        let (language, language_confidence, language_reliable): (
230            Option<String>,
231            Option<f32>,
232            Option<bool>,
233        ) = (None, None, None);
234        ChunkMetadata {
235            heading_path,
236            dominant_font: agg.dominant_font,
237            dominant_font_size: agg.dominant_font_size,
238            is_bold: agg.is_bold,
239            is_italic: agg.is_italic,
240            min_confidence: agg.min_confidence,
241            content_types: content_type_flags(elements),
242            char_count: char_count(text),
243            word_count: word_count(text),
244            sentence_count: sentence_count(text),
245            language,
246            language_confidence,
247            language_reliable,
248            chunk_id: content_chunk_id(doc_hash, chunk_index, full_text),
249            prev_chunk_id: None,
250            next_chunk_id: None,
251            source: None,
252            page_span,
253            page_regions,
254            table_rows,
255            table_cols,
256            #[cfg(feature = "semantic")]
257            extra: BTreeMap::new(),
258        }
259    }
260}
261
262/// Dimensions of the chunk's largest table (by row count): `(rows, widest row)`.
263/// `(None, None)` when the chunk contains no table element.
264fn table_dims(elements: &[Element]) -> (Option<usize>, Option<usize>) {
265    elements
266        .iter()
267        .filter_map(|e| match e {
268            Element::Table(t) => Some(&t.rows),
269            _ => None,
270        })
271        .max_by_key(|rows| rows.len())
272        .map(|rows| {
273            let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0);
274            (Some(rows.len()), Some(cols))
275        })
276        .unwrap_or((None, None))
277}
278
279/// Union of two axis-aligned bounding boxes.
280fn union_bbox(a: ElementBBox, b: ElementBBox) -> ElementBBox {
281    let x = a.x.min(b.x);
282    let y = a.y.min(b.y);
283    let right = a.right().max(b.right());
284    let top = a.top().max(b.top());
285    ElementBBox::new(x, y, right - x, top - y)
286}
287
288/// Compute the chunk's citation anchor: `(page_span, page_regions)`. Groups the
289/// elements by page, unions their bboxes per page, and sorts the regions by
290/// page ascending. Returns `(None, vec![])` for an element-less chunk.
291fn page_anchor(elements: &[Element]) -> (Option<(u32, u32)>, Vec<PageRegion>) {
292    let mut by_page: Vec<(u32, ElementBBox)> = Vec::new();
293    for e in elements {
294        let page = e.metadata().page;
295        let bbox = *e.bbox();
296        match by_page.iter_mut().find(|(p, _)| *p == page) {
297            Some(slot) => slot.1 = union_bbox(slot.1, bbox),
298            None => by_page.push((page, bbox)),
299        }
300    }
301    if by_page.is_empty() {
302        return (None, Vec::new());
303    }
304    by_page.sort_by_key(|(p, _)| *p);
305    let span = (by_page.first().unwrap().0, by_page.last().unwrap().0);
306    let regions = by_page
307        .into_iter()
308        .map(|(page, bbox)| PageRegion { page, bbox })
309        .collect();
310    (Some(span), regions)
311}
312
313/// Fill `prev_chunk_id` / `next_chunk_id` on each chunk from its neighbours' ids.
314pub(crate) fn link_chunks(chunks: &mut [crate::pipeline::RagChunk]) {
315    let ids: Vec<String> = chunks.iter().map(|c| c.metadata.chunk_id.clone()).collect();
316    for (i, c) in chunks.iter_mut().enumerate() {
317        c.metadata.prev_chunk_id = if i > 0 {
318            Some(ids[i - 1].clone())
319        } else {
320            None
321        };
322        c.metadata.next_chunk_id = ids.get(i + 1).cloned();
323    }
324}
325
/// Identify the main language of `text` with `whatlang` and return its
/// ISO 639-3 code (for example `"eng"` or `"spa"`).
///
/// Yields `None` when `text` is empty or only whitespace, and when `whatlang`
/// reports no detection. Detection is best-effort: for short or ambiguous
/// text the returned code may be unreliable.
///
/// Requires the `language-detection` feature.
#[cfg(feature = "language-detection")]
pub fn detect_language(text: &str) -> Option<String> {
    let (code, _confidence, _reliable) = detect_language_full(text)?;
    Some(code)
}
336
/// Invoke `whatlang` a single time and hand back `(code, confidence, reliable)`.
///
/// Yields `None` when `text` is empty or only whitespace, and when no
/// detection is produced. Keeping the call in one place lets
/// [`ChunkMetadata`] fill code, confidence and reliability from one run.
#[cfg(feature = "language-detection")]
pub(crate) fn detect_language_full(text: &str) -> Option<(String, f32, bool)> {
    if text.trim().is_empty() {
        return None;
    }
    let info = whatlang::detect(text)?;
    let code = info.lang().code().to_string();
    let confidence = info.confidence() as f32;
    Some((code, confidence, info.is_reliable()))
}
354
355/// Deterministic chunk id: `<doc_id>:<index>` where `doc_id` is the supplied
356/// `doc_hash` or, absent that, the first 8 bytes of SHA-256(full_text) in hex.
357pub(crate) fn content_chunk_id(doc_hash: Option<&str>, index: usize, full_text: &str) -> String {
358    let doc_id = match doc_hash {
359        Some(h) => h.to_string(),
360        None => {
361            let mut hasher = Sha256::new();
362            hasher.update(full_text.as_bytes());
363            let digest = hasher.finalize();
364            digest[..8]
365                .iter()
366                .map(|b| format!("{b:02x}"))
367                .collect::<String>()
368        }
369    };
370    format!("{doc_id}:{index}")
371}
372
373pub(crate) fn content_type_flags(elements: &[Element]) -> ContentTypeFlags {
374    let mut flags = ContentTypeFlags::default();
375    let mut all_titles = !elements.is_empty();
376    for e in elements {
377        match e {
378            Element::Table(_) => flags.has_table = true,
379            Element::ListItem(_) => flags.has_list = true,
380            Element::CodeBlock(_) => flags.has_code = true,
381            _ => {}
382        }
383        if !matches!(e, Element::Title(_)) {
384            all_titles = false;
385        }
386    }
387    flags.heading_only = all_titles;
388    flags
389}
390
/// Number of `char`s (Unicode scalar values) in `text`; not its byte length.
pub(crate) fn char_count(text: &str) -> usize {
    let scalars = text.chars();
    scalars.count()
}
394
/// Number of words in `text`, where a word is a non-empty run of characters
/// delimited by Unicode whitespace.
pub(crate) fn word_count(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}
398
399pub(crate) fn sentence_count(text: &str) -> usize {
400    if text.trim().is_empty() {
401        return 0;
402    }
403    split_into_sentences(text).len()
404}
405
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pipeline::element::{
        Element, ElementBBox, ElementData, ElementMetadata, TableElementData,
    };

    // ---- Fixture builders -------------------------------------------------

    /// Table element without any rows and with default metadata.
    fn table_el() -> Element {
        Element::Table(TableElementData {
            rows: vec![],
            metadata: ElementMetadata::default(),
        })
    }

    /// Paragraph with the given text, font, size, boldness and confidence.
    fn para(text: &str, font: &str, size: f64, bold: bool, conf: f64) -> Element {
        Element::Paragraph(ElementData {
            text: text.to_string(),
            metadata: ElementMetadata {
                font_name: Some(font.to_string()),
                font_size: Some(size),
                is_bold: bold,
                confidence: conf,
                ..ElementMetadata::default()
            },
        })
    }

    /// Paragraph placed on `page` with the bounding box `(x, y, w, h)`.
    fn el_at(text: &str, page: u32, x: f64, y: f64, w: f64, h: f64) -> Element {
        let metadata = ElementMetadata {
            page,
            bbox: ElementBBox::new(x, y, w, h),
            ..ElementMetadata::default()
        };
        Element::Paragraph(ElementData {
            text: text.to_string(),
            metadata,
        })
    }

    /// Table element whose cells are taken from `rows`.
    fn table_with(rows: Vec<Vec<&str>>) -> Element {
        let rows = rows
            .into_iter()
            .map(|row| row.into_iter().map(String::from).collect())
            .collect();
        Element::Table(TableElementData {
            rows,
            metadata: ElementMetadata::default(),
        })
    }

    // ---- Tests ------------------------------------------------------------

    #[test]
    fn content_types_and_counts() {
        let text = "Hello world. Second sentence!";
        let els = vec![para(text, "F", 10.0, false, 1.0), table_el()];

        let flags = content_type_flags(&els);
        assert!(flags.has_table);
        assert!(!flags.has_list);
        assert!(!flags.heading_only);

        assert_eq!(char_count(text), text.chars().count());
        assert_eq!(word_count(text), 4);
        assert_eq!(sentence_count(text), 2);
    }

    #[test]
    fn heading_only_when_all_titles() {
        let title = Element::Title(ElementData {
            text: "Title".to_string(),
            metadata: ElementMetadata::default(),
        });
        let els = vec![title];
        assert!(content_type_flags(&els).heading_only);
    }

    #[test]
    fn aggregate_picks_char_weighted_dominant_font_and_min_confidence() {
        // 4 chars of bold Helvetica (conf 0.9) against 2 chars of Times (conf 0.5).
        let els = vec![
            para("aaaa", "Helvetica", 12.0, true, 0.9),
            para("bb", "Times", 10.0, false, 0.5),
        ];
        let agg = Aggregates::from_elements(&els);
        assert_eq!(agg.dominant_font.as_deref(), Some("Helvetica"));
        assert_eq!(agg.dominant_font_size, Some(12.0));
        assert!(agg.is_bold, "4 bold chars vs 2 non-bold → bold majority");
        assert!((agg.min_confidence - 0.5).abs() < 1e-6);
    }

    #[test]
    fn chunk_id_is_deterministic_and_prefixed() {
        let first = content_chunk_id(None, 0, "the quick brown fox");
        let second = content_chunk_id(None, 0, "the quick brown fox");
        assert_eq!(first, second, "same text + index → same id");
        assert!(first.ends_with(":0"));
        // Without a doc hash the prefix is 8 bytes of SHA-256, i.e. 16 hex
        // chars. Pinning the width catches an accidental change of the slice.
        assert_eq!(
            first.split(':').next().unwrap().len(),
            16,
            "hashless chunk_id prefix must be 16 hex chars (8 bytes)"
        );

        let with_hash = content_chunk_id(Some("dochash123"), 7, "ignored when hash present");
        assert_eq!(with_hash, "dochash123:7");

        let other = content_chunk_id(None, 0, "different text");
        assert_ne!(first, other);
    }

    #[test]
    fn chunk_metadata_default_is_empty() {
        let m = ChunkMetadata::default();
        assert!(m.heading_path.is_empty());
        assert_eq!(m.dominant_font, None);
        assert!(!m.is_bold);
        assert_eq!(m.min_confidence, 0.0);
        assert!(!m.content_types.has_table);
        assert_eq!(m.char_count, 0);
        assert_eq!(m.language, None);
        assert_eq!(m.language_confidence, None);
        assert_eq!(m.language_reliable, None);
        assert_eq!(m.chunk_id, "");
        assert!(m.source.is_none());
        assert_eq!(m.page_span, None);
        assert!(m.page_regions.is_empty());
        assert_eq!(m.table_rows, None);
        assert_eq!(m.table_cols, None);
    }

    #[test]
    fn document_source_with_file_sets_only_supplied_fields() {
        let s = DocumentSource::with_file(Some("doc.pdf".to_string()), Some("h7".to_string()));
        assert_eq!(s.filename.as_deref(), Some("doc.pdf"));
        assert_eq!(s.doc_hash.as_deref(), Some("h7"));
        // Fields the caller did not pass remain None, ready for the
        // info-dictionary auto-fill pass.
        assert_eq!(s.title, None);
        assert_eq!(s.author, None);
        assert_eq!(s.creation_date, None);
        assert_eq!(s.total_pages, None);

        let empty = DocumentSource::with_file(None, None);
        assert_eq!(empty, DocumentSource::default());
    }

    #[test]
    fn build_metadata_from_chunk_elements() {
        let els = vec![
            para("aaaa", "Helvetica", 12.0, true, 0.8),
            para("bb. cc.", "Helvetica", 12.0, false, 0.6),
        ];
        let text = "aaaa\nbb. cc.";
        let m = ChunkMetadata::from_elements(&els, text, text, 3, None);
        assert_eq!(m.dominant_font.as_deref(), Some("Helvetica"));
        assert!((m.min_confidence - 0.6).abs() < 1e-6);
        assert_eq!(m.char_count, text.chars().count());
        assert_eq!(m.chunk_id, content_chunk_id(None, 3, text));
        assert!(m.source.is_none());
        // With the feature off the field stays None. With it on, detection
        // runs on the chunk text; the exact code is up to whatlang and is
        // deliberately not asserted here.
        #[cfg(not(feature = "language-detection"))]
        assert_eq!(m.language, None);
    }

    #[test]
    fn citation_anchor_page_span_and_per_page_union_bbox() {
        let els = vec![
            el_at("a", 1, 10.0, 700.0, 100.0, 20.0), // page 1: x[10,110] y[700,720]
            el_at("b", 1, 50.0, 600.0, 200.0, 10.0), // page 1: x[50,250] y[600,610]
            el_at("c", 2, 30.0, 500.0, 40.0, 40.0),  // page 2: x[30,70]  y[500,540]
        ];
        let text = "a\nb\nc";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);

        assert_eq!(m.page_span, Some((1, 2)));
        assert_eq!(m.page_regions.len(), 2);
        // Regions come back in ascending page order.
        assert_eq!(m.page_regions[0].page, 1);
        assert_eq!(m.page_regions[1].page, 2);

        // The page-1 region encloses both of its elements.
        let p1 = &m.page_regions[0].bbox;
        assert_eq!(p1.x, 10.0);
        assert_eq!(p1.y, 600.0);
        assert_eq!(p1.right(), 250.0);
        assert_eq!(p1.top(), 720.0);

        // The page-2 region is just its one element's box.
        let p2 = &m.page_regions[1].bbox;
        assert_eq!(p2.x, 30.0);
        assert_eq!(p2.right(), 70.0);
        assert_eq!(p2.top(), 540.0);
    }

    #[test]
    fn citation_anchor_empty_for_no_elements() {
        let m = ChunkMetadata::from_elements(&[], "", "", 0, None);
        assert_eq!(m.page_span, None);
        assert!(m.page_regions.is_empty());
    }

    #[cfg(feature = "language-detection")]
    #[test]
    fn language_reliability_populated_alongside_code() {
        let els = vec![para("x", "F", 10.0, false, 1.0)];
        let text =
            "The annual report summarizes the financial performance of the company over the year.";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);
        assert_eq!(m.language.as_deref(), Some("eng"));
        let conf = m
            .language_confidence
            .expect("confidence present when a language is detected");
        assert!(
            conf > 0.0 && conf <= 1.0,
            "confidence must be in (0, 1], got {conf}"
        );
        assert_eq!(
            m.language_reliable,
            Some(true),
            "a full English sentence must be a reliable detection"
        );
    }

    #[cfg(feature = "language-detection")]
    #[test]
    fn language_reliability_none_for_empty_text() {
        let m = ChunkMetadata::from_elements(&[], "", "", 0, None);
        assert_eq!(m.language, None);
        assert_eq!(m.language_confidence, None);
        assert_eq!(m.language_reliable, None);
    }

    #[test]
    fn table_dims_from_largest_table() {
        let small = table_with(vec![vec!["a", "b"]]); // 1 row x 2 cols
        let big = table_with(vec![vec!["a"], vec!["b"], vec!["c"]]); // 3 rows x 1 col
        let els = vec![para("x", "F", 10.0, false, 1.0), small, big];
        let text = "x";
        let m = ChunkMetadata::from_elements(&els, text, text, 0, None);
        // The table with the most rows is the one reported.
        assert_eq!(m.table_rows, Some(3));
        assert_eq!(m.table_cols, Some(1));
    }

    #[test]
    fn table_cols_uses_widest_row() {
        let ragged = table_with(vec![vec!["a", "b"], vec!["c", "d", "e", "f"]]);
        let m = ChunkMetadata::from_elements(&[ragged], "t", "t", 0, None);
        assert_eq!(m.table_rows, Some(2));
        assert_eq!(m.table_cols, Some(4));
    }

    #[test]
    fn table_dims_none_without_table() {
        let els = vec![para("just prose", "F", 10.0, false, 1.0)];
        let m = ChunkMetadata::from_elements(&els, "just prose", "just prose", 0, None);
        assert_eq!(m.table_rows, None);
        assert_eq!(m.table_cols, None);
    }

    #[cfg(feature = "semantic")]
    #[test]
    fn extra_bag_defaults_empty_and_roundtrips() {
        let mut m = ChunkMetadata::default();
        assert!(m.extra.is_empty(), "extra defaults to empty");

        // An empty bag must not appear in the serialized form.
        let json_empty = serde_json::to_string(&m).unwrap();
        assert!(
            !json_empty.contains("\"extra\""),
            "empty extra must be skipped in JSON"
        );

        // A populated bag must come back unchanged after a round-trip.
        m.extra
            .insert("legal.clause_number".to_string(), serde_json::json!("3.2"));
        m.extra.insert(
            "legal.defined_terms".to_string(),
            serde_json::json!(["Party", "Agreement"]),
        );
        let json = serde_json::to_string(&m).unwrap();
        assert!(json.contains("\"extra\""));
        let back: ChunkMetadata = serde_json::from_str(&json).unwrap();
        assert_eq!(back.extra, m.extra, "extra survives round-trip");
        assert_eq!(
            back.extra.get("legal.clause_number").unwrap(),
            &serde_json::json!("3.2")
        );
    }
}