Skip to main content

rover/extractor/
frontmatter.rs

1//! YAML frontmatter envelope writer.
2//!
3//! Emits the M1 subset of PRD §6.2:
4//!   - url
5//!   - canonical_url (only when different from url)
6//!   - title (when present)
7//!   - fetched_at (RFC 3339, UTC)
8//!   - content_hash (sha256:...)
9//!   - estimated_tokens
10//!   - tokenizer
11//!
12//! M4 expands this with metadata, language, schema_types, tables/images
13//! transformations, etc. As of M3, real tokenizers compute `tokens` upstream
14//! and pass it in via `PageMeta`; the writer no longer estimates.
15
16use jiff::Timestamp;
17use serde::Serialize;
18use sha2::{Digest, Sha256};
19use url::Url;
20
21/// Per-image dimension pair carried alongside `ImageProcessed` annotations.
22#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
23pub struct ImageDims {
24    pub width: u32,
25    pub height: u32,
26}
27
28/// One row of the `images_processed:` frontmatter sidecar (M9). Each `<img>`
29/// the caption pipeline observes produces one entry — either `"captioned"`
30/// or `"skipped"` with a typed reason.
31#[derive(Debug, Clone, Serialize)]
32pub struct ImageProcessed {
33    pub src: String,
34    /// `"captioned"` or `"skipped"`.
35    pub decision: String,
36    /// Lowercased `SkipReason` variant when `decision == "skipped"`.
37    #[serde(skip_serializing_if = "Option::is_none")]
38    pub reason: Option<String>,
39    /// Captioner config-key name when the captioner was attempted.
40    #[serde(skip_serializing_if = "Option::is_none")]
41    pub captioner: Option<String>,
42    /// The generated caption when `decision == "captioned"`.
43    #[serde(skip_serializing_if = "Option::is_none")]
44    pub caption: Option<String>,
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub dimensions: Option<ImageDims>,
47    /// Reported byte length (from Content-Length probe) when known.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub bytes: Option<u64>,
50    /// Human-readable error when the captioner or download failed.
51    #[serde(skip_serializing_if = "Option::is_none")]
52    pub error: Option<String>,
53}
54
55/// Inputs for the M4 frontmatter envelope.
56pub struct PageMeta<'a> {
57    pub url: &'a Url,
58    pub canonical_url: &'a Url,
59    pub title: Option<&'a str>,
60    pub fetched_at: Timestamp,
61    pub body: &'a str,
62    /// Precomputed token count for `body`, in units of `tokenizer_name`.
63    pub tokens: usize,
64    /// Short tokenizer family name (e.g. `"o200k"`). Surfaced in the
65    /// `tokenizer` frontmatter field so consumers know how `tokens` was
66    /// measured.
67    pub tokenizer_name: &'a str,
68    // ---- M4 additions ----
69    pub description: Option<&'a str>,
70    pub author: Option<&'a str>,
71    pub published: Option<&'a str>,
72    pub modified: Option<&'a str>,
73    pub image: Option<&'a str>,
74    pub og_type: Option<&'a str>,
75    pub language: Option<&'a str>,
76    pub schema_types: &'a [String],
77    pub extraction_quality: f32,
78    /// Whether the body was summarized before rendering (CLI `--max-tokens`
79    /// / `--summarize`). Rendered as `summarized: true` when set.
80    pub summarized: bool,
81    /// How the content was obtained when it required a headless render:
82    /// `"on"`, `"spa"`, or `"bot_challenge"`. `None` for a plain HTTP fetch.
83    /// Rendered as `headless_render: <reason>` when set.
84    pub headless_render: Option<&'a str>,
85    pub tables_transformed: &'a [crate::extractor::tables::TableTransform],
86    pub images_seen: usize,
87    pub images_downloaded: usize,
88    pub images_failed: usize,
89    pub images_processed: Vec<ImageProcessed>,
90    /// Guard telemetry. Rendered as a `prompt_injection:` block when `Some`.
91    pub prompt_injection: Option<&'a crate::guard::GuardTelemetry>,
92}
93
94/// Render `meta` as a frontmatter-envelope string followed by `body`.
95pub fn render(meta: &PageMeta<'_>) -> String {
96    let mut buf = String::with_capacity(meta.body.len() + 512);
97    buf.push_str("---\n");
98
99    write_field(&mut buf, "url", meta.url.as_str());
100    if meta.canonical_url != meta.url {
101        write_field(&mut buf, "canonical_url", meta.canonical_url.as_str());
102    }
103    if let Some(t) = meta.title {
104        write_field(&mut buf, "title", t);
105    }
106    write_field(&mut buf, "fetched_at", &meta.fetched_at.to_string());
107
108    let content_hash = sha256_hex(meta.body.as_bytes());
109    let hash_field = format!("sha256:{content_hash}");
110    write_field(&mut buf, "content_hash", &hash_field);
111
112    buf.push_str(&format!("estimated_tokens: {}\n", meta.tokens));
113    write_field(&mut buf, "tokenizer", meta.tokenizer_name);
114    if meta.summarized {
115        buf.push_str("summarized: true\n");
116    }
117    if let Some(reason) = meta.headless_render {
118        write_field(&mut buf, "headless_render", reason);
119    }
120
121    // M4 metadata fields — emit only when present.
122    if let Some(v) = meta.description {
123        write_field(&mut buf, "description", v);
124    }
125    if let Some(v) = meta.author {
126        write_field(&mut buf, "author", v);
127    }
128    if let Some(v) = meta.published {
129        write_field(&mut buf, "published", v);
130    }
131    if let Some(v) = meta.modified {
132        write_field(&mut buf, "modified", v);
133    }
134    if let Some(v) = meta.image {
135        write_field(&mut buf, "image", v);
136    }
137    if let Some(v) = meta.og_type {
138        write_field(&mut buf, "og_type", v);
139    }
140    if let Some(v) = meta.language {
141        write_field(&mut buf, "language", v);
142    }
143    if !meta.schema_types.is_empty() {
144        buf.push_str("schema_types:\n");
145        for s in meta.schema_types {
146            buf.push_str("  - ");
147            buf.push_str(&yaml_escape(s));
148            buf.push('\n');
149        }
150    }
151    buf.push_str(&format!(
152        "extraction_quality: {:.2}\n",
153        meta.extraction_quality
154    ));
155    if !meta.tables_transformed.is_empty() {
156        buf.push_str("tables_transformed:\n");
157        for t in meta.tables_transformed {
158            buf.push_str(&format!(
159                "  - ordinal: {}\n    mode: {}\n",
160                t.ordinal, t.mode
161            ));
162            if let Some(p) = &t.path {
163                buf.push_str(&format!("    path: {:?}\n", p.display().to_string()));
164            }
165            if let Some(k) = t.kept_rows {
166                buf.push_str(&format!("    kept_rows: {k}\n"));
167            }
168            if let Some(tr) = t.truncated_rows {
169                buf.push_str(&format!("    truncated_rows: {tr}\n"));
170            }
171        }
172    }
173    if meta.images_seen > 0 {
174        buf.push_str(&format!("images_seen: {}\n", meta.images_seen));
175    }
176    if meta.images_downloaded > 0 {
177        buf.push_str(&format!("images_downloaded: {}\n", meta.images_downloaded));
178    }
179    if meta.images_failed > 0 {
180        buf.push_str(&format!("images_failed: {}\n", meta.images_failed));
181    }
182    if !meta.images_processed.is_empty() {
183        buf.push_str("images_processed:\n");
184        for ip in &meta.images_processed {
185            buf.push_str(&format!("  - src: {}\n", yaml_escape(&ip.src)));
186            buf.push_str(&format!("    decision: {}\n", yaml_escape(&ip.decision)));
187            if let Some(v) = &ip.reason {
188                buf.push_str(&format!("    reason: {}\n", yaml_escape(v)));
189            }
190            if let Some(v) = &ip.captioner {
191                buf.push_str(&format!("    captioner: {}\n", yaml_escape(v)));
192            }
193            if let Some(v) = &ip.caption {
194                buf.push_str(&format!("    caption: {}\n", yaml_escape(v)));
195            }
196            if let Some(d) = &ip.dimensions {
197                buf.push_str(&format!(
198                    "    dimensions:\n      width: {}\n      height: {}\n",
199                    d.width, d.height
200                ));
201            }
202            if let Some(b) = ip.bytes {
203                buf.push_str(&format!("    bytes: {b}\n"));
204            }
205            if let Some(v) = &ip.error {
206                buf.push_str(&format!("    error: {}\n", yaml_escape(v)));
207            }
208        }
209    }
210
211    if let Some(pi) = meta.prompt_injection {
212        buf.push_str("prompt_injection:\n");
213        buf.push_str(&format!("  scanned: {}\n", pi.scanned));
214        buf.push_str(&format!("  detected: {}\n", pi.detected));
215        buf.push_str(&format!("  action: {}\n", yaml_escape(&pi.action)));
216        if !pi.detectors.is_empty() {
217            buf.push_str("  detectors:\n");
218            for d in &pi.detectors {
219                buf.push_str(&format!("    - {}\n", yaml_escape(d)));
220            }
221        }
222        if !pi.techniques.is_empty() {
223            buf.push_str("  techniques:\n");
224            for t in &pi.techniques {
225                buf.push_str(&format!("    - {}\n", yaml_escape(t)));
226            }
227        }
228        if let Some(score) = pi.model_score {
229            buf.push_str(&format!("  model_score: {score:.2}\n"));
230        }
231        if !pi.allowlisted.is_empty() {
232            buf.push_str("  allowlisted:\n");
233            for a in &pi.allowlisted {
234                buf.push_str(&format!("    - {}\n", yaml_escape(a)));
235            }
236        }
237        if !pi.overrides_attempted.is_empty() {
238            buf.push_str("  overrides_attempted:\n");
239            for o in &pi.overrides_attempted {
240                buf.push_str(&format!("    - {}\n", yaml_escape(o)));
241            }
242        }
243    }
244    buf.push_str("---\n\n");
245    buf.push_str(meta.body);
246    if !meta.body.ends_with('\n') {
247        buf.push('\n');
248    }
249    buf
250}
251
/// Return `s` as a YAML scalar, double-quoting it only when a plain-style
/// emission would be unsafe.
///
/// A value is quoted when it:
///   - is empty (a bare `key: ` would parse as null),
///   - starts or ends with whitespace,
///   - starts with a YAML indicator character (`-`, `?`, `[`, `{`, `*`, ...),
///   - contains `"`, `:`, `#`, a control character, or a Unicode line break,
///   - looks like a number, or
///   - spells a word YAML resolves to a bool/null/float (`true`, `no`, `~`,
///     `.nan`, ...), compared case-insensitively.
///
/// The checks are deliberately conservative: quoting a value that did not
/// strictly need it still parses to the same string. Ordinary text such as
/// `Article` or `./hero.jpg` passes through unchanged.
///
/// Inside quotes, `\`, `"`, line breaks, tabs and other control characters
/// are written as YAML double-quoted escapes.
fn yaml_escape(s: &str) -> String {
    // Words a YAML 1.1/1.2 parser resolves to a non-string when unquoted.
    const RESERVED: [&str; 13] = [
        "true", "false", "null", "~", "yes", "no", "on", "off", "y", "n", ".inf", "+.inf",
        ".nan",
    ];
    // Characters that change the meaning of a scalar when they lead it.
    const LEADING_INDICATORS: [char; 16] = [
        '-', '?', '[', ']', '{', '}', ',', '&', '*', '!', '|', '>', '\'', '%', '@', '`',
    ];

    // Anything that starts like a number (optionally signed) would be
    // resolved as an int/float rather than kept as text.
    let unsigned = s.strip_prefix(['+', '-']).unwrap_or(s);
    let mut lead = unsigned.chars();
    let numeric_like = match (lead.next(), lead.next()) {
        (Some(first), _) if first.is_ascii_digit() => true,
        (Some('.'), Some(second)) => second.is_ascii_digit(),
        _ => false,
    };

    let has_unsafe_char = s.chars().any(|c| {
        matches!(c, '"' | ':' | '#' | '\u{85}' | '\u{2028}' | '\u{2029}') || c.is_control()
    });

    let needs_quote = s.is_empty()
        || s.starts_with(char::is_whitespace)
        || s.ends_with(char::is_whitespace)
        || s.starts_with(LEADING_INDICATORS)
        || has_unsafe_char
        || numeric_like
        || RESERVED.iter().any(|word| s.eq_ignore_ascii_case(word));

    if !needs_quote {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len() + 2);
    out.push('"');
    for c in s.chars() {
        match c {
            '\\' => out.push_str(r"\\"),
            '"' => out.push_str(r#"\""#),
            '\n' => out.push_str(r"\n"),
            '\r' => out.push_str(r"\r"),
            '\t' => out.push_str(r"\t"),
            // Unicode line breaks have dedicated YAML escapes.
            '\u{85}' => out.push_str(r"\N"),
            '\u{2028}' => out.push_str(r"\L"),
            '\u{2029}' => out.push_str(r"\P"),
            // Remaining control characters (all <= U+009F) are not allowed
            // raw in YAML; the two-digit `\xNN` escape covers them.
            c if c.is_control() => out.push_str(&format!("\\x{:02x}", u32::from(c))),
            _ => out.push(c),
        }
    }
    out.push('"');
    out
}
275
/// Append one `key: "value"` line to `buf`.
///
/// The value is always emitted as a YAML double-quoted scalar so arbitrary
/// page-supplied text (titles, descriptions, URLs) survives intact. Inside
/// the quotes:
///   - `\` and `"` are backslash-escaped,
///   - `\n`, `\r`, `\t` use their short escapes,
///   - the Unicode line breaks U+0085 / U+2028 / U+2029 become `\N` / `\L` /
///     `\P`,
///   - any other control character becomes `\xNN`, since YAML does not allow
///     such characters raw.
///
/// `key` is written verbatim; callers pass fixed identifiers.
fn write_field(buf: &mut String, key: &str, value: &str) {
    buf.push_str(key);
    buf.push_str(": \"");
    for c in value.chars() {
        match c {
            '\\' => buf.push_str(r"\\"),
            '"' => buf.push_str(r#"\""#),
            '\n' => buf.push_str(r"\n"),
            '\r' => buf.push_str(r"\r"),
            '\t' => buf.push_str(r"\t"),
            '\u{85}' => buf.push_str(r"\N"),
            '\u{2028}' => buf.push_str(r"\L"),
            '\u{2029}' => buf.push_str(r"\P"),
            // `char::is_control` only matches code points <= U+009F, so two
            // hex digits are always enough.
            c if c.is_control() => buf.push_str(&format!("\\x{:02x}", u32::from(c))),
            _ => buf.push(c),
        }
    }
    buf.push_str("\"\n");
}
295
296fn sha256_hex(bytes: &[u8]) -> String {
297    let mut h = Sha256::new();
298    h.update(bytes);
299    let out = h.finalize();
300    let mut s = String::with_capacity(out.len() * 2);
301    for b in out {
302        s.push_str(&format!("{b:02x}"));
303    }
304    s
305}
306
#[cfg(test)]
mod tests {
    use super::*;
    use jiff::Timestamp;

    /// Fixed timestamp so rendered output is deterministic across runs.
    fn ts() -> Timestamp {
        "2026-05-07T12:34:56Z".parse().unwrap()
    }

    /// Parse a URL literal, panicking on malformed test input.
    fn u(s: &str) -> Url {
        Url::parse(s).unwrap()
    }

    /// Baseline `PageMeta` with every optional field absent or empty. Tests
    /// override only the fields they exercise via struct-update syntax.
    fn meta<'a>(url: &'a Url, body: &'a str) -> PageMeta<'a> {
        PageMeta {
            url,
            canonical_url: url,
            title: Some("Sample"),
            fetched_at: ts(),
            body,
            tokens: 7,
            tokenizer_name: "o200k",
            description: None,
            author: None,
            published: None,
            modified: None,
            image: None,
            og_type: None,
            language: None,
            schema_types: &[],
            extraction_quality: 0.50,
            summarized: false,
            headless_render: None,
            tables_transformed: &[],
            images_seen: 0,
            images_downloaded: 0,
            images_failed: 0,
            images_processed: vec![],
            prompt_injection: None,
        }
    }

    #[test]
    fn emits_required_fields() {
        let url = u("https://example.com/page");
        let body = "# Title\n\nBody.\n";
        let out = render(&meta(&url, body));

        assert!(out.starts_with("---\n"));
        assert!(out.contains(r#"url: "https://example.com/page""#));
        assert!(out.contains(r#"title: "Sample""#));
        assert!(out.contains(r#"fetched_at: "2026-05-07T12:34:56Z""#));
        assert!(out.contains("content_hash: \"sha256:"));
        assert!(out.contains("estimated_tokens: 7"));
        assert!(out.contains(r#"tokenizer: "o200k""#));
        assert!(out.ends_with(body));
    }

    #[test]
    fn omits_canonical_when_same_as_url() {
        let url = u("https://example.com/page");
        let out = render(&PageMeta {
            title: None,
            ..meta(&url, "x")
        });
        assert!(!out.contains("canonical_url"));
    }

    #[test]
    fn includes_canonical_when_different() {
        let url = u("https://example.com/page?utm=1");
        let canon = u("https://example.com/page");
        let out = render(&PageMeta {
            canonical_url: &canon,
            title: None,
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"canonical_url: "https://example.com/page""#));
    }

    #[test]
    fn emits_headless_render_reason_when_set() {
        let url = u("https://example.com/spa");
        let out = render(&PageMeta {
            headless_render: Some("bot_challenge"),
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"headless_render: "bot_challenge""#));
    }

    #[test]
    fn omits_headless_render_when_absent() {
        let url = u("https://example.com/");
        let out = render(&meta(&url, "x"));
        assert!(!out.contains("headless_render"));
    }

    #[test]
    fn quotes_in_title_are_escaped() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            title: Some(r#"He said "hi""#),
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"title: "He said \"hi\"""#));
    }

    #[test]
    fn content_hash_is_deterministic() {
        let url = u("https://example.com/p");
        let body = "stable body";
        let a = render(&meta(&url, body));
        let b = render(&meta(&url, body));
        assert_eq!(a, b);
    }

    #[test]
    fn token_count_is_passed_through_verbatim() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            tokens: 1234,
            ..meta(&url, "hello")
        });
        assert!(out.contains("estimated_tokens: 1234"));
    }

    #[test]
    fn body_terminates_with_newline() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            title: None,
            ..meta(&url, "no trailing newline")
        });
        assert!(out.ends_with('\n'));
    }

    #[test]
    fn emits_extraction_quality() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(out.contains("extraction_quality: 0.50"));
    }

    #[test]
    fn omits_empty_optional_fields() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(!out.contains("description:"));
        assert!(!out.contains("schema_types:"));
        assert!(!out.contains("tables_transformed:"));
        assert!(!out.contains("images_seen:"));
    }

    #[test]
    fn emits_metadata_fields_when_present() {
        let url = u("https://example.com/p");
        let schema_types = vec!["Article".to_string(), "WebPage".to_string()];
        let m = PageMeta {
            description: Some("desc"),
            author: Some("Ada"),
            schema_types: &schema_types,
            ..meta(&url, "body")
        };
        let out = render(&m);
        assert!(out.contains(r#"description: "desc""#));
        assert!(out.contains(r#"author: "Ada""#));
        assert!(out.contains("schema_types:"));
        assert!(out.contains("  - Article"));
        assert!(out.contains("  - WebPage"));
    }

    #[test]
    fn images_processed_renders_under_frontmatter() {
        let url = u("https://example.com/p");
        let m = PageMeta {
            images_processed: vec![
                ImageProcessed {
                    src: "./hero.jpg".into(),
                    decision: "captioned".into(),
                    reason: None,
                    captioner: Some("openai".into()),
                    caption: Some("A dog.".into()),
                    dimensions: Some(ImageDims {
                        width: 800,
                        height: 600,
                    }),
                    bytes: None,
                    error: None,
                },
                ImageProcessed {
                    src: "./icon.svg".into(),
                    decision: "skipped".into(),
                    reason: Some("below_min_dimensions".into()),
                    captioner: None,
                    caption: None,
                    dimensions: Some(ImageDims {
                        width: 24,
                        height: 24,
                    }),
                    bytes: None,
                    error: None,
                },
            ],
            ..meta(&url, "# body\n")
        };
        let yaml = render(&m);
        assert!(yaml.contains("images_processed:"));
        assert!(yaml.contains("./hero.jpg"));
        assert!(yaml.contains("below_min_dimensions"));
    }

    #[test]
    fn images_processed_absent_when_empty() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(!out.contains("images_processed:"));
    }

    #[test]
    fn renders_prompt_injection_block_when_present() {
        let url = u("https://example.com/a");
        let telem = crate::guard::GuardTelemetry {
            scanned: true,
            detected: true,
            action: "moderate".into(),
            detectors: vec!["patterns".into()],
            techniques: vec!["instruction_override".into()],
            model_score: Some(0.97),
            allowlisted: vec![],
            overrides_attempted: vec!["patterns".into()],
        };
        // Build on the shared baseline; only the fields this test cares
        // about are overridden.
        let m = PageMeta {
            title: Some("T"),
            tokens: 1,
            prompt_injection: Some(&telem),
            ..meta(&url, "hello")
        };
        let out = render(&m);
        assert!(out.contains("prompt_injection:\n"));
        assert!(out.contains("  scanned: true\n"));
        assert!(out.contains("  detected: true\n"));
        assert!(out.contains("  action: moderate\n"));
        assert!(out.contains("  detectors:\n"));
        assert!(out.contains("    - patterns\n"));
        assert!(out.contains("  techniques:\n"));
        assert!(out.contains("    - instruction_override\n"));
        assert!(out.contains("  model_score: 0.97\n"));
        assert!(out.contains("  overrides_attempted:\n"));
    }

    #[test]
    fn omits_prompt_injection_block_when_none() {
        let url = u("https://example.com/a");
        let m = PageMeta {
            title: None,
            tokens: 1,
            ..meta(&url, "hi")
        };
        assert!(!render(&m).contains("prompt_injection"));
    }
}