1use jiff::Timestamp;
17use serde::Serialize;
18use sha2::{Digest, Sha256};
19use url::Url;
20
/// Width and height of an image.
///
/// Rendered by [`render`] as a nested `dimensions` mapping with `width` and
/// `height` keys.
///
/// NOTE(review): the unit is presumably pixels — confirm against the code that
/// fills this in.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ImageDims {
    /// Horizontal extent.
    pub width: u32,
    /// Vertical extent.
    pub height: u32,
}
27
/// Per-image record listed under `images_processed` in the front matter.
///
/// Only `src` and `decision` are always present. Every optional field is left
/// out of both the serde output (`skip_serializing_if`) and the output of
/// [`render`] when it is `None`.
#[derive(Debug, Clone, Serialize)]
pub struct ImageProcessed {
    /// Image source reference (the tests use relative paths such as `./hero.jpg`).
    pub src: String,
    /// What was done with the image (the tests use `captioned` and `skipped`).
    pub decision: String,
    /// Why the decision was taken (the tests use `below_min_dimensions`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reason: Option<String>,
    /// Name of the captioner that produced `caption` (the tests use `openai`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub captioner: Option<String>,
    /// Caption text for the image.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub caption: Option<String>,
    /// Image dimensions, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub dimensions: Option<ImageDims>,
    /// Byte count for the image.
    /// NOTE(review): presumably the downloaded size — confirm with the producer.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bytes: Option<u64>,
    /// Error message associated with this image, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
54
/// Everything [`render`] needs to produce one document: the page body plus the
/// metadata that goes into its YAML front matter.
///
/// Most fields borrow from the caller for `'a`; only `images_processed` is owned.
pub struct PageMeta<'a> {
    /// Page URL; always emitted as `url`.
    pub url: &'a Url,
    /// Canonical URL; emitted as `canonical_url` only when it differs from `url`.
    pub canonical_url: &'a Url,
    /// Page title; emitted as `title` when present.
    pub title: Option<&'a str>,
    /// Emitted as `fetched_at` using the timestamp's string form.
    pub fetched_at: Timestamp,
    /// Document body appended after the front matter; its SHA-256 digest is
    /// emitted as `content_hash`.
    pub body: &'a str,
    /// Emitted verbatim as `estimated_tokens`.
    pub tokens: usize,
    /// Emitted as `tokenizer` (the tests use `o200k`).
    pub tokenizer_name: &'a str,
    /// Emitted as `description` when present.
    pub description: Option<&'a str>,
    /// Emitted as `author` when present.
    pub author: Option<&'a str>,
    /// Emitted as `published` when present; passed through as an opaque string.
    pub published: Option<&'a str>,
    /// Emitted as `modified` when present; passed through as an opaque string.
    pub modified: Option<&'a str>,
    /// Emitted as `image` when present.
    pub image: Option<&'a str>,
    /// Emitted as `og_type` when present.
    pub og_type: Option<&'a str>,
    /// Emitted as `language` when present.
    pub language: Option<&'a str>,
    /// Emitted as the `schema_types` list when non-empty.
    pub schema_types: &'a [String],
    /// Emitted as `extraction_quality` with two decimal places.
    /// NOTE(review): presumably a score in `0.0..=1.0` — confirm with the producer.
    pub extraction_quality: f32,
    /// When `true`, `summarized: true` is emitted; otherwise the key is omitted.
    pub summarized: bool,
    /// When present, emitted as `headless_render` (the tests use `bot_challenge`).
    pub headless_render: Option<&'a str>,
    /// Emitted as the `tables_transformed` list when non-empty.
    pub tables_transformed: &'a [crate::extractor::tables::TableTransform],
    /// Emitted as `images_seen` when greater than zero.
    pub images_seen: usize,
    /// Emitted as `images_downloaded` when greater than zero.
    pub images_downloaded: usize,
    /// Emitted as `images_failed` when greater than zero.
    pub images_failed: usize,
    /// Emitted as the `images_processed` list when non-empty.
    pub images_processed: Vec<ImageProcessed>,
    /// Emitted as the `prompt_injection` mapping when present.
    pub prompt_injection: Option<&'a crate::guard::GuardTelemetry>,
}
93
94pub fn render(meta: &PageMeta<'_>) -> String {
96 let mut buf = String::with_capacity(meta.body.len() + 512);
97 buf.push_str("---\n");
98
99 write_field(&mut buf, "url", meta.url.as_str());
100 if meta.canonical_url != meta.url {
101 write_field(&mut buf, "canonical_url", meta.canonical_url.as_str());
102 }
103 if let Some(t) = meta.title {
104 write_field(&mut buf, "title", t);
105 }
106 write_field(&mut buf, "fetched_at", &meta.fetched_at.to_string());
107
108 let content_hash = sha256_hex(meta.body.as_bytes());
109 let hash_field = format!("sha256:{content_hash}");
110 write_field(&mut buf, "content_hash", &hash_field);
111
112 buf.push_str(&format!("estimated_tokens: {}\n", meta.tokens));
113 write_field(&mut buf, "tokenizer", meta.tokenizer_name);
114 if meta.summarized {
115 buf.push_str("summarized: true\n");
116 }
117 if let Some(reason) = meta.headless_render {
118 write_field(&mut buf, "headless_render", reason);
119 }
120
121 if let Some(v) = meta.description {
123 write_field(&mut buf, "description", v);
124 }
125 if let Some(v) = meta.author {
126 write_field(&mut buf, "author", v);
127 }
128 if let Some(v) = meta.published {
129 write_field(&mut buf, "published", v);
130 }
131 if let Some(v) = meta.modified {
132 write_field(&mut buf, "modified", v);
133 }
134 if let Some(v) = meta.image {
135 write_field(&mut buf, "image", v);
136 }
137 if let Some(v) = meta.og_type {
138 write_field(&mut buf, "og_type", v);
139 }
140 if let Some(v) = meta.language {
141 write_field(&mut buf, "language", v);
142 }
143 if !meta.schema_types.is_empty() {
144 buf.push_str("schema_types:\n");
145 for s in meta.schema_types {
146 buf.push_str(" - ");
147 buf.push_str(&yaml_escape(s));
148 buf.push('\n');
149 }
150 }
151 buf.push_str(&format!(
152 "extraction_quality: {:.2}\n",
153 meta.extraction_quality
154 ));
155 if !meta.tables_transformed.is_empty() {
156 buf.push_str("tables_transformed:\n");
157 for t in meta.tables_transformed {
158 buf.push_str(&format!(
159 " - ordinal: {}\n mode: {}\n",
160 t.ordinal, t.mode
161 ));
162 if let Some(p) = &t.path {
163 buf.push_str(&format!(" path: {:?}\n", p.display().to_string()));
164 }
165 if let Some(k) = t.kept_rows {
166 buf.push_str(&format!(" kept_rows: {k}\n"));
167 }
168 if let Some(tr) = t.truncated_rows {
169 buf.push_str(&format!(" truncated_rows: {tr}\n"));
170 }
171 }
172 }
173 if meta.images_seen > 0 {
174 buf.push_str(&format!("images_seen: {}\n", meta.images_seen));
175 }
176 if meta.images_downloaded > 0 {
177 buf.push_str(&format!("images_downloaded: {}\n", meta.images_downloaded));
178 }
179 if meta.images_failed > 0 {
180 buf.push_str(&format!("images_failed: {}\n", meta.images_failed));
181 }
182 if !meta.images_processed.is_empty() {
183 buf.push_str("images_processed:\n");
184 for ip in &meta.images_processed {
185 buf.push_str(&format!(" - src: {}\n", yaml_escape(&ip.src)));
186 buf.push_str(&format!(" decision: {}\n", yaml_escape(&ip.decision)));
187 if let Some(v) = &ip.reason {
188 buf.push_str(&format!(" reason: {}\n", yaml_escape(v)));
189 }
190 if let Some(v) = &ip.captioner {
191 buf.push_str(&format!(" captioner: {}\n", yaml_escape(v)));
192 }
193 if let Some(v) = &ip.caption {
194 buf.push_str(&format!(" caption: {}\n", yaml_escape(v)));
195 }
196 if let Some(d) = &ip.dimensions {
197 buf.push_str(&format!(
198 " dimensions:\n width: {}\n height: {}\n",
199 d.width, d.height
200 ));
201 }
202 if let Some(b) = ip.bytes {
203 buf.push_str(&format!(" bytes: {b}\n"));
204 }
205 if let Some(v) = &ip.error {
206 buf.push_str(&format!(" error: {}\n", yaml_escape(v)));
207 }
208 }
209 }
210
211 if let Some(pi) = meta.prompt_injection {
212 buf.push_str("prompt_injection:\n");
213 buf.push_str(&format!(" scanned: {}\n", pi.scanned));
214 buf.push_str(&format!(" detected: {}\n", pi.detected));
215 buf.push_str(&format!(" action: {}\n", yaml_escape(&pi.action)));
216 if !pi.detectors.is_empty() {
217 buf.push_str(" detectors:\n");
218 for d in &pi.detectors {
219 buf.push_str(&format!(" - {}\n", yaml_escape(d)));
220 }
221 }
222 if !pi.techniques.is_empty() {
223 buf.push_str(" techniques:\n");
224 for t in &pi.techniques {
225 buf.push_str(&format!(" - {}\n", yaml_escape(t)));
226 }
227 }
228 if let Some(score) = pi.model_score {
229 buf.push_str(&format!(" model_score: {score:.2}\n"));
230 }
231 if !pi.allowlisted.is_empty() {
232 buf.push_str(" allowlisted:\n");
233 for a in &pi.allowlisted {
234 buf.push_str(&format!(" - {}\n", yaml_escape(a)));
235 }
236 }
237 if !pi.overrides_attempted.is_empty() {
238 buf.push_str(" overrides_attempted:\n");
239 for o in &pi.overrides_attempted {
240 buf.push_str(&format!(" - {}\n", yaml_escape(o)));
241 }
242 }
243 }
244 buf.push_str("---\n\n");
245 buf.push_str(meta.body);
246 if !meta.body.ends_with('\n') {
247 buf.push('\n');
248 }
249 buf
250}
251
/// Formats `s` as a YAML scalar for use as a block-style mapping value or
/// sequence item.
///
/// Strings that are unambiguous as plain scalars are returned unchanged; all
/// others are wrapped in double quotes with `\`, `"`, line breaks, tabs and
/// remaining control characters escaped. See [`needs_yaml_quoting`] for the
/// exact conditions.
fn yaml_escape(s: &str) -> String {
    if !needs_yaml_quoting(s) {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len() + 2);
    out.push('"');
    for c in s.chars() {
        match c {
            '\\' => out.push_str(r"\\"),
            '"' => out.push_str(r#"\""#),
            '\n' => out.push_str(r"\n"),
            '\r' => out.push_str(r"\r"),
            '\t' => out.push_str(r"\t"),
            // Other control characters may not appear raw in YAML; every one
            // of them fits the four-digit `\uXXXX` escape.
            c if c.is_control() => out.push_str(&format!("\\u{:04x}", u32::from(c))),
            _ => out.push(c),
        }
    }
    out.push('"');
    out
}

/// Returns `true` when `s` cannot be written as a plain (unquoted) YAML scalar
/// without being misparsed or silently changing type.
///
/// That is the case when `s`:
/// * is empty (an empty plain scalar reads back as null);
/// * starts with a YAML indicator character such as `-`, `#`, `[` or `*`;
/// * has leading or trailing whitespace;
/// * contains `"` or `:` anywhere, or ` #` (which would start a comment);
/// * contains any control character, including line breaks and tabs;
/// * spells a null/boolean/float keyword (`null`, `~`, `true`, `no`, `.inf`, ...);
/// * looks numeric, i.e. starts with a digit after an optional `+` and `.`.
fn needs_yaml_quoting(s: &str) -> bool {
    const LEADING_INDICATORS: &[char] = &[
        '-', '?', ':', ',', '[', ']', '{', '}', '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@',
        '`',
    ];
    const RESERVED_WORDS: &[&str] = &[
        "null", "~", "true", "false", "yes", "no", "on", "off", ".inf", ".nan",
    ];

    if s.is_empty()
        || s.starts_with(LEADING_INDICATORS)
        || s.starts_with(char::is_whitespace)
        || s.ends_with(char::is_whitespace)
        || s.contains(['"', ':'])
        || s.contains(" #")
        || s.chars().any(char::is_control)
    {
        return true;
    }

    // A leading `-` was already handled as an indicator, so only `+` can
    // precede a keyword or number here.
    let unsigned = s.strip_prefix('+').unwrap_or(s);
    if RESERVED_WORDS.iter().any(|w| unsigned.eq_ignore_ascii_case(w)) {
        return true;
    }
    unsigned
        .strip_prefix('.')
        .unwrap_or(unsigned)
        .starts_with(|c: char| c.is_ascii_digit())
}
275
/// Appends one `key: "value"` line to `buf`.
///
/// The value is always written as a double-quoted YAML scalar. Backslash,
/// double quote, newline, carriage return and tab use their short escapes; any
/// other control character is written as `\uXXXX`, because raw control
/// characters are not permitted inside a YAML document. `key` is written
/// verbatim, so callers must pass a key that is already a valid plain scalar.
fn write_field(buf: &mut String, key: &str, value: &str) {
    // key + `: "` + value + `"` + newline; escapes may still grow the buffer.
    buf.reserve(key.len() + value.len() + 5);
    buf.push_str(key);
    buf.push_str(": \"");
    for c in value.chars() {
        match c {
            '\\' => buf.push_str(r"\\"),
            '"' => buf.push_str(r#"\""#),
            '\n' => buf.push_str(r"\n"),
            '\r' => buf.push_str(r"\r"),
            '\t' => buf.push_str(r"\t"),
            // All control characters are at most U+009F, so four hex digits suffice.
            c if c.is_control() => buf.push_str(&format!("\\u{:04x}", u32::from(c))),
            _ => buf.push(c),
        }
    }
    buf.push_str("\"\n");
}
295
296fn sha256_hex(bytes: &[u8]) -> String {
297 let mut h = Sha256::new();
298 h.update(bytes);
299 let out = h.finalize();
300 let mut s = String::with_capacity(out.len() * 2);
301 for b in out {
302 s.push_str(&format!("{b:02x}"));
303 }
304 s
305}
306
#[cfg(test)]
mod tests {
    use super::*;
    use jiff::Timestamp;

    /// Fixed timestamp so rendered output is reproducible across runs.
    fn ts() -> Timestamp {
        "2026-05-07T12:34:56Z".parse().unwrap()
    }

    /// Parses a URL literal, panicking on malformed test input.
    fn u(s: &str) -> Url {
        Url::parse(s).unwrap()
    }

    /// Baseline fixture with every optional section absent except `title`.
    /// Tests override individual fields with struct-update syntax.
    fn meta<'a>(url: &'a Url, body: &'a str) -> PageMeta<'a> {
        PageMeta {
            url,
            canonical_url: url,
            title: Some("Sample"),
            fetched_at: ts(),
            body,
            tokens: 7,
            tokenizer_name: "o200k",
            description: None,
            author: None,
            published: None,
            modified: None,
            image: None,
            og_type: None,
            language: None,
            schema_types: &[],
            extraction_quality: 0.50,
            summarized: false,
            headless_render: None,
            tables_transformed: &[],
            images_seen: 0,
            images_downloaded: 0,
            images_failed: 0,
            images_processed: vec![],
            prompt_injection: None,
        }
    }

    #[test]
    fn emits_required_fields() {
        let url = u("https://example.com/page");
        let body = "# Title\n\nBody.\n";
        let out = render(&meta(&url, body));

        assert!(out.starts_with("---\n"));
        assert!(out.contains(r#"url: "https://example.com/page""#));
        assert!(out.contains(r#"title: "Sample""#));
        assert!(out.contains(r#"fetched_at: "2026-05-07T12:34:56Z""#));
        assert!(out.contains("content_hash: \"sha256:"));
        assert!(out.contains("estimated_tokens: 7"));
        assert!(out.contains(r#"tokenizer: "o200k""#));
        assert!(out.ends_with(body));
    }

    #[test]
    fn omits_canonical_when_same_as_url() {
        let url = u("https://example.com/page");
        let out = render(&PageMeta {
            title: None,
            ..meta(&url, "x")
        });
        assert!(!out.contains("canonical_url"));
    }

    #[test]
    fn includes_canonical_when_different() {
        let url = u("https://example.com/page?utm=1");
        let canon = u("https://example.com/page");
        let out = render(&PageMeta {
            canonical_url: &canon,
            title: None,
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"canonical_url: "https://example.com/page""#));
    }

    #[test]
    fn emits_headless_render_reason_when_set() {
        let url = u("https://example.com/spa");
        let out = render(&PageMeta {
            headless_render: Some("bot_challenge"),
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"headless_render: "bot_challenge""#));
    }

    #[test]
    fn omits_headless_render_when_absent() {
        let url = u("https://example.com/");
        let out = render(&meta(&url, "x"));
        assert!(!out.contains("headless_render"));
    }

    #[test]
    fn quotes_in_title_are_escaped() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            title: Some(r#"He said "hi""#),
            ..meta(&url, "x")
        });
        assert!(out.contains(r#"title: "He said \"hi\"""#));
    }

    #[test]
    fn content_hash_is_deterministic() {
        let url = u("https://example.com/p");
        let body = "stable body";
        let a = render(&meta(&url, body));
        let b = render(&meta(&url, body));
        assert_eq!(a, b);
    }

    #[test]
    fn token_count_is_passed_through_verbatim() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            tokens: 1234,
            ..meta(&url, "hello")
        });
        assert!(out.contains("estimated_tokens: 1234"));
    }

    #[test]
    fn body_terminates_with_newline() {
        let url = u("https://example.com/p");
        let out = render(&PageMeta {
            title: None,
            ..meta(&url, "no trailing newline")
        });
        assert!(out.ends_with('\n'));
    }

    #[test]
    fn emits_extraction_quality() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(out.contains("extraction_quality: 0.50"));
    }

    #[test]
    fn omits_empty_optional_fields() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(!out.contains("description:"));
        assert!(!out.contains("schema_types:"));
        assert!(!out.contains("tables_transformed:"));
        assert!(!out.contains("images_seen:"));
    }

    #[test]
    fn emits_metadata_fields_when_present() {
        let url = u("https://example.com/p");
        let schema_types = vec!["Article".to_string(), "WebPage".to_string()];
        let m = PageMeta {
            description: Some("desc"),
            author: Some("Ada"),
            schema_types: &schema_types,
            ..meta(&url, "body")
        };
        let out = render(&m);
        assert!(out.contains(r#"description: "desc""#));
        assert!(out.contains(r#"author: "Ada""#));
        assert!(out.contains("schema_types:"));
        assert!(out.contains(" - Article"));
        assert!(out.contains(" - WebPage"));
    }

    #[test]
    fn images_processed_renders_under_frontmatter() {
        let url = u("https://example.com/p");
        let m = PageMeta {
            images_processed: vec![
                ImageProcessed {
                    src: "./hero.jpg".into(),
                    decision: "captioned".into(),
                    reason: None,
                    captioner: Some("openai".into()),
                    caption: Some("A dog.".into()),
                    dimensions: Some(ImageDims {
                        width: 800,
                        height: 600,
                    }),
                    bytes: None,
                    error: None,
                },
                ImageProcessed {
                    src: "./icon.svg".into(),
                    decision: "skipped".into(),
                    reason: Some("below_min_dimensions".into()),
                    captioner: None,
                    caption: None,
                    dimensions: Some(ImageDims {
                        width: 24,
                        height: 24,
                    }),
                    bytes: None,
                    error: None,
                },
            ],
            ..meta(&url, "# body\n")
        };
        let yaml = render(&m);
        assert!(yaml.contains("images_processed:"));
        assert!(yaml.contains("./hero.jpg"));
        assert!(yaml.contains("captioner: openai"));
        assert!(yaml.contains("caption: A dog."));
        assert!(yaml.contains("width: 800"));
        assert!(yaml.contains("height: 600"));
        assert!(yaml.contains("below_min_dimensions"));
    }

    #[test]
    fn images_processed_absent_when_empty() {
        let url = u("https://example.com/p");
        let out = render(&meta(&url, "body"));
        assert!(!out.contains("images_processed:"));
    }

    #[test]
    fn renders_prompt_injection_block_when_present() {
        let url = u("https://example.com/a");
        let telem = crate::guard::GuardTelemetry {
            scanned: true,
            detected: true,
            action: "moderate".into(),
            detectors: vec!["patterns".into()],
            techniques: vec!["instruction_override".into()],
            model_score: Some(0.97),
            allowlisted: vec![],
            overrides_attempted: vec!["patterns".into()],
        };
        // Built from the shared fixture so the timestamp stays fixed and only
        // the field under test differs.
        let out = render(&PageMeta {
            prompt_injection: Some(&telem),
            ..meta(&url, "hello")
        });
        assert!(out.contains("prompt_injection:\n"));
        assert!(out.contains(" scanned: true\n"));
        assert!(out.contains(" detected: true\n"));
        assert!(out.contains(" action: moderate\n"));
        assert!(out.contains(" detectors:\n"));
        assert!(out.contains(" - patterns\n"));
        assert!(out.contains(" techniques:\n"));
        assert!(out.contains(" - instruction_override\n"));
        assert!(out.contains(" model_score: 0.97\n"));
        assert!(out.contains(" overrides_attempted:\n"));
        // `allowlisted` is empty, so its key must not appear at all.
        assert!(!out.contains("allowlisted:"));
    }

    #[test]
    fn omits_prompt_injection_block_when_none() {
        let url = u("https://example.com/a");
        let out = render(&PageMeta {
            title: None,
            ..meta(&url, "hi")
        });
        assert!(!out.contains("prompt_injection"));
    }
}