Skip to main content

index_capture/
lib.rs

1//! Local capture artifact and redaction workflow.
2//!
3//! This crate prepares deterministic, local-only artifacts for unsupported page
4//! reports. It validates the source URL and redacts credential-shaped content,
5//! but it does not fetch network content, parse HTML, render terminal UI, or
6//! share artifacts.
7
8use std::fmt::{Display, Formatter};
9use std::fs;
10use std::path::{Path, PathBuf};
11
12use index_core::{IndexDocument, IndexNode, IndexUrl, Redactor, UrlError};
13
// First line of every serialized capture artifact.
const ARTIFACT_HEADER: &str = "index-capture-v1";
// Delimiter lines around the redacted HTML section of a capture artifact.
const HTML_BEGIN: &str = "---BEGIN REDACTED HTML---";
const HTML_END: &str = "---END REDACTED HTML---";
// Delimiter lines around the diagnostic section of a capture artifact.
const DIAGNOSTIC_BEGIN: &str = "---BEGIN DIAGNOSTIC---";
const DIAGNOSTIC_END: &str = "---END DIAGNOSTIC---";
// Delimiter lines around the capture preview embedded in a review bundle.
const PREVIEW_BEGIN: &str = "---BEGIN CAPTURE PREVIEW---";
const PREVIEW_END: &str = "---END CAPTURE PREVIEW---";
// Delimiter lines around the repair hints embedded in a review bundle.
const REPAIR_BEGIN: &str = "---BEGIN REPAIR HINTS---";
const REPAIR_END: &str = "---END REPAIR HINTS---";
// First line and schema version of the canonical index artifact format.
const INDEX_ARTIFACT_HEADER: &str = "index-artifact-v1";
const INDEX_ARTIFACT_VERSION: u8 = 1;
// Delimiter lines around the capture artifact embedded in an index artifact.
const ARTIFACT_CAPTURE_BEGIN: &str = "---BEGIN CAPTURE ARTIFACT---";
const ARTIFACT_CAPTURE_END: &str = "---END CAPTURE ARTIFACT---";
// Placeholder marker. It is counted when summarizing redactions and is swapped
// for the literal "redacted" before a stored URL is re-parsed.
const REDACTED: &str = "[REDACTED]";
28
29/// Errors returned by capture artifact preparation.
30#[derive(Debug, Clone, PartialEq, Eq)]
31pub enum CaptureError {
32    /// The source URL was rejected by Index URL policy.
33    InvalidSourceUrl(UrlError),
34    /// The capture artifact text does not match the local artifact format.
35    InvalidArtifact(String),
36}
37
38impl Display for CaptureError {
39    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
40        match self {
41            Self::InvalidSourceUrl(error) => write!(f, "capture source URL rejected: {error}"),
42            Self::InvalidArtifact(reason) => write!(f, "capture artifact is invalid: {reason}"),
43        }
44    }
45}
46
47impl std::error::Error for CaptureError {}
48
49/// Errors returned by canonical artifact persistence.
50#[derive(Debug)]
51pub enum ArtifactStoreError {
52    /// Filesystem persistence failed.
53    Io(std::io::Error),
54    /// Serialized artifact content was invalid.
55    Parse(String),
56    /// Artifact URL fields were invalid.
57    Url(UrlError),
58    /// Embedded capture artifact validation failed.
59    Capture(CaptureError),
60}
61
62impl Display for ArtifactStoreError {
63    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
64        match self {
65            Self::Io(error) => write!(f, "artifact store IO failed: {error}"),
66            Self::Parse(reason) => write!(f, "artifact store parse failed: {reason}"),
67            Self::Url(error) => write!(f, "artifact store URL failed: {error}"),
68            Self::Capture(error) => write!(f, "artifact store capture failed: {error}"),
69        }
70    }
71}
72
73impl std::error::Error for ArtifactStoreError {}
74
75impl From<std::io::Error> for ArtifactStoreError {
76    fn from(value: std::io::Error) -> Self {
77        Self::Io(value)
78    }
79}
80
81/// Local capture request.
82#[derive(Debug, Clone, PartialEq, Eq)]
83pub struct CaptureRequest {
84    /// Source URL whose page shape is represented by this local capture.
85    pub source_url: IndexUrl,
86    /// Raw HTML or snapshot source supplied locally by the user.
87    pub html: String,
88    /// Optional local diagnostic context.
89    pub diagnostic: Option<String>,
90}
91
92impl CaptureRequest {
93    /// Creates a capture request from a source URL string and local HTML.
94    pub fn new(source_url: impl AsRef<str>, html: impl Into<String>) -> Result<Self, CaptureError> {
95        let source_url = IndexUrl::parse(source_url).map_err(CaptureError::InvalidSourceUrl)?;
96        Ok(Self {
97            source_url,
98            html: html.into(),
99            diagnostic: None,
100        })
101    }
102
103    /// Adds local diagnostic context to the capture request.
104    #[must_use]
105    pub fn with_diagnostic(mut self, diagnostic: impl Into<String>) -> Self {
106        self.diagnostic = Some(diagnostic.into());
107        self
108    }
109}
110
/// Sanitized capture artifact ready for local review.
///
/// All fields hold already-redacted text; the struct itself performs no
/// redaction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptureArtifact {
    /// Redacted source URL.
    pub source_url: String,
    /// Redacted HTML or snapshot source.
    pub redacted_html: String,
    /// Redacted local diagnostic context, if any was supplied.
    pub diagnostic: Option<String>,
    /// Deterministic command that can reproduce this artifact locally.
    pub reproduction_command: String,
}
123
/// How a canonical runtime artifact was produced.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArtifactContext {
    /// Live URL open flow (`GET`-like fetch path).
    LiveGet,
    /// Live form submission flow.
    LiveSubmit,
    /// Offline import or replay.
    Offline,
}

impl ArtifactContext {
    /// Stable lowercase identifier used in serialized artifacts and file names.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::LiveGet => "live-get",
            Self::LiveSubmit => "live-submit",
            Self::Offline => "offline",
        }
    }

    /// Parses a serialized artifact context, ignoring surrounding whitespace.
    ///
    /// # Errors
    ///
    /// Returns a message naming the trimmed input when it is not one of the
    /// identifiers produced by [`ArtifactContext::as_str`].
    pub fn parse(input: &str) -> Result<Self, String> {
        let token = input.trim();
        [Self::LiveGet, Self::LiveSubmit, Self::Offline]
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == token)
            .ok_or_else(|| format!("unsupported artifact context: {token}"))
    }
}
156
/// Freshness state for stale-while-revalidate policy.
///
/// Produced by comparing a timestamp against an artifact's storage time plus
/// its max age.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArtifactFreshness {
    /// Artifact is within its max age policy.
    Fresh,
    /// Artifact is outside max age policy.
    Stale,
}
165
/// Versioned canonical runtime artifact for rendering without raw page HTML.
///
/// The URL fields are plain strings because they may contain the redaction
/// placeholder in place of credential-shaped values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexArtifact {
    /// Artifact schema version.
    pub version: u8,
    /// Canonical key URL used for cache identity.
    pub canonical_url: String,
    /// Final URL observed during retrieval.
    pub final_url: String,
    /// Capture context describing how the artifact was produced.
    pub context: ArtifactContext,
    /// Unix timestamp (seconds) when this artifact was stored.
    pub stored_at_unix_secs: u64,
    /// Max age in seconds used for stale-while-revalidate checks.
    pub max_age_secs: u64,
    /// Embedded normalized capture artifact.
    pub capture: CaptureArtifact,
}
184
185impl IndexArtifact {
186    /// Creates a canonical artifact from a semantic document.
187    pub fn from_document(
188        document: &IndexDocument,
189        canonical_url: &IndexUrl,
190        final_url: &IndexUrl,
191        context: ArtifactContext,
192        stored_at_unix_secs: u64,
193        max_age_secs: u64,
194    ) -> Result<Self, CaptureError> {
195        let mut capture = capture_document(document)?;
196        let canonical = redact_sensitive_pairs(canonical_url.as_str());
197        let final_url = redact_sensitive_pairs(final_url.as_str());
198        capture.source_url = canonical.clone();
199        capture.reproduction_command =
200            format!("index capture --redact {canonical} - < local-page.html");
201        Ok(Self {
202            version: INDEX_ARTIFACT_VERSION,
203            canonical_url: canonical,
204            final_url,
205            context,
206            stored_at_unix_secs,
207            max_age_secs,
208            capture,
209        })
210    }
211
212    /// Returns whether the artifact is fresh at `now_unix_secs`.
213    #[must_use]
214    pub fn freshness(&self, now_unix_secs: u64) -> ArtifactFreshness {
215        let expires_at = self.stored_at_unix_secs.saturating_add(self.max_age_secs);
216        if now_unix_secs <= expires_at {
217            ArtifactFreshness::Fresh
218        } else {
219            ArtifactFreshness::Stale
220        }
221    }
222
223    /// Returns whether the artifact is fresh at `now_unix_secs`.
224    #[must_use]
225    pub fn is_fresh(&self, now_unix_secs: u64) -> bool {
226        self.freshness(now_unix_secs) == ArtifactFreshness::Fresh
227    }
228
229    /// Serializes the canonical artifact as deterministic text.
230    #[must_use]
231    pub fn to_text(&self) -> String {
232        format!(
233            "{INDEX_ARTIFACT_HEADER}\nversion: {}\ncontext: {}\ncanonical_url: {}\nfinal_url: {}\nstored_at_unix_secs: {}\nmax_age_secs: {}\n{ARTIFACT_CAPTURE_BEGIN}\n{}\
234\n{ARTIFACT_CAPTURE_END}\n",
235            self.version,
236            self.context.as_str(),
237            self.canonical_url,
238            self.final_url,
239            self.stored_at_unix_secs,
240            self.max_age_secs,
241            self.capture.to_text().trim_end()
242        )
243    }
244
245    /// Parses and validates an artifact.
246    pub fn from_text(input: &str) -> Result<Self, ArtifactStoreError> {
247        let mut lines = input.lines();
248        if lines.next() != Some(INDEX_ARTIFACT_HEADER) {
249            return Err(ArtifactStoreError::Parse(
250                "missing artifact header".to_owned(),
251            ));
252        }
253        let version = parse_artifact_u8_line(lines.next(), "version: ")?;
254        if version != INDEX_ARTIFACT_VERSION {
255            return Err(ArtifactStoreError::Parse(format!(
256                "unsupported artifact version: {version}"
257            )));
258        }
259        let context = ArtifactContext::parse(
260            &parse_prefixed_line(lines.next(), "context: ")
261                .map_err(|error| ArtifactStoreError::Parse(error.to_string()))?,
262        )
263        .map_err(ArtifactStoreError::Parse)?;
264        let canonical_url = parse_prefixed_line(lines.next(), "canonical_url: ")
265            .map_err(|error| ArtifactStoreError::Parse(error.to_string()))?;
266        let final_url = parse_prefixed_line(lines.next(), "final_url: ")
267            .map_err(|error| ArtifactStoreError::Parse(error.to_string()))?;
268        let stored_at_unix_secs = parse_artifact_u64_line(lines.next(), "stored_at_unix_secs: ")?;
269        let max_age_secs = parse_artifact_u64_line(lines.next(), "max_age_secs: ")?;
270        if lines.next() != Some(ARTIFACT_CAPTURE_BEGIN) {
271            return Err(ArtifactStoreError::Parse(
272                "missing capture section".to_owned(),
273            ));
274        }
275        let mut capture_lines = Vec::new();
276        for line in &mut lines {
277            if line == ARTIFACT_CAPTURE_END {
278                let capture_text = capture_lines.join("\n");
279                let capture =
280                    validate_capture_bundle(&capture_text).map_err(ArtifactStoreError::Capture)?;
281                IndexUrl::parse(canonical_url.replace(REDACTED, "redacted"))
282                    .map_err(ArtifactStoreError::Url)?;
283                IndexUrl::parse(final_url.replace(REDACTED, "redacted"))
284                    .map_err(ArtifactStoreError::Url)?;
285                return Ok(Self {
286                    version,
287                    canonical_url,
288                    final_url,
289                    context,
290                    stored_at_unix_secs,
291                    max_age_secs,
292                    capture,
293                });
294            }
295            capture_lines.push(line.to_owned());
296        }
297        Err(ArtifactStoreError::Parse(
298            "unterminated capture section".to_owned(),
299        ))
300    }
301}
302
303/// Filesystem-backed canonical artifact store.
304#[derive(Debug, Clone, PartialEq, Eq)]
305pub struct ArtifactStore {
306    root: PathBuf,
307}
308
309impl ArtifactStore {
310    /// Creates a store rooted at `root`.
311    #[must_use]
312    pub fn new(root: impl Into<PathBuf>) -> Self {
313        Self { root: root.into() }
314    }
315
316    /// Root directory used by the store.
317    #[must_use]
318    pub fn root(&self) -> &Path {
319        &self.root
320    }
321
322    /// Deterministic path for a canonical URL/context key.
323    #[must_use]
324    pub fn path_for(&self, canonical_url: &IndexUrl, context: ArtifactContext) -> PathBuf {
325        self.root.join(format!(
326            "{}.{}.idx",
327            canonical_url.cache_key(),
328            context.as_str()
329        ))
330    }
331
332    /// Stores an artifact under its canonical URL/context key.
333    pub fn store(&self, artifact: &IndexArtifact) -> Result<PathBuf, ArtifactStoreError> {
334        fs::create_dir_all(&self.root)?;
335        let canonical_url = IndexUrl::parse(artifact.canonical_url.replace(REDACTED, "redacted"))
336            .map_err(ArtifactStoreError::Url)?;
337        let path = self.path_for(&canonical_url, artifact.context);
338        fs::write(&path, artifact.to_text())?;
339        Ok(path)
340    }
341
342    /// Loads an artifact for a canonical URL/context key.
343    pub fn load(
344        &self,
345        canonical_url: &IndexUrl,
346        context: ArtifactContext,
347    ) -> Result<Option<IndexArtifact>, ArtifactStoreError> {
348        let path = self.path_for(canonical_url, context);
349        if !path.exists() {
350            return Ok(None);
351        }
352        let contents = fs::read_to_string(path)?;
353        let artifact = IndexArtifact::from_text(&contents)?;
354        Ok(Some(artifact))
355    }
356}
357
/// Per-section counts of values removed from a capture by local redaction.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct RedactionSummary {
    /// Credential-shaped source URL values were redacted.
    pub source_url_values: usize,
    /// Credential-shaped HTML text or attributes were redacted.
    pub html_values: usize,
    /// Credential-shaped diagnostic values were redacted.
    pub diagnostic_values: usize,
}

impl RedactionSummary {
    /// Sum of the three per-section counts.
    #[must_use]
    pub const fn total(&self) -> usize {
        let Self {
            source_url_values,
            html_values,
            diagnostic_values,
        } = self;
        *source_url_values + *html_values + *diagnostic_values
    }

    /// Serializes the summary as deterministic text, one `name: count` pair
    /// per line after a version header and with no trailing newline.
    #[must_use]
    pub fn to_text(&self) -> String {
        let Self {
            source_url_values,
            html_values,
            diagnostic_values,
        } = self;
        let total = self.total();
        format!(
            "redaction-summary-v1\nsource_url_values: {}\nhtml_values: {}\ndiagnostic_values: {}\ntotal: {}",
            source_url_values, html_values, diagnostic_values, total
        )
    }
}
388
/// Local preview for reviewing a capture before sharing.
///
/// Bundles the redacted artifact with a summary of what was redacted and the
/// checklist text a reviewer should work through.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapturePreview {
    /// Redacted artifact.
    pub artifact: CaptureArtifact,
    /// Redaction summary.
    pub summary: RedactionSummary,
    /// Fixture submission checklist.
    pub checklist: String,
}
399
/// Review bundle combining capture preview, repair hints, and catalog guidance.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptureReviewBundle {
    /// Local capture preview.
    pub preview: CapturePreview,
    /// Serialized repair recipe or review hints; may be empty when there are
    /// none (the field itself is always present).
    pub repair_hints: String,
    /// Suggested coverage catalog entry.
    pub catalog_entry: String,
}
410
411impl CapturePreview {
412    /// Serializes the preview without uploading or fetching anything.
413    #[must_use]
414    pub fn to_text(&self) -> String {
415        format!(
416            "index-capture-preview-v1\n{}\n\n{}\n\n{}",
417            self.summary.to_text(),
418            self.checklist,
419            self.artifact.to_text()
420        )
421    }
422}
423
424impl CaptureReviewBundle {
425    /// Serializes a local-only review bundle.
426    #[must_use]
427    pub fn to_text(&self) -> String {
428        format!(
429            "index-capture-review-bundle-v1\n{PREVIEW_BEGIN}\n{}\n{PREVIEW_END}\n{REPAIR_BEGIN}\n{}\n{REPAIR_END}\ncatalog_entry: {}\n",
430            self.preview.to_text(),
431            self.repair_hints,
432            self.catalog_entry
433        )
434    }
435
436    /// Parses a serialized review bundle and validates the embedded artifact.
437    pub fn from_text(input: &str) -> Result<Self, CaptureError> {
438        let mut lines = input.lines();
439        if lines.next() != Some("index-capture-review-bundle-v1") {
440            return Err(CaptureError::InvalidArtifact(
441                "missing review bundle header".to_owned(),
442            ));
443        }
444        if lines.next() != Some(PREVIEW_BEGIN) {
445            return Err(CaptureError::InvalidArtifact(
446                "missing capture preview section".to_owned(),
447            ));
448        }
449        let mut preview_lines = Vec::new();
450        for line in &mut lines {
451            if line == PREVIEW_END {
452                break;
453            }
454            preview_lines.push(line.to_owned());
455        }
456        if lines.next() != Some(REPAIR_BEGIN) {
457            return Err(CaptureError::InvalidArtifact(
458                "missing repair hint section".to_owned(),
459            ));
460        }
461        let mut repair_lines = Vec::new();
462        for line in &mut lines {
463            if line == REPAIR_END {
464                break;
465            }
466            repair_lines.push(line.to_owned());
467        }
468        let catalog_entry = parse_prefixed_line(lines.next(), "catalog_entry: ")?;
469        let preview = parse_preview_from_text(&preview_lines.join("\n"))?;
470        Ok(Self {
471            preview,
472            repair_hints: repair_lines.join("\n"),
473            catalog_entry,
474        })
475    }
476}
477
478impl CaptureArtifact {
479    /// Serializes the artifact to deterministic text.
480    #[must_use]
481    pub fn to_text(&self) -> String {
482        let diagnostic = self.diagnostic.as_deref().unwrap_or("none");
483        format!(
484            "{ARTIFACT_HEADER}\nsource_url: {}\nreproduce: {}\n{HTML_BEGIN}\n{}\n{HTML_END}\n{DIAGNOSTIC_BEGIN}\n{}\n{DIAGNOSTIC_END}\n",
485            self.source_url, self.reproduction_command, self.redacted_html, diagnostic
486        )
487    }
488
489    /// Returns a deterministic fixture submission checklist for this artifact.
490    #[must_use]
491    pub fn submission_checklist(&self) -> String {
492        format!(
493            "fixture-submission-checklist-v1\nsource_url: {}\nreproduce: {}\n[ ] confirm the URL is public or rewritten to a public equivalent\n[ ] confirm no cookies, credentials, account identifiers, private messages, or private URLs remain\n[ ] reduce HTML to the smallest shape that reproduces the behavior\n[ ] classify intent and support tier\n[ ] add or update regression tests\n[ ] record the fixture in docs/COVERAGE_CATALOG.md",
494            self.source_url, self.reproduction_command
495        )
496    }
497
498    /// Validates that this local-only bundle can be reviewed safely.
499    pub fn validate_bundle(&self) -> Result<(), CaptureError> {
500        let parseable_source_url = self.source_url.replace(REDACTED, "redacted");
501        IndexUrl::parse(&parseable_source_url).map_err(CaptureError::InvalidSourceUrl)?;
502        if self.redacted_html.trim().is_empty() {
503            return Err(CaptureError::InvalidArtifact(
504                "redacted HTML section is empty".to_owned(),
505            ));
506        }
507        if contains_unredacted_sensitive_pair(&self.source_url)
508            || contains_unredacted_sensitive_pair(&self.redacted_html)
509            || self
510                .diagnostic
511                .as_deref()
512                .is_some_and(contains_unredacted_sensitive_pair)
513        {
514            return Err(CaptureError::InvalidArtifact(
515                "artifact contains unredacted credential-shaped content".to_owned(),
516            ));
517        }
518        Ok(())
519    }
520
521    /// Parses a serialized capture artifact.
522    pub fn from_text(input: &str) -> Result<Self, CaptureError> {
523        let mut lines = input.lines();
524        if lines.next() != Some(ARTIFACT_HEADER) {
525            return Err(CaptureError::InvalidArtifact("missing header".to_owned()));
526        }
527
528        let source_url = parse_prefixed_line(lines.next(), "source_url: ")?;
529        let reproduction_command = parse_prefixed_line(lines.next(), "reproduce: ")?;
530        if lines.next() != Some(HTML_BEGIN) {
531            return Err(CaptureError::InvalidArtifact(
532                "missing redacted HTML section".to_owned(),
533            ));
534        }
535
536        let mut redacted_html = Vec::new();
537        for line in &mut lines {
538            if line == HTML_END {
539                break;
540            }
541            redacted_html.push(line.to_owned());
542        }
543
544        if lines.next() != Some(DIAGNOSTIC_BEGIN) {
545            return Err(CaptureError::InvalidArtifact(
546                "missing diagnostic section".to_owned(),
547            ));
548        }
549
550        let mut diagnostic = Vec::new();
551        for line in &mut lines {
552            if line == DIAGNOSTIC_END {
553                let diagnostic = diagnostic.join("\n");
554                let diagnostic = if diagnostic == "none" {
555                    None
556                } else {
557                    Some(diagnostic)
558                };
559                return Ok(Self {
560                    source_url,
561                    redacted_html: redacted_html.join("\n"),
562                    diagnostic,
563                    reproduction_command,
564                });
565            }
566            diagnostic.push(line.to_owned());
567        }
568
569        Err(CaptureError::InvalidArtifact(
570            "unterminated diagnostic section".to_owned(),
571        ))
572    }
573}
574
575/// Creates a redacted local capture artifact.
576pub fn capture_redacted(request: &CaptureRequest) -> CaptureArtifact {
577    let mut redactor = Redactor::new();
578    add_html_secret_values(&request.html, &mut redactor);
579    if let Some(diagnostic) = &request.diagnostic {
580        add_query_secret_values(diagnostic, &mut redactor);
581    }
582    add_query_secret_values(request.source_url.as_str(), &mut redactor);
583
584    let source_url = redact_sensitive_pairs(&redactor.redact(request.source_url.as_str()));
585    let redacted_html = redact_html(&request.html, &redactor);
586    let diagnostic = request
587        .diagnostic
588        .as_ref()
589        .map(|diagnostic| redact_sensitive_pairs(&redactor.redact(diagnostic)));
590
591    CaptureArtifact {
592        reproduction_command: format!("index capture --redact {source_url} - < local-page.html"),
593        source_url,
594        redacted_html,
595        diagnostic,
596    }
597}
598
599/// Creates a local capture preview with summary and checklist.
600pub fn preview_redacted(request: &CaptureRequest) -> CapturePreview {
601    let artifact = capture_redacted(request);
602    let summary = summarize_redactions(request, &artifact);
603    let checklist = artifact.submission_checklist();
604    CapturePreview {
605        artifact,
606        summary,
607        checklist,
608    }
609}
610
611/// Creates a local capture review bundle for fixture intake.
612pub fn capture_review_bundle(
613    request: &CaptureRequest,
614    fixture_path: &str,
615    repair_hints: impl Into<String>,
616) -> Result<CaptureReviewBundle, CaptureError> {
617    let preview = preview_redacted(request);
618    let catalog_entry = catalog_entry_for_fixture(fixture_path, "unknown", 0)?;
619    Ok(CaptureReviewBundle {
620        preview,
621        repair_hints: repair_hints.into(),
622        catalog_entry,
623    })
624}
625
626/// Creates a coverage catalog entry for an accepted local fixture.
627pub fn catalog_entry_for_fixture(
628    fixture_path: &str,
629    intent: &str,
630    tier: u8,
631) -> Result<String, CaptureError> {
632    if !Path::new(fixture_path).exists() {
633        return Err(CaptureError::InvalidArtifact(format!(
634            "fixture path does not exist: {fixture_path}"
635        )));
636    }
637    Ok(format!(
638        "| `{fixture_path}` | {intent} | Tier {tier} | capture | review private data before submission |"
639    ))
640}
641
642/// Creates a redacted capture artifact from the current semantic document.
643///
644/// This is used by the TUI when the original local HTML is not retained. The
645/// artifact remains deterministic and local-only, while still round-tripping
646/// through the normal capture validation and batch extraction paths.
647pub fn capture_document(document: &IndexDocument) -> Result<CaptureArtifact, CaptureError> {
648    Ok(capture_redacted(&document_capture_request(document)?))
649}
650
651/// Creates a local capture preview from the current semantic document.
652pub fn preview_document(document: &IndexDocument) -> Result<CapturePreview, CaptureError> {
653    Ok(preview_redacted(&document_capture_request(document)?))
654}
655
656fn document_capture_request(document: &IndexDocument) -> Result<CaptureRequest, CaptureError> {
657    let source_url = document
658        .metadata
659        .canonical_url
660        .as_deref()
661        .unwrap_or("https://index.local/current");
662    let source_url = if IndexUrl::parse(source_url).is_ok() {
663        source_url
664    } else {
665        "https://index.local/current"
666    };
667    CaptureRequest::new(source_url, document_to_html(document)).map(|request| {
668        request.with_diagnostic(format!(
669            "captured from current Index document: title={}",
670            document.title
671        ))
672    })
673}
674
675/// Validates a serialized local capture bundle.
676pub fn validate_capture_bundle(input: &str) -> Result<CaptureArtifact, CaptureError> {
677    let artifact = CaptureArtifact::from_text(input)?;
678    artifact.validate_bundle()?;
679    Ok(artifact)
680}
681
682fn parse_preview_from_text(input: &str) -> Result<CapturePreview, CaptureError> {
683    if !input.starts_with("index-capture-preview-v1\n") {
684        return Err(CaptureError::InvalidArtifact(
685            "missing capture preview header".to_owned(),
686        ));
687    }
688    let artifact_start = input
689        .find(ARTIFACT_HEADER)
690        .ok_or_else(|| CaptureError::InvalidArtifact("missing embedded artifact".to_owned()))?;
691    let artifact = validate_capture_bundle(&input[artifact_start..])?;
692    let checklist = artifact.submission_checklist();
693    Ok(CapturePreview {
694        artifact,
695        summary: RedactionSummary::default(),
696        checklist,
697    })
698}
699
700fn document_to_html(document: &IndexDocument) -> String {
701    let mut html = String::from("<!doctype html><html><head><meta charset=\"utf-8\"><title>");
702    html.push_str(&escape_html(&document.title));
703    html.push_str("</title></head><body><main>");
704    for node in &document.nodes {
705        push_node_html(node, &mut html);
706    }
707    html.push_str("</main></body></html>");
708    html
709}
710
711fn push_node_html(node: &IndexNode, output: &mut String) {
712    match node {
713        IndexNode::Heading { level, text } => {
714            let level = (*level).clamp(1, 6);
715            output.push_str(&format!("<h{level}>"));
716            output.push_str(&escape_html(text));
717            output.push_str(&format!("</h{level}>"));
718        }
719        IndexNode::Paragraph(text) => {
720            output.push_str("<p>");
721            output.push_str(&escape_html(text));
722            output.push_str("</p>");
723        }
724        IndexNode::Link(link) => {
725            output.push_str("<p><a href=\"");
726            output.push_str(&escape_html(&link.href));
727            output.push_str("\">");
728            output.push_str(&escape_html(&link.text));
729            output.push_str("</a></p>");
730        }
731        IndexNode::List { ordered, items } => {
732            let tag = if *ordered { "ol" } else { "ul" };
733            output.push_str(&format!("<{tag}>"));
734            for item in items {
735                output.push_str("<li>");
736                output.push_str(&escape_html(item));
737                output.push_str("</li>");
738            }
739            output.push_str(&format!("</{tag}>"));
740        }
741        IndexNode::CodeBlock { language, code } => {
742            output.push_str("<pre><code");
743            if let Some(language) = language {
744                output.push_str(" class=\"language-");
745                output.push_str(&escape_html(language));
746                output.push('"');
747            }
748            output.push('>');
749            output.push_str(&escape_html(code));
750            output.push_str("</code></pre>");
751        }
752        IndexNode::Table { rows } => {
753            output.push_str("<table>");
754            for row in rows {
755                output.push_str("<tr>");
756                for cell in row {
757                    output.push_str("<td>");
758                    output.push_str(&escape_html(cell));
759                    output.push_str("</td>");
760                }
761                output.push_str("</tr>");
762            }
763            output.push_str("</table>");
764        }
765        IndexNode::Spacer { .. } => {}
766        IndexNode::Section { title, nodes, .. } => {
767            output.push_str("<section>");
768            if let Some(title) = title {
769                output.push_str("<h2>");
770                output.push_str(&escape_html(title));
771                output.push_str("</h2>");
772            }
773            for node in nodes {
774                push_node_html(node, output);
775            }
776            output.push_str("</section>");
777        }
778        IndexNode::Image { alt, src } => {
779            output.push_str("<img alt=\"");
780            output.push_str(&escape_html(alt));
781            output.push('"');
782            if let Some(src) = src {
783                output.push_str(" src=\"");
784                output.push_str(&escape_html(src));
785                output.push('"');
786            }
787            output.push('>');
788        }
789        IndexNode::Form(form) => {
790            output.push_str("<form action=\"");
791            output.push_str(&escape_html(&form.action));
792            output.push_str("\" method=\"");
793            output.push_str(form.method.as_str());
794            output.push_str("\"><p>");
795            output.push_str(&escape_html(&form.name));
796            output.push_str("</p></form>");
797        }
798        IndexNode::Error(error) => {
799            output.push_str("<p data-index-error=\"true\">");
800            output.push_str(&escape_html(error));
801            output.push_str("</p>");
802        }
803    }
804}
805
/// Escapes `&`, `<`, `>`, and `"` as HTML entities; every other character is
/// copied unchanged.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for character in input.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
813
814fn summarize_redactions(request: &CaptureRequest, artifact: &CaptureArtifact) -> RedactionSummary {
815    RedactionSummary {
816        source_url_values: redaction_delta(request.source_url.as_str(), &artifact.source_url),
817        html_values: redaction_delta(&request.html, &artifact.redacted_html),
818        diagnostic_values: request
819            .diagnostic
820            .as_deref()
821            .zip(artifact.diagnostic.as_deref())
822            .map_or(0, |(before, after)| redaction_delta(before, after)),
823    }
824}
825
826fn redaction_delta(before: &str, after: &str) -> usize {
827    let before_count = before.matches(REDACTED).count();
828    let after_count = after.matches(REDACTED).count();
829    after_count.saturating_sub(before_count)
830}
831
832fn parse_prefixed_line(line: Option<&str>, prefix: &str) -> Result<String, CaptureError> {
833    let Some(line) = line else {
834        return Err(CaptureError::InvalidArtifact(format!(
835            "missing {prefix} line"
836        )));
837    };
838    let Some(value) = line.strip_prefix(prefix) else {
839        return Err(CaptureError::InvalidArtifact(format!(
840            "invalid {prefix} line"
841        )));
842    };
843    Ok(value.to_owned())
844}
845
846fn parse_artifact_u64_line(line: Option<&str>, prefix: &str) -> Result<u64, ArtifactStoreError> {
847    let line = line.ok_or_else(|| ArtifactStoreError::Parse(format!("missing {prefix} line")))?;
848    let value = line
849        .strip_prefix(prefix)
850        .ok_or_else(|| ArtifactStoreError::Parse(format!("invalid {prefix} line")))?;
851    value.parse::<u64>().map_err(|error| {
852        ArtifactStoreError::Parse(format!("failed to parse {prefix} value as u64: {error}"))
853    })
854}
855
856fn parse_artifact_u8_line(line: Option<&str>, prefix: &str) -> Result<u8, ArtifactStoreError> {
857    let line = line.ok_or_else(|| ArtifactStoreError::Parse(format!("missing {prefix} line")))?;
858    let value = line
859        .strip_prefix(prefix)
860        .ok_or_else(|| ArtifactStoreError::Parse(format!("invalid {prefix} line")))?;
861    value.parse::<u8>().map_err(|error| {
862        ArtifactStoreError::Parse(format!("failed to parse {prefix} value as u8: {error}"))
863    })
864}
865
866fn redact_html(input: &str, redactor: &Redactor) -> String {
867    let mut output = redact_sensitive_pairs(&redactor.redact(input));
868    output = redact_sensitive_attributes(&output);
869    output
870}
871
872fn add_html_secret_values(input: &str, redactor: &mut Redactor) {
873    add_query_secret_values(input, redactor);
874    let bytes = input.as_bytes();
875    let mut index = 0;
876    while index < bytes.len() {
877        let Some(name_start) = find_ascii_case_insensitive(&input[index..], "name=") else {
878            break;
879        };
880        let absolute_name_start = index + name_start + "name=".len();
881        let Some((name, after_name)) = read_quoted_value(input, absolute_name_start) else {
882            index = absolute_name_start;
883            continue;
884        };
885        if !is_sensitive_key(&name) {
886            index = after_name;
887            continue;
888        }
889
890        if let Some(value_start) = find_ascii_case_insensitive(&input[after_name..], "value=") {
891            let absolute_value_start = after_name + value_start + "value=".len();
892            if let Some((value, after_value)) = read_quoted_value(input, absolute_value_start) {
893                redactor.add_secret(value);
894                index = after_value;
895                continue;
896            }
897        }
898        index = after_name;
899    }
900}
901
902fn add_query_secret_values(input: &str, redactor: &mut Redactor) {
903    for marker in ["=", "%3D", "%3d"] {
904        let mut search_start = 0;
905        while let Some(relative_position) = input[search_start..].find(marker) {
906            let marker_position = search_start + relative_position;
907            let key_start = input[..marker_position]
908                .rfind(|ch: char| !is_key_char(ch))
909                .map_or(0, |position| position + 1);
910            let key = &input[key_start..marker_position];
911            if !is_sensitive_key(key) {
912                search_start = marker_position + marker.len();
913                continue;
914            }
915
916            let value_start = marker_position + marker.len();
917            let value_end = input[value_start..]
918                .find(is_value_delimiter)
919                .map_or(input.len(), |position| value_start + position);
920            if value_end > value_start {
921                redactor.add_secret(&input[value_start..value_end]);
922            }
923            search_start = value_end;
924        }
925    }
926}
927
928fn redact_sensitive_pairs(input: &str) -> String {
929    let mut output = String::with_capacity(input.len());
930    let mut index = 0;
931    while index < input.len() {
932        let Some(eq_relative) = input[index..].find('=') else {
933            output.push_str(&input[index..]);
934            break;
935        };
936
937        let eq_position = index + eq_relative;
938        let key_start = input[..eq_position]
939            .rfind(|ch: char| !is_key_char(ch))
940            .map_or(0, |position| position + 1);
941        let key = &input[key_start..eq_position];
942        if !is_sensitive_key(key) {
943            output.push_str(&input[index..=eq_position]);
944            index = eq_position + 1;
945            continue;
946        }
947
948        output.push_str(&input[index..eq_position + 1]);
949        let value_start = eq_position + 1;
950        let value_end = input[value_start..]
951            .find(is_value_delimiter)
952            .map_or(input.len(), |position| value_start + position);
953        output.push_str(REDACTED);
954        index = value_end;
955    }
956    output
957}
958
959fn redact_sensitive_attributes(input: &str) -> String {
960    let mut output = String::with_capacity(input.len());
961    let mut index = 0;
962    while index < input.len() {
963        let Some(relative_value) = find_ascii_case_insensitive(&input[index..], "value=") else {
964            output.push_str(&input[index..]);
965            break;
966        };
967        let absolute_value = index + relative_value;
968        output.push_str(&input[index..absolute_value]);
969        output.push_str("value=");
970
971        let value_start = absolute_value + "value=".len();
972        let Some((value, after_value, quote)) = read_quoted_value_with_quote(input, value_start)
973        else {
974            index = value_start;
975            continue;
976        };
977
978        let nearby_start = input[..absolute_value].rfind('<').map_or(index, |pos| pos);
979        let nearby = &input[nearby_start..absolute_value];
980        if find_ascii_case_insensitive(nearby, "password").is_some()
981            || find_ascii_case_insensitive(nearby, "token").is_some()
982            || find_ascii_case_insensitive(nearby, "secret").is_some()
983            || find_ascii_case_insensitive(nearby, "cookie").is_some()
984            || find_ascii_case_insensitive(nearby, "session").is_some()
985        {
986            output.push(quote);
987            output.push_str(REDACTED);
988            output.push(quote);
989        } else {
990            output.push(quote);
991            output.push_str(&value);
992            output.push(quote);
993        }
994        index = after_value;
995    }
996    output
997}
998
999fn read_quoted_value(input: &str, start: usize) -> Option<(String, usize)> {
1000    read_quoted_value_with_quote(input, start).map(|(value, after, _quote)| (value, after))
1001}
1002
/// Reads a single- or double-quoted value that opens at byte offset `start`.
///
/// Returns the text between the quotes, the byte offset just past the closing
/// quote, and the quote character used. Returns `None` when `input[start..]` does
/// not begin with `"` or `'`, or when no matching closing quote follows.
fn read_quoted_value_with_quote(input: &str, start: usize) -> Option<(String, usize, char)> {
    let mut rest = input[start..].chars();
    let quote = match rest.next()? {
        opener @ ('"' | '\'') => opener,
        _ => return None,
    };

    // Both accepted quote characters are one byte wide.
    let body_start = start + quote.len_utf8();
    let body = rest.as_str();
    let close = body.find(quote)?;
    let content = body[..close].to_owned();
    Some((content, body_start + close + quote.len_utf8(), quote))
}
1013
/// Finds the first byte offset at which `needle` occurs in `haystack`, comparing
/// ASCII letters without regard to case.
///
/// Returns `None` for an empty needle, a needle longer than the haystack, or when
/// there is no match.
fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let (hay, pattern) = (haystack.as_bytes(), needle.as_bytes());
    if pattern.is_empty() {
        return None;
    }
    // `None` here means the needle is longer than the haystack.
    let last_start = hay.len().checked_sub(pattern.len())?;
    (0..=last_start).find(|&start| hay[start..start + pattern.len()].eq_ignore_ascii_case(pattern))
}
1025
/// Reports whether `key` names a credential-like field.
///
/// Leading and trailing characters other than ASCII alphanumerics, `_`, and `-` are
/// ignored, and the comparison against the fixed key list is ASCII
/// case-insensitive. Only exact matches count.
fn is_sensitive_key(key: &str) -> bool {
    const SENSITIVE_KEYS: &[&str] = &[
        "authorization",
        "auth",
        "api_key",
        "api-key",
        "cookie",
        "csrf",
        "csrf_token",
        "key",
        "password",
        "passwd",
        "secret",
        "session",
        "sessionid",
        "sid",
        "token",
        "access_token",
        "refresh_token",
    ];

    let normalized = key
        .trim_matches(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_' || ch == '-'))
        .to_ascii_lowercase();
    SENSITIVE_KEYS.contains(&normalized.as_str())
}
1050
/// Reports whether `ch` may appear in a key: an ASCII letter or digit, `_`, or `-`.
fn is_key_char(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-')
}
1054
/// Reports whether `ch` ends an unquoted value: `&`, a quote, an angle bracket,
/// ASCII space/tab/CR/LF, or `;`.
fn is_value_delimiter(ch: char) -> bool {
    const DELIMITERS: [char; 10] = ['&', '"', '\'', '<', '>', ' ', '\t', '\r', '\n', ';'];
    DELIMITERS.contains(&ch)
}
1061
1062fn contains_unredacted_sensitive_pair(input: &str) -> bool {
1063    let mut index = 0;
1064    while index < input.len() {
1065        let Some(eq_relative) = input[index..].find('=') else {
1066            return false;
1067        };
1068        let eq_position = index + eq_relative;
1069        let key_start = input[..eq_position]
1070            .rfind(|ch: char| !is_key_char(ch))
1071            .map_or(0, |position| position + 1);
1072        let key = &input[key_start..eq_position];
1073        let value_start = eq_position + 1;
1074        let value_end = input[value_start..]
1075            .find(is_value_delimiter)
1076            .map_or(input.len(), |position| value_start + position);
1077        let value = &input[value_start..value_end];
1078        if is_sensitive_key(key) && !value.is_empty() && value != REDACTED {
1079            return true;
1080        }
1081        index = value_end.saturating_add(1);
1082    }
1083    false
1084}
1085
1086#[cfg(test)]
1087mod tests {
1088    use std::fs;
1089    use std::path::PathBuf;
1090    use std::time::{SystemTime, UNIX_EPOCH};
1091
1092    use index_core::{Form, IndexDocument, IndexNode, IndexUrl, Input, Link, SectionRole};
1093
1094    use super::{
1095        ArtifactContext, ArtifactFreshness, ArtifactStore, CaptureArtifact, CaptureError,
1096        CaptureRequest, CaptureReviewBundle, IndexArtifact, capture_document, capture_redacted,
1097        capture_review_bundle, catalog_entry_for_fixture, preview_document, preview_redacted,
1098        validate_capture_bundle,
1099    };
1100
1101    fn temp_artifact_dir(label: &str) -> PathBuf {
1102        let nanos = SystemTime::now()
1103            .duration_since(UNIX_EPOCH)
1104            .map_or(0, |duration| duration.as_nanos());
1105        std::env::temp_dir().join(format!("index-artifacts-{label}-{nanos}"))
1106    }
1107
1108    #[test]
1109    fn capture_redacts_credentials_cookies_and_private_fields()
1110    -> Result<(), Box<dyn std::error::Error>> {
1111        let request = CaptureRequest::new(
1112            "https://example.org/private?token=url-secret&topic=docs",
1113            r#"<html>
1114                <a href="/search?password=link-secret&q=docs">Search</a>
1115                <form action="/login?session=form-secret">
1116                    <input name="password" value="field-secret">
1117                    <input name="q" value="public">
1118                </form>
1119                <p>Cookie: sid=cookie-secret Authorization: Bearer bearer-secret</p>
1120            </html>"#,
1121        )?;
1122
1123        let artifact = capture_redacted(&request);
1124        let text = artifact.to_text();
1125
1126        for secret in [
1127            "url-secret",
1128            "link-secret",
1129            "form-secret",
1130            "field-secret",
1131            "cookie-secret",
1132            "bearer-secret",
1133        ] {
1134            assert!(!text.contains(secret), "leaked {secret}");
1135        }
1136        assert!(text.contains("[REDACTED]"));
1137        assert!(text.contains("topic=docs"));
1138        assert!(text.contains("value=\"public\""));
1139        Ok(())
1140    }
1141
1142    #[test]
1143    fn capture_artifact_roundtrips_deterministically() -> Result<(), Box<dyn std::error::Error>> {
1144        let request = CaptureRequest::new("https://example.org/docs", "<main>Docs</main>")?
1145            .with_diagnostic("token=diagnostic-secret path=/tmp/index");
1146        let artifact = capture_redacted(&request);
1147        let text = artifact.to_text();
1148        let parsed = CaptureArtifact::from_text(&text)?;
1149
1150        assert_eq!(parsed, artifact);
1151        assert!(!text.contains("diagnostic-secret"));
1152        assert!(text.contains("path=/tmp/index"));
1153        Ok(())
1154    }
1155
1156    #[test]
1157    fn capture_preview_reports_summary_and_checklist() -> Result<(), Box<dyn std::error::Error>> {
1158        let request = CaptureRequest::new(
1159            "https://example.org/page?token=url-secret",
1160            r#"<input name="password" value="field-secret"><p>public</p>"#,
1161        )?
1162        .with_diagnostic("session=diagnostic-secret");
1163        let preview = preview_redacted(&request);
1164        let text = preview.to_text();
1165
1166        assert!(text.contains("index-capture-preview-v1"));
1167        assert!(text.contains("redaction-summary-v1"));
1168        assert!(text.contains("fixture-submission-checklist-v1"));
1169        assert!(preview.summary.total() >= 3);
1170        assert!(!text.contains("url-secret"));
1171        assert!(!text.contains("field-secret"));
1172        assert!(!text.contains("diagnostic-secret"));
1173        Ok(())
1174    }
1175
1176    #[test]
1177    fn capture_review_bundle_roundtrips_with_repair_hints() -> Result<(), Box<dyn std::error::Error>>
1178    {
1179        let request = CaptureRequest::new(
1180            "https://example.org/private?token=url-secret",
1181            "<main><input name=\"password\" value=\"field-secret\"></main>",
1182        )?;
1183        let fixture_path = "../../examples/sample.html";
1184        let bundle = capture_review_bundle(&request, fixture_path, "index-repair-v1\nmain next")?;
1185        let text = bundle.to_text();
1186
1187        assert!(text.contains("index-capture-review-bundle-v1"));
1188        assert!(text.contains("index-repair-v1"));
1189        assert!(!text.contains("url-secret"));
1190        assert!(!text.contains("field-secret"));
1191
1192        let parsed = CaptureReviewBundle::from_text(&text)?;
1193        assert_eq!(parsed.repair_hints, "index-repair-v1\nmain next");
1194        assert!(parsed.catalog_entry.contains("examples/sample.html"));
1195        Ok(())
1196    }
1197
1198    #[test]
1199    fn catalog_entry_helper_rejects_missing_fixture_paths() {
1200        let result = catalog_entry_for_fixture("missing/not-here.html", "article", 1);
1201        assert!(
1202            matches!(result, Err(CaptureError::InvalidArtifact(reason)) if reason.contains("does not exist"))
1203        );
1204    }
1205
1206    #[test]
1207    fn capture_document_creates_valid_local_artifact() -> Result<(), Box<dyn std::error::Error>> {
1208        let mut document = IndexDocument::titled("Captured");
1209        document.metadata.canonical_url = Some("https://example.org/page?token=secret".to_owned());
1210        document.push(IndexNode::Heading {
1211            level: 2,
1212            text: "Main".to_owned(),
1213        });
1214        document.push(IndexNode::Paragraph("public text".to_owned()));
1215        document.push(IndexNode::Link(Link::new(
1216            "Docs",
1217            "https://example.org/docs",
1218        )));
1219
1220        let artifact = capture_document(&document)?;
1221        artifact.validate_bundle()?;
1222        assert!(artifact.redacted_html.contains("<main>"));
1223        assert!(artifact.redacted_html.contains("public text"));
1224        assert!(!artifact.to_text().contains("secret"));
1225
1226        let preview = preview_document(&document)?;
1227        assert!(preview.to_text().contains("index-capture-preview-v1"));
1228        Ok(())
1229    }
1230
1231    #[test]
1232    fn capture_document_projects_structured_nodes() -> Result<(), Box<dyn std::error::Error>> {
1233        let mut document = IndexDocument::titled("Structured <Capture>");
1234        document.push(IndexNode::List {
1235            ordered: true,
1236            items: vec!["one".to_owned(), "two".to_owned()],
1237        });
1238        document.push(IndexNode::List {
1239            ordered: false,
1240            items: vec!["plain".to_owned()],
1241        });
1242        document.push(IndexNode::CodeBlock {
1243            language: Some("rust".to_owned()),
1244            code: "fn main() { println!(\"hi\"); }".to_owned(),
1245        });
1246        document.push(IndexNode::CodeBlock {
1247            language: None,
1248            code: "<raw>".to_owned(),
1249        });
1250        document.push(IndexNode::Table {
1251            rows: vec![
1252                vec!["Name".to_owned(), "Value".to_owned()],
1253                vec!["A".to_owned(), "1".to_owned()],
1254            ],
1255        });
1256        document.push(IndexNode::Spacer { lines: 2 });
1257        document.push(IndexNode::Section {
1258            role: SectionRole::Main,
1259            title: Some("Body".to_owned()),
1260            collapsed: false,
1261            nodes: vec![IndexNode::Paragraph("inside".to_owned())],
1262        });
1263        document.push(IndexNode::Image {
1264            alt: "diagram".to_owned(),
1265            src: Some("https://example.org/image.png".to_owned()),
1266        });
1267        document.push(IndexNode::Image {
1268            alt: "missing".to_owned(),
1269            src: None,
1270        });
1271        document.push(IndexNode::Form(Form {
1272            name: "Search".to_owned(),
1273            method: "GET".to_owned(),
1274            action: "https://example.org/search".to_owned(),
1275            inputs: vec![Input {
1276                name: "q".to_owned(),
1277                kind: "text".to_owned(),
1278                value: None,
1279                required: false,
1280            }],
1281            buttons: Vec::new(),
1282        }));
1283        document.push(IndexNode::Error("could not parse sidebar".to_owned()));
1284
1285        let artifact = capture_document(&document)?;
1286        let html = artifact.redacted_html;
1287        assert!(html.contains("&lt;Capture&gt;"));
1288        assert!(html.contains("<ol><li>one</li><li>two</li></ol>"));
1289        assert!(html.contains("<ul><li>plain</li></ul>"));
1290        assert!(html.contains("class=\"language-rust\""));
1291        assert!(html.contains("&lt;raw&gt;"));
1292        assert!(html.contains("<table><tr><td>Name</td><td>Value</td></tr>"));
1293        assert!(html.contains("<section><h2>Body</h2><p>inside</p></section>"));
1294        assert!(html.contains("<img alt=\"diagram\" src=\"https://example.org/image.png\">"));
1295        assert!(html.contains("<img alt=\"missing\">"));
1296        assert!(html.contains("<form action=\"https://example.org/search\" method=\"GET\">"));
1297        assert!(html.contains("data-index-error=\"true\""));
1298        Ok(())
1299    }
1300
1301    #[test]
1302    fn capture_document_falls_back_from_invalid_canonical_url()
1303    -> Result<(), Box<dyn std::error::Error>> {
1304        let mut document = IndexDocument::titled("Fallback");
1305        document.metadata.canonical_url = Some("javascript:alert(1)".to_owned());
1306        document.push(IndexNode::Paragraph("content".to_owned()));
1307
1308        let artifact = capture_document(&document)?;
1309
1310        assert_eq!(artifact.source_url, "https://index.local/current");
1311        assert!(artifact.redacted_html.contains("content"));
1312        Ok(())
1313    }
1314
1315    #[test]
1316    fn capture_bundle_validation_accepts_redacted_artifact()
1317    -> Result<(), Box<dyn std::error::Error>> {
1318        let artifact = capture_redacted(&CaptureRequest::new(
1319            "https://example.org/page?token=secret",
1320            "<main>Public</main>",
1321        )?);
1322        let parsed = validate_capture_bundle(&artifact.to_text())?;
1323
1324        assert_eq!(
1325            parsed.source_url,
1326            "https://example.org/page?token=[REDACTED]"
1327        );
1328        Ok(())
1329    }
1330
1331    #[test]
1332    fn capture_bundle_validation_rejects_unredacted_sensitive_pairs() {
1333        let input = "index-capture-v1\nsource_url: https://example.org/page?token=secret\nreproduce: index capture --redact https://example.org/page - < local-page.html\n---BEGIN REDACTED HTML---\n<main>Public</main>\n---END REDACTED HTML---\n---BEGIN DIAGNOSTIC---\nnone\n---END DIAGNOSTIC---\n";
1334        let result = validate_capture_bundle(input);
1335
1336        assert!(
1337            matches!(result, Err(CaptureError::InvalidArtifact(reason)) if reason.contains("unredacted"))
1338        );
1339    }
1340
1341    #[test]
1342    fn capture_rejects_unsafe_source_url() {
1343        let request = CaptureRequest::new("javascript:alert(1)", "<main>Bad</main>");
1344        assert!(matches!(request, Err(CaptureError::InvalidSourceUrl(_))));
1345    }
1346
1347    #[test]
1348    fn artifact_parser_rejects_missing_header() {
1349        let artifact = CaptureArtifact::from_text("source_url: https://example.org");
1350        assert!(matches!(
1351            artifact,
1352            Err(CaptureError::InvalidArtifact(reason)) if reason.contains("header")
1353        ));
1354    }
1355
1356    #[test]
1357    fn index_artifact_roundtrips_deterministically() -> Result<(), Box<dyn std::error::Error>> {
1358        let canonical = IndexUrl::parse("https://example.org/docs?token=secret")?;
1359        let final_url = IndexUrl::parse("https://example.org/docs")?;
1360        let mut document = IndexDocument::titled("Artifact");
1361        document.push(IndexNode::Heading {
1362            level: 1,
1363            text: "Title".to_owned(),
1364        });
1365        document.push(IndexNode::Paragraph("Body".to_owned()));
1366
1367        let artifact = IndexArtifact::from_document(
1368            &document,
1369            &canonical,
1370            &final_url,
1371            ArtifactContext::LiveGet,
1372            1234,
1373            300,
1374        )?;
1375        let text = artifact.to_text();
1376        let parsed = IndexArtifact::from_text(&text)?;
1377
1378        assert_eq!(parsed, artifact);
1379        assert!(text.contains("index-artifact-v1"));
1380        assert!(text.contains("context: live-get"));
1381        assert!(text.contains("token=[REDACTED]"));
1382        Ok(())
1383    }
1384
1385    #[test]
1386    fn index_artifact_freshness_transitions_are_deterministic()
1387    -> Result<(), Box<dyn std::error::Error>> {
1388        let canonical = IndexUrl::parse("https://example.org/docs")?;
1389        let final_url = IndexUrl::parse("https://example.org/docs")?;
1390        let mut document = IndexDocument::titled("Freshness");
1391        document.push(IndexNode::Paragraph("Body".to_owned()));
1392
1393        let artifact = IndexArtifact::from_document(
1394            &document,
1395            &canonical,
1396            &final_url,
1397            ArtifactContext::LiveGet,
1398            100,
1399            60,
1400        )?;
1401
1402        assert_eq!(artifact.freshness(120), ArtifactFreshness::Fresh);
1403        assert_eq!(artifact.freshness(161), ArtifactFreshness::Stale);
1404        assert!(artifact.is_fresh(120));
1405        assert!(!artifact.is_fresh(161));
1406        Ok(())
1407    }
1408
1409    #[test]
1410    fn artifact_store_keys_by_url_and_context() -> Result<(), Box<dyn std::error::Error>> {
1411        let root = temp_artifact_dir("store");
1412        let store = ArtifactStore::new(&root);
1413        let canonical = IndexUrl::parse("https://example.org/forum/thread/1")?;
1414        let final_url = IndexUrl::parse("https://example.org/forum/thread/1")?;
1415        let mut document = IndexDocument::titled("Thread");
1416        document.push(IndexNode::Paragraph("Payload".to_owned()));
1417
1418        let get_artifact = IndexArtifact::from_document(
1419            &document,
1420            &canonical,
1421            &final_url,
1422            ArtifactContext::LiveGet,
1423            10,
1424            600,
1425        )?;
1426        let submit_artifact = IndexArtifact::from_document(
1427            &document,
1428            &canonical,
1429            &final_url,
1430            ArtifactContext::LiveSubmit,
1431            10,
1432            600,
1433        )?;
1434
1435        let get_path = store.store(&get_artifact)?;
1436        let submit_path = store.store(&submit_artifact)?;
1437        assert_ne!(get_path, submit_path);
1438        assert!(get_path.exists());
1439        assert!(submit_path.exists());
1440
1441        let loaded_get = store
1442            .load(&canonical, ArtifactContext::LiveGet)?
1443            .ok_or("missing live-get artifact")?;
1444        let loaded_submit = store
1445            .load(&canonical, ArtifactContext::LiveSubmit)?
1446            .ok_or("missing live-submit artifact")?;
1447        assert_eq!(loaded_get.context, ArtifactContext::LiveGet);
1448        assert_eq!(loaded_submit.context, ArtifactContext::LiveSubmit);
1449
1450        if root.exists() {
1451            let _ = fs::remove_dir_all(root);
1452        }
1453        Ok(())
1454    }
1455
1456    #[test]
1457    fn artifact_schema_rejects_unsupported_versions() -> Result<(), Box<dyn std::error::Error>> {
1458        let canonical = IndexUrl::parse("https://example.org/docs")?;
1459        let final_url = IndexUrl::parse("https://example.org/docs")?;
1460        let mut document = IndexDocument::titled("Schema");
1461        document.push(IndexNode::Paragraph("Body".to_owned()));
1462        let artifact = IndexArtifact::from_document(
1463            &document,
1464            &canonical,
1465            &final_url,
1466            ArtifactContext::LiveGet,
1467            1,
1468            60,
1469        )?;
1470        let invalid = artifact.to_text().replace("version: 1", "version: 9");
1471        let parsed = IndexArtifact::from_text(&invalid);
1472        assert!(parsed.is_err());
1473        Ok(())
1474    }
1475}