use std::fmt::{Display, Formatter};
use std::fs;
use std::path::{Path, PathBuf};
use index_core::{IndexDocument, IndexNode, IndexUrl, Redactor, UrlError};
// First line of a serialized `CaptureArtifact`.
const ARTIFACT_HEADER: &str = "index-capture-v1";
// Delimiter lines that bracket the redacted HTML section of a capture artifact.
const HTML_BEGIN: &str = "---BEGIN REDACTED HTML---";
const HTML_END: &str = "---END REDACTED HTML---";
// Delimiter lines that bracket the diagnostic section of a capture artifact.
const DIAGNOSTIC_BEGIN: &str = "---BEGIN DIAGNOSTIC---";
const DIAGNOSTIC_END: &str = "---END DIAGNOSTIC---";
// Delimiter lines that bracket the preview section of a review bundle.
const PREVIEW_BEGIN: &str = "---BEGIN CAPTURE PREVIEW---";
const PREVIEW_END: &str = "---END CAPTURE PREVIEW---";
// Delimiter lines that bracket the repair-hint section of a review bundle.
const REPAIR_BEGIN: &str = "---BEGIN REPAIR HINTS---";
const REPAIR_END: &str = "---END REPAIR HINTS---";
// First line of a serialized `IndexArtifact`, and the only version `from_text` accepts.
const INDEX_ARTIFACT_HEADER: &str = "index-artifact-v1";
const INDEX_ARTIFACT_VERSION: u8 = 1;
// Delimiter lines that bracket the embedded capture artifact inside an `IndexArtifact`.
const ARTIFACT_CAPTURE_BEGIN: &str = "---BEGIN CAPTURE ARTIFACT---";
const ARTIFACT_CAPTURE_END: &str = "---END CAPTURE ARTIFACT---";
// Placeholder written in place of every redacted value.
const REDACTED: &str = "[REDACTED]";
/// Failures raised while building, parsing, or validating a capture artifact.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CaptureError {
/// `IndexUrl::parse` rejected the source URL; carries the parser's error.
InvalidSourceUrl(UrlError),
/// The artifact text or its contents failed a check; carries the reason.
InvalidArtifact(String),
}
impl Display for CaptureError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::InvalidSourceUrl(ref error) => {
                f.write_fmt(format_args!("capture source URL rejected: {error}"))
            }
            Self::InvalidArtifact(ref reason) => {
                f.write_fmt(format_args!("capture artifact is invalid: {reason}"))
            }
        }
    }
}
// Marker impl: lets `CaptureError` be used as a `std::error::Error` (e.g. boxed behind `dyn Error`).
impl std::error::Error for CaptureError {}
/// Failures raised while serializing, parsing, storing, or loading an `IndexArtifact`.
#[derive(Debug)]
pub enum ArtifactStoreError {
/// A filesystem operation failed.
Io(std::io::Error),
/// The stored artifact text is malformed; carries the reason.
Parse(String),
/// A URL recorded in the artifact was rejected by `IndexUrl::parse`.
Url(UrlError),
/// The embedded capture artifact failed parsing or validation.
Capture(CaptureError),
}
impl Display for ArtifactStoreError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::Io(ref error) => f.write_fmt(format_args!("artifact store IO failed: {error}")),
            Self::Parse(ref reason) => {
                f.write_fmt(format_args!("artifact store parse failed: {reason}"))
            }
            Self::Url(ref error) => f.write_fmt(format_args!("artifact store URL failed: {error}")),
            Self::Capture(ref error) => {
                f.write_fmt(format_args!("artifact store capture failed: {error}"))
            }
        }
    }
}
// Marker impl: lets `ArtifactStoreError` be used as a `std::error::Error`.
impl std::error::Error for ArtifactStoreError {}
impl From<std::io::Error> for ArtifactStoreError {
fn from(value: std::io::Error) -> Self {
Self::Io(value)
}
}
/// Raw, not-yet-redacted input for a capture.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptureRequest {
/// Parsed URL the HTML was obtained from.
pub source_url: IndexUrl,
/// Unredacted HTML to capture.
pub html: String,
/// Optional free-form diagnostic text; it is redacted together with the HTML.
pub diagnostic: Option<String>,
}
impl CaptureRequest {
pub fn new(source_url: impl AsRef<str>, html: impl Into<String>) -> Result<Self, CaptureError> {
let source_url = IndexUrl::parse(source_url).map_err(CaptureError::InvalidSourceUrl)?;
Ok(Self {
source_url,
html: html.into(),
diagnostic: None,
})
}
#[must_use]
pub fn with_diagnostic(mut self, diagnostic: impl Into<String>) -> Self {
self.diagnostic = Some(diagnostic.into());
self
}
}
/// Redacted capture produced from a `CaptureRequest`, serializable with `to_text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptureArtifact {
/// Source URL after redaction (sensitive values replaced by the placeholder).
pub source_url: String,
/// HTML after redaction.
pub redacted_html: String,
/// Diagnostic text after redaction; serialized as `none` when absent.
pub diagnostic: Option<String>,
/// Command line recorded for reproducing the capture.
pub reproduction_command: String,
}
/// Context an artifact was stored under; it is part of the on-disk file name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArtifactContext {
    LiveGet,
    LiveSubmit,
    Offline,
}

impl ArtifactContext {
    /// Returns the stable textual label used in file names and serialized artifacts.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Offline => "offline",
            Self::LiveSubmit => "live-submit",
            Self::LiveGet => "live-get",
        }
    }

    /// Parses a label produced by [`ArtifactContext::as_str`], ignoring surrounding whitespace.
    ///
    /// # Errors
    /// Returns a message naming the trimmed input when it matches no known label.
    pub fn parse(input: &str) -> Result<Self, String> {
        let label = input.trim();
        [Self::LiveGet, Self::LiveSubmit, Self::Offline]
            .iter()
            .copied()
            .find(|context| context.as_str() == label)
            .ok_or_else(|| format!("unsupported artifact context: {label}"))
    }
}
/// Result of comparing an artifact's age against its maximum age.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArtifactFreshness {
/// The current time is at or before the expiry time.
Fresh,
/// The current time is past the expiry time.
Stale,
}
/// A stored capture together with the metadata needed to look it up and expire it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexArtifact {
/// Format version; `from_text` only accepts `INDEX_ARTIFACT_VERSION`.
pub version: u8,
/// Canonical URL after redaction.
pub canonical_url: String,
/// Final URL after redaction.
pub final_url: String,
/// Context the artifact was stored under.
pub context: ArtifactContext,
/// Storage time in seconds (Unix epoch, per the field name).
pub stored_at_unix_secs: u64,
/// Number of seconds after `stored_at_unix_secs` during which the artifact counts as fresh.
pub max_age_secs: u64,
/// The embedded redacted capture.
pub capture: CaptureArtifact,
}
impl IndexArtifact {
pub fn from_document(
document: &IndexDocument,
canonical_url: &IndexUrl,
final_url: &IndexUrl,
context: ArtifactContext,
stored_at_unix_secs: u64,
max_age_secs: u64,
) -> Result<Self, CaptureError> {
let mut capture = capture_document(document)?;
let canonical = redact_sensitive_pairs(canonical_url.as_str());
let final_url = redact_sensitive_pairs(final_url.as_str());
capture.source_url = canonical.clone();
capture.reproduction_command =
format!("index capture --redact {canonical} - < local-page.html");
Ok(Self {
version: INDEX_ARTIFACT_VERSION,
canonical_url: canonical,
final_url,
context,
stored_at_unix_secs,
max_age_secs,
capture,
})
}
#[must_use]
pub fn freshness(&self, now_unix_secs: u64) -> ArtifactFreshness {
let expires_at = self.stored_at_unix_secs.saturating_add(self.max_age_secs);
if now_unix_secs <= expires_at {
ArtifactFreshness::Fresh
} else {
ArtifactFreshness::Stale
}
}
#[must_use]
pub fn is_fresh(&self, now_unix_secs: u64) -> bool {
self.freshness(now_unix_secs) == ArtifactFreshness::Fresh
}
#[must_use]
pub fn to_text(&self) -> String {
format!(
"{INDEX_ARTIFACT_HEADER}\nversion: {}\ncontext: {}\ncanonical_url: {}\nfinal_url: {}\nstored_at_unix_secs: {}\nmax_age_secs: {}\n{ARTIFACT_CAPTURE_BEGIN}\n{}\
\n{ARTIFACT_CAPTURE_END}\n",
self.version,
self.context.as_str(),
self.canonical_url,
self.final_url,
self.stored_at_unix_secs,
self.max_age_secs,
self.capture.to_text().trim_end()
)
}
pub fn from_text(input: &str) -> Result<Self, ArtifactStoreError> {
let mut lines = input.lines();
if lines.next() != Some(INDEX_ARTIFACT_HEADER) {
return Err(ArtifactStoreError::Parse(
"missing artifact header".to_owned(),
));
}
let version = parse_artifact_u8_line(lines.next(), "version: ")?;
if version != INDEX_ARTIFACT_VERSION {
return Err(ArtifactStoreError::Parse(format!(
"unsupported artifact version: {version}"
)));
}
let context = ArtifactContext::parse(
&parse_prefixed_line(lines.next(), "context: ")
.map_err(|error| ArtifactStoreError::Parse(error.to_string()))?,
)
.map_err(ArtifactStoreError::Parse)?;
let canonical_url = parse_prefixed_line(lines.next(), "canonical_url: ")
.map_err(|error| ArtifactStoreError::Parse(error.to_string()))?;
let final_url = parse_prefixed_line(lines.next(), "final_url: ")
.map_err(|error| ArtifactStoreError::Parse(error.to_string()))?;
let stored_at_unix_secs = parse_artifact_u64_line(lines.next(), "stored_at_unix_secs: ")?;
let max_age_secs = parse_artifact_u64_line(lines.next(), "max_age_secs: ")?;
if lines.next() != Some(ARTIFACT_CAPTURE_BEGIN) {
return Err(ArtifactStoreError::Parse(
"missing capture section".to_owned(),
));
}
let mut capture_lines = Vec::new();
for line in &mut lines {
if line == ARTIFACT_CAPTURE_END {
let capture_text = capture_lines.join("\n");
let capture =
validate_capture_bundle(&capture_text).map_err(ArtifactStoreError::Capture)?;
IndexUrl::parse(canonical_url.replace(REDACTED, "redacted"))
.map_err(ArtifactStoreError::Url)?;
IndexUrl::parse(final_url.replace(REDACTED, "redacted"))
.map_err(ArtifactStoreError::Url)?;
return Ok(Self {
version,
canonical_url,
final_url,
context,
stored_at_unix_secs,
max_age_secs,
capture,
});
}
capture_lines.push(line.to_owned());
}
Err(ArtifactStoreError::Parse(
"unterminated capture section".to_owned(),
))
}
}
/// Directory-backed store holding one text file per (canonical URL, context) pair.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ArtifactStore {
/// Directory that artifact files are written to and read from.
root: PathBuf,
}
impl ArtifactStore {
#[must_use]
pub fn new(root: impl Into<PathBuf>) -> Self {
Self { root: root.into() }
}
#[must_use]
pub fn root(&self) -> &Path {
&self.root
}
#[must_use]
pub fn path_for(&self, canonical_url: &IndexUrl, context: ArtifactContext) -> PathBuf {
self.root.join(format!(
"{}.{}.idx",
canonical_url.cache_key(),
context.as_str()
))
}
pub fn store(&self, artifact: &IndexArtifact) -> Result<PathBuf, ArtifactStoreError> {
fs::create_dir_all(&self.root)?;
let canonical_url = IndexUrl::parse(artifact.canonical_url.replace(REDACTED, "redacted"))
.map_err(ArtifactStoreError::Url)?;
let path = self.path_for(&canonical_url, artifact.context);
fs::write(&path, artifact.to_text())?;
Ok(path)
}
pub fn load(
&self,
canonical_url: &IndexUrl,
context: ArtifactContext,
) -> Result<Option<IndexArtifact>, ArtifactStoreError> {
let path = self.path_for(canonical_url, context);
if !path.exists() {
return Ok(None);
}
let contents = fs::read_to_string(path)?;
let artifact = IndexArtifact::from_text(&contents)?;
Ok(Some(artifact))
}
}
/// Per-section counts of redaction placeholders introduced by a capture.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct RedactionSummary {
    pub source_url_values: usize,
    pub html_values: usize,
    pub diagnostic_values: usize,
}

impl RedactionSummary {
    /// Sum of the three per-section counts.
    #[must_use]
    pub const fn total(&self) -> usize {
        let url_and_html = self.source_url_values + self.html_values;
        url_and_html + self.diagnostic_values
    }

    /// Renders the summary as a `redaction-summary-v1` text block (no trailing newline).
    #[must_use]
    pub fn to_text(&self) -> String {
        let Self {
            source_url_values,
            html_values,
            diagnostic_values,
        } = self;
        let total = self.total();
        format!(
            "redaction-summary-v1\nsource_url_values: {}\nhtml_values: {}\ndiagnostic_values: {}\ntotal: {}",
            source_url_values, html_values, diagnostic_values, total
        )
    }
}
/// A redacted capture bundled with its redaction summary and submission checklist.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapturePreview {
/// The redacted capture.
pub artifact: CaptureArtifact,
/// Counts of redactions per section.
pub summary: RedactionSummary,
/// Checklist text generated by `CaptureArtifact::submission_checklist`.
pub checklist: String,
}
/// A capture preview combined with repair hints and a coverage-catalog row.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CaptureReviewBundle {
/// The capture preview.
pub preview: CapturePreview,
/// Free-form repair hint text; may span several lines.
pub repair_hints: String,
/// Catalog table row; serialized on a single `catalog_entry: ` line.
pub catalog_entry: String,
}
impl CapturePreview {
    /// Renders the preview: header line, summary, checklist, and artifact, with a blank line
    /// between the last three.
    #[must_use]
    pub fn to_text(&self) -> String {
        let summary = self.summary.to_text();
        let artifact = self.artifact.to_text();
        format!(
            "index-capture-preview-v1\n{}\n\n{}\n\n{}",
            summary, self.checklist, artifact
        )
    }
}
impl CaptureReviewBundle {
    /// Serializes the bundle: header, delimited preview, delimited repair hints, catalog row.
    #[must_use]
    pub fn to_text(&self) -> String {
        let preview = self.preview.to_text();
        format!(
            "index-capture-review-bundle-v1\n{PREVIEW_BEGIN}\n{}\n{PREVIEW_END}\n{REPAIR_BEGIN}\n{}\n{REPAIR_END}\ncatalog_entry: {}\n",
            preview, self.repair_hints, self.catalog_entry
        )
    }

    /// Parses text produced by [`CaptureReviewBundle::to_text`].
    ///
    /// The embedded preview is re-validated by `parse_preview_from_text`.
    ///
    /// # Errors
    /// Returns [`CaptureError::InvalidArtifact`] for a missing header, section marker, or
    /// catalog line, and propagates any error from parsing the embedded preview.
    pub fn from_text(input: &str) -> Result<Self, CaptureError> {
        let invalid = |reason: &str| CaptureError::InvalidArtifact(reason.to_owned());
        let mut lines = input.lines();

        if lines.next() != Some("index-capture-review-bundle-v1") {
            return Err(invalid("missing review bundle header"));
        }
        if lines.next() != Some(PREVIEW_BEGIN) {
            return Err(invalid("missing capture preview section"));
        }
        // `take_while` also consumes the end marker, leaving the iterator on the next section.
        let preview_text = lines
            .by_ref()
            .take_while(|line| *line != PREVIEW_END)
            .collect::<Vec<_>>()
            .join("\n");

        if lines.next() != Some(REPAIR_BEGIN) {
            return Err(invalid("missing repair hint section"));
        }
        let repair_hints = lines
            .by_ref()
            .take_while(|line| *line != REPAIR_END)
            .collect::<Vec<_>>()
            .join("\n");

        let catalog_entry = parse_prefixed_line(lines.next(), "catalog_entry: ")?;
        let preview = parse_preview_from_text(&preview_text)?;
        Ok(Self {
            preview,
            repair_hints,
            catalog_entry,
        })
    }
}
impl CaptureArtifact {
#[must_use]
pub fn to_text(&self) -> String {
let diagnostic = self.diagnostic.as_deref().unwrap_or("none");
format!(
"{ARTIFACT_HEADER}\nsource_url: {}\nreproduce: {}\n{HTML_BEGIN}\n{}\n{HTML_END}\n{DIAGNOSTIC_BEGIN}\n{}\n{DIAGNOSTIC_END}\n",
self.source_url, self.reproduction_command, self.redacted_html, diagnostic
)
}
#[must_use]
pub fn submission_checklist(&self) -> String {
format!(
"fixture-submission-checklist-v1\nsource_url: {}\nreproduce: {}\n[ ] confirm the URL is public or rewritten to a public equivalent\n[ ] confirm no cookies, credentials, account identifiers, private messages, or private URLs remain\n[ ] reduce HTML to the smallest shape that reproduces the behavior\n[ ] classify intent and support tier\n[ ] add or update regression tests\n[ ] record the fixture in docs/COVERAGE_CATALOG.md",
self.source_url, self.reproduction_command
)
}
pub fn validate_bundle(&self) -> Result<(), CaptureError> {
let parseable_source_url = self.source_url.replace(REDACTED, "redacted");
IndexUrl::parse(&parseable_source_url).map_err(CaptureError::InvalidSourceUrl)?;
if self.redacted_html.trim().is_empty() {
return Err(CaptureError::InvalidArtifact(
"redacted HTML section is empty".to_owned(),
));
}
if contains_unredacted_sensitive_pair(&self.source_url)
|| contains_unredacted_sensitive_pair(&self.redacted_html)
|| self
.diagnostic
.as_deref()
.is_some_and(contains_unredacted_sensitive_pair)
{
return Err(CaptureError::InvalidArtifact(
"artifact contains unredacted credential-shaped content".to_owned(),
));
}
Ok(())
}
pub fn from_text(input: &str) -> Result<Self, CaptureError> {
let mut lines = input.lines();
if lines.next() != Some(ARTIFACT_HEADER) {
return Err(CaptureError::InvalidArtifact("missing header".to_owned()));
}
let source_url = parse_prefixed_line(lines.next(), "source_url: ")?;
let reproduction_command = parse_prefixed_line(lines.next(), "reproduce: ")?;
if lines.next() != Some(HTML_BEGIN) {
return Err(CaptureError::InvalidArtifact(
"missing redacted HTML section".to_owned(),
));
}
let mut redacted_html = Vec::new();
for line in &mut lines {
if line == HTML_END {
break;
}
redacted_html.push(line.to_owned());
}
if lines.next() != Some(DIAGNOSTIC_BEGIN) {
return Err(CaptureError::InvalidArtifact(
"missing diagnostic section".to_owned(),
));
}
let mut diagnostic = Vec::new();
for line in &mut lines {
if line == DIAGNOSTIC_END {
let diagnostic = diagnostic.join("\n");
let diagnostic = if diagnostic == "none" {
None
} else {
Some(diagnostic)
};
return Ok(Self {
source_url,
redacted_html: redacted_html.join("\n"),
diagnostic,
reproduction_command,
});
}
diagnostic.push(line.to_owned());
}
Err(CaptureError::InvalidArtifact(
"unterminated diagnostic section".to_owned(),
))
}
}
/// Produces a redacted artifact from `request`.
///
/// Secret values are first collected from the HTML, the diagnostic, and the source URL (in that
/// order) into one `Redactor`; each section is then scrubbed with that redactor and with
/// `redact_sensitive_pairs`.
pub fn capture_redacted(request: &CaptureRequest) -> CaptureArtifact {
    let raw_url = request.source_url.as_str();
    let raw_diagnostic = request.diagnostic.as_deref();

    let mut redactor = Redactor::new();
    add_html_secret_values(&request.html, &mut redactor);
    if let Some(text) = raw_diagnostic {
        add_query_secret_values(text, &mut redactor);
    }
    add_query_secret_values(raw_url, &mut redactor);

    let scrub = |text: &str| redact_sensitive_pairs(&redactor.redact(text));
    let source_url = scrub(raw_url);
    let redacted_html = redact_html(&request.html, &redactor);
    let diagnostic = raw_diagnostic.map(|text| scrub(text));
    let reproduction_command = format!("index capture --redact {source_url} - < local-page.html");

    CaptureArtifact {
        source_url,
        redacted_html,
        diagnostic,
        reproduction_command,
    }
}
/// Captures `request` and attaches a redaction summary and the submission checklist.
pub fn preview_redacted(request: &CaptureRequest) -> CapturePreview {
    let artifact = capture_redacted(request);
    CapturePreview {
        summary: summarize_redactions(request, &artifact),
        checklist: artifact.submission_checklist(),
        artifact,
    }
}
/// Builds a review bundle for `request`, with a catalog row for `fixture_path` recorded as
/// intent `unknown`, tier 0.
///
/// # Errors
/// Propagates the error from `catalog_entry_for_fixture` (the fixture path must exist).
pub fn capture_review_bundle(
    request: &CaptureRequest,
    fixture_path: &str,
    repair_hints: impl Into<String>,
) -> Result<CaptureReviewBundle, CaptureError> {
    let preview = preview_redacted(request);
    catalog_entry_for_fixture(fixture_path, "unknown", 0).map(|catalog_entry| {
        CaptureReviewBundle {
            preview,
            repair_hints: repair_hints.into(),
            catalog_entry,
        }
    })
}
/// Formats the coverage-catalog table row for a fixture file.
///
/// # Errors
/// Returns [`CaptureError::InvalidArtifact`] when `fixture_path` does not exist on disk.
pub fn catalog_entry_for_fixture(
    fixture_path: &str,
    intent: &str,
    tier: u8,
) -> Result<String, CaptureError> {
    if Path::new(fixture_path).exists() {
        Ok(format!(
            "| `{fixture_path}` | {intent} | Tier {tier} | capture | review private data before submission |"
        ))
    } else {
        Err(CaptureError::InvalidArtifact(format!(
            "fixture path does not exist: {fixture_path}"
        )))
    }
}
/// Renders `document` to HTML and captures it as a redacted artifact.
///
/// # Errors
/// Propagates any error from building the capture request.
pub fn capture_document(document: &IndexDocument) -> Result<CaptureArtifact, CaptureError> {
    let request = document_capture_request(document)?;
    Ok(capture_redacted(&request))
}
/// Renders `document` to HTML and builds a redacted capture preview from it.
///
/// # Errors
/// Propagates any error from building the capture request.
pub fn preview_document(document: &IndexDocument) -> Result<CapturePreview, CaptureError> {
    let request = document_capture_request(document)?;
    Ok(preview_redacted(&request))
}
/// Builds a capture request from an in-memory document.
///
/// The document's canonical URL is used when present and parseable; otherwise a fixed local
/// placeholder URL is substituted. The diagnostic records the document title.
fn document_capture_request(document: &IndexDocument) -> Result<CaptureRequest, CaptureError> {
    const FALLBACK_URL: &str = "https://index.local/current";

    let source_url = document
        .metadata
        .canonical_url
        .as_deref()
        .filter(|candidate| IndexUrl::parse(*candidate).is_ok())
        .unwrap_or(FALLBACK_URL);
    let request = CaptureRequest::new(source_url, document_to_html(document))?;
    Ok(request.with_diagnostic(format!(
        "captured from current Index document: title={}",
        document.title
    )))
}
/// Parses serialized capture text and returns the artifact only if it passes `validate_bundle`.
///
/// # Errors
/// Propagates parse errors from `CaptureArtifact::from_text` and validation errors from
/// `CaptureArtifact::validate_bundle`.
pub fn validate_capture_bundle(input: &str) -> Result<CaptureArtifact, CaptureError> {
    CaptureArtifact::from_text(input)
        .and_then(|artifact| artifact.validate_bundle().map(|()| artifact))
}
/// Rebuilds a `CapturePreview` from its serialized text.
///
/// Only the embedded artifact is read back: it is located by the first occurrence of the
/// artifact header and validated. The checklist is regenerated from the artifact and the
/// summary is reset to its default (all zero) value.
fn parse_preview_from_text(input: &str) -> Result<CapturePreview, CaptureError> {
    let has_header = input.starts_with("index-capture-preview-v1\n");
    if !has_header {
        return Err(CaptureError::InvalidArtifact(
            "missing capture preview header".to_owned(),
        ));
    }
    let Some(artifact_start) = input.find(ARTIFACT_HEADER) else {
        return Err(CaptureError::InvalidArtifact(
            "missing embedded artifact".to_owned(),
        ));
    };
    let artifact = validate_capture_bundle(&input[artifact_start..])?;
    Ok(CapturePreview {
        checklist: artifact.submission_checklist(),
        summary: RedactionSummary::default(),
        artifact,
    })
}
/// Renders `document` as a minimal standalone HTML page: escaped title in `<head>`, then every
/// top-level node inside `<main>`.
fn document_to_html(document: &IndexDocument) -> String {
    let mut html = String::new();
    html += "<!doctype html><html><head><meta charset=\"utf-8\"><title>";
    html += &escape_html(&document.title);
    html += "</title></head><body><main>";
    document
        .nodes
        .iter()
        .for_each(|node| push_node_html(node, &mut html));
    html += "</main></body></html>";
    html
}
fn push_node_html(node: &IndexNode, output: &mut String) {
match node {
IndexNode::Heading { level, text } => {
let level = (*level).clamp(1, 6);
output.push_str(&format!("<h{level}>"));
output.push_str(&escape_html(text));
output.push_str(&format!("</h{level}>"));
}
IndexNode::Paragraph(text) => {
output.push_str("<p>");
output.push_str(&escape_html(text));
output.push_str("</p>");
}
IndexNode::Link(link) => {
output.push_str("<p><a href=\"");
output.push_str(&escape_html(&link.href));
output.push_str("\">");
output.push_str(&escape_html(&link.text));
output.push_str("</a></p>");
}
IndexNode::List { ordered, items } => {
let tag = if *ordered { "ol" } else { "ul" };
output.push_str(&format!("<{tag}>"));
for item in items {
output.push_str("<li>");
output.push_str(&escape_html(item));
output.push_str("</li>");
}
output.push_str(&format!("</{tag}>"));
}
IndexNode::CodeBlock { language, code } => {
output.push_str("<pre><code");
if let Some(language) = language {
output.push_str(" class=\"language-");
output.push_str(&escape_html(language));
output.push('"');
}
output.push('>');
output.push_str(&escape_html(code));
output.push_str("</code></pre>");
}
IndexNode::Table { rows } => {
output.push_str("<table>");
for row in rows {
output.push_str("<tr>");
for cell in row {
output.push_str("<td>");
output.push_str(&escape_html(cell));
output.push_str("</td>");
}
output.push_str("</tr>");
}
output.push_str("</table>");
}
IndexNode::Spacer { .. } => {}
IndexNode::Section { title, nodes, .. } => {
output.push_str("<section>");
if let Some(title) = title {
output.push_str("<h2>");
output.push_str(&escape_html(title));
output.push_str("</h2>");
}
for node in nodes {
push_node_html(node, output);
}
output.push_str("</section>");
}
IndexNode::Image { alt, src } => {
output.push_str("<img alt=\"");
output.push_str(&escape_html(alt));
output.push('"');
if let Some(src) = src {
output.push_str(" src=\"");
output.push_str(&escape_html(src));
output.push('"');
}
output.push('>');
}
IndexNode::Form(form) => {
output.push_str("<form action=\"");
output.push_str(&escape_html(&form.action));
output.push_str("\" method=\"");
output.push_str(form.method.as_str());
output.push_str("\"><p>");
output.push_str(&escape_html(&form.name));
output.push_str("</p></form>");
}
IndexNode::Error(error) => {
output.push_str("<p data-index-error=\"true\">");
output.push_str(&escape_html(error));
output.push_str("</p>");
}
}
}
/// Escapes the characters that are significant in HTML text and double-quoted attributes.
///
/// `&` is replaced first so that the ampersands introduced by the later replacements are not
/// escaped a second time.
fn escape_html(input: &str) -> String {
    input
        .replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
}
/// Counts, per section, how many redaction placeholders the capture added relative to the
/// request. The diagnostic count is zero unless both sides have a diagnostic.
fn summarize_redactions(request: &CaptureRequest, artifact: &CaptureArtifact) -> RedactionSummary {
    let source_url_values = redaction_delta(request.source_url.as_str(), &artifact.source_url);
    let html_values = redaction_delta(&request.html, &artifact.redacted_html);
    let diagnostic_values = match (
        request.diagnostic.as_deref(),
        artifact.diagnostic.as_deref(),
    ) {
        (Some(before), Some(after)) => redaction_delta(before, after),
        _ => 0,
    };
    RedactionSummary {
        source_url_values,
        html_values,
        diagnostic_values,
    }
}
fn redaction_delta(before: &str, after: &str) -> usize {
let before_count = before.matches(REDACTED).count();
let after_count = after.matches(REDACTED).count();
after_count.saturating_sub(before_count)
}
/// Returns the remainder of `line` after `prefix`.
///
/// # Errors
/// `InvalidArtifact("missing <prefix> line")` when there is no line, and
/// `InvalidArtifact("invalid <prefix> line")` when the line does not start with `prefix`.
fn parse_prefixed_line(line: Option<&str>, prefix: &str) -> Result<String, CaptureError> {
    match line.map(|text| text.strip_prefix(prefix)) {
        Some(Some(value)) => Ok(value.to_owned()),
        Some(None) => Err(CaptureError::InvalidArtifact(format!(
            "invalid {prefix} line"
        ))),
        None => Err(CaptureError::InvalidArtifact(format!(
            "missing {prefix} line"
        ))),
    }
}
/// Parses a `<prefix><u64>` header line.
///
/// # Errors
/// Returns `Parse` when the line is absent, lacks the prefix, or the value is not a `u64`.
fn parse_artifact_u64_line(line: Option<&str>, prefix: &str) -> Result<u64, ArtifactStoreError> {
    let Some(line) = line else {
        return Err(ArtifactStoreError::Parse(format!("missing {prefix} line")));
    };
    match line.strip_prefix(prefix) {
        None => Err(ArtifactStoreError::Parse(format!("invalid {prefix} line"))),
        Some(value) => value.parse::<u64>().map_err(|error| {
            ArtifactStoreError::Parse(format!("failed to parse {prefix} value as u64: {error}"))
        }),
    }
}
/// Parses a `<prefix><u8>` header line.
///
/// # Errors
/// Returns `Parse` when the line is absent, lacks the prefix, or the value is not a `u8`.
fn parse_artifact_u8_line(line: Option<&str>, prefix: &str) -> Result<u8, ArtifactStoreError> {
    let Some(line) = line else {
        return Err(ArtifactStoreError::Parse(format!("missing {prefix} line")));
    };
    match line.strip_prefix(prefix) {
        None => Err(ArtifactStoreError::Parse(format!("invalid {prefix} line"))),
        Some(value) => value.parse::<u8>().map_err(|error| {
            ArtifactStoreError::Parse(format!("failed to parse {prefix} value as u8: {error}"))
        }),
    }
}
/// Redacts HTML in three passes: known secret values, sensitive `key=value` pairs, then
/// quoted `value="…"` attributes on sensitive-looking tags.
fn redact_html(input: &str, redactor: &Redactor) -> String {
    let without_secrets = redactor.redact(input);
    let without_pairs = redact_sensitive_pairs(&without_secrets);
    redact_sensitive_attributes(&without_pairs)
}
/// Registers secret values found in `input` with `redactor`.
///
/// Besides the `key=value` pairs handled by `add_query_secret_values`, this scans for quoted
/// `name="…"` attributes whose name is a sensitive key and registers the next quoted
/// `value="…"` that follows.
fn add_html_secret_values(input: &str, redactor: &mut Redactor) {
    add_query_secret_values(input, redactor);

    let mut cursor = 0;
    while let Some(offset) = find_ascii_case_insensitive(&input[cursor..], "name=") {
        let name_at = cursor + offset + "name=".len();
        cursor = match read_quoted_value(input, name_at) {
            // Unquoted or unterminated name: resume right after `name=`.
            None => name_at,
            Some((name, after_name)) if !is_sensitive_key(&name) => after_name,
            Some((_, after_name)) => {
                let quoted_value = find_ascii_case_insensitive(&input[after_name..], "value=")
                    .and_then(|value_offset| {
                        read_quoted_value(input, after_name + value_offset + "value=".len())
                    });
                match quoted_value {
                    Some((value, after_value)) => {
                        redactor.add_secret(value);
                        after_value
                    }
                    None => after_name,
                }
            }
        };
    }
}
/// Registers with `redactor` the value of every sensitive `key=value` pair in `input`.
///
/// The separator may be a literal `=` or its percent-encoded forms `%3D` / `%3d`. The key is
/// the run of key characters immediately before the separator; the value runs up to the next
/// value delimiter or the end of the input. Empty values are not registered.
fn add_query_secret_values(input: &str, redactor: &mut Redactor) {
    for marker in ["=", "%3D", "%3d"] {
        let mut search_start = 0;
        while let Some(relative_position) = input[search_start..].find(marker) {
            let marker_position = search_start + relative_position;
            let value_start = marker_position + marker.len();
            // Step past the whole preceding non-key character. Adding a fixed 1 would land
            // inside a multi-byte character and make the slice below panic.
            let key_start = input[..marker_position]
                .char_indices()
                .rev()
                .find(|&(_, ch)| !is_key_char(ch))
                .map_or(0, |(position, ch)| position + ch.len_utf8());
            if !is_sensitive_key(&input[key_start..marker_position]) {
                search_start = value_start;
                continue;
            }
            let value_end = input[value_start..]
                .find(is_value_delimiter)
                .map_or(input.len(), |position| value_start + position);
            if value_end > value_start {
                redactor.add_secret(&input[value_start..value_end]);
            }
            search_start = value_end;
        }
    }
}
/// Replaces the value of every sensitive `key=value` pair in `input` with the redaction
/// placeholder and returns the result.
///
/// The key is the run of key characters immediately before `=`; the value runs up to the next
/// value delimiter or the end of the input. Text outside sensitive values is copied unchanged.
fn redact_sensitive_pairs(input: &str) -> String {
    let mut output = String::with_capacity(input.len());
    let mut index = 0;
    while index < input.len() {
        let Some(eq_relative) = input[index..].find('=') else {
            output.push_str(&input[index..]);
            break;
        };
        let eq_position = index + eq_relative;
        let value_start = eq_position + 1;
        // Step past the whole preceding non-key character. Adding a fixed 1 would land inside
        // a multi-byte character and make the slice below panic.
        let key_start = input[..eq_position]
            .char_indices()
            .rev()
            .find(|&(_, ch)| !is_key_char(ch))
            .map_or(0, |(position, ch)| position + ch.len_utf8());
        // Everything up to and including `=` is copied through in both cases.
        output.push_str(&input[index..value_start]);
        if !is_sensitive_key(&input[key_start..eq_position]) {
            index = value_start;
            continue;
        }
        let value_end = input[value_start..]
            .find(is_value_delimiter)
            .map_or(input.len(), |position| value_start + position);
        output.push_str(REDACTED);
        index = value_end;
    }
    output
}
/// Redacts quoted `value=…` attributes that sit on a sensitive-looking tag.
///
/// For each case-insensitive `value=` followed by a quoted string, the text between the
/// nearest preceding `<` and the attribute is inspected; if it mentions `password`, `token`,
/// `secret`, `cookie`, or `session` (case-insensitively) the quoted value is replaced by the
/// redaction placeholder. All other text, including the attribute name's original casing, is
/// copied through unchanged.
fn redact_sensitive_attributes(input: &str) -> String {
    const SENSITIVE_MARKERS: [&str; 5] = ["password", "token", "secret", "cookie", "session"];

    let mut output = String::with_capacity(input.len());
    let mut index = 0;
    while index < input.len() {
        let Some(relative_value) = find_ascii_case_insensitive(&input[index..], "value=") else {
            output.push_str(&input[index..]);
            break;
        };
        let absolute_value = index + relative_value;
        let value_start = absolute_value + "value=".len();
        // Copy through the attribute name verbatim; emitting a literal lowercase `value=` here
        // would silently rewrite `VALUE=` / `Value=` in markup that is not sensitive.
        output.push_str(&input[index..value_start]);
        let Some((value, after_value, quote)) = read_quoted_value_with_quote(input, value_start)
        else {
            // Not a quoted value: leave whatever follows to the next iteration.
            index = value_start;
            continue;
        };
        let nearby_start = input[..absolute_value].rfind('<').unwrap_or(index);
        let nearby = &input[nearby_start..absolute_value];
        let sensitive = SENSITIVE_MARKERS
            .iter()
            .any(|marker| find_ascii_case_insensitive(nearby, marker).is_some());
        output.push(quote);
        output.push_str(if sensitive { REDACTED } else { value.as_str() });
        output.push(quote);
        index = after_value;
    }
    output
}
/// Reads a quoted string starting at byte offset `start`; returns its contents and the offset
/// just past the closing quote, or `None` when no complete quoted string starts there.
fn read_quoted_value(input: &str, start: usize) -> Option<(String, usize)> {
    let (value, after, _quote) = read_quoted_value_with_quote(input, start)?;
    Some((value, after))
}
/// Reads a single- or double-quoted string starting at byte offset `start`.
///
/// Returns the contents, the byte offset just past the closing quote, and the quote character.
/// Returns `None` when the character at `start` is not a quote or the closing quote is missing.
fn read_quoted_value_with_quote(input: &str, start: usize) -> Option<(String, usize, char)> {
    let mut rest = input[start..].chars();
    let quote = rest.next().filter(|ch| matches!(ch, '"' | '\''))?;
    let body = rest.as_str();
    let value_len = body.find(quote)?;
    let value_start = start + quote.len_utf8();
    let after = value_start + value_len + quote.len_utf8();
    Some((body[..value_len].to_owned(), after, quote))
}
/// Returns the byte offset of the first occurrence of `needle` in `haystack`, comparing bytes
/// with ASCII case folding. An empty needle never matches.
fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let (hay, pattern) = (haystack.as_bytes(), needle.as_bytes());
    if pattern.is_empty() || hay.len() < pattern.len() {
        return None;
    }
    let last_start = hay.len() - pattern.len();
    (0..=last_start).find(|&start| hay[start..start + pattern.len()].eq_ignore_ascii_case(pattern))
}
/// Reports whether `key` names a credential-like field.
///
/// Leading and trailing characters other than ASCII alphanumerics, `_`, and `-` are ignored,
/// and the comparison is ASCII case-insensitive against a fixed list of exact key names.
fn is_sensitive_key(key: &str) -> bool {
    const SENSITIVE_KEYS: &[&str] = &[
        "authorization",
        "auth",
        "api_key",
        "api-key",
        "cookie",
        "csrf",
        "csrf_token",
        "key",
        "password",
        "passwd",
        "secret",
        "session",
        "sessionid",
        "sid",
        "token",
        "access_token",
        "refresh_token",
    ];
    let normalized = key
        .trim_matches(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_' || ch == '-'))
        .to_ascii_lowercase();
    SENSITIVE_KEYS.contains(&normalized.as_str())
}
/// Reports whether `ch` may appear in a key: an ASCII letter or digit, `_`, or `-`.
fn is_key_char(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-')
}
/// Reports whether `ch` ends a `key=value` value: `&`, a quote, an angle bracket, ASCII
/// whitespace (space, tab, CR, LF), or `;`.
fn is_value_delimiter(ch: char) -> bool {
    "&\"'<> \t\r\n;".contains(ch)
}
/// Reports whether `input` still contains a sensitive `key=value` pair whose value is neither
/// empty nor the redaction placeholder.
///
/// Keys and values are delimited exactly as in `redact_sensitive_pairs`. Every `=` is
/// examined, including those inside the value of a non-sensitive key, so a nested pair such as
/// `redirect=/x?token=abc` is detected rather than skipped along with the outer value.
fn contains_unredacted_sensitive_pair(input: &str) -> bool {
    let mut index = 0;
    while let Some(eq_relative) = input[index..].find('=') {
        let eq_position = index + eq_relative;
        let value_start = eq_position + 1;
        // Step past the whole preceding non-key character. Adding a fixed 1 would land inside
        // a multi-byte character and make the slice below panic.
        let key_start = input[..eq_position]
            .char_indices()
            .rev()
            .find(|&(_, ch)| !is_key_char(ch))
            .map_or(0, |(position, ch)| position + ch.len_utf8());
        if is_sensitive_key(&input[key_start..eq_position]) {
            let value_end = input[value_start..]
                .find(is_value_delimiter)
                .map_or(input.len(), |position| value_start + position);
            let value = &input[value_start..value_end];
            if !value.is_empty() && value != REDACTED {
                return true;
            }
        }
        // Resume right after this `=` so pairs nested inside the value are still checked.
        index = value_start;
    }
    false
}
#[cfg(test)]
mod tests {
use std::fs;
use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};
use index_core::{Form, IndexDocument, IndexNode, IndexUrl, Input, Link, SectionRole};
use super::{
ArtifactContext, ArtifactFreshness, ArtifactStore, CaptureArtifact, CaptureError,
CaptureRequest, CaptureReviewBundle, IndexArtifact, capture_document, capture_redacted,
capture_review_bundle, catalog_entry_for_fixture, preview_document, preview_redacted,
validate_capture_bundle,
};
fn temp_artifact_dir(label: &str) -> PathBuf {
let nanos = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_or(0, |duration| duration.as_nanos());
std::env::temp_dir().join(format!("index-artifacts-{label}-{nanos}"))
}
#[test]
fn capture_redacts_credentials_cookies_and_private_fields()
-> Result<(), Box<dyn std::error::Error>> {
let request = CaptureRequest::new(
"https://example.org/private?token=url-secret&topic=docs",
r#"<html>
<a href="/search?password=link-secret&q=docs">Search</a>
<form action="/login?session=form-secret">
<input name="password" value="field-secret">
<input name="q" value="public">
</form>
<p>Cookie: sid=cookie-secret Authorization: Bearer bearer-secret</p>
</html>"#,
)?;
let artifact = capture_redacted(&request);
let text = artifact.to_text();
for secret in [
"url-secret",
"link-secret",
"form-secret",
"field-secret",
"cookie-secret",
"bearer-secret",
] {
assert!(!text.contains(secret), "leaked {secret}");
}
assert!(text.contains("[REDACTED]"));
assert!(text.contains("topic=docs"));
assert!(text.contains("value=\"public\""));
Ok(())
}
#[test]
fn capture_artifact_roundtrips_deterministically() -> Result<(), Box<dyn std::error::Error>> {
let request = CaptureRequest::new("https://example.org/docs", "<main>Docs</main>")?
.with_diagnostic("token=diagnostic-secret path=/tmp/index");
let artifact = capture_redacted(&request);
let text = artifact.to_text();
let parsed = CaptureArtifact::from_text(&text)?;
assert_eq!(parsed, artifact);
assert!(!text.contains("diagnostic-secret"));
assert!(text.contains("path=/tmp/index"));
Ok(())
}
#[test]
fn capture_preview_reports_summary_and_checklist() -> Result<(), Box<dyn std::error::Error>> {
let request = CaptureRequest::new(
"https://example.org/page?token=url-secret",
r#"<input name="password" value="field-secret"><p>public</p>"#,
)?
.with_diagnostic("session=diagnostic-secret");
let preview = preview_redacted(&request);
let text = preview.to_text();
assert!(text.contains("index-capture-preview-v1"));
assert!(text.contains("redaction-summary-v1"));
assert!(text.contains("fixture-submission-checklist-v1"));
assert!(preview.summary.total() >= 3);
assert!(!text.contains("url-secret"));
assert!(!text.contains("field-secret"));
assert!(!text.contains("diagnostic-secret"));
Ok(())
}
#[test]
fn capture_review_bundle_roundtrips_with_repair_hints() -> Result<(), Box<dyn std::error::Error>>
{
let request = CaptureRequest::new(
"https://example.org/private?token=url-secret",
"<main><input name=\"password\" value=\"field-secret\"></main>",
)?;
let fixture_path = "../../examples/sample.html";
let bundle = capture_review_bundle(&request, fixture_path, "index-repair-v1\nmain next")?;
let text = bundle.to_text();
assert!(text.contains("index-capture-review-bundle-v1"));
assert!(text.contains("index-repair-v1"));
assert!(!text.contains("url-secret"));
assert!(!text.contains("field-secret"));
let parsed = CaptureReviewBundle::from_text(&text)?;
assert_eq!(parsed.repair_hints, "index-repair-v1\nmain next");
assert!(parsed.catalog_entry.contains("examples/sample.html"));
Ok(())
}
#[test]
fn catalog_entry_helper_rejects_missing_fixture_paths() {
let result = catalog_entry_for_fixture("missing/not-here.html", "article", 1);
assert!(
matches!(result, Err(CaptureError::InvalidArtifact(reason)) if reason.contains("does not exist"))
);
}
#[test]
fn capture_document_creates_valid_local_artifact() -> Result<(), Box<dyn std::error::Error>> {
let mut document = IndexDocument::titled("Captured");
document.metadata.canonical_url = Some("https://example.org/page?token=secret".to_owned());
document.push(IndexNode::Heading {
level: 2,
text: "Main".to_owned(),
});
document.push(IndexNode::Paragraph("public text".to_owned()));
document.push(IndexNode::Link(Link::new(
"Docs",
"https://example.org/docs",
)));
let artifact = capture_document(&document)?;
artifact.validate_bundle()?;
assert!(artifact.redacted_html.contains("<main>"));
assert!(artifact.redacted_html.contains("public text"));
assert!(!artifact.to_text().contains("secret"));
let preview = preview_document(&document)?;
assert!(preview.to_text().contains("index-capture-preview-v1"));
Ok(())
}
/// Pushes lists, code blocks, a table, a spacer, a section, images, a form
/// and an error node into one document and checks that the captured HTML
/// contains the expected fragment for each of them.
#[test]
fn capture_document_projects_structured_nodes() -> Result<(), Box<dyn std::error::Error>> {
    let search_form = Form {
        name: String::from("Search"),
        method: String::from("GET"),
        action: String::from("https://example.org/search"),
        inputs: vec![Input {
            name: String::from("q"),
            kind: String::from("text"),
            value: None,
            required: false,
        }],
        buttons: Vec::new(),
    };

    // Nodes are listed in the order they are pushed into the document.
    let nodes = vec![
        IndexNode::List {
            ordered: true,
            items: vec![String::from("one"), String::from("two")],
        },
        IndexNode::List {
            ordered: false,
            items: vec![String::from("plain")],
        },
        IndexNode::CodeBlock {
            language: Some(String::from("rust")),
            code: String::from("fn main() { println!(\"hi\"); }"),
        },
        IndexNode::CodeBlock {
            language: None,
            code: String::from("<raw>"),
        },
        IndexNode::Table {
            rows: vec![
                vec![String::from("Name"), String::from("Value")],
                vec![String::from("A"), String::from("1")],
            ],
        },
        IndexNode::Spacer { lines: 2 },
        IndexNode::Section {
            role: SectionRole::Main,
            title: Some(String::from("Body")),
            collapsed: false,
            nodes: vec![IndexNode::Paragraph(String::from("inside"))],
        },
        IndexNode::Image {
            alt: String::from("diagram"),
            src: Some(String::from("https://example.org/image.png")),
        },
        IndexNode::Image {
            alt: String::from("missing"),
            src: None,
        },
        IndexNode::Form(search_form),
        IndexNode::Error(String::from("could not parse sidebar")),
    ];

    let mut page = IndexDocument::titled("Structured <Capture>");
    for node in nodes {
        page.push(node);
    }

    let captured = capture_document(&page)?;
    let html = &captured.redacted_html;

    // Checked in the same order as the nodes above; the first fragment comes
    // from the document title.
    let expected_fragments = [
        "<Capture>",
        "<ol><li>one</li><li>two</li></ol>",
        "<ul><li>plain</li></ul>",
        "class=\"language-rust\"",
        "<raw>",
        "<table><tr><td>Name</td><td>Value</td></tr>",
        "<section><h2>Body</h2><p>inside</p></section>",
        "<img alt=\"diagram\" src=\"https://example.org/image.png\">",
        "<img alt=\"missing\">",
        "<form action=\"https://example.org/search\" method=\"GET\">",
        "data-index-error=\"true\"",
    ];
    for fragment in expected_fragments {
        assert!(
            html.contains(fragment),
            "captured HTML is missing fragment: {fragment}"
        );
    }
    Ok(())
}
/// When the document's canonical URL is a `javascript:` URL, the captured
/// artifact must report `https://index.local/current` as its source URL and
/// still include the document content.
#[test]
fn capture_document_falls_back_from_invalid_canonical_url()
-> Result<(), Box<dyn std::error::Error>> {
    let mut page = IndexDocument::titled("Fallback");
    page.metadata.canonical_url = Some(String::from("javascript:alert(1)"));
    page.push(IndexNode::Paragraph(String::from("content")));

    let captured = capture_document(&page)?;

    assert_eq!(captured.source_url, "https://index.local/current");
    assert!(captured.redacted_html.contains("content"));
    Ok(())
}
/// The text of an artifact produced by `capture_redacted` must pass
/// `validate_capture_bundle`, and the parsed source URL must carry the
/// redaction placeholder in place of the token value.
#[test]
fn capture_bundle_validation_accepts_redacted_artifact()
-> Result<(), Box<dyn std::error::Error>> {
    let request = CaptureRequest::new(
        "https://example.org/page?token=secret",
        "<main>Public</main>",
    )?;
    let artifact = capture_redacted(&request);
    let bundle_text = artifact.to_text();

    let validated = validate_capture_bundle(&bundle_text)?;

    assert_eq!(
        validated.source_url,
        "https://example.org/page?token=[REDACTED]"
    );
    Ok(())
}
/// A hand-written bundle whose `source_url` still carries `token=secret` must
/// be rejected with an `InvalidArtifact` error whose reason mentions
/// "unredacted".
#[test]
fn capture_bundle_validation_rejects_unredacted_sensitive_pairs() {
    // One literal per bundle line; `concat!` joins them into a single
    // `&'static str` at compile time.
    let bundle_text = concat!(
        "index-capture-v1\n",
        "source_url: https://example.org/page?token=secret\n",
        "reproduce: index capture --redact https://example.org/page - < local-page.html\n",
        "---BEGIN REDACTED HTML---\n",
        "<main>Public</main>\n",
        "---END REDACTED HTML---\n",
        "---BEGIN DIAGNOSTIC---\n",
        "none\n",
        "---END DIAGNOSTIC---\n",
    );

    let rejected_as_unredacted = match validate_capture_bundle(bundle_text) {
        Err(CaptureError::InvalidArtifact(reason)) => reason.contains("unredacted"),
        _ => false,
    };
    assert!(rejected_as_unredacted);
}
/// Building a capture request from a `javascript:` source URL must fail with
/// `CaptureError::InvalidSourceUrl`.
#[test]
fn capture_rejects_unsafe_source_url() {
    let unsafe_url = "javascript:alert(1)";
    let html = "<main>Bad</main>";
    assert!(matches!(
        CaptureRequest::new(unsafe_url, html),
        Err(CaptureError::InvalidSourceUrl(_))
    ));
}
/// Parsing artifact text that starts directly with a `source_url:` line must
/// fail with an `InvalidArtifact` error whose reason mentions "header".
#[test]
fn artifact_parser_rejects_missing_header() {
    let outcome = CaptureArtifact::from_text("source_url: https://example.org");
    let rejected_for_header = match outcome {
        Err(CaptureError::InvalidArtifact(reason)) => reason.contains("header"),
        _ => false,
    };
    assert!(rejected_for_header);
}
/// An `IndexArtifact` serialized with `to_text` must parse back to an equal
/// value, and the text must carry the format header, the `live-get` context
/// line and a redacted token.
#[test]
fn index_artifact_roundtrips_deterministically() -> Result<(), Box<dyn std::error::Error>> {
    let mut page = IndexDocument::titled("Artifact");
    page.push(IndexNode::Heading {
        level: 1,
        text: String::from("Title"),
    });
    page.push(IndexNode::Paragraph(String::from("Body")));

    let canonical_url = IndexUrl::parse("https://example.org/docs?token=secret")?;
    let resolved_url = IndexUrl::parse("https://example.org/docs")?;
    let original = IndexArtifact::from_document(
        &page,
        &canonical_url,
        &resolved_url,
        ArtifactContext::LiveGet,
        1234,
        300,
    )?;

    let serialized = original.to_text();
    let reparsed = IndexArtifact::from_text(&serialized)?;
    assert_eq!(reparsed, original);

    for marker in ["index-artifact-v1", "context: live-get", "token=[REDACTED]"] {
        assert!(
            serialized.contains(marker),
            "serialized artifact is missing marker: {marker}"
        );
    }
    Ok(())
}
/// An artifact built with the trailing arguments `100` and `60` must report
/// `Fresh` when queried at 120 and `Stale` when queried at 161, and
/// `is_fresh` must agree with `freshness` at both points.
#[test]
fn index_artifact_freshness_transitions_are_deterministic()
-> Result<(), Box<dyn std::error::Error>> {
    // Canonical and final URL are the same address in this scenario.
    let url = IndexUrl::parse("https://example.org/docs")?;
    let same_url = IndexUrl::parse("https://example.org/docs")?;

    let mut page = IndexDocument::titled("Freshness");
    page.push(IndexNode::Paragraph(String::from("Body")));

    // NOTE(review): 100 and 60 presumably are the capture time and the
    // time-to-live; the asserted transition between 120 and 161 is consistent
    // with that, but confirm against `IndexArtifact::from_document`.
    let artifact =
        IndexArtifact::from_document(&page, &url, &same_url, ArtifactContext::LiveGet, 100, 60)?;

    let (early, late) = (120, 161);
    assert_eq!(artifact.freshness(early), ArtifactFreshness::Fresh);
    assert_eq!(artifact.freshness(late), ArtifactFreshness::Stale);
    assert!(artifact.is_fresh(early));
    assert!(!artifact.is_fresh(late));
    Ok(())
}
/// Storing the same document for the same URL under the `LiveGet` and
/// `LiveSubmit` contexts must produce two distinct files on disk, and loading
/// by URL and context must return the artifact stored for that context.
///
/// The temporary store directory is removed on every exit path (success,
/// early `?` return, or assertion panic) through a drop guard, so a failing
/// run does not leave artifacts behind.
#[test]
fn artifact_store_keys_by_url_and_context() -> Result<(), Box<dyn std::error::Error>> {
    /// Best-effort removal of the wrapped directory when the guard is dropped.
    struct RemoveDirOnDrop<P: AsRef<std::path::Path>>(P);

    impl<P: AsRef<std::path::Path>> Drop for RemoveDirOnDrop<P> {
        fn drop(&mut self) {
            let dir: &std::path::Path = self.0.as_ref();
            if dir.exists() {
                // Cleanup failures are deliberately ignored: they must not
                // mask the actual test outcome.
                let _ = std::fs::remove_dir_all(dir);
            }
        }
    }

    // Declared first so it is dropped last, after the store is gone.
    let root = RemoveDirOnDrop(temp_artifact_dir("store"));
    let store = ArtifactStore::new(&root.0);

    let canonical = IndexUrl::parse("https://example.org/forum/thread/1")?;
    let final_url = IndexUrl::parse("https://example.org/forum/thread/1")?;
    let mut document = IndexDocument::titled("Thread");
    document.push(IndexNode::Paragraph("Payload".to_owned()));

    let get_artifact = IndexArtifact::from_document(
        &document,
        &canonical,
        &final_url,
        ArtifactContext::LiveGet,
        10,
        600,
    )?;
    let submit_artifact = IndexArtifact::from_document(
        &document,
        &canonical,
        &final_url,
        ArtifactContext::LiveSubmit,
        10,
        600,
    )?;

    // Same URL, different context: the store must not reuse the same file.
    let get_path = store.store(&get_artifact)?;
    let submit_path = store.store(&submit_artifact)?;
    assert_ne!(get_path, submit_path);
    assert!(get_path.exists());
    assert!(submit_path.exists());

    let loaded_get = store
        .load(&canonical, ArtifactContext::LiveGet)?
        .ok_or("missing live-get artifact")?;
    let loaded_submit = store
        .load(&canonical, ArtifactContext::LiveSubmit)?
        .ok_or("missing live-submit artifact")?;
    assert_eq!(loaded_get.context, ArtifactContext::LiveGet);
    assert_eq!(loaded_submit.context, ArtifactContext::LiveSubmit);
    Ok(())
}
/// Rewriting `version: 1` to `version: 9` in a serialized artifact must make
/// `IndexArtifact::from_text` return an error.
#[test]
fn artifact_schema_rejects_unsupported_versions() -> Result<(), Box<dyn std::error::Error>> {
    let mut page = IndexDocument::titled("Schema");
    page.push(IndexNode::Paragraph(String::from("Body")));

    // Canonical and final URL are the same address in this scenario.
    let url = IndexUrl::parse("https://example.org/docs")?;
    let same_url = IndexUrl::parse("https://example.org/docs")?;
    let valid =
        IndexArtifact::from_document(&page, &url, &same_url, ArtifactContext::LiveGet, 1, 60)?;

    let tampered = valid.to_text().replace("version: 1", "version: 9");

    assert!(IndexArtifact::from_text(&tampered).is_err());
    Ok(())
}
}