gobby-wiki 0.2.0

use std::io::Read;
use std::net::{IpAddr, Ipv6Addr, ToSocketAddrs};
use std::path::Path;
use std::time::Duration;

use scraper::{ElementRef, Html, Node, Selector};

use crate::WikiError;
use crate::ingest::{
    IngestResult, index_after_ingest, markdown_metadata, markdown_title, path_to_string,
    single_line, text_from_utf8_lossy, write_asset, write_raw_markdown,
};
use crate::sources::{CompileStatus, IngestionMethod, SourceDraft, SourceKind, SourceManifest};
use crate::store::WikiIndexStore;

/// Timeout configured on the HTTP agent used for URL fetches.
const URL_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
/// Byte limit applied when reading an HTTP error response body for a failure message.
const HTTP_STATUS_BODY_LIMIT_BYTES: u64 = 8 * 1024;
/// Number of redirects followed by hand before a fetch fails with `too_many_redirects`.
const MAX_REDIRECTS: usize = 10;
/// Value sent in the `User-Agent` header of every fetch request.
const USER_AGENT: &str = "gwiki/0.1";

/// A fetched URL response, captured so it can be ingested into the vault.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlSnapshot {
    /// URL exactly as the caller requested it.
    pub requested_url: String,
    /// URL that produced the response once redirects were followed.
    pub final_url: String,
    /// Caller-supplied fetch timestamp; copied verbatim into the source draft and metadata.
    pub fetched_at: String,
    /// Raw response body bytes.
    pub body: Vec<u8>,
    /// Value of the response `content-type` header, when one was present.
    pub content_type: Option<String>,
}

/// One URL from a batch that was fetched and ingested successfully.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AcceptedUrlIngest {
    /// URL as requested, taken from the snapshot.
    pub requested_url: String,
    /// URL that served the ingested content, taken from the snapshot.
    pub final_url: String,
    /// Outcome of ingesting the snapshot (manifest record and written paths).
    pub result: IngestResult,
}

/// Describes why a single URL could not be fetched or ingested.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlIngestFailure {
    /// URL the failure is attributed to.
    pub url: String,
    /// Short machine-readable code such as `http_status` or `invalid_url`.
    pub code: String,
    /// Human-readable detail for the failure.
    pub message: String,
}

/// Outcome of ingesting a batch of URLs: the successes and the failures side by side.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlBatchIngest {
    /// URLs that were fetched and ingested.
    pub accepted: Vec<AcceptedUrlIngest>,
    /// URLs that failed, each with a code and message.
    pub failed: Vec<UrlIngestFailure>,
}

impl UrlBatchIngest {
    /// Summarises the batch as `"ingested"` (all accepted), `"partial"` (some
    /// accepted, some failed) or `"failed"` (nothing accepted).
    pub fn status(&self) -> &'static str {
        if self.accepted.is_empty() {
            return "failed";
        }
        if self.failed.is_empty() {
            "ingested"
        } else {
            "partial"
        }
    }

    /// Exit code for the batch: `1` when nothing was accepted, `0` otherwise.
    pub fn exit_code(&self) -> u8 {
        let nothing_accepted = self.accepted.is_empty();
        u8::from(nothing_accepted)
    }
}

/// Ingests an already-fetched snapshot into the vault and then refreshes the index.
///
/// # Errors
///
/// Returns the error from writing the snapshot, or from the index pass that follows.
pub fn ingest_snapshot(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    snapshot: UrlSnapshot,
) -> Result<IngestResult, WikiError> {
    let ingested = ingest_snapshot_without_index(vault_root, snapshot)?;
    // Indexing only runs once the snapshot has been written successfully.
    match index_after_ingest(vault_root, store) {
        Ok(_) => Ok(ingested),
        Err(error) => Err(error.into()),
    }
}

/// Registers a snapshot in the source manifest and writes its raw markdown,
/// without touching the index.
///
/// HTML responses are rendered to markdown text; anything else is delegated to
/// the non-HTML path, which preserves the body as an asset.
pub(crate) fn ingest_snapshot_without_index(
    vault_root: &Path,
    mut snapshot: UrlSnapshot,
) -> Result<IngestResult, WikiError> {
    if snapshot_is_html(&snapshot) {
        let page_source = text_from_utf8_lossy(&snapshot.body);
        // The hash is taken over the raw bytes before the body is moved into the draft.
        let source_hash = gobby_core::indexing::content_hash(&snapshot.body);
        let document = Html::parse_document(&page_source);
        let title = match extract_title(&document) {
            Some(found) => found,
            None => snapshot.final_url.clone(),
        };

        let record = SourceManifest::register(
            vault_root,
            SourceDraft {
                location: snapshot.final_url.clone(),
                kind: SourceKind::Url,
                fetched_at: snapshot.fetched_at.clone(),
                content: std::mem::take(&mut snapshot.body),
                title: Some(markdown_title(&title)),
                citation: Some(snapshot.final_url.clone()),
                license: None,
                ingestion_method: IngestionMethod::Manual,
                compile_status: CompileStatus::Pending,
            },
        )?;

        let rendered = render_url_markdown(
            &snapshot,
            &record.canonical_location,
            &title,
            &document,
            &source_hash,
        );
        let raw_path = write_raw_markdown(vault_root, &record, &rendered)?;

        Ok(IngestResult {
            record,
            raw_path,
            asset_path: None,
        })
    } else {
        ingest_non_html_snapshot_without_index(vault_root, snapshot)
    }
}

/// Ingests a non-HTML response: the body is stored as an asset and a short
/// markdown stub pointing at it is written as the raw document.
fn ingest_non_html_snapshot_without_index(
    vault_root: &Path,
    mut snapshot: UrlSnapshot,
) -> Result<IngestResult, WikiError> {
    let source_hash = gobby_core::indexing::content_hash(&snapshot.body);
    let kind = source_kind_for_url_response(snapshot.content_type.as_deref());
    let file_name = file_name_for_url_response(&snapshot, &kind);
    let title = markdown_title(&file_name);
    // The bytes are needed twice: once for the manifest draft, once for the asset file.
    let payload = std::mem::take(&mut snapshot.body);

    let record = SourceManifest::register(
        vault_root,
        SourceDraft {
            location: snapshot.final_url.clone(),
            kind: kind.clone(),
            fetched_at: snapshot.fetched_at.clone(),
            content: payload.clone(),
            title: Some(title.clone()),
            citation: Some(snapshot.final_url.clone()),
            license: None,
            ingestion_method: IngestionMethod::Manual,
            compile_status: CompileStatus::Pending,
        },
    )?;
    let asset_path = write_asset(vault_root, &record, &title, &payload)?;

    let rendered = render_non_html_url_markdown(
        &snapshot,
        &record.canonical_location,
        &title,
        &kind,
        &source_hash,
        &asset_path,
    );
    let raw_path = write_raw_markdown(vault_root, &record, &rendered)?;

    Ok(IngestResult {
        record,
        raw_path,
        asset_path: Some(asset_path),
    })
}

/// Fetches every URL over HTTP with the default blocking fetcher and ingests the results.
pub(crate) fn ingest_urls(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    urls: &[String],
    fetched_at: &str,
) -> Result<UrlBatchIngest, WikiError> {
    // A single fetcher is shared by every URL in the batch.
    let fetcher = BlockingUrlFetcher::default();
    let fetch_one = |url: &str, fetched_at: &str| fetcher.fetch(url, fetched_at);
    ingest_urls_with_fetcher(vault_root, store, urls, fetched_at, fetch_one)
}

/// Fetches a single URL with a freshly built default fetcher.
pub(crate) fn fetch_url_snapshot(
    url: &str,
    fetched_at: &str,
) -> Result<UrlSnapshot, UrlIngestFailure> {
    let fetcher = BlockingUrlFetcher::default();
    fetcher.fetch(url, fetched_at)
}

pub(crate) fn ingest_urls_with_fetcher(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    urls: &[String],
    fetched_at: &str,
    mut fetch: impl FnMut(&str, &str) -> Result<UrlSnapshot, UrlIngestFailure>,
) -> Result<UrlBatchIngest, WikiError> {
    if urls.is_empty() {
        return Err(WikiError::InvalidInput {
            field: "ingest-url",
            message: "at least one URL is required".to_string(),
        });
    }

    let mut accepted = Vec::new();
    let mut failed = Vec::new();
    for url in urls {
        match fetch(url, fetched_at) {
            Ok(snapshot) => {
                let requested_url = snapshot.requested_url.clone();
                let final_url = snapshot.final_url.clone();
                match ingest_snapshot_without_index(vault_root, snapshot) {
                    Ok(result) => accepted.push(AcceptedUrlIngest {
                        requested_url,
                        final_url,
                        result,
                    }),
                    Err(error) => failed.push(UrlIngestFailure::from_wiki_error(url, error)),
                }
            }
            Err(error) => failed.push(error),
        }
    }

    if !accepted.is_empty() {
        index_after_ingest(vault_root, store)?;
    }

    Ok(UrlBatchIngest { accepted, failed })
}

/// Blocking HTTP fetcher that wraps a configured `ureq` agent.
#[derive(Debug, Clone)]
struct BlockingUrlFetcher {
    /// Agent used for every request issued by this fetcher.
    agent: ureq::Agent,
}

impl Default for BlockingUrlFetcher {
    /// Builds a fetcher with the module timeout and automatic redirects switched off.
    fn default() -> Self {
        // Redirects are set to zero here; `fetch` follows them itself, hop by hop.
        let builder = ureq::AgentBuilder::new()
            .timeout(URL_FETCH_TIMEOUT)
            .redirects(0);
        let agent = builder.build();
        Self { agent }
    }
}

impl BlockingUrlFetcher {
    /// Fetches `url`, following redirects manually, and returns the response as a snapshot.
    ///
    /// Every URL that is requested (the original and each redirect target) is first
    /// checked for an allowed scheme and for resolving only to permitted addresses.
    /// The body is read up to the configured inbox item size limit.
    ///
    /// # Errors
    ///
    /// Returns a `UrlIngestFailure` for invalid or disallowed URLs, HTTP status
    /// errors, transport errors, redirects without a `Location` header, oversized
    /// responses, read errors, and redirect chains longer than `MAX_REDIRECTS`.
    fn fetch(&self, url: &str, fetched_at: &str) -> Result<UrlSnapshot, UrlIngestFailure> {
        validate_fetch_url(url)?;
        // NOTE(review): this address check performs its own DNS lookup, separate from
        // the one made when the agent connects — confirm whether a host that changes
        // its answer between the two lookups is a concern for this deployment.
        validate_resolved_fetch_url(url)?;
        let mut current_url = url.to_string();

        // One initial request plus up to MAX_REDIRECTS redirect hops.
        for _ in 0..=MAX_REDIRECTS {
            let response = match self
                .agent
                .get(&current_url)
                .set("User-Agent", USER_AGENT)
                .call()
            {
                Ok(response) => response,
                // The agent reported an HTTP status error; include a snippet of its body.
                Err(ureq::Error::Status(status, response)) => {
                    return Err(UrlIngestFailure::http_status(
                        &current_url,
                        status,
                        response,
                    ));
                }
                Err(ureq::Error::Transport(error)) => {
                    return Err(UrlIngestFailure::new(
                        &current_url,
                        "transport_error",
                        error.to_string(),
                    ));
                }
            };

            // Redirects are followed by hand so each target is validated before it is requested.
            if (300..400).contains(&response.status()) {
                let location = response.header("Location").ok_or_else(|| {
                    UrlIngestFailure::new(
                        &current_url,
                        "redirect_without_location",
                        format!(
                            "HTTP redirect {} did not include Location",
                            response.status()
                        ),
                    )
                })?;
                // `Location` may be relative; resolve it against the URL just requested.
                let next_url = resolve_redirect_url(&current_url, location)?;
                validate_fetch_url(&next_url)?;
                validate_resolved_fetch_url(&next_url)?;
                current_url = next_url;
                continue;
            }

            // Re-validate the URL the response reports for itself before reading the body.
            let final_url = response.get_url().to_string();
            validate_fetch_url(&final_url)?;
            validate_resolved_fetch_url(&final_url)?;
            let content_type = response.header("content-type").map(ToOwned::to_owned);
            let max_bytes = crate::support::env::max_inbox_item_bytes_from_env();
            // Reject early when the declared length is already over the limit ...
            if content_length_exceeds_limit(response.header("content-length"), max_bytes) {
                return Err(response_too_large(&final_url, max_bytes));
            }
            // ... and enforce the same limit on the bytes actually streamed.
            let body = read_limited_body(response.into_reader(), max_bytes, &final_url)?;

            return Ok(UrlSnapshot {
                requested_url: url.to_string(),
                final_url,
                fetched_at: fetched_at.to_string(),
                body,
                content_type,
            });
        }

        // Every iteration ended in a redirect, so the hop budget is exhausted.
        Err(UrlIngestFailure::new(
            url,
            "too_many_redirects",
            format!("exceeded {MAX_REDIRECTS} URL redirects"),
        ))
    }
}

/// Returns `true` only when a `Content-Length` value is present, parses as an
/// unsigned integer, and is strictly greater than `max_bytes`.
///
/// Missing or unparsable values are treated as "not over the limit".
fn content_length_exceeds_limit(content_length: Option<&str>, max_bytes: u64) -> bool {
    match content_length.map(str::trim).map(str::parse::<u64>) {
        Some(Ok(declared)) => declared > max_bytes,
        _ => false,
    }
}

fn read_limited_body(
    reader: impl Read,
    max_bytes: u64,
    url: &str,
) -> Result<Vec<u8>, UrlIngestFailure> {
    let mut body = Vec::new();
    reader
        .take(max_bytes.saturating_add(1))
        .read_to_end(&mut body)
        .map_err(|error| UrlIngestFailure::new(url, "read_error", error.to_string()))?;
    if u64::try_from(body.len()).unwrap_or(u64::MAX) > max_bytes {
        return Err(response_too_large(url, max_bytes));
    }
    Ok(body)
}

/// Builds the `response_too_large` failure for `url`, quoting the byte limit.
fn response_too_large(url: &str, max_bytes: u64) -> UrlIngestFailure {
    let message =
        format!("response exceeds GWIKI_MAX_INBOX_ITEM_BYTES limit of {max_bytes} bytes");
    UrlIngestFailure::new(url, "response_too_large", message)
}

impl UrlIngestFailure {
    fn new(url: impl Into<String>, code: impl Into<String>, message: impl Into<String>) -> Self {
        Self {
            url: url.into(),
            code: code.into(),
            message: message.into(),
        }
    }

    fn from_wiki_error(url: &str, error: WikiError) -> Self {
        Self::new(url, error.code(), error.to_string())
    }

    fn http_status(url: &str, status: u16, response: ureq::Response) -> Self {
        let body =
            match read_limited_body(response.into_reader(), HTTP_STATUS_BODY_LIMIT_BYTES, url) {
                Ok(body) => text_from_utf8_lossy(&body),
                Err(error) => error.message,
            };
        let body = single_line(&body);
        let detail = if body.is_empty() {
            format!("HTTP status {status}")
        } else {
            format!("HTTP status {status}: {}", truncate_message(&body))
        };
        Self::new(url, "http_status", detail)
    }
}

fn resolve_redirect_url(current_url: &str, location: &str) -> Result<String, UrlIngestFailure> {
    let base = url::Url::parse(current_url)
        .map_err(|error| UrlIngestFailure::new(current_url, "invalid_url", error.to_string()))?;
    base.join(location)
        .map(|url| url.to_string())
        .map_err(|error| UrlIngestFailure::new(current_url, "invalid_redirect", error.to_string()))
}

/// Accepts only URLs that parse and use the `http` or `https` scheme.
///
/// # Errors
///
/// Returns an `invalid_url` failure for unparsable URLs and for any other scheme.
fn validate_fetch_url(raw_url: &str) -> Result<(), UrlIngestFailure> {
    let parsed = match url::Url::parse(raw_url) {
        Ok(parsed) => parsed,
        Err(error) => {
            return Err(UrlIngestFailure::new(
                raw_url,
                "invalid_url",
                error.to_string(),
            ));
        }
    };

    match parsed.scheme() {
        "http" | "https" => Ok(()),
        other => Err(UrlIngestFailure::new(
            raw_url,
            "invalid_url",
            format!("unsupported URL scheme `{}`", other),
        )),
    }
}

/// Resolves the URL's host and rejects it when any resolved address is disallowed.
///
/// Every address returned by the lookup is checked, so a host that resolves to a
/// mix of public and private addresses is rejected.
///
/// # Errors
///
/// Returns `invalid_url` when the URL does not parse or lacks a host or port,
/// `dns_resolution_failed` when the lookup fails or returns nothing, and
/// `disallowed_address` when a resolved address is not permitted.
fn validate_resolved_fetch_url(raw_url: &str) -> Result<(), UrlIngestFailure> {
    let parsed = url::Url::parse(raw_url)
        .map_err(|error| UrlIngestFailure::new(raw_url, "invalid_url", error.to_string()))?;
    let host = parsed
        .host_str()
        .ok_or_else(|| UrlIngestFailure::new(raw_url, "invalid_url", "URL host is required"))?;
    // `host_str` writes IPv6 literals in brackets (`[2001:db8::1]`), but the
    // `(&str, u16)` resolver only recognises the bare address. Without stripping
    // the brackets every IPv6-literal URL fails with a DNS error.
    let lookup_host = host
        .strip_prefix('[')
        .and_then(|inner| inner.strip_suffix(']'))
        .unwrap_or(host);
    let port = parsed
        .port_or_known_default()
        .ok_or_else(|| UrlIngestFailure::new(raw_url, "invalid_url", "URL port is required"))?;
    let mut resolved_any = false;
    let addresses = (lookup_host, port).to_socket_addrs().map_err(|error| {
        UrlIngestFailure::new(raw_url, "dns_resolution_failed", error.to_string())
    })?;
    for address in addresses {
        resolved_any = true;
        let ip = address.ip();
        // Loopback is only tolerated through the explicit test-only escape hatch.
        if is_disallowed_fetch_ip(ip) && !loopback_fetch_allowed_for_tests(ip) {
            return Err(UrlIngestFailure::new(
                raw_url,
                "disallowed_address",
                format!("URL host resolves to disallowed address {ip}"),
            ));
        }
    }
    if resolved_any {
        Ok(())
    } else {
        Err(UrlIngestFailure::new(
            raw_url,
            "dns_resolution_failed",
            "URL host did not resolve to any addresses",
        ))
    }
}

/// Test-only escape hatch: permits loopback addresses in builds with debug
/// assertions when `GWIKI_ALLOW_LOOPBACK_URL_FETCH_FOR_TESTS` is set.
fn loopback_fetch_allowed_for_tests(ip: IpAddr) -> bool {
    if !cfg!(debug_assertions) || !ip.is_loopback() {
        return false;
    }
    std::env::var_os("GWIKI_ALLOW_LOOPBACK_URL_FETCH_FOR_TESTS").is_some()
}

/// Returns `true` for addresses a URL fetch must never connect to: private,
/// loopback, link-local, multicast, broadcast, "this network" and shared
/// (carrier-grade NAT) IPv4 ranges, and their IPv6 counterparts.
///
/// IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are judged by the embedded
/// IPv4 address.
fn is_disallowed_fetch_ip(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V4(ip) => {
            let octets = ip.octets();
            ip.is_private()
                || ip.is_loopback()
                // 169.254.0.0/16, which includes the 169.254.169.254 metadata endpoint.
                || ip.is_link_local()
                || ip.is_multicast()
                || ip.is_broadcast()
                // 0.0.0.0/8 "this network"; covers the unspecified address 0.0.0.0.
                || octets[0] == 0
                // 100.64.0.0/10 shared address space (RFC 6598); not publicly
                // routable and used for internal services by some providers.
                || (octets[0] == 100 && (octets[1] & 0xc0) == 0x40)
        }
        IpAddr::V6(ip) => {
            if let Some(mapped) = ip.to_ipv4_mapped() {
                return is_disallowed_fetch_ip(IpAddr::V4(mapped));
            }
            ip.is_loopback()
                || ip.is_unspecified()
                || ip.is_multicast()
                || is_ipv6_unique_local(ip)
                || is_ipv6_unicast_link_local(ip)
        }
    }
}

/// Returns `true` for IPv6 unique local addresses (`fc00::/7`).
fn is_ipv6_unique_local(ip: Ipv6Addr) -> bool {
    ip.segments()[0] & 0xfe00 == 0xfc00
}

/// Returns `true` for IPv6 unicast link-local addresses (`fe80::/10`).
fn is_ipv6_unicast_link_local(ip: Ipv6Addr) -> bool {
    ip.segments()[0] & 0xffc0 == 0xfe80
}

/// Limits `message` to 200 characters, appending `...` when anything was cut off.
///
/// The limit counts `char`s rather than bytes, so multi-byte text is never split
/// inside a character.
fn truncate_message(message: &str) -> String {
    const MAX_CHARS: usize = 200;
    // The byte offset of the character after the limit, if there is one, marks the cut.
    match message.char_indices().nth(MAX_CHARS) {
        Some((cut, _)) => format!("{}...", &message[..cut]),
        None => message.to_string(),
    }
}

/// Renders the raw markdown document for an HTML snapshot: a metadata header,
/// a level-one heading with the title, then the text extracted from the page.
fn render_url_markdown(
    snapshot: &UrlSnapshot,
    canonical_url: &str,
    title: &str,
    document: &Html,
    source_hash: &str,
) -> String {
    let mut fields = vec![
        ("source_kind", "url".to_string()),
        ("source_url", snapshot.final_url.clone()),
        ("requested_url", snapshot.requested_url.clone()),
        ("canonical_url", canonical_url.to_string()),
        ("fetched_at", snapshot.fetched_at.clone()),
        ("source_hash", source_hash.to_string()),
    ];
    // `content_type` is only recorded when the response carried one.
    fields.extend(
        snapshot
            .content_type
            .iter()
            .map(|content_type| ("content_type", content_type.clone())),
    );

    let mut markdown = markdown_metadata(&fields);
    markdown.push_str("# ");
    markdown.push_str(&markdown_title(title));
    markdown.push_str("\n\n");
    markdown.push_str(&html_to_markdownish_text(document));
    markdown.push('\n');
    markdown
}

/// Renders the markdown stub for a non-HTML response: a metadata header that
/// points at the stored asset, a heading, and a fixed explanatory sentence.
fn render_non_html_url_markdown(
    snapshot: &UrlSnapshot,
    canonical_url: &str,
    title: &str,
    kind: &SourceKind,
    source_hash: &str,
    asset_path: &Path,
) -> String {
    let mut fields = vec![
        ("source_kind", kind.to_string()),
        ("source_url", snapshot.final_url.clone()),
        ("requested_url", snapshot.requested_url.clone()),
        ("canonical_url", canonical_url.to_string()),
        ("fetched_at", snapshot.fetched_at.clone()),
        ("source_hash", source_hash.to_string()),
        ("source_asset", path_to_string(asset_path)),
        ("media_degradation", "url_non_html_asset".to_string()),
    ];
    // `content_type` is only recorded when the response carried one.
    fields.extend(
        snapshot
            .content_type
            .iter()
            .map(|content_type| ("content_type", content_type.clone())),
    );

    let mut markdown = markdown_metadata(&fields);
    markdown.push_str("# ");
    markdown.push_str(&markdown_title(title));
    markdown.push_str("\n\n");
    markdown.push_str("Non-HTML URL response preserved as a source asset.\n");
    markdown
}

/// Decides whether a snapshot should be treated as HTML.
///
/// A declared media type is authoritative; the body is only sniffed when the
/// response declared none.
fn snapshot_is_html(snapshot: &UrlSnapshot) -> bool {
    let Some(media_type) = content_type_media_type(snapshot.content_type.as_deref()) else {
        return body_looks_like_html(&snapshot.body);
    };
    matches!(
        media_type.as_str(),
        "text/html" | "application/xhtml+xml"
    )
}

/// Maps a response `content-type` to the source kind recorded for a non-HTML URL.
///
/// Unknown or missing media types fall back to `SourceKind::File`.
fn source_kind_for_url_response(content_type: Option<&str>) -> SourceKind {
    let Some(media_type) = content_type_media_type(content_type) else {
        return SourceKind::File;
    };
    let media_type = media_type.as_str();

    if media_type == "application/pdf" {
        SourceKind::Pdf
    } else if media_type.starts_with("image/") {
        SourceKind::Image
    } else if media_type.starts_with("audio/") {
        SourceKind::Audio
    } else if media_type.starts_with("video/") {
        SourceKind::Video
    } else if matches!(media_type, "application/json" | "application/xml")
        || media_type.starts_with("text/")
    {
        // Every `text/*` type, plus the two structured-text application types.
        SourceKind::Text
    } else {
        SourceKind::File
    }
}

/// Extracts the lower-cased media type from a `content-type` header value,
/// dropping any `;`-separated parameters.
///
/// Returns `None` when the header is absent or the media type part is blank.
fn content_type_media_type(content_type: Option<&str>) -> Option<String> {
    let header = content_type?;
    let media_type = match header.split_once(';') {
        Some((before_params, _)) => before_params,
        None => header,
    }
    .trim();

    if media_type.is_empty() {
        None
    } else {
        Some(media_type.to_ascii_lowercase())
    }
}

/// Sniffs the first 512 bytes of a body for HTML markers, ignoring case.
///
/// A body counts as HTML when it starts (after leading whitespace) with a
/// doctype or `<html` tag, or when a `<body` tag appears anywhere in the prefix.
fn body_looks_like_html(body: &[u8]) -> bool {
    const SNIFF_BYTES: usize = 512;
    let prefix = body.get(..SNIFF_BYTES).unwrap_or(body);
    let lowered = text_from_utf8_lossy(prefix).to_ascii_lowercase();
    let lowered = lowered.trim_start();

    if lowered.starts_with("<!doctype html") || lowered.starts_with("<html") {
        return true;
    }
    lowered.contains("<body")
}

/// Picks a file name for a non-HTML response: the last path segment of the
/// final URL when it is non-blank, otherwise a default based on the source kind.
fn file_name_for_url_response(snapshot: &UrlSnapshot, kind: &SourceKind) -> String {
    if let Ok(parsed) = url::Url::parse(&snapshot.final_url) {
        let last_segment = parsed
            .path_segments()
            .and_then(|mut segments| segments.next_back());
        if let Some(segment) = last_segment {
            if !segment.trim().is_empty() {
                return segment.to_string();
            }
        }
    }

    let fallback = match kind {
        SourceKind::Pdf => "download.pdf",
        SourceKind::Image => "image",
        SourceKind::Audio => "audio",
        SourceKind::Video => "video",
        SourceKind::Text => "download.txt",
        _ => "download",
    };
    fallback.to_string()
}

/// Returns the text of the first `<title>` element collapsed to a single line,
/// or `None` when there is no title or it is blank.
fn extract_title(document: &Html) -> Option<String> {
    let selector = Selector::parse("title").ok()?;
    let element = document.select(&selector).next()?;
    let joined = element.text().collect::<Vec<_>>().join(" ");
    let title = single_line(&joined);
    if title.is_empty() {
        None
    } else {
        Some(title)
    }
}

/// Extracts the visible text of a document as blank-line-separated paragraphs.
///
/// Extraction starts at `<body>` when the document has one and at the root
/// element otherwise.
fn html_to_markdownish_text(document: &Html) -> String {
    let body = match Selector::parse("body") {
        Ok(selector) => document.select(&selector).next(),
        Err(_) => None,
    };
    let root = body.unwrap_or_else(|| document.root_element());

    let mut parts = Vec::new();
    collect_visible_text(root, &mut parts);
    normalize_markdown_text(&parts.join("\n"))
}

/// Walks `element` and appends one entry to `parts` per visible block of text.
///
/// A text block (paragraph, heading, list item, table cell, ...) with no nested
/// text blocks becomes a single entry. Any other element is treated as a
/// container: loose inline text between its block children is gathered into its
/// own entry, and children that are or contain text blocks are visited
/// recursively so each nested block keeps its own entry. Hidden elements are
/// skipped entirely.
fn collect_visible_text(element: ElementRef<'_>, parts: &mut Vec<String>) {
    let name = element.value().name();
    if is_hidden_element(name) {
        return;
    }
    // Leaf text block: flatten all of its inline content into one entry.
    if is_text_block(name) && !has_text_block_descendant(element) {
        let mut text = String::new();
        collect_inline_text(element, &mut text);
        if !single_line(&text).is_empty() {
            parts.push(text);
        }
        return;
    }

    let mut inline = String::new();
    for child in element.children() {
        match child.value() {
            Node::Text(text) => inline.push_str(&text.text),
            Node::Element(_) => {
                if let Some(child_element) = ElementRef::wrap(child) {
                    let child_name = child_element.value().name();
                    if is_hidden_element(child_name) {
                        continue;
                    }
                    // Recurse into wrappers such as `div`, `main`, `ul` or `table`
                    // as well as direct text blocks. Flattening them as inline
                    // text would glue their paragraphs together.
                    if is_text_block(child_name) || has_text_block_descendant(child_element) {
                        push_inline_part(&mut inline, parts);
                        collect_visible_text(child_element, parts);
                    } else {
                        collect_inline_text(child_element, &mut inline);
                    }
                }
            }
            _ => {}
        }
    }
    push_inline_part(&mut inline, parts);
}

/// Returns `true` when any element strictly below `element` is a text block.
fn has_text_block_descendant(element: ElementRef<'_>) -> bool {
    element
        .descendants()
        // The traversal yields the starting node first; only look beneath it.
        .skip(1)
        .filter_map(ElementRef::wrap)
        .any(|descendant| is_text_block(descendant.value().name()))
}

/// Appends all text beneath `element` to `output` in document order, skipping
/// hidden elements and ignoring non-text, non-element nodes.
fn collect_inline_text(element: ElementRef<'_>, output: &mut String) {
    if is_hidden_element(element.value().name()) {
        return;
    }
    for child in element.children() {
        if let Node::Text(text) = child.value() {
            output.push_str(&text.text);
            continue;
        }
        // `wrap` yields `None` for anything that is not an element node.
        if let Some(nested) = ElementRef::wrap(child) {
            collect_inline_text(nested, output);
        }
    }
}

/// Moves the pending inline text into `parts` when it has visible content and
/// leaves `inline` empty either way.
fn push_inline_part(inline: &mut String, parts: &mut Vec<String>) {
    let pending = std::mem::take(inline);
    if !single_line(&pending).is_empty() {
        parts.push(pending);
    }
}

/// Returns `true` for elements whose content is never rendered as text.
fn is_hidden_element(name: &str) -> bool {
    const HIDDEN_ELEMENTS: [&str; 3] = ["head", "script", "style"];
    HIDDEN_ELEMENTS.iter().any(|hidden| *hidden == name)
}

/// Returns `true` for elements that are rendered as their own block of text.
fn is_text_block(name: &str) -> bool {
    const TEXT_BLOCKS: [&str; 16] = [
        "address",
        "blockquote",
        "dd",
        "dt",
        "figcaption",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "p",
        "pre",
        "td",
        "th",
    ];
    TEXT_BLOCKS.iter().any(|block| *block == name)
}

/// Collapses each line to a single line of text, drops blank lines and
/// consecutive duplicates, and joins what remains with blank lines.
fn normalize_markdown_text(text: &str) -> String {
    let mut kept: Vec<String> = Vec::new();
    for raw_line in text.lines() {
        let line = single_line(raw_line);
        if line.is_empty() {
            continue;
        }
        // Only a repeat of the line kept immediately before it counts as a duplicate.
        if kept.last() == Some(&line) {
            continue;
        }
        kept.push(line);
    }
    kept.join("\n\n")
}

// Unit tests for URL ingestion. Network access is never needed: snapshots are
// built by hand or supplied through an injected fetch closure.
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::path::Path;
    use std::path::PathBuf;

    use gobby_core::indexing::content_hash;

    use super::*;
    use crate::sources::{SourceKind, SourceManifest};
    use crate::store::{
        MemoryWikiStore, StoreError, WikiChunk, WikiDocument, WikiIndexStore, WikiIngestion,
        WikiLink, WikiSource,
    };

    // An HTML snapshot produces raw markdown with metadata and a manifest entry,
    // and the index is refreshed.
    #[test]
    fn url_ingest_writes_raw_and_manifest() {
        let temp = tempfile::tempdir().expect("tempdir");
        let body = br#"<!doctype html>
<html>
<head><title>Durable Wikis</title></head>
<body><main><h1>Durable Wikis</h1><p>Capture source material.</p></main></body>
</html>"#
            .to_vec();
        let expected_hash = content_hash(&body);
        let snapshot = UrlSnapshot {
            requested_url: "https://Example.com/docs/wiki#overview".to_string(),
            final_url: "https://example.com/docs/wiki/".to_string(),
            fetched_at: "2026-05-29T16:00:00Z".to_string(),
            body,
            content_type: Some("text/html".to_string()),
        };
        let mut store = MemoryWikiStore::default();

        let result =
            ingest_snapshot(temp.path(), &mut store, snapshot).expect("ingest url snapshot");

        assert_eq!(result.asset_path, None);
        let raw = std::fs::read_to_string(temp.path().join(&result.raw_path))
            .expect("raw markdown written");
        assert!(raw.contains("# Durable Wikis"));
        assert!(raw.contains("canonical_url: https://example.com/docs/wiki"));
        assert!(raw.contains("fetched_at: 2026-05-29T16:00:00Z"));
        assert!(raw.contains("content_type: text/html"));
        assert!(raw.contains(&format!("source_hash: {expected_hash}")));
        assert!(raw.contains("Capture source material."));

        let manifest = SourceManifest::read(temp.path()).expect("read source manifest");
        assert_eq!(manifest.entries.len(), 1);
        let entry = &manifest.entries[0];
        assert_eq!(entry.kind, SourceKind::Url);
        assert_eq!(entry.title.as_deref(), Some("Durable Wikis"));
        assert_eq!(entry.canonical_location, "https://example.com/docs/wiki");
        assert_eq!(entry.content_hash, expected_hash);
        assert_eq!(entry.fetched_at, "2026-05-29T16:00:00Z");
        assert!(store.documents.contains_key(&PathBuf::from("raw/INDEX.md")));
    }

    // A PDF response is stored byte-for-byte as an asset; the markdown stub
    // references it and does not inline the body.
    #[test]
    fn url_ingest_preserves_non_html_as_typed_asset() {
        let temp = tempfile::tempdir().expect("tempdir");
        let body = b"%PDF-1.7\nbinary-ish\n%%EOF\n".to_vec();
        let snapshot = UrlSnapshot {
            requested_url: "https://example.com/report".to_string(),
            final_url: "https://example.com/files/report.pdf".to_string(),
            fetched_at: "2026-05-29T16:00:00Z".to_string(),
            body: body.clone(),
            // Mixed case plus a parameter exercises media-type normalisation.
            content_type: Some("Application/PDF; charset=binary".to_string()),
        };
        let mut store = MemoryWikiStore::default();

        let result =
            ingest_snapshot(temp.path(), &mut store, snapshot).expect("ingest pdf url snapshot");

        let asset_path = result.asset_path.expect("non-html asset path");
        assert_eq!(
            std::fs::read(temp.path().join(&asset_path)).expect("asset bytes"),
            body
        );
        let raw = std::fs::read_to_string(temp.path().join(&result.raw_path))
            .expect("raw markdown written");
        assert!(raw.contains("source_kind: pdf"));
        assert!(raw.contains("source_asset: "));
        assert!(raw.contains("media_degradation: url_non_html_asset"));
        assert!(raw.contains("Non-HTML URL response preserved as a source asset."));
        assert!(!raw.contains("binary-ish"));

        let manifest = SourceManifest::read(temp.path()).expect("read source manifest");
        assert_eq!(manifest.entries[0].kind, SourceKind::Pdf);
    }

    // Title and body extraction decode entities, keep inline markup together,
    // and drop script content.
    #[test]
    fn html_parser_extracts_body_text_and_decodes_entities() {
        let html = br#"<!doctype html>
<html>
<head><title>Hidden &amp; Title</title></head>
<body><main><p>Keep <strong>&amp; decode</strong> together.</p><script>drop()</script></main></body>
</html>"#;

        let html = Html::parse_document(&text_from_utf8_lossy(html));

        assert_eq!(extract_title(&html), Some("Hidden & Title".to_string()));
        assert_eq!(html_to_markdownish_text(&html), "Keep & decode together.");
    }

    // A batch with one success and one fetch failure reports "partial", exits 0,
    // and only the success reaches the manifest.
    #[test]
    fn batch_url_ingest_accepts_successes_and_records_failures() {
        let temp = tempfile::tempdir().expect("tempdir");
        let urls = vec![
            "https://example.test/accepted".to_string(),
            "https://example.test/failure".to_string(),
        ];
        let mut store = MemoryWikiStore::default();

        let result = ingest_urls_with_fetcher(
            temp.path(),
            &mut store,
            &urls,
            "2026-06-02T00:00:00Z",
            |url, fetched_at| {
                if url.ends_with("/accepted") {
                    Ok(test_snapshot(url, url, "Accepted URL", fetched_at))
                } else {
                    Err(UrlIngestFailure::new(url, "http_status", "HTTP status 500"))
                }
            },
        )
        .expect("batch ingest");

        assert_eq!(result.status(), "partial");
        assert_eq!(result.exit_code(), 0);
        assert_eq!(result.accepted.len(), 1);
        assert_eq!(result.failed.len(), 1);
        assert_eq!(
            result.accepted[0].requested_url,
            "https://example.test/accepted"
        );
        assert_eq!(result.failed[0].url, "https://example.test/failure");
        assert_eq!(result.failed[0].code, "http_status");
        assert!(store.documents.contains_key(&PathBuf::from("raw/INDEX.md")));

        let manifest = SourceManifest::read(temp.path()).expect("read source manifest");
        assert_eq!(manifest.entries.len(), 1);
        assert_eq!(manifest.entries[0].kind, SourceKind::Url);
        assert_eq!(
            manifest.entries[0].canonical_location,
            "https://example.test/accepted"
        );
    }

    // Two accepted URLs trigger a single read of the indexed hashes, i.e. one
    // index pass for the whole batch.
    #[test]
    fn batch_url_ingest_indexes_once_after_accepted_batch() {
        let temp = tempfile::tempdir().expect("tempdir");
        let urls = vec![
            "https://example.test/one".to_string(),
            "https://example.test/two".to_string(),
        ];
        let mut store = CountingStore::default();

        let result = ingest_urls_with_fetcher(
            temp.path(),
            &mut store,
            &urls,
            "2026-06-02T00:00:00Z",
            |url, fetched_at| Ok(test_snapshot(url, url, url, fetched_at)),
        )
        .expect("batch ingest");

        assert_eq!(result.status(), "ingested");
        assert_eq!(result.accepted.len(), 2);
        assert_eq!(store.indexed_hash_reads, 1);
    }

    // The size limit is exclusive for both the declared length and the streamed
    // bytes: exactly at the limit passes, one byte over fails.
    #[test]
    fn url_fetch_limits_content_length_and_stream_bytes() {
        assert!(content_length_exceeds_limit(Some("11"), 10));
        assert!(!content_length_exceeds_limit(Some("10"), 10));
        assert!(!content_length_exceeds_limit(Some("invalid"), 10));

        let error = read_limited_body(std::io::Cursor::new(vec![0_u8; 11]), 10, "https://x.test")
            .expect_err("stream exceeding limit should fail");

        assert_eq!(error.code, "response_too_large");
        assert_eq!(
            read_limited_body(std::io::Cursor::new(vec![0_u8; 10]), 10, "https://x.test")
                .expect("stream at limit")
                .len(),
            10
        );
    }

    // Representative private, loopback, link-local, unspecified and multicast
    // addresses are rejected; a public address is not.
    #[test]
    fn url_fetch_rejects_private_and_local_addresses() {
        for address in [
            "127.0.0.1",
            "10.0.0.1",
            "172.16.0.1",
            "192.168.1.1",
            "169.254.169.254",
            "0.0.0.0",
            "::1",
            "fc00::1",
            "fe80::1",
            "ff02::1",
        ] {
            let ip = address.parse::<IpAddr>().expect("test IP parses");
            assert!(is_disallowed_fetch_ip(ip), "{address} should be rejected");
        }

        assert!(!is_disallowed_fetch_ip(
            "93.184.216.34".parse().expect("public IP parses")
        ));
    }

    // A relative `Location` is resolved against the URL that issued the redirect.
    #[test]
    fn redirect_url_resolution_handles_relative_locations() {
        assert_eq!(
            resolve_redirect_url("https://example.com/a/b", "../next").expect("redirect"),
            "https://example.com/next"
        );
    }

    // Builds a minimal `text/html` snapshot whose title and paragraph use `title`.
    fn test_snapshot(
        requested_url: &str,
        final_url: &str,
        title: &str,
        fetched_at: &str,
    ) -> UrlSnapshot {
        UrlSnapshot {
            requested_url: requested_url.to_string(),
            final_url: final_url.to_string(),
            fetched_at: fetched_at.to_string(),
            body: format!(
                "<!doctype html><html><head><title>{title}</title></head><body><p>{title} body.</p></body></html>"
            )
            .into_bytes(),
            content_type: Some("text/html".to_string()),
        }
    }

    // Store wrapper that delegates everything to `MemoryWikiStore` while counting
    // calls to `indexed_hashes`.
    #[derive(Default)]
    struct CountingStore {
        inner: MemoryWikiStore,
        indexed_hash_reads: usize,
    }

    impl WikiIndexStore for CountingStore {
        fn indexed_hashes(&mut self) -> Result<BTreeMap<PathBuf, String>, StoreError> {
            // The only method with behaviour of its own: record the call, then delegate.
            self.indexed_hash_reads += 1;
            self.inner.indexed_hashes()
        }

        fn upsert_document(&mut self, document: WikiDocument) -> Result<(), StoreError> {
            self.inner.upsert_document(document)
        }

        fn replace_chunks(
            &mut self,
            path: &Path,
            chunks: Vec<WikiChunk>,
        ) -> Result<(), StoreError> {
            self.inner.replace_chunks(path, chunks)
        }

        fn replace_links(&mut self, path: &Path, links: Vec<WikiLink>) -> Result<(), StoreError> {
            self.inner.replace_links(path, links)
        }

        fn upsert_source(&mut self, source: WikiSource) -> Result<(), StoreError> {
            self.inner.upsert_source(source)
        }

        fn record_ingestion(&mut self, ingestion: WikiIngestion) -> Result<(), StoreError> {
            self.inner.record_ingestion(ingestion)
        }

        fn record_file_hash(
            &mut self,
            path: PathBuf,
            content_hash: String,
        ) -> Result<(), StoreError> {
            self.inner.record_file_hash(path, content_hash)
        }

        fn delete_derived_rows(&mut self, path: &Path) -> Result<(), StoreError> {
            self.inner.delete_derived_rows(path)
        }
    }
}